In [None]:
!pip install langchain

In [None]:
import torch
from transformers import BitsAndBytesConfig
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain.llms.huggingface_pipeline import HuggingFacePipeline

In [None]:
# Hub identifier of the checkpoint loaded in the cells below.
model_name: str = "microsoft/phi-2"

# Quantization settings: 4-bit loading, "nf4" quant type, double quantization
# enabled, and bfloat16 as the compute dtype.
nf4_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# Load the causal-LM checkpoint named by `model_name`, quantized according to
# `nf4_config`. The transformers class is spelled `AutoModelForCausalLM`.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=nf4_config,
    low_cpu_mem_usage=True,
)

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
max_new_token = 1024  # cap on the number of tokens generated per call

# Wrap model + tokenizer in a transformers generation pipeline.
# - The task id uses a hyphen ("text-generation"); an underscore is not a
#   registered task name.
# - The generation keyword is `max_new_tokens` (plural).
# - The EOS token id is passed as the pad token id.
model_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=max_new_token,
    pad_token_id=tokenizer.eos_token_id,
)

In [None]:
# Generation settings handed to the LangChain wrapper.
# NOTE(review): with a pre-built pipeline, `model_kwargs` may not be applied
# at generation time - confirm against the HuggingFacePipeline docs; setting
# temperature on the transformers pipeline itself may be required.
gen_kwargs = dict(temperature=0.6)

# Expose the transformers pipeline through LangChain's LLM interface.
llm = HuggingFacePipeline(pipeline=model_pipeline, model_kwargs=gen_kwargs)

In [None]:
# Smoke test: send a plain string to the wrapped model; the return value of
# the call is displayed as the cell output.
llm.invoke("Hello, How are you")

#### LangChain: PromptTemplate

In [None]:
from langchain_core.prompts import PromptTemplate

# Instruct/Output prompt layout. The template must end right after "Output:";
# the previous version had a stray double quote and trailing whitespace baked
# into the text sent to the model.
prompt_template = PromptTemplate.from_template("Instruct: {prompt}\nOutput:")

user_prompt = "Write a detailed analyze ......."

# The keyword must match the template variable name (`prompt`); a misspelled
# keyword leaves `{prompt}` unfilled and raises a KeyError.
messages = prompt_template.format(prompt=user_prompt)

In [None]:
# Run the model on the formatted prompt string and print what it returns.
output = llm.invoke(messages)
print(output)

#### LangChain LLM

In [None]:
# Compose the template and the model with the pipe operator, then run the
# chain with a dict that supplies the template's `prompt` variable.
chain = prompt_template | llm

chain_inputs = {"prompt": "Write a detailed analyze ......."}
output = chain.invoke(chain_inputs)

#### LangChain: OutputParser

In [None]:
#Output parser to JSON
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field

# Schema describing the JSON object the parser should extract from the model
# output. The field descriptions are passed to `Field(description=...)`, so
# their spelling has been corrected.
class Joke(BaseModel):
    setup: str = Field(description="Question to set up a joke")
    # NOTE(review): `puncline` looks like a typo for `punchline`. The name is
    # kept unchanged here because renaming it changes the field/JSON key.
    puncline: str = Field(description="answer to resolve a joke")


# Parser that uses `Joke` as its pydantic object.
parser = JsonOutputParser(pydantic_object=Joke)

In [None]:
from langchain_core.prompts import PromptTemplate

# Prompt with one runtime variable (`query`) and one pre-filled variable
# (`format_instruction`) that carries the parser's formatting instructions.
# Keyword arguments must be comma-separated, and the parser method is named
# `get_format_instructions` (plural).
prompt = PromptTemplate(
    template="Answer the user query.\n{format_instruction}\n{query}\n",
    input_variables=["query"],
    partial_variables={"format_instruction": parser.get_format_instructions()},
)

In [None]:
# Rebind `chain` to the JSON-instruction prompt piped into the model
# (this replaces the earlier `prompt_template | llm` chain).
chain = prompt | llm

In [None]:
# Ask the chain for a joke; the dict key matches the prompt's `query` variable.
joke_query = "Tell me a fun joke."

output = chain.invoke(dict(query=joke_query))
print(output)

In [None]:
# Feed the raw model output to the JSON parser and print the parsed result.
parser_output = parser.invoke(output)
print(parser_output)

#### LangChain: Document Loader

In [None]:
from langchain_community.document_loaders import PyPDFLoader, PyPDFDirectoryLoader, WebBaseLoader

In [None]:
# Load a PDF from a URL.
url = "https://arxiv.org/pdf/2310.11511"

# NOTE(review): extract_images=True presumably needs an extra image/OCR
# dependency installed - confirm against the loader documentation.
pdf_loader = PyPDFLoader(url, extract_images=True)
docs = pdf_loader.load()

In [None]:
# Load PDFs from a local directory.
path_dir = "./data_source"

# Rebinds `pdf_loader` and `docs` from the previous cell.
pdf_loader = PyPDFDirectoryLoader(path_dir)
docs = pdf_loader.load()

In [None]:
#Get from web page
import bs4

# Page(s) to fetch.
web_paths = ["https://www.businessinsider.com/trump-doge-layoffs-probationary-employees-2025-2"]

# Restrict parsing to elements carrying one of these CSS classes.
# NOTE(review): presumably these class names match the target page's markup -
# verify, otherwise the loaded documents may be empty.
classes = ["post-content", "post-title", "post-header", "page-content"]
bs4_strainer = bs4.SoupStrainer(class_=classes)

web_loader = WebBaseLoader(
    web_paths=web_paths,
    bs_kwargs={"parse_only": bs4_strainer},
)

# Rebinds `docs` to the web loader's result.
docs = web_loader.load()

In [None]:
# Bare expression: displays the current value of `docs`, i.e. whatever the
# most recently executed loader cell assigned to it.
docs

#### LangChain: Document Splitter (Chunk)

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [None]:
# `List` is used in the annotation below, so it has to be imported; without
# it the annotation raises NameError when the cell runs.
from typing import List

# Splitter settings passed to RecursiveCharacterTextSplitter in the next cell.
chunk_size = 300
chunk_overlap = 30

# Separator strings handed to the splitter, in this order.
separators: List[str] = ['\n\n', '\n', ' ', '']

In [None]:
# Character-based recursive splitter configured from the settings above.
# The keyword is spelled `is_separator_regex`; the misspelled form is not an
# accepted argument.
char_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    length_function=len,
    is_separator_regex=False,
    separators=separators,
)

In [None]:
# `text_data` was not defined anywhere in the notebook, so this cell failed on
# a fresh kernel. Build it from the text of the documents currently in `docs`.
# NOTE(review): assumes the intended input is the loaded documents' text -
# confirm, or replace with the desired string.
text_data = "\n\n".join(doc.page_content for doc in docs)

text_out = char_splitter.split_text(text_data)

print(text_out)

#### Embedding model

In [None]:
from langchain_community.embeddings import HuggingFaceEmbeddings

In [None]:
# No arguments are passed, so the class's default settings are used.
embeddings = HuggingFaceEmbeddings()

In [None]:
# Sample sentence embedded in the next cell.
text = "This is a test document."

In [None]:
# Embed the sample sentence and print the full result.
vector = embeddings.embed_query(text)
print(vector)

#### LangChain: Vector Database

In [None]:
%pip install langchain_chroma

In [None]:
from langchain_chroma import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader

In [None]:
# Source PDF for the vector-store example.
pdf_url = "https://arxiv.org/pdf/2310.11511"

# Rebinds `pdf_loader`; the loaded result is kept in `pdf_pages`.
pdf_loader = PyPDFLoader(pdf_url)
pdf_pages = pdf_loader.load()

In [None]:
# Splitter settings for the vector-store example.
chunk_size = 300
chunk_overlap = 30

# The keyword is spelled `is_separator_regex`; the misspelled form is not an
# accepted argument.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    length_function=len,
    is_separator_regex=False,
)

# `split_documents` (plural) is the method that takes a list of documents.
# Rebinds `docs` to the resulting chunks.
docs = splitter.split_documents(pdf_pages)

In [None]:
# Embedding model for the vector store; no arguments are passed, so the
# class's default settings are used.
embedding_model = HuggingFaceEmbeddings()

In [None]:
# Build a Chroma store from the chunks in `docs`, using `embedding_model`
# as the embedding function.
chroma_db = Chroma.from_documents(docs, embedding=embedding_model)

In [None]:
# Run a similarity search for the query, requesting k=4 results, and print them.
query = "What is multimodal Large Language Model"
similar_doc = chroma_db.similarity_search(query, k=4)
print(similar_doc)