In [10]:
# %%capture
# #After executing the cell,please RESTART the kernel and run all the cells.
# !pip install --user "ibm-watsonx-ai==1.0.10"
# !pip install --user "langchain==0.2.6" 
# !pip install --user "langchain-ibm==0.1.8"
# !pip install --user "langchain-community==0.2.1"
# !pip install --user pypdf
# !pip install --user chromadb

## Task 1: Load document using LangChain for different sources (10 points)

In [9]:
from langchain_community.document_loaders import PyPDFLoader

pdf_url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/WgM1DaUn2SYPcCg_It57tA/A-Comprehensive-Review-of-Low-Rank-Adaptation-in-Large-Language-Models-for-Efficient-Parameter-Tuning-1.pdf"
loader = PyPDFLoader(pdf_url)
pages = loader. load_and_split()

full_content = ''.join([page.page_content for page in pages])

print(full_content[:1000])

A Comprehensive Review of Low-Rank
Adaptation in Large Language Models for
Efficient Parameter Tuning
September 10, 2024
Abstract
Natural Language Processing (NLP) often involves pre-training large
models on extensive datasets and then adapting them for specific tasks
through fine-tuning. However, as these models grow larger, like GPT-3
with 175 billion parameters, fully fine-tuning them becomes computa-
tionally expensive. We propose a novel method called LoRA (Low-Rank
Adaptation) that significantly reduces the overhead by freezing the orig-
inal model weights and only training small rank decomposition matrices.
This leads to up to 10,000 times fewer trainable parameters and reduces
GPU memory usage by three times. LoRA not only maintains but some-
times surpasses fine-tuning performance on models like RoBERTa, De-
BERTa, GPT-2, and GPT-3. Unlike other methods, LoRA introduces
no extra latency during inference, making it more efficient for practical
applications. All relevant code an

## Task 2: Apply text splitting techniques (10 points)

In [6]:
from langchain.text_splitter import Language, RecursiveCharacterTextSplitter

latex_text = """
    \documentclass{article}
    
    \begin{document}
    
    \maketitle
    
    \section{Introduction}
    Large language models (LLMs) are a type of machine learning model that can be trained on vast amounts of text data to generate human-like language. In recent years, LLMs have made significant advances in a variety of natural language processing tasks, including language translation, text generation, and sentiment analysis.
    
    \subsection{History of LLMs}
    The earliest LLMs were developed in the 1980s and 1990s, but they were limited by the amount of data that could be processed and the computational power available at the time. In the past decade, however, advances in hardware and software have made it possible to train LLMs on massive datasets, leading to significant improvements in performance.
    
    \subsection{Applications of LLMs}
    LLMs have many applications in industry, including chatbots, content creation, and virtual assistants. They can also be used in academia for research in linguistics, psychology, and computational linguistics.
    
    \end{document}
"""

latex_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.LATEX, chunk_size=60, chunk_overlap=0
)
latex_docs = latex_splitter.create_documents([latex_text])
latex_docs

  latex_text = """


[Document(metadata={}, page_content='\\documentclass{article}\n    \n    \x08egin{document}'),
 Document(metadata={}, page_content='\\maketitle\n    \n    \\section{Introduction}\n    Large'),
 Document(metadata={}, page_content='language models (LLMs) are a type of machine learning model'),
 Document(metadata={}, page_content='that can be trained on vast amounts of text data to'),
 Document(metadata={}, page_content='generate human-like language. In recent years, LLMs have'),
 Document(metadata={}, page_content='made significant advances in a variety of natural language'),
 Document(metadata={}, page_content='processing tasks, including language translation, text'),
 Document(metadata={}, page_content='generation, and sentiment analysis.'),
 Document(metadata={}, page_content='\\subsection{History of LLMs}\n    The earliest LLMs were'),
 Document(metadata={}, page_content='developed in the 1980s and 1990s, but they were limited by'),
 Document(metadata={}, page_content='the amount of 

## Task 3: Embed documents (10 points)

In [1]:
from ibm_watsonx_ai.metanames import EmbedTextParamsMetaNames
from langchain_ibm import WatsonxEmbeddings

embed_params = {
    EmbedTextParamsMetaNames.TRUNCATE_INPUT_TOKENS: 3,
    EmbedTextParamsMetaNames.RETURN_OPTIONS: {"input_text": True},
}

watsonx_embedding = WatsonxEmbeddings(
    model_id="ibm/slate-30m-english-rtrvr",
    url="https://us-south.ml.cloud.ibm.com",
    project_id="skills-network",
    params=embed_params,
)

query = "How are you?"
query_result = watsonx_embedding.embed_documents([query])

query_result[0][:5]

[0.018171897, -0.018608226, 0.059054308, 0.07260351, 0.08736516]

## Task 4: Create and configure vector databases to store embeddings (10 points)

In [11]:
from langchain.vectorstores import Chroma
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

!wget "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/Ec5f3KYU1CpbKRp1whFLZw/new-Policies.txt"
loader = TextLoader("new-Policies.txt")
data = loader.load()

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,
)
chunks = text_splitter.split_documents(data)

vectordb = Chroma.from_documents(chunks, watsonx_embedding)

query = "Smoking policy"
docs = vectordb.similarity_search(query, k=5)
docs

--2025-04-06 08:35:38--  https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/Ec5f3KYU1CpbKRp1whFLZw/new-Policies.txt
Resolving cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)... 169.63.118.104
Connecting to cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)|169.63.118.104|:443... connected.
200 OKequest sent, awaiting response... 
Length: 6363 (6.2K) [text/plain]
Saving to: ‘new-Policies.txt.5’


2025-04-06 08:35:38 (704 MB/s) - ‘new-Policies.txt.5’ saved [6363/6363]



[Document(metadata={'source': 'new-Policies.txt'}, page_content='This policy encourages the responsible use of mobile devices in line with legal and ethical'),
 Document(metadata={'source': 'new-Policies.txt'}, page_content='This policy encourages the responsible use of mobile devices in line with legal and ethical'),
 Document(metadata={'source': 'new-Policies.txt'}, page_content='This policy promotes the safe and responsible use of digital communication tools in line with our'),
 Document(metadata={'source': 'new-Policies.txt'}, page_content='This policy promotes the safe and responsible use of digital communication tools in line with our'),
 Document(metadata={'source': 'new-Policies.txt'}, page_content='This policy lays the foundation for a diverse, inclusive, and talented workforce. It ensures that')]

## Task 5: Develop a retriever to fetch document segments based on queries (10 points)

In [5]:
retriever = vectordb.as_retriever(search_kwargs={"k": 2})
query = "Email policy"
docs = retriever.invoke(query)
docs

[Document(metadata={'source': 'new-Policies.txt'}, page_content='and email use, including copyright and data protection laws.'),
 Document(metadata={'source': 'new-Policies.txt'}, page_content='This policy encourages the responsible use of mobile devices in line with legal and ethical')]