## Text Loader

In [1]:
from langchain.document_loaders import TextLoader

In [2]:
# Load the local text file; the result is indexable, and each item carries
# a metadata dict that records the source file name.
loader = TextLoader("nvda_news_1.txt")
data = loader.load()

In [3]:
data[0]



In [4]:
data[0].metadata

{'source': 'nvda_news_1.txt'}

## CSV Loader

In [5]:
from langchain.document_loaders.csv_loader import CSVLoader

# Load one document per CSV row. source_column="title" stores each row's
# `title` value as the 'source' entry of that document's metadata.
loader = CSVLoader("movies.csv", source_column="title")
data = loader.load()
# Number of documents (rows) loaded
len(data)

9

In [6]:
data[0].page_content

'movie_id: 101\ntitle: K.G.F: Chapter 2\nindustry: Bollywood\nrelease_year: 2022\nimdb_rating: 8.4\nstudio: Hombale Films\nlanguage_id: 3\nbudget: 1\nrevenue: 12.5\nunit: Billions\ncurrency: INR'

In [7]:
data[0]

Document(page_content='movie_id: 101\ntitle: K.G.F: Chapter 2\nindustry: Bollywood\nrelease_year: 2022\nimdb_rating: 8.4\nstudio: Hombale Films\nlanguage_id: 3\nbudget: 1\nrevenue: 12.5\nunit: Billions\ncurrency: INR', metadata={'source': 'K.G.F: Chapter 2', 'row': 0})

In [8]:
#installing necessary libraries, libmagic is used for file type detection
!conda install -c conda-forge libmagic -y
!pip install unstructured

Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 23.7.2
  latest version: 23.11.0

Please update conda by running

    $ conda update -n base -c defaults conda

Or to minimize the number of packages updated during conda update use

     conda install conda=23.11.0



# All requested packages already installed.



In [9]:
from langchain.document_loaders import UnstructuredURLLoader

# Fetch the listed web pages; one document is produced per URL.
loader = UnstructuredURLLoader(
    urls = [
        "https://www.moneycontrol.com/news/business/banks/hdfc-bank-re-appoints-sanmoy-chakrabarti-as-chief-risk-officer-11259771.html",
        "https://www.moneycontrol.com/news/business/markets/market-corrects-post-rbi-ups-inflation-forecast-icrr-bet-on-these-top-10-rate-sensitive-stocks-ideas-11142611.html"
    ]
)

data = loader.load()
# Number of documents loaded
len(data)

2

In [10]:
data[0]

Document(page_content='English\n\nHindi\n\nGujarati\n\nSpecials\n\nTrending Stocks\n\nTata Power\xa0INE245A01021, TATAPOWER, 500400\n\nTata Tech\xa0INE142M01025, TATATECH, 544028\n\nAdani Enterpris\xa0INE423A01024, ADANIENT, 512599\n\nAdani Power\xa0INE814H01011, ADANIPOWER, 533096\n\nSuzlon Energy\xa0INE040H01021, SUZLON, 532667\n\nCheck your Credit Score here!\n\nQuotes\n\nMutual Funds\n\nCommodities\n\nFutures & Options\n\nCurrency\n\nNews\n\nCryptocurrency\n\nForum\n\nNotices\n\nVideos\n\nGlossary\n\nAll\n\nHello, LoginHello, LoginLog-inor Sign-UpMy AccountMy Profile My PortfolioMy WatchlistMy Credit Score₹100 CashbackMy FeedMy MessagesMy AlertsMy Profile My PROMy PortfolioMy WatchlistMy Credit Score₹100 CashbackMy FeedMy MessagesMy AlertsLogoutChat with UsDownload AppFollow us on:\n\nPremium\n\nMy Feed\n\nMarketsHOMEINDIAN INDICESSTOCK ACTIONAll StatsTop GainersTop LosersOnly BuyersOnly Sellers52 Week High52 Week LowPrice ShockersVolume ShockersMost Active StocksGLOBAL MARKETSUS M

## Text Splitters
Why do we need text splitters in the first place?

LLMs have token limits. Hence we need to split text, which can be large, into smaller chunks so that each chunk fits within the token limit. LangChain provides several text splitter classes that allow us to do this.

In [11]:
# Taking some sample text from Wikipedia (the article on the film Interstellar)

text = """Interstellar is a 2014 epic science fiction film co-written, directed, and produced by Christopher Nolan. 
It stars Matthew McConaughey, Anne Hathaway, Jessica Chastain, Bill Irwin, Ellen Burstyn, Matt Damon, and Michael Caine. 
Set in a dystopian future where humanity is embroiled in a catastrophic blight and famine, the film follows a group of astronauts who travel through a wormhole near Saturn in search of a new home for humankind.

Brothers Christopher and Jonathan Nolan wrote the screenplay, which had its origins in a script Jonathan developed in 2007 and was originally set to be directed by Steven Spielberg. 
Kip Thorne, a Caltech theoretical physicist and 2017 Nobel laureate in Physics,[4] was an executive producer, acted as a scientific consultant, and wrote a tie-in book, The Science of Interstellar. 
Cinematographer Hoyte van Hoytema shot it on 35 mm movie film in the Panavision anamorphic format and IMAX 70 mm. Principal photography began in late 2013 and took place in Alberta, Iceland, and Los Angeles. 
Interstellar uses extensive practical and miniature effects, and the company Double Negative created additional digital effects.

Interstellar premiered in Los Angeles on October 26, 2014. In the United States, it was first released on film stock, expanding to venues using digital projectors. The film received generally positive reviews from critics and grossed over $677 million worldwide ($715 million after subsequent re-releases), making it the tenth-highest-grossing film of 2014. 
It has been praised by astronomers for its scientific accuracy and portrayal of theoretical astrophysics.[5][6][7] Interstellar was nominated for five awards at the 87th Academy Awards, winning Best Visual Effects, and received numerous other accolades."""

Manual approach of splitting the text into chunks

In [12]:
# Suppose the LLM limit is 100: the simplest thing we can do is keep only the
# first 100 characters of the text (characters stand in for tokens here).

text[:100]

'Interstellar is a 2014 epic science fiction film co-written, directed, and produced by Christopher N'

In [13]:
# The slice above cuts a word in half. To keep words whole across the entire
# text, we can split on single spaces with Python's str.split function.
# Only " " is a delimiter here, so newlines stay attached to the words.

words = text.split(" ")
len(words)

264

In [14]:
# Greedily pack whole words into chunks: keep appending words (each followed
# by a space) and close the current chunk as soon as it grows past 200
# characters. The length check runs after the word is added, so a chunk can
# end up slightly longer than 200 characters.
chunks = []

s = ""
for word in words:
    s += word + " "
    if len(s)>200:
        chunks.append(s)
        s = ""

# Flush the leftover words. Skip the flush when the last word happened to
# close a chunk, otherwise an empty string would be appended as a bogus
# final chunk.
if s:
    chunks.append(s)

In [15]:
chunks[:2]

['Interstellar is a 2014 epic science fiction film co-written, directed, and produced by Christopher Nolan. \nIt stars Matthew McConaughey, Anne Hathaway, Jessica Chastain, Bill Irwin, Ellen Burstyn, Matt ',
 'Damon, and Michael Caine. \nSet in a dystopian future where humanity is embroiled in a catastrophic blight and famine, the film follows a group of astronauts who travel through a wormhole near Saturn in ']

Splitting data into chunks can be done in native Python, but it is a tedious process. You may also need to experiment with various delimiters iteratively to ensure that no chunk exceeds the token limit of the respective LLM.

Langchain provides a better way through text splitter classes.

Using Text Splitter Classes from Langchain
## CharacterTextSplitter

In [16]:
from langchain.text_splitter import CharacterTextSplitter

# Split the text on newlines and build chunks targeting 200 characters, with
# no overlap between neighbouring chunks. A single line longer than
# chunk_size is kept whole, so chunks can still exceed 200.
splitter = CharacterTextSplitter(
    separator = "\n",
    chunk_size=200,
    chunk_overlap=0
)

In [17]:
# Split the raw string into chunks and count how many were produced
chunks = splitter.split_text(text)
len(chunks)

Created a chunk of size 210, which is longer than the specified 200
Created a chunk of size 208, which is longer than the specified 200
Created a chunk of size 358, which is longer than the specified 200


9

In [18]:
# Print the character count of every chunk, one per line
for size in map(len, chunks):
    print(size)

105
120
210
181
197
207
128
357
253


As you can see, although we gave 200 as the chunk size, the split was based on \n, so it ended up creating chunks that are bigger than 200.

Another class from LangChain can be used to recursively split the text based on a list of separators. This class is RecursiveCharacterTextSplitter. Let's see how it works.

## RecursiveCharacterTextSplitter

In [19]:
text

'Interstellar is a 2014 epic science fiction film co-written, directed, and produced by Christopher Nolan. \nIt stars Matthew McConaughey, Anne Hathaway, Jessica Chastain, Bill Irwin, Ellen Burstyn, Matt Damon, and Michael Caine. \nSet in a dystopian future where humanity is embroiled in a catastrophic blight and famine, the film follows a group of astronauts who travel through a wormhole near Saturn in search of a new home for humankind.\n\nBrothers Christopher and Jonathan Nolan wrote the screenplay, which had its origins in a script Jonathan developed in 2007 and was originally set to be directed by Steven Spielberg. \nKip Thorne, a Caltech theoretical physicist and 2017 Nobel laureate in Physics,[4] was an executive producer, acted as a scientific consultant, and wrote a tie-in book, The Science of Interstellar. \nCinematographer Hoyte van Hoytema shot it on 35 mm movie film in the Panavision anamorphic format and IMAX 70 mm. Principal photography began in late 2013 and took place 

In [20]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

r_splitter = RecursiveCharacterTextSplitter(
    separators = ["\n\n", "\n", " "],  # Separators tried in order, coarsest first (the library default is ["\n\n", "\n", " ", ""])
    chunk_size = 200,  # target maximum size of each chunk, measured by length_function
    chunk_overlap  = 0,  # size of the overlap between consecutive chunks, used to carry context across chunk boundaries
    length_function = len  # Function that measures chunk size; len counts characters, but any token counter can be passed instead
)

In [21]:
# Split the same text with the recursive splitter
chunks = r_splitter.split_text(text)

# Print the character count of each resulting chunk
for chunk in chunks:
    print(len(chunk))

105
120
199
10
181
197
198
8
128
191
165
198
54


Let's understand how exactly it formed these chunks

In [22]:
# Reproduce the first step by hand: split on the blank line ("\n\n") and take
# the first piece, i.e. the first paragraph of the text.
first_split = text.split("\n\n")[0]
first_split

'Interstellar is a 2014 epic science fiction film co-written, directed, and produced by Christopher Nolan. \nIt stars Matthew McConaughey, Anne Hathaway, Jessica Chastain, Bill Irwin, Ellen Burstyn, Matt Damon, and Michael Caine. \nSet in a dystopian future where humanity is embroiled in a catastrophic blight and famine, the film follows a group of astronauts who travel through a wormhole near Saturn in search of a new home for humankind.'

In [23]:
len(first_split)

439

Recursive text splitter uses a list of separators, i.e. separators = ["\n\n", "\n", " "]

So now it will first split using \n\n and then if the resulting chunk size is greater than the chunk_size parameter which is 200 in our case, then it will use the next separator which is \n

In [24]:
# Second step by hand: the first paragraph is still too long, so split it
# again on single newlines.
second_split = first_split.split("\n")
second_split

['Interstellar is a 2014 epic science fiction film co-written, directed, and produced by Christopher Nolan. ',
 'It stars Matthew McConaughey, Anne Hathaway, Jessica Chastain, Bill Irwin, Ellen Burstyn, Matt Damon, and Michael Caine. ',
 'Set in a dystopian future where humanity is embroiled in a catastrophic blight and famine, the film follows a group of astronauts who travel through a wormhole near Saturn in search of a new home for humankind.']

In [25]:
# Print the character count of each newline-separated piece, one per line
for size in map(len, second_split):
    print(size)

106
121
210


Third split exceeds chunk size 200. Now it will further try to split that using the third separator which is ' ' (space)

In [26]:
second_split[2]

'Set in a dystopian future where humanity is embroiled in a catastrophic blight and famine, the film follows a group of astronauts who travel through a wormhole near Saturn in search of a new home for humankind.'

## FAISS

In [27]:
#Install Packages
!pip install faiss-cpu
!pip install sentence-transformers

Collecting sentence-transformers
  Using cached sentence_transformers-2.2.2-py3-none-any.whl
Collecting transformers<5.0.0,>=4.6.0 (from sentence-transformers)
  Using cached transformers-4.35.2-py3-none-any.whl.metadata (123 kB)
Collecting torch>=1.6.0 (from sentence-transformers)
  Using cached torch-2.1.1-cp310-none-macosx_11_0_arm64.whl.metadata (25 kB)
Collecting torchvision (from sentence-transformers)
  Using cached torchvision-0.16.1-cp310-cp310-macosx_11_0_arm64.whl.metadata (6.6 kB)
Collecting scikit-learn (from sentence-transformers)
  Using cached scikit_learn-1.3.2-cp310-cp310-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting scipy (from sentence-transformers)
  Using cached scipy-1.11.4-cp310-cp310-macosx_12_0_arm64.whl.metadata (112 kB)
Collecting sentencepiece (from sentence-transformers)
  Using cached sentencepiece-0.1.99-cp310-cp310-macosx_11_0_arm64.whl (1.2 MB)
Collecting huggingface-hub>=0.4.0 (from sentence-transformers)
  Using cached huggingface_hub-0.19.4-py3-

In [28]:
# import necessary libraries
import pandas as pd
# Display up to 100 characters per column value when rendering DataFrames
pd.set_option('display.max_colwidth', 100)

In [29]:
# Load the sample sentences and check the (rows, columns) shape
df = pd.read_csv("sample_text.csv")
df.shape

(8, 2)

In [30]:
df

Unnamed: 0,text,category
0,Meditation and yoga can improve mental health,Health
1,"Fruits, whole grains and vegetables helps control blood pressure",Health
2,These are the latest fashion trends for this week,Fashion
3,Vibrant color jeans for male are becoming a trend,Fashion
4,The concert starts at 7 PM tonight,Event
5,Navaratri dandiya program at Expo center in Mumbai this october,Event
6,Exciting vacation destinations for your next trip,Travel
7,Maldives and Srilanka are gaining popularity in terms of low budget vacation places,Travel


In [31]:
from sentence_transformers import SentenceTransformer

# Load the "all-mpnet-base-v2" sentence-embedding model and encode every value
# of the `text` column; the result holds one embedding vector per row.
encoder = SentenceTransformer("all-mpnet-base-v2")
vectors = encoder.encode(df.text)
vectors.shape

  from .autonotebook import tqdm as notebook_tqdm


(8, 768)

In [32]:
# Embedding dimensionality = number of columns in the vectors matrix
dim = vectors.shape[1]
dim

768

In [33]:
import faiss
# Create an empty IndexFlatL2 sized for `dim`-dimensional vectors
index = faiss.IndexFlatL2(dim)
index

<faiss.swigfaiss.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x156299d40> >

In [34]:
index.add(vectors)

In [35]:
# Encode the query with the same encoder used for the indexed sentences
search_query = "I want to buy a polo t-shirt"
search_query_vectorized = encoder.encode(search_query)
print(search_query_vectorized.shape)

import numpy as np
# Turn the 1-D embedding into a single-row 2-D array (1, dim), i.e. a batch
# containing one query, before handing it to index.search.
search_query_vectorized = np.array(search_query_vectorized).reshape(1,-1)
print(search_query_vectorized.shape)

(768,)
(1, 768)


In [36]:
# Ask the index for the k=2 stored vectors closest to the query.
# `distances` holds their distances and `I` their positions in the index;
# both have one row per query vector.
distances, I = index.search(search_query_vectorized, k=2)
print(distances)
print(I)

[[1.3844838 1.4039098]]
[[3 2]]


In [37]:
# FAISS returns the positions of the matching vectors in insertion order,
# i.e. row positions of `df`, not index labels. Positional .iloc keeps the
# lookup correct even if `df` does not have the default 0..n-1 index.
df.iloc[I[0]]

Unnamed: 0,text,category
3,Vibrant color jeans for male are becoming a trend,Fashion
2,These are the latest fashion trends for this week,Fashion


## Retrieval

In [38]:
import os
import streamlit as st
import pickle
import time
import langchain
from langchain import OpenAI
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredURLLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS

In [39]:
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment
# (presumably the OpenAI API key used by the cells below; confirm against your .env).
load_dotenv()

True

In [40]:
# Initialise LLM with required params
# NOTE(review): temperature=0.9 is a high sampling temperature for factual
# question answering; consider lowering it if answers vary too much.
llm = OpenAI(temperature=0.9, max_tokens=500) 

In [41]:
# Load data: fetch the two news articles; one document is produced per URL
loaders = UnstructuredURLLoader(urls=[
    "https://www.moneycontrol.com/news/business/markets/wall-street-rises-as-tesla-soars-on-ai-optimism-11351111.html",
    "https://www.moneycontrol.com/news/business/tata-motors-launches-punch-icng-price-starts-at-rs-7-1-lakh-11098751.html"
])
data = loaders.load() 
# Number of documents loaded
len(data)

2

In [42]:
# Split data to create chunks of up to 1000 characters, with 200 characters of
# overlap between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
)

# `data` holds Document objects, so use split_documents (rather than split_text):
# the resulting chunks are Documents that keep the source metadata.
docs = text_splitter.split_documents(data)

In [43]:
len(docs)

43

In [44]:
docs[0]

Document(page_content='English\n\nHindi\n\nGujarati\n\nSpecials\n\nTrending Stocks\n\nTata Power\xa0INE245A01021, TATAPOWER, 500400\n\nTata Tech\xa0INE142M01025, TATATECH, 544028\n\nAdani Enterpris\xa0INE423A01024, ADANIENT, 512599\n\nAdani Power\xa0INE814H01011, ADANIPOWER, 533096\n\nSuzlon Energy\xa0INE040H01021, SUZLON, 532667\n\nCheck your Credit Score here!\n\nQuotes\n\nMutual Funds\n\nCommodities\n\nFutures & Options\n\nCurrency\n\nNews\n\nCryptocurrency\n\nForum\n\nNotices\n\nVideos\n\nGlossary\n\nAll\n\nHello, LoginHello, LoginLog-inor Sign-UpMy AccountMy Profile My PortfolioMy WatchlistMy Credit Score₹100 CashbackMy FeedMy MessagesMy AlertsMy Profile My PROMy PortfolioMy WatchlistMy Credit Score₹100 CashbackMy FeedMy MessagesMy AlertsLogoutChat with UsDownload AppFollow us on:\n\nPremium\n\nMy Feed', metadata={'source': 'https://www.moneycontrol.com/news/business/markets/wall-street-rises-as-tesla-soars-on-ai-optimism-11351111.html'})

In [45]:
!pip install tiktoken

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [46]:
from langchain.embeddings import OpenAIEmbeddings
import pickle

In [47]:
import os
# Explicitly disable tokenizers parallelism, as suggested by the
# huggingface/tokenizers fork warning, so the warning is not emitted.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [48]:
# Create embeddings for these chunks and save them to a FAISS index.
# Create the embeddings of the chunks using OpenAIEmbeddings.
embeddings = OpenAIEmbeddings()

# Pass the documents and the embeddings object in order to create the FAISS vector index
vectorindex_openai = FAISS.from_documents(docs, embeddings)

In [49]:
vectorindex_openai.index

<faiss.swigfaiss.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x2aa697f60> >

In [50]:
import os
import streamlit as st
import pickle
import time
import langchain
from langchain import OpenAI
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredURLLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS

# Storing the created vector index locally
# file_path="vector_index.pkl"
# with open(file_path, "wb") as f:
#     pickle.dump(vectorindex_openai, f)

# if os.path.exists(file_path):
#     with open(file_path, "rb") as f:
#         vectorIndex = pickle.load(f)

In [51]:
# Build the question-answering chain: it retrieves the chunks most similar to
# a given question from the FAISS index and calls the LLM to produce the
# final answer together with its sources.
chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=vectorindex_openai.as_retriever())
chain



In [52]:
query = "what is the price of Tiago iCNG?"
# query = "what are the main features of punch iCNG?"

# Turn on LangChain's debug mode so each chain step is traced in the output
langchain.debug=True

# Run the chain on the question; the result is a dict with 'answer' and 'sources'
chain({"question": query}, return_only_outputs=True)

[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQAWithSourcesChain] Entering Chain run with input:
[0m{
  "question": "what is the price of Tiago iCNG?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQAWithSourcesChain > 3:chain:MapReduceDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQAWithSourcesChain > 3:chain:MapReduceDocumentsChain > 4:chain:LLMChain] Entering Chain run with input:
[0m{
  "input_list": [
    {
      "context": "The company also said it has also introduced the twin-cylinder technology on its Tiago and Tigor models.\n\nThe Tiago iCNG is priced between Rs 6.55 lakh and Rs 8.1 lakh, while the Tigor iCNG comes at a price range of Rs 7.8 lakh to Rs 8.95 lakh.\n\nTata Motors Passenger Vehicles Ltd Head-Marketing, Vinay Pant said these introductions put together will make the company's CNG line up \"appealing, holistic, and stronger than ever\".\n\nPTI\n\nTags:\n\n#Business\n\n#Companies\n\nfi

{'answer': ' The Tiago iCNG is priced between Rs 6.55 lakh and Rs 8.1 lakh. \n',
 'sources': 'https://www.moneycontrol.com/news/business/tata-motors-launches-punch-icng-price-starts-at-rs-7-1-lakh-11098751.html'}