In [73]:
# importing warnings
import warnings
warnings.filterwarnings('ignore')

In [7]:
# Load variables from a .env file, then read the OpenAI key from the environment
# (the value is None when the variable is not set)
import os
from dotenv import load_dotenv

load_dotenv()
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

##### Text Loader through Langchain

In [9]:
# importing TextLoader from documents_loaders of the langchain
from langchain.document_loaders import TextLoader

In [16]:
# Point a TextLoader at the news file and load it.
# The loaded documents are kept in `text_docs` rather than being thrown away.
loader = TextLoader("files/nvda_news.txt")
text_docs = loader.load()

# Path the loader was created with (a notebook cell only displays its last expression)
loader.file_path

'files/nvda_news.txt'

##### CSV Loader through Langchain

In [15]:
# importing CSVLoader from document_loaders.csv of the langchain
from langchain.document_loaders.csv_loader import CSVLoader

In [38]:
# Load the CSV file through CSVLoader, using the "title" column as the source of each document
loader = CSVLoader(source_column="title", file_path="files/movies.csv")
data = loader.load()

# Inspect the first loaded document: its text content, then its metadata (displayed)
first_doc = data[0]
first_doc.page_content
first_doc.metadata

{'source': 'K.G.F: Chapter 2', 'row': 0}

##### UnstructuredURLLoader
- LangChain's UnstructuredURLLoader internally uses the `unstructured` Python library to load content from URLs
- installing necessary libraries, libmagic is used for file type detection
- !pip3 install unstructured libmagic python-magic python-magic-bin

In [40]:
# importing the UnstructuredURLLoader()
from langchain.document_loaders import UnstructuredURLLoader

In [41]:
# URLs of the news articles whose unstructured content we are going to retrieve
news_urls = [
    "https://www.moneycontrol.com/news/business/banks/hdfc-bank-re-appoints-sanmoy-chakrabarti-as-chief-risk-officer-11259771.html",
    "https://www.moneycontrol.com/news/business/markets/market-corrects-post-rbi-ups-inflation-forecast-icrr-bet-on-these-top-10-rate-sensitive-stocks-ideas-11142611.html",
]
loader = UnstructuredURLLoader(urls=news_urls)

In [44]:
# Run the loader over the configured URLs and keep the result
unstr_data = loader.load()

# Number of items that were loaded
len(unstr_data)

Document(page_content='English\n\nHindi\n\nGujarati\n\nSpecials\n\nMoneycontrol Trending Stock\n\nInfosys\xa0INE009A01021, INFY, 500209\n\nState Bank of India\xa0INE062A01020, SBIN, 500112\n\nYes Bank\xa0INE528G01027, YESBANK, 532648\n\nBank Nifty\n\nNifty 500\n\nQuotes\n\nMutual Funds\n\nCommodities\n\nFutures & Options\n\nCurrency\n\nNews\n\nCryptocurrency\n\nForum\n\nNotices\n\nVideos\n\nGlossary\n\nAll\n\nHello, Login Hello, LoginLog-inor Sign-UpMy AccountMy Profile My PortfolioMy WatchlistCredit Score₹100 Cash RewardMy FeedMy MessagesPrice AlertsMy Profile My PROMy PortfolioMy WatchlistCredit Score₹100 Cash RewardMy FeedMy MessagesPrice AlertsLogoutChat with UsDownload AppFollow us on:\n\nPremium\n\nAdvisory Alert!\n\nElection 2024MarketsHOMEINDIAN INDICESSTOCK ACTIONAll StatsTop GainersTop LosersOnly BuyersOnly Sellers52 Week High52 Week LowPrice ShockersVolume ShockersMost Active StocksGLOBAL MARKETSUS MARKETSBIG SHARK PORTFOLIOSSTOCK SCANNERECONOMIC CALENDARMARKET ACTIONDashboa

In [None]:
# Look at the first loaded item: its metadata, a 100-character preview, then the full text
first_item = unstr_data[0]
first_item.metadata
first_item.page_content[:100]
print(first_item.page_content)

##### Text Splitters :

##### Why do we need text splitters in first place?
LLMs have token limits, so we need to split the text into chunks that fit within those limits.
There are various text splitter classes in LangChain that allow us to do this.

In [48]:
# Sample passage (taken from Wikipedia) used as the input text for the splitter examples.
# It contains single newlines between sentences and one blank line between the two paragraphs.

text = '''Interstellar is a 2014 epic science fiction film co-written, directed, and produced by Christopher Nolan. 
It stars Matthew McConaughey, Anne Hathaway, Jessica Chastain, Bill Irwin, Ellen Burstyn, Matt Damon, and Michael Caine. 
Set in a dystopian future where humanity is embroiled in a catastrophic blight and famine, the film follows a group of astronauts who travel through a wormhole near Saturn in search of a new home for humankind.

Brothers Christopher and Jonathan Nolan wrote the screenplay, which had its origins in a script Jonathan developed in 2007 and was originally set to be directed by Steven Spielberg. 
Kip Thorne, a Caltech theoretical physicist and 2017 Nobel laureate in Physics,[4] was an executive producer, acted as a scientific consultant, and wrote a tie-in book, The Science of Interstellar. 
Cinematographer Hoyte van Hoytema shot it on 35 mm movie film in the Panavision anamorphic format and IMAX 70 mm. Principal photography began in late 2013 and took place in Alberta, Iceland, and Los Angeles. 
Interstellar uses extensive practical and miniature effects, and the company Double Negative'''

In [49]:
# importing CharacterTextSpl from text_splitter of the langchain
from langchain.text_splitter import CharacterTextSplitter

# Build a CharacterTextSplitter that splits on newlines
splitter = CharacterTextSplitter(
    chunk_overlap=100,  # overlap between neighbouring chunks
    chunk_size=300,     # size of each chunk
    separator="\n",     # where the text may be cut
)

In [56]:
# Split the sample text into chunks with split_text()
chunks = splitter.split_text(text)

# Length of the first chunk
first_chunk = chunks[0]
len(first_chunk)

227

In [None]:
# Print the length of every chunk, one per line
for size in map(len, chunks):
    print(size)

#### RecursiveTextSplitter

In [None]:
# Display the sample text that the recursive splitter is applied to
text


In [61]:
# importing RecursiveCharacterTextSplitter() from text_splitter of the langchain
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Configure a RecursiveCharacterTextSplitter
recur_splitter = RecursiveCharacterTextSplitter(
    chunk_size=250,        # size of each chunk created
    chunk_overlap=50,      # overlap between chunks, kept in order to maintain context
    length_function=len,   # size is measured as string length; any token counter can be passed instead
    separators=["\n\n", "\n", " "],  # separators to use, chosen based on requirement
)

In [None]:
# Split the sample text with the recursive splitter
chunks = recur_splitter.split_text(text)

# Print the length of every resulting chunk, one per line
for size in map(len, chunks):
    print(size)

Recursive text splitter uses a list of separators, i.e. separators = ["\n\n", "\n", " "].
Thus it will first split by '\n\n' and check whether each resulting chunk is larger than
the declared chunk_size. Any chunk that is still too large is split again with the next separator ('\n'),
and the same rule is followed for the remaining separators.

## FAISS (Facebook AI Similarity Search)

In [64]:
# pandas for tabular data; show at most 100 characters of each column value
import pandas as pd
pd.options.display.max_colwidth = 100

In [70]:
# Read the sample dataset with pandas (8 rows, 2 columns: text and category)
data_frame = pd.read_csv("files/sample_text.csv")

# The frame is small, so display it in full
data_frame

Unnamed: 0,text,category
0,Meditation and yoga can improve mental health,Health
1,"Fruits, whole grains and vegetables helps control blood pressure",Health
2,These are the latest fashion trends for this week,Fashion
3,Vibrant color jeans for male are becoming a trend,Fashion
4,The concert starts at 7 PM tonight,Event
5,Navaratri dandiya program at Expo center in Mumbai this october,Event
6,Exciting vacation destinations for your next trip,Travel
7,Maldives and Srilanka are gaining popularity in terms of low budget vacation places,Travel


Step 1 : Create source embeddings for the text column

In [71]:
# importing libraries
from sentence_transformers import SentenceTransformer

In [72]:
# Load the pretrained sentence-embedding model
encoder = SentenceTransformer("all-mpnet-base-v2")

# Encode the `text` column into one embedding per row.
# A plain list of strings is passed instead of the pandas Series, so the
# encoding does not depend on the DataFrame's index labels.
vectors = encoder.encode(data_frame["text"].tolist())

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [80]:
# `vectors` is 2-D: the first axis counts the vectors, the second is the embedding size.
# Unpack both at once so `dim` is never temporarily bound to the row count.
n_vectors, dim = vectors.shape

# Embedding dimensionality, used to size the FAISS index
dim

768

Step 2 : Build a FAISS Index for vectors

In [82]:
# importing faiss
import faiss

# Create a faiss.IndexFlatL2 index for vectors of dimension `dim`
index = faiss.IndexFlatL2(dim)

Step 3 : Add the source vectors to the index through the add() function (the IndexFlatL2 index uses L2 distance to measure similarity)

In [83]:
# Add the source vectors to the index so they can be searched
# NOTE(review): add() stores the vectors; it does not appear to normalize them - confirm against the FAISS docs
index.add(vectors)

Step 4 : Encode the search text using the same encoder and reshape the output vector into a 2-dimensional array

In [84]:
# Prompt used as the search query
search_query = "I want to buy a polo t-shirt"
# Alternative queries to try instead:
# search_query = "looking for places to visit during the holidays"
# search_query = "An apple a day keeps the doctor away"
# Embed the query with the same encoder that produced `vectors`
output_vec = encoder.encode(search_query)
output_vec.shape

(768,)

In [85]:
# numpy is used to turn the 1-dimensional query vector into a 2-dimensional, single-row array
import numpy as np
output_vec = np.reshape(np.array(output_vec), (1, -1))
output_vec.shape

(1, 768)

Step 5: Search for similar vector into the created index

In [89]:
# Search the index for the query vector; k=3 asks for the 3 most similar stored vectors.
# search() returns two arrays, bound here to `distances` and `I`;
# `I` is used in the next cell to pick the matching DataFrame rows.
distances, I = index.search(output_vec, k=3)
distances

array([[1.3844836, 1.4039094, 1.7325616]], dtype=float32)

In [92]:
# `I` holds one row of result ids per query vector. We searched with a single
# query, so take row 0 and convert it to a plain Python list.
row_indices = I[0].tolist()

# Display the ids (a notebook cell only shows its last expression)
row_indices

In [93]:
# Show the rows that best match the query.
# FAISS ids are positions of the vectors in the order they were added, so rows are
# selected by position (iloc) rather than by index label (loc).
# Ids of -1 (returned when fewer than k neighbours are found) are dropped so that
# iloc does not silently wrap around to the last row.
data_frame.iloc[[pos for pos in row_indices if pos >= 0]]

Unnamed: 0,text,category
3,Vibrant color jeans for male are becoming a trend,Fashion
2,These are the latest fashion trends for this week,Fashion
5,Navaratri dandiya program at Expo center in Mumbai this october,Event


#### Using Streamlit for the user interface

In [94]:
# importing the necessery libraries
import os
import streamlit as st
import pickle
import time
import langchain
from langchain import OpenAI
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredURLLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS

In [96]:
# Set up the OpenAI LLM with the parameters used in this notebook
llm = OpenAI(
    max_tokens=500,
    temperature=0.9,
)

In [97]:
# Build an UnstructuredURLLoader over two news articles and load them
article_urls = [
    "https://www.moneycontrol.com/news/business/markets/wall-street-rises-as-tesla-soars-on-ai-optimism-11351111.html",
    "https://www.moneycontrol.com/news/business/tata-motors-launches-punch-icng-price-starts-at-rs-7-1-lakh-11098751.html",
]
loaders = UnstructuredURLLoader(urls=article_urls)
data = loaders.load()

# Number of items loaded
len(data)

2

In [98]:
# Splitter used to cut the loaded data into chunks:
# a RecursiveCharacterTextSplitter with chunk size 1000 and overlap 200
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)


In [None]:
# `data` is already of document type, so split_documents() can be used directly
# (instead of split_text()) to get the chunks
docs = text_splitter.split_documents(data)

# Print the text of the first chunk
first_chunk_text = docs[0].page_content
print(first_chunk_text)

In [104]:
# Create the OpenAIEmbeddings object; it is passed to FAISS in the next cell
# to embed the chunks when the vector index is built
embeddings = OpenAIEmbeddings()

In [105]:
# Build the FAISS vector index from the chunked documents and the embeddings object
vectorindex_openai = FAISS.from_documents(documents=docs, embedding=embeddings)

In [None]:
# Persist the vector index that was just built to a local pickle file
# NOTE(review): the vector store may offer its own save/load helpers, which would
# avoid pickling the whole object - confirm for the installed LangChain version
file_path = "vector_index.pkl"
with open(file_path, mode="wb") as index_file:
    pickle.dump(vectorindex_openai, index_file)

In [None]:
# Reload the vector index from disk when the pickle file is present.
# Caution: pickle.load can execute arbitrary code, so only load files you created yourself.
# NOTE(review): when the file is missing, `vectorIndex` is left undefined.
if os.path.exists(file_path):
    with open(file_path, mode="rb") as index_file:
        vectorIndex = pickle.load(index_file)