### Splitting & Embedding text using LangChain

In [4]:
import os
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(),override=True)

True

In [5]:
os.environ.get('PINECONE_API_KEY')

'de59d226-4577-481d-8957-f9f70b7e3640'

### Document Loaders in Langchain
- This module helps us to load any type of document into our model.
- [Here are examples](https://python.langchain.com/docs/integrations/document_loaders/)
- 

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
with open('./speech.txt') as f:
    gandhi_speech=f.read()
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    # look at 2 cells below and observe the overlapping printed text
    chunk_overlap=20,
    length_function=len
)

In [7]:
chunks = text_splitter.create_documents([gandhi_speech])
print(len(chunks))

170


In [24]:
len(chunks[0].page_content),chunks[0].page_content,chunks[1].page_content

(99,
 'Mahatma Gandhi was born on October 2, 1869, his full name was Mohandas Karamchand Gandhi who was an',
 'Gandhi who was an Indian lawyer,anti-nationalist, and political ethicist. Mahatma Gandhi was')

### Embedding Cost

In [8]:
def print_embedding_cost(texts):
    import tiktoken
    enc = tiktoken.encoding_for_model('text-embedding-ada-002')
    total_tokens = sum([len(enc.encode(page.page_content)) for page in texts])
    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens/1000 * 0.0004:.6f}')

print_embedding_cost(chunks)

Total Tokens: 3448
Embedding Cost in USD: 0.001379


In [9]:
from langchain.embeddings import OpenAIEmbeddings
# NOTE
# 1.No need to pass the api key as the parameter to OpenAIEmbeddings
#   function. It automatically grabs it from the env.
# 2.This class can be used to embed text to vector.
embeddings = OpenAIEmbeddings()

In [10]:
# Example of how a vector looks after it's embedded
vector = embeddings.embed_query(chunks[0].page_content)
#print(vector)

### Inserting the Embeddings into a Pinecone Index
- The above embedding can be stored in a csv file but for a larger
  data, that won't be feasible. Hence we use a database that is suited
  for storing the embeddings.
- Here comes Pinecone to the rescue.

In [11]:
import os
import pinecone
from langchain.vectorstores import Pinecone

pinecone.init(api_key=os.environ.get('PINECONE_API_KEY'),
             environment=os.environ.get('PINECONE_ENV'));

  from tqdm.autonotebook import tqdm


In [12]:
# deleting all indexex
indexes = pinecone.list_indexes()
for i in indexes:
    print('Deleting all indexes ... ', end='')
    pinecone.delete_index(i)
    print('Done')

Deleting all indexes ... Done


In [13]:
# Rules for pinecone index name 
# 'Index name must consist of lower case alphanumeric characters or '-''
#  And all lower case naming and must start and end with an alphanumeric
#  character.

# THis is an example of invalid index name
#index_name = 'Gandhi-speech'


index_name = 'gandhi-speech'
if index_name not in pinecone.list_indexes():
    print(f'Creating index {index_name} ...')
    pinecone.create_index(index_name,dimension=1536,metric='cosine')
    print('Done!')

Creating index gandhi-speech ...
Done!


- This is how the creation of index <ins>[gandhi-speech](https://app.pinecone.io/organizations/-NdN5ATjuVw_4j3LE5br/projects/gcp-starter:b51c29b/indexes/gandhi-speech/browser)</ins> looks like.
<img src='./images/index-creation-example.png' />

In [14]:
# We will updload the vectors to pinecone using langchain.
# Chunks is the list of text documents that has been obtained using the 
# module RecursiveCharacterTextSplitter

# The embeddings is responsible for converting text data into embeddings
# using OpenAIs embedding model.

# Index_name is the index name :) 

vector_store = Pinecone.from_documents(chunks, embeddings, index_name=index_name)

### Asking Questions (Similarity Search)
- [Remember this](./ChatGPT-PineCone-LLMs-POC.ipynb#How-Vector-DB-works)

In [15]:
query = 'What is the name of speaker?'
result = vector_store.similarity_search(query)
print(result)

[Document(page_content='Table Conference Speech (30th of November 1931)The round table conference speech was given by', metadata={}), Document(page_content='speech\xa0Mahatma Gandhi Famous SpeechMahatma Gandhi or Mohandas Karamchand Gandhi was an Indian', metadata={}), Document(page_content='1916)Dandi March Speech (11th of March 1930)Round Table Conference Speech (30th of November', metadata={}), Document(page_content='he was giving the speech.\xa0\xa0Mahatma Gandhi was sharply criticizing the overuse of the English', metadata={})]


In [16]:
for r in result:
    print(r.page_content)
    print('-' * 50)

Table Conference Speech (30th of November 1931)The round table conference speech was given by
--------------------------------------------------
speech Mahatma Gandhi Famous SpeechMahatma Gandhi or Mohandas Karamchand Gandhi was an Indian
--------------------------------------------------
1916)Dandi March Speech (11th of March 1930)Round Table Conference Speech (30th of November
--------------------------------------------------
he was giving the speech.  Mahatma Gandhi was sharply criticizing the overuse of the English
--------------------------------------------------


In [18]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

llm = ChatOpenAI(model='gpt-3.5-turbo',temperature=1)

retriever = vector_store.as_retriever(
            search_type='similarity'
            ,search_kwargs={'k':3}
            )

chain = RetrievalQA.from_chain_type(llm=llm,
     chain_type='stuff',
     retriever=retriever)

In [119]:
query = 'What is the main message of the speaker?'
answer = chain.run(query);
print(answer)
print(retriever.get_relevant_documents(query))

The main message of the speaker is the importance of non-violence and treating everyone with respect.
[Document(page_content='points but mostly in his speech he focused on the idea and the importance of non-violence. He said', metadata={}), Document(page_content='speech by saying the very famous slogan ‘’Do or die’’. The slogan was a message to every Indian', metadata={}), Document(page_content='people but to treat everyone with respect. Mahatma Gandhi finally concluded the speech by saying', metadata={})]


In [59]:
# README: 
# First run : pip install validators


import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import urlparse
import validators

# Replace this URL with the website you want to scrape
url = "https://incometaxindia.gov.in/pages/tutorials.aspx"

# Send an HTTP GET request to the URL
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content of the page using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the elements containing the article titles
    article_titles = soup.find_all('h3', class_='search_title')  # Adjust the selector as per the website's structure

    # Extract and print the titles
    for link in soup.select("a[href$='.pdf']"):
        url = link.get('href')
        a = urlparse(url)
        if validators.url(url):
            print(f'Downloading file {fileName}...')
            fileName = os.path.basename(a.path)
            response = requests.get(url)

            with open(fileName, "wb") as f:
                f.write(response.content)
else:
    print("Failed to retrieve the webpage")



/Documents/Not_7_2017_new.pdf
/Documents/vision-mission-values-2020-07012011.pdf
/Documents/departmental-directory.pdf
/Documents/taxpayer-charter.pdf
/Documents/citizen-charter-grievances-redressal.pdf
/Documents/aayakar-seva-kendra-contact-details.pdf
/Documents/Security-Certificate-IT-Website.pdf


In [110]:
# README: 
# First run : pip install validators
# pip install selenium


import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import urlparse
import validators
import re
from selenium import webdriver
from selenium.webdriver.common.by import By

# Replace this URL with the website you want to scrape
url = "https://incometaxindia.gov.in/pages/tutorials.aspx"

driver = webdriver.Chrome()
driver.get(url)
# driver.fullscreen_window()

# Pages on website
initial_page = 1
total_pages = 9

# This directory will be created for your downloads to keep it clean.
downloadDirName = "./TaxTutorials/"

while initial_page <= total_pages :
    response = requests.get(driver.current_url)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content of the page using BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')

        # All the pdf links are of type h3 and has a class 'search_title d-flex'
        elements = driver.find_elements(By.CSS_SELECTOR,"h3[class='search_title d-flex']")
        for element in elements:
            # For each such h3 element there exists only 1 anchor tag
            anchor = element.find_element(By.CSS_SELECTOR,"a")
            # The url for the pdf is inside this onclick attribute but in form of a JS function.
            anchor_url = anchor.get_attribute('onclick')
            # Cleaning out that JS code and extracting the pdf file link
            anchor_url = anchor_url.replace("javascript:OpenFormByType('","").replace("&k=');","")
            # Using urlparse get the filename for download purpose
            a = urlparse(anchor_url)
            # If the url is valid only then download the file
            if validators.url(anchor_url):
                fileName = os.path.basename(a.path)
                print(f'Downloading file {fileName}...')
                response = requests.get(anchor_url)
                
                # Create the download directory
                os.makedirs(downloadDirName, exist_ok=True)

                # Download the file to that directory
                with open(downloadDirName+fileName, "wb") as f:
                    f.write(response.content)
    # Increment the page counter
    initial_page +=1
    # Make selenium clikc the next button to navigate to next page
    driver.find_element(By.CSS_SELECTOR,"input[title='Next Page']").click()



Downloading file 81.tax-treatment-on-compulsory-acquisition-of-land.pdf...
Downloading file 80.deductions-or-allowances-allowed-to-salaried-employee.pdf...
Downloading file 79.computation-of-tax-for-individual.pdf...
Downloading file 78.calculation-of-taxable-salary-income.pdf...


KeyboardInterrupt: 

In [72]:
# pip install selenium