In [1]:
# @title Notebook Settings

from IPython.display import HTML, display

def set_css(info=None):
  """Inject CSS so that <pre> output wraps instead of overflowing.

  Intended to be registered as an IPython ``pre_run_cell`` callback.

  Args:
    info: Ignored. IPython hands an execution-info object to
      ``pre_run_cell`` callbacks; accepting it (with a default) keeps the
      callback valid both when it is called with and without arguments.
  """
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
# Run set_css before every cell execution so the style is present in each output.
get_ipython().events.register('pre_run_cell', set_css)

In [2]:
# @title Installation
!pip install openai
!pip install -U sentence-transformers
!pip install PyPDF4
!pip install msoffice2pdf
!pip install pymupdf
!!pip install "git+https://github.com/hwchase17/langchain.git"
!pip install pinecone-client
!pip install trafilatura
!sudo apt install libreoffice -y

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting openai
  Downloading openai-0.26.4.tar.gz (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.6/55.6 KB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: openai
  Building wheel for openai (pyproject.toml) ... [?25l[?25hdone
  Created wheel for openai: filename=openai-0.26.4-py3-none-any.whl size=67744 sha256=1049bea5c29f40c466c1553ef85e993956479c1c7385b443dd1a94a054e20f4d
  Stored in directory: /root/.cache/pip/wheels/2b/d8/4e/268f029bd3277c1dd9e8781a0e0296e0a63822665bfa2429fc
Successfully built openai
Installing collected packages: openai
Successfully installed openai-0.26.4
Looking in index

In [3]:
# @title Importing libraries

import re
import os
import fitz
import json
import shutil
import PyPDF4
import openai
import requests
import pinecone
import itertools
import trafilatura
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from msoffice2pdf import convert
from urllib.parse import urljoin
from langchain.llms import OpenAI
from sentence_transformers import util
from langchain.agents import load_tools
from langchain.vectorstores import Pinecone
from langchain.agents import initialize_agent
from PyPDF4 import PdfFileReader, PdfFileWriter
from langchain.embeddings import OpenAIEmbeddings
from langchain import OpenAI, LLMChain, PromptTemplate
from langchain import PromptTemplate, OpenAI, LLMChain
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains.conversation.memory import ConversationalBufferWindowMemory
from langchain.chains.conversation.memory import ConversationBufferMemory, ConversationSummaryBufferMemory
from langchain.embeddings import CohereEmbeddings, OpenAIEmbeddings, HuggingFaceEmbeddings, TensorflowHubEmbeddings
# Mount google drive where PDF file is uploaded [PATH:'/content/drive/MyDrive/sample/']
from google.colab import drive
drive.mount('/content/drive')




Mounted at /content/drive


In [4]:
# @title Get all URLs list from the URL
def get_url_list(path, url_type:bool):
  """Collect the reachable links of one CSS class from a web page.

  Args:
    path: URL of the page to scan; relative hrefs are resolved against it.
    url_type: True selects anchors with class
      'dds__link dds__link--standalone removeunderline'; False selects
      anchors with class 'dds__link dds__link--standalone'.

  Returns:
    Sorted list of unique absolute URLs that contain no space and that
    answered a GET request with HTTP status 200.

  Raises:
    ValueError: if url_type is neither True nor False.
  """
  # Validate before any network access; previously an invalid url_type fell
  # through both branches and crashed later with a NameError on `data`.
  if url_type is True:
    link_class = 'dds__link dds__link--standalone removeunderline'
  elif url_type is False:
    link_class = 'dds__link dds__link--standalone'
  else:
    raise ValueError('Provide url_type type')

  page = requests.get(path, timeout=30)
  soup = BeautifulSoup(page.text, 'html.parser')

  # De-duplicate and filter first so each candidate is requested only once.
  candidates = set()
  for link in soup.find_all('a', {'class': link_class}):
    href = link.get('href')
    if not href:
      # Anchors without an href used to become the bogus link ".../None".
      continue
    full_link = urljoin(path, str(href))
    if ' ' not in full_link:
      candidates.add(full_link)

  links_list = []
  for full_link in sorted(candidates):
    try:
      r = requests.get(full_link, timeout=30)
    except requests.exceptions.RequestException as err:
      # One unreachable link should not abort the whole crawl.
      print(err)
      continue
    if r.status_code == 200:
      links_list.append(full_link)

  return links_list

In [5]:
#@title Text Extractor function that extracts text from specific Link

def extract_text_from_url(article_url: str):
  """Fetch article metadata for a URL from the tldrthis extract API.

  Relies on a module-level ``EXTRACT_API_KEY`` that must be defined before
  this function is called.

  Args:
    article_url: URL of the article to extract.

  Returns:
    The decoded JSON response as a dict, with runs of blank lines in
    ``article_text`` collapsed to a single blank line, or ``False`` when the
    request fails, the status is not 200, or the response body is malformed.
  """
  headers = {
      "accept": "application/json",
      "X-API-KEY": EXTRACT_API_KEY,
      "Content-Type": "application/json",
  }
  payload = json.dumps({"url": article_url})

  try:
    response = requests.post("https://extract.tldrthis.com/v1/extract-article/", headers=headers, data=payload, timeout=30)
    response.raise_for_status()
    if response.status_code != 200:
      # raise_for_status() only rejects 4xx/5xx; other codes used to fall
      # through and return None implicitly.
      print(f"Unexpected status code {response.status_code} for {article_url}")
      return False

    article_metadata = response.json()
    article_metadata["article_text"] = re.sub(r'\n\s*\n', '\n\n', article_metadata["article_text"])
    return article_metadata
  except requests.exceptions.RequestException as err:
    print(err)
    return False
  except (ValueError, KeyError, TypeError) as err:
    # Invalid JSON, or a payload without a usable "article_text" field.
    print(f"Malformed extraction response for {article_url}: {err}")
    return False

In [6]:
#@title Function to Extract Text from every link of Company Docs url list

def get_text_from_url_list(links_list):
  """Download each link and return the main text of the pages.

  Links whose URL contains "video" are skipped without being downloaded.
  Pages that cannot be fetched or that yield no text are skipped as well, so
  the result never contains None.

  Args:
    links_list: Iterable of URL strings.

  Returns:
    List of extracted page texts, in the order of the links that produced
    them.
  """
  webpages_text = []
  for link in links_list:
    if "video" in link:
      # Decide before fetching: the download was previously done and discarded.
      print(f'Skipping video link {link}')
      continue

    downloaded = trafilatura.fetch_url(link)
    extracted_text = trafilatura.extract(downloaded) if downloaded else None
    if not extracted_text:
      print(f'No text could be extracted from {link}')
      continue

    webpages_text.append(extracted_text)
    print(f'Text from {link} Extracted')

  return webpages_text

In [7]:
#@title Function to extract text from URLs (calls get_text_from_url_list to fetch each page's text). Only a slice of the discovered links is processed (currently all_links[1:3]).

def extract_text_from_links(url, start=1, stop=3):
  """Crawl a docs landing page one level deep and extract page texts.

  The first-level links of ``url`` are collected, then the links found on
  each of those pages. Only the slice ``[start:stop]`` of the combined list
  is downloaded, to limit how much text gets embedded.

  Args:
    url: Landing page to start from.
    start: Index of the first link to process (default 1).
    stop: Index one past the last link to process (default 3); None
      processes everything from ``start`` onwards.

  Returns:
    List of extracted texts as returned by get_text_from_url_list.
  """
  url_list = get_url_list(url, True)  # first-level docs links
  sub_links = list(itertools.chain.from_iterable(
      get_url_list(link, False) for link in url_list
  ))

  all_links = url_list + sub_links
  print(f'Total {len(all_links)} Links Found')

  selected_links = all_links[start:stop]
  return get_text_from_url_list(selected_links)

In [8]:
from PyPDF4.generic import encode_pdfdocencoding
#@title Function to Extract Text from the PDF

def extract_text_from_pdf(file_path):
  """Return the cleaned text of every page of a PDF.

  Args:
    file_path: Path of the PDF, passed to ``fitz.open``.

  Returns:
    List with one string per page. Non-breaking spaces, newlines,
    backslashes and the private-use glyphs U+F0D8, U+F071, U+F0A7 and
    U+F0FC are removed from each page.
  """
  # NOTE(review): deleting '\n' (rather than replacing it with a space) joins
  # the text of adjacent lines — confirm this is intended.
  removed_chars = str.maketrans('', '', '\xa0\n\\\uf0d8\uf071\uf0a7\uf0fc')

  pdf_document = fitz.open(file_path)
  try:
    return [
        pdf_document[page_number].get_text().translate(removed_chars)
        for page_number in range(pdf_document.page_count)
    ]
  finally:
    # Close the document even when reading a page raises.
    pdf_document.close()

In [9]:
#@title Function to extract text page by page from a PDF or DOCX file

def extract_text_by_pages(path, work_dir='/content/'):
  """Extract the text of a PDF or office document, one string per page.

  The file is copied into ``work_dir``; anything that is not a PDF is first
  converted to PDF with msoffice2pdf. The working copy and any generated PDF
  are removed afterwards, also when extraction fails.

  Args:
    path: Path of the source document.
    work_dir: Scratch directory for the copy and the converted PDF.

  Returns:
    List of stripped page texts.
  """
  file_name = os.path.basename(path)
  # splitext copes with dots elsewhere in the path; lower() accepts ".PDF".
  file_type = os.path.splitext(file_name)[1].lstrip('.').lower()

  new_path = os.path.join(work_dir, file_name)
  shutil.copy(path, new_path)
  print(f'File moved, PATH : {new_path}')

  generated_pdf = None
  try:
    if file_type != 'pdf':
      # NOTE(review): assumes convert() returns the path of the generated PDF — confirm.
      generated_pdf = convert(source=new_path, output_dir=work_dir, soft=0)
      print(generated_pdf)

    pages = extract_text_from_pdf(generated_pdf or new_path)
    return [page_text.strip() for page_text in pages]
  finally:
    # os.remove instead of `!rm {path}`: safe for names with spaces.
    for temp_path in (new_path, generated_pdf):
      if temp_path and os.path.exists(temp_path):
        os.remove(temp_path)

In [19]:
#@title Function to create and store embeddings into the Pinecone.

def store_embeddings(texts, index_name, creator="huggingface"):
  """Embed texts and upload them to a Pinecone index.

  The index is created when it does not exist yet. Pinecone credentials are
  read from the PINECONE_API_KEY and PINECONE_REGION environment variables.

  Args:
    texts: List of strings to embed and store.
    index_name: Name of the Pinecone index.
    creator: Embedding backend: "huggingface" (default), "tensorflow", or
      any other value for OpenAI. "cohere" is not configured.

  Returns:
    The vector store returned by ``Pinecone.from_texts``.

  Raises:
    NotImplementedError: if creator is "cohere".
  """
  if creator == "cohere":
    # This branch used to leave `embeddings` undefined and fail later with a
    # NameError, after Pinecone had already been contacted.
    raise NotImplementedError(
        "Cohere embeddings are not configured in this notebook; "
        "use 'huggingface', 'tensorflow' or 'openai'."
    )
  elif creator == "huggingface":
    embeddings = HuggingFaceEmbeddings()
  elif creator == "tensorflow":
    embeddings = TensorflowHubEmbeddings()
  else:
    embeddings = OpenAIEmbeddings()

  pinecone.init(api_key=os.environ["PINECONE_API_KEY"], environment=os.environ["PINECONE_REGION"])
  if index_name not in pinecone.list_indexes():
    # Measure the vector size instead of assuming 768, so the index matches
    # whichever embedding backend was selected.
    dimension = len(embeddings.embed_query("dimension probe"))
    pinecone.create_index(
        index_name,
        dimension=dimension,
        metric="cosine"
    )

  return Pinecone.from_texts(texts=texts, embedding=embeddings, index_name=index_name)

In [11]:
#@title Function to get text from the file or the docs url.

def text_embeddings(path, index_name):
  """Extract text from a docs URL or a PDF, chunk it and store the embeddings.

  Args:
    path: Either a URL starting with "http" (crawled with
      extract_text_from_links) or a PDF path (read with
      extract_text_from_pdf).
    index_name: Pinecone index the chunks are stored in.

  Returns:
    List of all text chunks that were produced.
  """
  if path.startswith("http"):
    texts = extract_text_from_links(path)
  else:
    texts = extract_text_from_pdf(path)

  # The splitter is stateless, so build it once rather than per page.
  text_splitter = CharacterTextSplitter(
    separator = "\n",
    chunk_size = 1500,
    chunk_overlap  = 200,
    length_function = len,
  )

  splitted_texts = []
  for text in texts:
    if not text:
      # Skip empty/None pages; str(None) would otherwise be embedded as "None".
      continue
    cleaned_text = str(text).replace("[", "").replace("]", "")
    splitted_texts.extend(text_splitter.split_text(cleaned_text))

  if splitted_texts:
    # One upload for all chunks avoids re-initialising Pinecone for every page.
    store_embeddings(splitted_texts, index_name)

  return splitted_texts

In [12]:
#@title Construct the prompt containing all the required information.

def construct_prompt(web_text, company_name):
  """Build the instruction prompt for the support bot.

  Args:
    web_text: Reference text placed under the SPECIFIC INFORMATION heading.
    company_name: Value placed on the trailing "Company Name:" line.

  Returns:
    The complete prompt as a single string.
  """
  # The literal is returned directly; its whitespace is part of the prompt.
  return f"""You are talking to Ayurveda SupportBot, a Chatbot! Ayurveda SupportBot is designed specifically for the enterprise environment, providing accurate and reliable information to employees and stakeholders. By using only the given SPECIFIC INFORMATION, Ayurveda SupportBot is able to provide answers to questions related to the company.\n\nRead the following SPECIFIC INFORMATION carefully, then answer the request based on the given SPECIFIC INFORMATION. Be sure to provide a lengthy, detailed response for each request. If the user asks for multiple requests, then answer all the requests. Do not mention 'SPECIFIC INFORMATION' words in the generated answer. Do not include any information outside the given SPECIFIC INFORMATION. Be sure that if related information regarding the Question is not found in SPECIFIC INFORMATION, then respond only with the below Format. Do not add any additional words in Format.\nFormat:\n'''\nSorry, I can't help. Reach out to our Customer Support Team.\n'''\n\nSPECIFIC INFORMATION:
  \n
  {web_text}
  ###

  Company Name: {company_name}
  """

In [13]:
#@title Gives the final answer (memory functionality is implemented)

def give_answer(docs, question, complex_answer = 0):
  """Answer a question from the retrieved document, then keep chatting.

  The first retrieved document (truncated to 7000 characters) is embedded
  in the prompt. After the first answer the user is prompted for follow-up
  questions until they type 'stop'; the last four exchanges are kept as
  conversation memory.

  Args:
    docs: Retrieved documents; only ``docs[0].page_content`` is used. An
      empty list results in an empty context.
    question: The initial question.
    complex_answer: Unused; kept so existing callers keep working.

  Returns:
    The last answer produced by the model.
  """
  context = str(docs[0].page_content)[:7000] if docs else ""
  context = context.replace("[", "").replace("]", "")
  # The context becomes part of a PromptTemplate, so literal braces in the
  # document must be doubled or they are parsed as template variables.
  context = context.replace("{", "{{").replace("}", "}}")

  template = construct_prompt(context, "Ayurveda")
  template += """
  {chat_history}
  Human: {question}
  AI:
  """
  prompt_template = PromptTemplate(input_variables=["chat_history","question"], template=template)
  memory = ConversationalBufferWindowMemory(k=4, memory_key="chat_history")
  llm_chain = LLMChain(
      llm=OpenAI(max_tokens=300),     # max_tokens caps the length of each generated answer
      prompt=prompt_template,
      verbose=True,
      memory=memory,
  )

  result = llm_chain.predict(question=question)
  print(result)

  while question.lower() != "stop":
    try:
      question = input("Enter your question to continue OR 'stop' to exit: ").strip()
    except EOFError:
      # No interactive stdin available (e.g. a headless run): end the chat.
      break
    if question.lower() == "stop":
      break
    if not question:
      # Do not spend an API call on an empty line.
      continue
    result = llm_chain.predict(question=question)
    print(result)

  return result

In [27]:
#@title A wrapper around functions.

class ChatBot():
  """Convenience wrapper: index a document once, then answer questions.

  Attributes are plain instance fields:
    index_name: Pinecone index that holds the embeddings.
    path: Source document path or docs URL.
    raw_texts: Text chunks returned by text_embeddings().
    embeddings: Vector-store handle, created lazily on the first query.
  """

  def __init__(self, path: str, index_name):
    """Extract, chunk and upload the text found at ``path``."""
    self.index_name = index_name
    self.path = path
    self.embeddings = None
    self.raw_texts = text_embeddings(self.path, self.index_name)

  def extract_answer(self, query):
    """Retrieve the closest stored chunk and answer ``query`` with it.

    Returns:
      The value returned by give_answer().
    """
    # Build the vector-store handle once; it was previously recreated
    # (including the embedding model) on every query.
    if getattr(self, "embeddings", None) is None:
      self.embeddings = Pinecone.from_existing_index(self.index_name, HuggingFaceEmbeddings())
    docs = self.embeddings.similarity_search(query, k=1)
    return give_answer(docs, query)

In [21]:
# Source document to index and the Pinecone index that stores its embeddings.
path = "/content/drive/MyDrive/sample/The_Magic_of_Ayurveda.pdf"  #@param {type:"string"}
index_name = "ayurved"  #@param {type:"string"}
# Constructing the bot calls text_embeddings(), i.e. the document is extracted, chunked and uploaded here.
cb = ChatBot(path, index_name)

In [28]:
#@title get answer from support bot
query = "What is Acne?"  #@param {type:"string"}
# Looks up the most similar stored chunk, then starts the interactive Q&A loop (type 'stop' to end it).
cb.extract_answer(query)

ApiException: ignored