In [1]:
from flask import Flask
from flask import request
import io
import urllib
import re, os
import requests, tiktoken
from PyPDF2.errors import PdfReadError
from urllib.error import URLError
from requests import ConnectionError
from http.client import InvalidURL
from requests.exceptions import MissingSchema
from requests.exceptions import ConnectionError
from zipfile import BadZipFile
from urllib.error import HTTPError
from pinecone import ApiException
from tenacity import retry
from tenacity import stop_after_delay
from tenacity import RetryError
from tenacity import stop_after_attempt
from tenacity import wait_exponential
from socketserver import ThreadingMixIn
from langchain.text_splitter import RecursiveCharacterTextSplitter
from bs4 import BeautifulSoup
from bs4.element import Comment
import openai
from PyPDF2 import PdfReader
import hashlib
from lxml import etree
from datetime import datetime
import docx2txt
import pinecone
import chardet
import math
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import TextFormatter

from dotenv import load_dotenv
load_dotenv()

# Service configuration, read from environment variables (load_dotenv() above
# is called first so values can come from a local .env file).
pinecone_key = os.getenv('pinecone_key')
pinecone_env = os.getenv('pinecone_env')
# NOTE: os.getenv returns None for an unset variable and int(None) raises
# TypeError, so both timeout variables must be defined before this cell runs.
max_timeout=int(os.getenv('max_timeout'))
openai_req_timeout=int(os.getenv('openai_req_timeout'))
pinecone_index_name = os.getenv('pinecone_index')
default_openai_key = os.getenv('default_openai_key')
browserless_token = os.getenv('browserless_token')  # secret: keep the value in the environment only, never in source or comments

  from tqdm.autonotebook import tqdm


In [15]:
answer

"As an AI, I don't have real-time data, but as of the information available up to 2021, Donald Trump's views on Muslims have been a topic of controversy. During his 2016 presidential campaign, he proposed a total ban on Muslims entering the United States, which was later revised to a ban on citizens from several predominantly Muslim countries. However, he also stated that he has great respect for Muslims, and his issue is with radical Islamic terrorism, not the religion as a whole. Please note that views can evolve over time, and for the most current information, it's best to refer to his most recent statements or official communications."

In [None]:


# Browser-like HTTP request headers.
# NOTE(review): nothing in the visible code passes `headers` to a request —
# confirm whether it is still needed.
headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7',
           "Accept-Language": "en-US,en"}

class ThreadedServer(ThreadingMixIn, Flask):
    """Flask application class with socketserver's ThreadingMixIn mixed in; adds no members of its own."""

tokenizer = tiktoken.get_encoding('cl100k_base')


def tiktoken_len(text):
    """Return how many cl100k_base tokens `text` encodes to.

    This is the length function handed to the RecursiveCharacterTextSplitter,
    so chunk sizes are measured in tokens. `disallowed_special=()` is passed
    through to the encoder unchanged.
    """
    return len(tokenizer.encode(text, disallowed_special=()))

# Recursive text splitter shared by the embedding endpoint and the scratch cells.
# Lengths are measured with tiktoken_len, i.e. in tokens rather than characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,  # maximum chunk length, as measured by length_function
    chunk_overlap=20,  # number of tokens overlap between chunks
    length_function=tiktoken_len,
    separators=['\n\n', '\n', ' ', '']  # tried in this order: paragraphs, lines, words, characters
)


@retry(wait=wait_exponential(multiplier=1, min=4, max=10), stop=stop_after_attempt(10))
def requests_post(url,payload):
    """POST `payload` as JSON to `url` with a 25 second timeout.

    Returns the `requests` response on success and False on any
    non-connection error. Connection-level failures are re-raised so that the
    tenacity decorator retries the call (up to 10 attempts with exponential
    back-off); once the attempts are exhausted tenacity raises RetryError.
    """
    try:
        return requests.post(url, json=payload, timeout=25)
    except (requests.exceptions.ConnectTimeout, requests.exceptions.ConnectionError):
        # The previous handler named ConnectTimeout / MaxRetryError, neither of
        # which is imported, so evaluating the except clause itself raised
        # NameError. requests reports exhausted urllib3 retries as
        # ConnectionError, so these two classes cover the intended cases.
        raise
    except Exception:
        return False

def tag_visible(element):
    """Tell whether a parsed text node should count as visible page text.

    A node is hidden when its parent tag is one of the non-content tags listed
    below, or when the node itself is an HTML comment.
    """
    hidden_parents = ('style', 'script', 'head', 'title', 'meta', '[document]',
                      'noscript', 'header', 'html', 'input')
    # The parent check comes first; the Comment check only runs for nodes whose parent is allowed.
    return element.parent.name not in hidden_parents and not isinstance(element, Comment)

 

# Drops repeated matches from a Pinecone query result.
def removeDuplicatesRef(data):
    """Return the items of `data` whose ['metadata']['content'] has not been seen earlier.

    The first occurrence of each content value is kept and the input order is
    preserved. `data` itself is not modified.
    """
    seen_contents = []
    unique_matches = []
    for match in data:
        content = match['metadata']['content']
        if content not in seen_contents:
            seen_contents.append(content)
            unique_matches.append(match)
    return unique_matches

@retry(wait=wait_exponential(multiplier=1, min=4, max=10), stop=stop_after_attempt(10))
def create_embedding(text,data):
    """Create a text-embedding-ada-002 embedding for `text`.

    Args:
        text: the string to embed.
        data: request payload; only data['openAIKey'] is read, and only when
            the first attempt fails with a connection/authentication error.

    Returns:
        (response, 'success') on success, otherwise (error, 'failed') where
        `error` is a message string or the caught exception.

    Raises:
        The "server is currently overloaded" error is re-raised so the tenacity
        decorator retries the call; when the attempts run out tenacity raises
        RetryError, which the callers catch.
    """
    # The exception classes were previously referenced by bare names that are
    # never imported (NameError); they live in openai.error in the pre-1.0 SDK
    # that openai.Embedding belongs to.
    key_errors = (openai.error.APIConnectionError, openai.error.AuthenticationError)
    try:
        response = openai.Embedding.create(
            model="text-embedding-ada-002",
            input=text,
            request_timeout=openai_req_timeout
        )
        return (response, 'success')
    except key_errors:
        try:
            # Retry once with the key supplied in the request payload.
            openai.api_key = data['openAIKey']
            response = openai.Embedding.create(
                model="text-embedding-ada-002",
                input=text,
                request_timeout=openai_req_timeout
            )
            return (response, 'success')
        except key_errors:
            return ('OpenAI API Key is not valid!','failed')
    except Exception as e:
        # `in e` on an exception object raises TypeError; compare against its text.
        if 'The server is currently overloaded with other requests' in str(e):
            raise
        return (e, 'failed')

@retry(stop=stop_after_delay(30))
def ask_question(messages,data):
    """Send `messages` to the chat-completion API and return the response.

    Args:
        messages: list of chat messages (role/content dicts).
        data: request payload; reads 'temperature', 'maxTokens', 'model' and,
            on the fallback attempt, 'openAIKey'.

    Returns:
        (response, 'success') on success, otherwise (error, 'failed') where
        `error` is a message string or the caught exception.

    Raises:
        The "server is currently overloaded" error is re-raised so the tenacity
        decorator retries for up to 30 seconds; afterwards tenacity raises
        RetryError, which the caller catches.
    """
    # These classes were previously referenced by bare, never-imported names
    # (NameError); they live in openai.error in the pre-1.0 SDK.
    key_errors = (openai.error.APIConnectionError, openai.error.AuthenticationError)
    try:
        response = openai.ChatCompletion.create(
            messages=messages,
            temperature=data['temperature'],
            frequency_penalty=1,
            max_tokens=data['maxTokens'],
            request_timeout=openai_req_timeout,
            model=data['model'],
        )
        return (response, 'success')
    except openai.error.InvalidRequestError:
        # NOTE(review): every invalid request is reported as a bad model name — confirm that is intended.
        return ("Model name is not valid!","failed")
    except key_errors:
        try:
            openai.api_key = data['openAIKey']
            # NOTE(review): the fallback uses fixed temperature=0.0 / max_tokens=800
            # instead of the values from `data` — kept as-is, confirm intent.
            response = openai.ChatCompletion.create(
                messages=messages,
                temperature=0.0,
                frequency_penalty=1,
                max_tokens=800,
                request_timeout=openai_req_timeout,
                model=data['model'],
            )
            return (response, 'success')
        except key_errors:
            return ('OpenAI API Key is not valid!','failed')
        except openai.error.InvalidRequestError:
            return ("Model name is not valid!","failed")
    except Exception as e:
        # `in e` on an exception object raises TypeError; compare against its text.
        if 'The server is currently overloaded with other requests' in str(e):
            raise
        return (e, 'failed')
        
@retry(wait=wait_exponential(multiplier=1, min=4, max=10), stop=stop_after_attempt(3))
def browserLessReq(data,options):
    """Fetch the visible text of data['webpage'].

    The page is first scraped through the browserless.io /scrape endpoint with
    `options` as the JSON body. When that call does not return HTTP 200, or
    reports an x-response-code of 403, the page is fetched directly with
    requests and its visible text is extracted with BeautifulSoup.

    Returns:
        (text, 'success') on success, otherwise (error, 'failed') where `error`
        is a message string or the caught exception.

    Raises:
        Errors mentioning net::ERR_NAME_NOT_RESOLVED are re-raised so the
        tenacity decorator retries (3 attempts); afterwards tenacity raises
        RetryError, which the caller catches.
    """
    try:
        # NOTE(review): this POST has no timeout, so a stalled connection blocks the worker.
        response = requests.post(
            f'https://chrome.browserless.io/scrape?token={browserless_token}&stealth&headless=false', json=options)
        try:
            # The header is only read when the status is 200 (short-circuit `or`).
            if str(response.status_code)!="200" or str(response.headers['x-response-code'])=='403':
                try:
                    response = requests.get(data['webpage'],timeout=max_timeout)
                except requests.exceptions.ReadTimeout:
                    return ('The webpage is not responding.','failed')
                if response.status_code == 200:
                    soup = BeautifulSoup(response.text, 'html.parser')
                    visible_text = ' '.join(element for element in soup.find_all(text=True) if tag_visible(element))
                    return (visible_text,'success')
                return (f'The webpage is giving {response.status_code} error.','failed')

            code = response.headers['x-response-code']
            if str(code) != '200':
                return (f'The x-response code is {code}','failed')
            return (response.json()['data'][0]['results'][0]['text'].replace('\n', ' '), 'success')
        except KeyError:
            # Missing x-response-code header or an unexpected JSON shape.
            return ('Webpage can’t be reached or invalid webpage','failed')
    except Exception as e:
        # `in e` on an exception object raises TypeError; compare against its text.
        if 'net::ERR_NAME_NOT_RESOLVED' in str(e):
            raise
        return (e, 'failed')
        
# Application object on which the route handlers below are registered.
app = ThreadedServer(__name__)

@app.route("/projects/embeddings/delete", methods=["POST"])
def embeddingDelete():
    """Delete every vector stored under the namespace named in the JSON body.

    Expects a JSON object with a non-empty string 'namespace'. Returns a
    (body, status) pair: 400 with an 'error' message for invalid input or a
    Pinecone failure, otherwise 200 with either a success message or the raw
    Pinecone result.
    """
    try:
        data = request.json
    except Exception:
        return ({"error":"No JSON object recieved!"}),400

    try:
        namespace = data['namespace']
        if (not namespace) or (namespace.strip()==''):
            return ({"error":'namespace is not valid!'}),400
    except KeyError:
        return ({"error":'namespace is missing!'}),400
    except AttributeError:
        return ({"error":'namespace must be string!'}),400
    except TypeError:
        # The body parsed as JSON but is not an object (e.g. null or a list);
        # this previously escaped as an unhandled error.
        return ({"error":"No JSON object recieved!"}),400

    pinecone.init(api_key=pinecone_key,environment=pinecone_env)
    index = pinecone.Index(pinecone_index_name)
    try:
        result=index.delete(deleteAll='true', namespace=namespace)
    except Exception as e:
        return ({"error":f"{e}"}),400
    if result=={}:
        return ({"message":"sucess"}),200
    return (result),200
    
    
@app.route("/projects/query", methods=["POST"])
def embeddingQuery():
    """Answer a question from the content stored in a Pinecone namespace.

    Expects a JSON object with 'query', 'namespace', 'model', 'openAIKey',
    'temperature' and 'maxTokens'; an optional string 'prompt' is sent to the
    model as an extra user message. The query is embedded, the 3 closest
    matches are fetched from Pinecone, and the chat model is asked to answer
    from that content only.

    Returns:
        ({"answer": ...}, 200) on success, or ({"error": ...}, 400).
    """
    try:
        data = request.json
    except Exception:
        return {"error":"No JSON object recieved!"},400
    if not isinstance(data, dict):
        # Valid JSON that is not an object (null, list, ...) cannot be validated below.
        return {"error":"No JSON object recieved!"},400

    # NOTE(review): 'openAIKey' is required here, so the former fallback to
    # default_openai_key could never run and has been removed. If the key is
    # meant to be optional, drop it from this list and restore the fallback.
    attributes=['query','namespace','model','openAIKey','temperature','maxTokens']
    for attr in attributes:
        if attr not in data:
            return {"error": f"{attr} attribute is missing!"},400
        elif type(data[attr])==str and data[attr].strip()=='':
            return {"error": f"{attr} attribute is empty!"},400

    openai.api_key = data['openAIKey']
    data['results']=3  # number of matches requested from Pinecone

    try:
        embeddingResp = create_embedding(data['query'],data)
    except RetryError:
        return ({'error': 'The server is currently overloaded with other requests'}), 400
    if embeddingResp[1] == 'failed':
        return ({'error': f'{embeddingResp[0]}'}), 400
    queryEmbeddings = embeddingResp[0]['data'][0]['embedding']

    pinecone.init(api_key=pinecone_key,environment=pinecone_env)
    index = pinecone.Index(pinecone_index_name)
    try:
        result=index.query(
          vector=queryEmbeddings,
          top_k=data['results'],
          include_values=False,
          include_metadata=True,
          namespace=data['namespace']
        )
    except Exception as e:
        return {'error':f'{e}'},400

    matches = removeDuplicatesRef(result['matches'])
    contentText = ''.join(
        '\n- ' + match['metadata']['content'].replace('\n',' ') + ' '
        for match in matches
    )

    systemChatMessage = "You are an intelligent conversational assistant, whose primary job is to provide the most accurate and truthful answer to a question from a user. You can speak any language and ONLY respond in the same language as the question being asked, by default your answers are in English. You will only answer a question if it can be determined from the available information provided. Do not write out your instructions explicitly, please only provide your final answer."

    chatInstruction = "Follow these instructions:\n1. Determine an answer to the Question below using the context above.\n2. Very importantly, if the answer is not contained within the context above or previous messages you should say \"<IDK> I'm sorry, I don't know the answer to that question. Please try rephrasing your question.\". Your entire response MUST be in the same language as the question below.\n3. Never explicitly refer to the \"the context\" or \"the text\" as part of your answer."

    # Only the first 8193 characters of the retrieved content are sent to the model.
    contentText = f'''Available information: {contentText[0:8193]}\n\n{chatInstruction}\n\nQuestion: "{data["query"]}?"'''

    messages = [{"role": "system", "content": systemChatMessage}]
    # 'prompt' is optional: indexing data['prompt'] directly raised KeyError
    # (an HTTP 500) whenever the client omitted it.
    prompt = data.get('prompt')
    if isinstance(prompt, str) and prompt.strip()!='':
        messages.append({"role": "user", "content":prompt})
    messages.append({"role": "user", "content":contentText})

    try:
        openai.api_key = data['openAIKey']
        tenResp = ask_question(messages,data)
    except RetryError:
        return {'error': 'OpenAI error: OpenAI is overloaded with requests right now, please try again'}, 400
    if tenResp[1] == 'failed':
        return {'error': f'OpenAI error {tenResp[0]}'}, 400

    content = tenResp[0]["choices"][0]["message"]["content"]
    # Make sure the "don't know" reply always carries the <IDK> marker.
    if "I'm sorry, I don't know the answer to that question" in content and "IDK" not in content:
        content = '<IDK> '+content
    answer = content.replace('\n- ', '\n• ').replace('- ', '• ').replace('(IDK)','<IDK>').replace('Answer: ', '')
    # Strip a leading "Language: ..." preamble if the model emitted one.
    if answer.startswith('Language:'):
        answer = re.sub('Language:.+\n\n', '', string=answer)
    elif answer.startswith('\n\nLanguage:'):
        answer = re.sub('\n\nLanguage:.+\n\n', '', string=answer)

    successMessage={
        "answer":answer
    }
    return successMessage,200
    

@app.route("/projects/embeddings/analyze-image", methods=["POST"])
def analyzeImage():
    """Validate an image-analysis request.

    Expects a JSON object with 'namespace', 'metadata' and 'img_url', where
    metadata['link'] is a non-empty string. Only validation is implemented:
    invalid input yields ({"error": ...}, 400) and a valid request yields an
    explicit 501, because a Flask view that returns None fails with a server
    error.
    """
    try:
        data = request.json
    except Exception:
        return {"error":"No JSON object recieved!"},400
    if not isinstance(data, dict):
        # Valid JSON that is not an object (null, list, ...) cannot be validated below.
        return {"error":"No JSON object recieved!"},400

    attributes=['namespace','metadata','img_url']
    for attr in attributes:
        if attr not in data:
            return {"error": f"{attr} attribute is missing!"},400
        elif type(data[attr])==str and data[attr].strip()=='':
            return {"error": f"{attr} attribute is empty!"},400
    try:
        if 'link' not in data['metadata']:
            return {"error": "metadata link attribute is missing!"},400
        link = data['metadata']['link']
        if type(link)!=str:
            # Also covers a null link, which is why no separate None check is needed.
            return {"error": "metadata link attribute must be string!"},400
        if link.strip()=='':
            return {"error": "metadata link attribute is empty or null!"},400
    except (AttributeError, TypeError):
        return {"error": "metadata link attribute must be string!"},400

    # NOTE(review): the analysis step itself was never written; replace this
    # response once it exists.
    return {"error": "Image analysis is not implemented yet!"},501
    
    

def _resolve_file_kind(file_url, declared_type):
    """Pick the parser ('pdf', 'doc' or 'txt') for a file, or None if undetermined.

    The extension of the URL path wins; when it is not recognisable (for
    example a signed URL without an extension) the client-declared fileType
    decides.
    """
    from urllib.parse import urlparse

    extension = os.path.splitext(urlparse(file_url).path.lower())[1]
    by_extension = {'.pdf': 'pdf', '.doc': 'doc', '.docx': 'doc', '.txt': 'txt'}
    if extension in by_extension:
        return by_extension[extension]
    for marker, kind in (('.pdf', 'pdf'), ('.doc', 'doc'), ('.txt', 'txt')):
        if marker in declared_type:
            return kind
    return None


def _decode_text_bytes(raw):
    """Decode bytes as strict UTF-8, then chardet's guess, then ISO-8859-1."""
    try:
        return raw.decode('utf-8')
    except UnicodeDecodeError:
        pass
    try:
        return raw.decode(chardet.detect(raw)['encoding'])
    except (UnicodeDecodeError, LookupError, TypeError):
        # chardet may report no encoding (None) or one Python cannot decode with.
        return raw.decode('ISO-8859-1')


def _extract_file_text(data):
    """Download data['fileURL'] and return (text, None) or (None, error_response)."""
    expired = ({"error":"Invalid file link. File may have expired."}), 400
    try:
        download = requests.get(data['fileURL'], timeout=max_timeout)
    except requests.exceptions.RequestException:
        return None, expired
    if download.status_code != 200:
        return None, expired

    kind = _resolve_file_kind(data['fileURL'], data.get('fileType', ''))
    if kind == 'pdf':
        try:
            reader = PdfReader(io.BytesIO(download.content))
        except PdfReadError:
            return None, (({"error":"PDF file not valid!"}), 400)
        text = ''.join(page.extract_text() or '' for page in reader.pages)
        if text == '':
            return None, (({"error":"Oops, we can't read this PDF, it might be scanned or empty. Please ensure the PDF has not been scanned and text can be selected from it."}), 400)
        return text, None
    if kind == 'doc':
        try:
            return docx2txt.process(io.BytesIO(download.content)), None
        except BadZipFile:
            return None, expired
    return _decode_text_bytes(download.content), None


def _youtube_video_id(url):
    """Extract the video id from a YouTube URL (watch?v=..., youtu.be/...)."""
    from urllib.parse import parse_qs, urlparse

    parsed = urlparse(url)
    ids = parse_qs(parsed.query).get('v')
    if ids:
        return ids[0]
    if parsed.netloc.lower().endswith('youtu.be'):
        return parsed.path.strip('/')
    return url.split('?v=')[-1]


def _extract_youtube_text(url):
    """Return (transcript_text, video_title) for a YouTube URL; raises on any failure."""
    transcript = YouTubeTranscriptApi.get_transcript(_youtube_video_id(url))
    text = TextFormatter().format_transcript(transcript).replace('\n',' ')
    page = requests.get(url, timeout=max_timeout)
    soup = BeautifulSoup(page.text, 'html.parser')
    if soup.title is None:
        raise ValueError('the video page has no title')
    title = soup.title.get_text().replace(' - YouTube','').strip()
    return text, title


@app.route("/projects/embeddings/create", methods=["POST"])
def embeddingCreate():
    """Chunk a text source, embed every chunk and upsert the vectors into Pinecone.

    Expects a JSON object with 'namespace', 'metadata' (an object copied into
    every vector's metadata) and exactly one source: 'text', 'fileURL' (with a
    'fileType' containing .pdf, .doc or .txt), 'webpage' or 'youtubeURL'.
    'openAIKey' is optional and falls back to default_openai_key.

    Returns:
        ({"index", "namespace", "total_chunks"}, 200) on success, or
        ({"error": ...}, 400).
    """
    try:
        data = request.json
    except Exception:
        return {"error":"No JSON object recieved!"},400
    if not isinstance(data, dict):
        # Valid JSON that is not an object (null, list, ...) cannot be validated below.
        return {"error":"No JSON object recieved!"},400

    for attr in ['namespace','metadata']:
        if attr not in data:
            return {"error": f"{attr} attribute is missing!"},400
        elif type(data[attr])==str and data[attr].strip()=='':
            return {"error": f"{attr} attribute is empty!"},400
    if not isinstance(data['metadata'], dict):
        # The metadata is copied and extended with a 'content' key for every chunk.
        return {"error": "metadata attribute must be an object!"},400

    source_attributes = ['text', 'fileURL', 'webpage', 'youtubeURL']
    present_attributes = [attr for attr in source_attributes if attr in data]
    if len(present_attributes) != 1:
        return ({"error": "Exactly one of text, fileURL, webpage, youtubeURL should be present!"}), 400
    source = present_attributes[0]

    if 'openAIKey' not in data:
        data['openAIKey'] = default_openai_key

    video_title = None
    if source == 'text':
        # The old check compared against 'fileType', so text input never set the content.
        pdfContent = data['text']

    elif source == 'fileURL':
        declared_type = data.get('fileType', '')
        if not isinstance(declared_type, str) or not any(
                marker in declared_type for marker in ('.pdf', '.doc', '.txt')):
            return ({"error": "Invalid fileType for fileURL! Use pdf or doc or txt."}), 400
        pdfContent, failure = _extract_file_text(data)
        if failure is not None:
            return failure

    elif source == 'webpage':
        options = {
            "url": data['webpage'],
            "elements": [
                {
                    "selector": "html"
                }
            ],
            "gotoOptions": {
                "timeout": 30000,
                "waitUntil": "networkidle2"
            }
        }
        try:
            browserReq = browserLessReq(data,options)
        except RetryError:
            return ({'error': 'The url is not responding!'}), 400
        if browserReq[1] == 'failed':
            return ({'error': f'{browserReq[0]}'}), 400
        pdfContent = browserReq[0]
        if pdfContent == '':
            return ({"error": "Webpage can’t be reached or empty!"}), 400

    else:
        try:
            pdfContent, video_title = _extract_youtube_text(data['youtubeURL'])
        except Exception as e:
            # Missing/disabled transcripts and network failures used to escape as HTTP 500.
            return ({"error": f"Could not read the YouTube video: {e}"}), 400

    splittedContent = [
        ' '.join(piece.split())
        for piece in text_splitter.split_text(pdfContent)
        if piece.strip() != ''
    ]
    # Drop chunks that are a lone full stop; this runs before the YouTube
    # prefix is added, otherwise such chunks could never match.
    splittedContent = [chunk for chunk in splittedContent if chunk != '.']
    if source == 'youtubeURL':
        splittedContent = [f'This is a Youtube video transcript. The title of the video is: {video_title}. Transcript Content:  '+chunk for chunk in splittedContent]

    openai.api_key = data['openAIKey']
    vectors = []
    for chunk in splittedContent:
        metadata = data['metadata'].copy()
        metadata['content'] = chunk
        # The id mixes the current time with the chunk prefix, so re-ingesting
        # the same content creates new vectors rather than overwriting.
        chunk_id = hashlib.sha256(
            (str(datetime.now()) + chunk[0:15]).encode()).hexdigest()
        try:
            embeddingResp = create_embedding(chunk,data)
        except RetryError:
            return ({'error': 'The server is currently overloaded with other requests'}), 400
        if embeddingResp[1] == 'failed':
            return ({'error': f'{embeddingResp[0]}'}), 400
        vectors.append((chunk_id, embeddingResp[0]['data'][0]['embedding'], metadata))

    pinecone.init(api_key=pinecone_key,environment=pinecone_env)
    index = pinecone.Index(pinecone_index_name)
    try:
        # Upsert in batches of 32 vectors.
        for start in range(0, len(vectors), 32):
            index.upsert(vectors[start:start + 32], namespace=data['namespace'])
    except Exception as e:
        return ({"error": f"{e}"}), 400

    finalResponse={
        "index":pinecone_index_name,
        "namespace":data["namespace"],
        "total_chunks":len(vectors)
    }
    return finalResponse,200

# Start Flask's built-in server on port 8006 when this code runs as the main module.
# NOTE(review): in a notebook kernel __name__ is also "__main__", so executing
# this cell blocks until the server is stopped and the cells below never run
# under "Run All" — confirm that is intended.
if __name__ == "__main__":
    app.run(port=8006,threaded=True)

In [None]:
# Scratch cell: ask a vision-capable chat model for a short description of one image URL.
image_url = "https://blogassets.leverageedu.com/media/uploads/2021/10/03192208/32c4b91dee1d2545fe8aecc3b60fa690.png"

# NOTE(review): the service code above calls openai.Embedding / openai.ChatCompletion,
# while this cell uses the `OpenAI` client class — confirm that the installed
# openai package provides both interfaces.
from openai import OpenAI

client = OpenAI(api_key=default_openai_key)

response = client.chat.completions.create(
  model="gpt-4-vision-preview",
  messages=[
    {
      "role": "user",
      "content": [
        {"type": "text", "text": "Give a breif description of the image?"},
        {
          "type": "image_url",
          "image_url": {
            "url": f"{image_url}",
          },
        },
      ],
    }
  ],
  max_tokens=300,  # upper bound on the length of the generated description
)

# Text of the first (only) returned choice.
text = (response.choices[0].message.content)



In [None]:
!pip install youtube-transcript-api

In [None]:
# Scratch cell: fetch the transcript and the title of one YouTube video.
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import TextFormatter
import requests
from bs4 import BeautifulSoup

url ='https://www.youtube.com/watch?v=TyrHqJdgU7w'
video_id = url.split('?v=')[-1]
transcript = YouTubeTranscriptApi.get_transcript(video_id)
formatter = TextFormatter()
# Transcript as a single line of text.
pdfContent = formatter.format_transcript(transcript).replace('\n',' ')

r = requests.get(url)
# Name the parser explicitly so the result does not depend on which parsers
# happen to be installed (and no "guessed parser" warning is emitted).
soup = BeautifulSoup(r.text, 'html.parser')

link = soup.find_all(name="title")[0]
# get_text() returns the tag's text directly, instead of stripping the
# <title> markup out of the serialized tag by string replacement.
title = link.get_text().replace(' - YouTube','').strip()

In [None]:
# Split the transcript into chunks and prefix each chunk with the video title.
splittedContent=text_splitter.split_text(pdfContent)
splittedContent = [f'This is a Youtube video transcript. The title of the video is: {title}. Transcript Content:  '+i for i in splittedContent]
# Remove the first chunk that is a lone full stop, if any. An explicit
# membership test replaces the bare `except`, which hid every other error too.
if '.' in splittedContent:
    splittedContent.remove('.')

In [None]:
splittedContent

In [None]:
title

In [None]:
# Scratch cell: load the same video through a YoutubeLoader.
# NOTE(review): YoutubeLoader is not imported in the notebook's import cell, so
# this cell raises NameError on a fresh kernel — add the import (presumably
# from langchain's document loaders; confirm) before relying on it.
loader = YoutubeLoader.from_youtube_url(
    'https://www.youtube.com/watch?v=TyrHqJdgU7w', add_video_info=True
)
content = loader.load()


In [None]:
content[0].page_content

In [None]:
# Read the title and publish date from the first loaded document's metadata.
video_title = content[0].metadata['title']
# presumably 'publish_date' is a date/datetime object (strftime is called on it) — TODO confirm
video_publish_date = content[0].metadata['publish_date'].strftime('%Y-%m-%d')

In [None]:
# Split the text into chunks, collapse runs of whitespace and drop blank chunks.
splittedContent=text_splitter.split_text(pdfContent)
splittedContent = [' '.join(i.split()) for i in splittedContent if i.strip() != '']

# Remove the first chunk that is a lone full stop, if any. An explicit
# membership test replaces the bare `except`, which hid every other error too.
if '.' in splittedContent:
    splittedContent.remove('.')

In [None]:
splittedContent

In [None]:
pdfContent

In [None]:
content = loader.load()

In [None]:
content[0].metadata['author']

In [None]:
content[0].metadata['author']

In [None]:
pip install youtube-transcript-api

In [None]:
pip install pytube

In [None]:
# Scratch cell: download a sample PDF with a mobile User-Agent and open it with PdfReader.
HEADERS = {'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'}
# NOTE: the HTTP status is not checked before parsing; an error page would be handed to PdfReader.
r = requests.get('https://www.osha.gov/sites/default/files/2019-03/fireprotection.pdf',headers=HEADERS)
f = io.BytesIO(r.content)     
reader = PdfReader(f)

In [None]:
# Concatenate the extracted text of every page of `reader`.
# len(reader.pages) is the public way to count pages; length_function is an
# internal attribute of the pages container.
totalPages = len(reader.pages)
pdfContent = ''.join(reader.pages[page].extract_text() for page in range(totalPages))

In [None]:
r.status_code

In [None]:
# Scratch cell: list up to 10000 stored vectors of one namespace by querying with an all-zero vector.
pinecone_index='index-educated-change-1'
pinecone.init(api_key=pinecone_key,environment=pinecone_env)
index = pinecone.Index(pinecone_index)  
return_list = []  # NOTE(review): never used in this cell — confirm it can be dropped

result = index.query(
    vector=[0] * 1536,  # presumably 1536 matches the index dimension — TODO confirm
    top_k=10000,
    include_values=False,
    include_metadata=True,
    # NOTE(review): this namespace embeds a personal email address; avoid committing it.
    namespace='owaisahmed142002@gmail.com alphabet-brain'
#     filter={'link':'https://en.wikipedia.org/wiki/B'}
)

In [None]:
# Scratch cell: ask a vision-capable chat model a question about one hosted image.
# NOTE(review): repeats the import and client construction of the earlier vision cell.
from openai import OpenAI
client =OpenAI(api_key=default_openai_key)  
response = client.chat.completions.create(
  model="gpt-4-vision-preview",
  messages=[
    {
      "role": "user",
      "content": [
        {"type": "text", "text": "Who is the employer?"},
        {
          "type": "image_url",
          "image_url": {
            "url" : "https://rehanis3.s3.us-east-2.amazonaws.com/listings/vision_1_20240107_001849.jpg", 
          },
        },
      ],
    }
  ],
)
# Display the text of the first returned choice.
response.choices[0].message.content

In [None]:
response