In [11]:
#from pandas_profiling import ProfileReport
import sqlite3
import pandas as pd
from collections import Counter
import re
import multiprocessing as mp
from langchain.document_loaders import BSHTMLLoader, DirectoryLoader
import os
import openai
import os
import re

import pinecone
from langchain.vectorstores import Pinecone


  from tqdm.autonotebook import tqdm


In [24]:
import os
from dotenv import load_dotenv
load_dotenv()
pinecone.init(
    api_key=os.getenv('PINECONE_API_KEY'),  
    environment=os.getenv('PINECONE_ENV')  
)



In [14]:
from langchain.llms import OpenAI
llm = OpenAI(model_name="text-ada-001")
llm("explain large language models in one sentence")

'\n\nLarge language models are models that can be solved in one language. This is done in order to improve the efficiency of the modeler, as well as to allow the model to be understood by more people.'

In [13]:
embed_model = "text-embedding-ada-002"

res = openai.Embedding.create(
    input=[
        "Sample document text goes here",
        "there will be several phrases in each batch"
    ], engine=embed_model
)

In [19]:
res.keys()

dict_keys(['object', 'data', 'model', 'usage'])

In [68]:
conn = sqlite3.connect('./all-the-news.db')
df = pd.read_sql_query("SELECT * FROM longform", conn)
#take only 10%
df = df.sample(frac=0.1, random_state=1)
conn.close()

In [69]:
titles = df['title'].tolist()
contents = df['content'].tolist()


In [70]:

# Umwandlung in Kleinbuchstaben und Entfernen von Sonderzeichen
def preprocess_text(text):
    if text is not None:
        text = text.lower()  # Konvertiere in Kleinbuchstaben
        text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Entferne Sonderzeichen
    return text

processed_titles = [preprocess_text(title) for title in titles]
processed_contents = [preprocess_text(content) for content in contents]


In [76]:
import pandas as pd
new_df = pd.DataFrame({'title': processed_titles, 'content': processed_contents})
print(new_df.size)
new_df = new_df.dropna()
print(new_df.size)


40828
38148


In [77]:
print(new_df.head())

                                               title  \
0  trump polls twitter about biased fox debate mo...   
1  espn is baseball squandering its chance to hel...   
2           can clinton make good on her opportunity   
3  good samaritan who saved trooper says god put ...   
4             review spoek mathambo mzansi beat code   

                                             content  
0  on tuesday afternoon donald trump realdonaldtr...  
1  in a long piece at espn jayson stark criticize...  
2   i want to receive updates from partners and s...  
3   cnna good samaritan being hailed as a hero in...  
4  note nprs first listen audio comes down after ...  


In [72]:
import tiktoken

tokenizer_name = tiktoken.encoding_for_model('gpt-4')
tokenizer_name.name

KeyError: 'Could not automatically map gpt-4 to a tokeniser. Please use `tiktok.get_encoding` to explicitly get the tokeniser you expect.'

In [73]:
tokenizer = tiktoken.get_encoding("cl100k_base")

# create the length function
#This code defines a function called `tiktoken_len` that takes a string `text` as input. Inside the function, the `tokenizer.encode()` method is used to tokenize the input text into a sequence of tokens. The `disallowed_special=()` argument specifies that no special tokens should be disallowed during tokenization. Finally, the function returns the length of the token sequence.
def tiktoken_len(text):
    tokens = tokenizer.encode(
        text,
        disallowed_special=()
    )
    return len(tokens)

In [74]:

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=20,
    length_function=tiktoken_len,
    separators=["\n\n", "\n", " ", ""]
)

In [78]:
from typing_extensions import Concatenate
from uuid import uuid4
from tqdm.auto import tqdm

chunks = []

# loop over DataFrame rows
for idx, row in tqdm(new_df.iterrows(), total=new_df.shape[0]):
    title = row['title']
    content = row['content']
    if len(content) > 100:
        texts = text_splitter.split_text(content)
        chunks.extend([{
            'id': str(uuid4()),
            'text': texts[i],
            'chunk': i,
        } for i in range(len(texts))])


100%|██████████| 19074/19074 [01:42<00:00, 185.97it/s]


In [79]:

chunks[1]['text']

'adds details on how attack was conducted comment from cisco\nexperts cairolondon four arab states refrained on wednesday from slapping further sanctions on qatar but voiced disappointment at its negative response to their demands and said their boycott of the tiny gulf nation would continue reuters is the news and media division of   thomson reuters is the worlds largest international multimedia news agency providing investing news world news business news technology news headline news small business news news alerts personal finance stock market and mutual funds information available on reuterscom video mobile and interactive television platforms learn more about thomson reuters products  all quotes delayed a minimum of 15 minutes   of exchanges and delays'

In [42]:
from uuid import uuid4
from tqdm.auto import tqdm

chunks = []

for idx in tqdm(range(len(processed_titles))):
    title = processed_titles[idx]
    content = processed_contents[idx]
    
    if content is not None and len(content) > 100:

        chunk = {
            'id': str(uuid4()),
            'text': content,
            'chunk': idx,
        }
        chunks.append(chunk)


100%|██████████| 204135/204135 [00:00<00:00, 426694.25it/s]


In [44]:
chunks[0]


{'id': '540b2e35-61c3-4504-b31d-dfd2f37758e4',
 'text': '      and never more so than in showtimes new series revival some spoilers ahead through episode 4 of season 3 of twin peaks on may 21st showtime brought back david lynchs groundbreaking tv series twin peaks and fulfilled a prophecy in the process in the second season finale back in 1991 the spirit of seriesdefining murder victim laura palmer told fbi special agent and series protagonist dale cooper ill see you again in 25 years that clip plays again in the first episode of lynchs twin peaks revival as a reminder that decades have in fact gone by lauras promise has been carried out and a series canceled midstory is back on the aira lot has changed in 25 years the original cast members who are mostly back on board have all aged heavily and visibly many of the characters have moved on in life getting new jobs forming families or taking up new obsessions but in the opening episode dale cooper was still where the show left him in 199

In [45]:
index_name = 'gpt-4-iteratec-docs'

In [53]:
import time

# check if index already exists (it shouldn't if this is first time)
if index_name not in pinecone.list_indexes():
    # if does not exist, create index
    pinecone.create_index(
        index_name,
        dimension=len(res['data'][0]['embedding']),
        metric='cosine'
    )
    # wait for index to be initialized
    time.sleep(1)


# view index stats
index.describe_index_stats()

NameError: name 'index' is not defined

In [19]:
# connect to index
index_name = "langchain-retrieval-augmentation"
index = pinecone.Index(index_name=index_name)
# view index stats
#index.describe_index_stats()

In [164]:
chunks[1]['text']

'adds details on how attack was conducted comment from cisco\nexperts cairolondon four arab states refrained on wednesday from slapping further sanctions on qatar but voiced disappointment at its negative response to their demands and said their boycott of the tiny gulf nation would continue reuters is the news and media division of   thomson reuters is the worlds largest international multimedia news agency providing investing news world news business news technology news headline news small business news news alerts personal finance stock market and mutual funds information available on reuterscom video mobile and interactive television platforms learn more about thomson reuters products  all quotes delayed a minimum of 15 minutes   of exchanges and delays'

In [80]:
openai.Embedding.create(input=chunks[1]['text'], engine=embed_model)


<OpenAIObject list at 0x2b92a6e5bd0> JSON: {
  "data": [
    {
      "embedding": [
        -0.022285690531134605,
        0.0018991068936884403,
        0.007312714122235775,
        -0.036542292684316635,
        0.0029488466680049896,
        -0.0006423449376598,
        -0.025803737342357635,
        -0.019590413197875023,
        -0.005032367072999477,
        -0.041450534015893936,
        0.02258359082043171,
        0.01740581914782524,
        -0.013242324814200401,
        0.0021509025245904922,
        -0.003815945703536272,
        -0.01730651967227459,
        0.021079909056425095,
        -0.006153035443276167,
        0.02530723810195923,
        -0.03410235419869423,
        0.01568935252726078,
        0.003312354441732168,
        -0.019306698814034462,
        -0.010142755694687366,
        0.012554319575428963,
        -0.004911788739264011,
        0.015604238957166672,
        -0.026456277817487717,
        0.0018264052923768759,
        -0.019689712673425674,
   

In [85]:
from tqdm.auto import tqdm
from time import sleep

batch_size = 100  # how many embeddings we create and insert at once

for i in tqdm(range(0, len(chunks), batch_size)):
    # find end of batch
    i_end = min(len(chunks), i+batch_size)
    meta_batch = chunks[i:i_end]
    # get ids
    ids_batch = [x['id'] for x in meta_batch]
    # get texts to encode
    texts = [x['text'] for x in meta_batch]
    # create embeddings (try-except added to avoid RateLimitError)
    try:
        res = openai.Embedding.create(input=texts, engine=embed_model)
    except Exception as e:
        print(e)
        done = False
        while not done:
            sleep(5)
            try:
                res = openai.Embedding.create(input=texts, engine=embed_model)
                done = True
            except:
                pass
    embeds = [record['embedding'] for record in res['data']]
    # cleanup metadata
    meta_batch = [{
        'text': x['text'],
        'chunk': x['chunk']
    } for x in meta_batch]
    to_upsert = list(zip(ids_batch, embeds, meta_batch))
    # upsert to Pinecone
    index.upsert(vectors=to_upsert)

 16%|█▋        | 77/470 [12:18<1:02:48,  9.59s/it]


KeyboardInterrupt: 

In [163]:
chunks[0]['text']

'on tuesday afternoon donald trump realdonaldtrump tweeted should i do the gopdebate with a twitter poll and a link to an instagram video of himself complaining about kelly of fox news which is owned by twentyfirst century fox inc   she is expected to moderate the thursday debate  megyn kellys really biased against me trump said in the video she knows that i know that everybody knows that do you really think she can be fair at a debate that same video garnered more than 190000 views on facebook in roughly an hour the poll which was embedded in trumps tweet showed a nearsplit between respondents who felt he should attend the debate 52 percent and those who felt he should skip it 48 percent overall the poll had received more than 11000 responses in the first two hours after it went live fox news appeared unfazed by trumps comments we learned from a secret back channel that the ayatollah and putin both intend to treat donald trump unfairly when they meet with him if he becomes president a

In [8]:
def create_embedding(query):
    return openai.Embedding.create(
        input=[query],
        engine=embed_model
    )

In [22]:
def query_pinecone_embedding(openai_embedding):
    # retrieve from Pinecone
    xq = openai_embedding['data'][0]['embedding']
    print(len(xq))
    # get relevant contexts (including the questions)
    return index.query(xq, top_k=5, include_metadata=True)

In [23]:
query = "What is pharmaceuticals international?"

embedding = create_embedding(query)
print(len(embedding))
result = query_pinecone_embedding(embedding)
result

4
1536


MaxRetryError: HTTPSConnectionPool(host='langchain-retrieval-augmentation-unknown.svc.us-west1-gcp.pinecone.io', port=443): Max retries exceeded with url: /query (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:1129)')))

In [9]:
def pinecone_to_list(res):
    # get list of retrieved texts
    contexts = [item['metadata']['text'] for item in res['matches']]
    return "\n\n---\n\n".join(contexts)+"\n\n-----\n\n"+query

pinecone_to_list(result)

NameError: name 'result' is not defined

In [4]:
def query_gpt(prompt, model="text-davinci-003"):
    return openai.Completion.create(
        model=model,
        prompt=prompt,
        max_tokens=150
    )



In [3]:
def query_gpt(prompt, message, model="gpt-3.5-turbo"):
    return openai.ChatCompletion.create(
        model=model,
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": message}
        ]
    )


In [1]:
prompt = f"""You are Q&A bot. A highly intelligent system that answers
user questions based on the information provided by the user above
each question. If the information can not be found in the information
provided by the user you truthfully say "I don't know".
"""

In [6]:
def search_docs(query, model="gpt-3.5-turbo"):
    embedding = create_embedding(query)
    result = query_pinecone_embedding(embedding)
    open_ai_message = pinecone_to_list(result)
    return query_gpt(prompt, open_ai_message, model=model)

In [21]:
query = "Where is Texas?"
# "How does iteratec help customers become digital champions?"
# "Who is the CEO of iteratec?"
# "How does iteratec talk about digital champions?
# "What is the iteratec mission?"
# "Which technologies does iteratec use?"

result = search_docs(query)

1536


MaxRetryError: HTTPSConnectionPool(host='langchain-retrieval-augmentation-unknown.svc.us-west1-gcp.pinecone.io', port=443): Max retries exceeded with url: /query (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:1129)')))

In [144]:
query

'Which Methodes does pharmaceuticals international use?'

result

In [149]:
result

<OpenAIObject chat.completion id=chatcmpl-7PuykdS7oCnSJzt8WYY1Hc9ylihUm at 0x2b924e9e0e0> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "message": {
        "content": "According to the information provided, Valeant Pharmaceuticals International Inc uses a network of specialty pharmacies to help drive up the pricing of several of its drugs.",
        "role": "assistant"
      }
    }
  ],
  "created": 1686411582,
  "id": "chatcmpl-7PuykdS7oCnSJzt8WYY1Hc9ylihUm",
  "model": "gpt-3.5-turbo-0301",
  "object": "chat.completion",
  "usage": {
    "completion_tokens": 29,
    "prompt_tokens": 2131,
    "total_tokens": 2160
  }
}

In [147]:
result

<OpenAIObject chat.completion id=chatcmpl-7PuyRvAT1ghuTdn0VCBMLuYqq7C2i at 0x2b924c864f0> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "message": {
        "content": "I don't know. The name Patrick Ries was not mentioned in the text provided.",
        "role": "assistant"
      }
    }
  ],
  "created": 1686411563,
  "id": "chatcmpl-7PuyRvAT1ghuTdn0VCBMLuYqq7C2i",
  "model": "gpt-3.5-turbo-0301",
  "object": "chat.completion",
  "usage": {
    "completion_tokens": 18,
    "prompt_tokens": 1919,
    "total_tokens": 1937
  }
}

In [150]:
from IPython.display import Markdown

display(Markdown(result['choices'][0]['message']['content']))

According to the information provided, Valeant Pharmaceuticals International Inc uses a network of specialty pharmacies to help drive up the pricing of several of its drugs.

In [152]:
res = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": prompt },
        {"role": "user", "content": query}
    ]
)
display(Markdown(res['choices'][0]['message']['content']))

I'm sorry, but I cannot answer your question without additional information about which "pharmaceuticals international" you are referring to. Could you please provide more details or context about the company you are interested in?