In [1]:
from langchain import PromptTemplate
import os 

os.environ["HUGGINGFACEHUB_API_TOKEN"] = ""
OPENAI_KEY = ""

In [2]:
template = """Question: {question}

Answer: """

prompt = PromptTemplate(
    template=template,
    input_variables=['question']
)

# user question
question = "Which NFL team won the Super Bowl in the 2010 season?"

In [3]:
from langchain import HuggingFaceHub, LLMChain

# initialize Hub LLM
hub_llm = HuggingFaceHub(
    repo_id='google/flan-t5-xl',
    model_kwargs={'temperature':1e-10}
)

# create prompt template > LLM chain
llm_chain = LLMChain(
    prompt=prompt,
    llm=hub_llm
)

# ask the user question about NFL 2010
#print(llm_chain.run(question))

Trying to answer multiple questions

In [4]:
qs = [
    {'question': "Which NFL team won the Super Bowl in the 2010 season?"},
    {'question': "If I am 6 ft 4 inches, how tall am I in centimeters?"},
    {'question': "Who was the 12th person on the moon?"},
    {'question': "How many eyes does a blade of grass have?"}
]
#res = llm_chain.generate(qs)
#res

Try OpenAI models

In [5]:
from langchain.llms import OpenAI

# initialize the models
openai = OpenAI(
    model_name="text-davinci-003",
    openai_api_key=OPENAI_KEY
)

In [6]:
from langchain import PromptTemplate

template = """Answer the question based on the context below. If the
question cannot be answered using the information provided answer
with "I don't know".

Context: Large Language Models (LLMs) are the latest models used in NLP.
Their superior performance over smaller models has made them incredibly
useful for developers building NLP enabled applications. These models
can be accessed via Hugging Face's `transformers` library, via OpenAI
using the `openai` library, and via Cohere using the `cohere` library.

Question: {query}

Answer: """

prompt_template = PromptTemplate(
    input_variables=["query"],
    template=template
)

In [7]:
print(
    prompt_template.format(
        query="Which libraries and model providers offer LLMs?"
    )
)

Answer the question based on the context below. If the
question cannot be answered using the information provided answer
with "I don't know".

Context: Large Language Models (LLMs) are the latest models used in NLP.
Their superior performance over smaller models has made them incredibly
useful for developers building NLP enabled applications. These models
can be accessed via Hugging Face's `transformers` library, via OpenAI
using the `openai` library, and via Cohere using the `cohere` library.

Question: Which libraries and model providers offer LLMs?

Answer: 


In [8]:
# print(openai(
#     prompt_template.format(
#         query="Which libraries and model providers offer LLMs?"
#     )
# ))

## Few-Shot Learning

In [9]:
from langchain import FewShotPromptTemplate

# create our examples
examples = [
    {
        "query": "How are you?",
        "answer": "I can't complain but sometimes I still do."
    }, {
        "query": "What time is it?",
        "answer": "It's time to get a watch."
    }
]

# create a example template
example_template = """
User: {query}
AI: {answer}
"""

# create a prompt example from above template
example_prompt = PromptTemplate(
    input_variables=["query", "answer"],
    template=example_template
)

# now break our previous prompt into a prefix and suffix
# the prefix is our instructions
prefix = """The following are exerpts from conversations with an AI
assistant. The assistant is typically sarcastic and witty, producing
creative  and funny responses to the users questions. Here are some
examples: 
"""
# and the suffix our user input and output indicator
suffix = """
User: {query}
AI: """

# now create the few shot prompt template
few_shot_prompt_template = FewShotPromptTemplate(
    examples=examples,
    example_prompt=example_prompt,
    prefix=prefix,
    suffix=suffix,
    input_variables=["query"],
    example_separator="\n\n"
)

In [10]:
# openai(few_shot_prompt_template.format(query="How are you?"))

## Knowledge Base Stuff

In [11]:
from datasets import load_dataset

data = load_dataset("wikipedia", "20220301.simple", split='train[:10000]')
data

Found cached dataset wikipedia (/Users/marcjulianschwarz/.cache/huggingface/datasets/wikipedia/20220301.simple/2.0.0/aa542ed919df55cc5d3347f42dd4521d05ca68751f50dbc32bae2a7f1e167559)


Dataset({
    features: ['id', 'url', 'title', 'text'],
    num_rows: 10000
})

In [12]:
data[6]

{'id': '13',
 'url': 'https://simple.wikipedia.org/wiki/Alan%20Turing',
 'title': 'Alan Turing',
 'text': 'Alan Mathison Turing OBE FRS (London, 23 June 1912 – Wilmslow, Cheshire, 7 June 1954) was an English mathematician and computer scientist. He was born in Maida Vale, London.\n\nEarly life and family \nAlan Turing was born in Maida Vale, London on 23 June 1912. His father was part of a family of merchants from Scotland. His mother, Ethel Sara, was the daughter of an engineer.\n\nEducation \nTuring went to St. Michael\'s, a school at 20 Charles Road, St Leonards-on-sea, when he was five years old.\n"This is only a foretaste of what is to come, and only the shadow of what is going to be.” – Alan Turing.\n\nThe Stoney family were once prominent landlords, here in North Tipperary. His mother Ethel Sara Stoney (1881–1976) was daughter of Edward Waller Stoney (Borrisokane, North Tipperary) and Sarah Crawford (Cartron Abbey, Co. Longford); Protestant Anglo-Irish gentry.\n\nEducated in Dub

In [13]:
import tiktoken  # !pip install tiktoken

tokenizer = tiktoken.get_encoding('p50k_base')

In [14]:
# create the length function
def tiktoken_len(text):
    tokens = tokenizer.encode(
        text,
        disallowed_special=()
    )
    return len(tokens)

tiktoken_len("hello I am a chunk of text and using the tiktoken_len function "
             "we can find the length of this chunk of text in tokens")

28

In [15]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split text into chunks <= chunk_size
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=20,
    length_function=tiktoken_len,
    separators=["\n\n", "\n", " ", ""]
)

In [16]:
# use first three chunks of text
chunks = text_splitter.split_text(data[6]['text'])[:3]
chunks

['Alan Mathison Turing OBE FRS (London, 23 June 1912 – Wilmslow, Cheshire, 7 June 1954) was an English mathematician and computer scientist. He was born in Maida Vale, London.\n\nEarly life and family \nAlan Turing was born in Maida Vale, London on 23 June 1912. His father was part of a family of merchants from Scotland. His mother, Ethel Sara, was the daughter of an engineer.\n\nEducation \nTuring went to St. Michael\'s, a school at 20 Charles Road, St Leonards-on-sea, when he was five years old.\n"This is only a foretaste of what is to come, and only the shadow of what is going to be.” – Alan Turing.\n\nThe Stoney family were once prominent landlords, here in North Tipperary. His mother Ethel Sara Stoney (1881–1976) was daughter of Edward Waller Stoney (Borrisokane, North Tipperary) and Sarah Crawford (Cartron Abbey, Co. Longford); Protestant Anglo-Irish gentry.\n\nEducated in Dublin at Alexandra School and College; on October 1st 1907 she married Julius Mathison Turing, latter son o

In [17]:
# Check chunk lengths
tiktoken_len(chunks[0]), tiktoken_len(chunks[1]), tiktoken_len(chunks[2])

(397, 304, 399)

## ---- Load own Knowledge Base ----

In [40]:
data = []
for file in os.listdir('knowledge_base'):
    with open(os.path.join('knowledge_base', file)) as f:
        data.append(
            {
                'text': f.read(),
                'id': file
            }
        )

### Creating Embeddings using OpenAI model (maybe replace with BERT for costs)

In [31]:
from langchain.embeddings.openai import OpenAIEmbeddings

model_name = 'text-embedding-ada-002'

embed = OpenAIEmbeddings(
    openai_api_key=OPENAI_KEY
)

In [19]:
texts = [
    'this is the first chunk of text',
    'then another second chunk of text is here'
]

res = embed.embed_documents(texts)
len(res), len(res[0])

(2, 1536)

# Create Vector Database

In [20]:
import pinecone
PINECONE_KEY = ""
PINECONE_ENV = ""

In [36]:
index_name = 'langchain-retrieval-augmentation'
index_name = "own-knowledge-base"

pinecone.init(
        api_key=PINECONE_KEY,  # find api key in console at app.pinecone.io
        environment=PINECONE_ENV  # find next to api key in console
)

# we create a new index
pinecone.create_index(
        name=index_name,
        metric='dotproduct',
        dimension=len(res[0]) # 1536 dim of text-embedding-ada-002
)

### Connect to Vector Database Index

In [38]:
index = pinecone.Index(index_name)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

### Add vectors

In [41]:
from tqdm.auto import tqdm
from uuid import uuid4

batch_limit = 100

texts = []
metadatas = []

for i, record in enumerate(tqdm(data)):
    # first get metadata fields for this record
    metadata = {
        'id': str(record['id']),
    }

    # now we create chunks from the record text
    record_texts = text_splitter.split_text(record['text'])
    
    # create individual metadata dicts for each chunk
    record_metadatas = [{
        "chunk": j, "text": text, **metadata
    } for j, text in enumerate(record_texts)]
    
    # append these to current batches
    texts.extend(record_texts)
    metadatas.extend(record_metadatas)
    
    # if we have reached the batch_limit we can add texts
    if len(texts) >= batch_limit:
        ids = [str(uuid4()) for _ in range(len(texts))]
        embeds = embed.embed_documents(texts)
        index.upsert(vectors=zip(ids, embeds, metadatas))
        texts = []
        metadatas = []

  0%|          | 0/1712 [00:00<?, ?it/s]

Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised APIConnectionError: Error communicating with OpenAI: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')).


ApiException: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'content-type': 'application/json', 'date': 'Thu, 25 May 2023 06:08:24 GMT', 'x-envoy-upstream-service-time': '98', 'content-length': '113', 'server': 'envoy'})
HTTP response body: {"code":3,"message":"Number of provided vectors: 2210 exceeds the maximum amount per request: 1000","details":[]}


In [42]:
from langchain.vectorstores import Pinecone

text_field = "text"

# switch back to normal index for langchain
index = pinecone.Index(index_name)

vectorstore = Pinecone(
    index, embed.embed_query, text_field
)

In [89]:
query = "How can I perform the Shapiro Wilk Test for normality?"

vectorstore.similarity_search(
    query,  # our search query
    k=3  # return 3 most relevant docs
)

[Document(page_content='---\nmodule: "ISSP"\nchapter: "GOF Tests"\n---\n# Shapiro-Wilk Test of Normality\n\nIs a [[qq GOF Test for Class of Distributions]], in this case [[Normalverteilung|Normal Distributions]].\n\n## How to perform the test \n\nMake [[qq-Plot]] for any [[Normalverteilung|Normal Distribution]] against [[Standardnormalverteilung|Standard Normal Distribution]]. Use simple [[Linear Regression]] to fit a diagonal line to the plot and read off the slope $S$. \n\nCalculate [[Test Statistic]] \n$$T=\\frac{S^2}{s^2}$$\nwhere $s$ is the empirical [[Standardabweichung|standard deviation]] for the data sequence.\n\nGiven $H_{0}$ the test statistic will be close to $1$. You can quanitfy this by simulating the test statistic and determening its [[Wahrscheinlichkeitsmaß|distribution]]. You can calculate an [[Acceptance Domain]] with \n$$[z_{\\alpha / 2}, z_{1- \\alpha / 2}]$$\nwhere $\\alpha$ is the probability for [[Type 1 Error]].', metadata={'chunk': 0.0, 'id': 'Shapiro-Wilk Tes

In [90]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

# completion llm
llm = ChatOpenAI(
    openai_api_key=OPENAI_KEY,
    model_name='gpt-3.5-turbo',
    temperature=0.0
)

qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever()
)

In [91]:
qa.run(query)

'To perform the Shapiro-Wilk Test for normality, you need to follow these steps:\n\n1. Create a Q-Q plot for any normal distribution against the standard normal distribution.\n2. Use simple linear regression to fit a diagonal line to the plot and read off the slope S.\n3. Calculate the test statistic T using the formula T = S^2 / s^2, where s is the empirical standard deviation for the data sequence.\n4. Given H0, the test statistic will be close to 1. You can quantify this by simulating the test statistic and determining its distribution.\n5. Calculate an acceptance domain with [zα/2, z1-α/2], where α is the probability for Type 1 Error.\n\nNote that the Shapiro-Wilk Test is a goodness-of-fit test that can test whether the unknown distribution of a data sequence equals some distribution G. It is specifically used to determine whether a data sequence falls into the class of normal distributions.'