In [38]:
import openai
from openai import OpenAI
from tqdm import tqdm
import pickle
import pandas as pd
import numpy as np
import os
from dotenv import load_dotenv

load_dotenv()

client = OpenAI(api_key = os.getenv("OPENAI_API_KEY"))
top_most_similar_text_chunks = 5

In [39]:
!pip install pyarrow



In [40]:
df = pd.read_feather("text_chunks_and_embedding_dataframe.feather")
print(len(df))
df.head()

8843


Unnamed: 0,filename,text chunk,embedding
0,apidocs/api.python.langchain.com/en/latest/_mo...,\n\n\n\nOverview: module code ‚Äî ü¶úüîó LangChain ...,"[-0.038963668048381805, 0.060036975890398026, ..."
1,apidocs/api.python.langchain.com/en/latest/_mo...,.base\nlangchain.agents.chat.output_parser\nla...,"[0.013684544712305069, 0.05675216019153595, 0...."
2,apidocs/api.python.langchain.com/en/latest/_mo...,.langchain\nlangchain.callbacks.tracers.langch...,"[0.012743242084980011, 0.04998200014233589, 0...."
3,apidocs/api.python.langchain.com/en/latest/_mo...,chain.chains.openai_functions.qa_with_structur...,"[0.015767386183142662, 0.030865758657455444, 0..."
4,apidocs/api.python.langchain.com/en/latest/_mo...,_loaders.arxiv\nlangchain.document_loaders.ass...,"[0.02353993058204651, 0.05262213572859764, 0.0..."


In [41]:
def get_embedding(list_texts, model="text-embedding-3-small"):
    list_texts_replaced = [t.replace("\n", " ") for t in list_texts]
    embeddings = client.embeddings.create(input = list_texts_replaced, model=model).data
    return [x.embedding for x in embeddings]

In [42]:
def cosine_similarity(a, b):
    """Calculates cosine similarity between two vectors using NumPy."""
    dot_product = np.dot(a, b)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    return dot_product / (norm_a * norm_b)

In [43]:
def find_most_similar_text_chunks(search_term, df, top_most_similar_text_chunks = 5):
    embedding_search_term = get_embedding([search_term])[0]
    df['similarity'] = df["embedding"].apply(lambda x: cosine_similarity(x, embedding_search_term))
    df = df.sort_values(by='similarity', ascending=False)

    return (list(df["text chunk"])[:top_most_similar_text_chunks], list(df["similarity"])[:top_most_similar_text_chunks])

In [44]:
search_term = "How to create an app using LangChain?"
most_similar_text_chunks = find_most_similar_text_chunks(search_term, df)

In [45]:
most_similar_text_chunks

([' \n\n\n\nlangchain.tools.zapier.tool ‚Äî ü¶úüîó LangChain 0.0.305\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nAPI\n\n\nExperimental\n\n\nPython Docs\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nToggle Menu\n\n\n\nPrevUp\nNext\n\n\n\nLangChain 0.0.305\n\n\n\n\n\n\n\n\nSource code for langchain.tools.zapier.tool\n"""## Zapier Natural Language Actions API\n\\\nFull docs here: https://nla.zapier.com/start/\n\n**Zapier Natural Language Actions** gives you access to the 5k+ apps, 20k+ actions\non Zapier\'s platform through a natural language API interface.\n\nNLA supports apps like Gmail, Salesforce, Trello, Slack, Asana, HubSpot, Google Sheets,\nMicrosoft Teams, and thousands more apps: https://zapier.com/apps\n\nZapier NLA handles ALL the underlying API auth and translation from\nnatural language --> underlying API call --> return simplified output for LLMs\nThe key idea is you, or your users, expose a set of actions via an oauth-like setup\nwindow, which you can then query and e

In [46]:
df.head()

Unnamed: 0,filename,text chunk,embedding,similarity
0,apidocs/api.python.langchain.com/en/latest/_mo...,\n\n\n\nOverview: module code ‚Äî ü¶úüîó LangChain ...,"[-0.038963668048381805, 0.060036975890398026, ...",0.362716
1,apidocs/api.python.langchain.com/en/latest/_mo...,.base\nlangchain.agents.chat.output_parser\nla...,"[0.013684544712305069, 0.05675216019153595, 0....",0.356514
2,apidocs/api.python.langchain.com/en/latest/_mo...,.langchain\nlangchain.callbacks.tracers.langch...,"[0.012743242084980011, 0.04998200014233589, 0....",0.434078
3,apidocs/api.python.langchain.com/en/latest/_mo...,chain.chains.openai_functions.qa_with_structur...,"[0.015767386183142662, 0.030865758657455444, 0...",0.412369
4,apidocs/api.python.langchain.com/en/latest/_mo...,_loaders.arxiv\nlangchain.document_loaders.ass...,"[0.02353993058204651, 0.05262213572859764, 0.0...",0.331863


In [47]:
def get_completion(prompt, model="gpt-3.5-turbo"):
    """Generates a completion from ChatGPT given a prompt."""
    messages = [{"role": "user", "content": prompt}]
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0, # this is the degree of randomness of the model's output
    )
    return response.choices[0].message.content

In [48]:
prompt = search_term
response = get_completion(prompt)
print(response)

To create an app using LangChain, follow these steps:

1. Install LangChain SDK: Start by installing the LangChain SDK on your development environment. You can find the installation instructions on the LangChain website.

2. Set up your project: Create a new project in your preferred development environment (e.g. Android Studio, Xcode, etc.) and import the LangChain SDK into your project.

3. Initialize LangChain: Initialize the LangChain SDK in your app by providing your API key and other necessary configurations.

4. Implement language translation: Use the LangChain SDK to implement language translation functionality in your app. You can translate text, speech, or even images using the SDK.

5. Customize the user interface: Customize the user interface of your app to provide a seamless translation experience for your users. You can add language selection options, translation history, and other features to enhance the user experience.

6. Test your app: Test your app thoroughly to ens

In [49]:
prompt = search_term + " Please answer based on your knowledge and also by consulting the following documents: " + ", ".join(most_similar_text_chunks[0])

In [None]:
response = get_completion(prompt)
print(response)