In [1]:
import os
import string
from getpass import getpass

import openai
import pandas as pd
from langchain.embeddings.openai import OpenAIEmbeddings

⚠️⚠️⚠️ **HEADS UP**: You need to get an OpenAI API key for the following to work.

                                 
How to get an OpenAI API Key: 
https://www.maisieai.com/help/how-to-get-an-openai-api-key-for-chatgpt

In [None]:
# Never hardcode the key in the notebook: anything saved here ends up in version
# control and in every shared copy. Use the key already present in the
# environment, and prompt for it (without echoing) only when it is missing.
# NOTE(review): a key-like literal used to be stored on this line — if it was
# ever a live key, revoke it.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ").strip()

OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

In [3]:
# Single embedding client shared by every cell below.
# NOTE(review): the model id names a "doc" model, yet the same client is also
# used to embed search queries further down — confirm this is intended and that
# the model is still offered by the API.
EMBEDDING_MODEL = "text-search-ada-doc-001"
embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL)

## Let's try embedding a single string

Here's a great introduction to embeddings: 

https://www.youtube.com/watch?v=A8HEPBdKVMA

In [4]:
# Embed one short string; the result is inspected in the next cells
text = "Hello world"
query_result = embeddings.embed_query(text)

In [5]:
# The embedding is a long list of floats, so show only the first 10 values
print(query_result[:10])

[-3.692254236655495e-05, 0.04646899626594531, -0.013200964061653775, 0.0028441711790668364, 0.0019094788313712397, 0.01489493740689739, 0.011226331997380481, -0.04338175501323181, 0.05212225477080107, -0.03626506089435034]


In [6]:
# Number of dimensions in the embedding vector (1024 in the recorded run)
len(query_result)

1024

## Let's take a catalog and try to create embeddings for each entry

In [7]:
# Load the processor catalog from the CSV file next to the notebook and display it
df_processors = pd.read_csv("./processors.csv")
df_processors

Unnamed: 0,Microprocessor,Description
0,Intel 8086,"The first 16-bit microprocessor, introduced in..."
1,Intel 80386 (i386),"A 32-bit microprocessor, part of the x86 famil..."
2,Intel Pentium,A highly successful microprocessor family know...
3,AMD Ryzen,Modern high-performance processors known for t...
4,ARM Cortex-A series,A family of processors designed for mobile dev...
5,IBM POWER9,Used in high-performance computing and data ce...
6,Apple M1,"Apple's ARM-based processor, known for its per..."
7,Qualcomm Snapdragon,"Found in many Android devices, known for its i..."
8,Raspberry Pi (ARM),"A low-cost, single-board computer with ARM-bas..."
9,NVIDIA Tegra,"Used in mobile devices and embedded systems, k..."


First we will combine each processor's name (the `Microprocessor` column, acting as the title) and its `Description` into a single string of the format `"Title: Description"`. 

We will then create embeddings for this string.


In [8]:
# Build the text that will be embedded: "<Microprocessor>: <Description>"
processor_names = df_processors["Microprocessor"]
processor_descriptions = df_processors["Description"]
df_processors["summary"] = processor_names.str.cat(processor_descriptions, sep=": ")
df_processors

Unnamed: 0,Microprocessor,Description,summary
0,Intel 8086,"The first 16-bit microprocessor, introduced in...","Intel 8086: The first 16-bit microprocessor, i..."
1,Intel 80386 (i386),"A 32-bit microprocessor, part of the x86 famil...","Intel 80386 (i386): A 32-bit microprocessor, p..."
2,Intel Pentium,A highly successful microprocessor family know...,Intel Pentium: A highly successful microproces...
3,AMD Ryzen,Modern high-performance processors known for t...,AMD Ryzen: Modern high-performance processors ...
4,ARM Cortex-A series,A family of processors designed for mobile dev...,ARM Cortex-A series: A family of processors de...
5,IBM POWER9,Used in high-performance computing and data ce...,IBM POWER9: Used in high-performance computing...
6,Apple M1,"Apple's ARM-based processor, known for its per...","Apple M1: Apple's ARM-based processor, known f..."
7,Qualcomm Snapdragon,"Found in many Android devices, known for its i...",Qualcomm Snapdragon: Found in many Android dev...
8,Raspberry Pi (ARM),"A low-cost, single-board computer with ARM-bas...","Raspberry Pi (ARM): A low-cost, single-board c..."
9,NVIDIA Tegra,"Used in mobile devices and embedded systems, k...",NVIDIA Tegra: Used in mobile devices and embed...


## Compute an embedding for each "summary" text we created

In [9]:
# Embed all summaries with one batched embed_documents call instead of issuing
# a separate embed_query request per row.
summary_vectors = embeddings.embed_documents(df_processors["summary"].tolist())

# Wrap in a Series keyed by the frame's index so each vector lands on its own row
# as a single list-valued cell.
df_processors["embedding"] = pd.Series(summary_vectors, index=df_processors.index)
df_processors

Unnamed: 0,Microprocessor,Description,summary,embedding
0,Intel 8086,"The first 16-bit microprocessor, introduced in...","Intel 8086: The first 16-bit microprocessor, i...","[-0.012585897130804023, 0.023532795035589537, ..."
1,Intel 80386 (i386),"A 32-bit microprocessor, part of the x86 famil...","Intel 80386 (i386): A 32-bit microprocessor, p...","[-6.49205707325187e-05, 0.026906083312139274, ..."
2,Intel Pentium,A highly successful microprocessor family know...,Intel Pentium: A highly successful microproces...,"[-0.007438277610880901, 0.004308385184397193, ..."
3,AMD Ryzen,Modern high-performance processors known for t...,AMD Ryzen: Modern high-performance processors ...,"[-0.0036651403856808123, 0.011632280492034271,..."
4,ARM Cortex-A series,A family of processors designed for mobile dev...,ARM Cortex-A series: A family of processors de...,"[-0.026204830037545337, 0.013215189800663944, ..."
5,IBM POWER9,Used in high-performance computing and data ce...,IBM POWER9: Used in high-performance computing...,"[0.004770409163829388, 0.0011287585006871457, ..."
6,Apple M1,"Apple's ARM-based processor, known for its per...","Apple M1: Apple's ARM-based processor, known f...","[-0.03232243370008354, 0.015615646208767934, 0..."
7,Qualcomm Snapdragon,"Found in many Android devices, known for its i...",Qualcomm Snapdragon: Found in many Android dev...,"[-0.011854934040482542, 0.023709868080965084, ..."
8,Raspberry Pi (ARM),"A low-cost, single-board computer with ARM-bas...","Raspberry Pi (ARM): A low-cost, single-board c...","[-0.014947313203693384, 0.02705044374934957, 0..."
9,NVIDIA Tegra,"Used in mobile devices and embedded systems, k...",NVIDIA Tegra: Used in mobile devices and embed...,"[-0.016137256936813425, 0.009999975387625605, ..."


In [10]:
## Now we write some functions to help with searching

In [11]:
import numpy as np
from numpy.linalg import norm

In [12]:
def cosine(a, b):
    """Return the cosine similarity between two vectors.

    Args:
        a: First vector (any sequence of numbers or NumPy array).
        b: Second vector, same length as ``a``.

    Returns:
        The dot product of ``a`` and ``b`` divided by the product of their
        Euclidean norms, or ``0.0`` when either vector has zero norm.
    """
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    denominator = norm(a) * norm(b)
    if denominator == 0:
        # A zero vector has no direction; dividing would produce NaN plus a
        # runtime warning, so report "no similarity" instead.
        return 0.0
    return np.dot(a, b) / denominator

In [13]:
def search_with_embeddings(q, df, top_k=3):
    """Rank the rows of ``df`` by cosine similarity to the embedded query.

    Args:
        q: Free-text query. It is lower-cased before being embedded.
        df: DataFrame with 'Microprocessor', 'Description' and 'embedding'
            columns. It is not modified.
        top_k: Maximum number of rows to return. Defaults to 3.

    Returns:
        A DataFrame with the columns 'Microprocessor', 'Description' and
        'similarities', sorted from most to least similar.
    """
    # NOTE(review): the query is lower-cased here, while the summaries are
    # embedded without lower-casing elsewhere in this notebook — confirm the
    # asymmetry is intended.
    query_vector = embeddings.embed_query(q.lower())
    scores = df["embedding"].apply(lambda doc_vector: cosine(query_vector, doc_vector))

    # assign() builds a new frame, so the caller's DataFrame is left untouched.
    ranked = df.assign(similarities=scores).sort_values("similarities", ascending=False)
    return ranked.head(top_k)[["Microprocessor", "Description", "similarities"]]

In [14]:
def search_with_common_words(q, df, top_k=3):
    """Rank the rows of ``df`` by how many distinct words they share with the query.

    Words are compared case-insensitively and with surrounding punctuation
    removed, so a query for "8086" matches the summary token "8086:".

    Args:
        q: Free-text query.
        df: DataFrame with 'Microprocessor', 'Description' and 'summary'
            columns. It is not modified.
        top_k: Maximum number of rows to return. Defaults to 3.

    Returns:
        A DataFrame with the columns 'Microprocessor', 'Description' and
        'similarities' (the number of shared words), best match first. Rows
        with no shared word are dropped, so the result may be empty.
    """
    def tokenize(text):
        # Split on whitespace, then strip punctuation from both ends of each
        # word; tokens that were pure punctuation become empty and are dropped.
        words = (word.strip(string.punctuation) for word in text.lower().split())
        return {word for word in words if word}

    # Tokenize the query once instead of once per row.
    query_tokens = tokenize(q)
    overlap = df["summary"].apply(lambda summary: len(query_tokens & tokenize(summary)))

    matches = df.assign(similarities=overlap)
    matches = matches[matches["similarities"] > 0]
    # mergesort is stable, so rows with equal scores keep their original order.
    res = matches.sort_values("similarities", ascending=False, kind="mergesort").head(top_k)
    return res[["Microprocessor", "Description", "similarities"]]

## Standard text search (using common words)

In [15]:
# Keyword search finds rows whose summary literally contains the query word
search_with_common_words("Intel", df_processors)

Unnamed: 0,Microprocessor,Description,similarities
0,Intel 8086,"The first 16-bit microprocessor, introduced in...",1
1,Intel 80386 (i386),"A 32-bit microprocessor, part of the x86 famil...",1
2,Intel Pentium,A highly successful microprocessor family know...,1


In [16]:
# In the recorded run no summary shares a word with this query, so nothing is returned
search_with_common_words("phone chip", df_processors)

Unnamed: 0,Microprocessor,Description,similarities


## Semantic search (using embeddings)

In [17]:
# Semantic search still returns reasonable results for the same query,
# even though keyword search found no words in common
search_with_embeddings("phone chip", df=df_processors)

Unnamed: 0,Microprocessor,Description,similarities
7,Qualcomm Snapdragon,"Found in many Android devices, known for its i...",0.613546
9,NVIDIA Tegra,"Used in mobile devices and embedded systems, k...",0.598488
1,Intel 80386 (i386),"A 32-bit microprocessor, part of the x86 famil...",0.590407


In [18]:
# It even works for full-fledged questions instead of just keywords
search_with_embeddings("what processor would smartphones likely use", df=df_processors)

Unnamed: 0,Microprocessor,Description,similarities
7,Qualcomm Snapdragon,"Found in many Android devices, known for its i...",0.745235
6,Apple M1,"Apple's ARM-based processor, known for its per...",0.727412
2,Intel Pentium,A highly successful microprocessor family know...,0.715093
