In [50]:
import openai
from dotenv import dotenv_values
# Read settings from the local .env file so the API key is not written in the notebook.
config = dotenv_values(".env")
# Fail early with an actionable message when the AIKEY entry is absent or empty,
# instead of a bare KeyError('AIKEY') or an unusable key that only fails at request time.
if not config.get("AIKEY"):
    raise KeyError("AIKEY is missing or empty in .env; add a line such as AIKEY=<your OpenAI key>")
openai.api_key = config["AIKEY"]

In [51]:
import pandas as pd
import numpy as np
from tenacity import retry, wait_random_exponential, stop_after_attempt
import tiktoken
import pickle

In [52]:
# Location of the movie CSV that is read with pd.read_csv; relative to the working directory.
dataset_path = "./plots.csv"

In [53]:
# Load the full dataset, then keep the 1000 most recently released American movies.
df = pd.read_csv(dataset_path)
is_american = df["Origin/Ethnicity"].eq("American")
movies = (
    df.loc[is_american]
    .sort_values(by="Release Year", ascending=False)
    .iloc[:1000]
)

In [54]:
# Plot texts of the selected movies as an array, in the same row order as `movies`.
movie_plots = movies["Plot"].values

In [55]:
# Count the tokens of every plot with the tokenizer tiktoken selects for the
# embedding model, so the API cost can be estimated before any request is made.
enc = tiktoken.encoding_for_model("text-embedding-ada-002")
totalTokens = sum(len(tokens) for tokens in map(enc.encode, movie_plots))

In [56]:
# Estimate the cost of embedding every plot at the rate hard-coded here
# (0.0004 USD per 1000 tokens). NOTE(review): confirm this matches current pricing.
usd_per_token = 0.0004/1000
cost = usd_per_token*totalTokens
print(f"Estimated cost ${cost:.2f}")

Estimated cost $0.30


In [57]:
@retry(stop=stop_after_attempt(6), wait=wait_random_exponential(min=1, max=20))
def get_embedding(text,model="text-embedding-ada-002"):
    """Request the embedding of ``text`` from the OpenAI embeddings endpoint.

    Newlines in ``text`` are replaced by spaces before the request is sent.
    The call is retried by tenacity: at most 6 attempts, with randomized
    exponential waits configured with min=1 and max=20.

    Args:
        text: The string to embed.
        model: Name of the embedding model to use.

    Returns:
        The ``"embedding"`` entry of the first item in the response's ``"data"``.
    """
    single_line = " ".join(text.split("\n"))
    response = openai.Embedding.create(input=single_line, model=model)
    first_item = response["data"][0]
    return first_item["embedding"]

In [58]:
embedding_cache_path = "movie_embeddings.pkl"

# Load the on-disk embedding cache, a dict keyed by (text, model).
# Only a missing, empty or truncated file falls back to an empty cache; any other
# failure (permissions, interrupt, ...) propagates BEFORE the file is rewritten
# below, so a transient error can no longer wipe embeddings that were already paid for.
# NOTE(review): unpickling can execute arbitrary code; only load cache files
# that this notebook wrote itself.
try:
    embedding_cache = pd.read_pickle(embedding_cache_path)
except FileNotFoundError:
    # First run: nothing cached yet.
    embedding_cache = {}
except (EOFError, pickle.UnpicklingError) as load_error:
    # An interrupted earlier write leaves an unusable file; say so and start over.
    print(f"Discarding unreadable embedding cache {embedding_cache_path!r}: {load_error!r}")
    embedding_cache = {}
# Save a copy to disk so the cache file exists from here on.
with open(embedding_cache_path,"wb") as embedding_cache_file:
    pickle.dump(embedding_cache,embedding_cache_file)
    
#retrieve cache if present from a file or request via API
def embedding_from_string(
    string,
    model="text-embedding-ada-002",
    embedding_cache=embedding_cache
):
    """Return the embedding of ``string``, calling the API only on a cache miss.

    Args:
        string: The text to embed.
        model: Embedding model name; it is part of the cache key, so the same
            text embedded with another model is cached separately.
        embedding_cache: Dict mapping ``(string, model)`` to an embedding. The
            default is the module-level cache, bound when this function is
            defined, and it is mutated in place on a miss.

    Returns:
        The cached or freshly fetched embedding.

    Side effects:
        On a miss, prints a short notice and rewrites the pickle file at
        ``embedding_cache_path`` with the whole cache.
    """
    import os  # local import: only needed for the atomic file swap below

    key = (string, model)
    if key not in embedding_cache:
        embedding_cache[key] = get_embedding(string,model)
        print(f"Got embedding from API for {string[:20]}")
        # Write to a sibling temp file and then swap it in, so interrupting the
        # kernel mid-write cannot leave a truncated cache file behind.
        tmp_path = f"{embedding_cache_path}.tmp"
        with open(tmp_path,"wb") as embedding_cache_file:
            pickle.dump(embedding_cache,embedding_cache_file)
        os.replace(tmp_path, embedding_cache_path)
    return embedding_cache[key]

In [59]:
# Embed every plot, in the order of movie_plots. Texts already in the cache are
# served from it; each cache miss triggers one API request.
plot_embeddings = [embedding_from_string(plot,model="text-embedding-ada-002") for plot in movie_plots]

In [60]:
from nomic import atlas

In [64]:
# One {"Title": ..., "Genre": ...} dict per selected movie, in the row order of `movies`.
data = movies[["Title","Genre"]].to_dict("records")

In [65]:
# Upload the plot embeddings together with the Title/Genre records to Nomic Atlas,
# which creates a project and a map for them.
# NOTE(review): presumably row i of the embeddings array is paired with data[i];
# both derive from `movies` in the same row order. Confirm against the nomic docs.
project = atlas.map_embeddings(
    embeddings=np.array(plot_embeddings),
    data=data
)

[32m2023-10-04 12:07:28.421[0m | [1mINFO    [0m | [36mnomic.project[0m:[36m_create_project[0m:[36m790[0m - [1mCreating project `tacit-brilliant` in organization `ognjenstancevic`[0m
[32m2023-10-04 12:07:29.677[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[36mmap_embeddings[0m:[36m110[0m - [1mUploading embeddings to Atlas.[0m
1it [00:02,  2.53s/it]
[32m2023-10-04 12:07:32.233[0m | [1mINFO    [0m | [36mnomic.project[0m:[36m_add_data[0m:[36m1422[0m - [1mUpload succeeded.[0m
[32m2023-10-04 12:07:32.234[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[36mmap_embeddings[0m:[36m129[0m - [1mEmbedding upload succeeded.[0m
[32m2023-10-04 12:07:34.106[0m | [1mINFO    [0m | [36mnomic.project[0m:[36mcreate_index[0m:[36m1132[0m - [1mCreated map `tacit-brilliant` in project `tacit-brilliant`: https://atlas.nomic.ai/map/723161ba-3750-49e8-af78-2a351887bf10/1c9f742a-2133-4a2a-a26c-1c2b15738e0d[0m
[32m2023-10-04 12:07:34.108[0m | [1mINFO    [0m | [36

In [77]:
from openai.embeddings_utils import distances_from_embeddings, indices_of_nearest_neighbors_from_distances
def recommendations(
    strings,
    index_of_str,
    k_nearest_neighbors=3,
    model="text-embedding-ada-002"
):
    """Print the entries of ``strings`` whose embeddings are closest to one of them.

    Args:
        strings: Indexable collection of texts to compare.
        index_of_str: Position in ``strings`` of the query text.
        k_nearest_neighbors: Maximum number of matches to print.
        model: Embedding model used for every text.

    Returns:
        None. Each match is printed with its rank, its distance to the query
        and its full text.
    """
    # Forward `model` so a non-default model is actually used; it was
    # previously accepted by this function but never passed on.
    embeddings = [embedding_from_string(string, model=model) for string in strings]
    # Embedding of the query text
    query_embedding = embeddings[index_of_str]
    # Distance from the query to every embedding (the helper's default metric)
    distances = distances_from_embeddings(query_embedding,embeddings)
    # Positions in `strings`, ordered from nearest to farthest
    indices = indices_of_nearest_neighbors_from_distances(distances)

    query = strings[index_of_str]
    matches = 0
    for i in indices:
        # Skip the query itself and any exact duplicates of its text.
        if query == strings[i]:
            continue
        if matches >= k_nearest_neighbors:
            break
        matches += 1
        print(f"Found {matches} closest match: ")
        print(f"Distance of : {distances[i]}")
        print(strings[i])
        
    

In [78]:
# Print the plots most similar to the plot at position 2 of movie_plots
# (k_nearest_neighbors defaults to 3).
recommendations(movie_plots,2)

Found 1 closest match: 
Distance of : 0.13664061546949713
In the near future, the unmanned Pilgrim 7 space probe returns from Mars to Earth orbit with soil samples potentially containing evidence of extraterrestrial life. The probe is captured and its samples retrieved by the International Space Station and its six-member crew. Exobiologist Hugh Derry, who is paralyzed from the waist down, revives a dormant cell from the sample, which quickly grows into a multi-celled organism that American school children name "Calvin". Hugh realizes that Calvin's cells can change their specialisation, acting as muscle, sensor, and neuron cells all at once.
An accident in the lab causes Calvin to become dormant; Hugh attempts to revive Calvin with electric shocks, but Calvin immediately becomes hostile and attacks Hugh, crushing his hand. While Hugh lies unconscious from Calvin's attack, Calvin uses Hugh's electric shock tool to escape its enclosure; now free in the laboratory, Calvin devours a lab r