In [None]:
!pip install openai
!pip install numpy
!pip install pandas
!pip install tenacity
!pip install tiktoken

Collecting openai
  Downloading openai-1.52.1-py3-none-any.whl.metadata (24 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.2 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.6-py3-none-any.whl.metadata (21 kB)
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading openai-1.52.1-py3-none-any.whl (386 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.9/386.9 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpx-0.27.2-py3-none-any.whl (76 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.4/76.4 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpcore-1.0.6-py3-none-any.whl (78 kB)
[2K   [90m━

In [None]:
import os
import json
import openai
import pandas as pd
import tiktoken
from google.colab import userdata
from tenacity import retry, wait_random_exponential, stop_after_attempt
import pickle

Environment Variables

In [None]:
# Copy the key from Colab's secret store into the process environment, then
# build the client without an explicit key (presumably it picks the key up
# from the OPENAI_API_KEY variable set on the line above — confirm).
os.environ.update(OPENAI_API_KEY=userdata.get('OPENAI_API_KEY'))
client = openai.OpenAI()

In [None]:
# CSV of movie plots, expected next to the notebook's working directory.
dataset_path = './wiki_movie_plots_deduped.csv'
# Load the whole table; later cells filter it and embed the `Plot` column.
df = pd.read_csv(filepath_or_buffer=dataset_path)

In [None]:
df

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...
...,...,...,...,...,...,...,...,...
22320,2013,Mama,Canadian,Andres Muschietti,"Jessica Chastain, Nikolaj Coster-Waldau",horror,https://en.wikipedia.org/wiki/Mama_(2013_film),Distraught after losing his fortune in the 200...
22321,2013,The Mortal Instruments: City of Bones,Canadian,Harald Zwart,"Lily Collins, Jamie Campbell Bower, Robert She...",action-adventure science fantasy,https://en.wikipedia.org/wiki/The_Mortal_Instr...,New York City teenager Clary Fray begins seein...
22322,2013,Please Kill Mr. Know It All,Canadian,Colin Carter & Sandra Feldman,"Lara Jean Chorostecki, Jefferson Brown",romantic comedy,https://en.wikipedia.org/wiki/Please_Kill_Mr._...,Cynical Sally lives a life free of excitement ...
22323,2013,Rhymes for Young Ghouls,Canadian,Jeff Barnaby,"Kawennahere Devery Jacobs, Glen Gould",drama,https://en.wikipedia.org/wiki/Rhymes_for_Young...,The film opens with a brief prologue explainin...


In [None]:
# Keep American productions only, order them newest first, and cap the
# working set at the first 5000 rows of that ordering.
movies = (
    df.loc[df['Origin/Ethnicity'] == 'American']
    .sort_values(by="Release Year", ascending=False)
    .iloc[:5000]
)

In [None]:
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def get_embedding(text, model='text-embedding-ada-002'):
  """Request an embedding vector for `text` from the embeddings endpoint.

  Newlines in `text` are replaced with single spaces before the request is
  sent. The `retry` decorator re-invokes the call with a randomized
  exponential wait (bounded by min=1 and max=20) and stops after 6 attempts.

  Args:
    text: The string to embed.
    model: Name of the embedding model to request.

  Returns:
    The `embedding` attribute of the first item in the response's `data`.
  """
  single_line_text = ' '.join(text.split('\n'))
  response = client.embeddings.create(input=single_line_text, model=model)
  first_item = response.data[0]
  return first_item.embedding

In [None]:
get_embedding('Hellow World')

[-0.004892924800515175,
 0.00449153920635581,
 0.00375511241145432,
 -0.027705565094947815,
 -0.012565693818032742,
 -0.0011477640364319086,
 -0.0016519840573891997,
 -0.007032542489469051,
 -0.01752828061580658,
 -0.021243587136268616,
 0.017130212858319283,
 -0.0033553852699697018,
 -0.012074743397533894,
 -0.005861558020114899,
 0.005407096352428198,
 0.0014148016925901175,
 0.030253201723098755,
 -0.012552425265312195,
 0.01340827252715826,
 0.017727315425872803,
 -0.019372664391994476,
 0.002078249119222164,
 0.01872248575091362,
 -0.011338316835463047,
 -0.016307538375258446,
 -0.011743019334971905,
 0.00538387568667531,
 -0.0296959076076746,
 0.022198950871825218,
 -0.024905815720558167,
 0.0030302961822599173,
 -0.003947512246668339,
 -0.014967373572289944,
 -0.021907033398747444,
 -0.00966642890125513,
 -0.023287003859877586,
 0.0015151480911299586,
 -0.011975225992500782,
 0.016480034217238426,
 -0.0021976695861667395,
 0.015206214971840382,
 0.013142893090844154,
 0.00185101

In [None]:
embedding_cache_path = 'movie_embeddings_demo.pkl'


def _save_embedding_cache(cache, path=embedding_cache_path):
  """Pickle `cache` to `path`, swapping the new file in with a single rename."""
  # Write to a sibling temp file first: if the run is interrupted mid-dump,
  # the previous cache file is still intact instead of being left truncated.
  tmp_path = f'{path}.tmp'
  with open(tmp_path, 'wb') as embedding_cache_file:
    pickle.dump(cache, embedding_cache_file)
  os.replace(tmp_path, path)


# NOTE: unpickling can execute arbitrary code; only load a cache file that
# this notebook produced itself.
try:
  embedding_cache = pd.read_pickle(embedding_cache_path)
except FileNotFoundError:
  # First run: start with an empty cache and create the file on disk.
  embedding_cache = {}
  _save_embedding_cache(embedding_cache)


def embedding_from_string(
    string,
    model='text-embedding-ada-002',
    embedding_cache=embedding_cache
):
  """Return the embedding of `string`, computing and caching it if needed.

  Entries are keyed by the `(string, model)` pair. On a cache miss the
  embedding is fetched with `get_embedding`, stored in `embedding_cache`, and
  the whole cache is written back to `embedding_cache_path`.

  Args:
    string: Text to embed.
    model: Embedding model name; part of the cache key.
    embedding_cache: Dict used as the cache. Defaults to the module-level
      cache loaded above (bound when the function is defined).

  Returns:
    The cached or freshly fetched embedding for `(string, model)`.
  """
  cache_key = (string, model)
  if cache_key not in embedding_cache:
    embedding_cache[cache_key] = get_embedding(string, model)
    print(f'GOT EMBEDDING FOR {string[:20]}')
    # Persist after every miss so fetched embeddings survive a crash.
    _save_embedding_cache(embedding_cache)
  return embedding_cache[cache_key]

In [None]:
embedding_from_string('Chicken is friend')

[-0.002181887859478593,
 -0.012682124972343445,
 -0.012019905261695385,
 -0.013831637799739838,
 0.005219662562012672,
 -0.006125528831034899,
 -0.017867427319288254,
 -0.02558915503323078,
 -0.010776682756841183,
 -0.030786952003836632,
 0.004804213996976614,
 0.005282136145979166,
 0.005147818010300398,
 0.005066602490842342,
 -0.021940700709819794,
 0.010732950642704964,
 0.0468551404774189,
 -0.00789040606468916,
 0.031211771070957184,
 -0.027563318610191345,
 -0.03433544933795929,
 0.008790024556219578,
 -0.00725942337885499,
 -0.024714525789022446,
 -0.0022834071423858404,
 0.02851291559636593,
 0.011463891714811325,
 -0.035010162740945816,
 0.00947723351418972,
 -0.00252080662176013,
 0.03940829634666443,
 -0.002792566316202283,
 0.0011151523794978857,
 0.008027847856283188,
 -0.008914971724152565,
 -0.010039495304226875,
 0.003147103590890765,
 -0.0008785339305177331,
 0.01784243807196617,
 0.0005708517855964601,
 0.035310033708810806,
 -0.004548072349280119,
 -0.00717196054756

In [None]:
# Plot texts as a plain array, in the same row order as `movies`.
movie_plots = movies['Plot'].to_numpy()

In [None]:
enc = tiktoken.encoding_for_model('text-embedding-ada-002')


In [None]:
# Tokenize each plot with `enc` and add up the per-plot token counts.
total_tokens = sum(map(len, map(enc.encode, movie_plots)))

In [None]:
# List price of text-embedding-ada-002 in USD per 1,000 tokens. The rate was
# lowered from $0.0004 to $0.0001; confirm against the current pricing page
# before relying on this estimate.
ADA_002_USD_PER_1K_TOKENS = 0.0001

cost = total_tokens / 1000 * ADA_002_USD_PER_1K_TOKENS
# A bare `total_tokens` in the middle of a cell is not displayed, so print it.
print(f'Total tokens: {total_tokens:,}')
print(f'Estimated cost ${cost:.2f}')

Estimated cost $1.45


In [None]:
# Embed every plot in order. `embedding_from_string` consults its cache first,
# so only plots without a cached entry trigger a new request.
plot_embeddings = list(
    map(
        lambda plot: embedding_from_string(plot, model='text-embedding-ada-002'),
        movie_plots,
    )
)

GOT EMBEDDING FOR In 1954 London, reno
GOT EMBEDDING FOR Eighteen-year-old Ma
GOT EMBEDDING FOR In a prologue, busin
GOT EMBEDDING FOR Anne (Diane Lane) is
GOT EMBEDDING FOR During the Iraq War,
GOT EMBEDDING FOR A contemporary tale 
GOT EMBEDDING FOR Recently fired from 
GOT EMBEDDING FOR The movie opens with
GOT EMBEDDING FOR Mary (Debra Winger) 
GOT EMBEDDING FOR In 2014, Peter Quill
GOT EMBEDDING FOR A young street magic
GOT EMBEDDING FOR Having made a career
GOT EMBEDDING FOR When her car breaks 
GOT EMBEDDING FOR Mikael (Oscar Isaac)
GOT EMBEDDING FOR Julia Banks is being
GOT EMBEDDING FOR This documentary fol
GOT EMBEDDING FOR Best friends Mindy (
GOT EMBEDDING FOR Dash (voiced by Schw
GOT EMBEDDING FOR Ireland, 1905: Percy
GOT EMBEDDING FOR Thirteen years ago, 
GOT EMBEDDING FOR Dominic "Dom" Torett
GOT EMBEDDING FOR Roman Melnyk, a cons
GOT EMBEDDING FOR In a small town near
GOT EMBEDDING FOR Gloria is an unemplo
GOT EMBEDDING FOR Joe, Willie, and Alb
GOT EMBEDDING FOR One yea

In [None]:
# One {'Title': ..., 'Genre': ...} dict per movie, in the same row order as `movies`.
data = movies.loc[:, ['Title', 'Genre']].to_dict(orient='records')

In [None]:
!pip install nomic

Collecting nomic
  Downloading nomic-3.1.2.tar.gz (45 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/45.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.6/45.6 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting jsonlines (from nomic)
  Downloading jsonlines-4.0.0-py3-none-any.whl.metadata (1.6 kB)
Collecting loguru (from nomic)
  Downloading loguru-0.7.2-py3-none-any.whl.metadata (23 kB)
Downloading jsonlines-4.0.0-py3-none-any.whl (8.7 kB)
Downloading loguru-0.7.2-py3-none-any.whl (62 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.5/62.5 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: nomic
  Building wheel for nomic (pyproject.toml) ... [?25l[?25hdone
 

In [None]:
from nomic import atlas
import nomic
import numpy as np

In [None]:
nomic.login(userdata.get('NOMIC_API_TOKEN'))

In [None]:
# Send the plot embeddings to Atlas together with the per-movie title/genre
# records; the object returned by `map_data` is kept in `project`.
project = atlas.map_data(data=data, embeddings=np.asarray(plot_embeddings))

[32m2024-10-22 00:51:30.718[0m | [1mINFO    [0m | [36mnomic.dataset[0m:[36m_create_project[0m:[36m857[0m - [1mCreating dataset `logical-ramanujan`[0m
[32m2024-10-22 00:51:30.858[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[36mmap_data[0m:[36m140[0m - [1mUploading data to Atlas.[0m
100%|██████████| 1/1 [00:01<00:00,  1.94s/it]
[32m2024-10-22 00:51:33.027[0m | [1mINFO    [0m | [36mnomic.dataset[0m:[36m_add_data[0m:[36m1668[0m - [1mUpload succeeded.[0m
[32m2024-10-22 00:51:33.028[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[36mmap_data[0m:[36m158[0m - [1m`naoki/logical-ramanujan`: Data upload succeeded to dataset`[0m
[32m2024-10-22 00:51:33.833[0m | [1mINFO    [0m | [36mnomic.dataset[0m:[36mcreate_index[0m:[36m1262[0m - [1mCreated map `logical-ramanujan` in dataset `naoki/logical-ramanujan`: https://atlas.nomic.ai/data/naoki/logical-ramanujan[0m


In [None]:
from typing import List, Optional
from scipy import spatial

def distances_from_embeddings(
    query_embedding: List[float],
    embeddings: List[List[float]],
    distance_metric="cosine",
) -> List[float]:
    """Return the distance from a query embedding to each embedding in a list.

    Args:
        query_embedding: The reference vector.
        embeddings: Vectors to compare against ``query_embedding``.
        distance_metric: One of ``"cosine"``, ``"L1"``, ``"L2"`` or ``"Linf"``.

    Returns:
        One distance per entry of ``embeddings``, in the same order.

    Raises:
        KeyError: If ``distance_metric`` is not a supported name. This is
            checked up front, so it is raised even when ``embeddings`` is empty.
    """
    distance_metrics = {
        "cosine": spatial.distance.cosine,
        "L1": spatial.distance.cityblock,
        "L2": spatial.distance.euclidean,
        "Linf": spatial.distance.chebyshev,
    }
    # Resolve the metric once, before the loop, so a misspelled name fails
    # immediately with a message that lists the valid choices.
    if distance_metric not in distance_metrics:
        raise KeyError(
            f"Unknown distance_metric {distance_metric!r}; "
            f"expected one of {sorted(distance_metrics)}"
        )
    metric_fn = distance_metrics[distance_metric]
    return [metric_fn(query_embedding, embedding) for embedding in embeddings]


def indices_of_nearest_neighbors_from_distances(distances) -> np.ndarray:
  """Return the positions of `distances` ordered from smallest to largest value."""
  ascending_order = np.asarray(distances).argsort()
  return ascending_order

In [None]:
def print_recommendations_from_strings(
    strings,
    index_of_source_string,
    k_nearest_neighbors=3,
    model='text-embedding-ada-002'
):
  """Print the strings whose embeddings are closest to a chosen source string.

  Every entry of `strings` is embedded with `embedding_from_string`, distances
  to the source string's embedding are computed with
  `distances_from_embeddings` (its default metric), and up to
  `k_nearest_neighbors` matches are printed in order of increasing distance.
  Entries equal to the source string are skipped.

  Args:
    strings: Indexable collection of candidate strings.
    index_of_source_string: Position in `strings` of the query string.
    k_nearest_neighbors: Maximum number of matches to print.
    model: Embedding model name forwarded to `embedding_from_string`.

  Returns:
    None. Results are written to stdout.
  """
  # Embed every string with the requested model.
  embeddings = [embedding_from_string(string, model=model) for string in strings]
  # Embedding of the specific query string.
  query_embedding = embeddings[index_of_source_string]
  # Distance from the query embedding to every embedding (itself included).
  distances = distances_from_embeddings(query_embedding, embeddings)
  # Indices ordered from nearest to farthest.
  indices_of_nearest_neighbors = indices_of_nearest_neighbors_from_distances(distances)
  query_string = strings[index_of_source_string]
  match_count = 0
  for i in indices_of_nearest_neighbors:
    # Skip the query itself and any exact duplicates of it.
    if query_string == strings[i]:
      continue
    if match_count >= k_nearest_neighbors:
      break
    match_count += 1
    print(f'\nFound {match_count} closest match')
    print(f"Distance of {distances[i]}")
    print(strings[i])

In [None]:
print_recommendations_from_strings(movie_plots, 2)

[0.2518517182864721, 0.24701209411519698, 0.0, 0.23095605608559755, 0.2343782703980325, 0.24754564483961206, 0.22776189538149005, 0.24703127335440567, 0.2298327701762537, 0.18793624576101198, 0.25728992930031114, 0.2604211661090994, 0.2469218679177544, 0.2094798543471672, 0.24137044502532723, 0.2863864101552428, 0.2612565706596256, 0.2490906606555815, 0.24663778962598149, 0.20250460108479573, 0.2026195150201886, 0.22929903127902518, 0.25206550809093975, 0.20876366413683511, 0.25310278711278533, 0.27562548155017197, 0.2403552574587947, 0.2657675498820934, 0.24188004705396737, 0.23531991776489436, 0.25419594796892775, 0.24601872235143196, 0.24061528690646605, 0.20972378711420636, 0.26367269031542084, 0.23565906055335561, 0.200983785297691, 0.2405650536703441, 0.2482772943736682, 0.25871377269722096, 0.2525365396942907, 0.2504317080382624, 0.24862568023906562, 0.25325599744185523, 0.25122992330022154, 0.21705785582320103, 0.21310881800941206, 0.2363194643111367, 0.24839485026398622, 0.217