In [None]:
import csv
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

---
## Read the data

The movie names come from [this Harvard Dataverse file](https://dataverse.harvard.edu/file.xhtml?fileId=4435583&version=1.0).

In [None]:
# Location of the movie-name CSV, relative to the current working directory.
DATA_PATH = os.path.join(os.pardir, "data", "movies.csv")

In [None]:
# Collect the first CSV field of every row in DATA_PATH; that field is treated
# as the movie name.
# csv.reader is used instead of line.split(",") so that quoted names which
# themselves contain commas stay in one piece, and so that no trailing newline
# ends up in a name when a row has only one column.
with open(DATA_PATH, "r", encoding="utf-8", newline="") as f:
    movies = [row[0] for row in csv.reader(f) if row]  # `if row` skips blank lines
len(movies)

In [None]:
# Peek at one randomly chosen entry of `movies` (unseeded, so it varies per run).
rand_idx = np.random.randint(len(movies))
movies[rand_idx]

---
## Use the Wikipedia API to get the plot and summary of each movie

In [None]:
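# NOTE: this whole cell is commented out and does not run; the plots used later in
# this notebook are read from ../data/wiki_movies.csv instead of the API.
# Before re-enabling: movie_title is interpolated into the URL unescaped, so titles
# containing "/", "?" or "#" should be percent-encoded (urllib.parse.quote) first.
# import requests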
# import time
# import json

# def fetch_movie_data(movie_title, rate_limit=1):
#     time.sleep(rate_limit)
#     url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{movie_title}"
#     response = requests.get(url)
#     if response.status_code == 200:
#         return response.json()
#     else:
#         print(f"Failed to get data for {movie_title}")
#         return None

# def main():
#     # Your list of 20,000 movie titles
#     movie_titles = ["Inception", "Avatar"]  # Extend this list
#     movie_data = {}
#     cache_file = "movie_data_cache.json"

#     try:
#         with open(cache_file, "r") as f:
#             movie_data = json.load(f)
#     except FileNotFoundError:
#         print("No cache file found. Creating a new one.")
    
#     for title in movie_titles:
#         if title in movie_data:
#             print(f"Cache hit for {title}")
#             continue

#         print(f"Fetching data for {title}")
#         data = fetch_movie_data(title)
#         if data:
#             movie_data[title] = {
#                 "plot": data.get("extract", "N/A"),
#                 "summary": data.get("description", "N/A")
#             }

#     with open(cache_file, "w") as f:
#         json.dump(movie_data, f)

#     print("All movie data fetched and cached.")

# if __name__ == "__main__":
#     main()


---
## Generate embeddings

We use [Sentence-Transformers (sbert)](https://www.sbert.net/) with the `all-MiniLM-L6-v2` model to turn text into embedding vectors.

In [None]:
from sentence_transformers import SentenceTransformer

# Name of the sentence-embedding checkpoint handed to SentenceTransformer.
MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(MODEL_NAME)

In [195]:
# Three toy sentences used to try out the encoder.
sentences = [
    "This framework generates embeddings for each input sentence",
    "Sentences are passed as a list of string.",
    "The quick brown fox jumps over the lazy dog.",
]

# Encode the whole list in one call; the result is indexed in step with `sentences`.
embeddings = model.encode(sentences)

In [196]:
# Rank the toy sentences against a free-text query by dot-product score.
k = 2
query = "dog jumps"
# Encode the `query` variable itself (not a second copy of the literal) so that
# editing `query` changes both the search and the printed header.
emb = model.encode(query)
result = embeddings @ emb  # one score per sentence, i.e. np.dot(embeddings, emb)
topk = np.argsort(result)[::-1][:k]  # indices of the k highest scores, best first
print("all result:", result)
print("top k results:", topk)

print()
print(f"top {k} results for query: '{query}'")
for match in topk:
    print("   -", sentences[match])

all result: [0.13834487 0.08460952 0.53164786]
top k results: [2 0]

top 2 results for query: 'dog jumps'
   - The quick brown fox jumps over the lazy dog.
   - This framework generates embeddings for each input sentence


---
## Embed some movies

In [197]:
# Rebind DATA_PATH (previously the movies.csv path) to the wiki_movies.csv file.
DATA_PATH = os.path.join(os.pardir, "data", "wiki_movies.csv")

In [198]:
# Read the full CSV, then keep only the three columns this notebook works with.
data = pd.read_csv(DATA_PATH)
wanted_columns = ["Title", "Wiki Page", "Plot"]
df = data.loc[:, wanted_columns]
df.shape

(34886, 3)

In [199]:
# Display the frame; the rendered output is truncated to its first and last rows.
df

Unnamed: 0,Title,Wiki Page,Plot
0,Kansas Saloon Smashers,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,Love by the Light of the Moon,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,The Martyred Presidents,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,"Terrible Teddy, the Grizzly King","https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,Jack and the Beanstalk,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...
...,...,...,...
34881,The Water Diviner,https://en.wikipedia.org/wiki/The_Water_Diviner,"The film begins in 1919, just after World War ..."
34882,Çalgı Çengi İkimiz,https://en.wikipedia.org/wiki/%C3%87alg%C4%B1_...,"Two musicians, Salih and Gürkan, described the..."
34883,Olanlar Oldu,https://en.wikipedia.org/wiki/Olanlar_Oldu,"Zafer, a sailor living with his mother Döndü i..."
34884,Non-Transferable,https://en.wikipedia.org/wiki/Non-Transferable...,The film centres around a young woman named Am...


In [200]:
# Embed only the first `total_embeddings` plots of `df`.
# NOTE: this replaces the toy-sentence `embeddings` computed earlier in the notebook.
total_embeddings = 20
# Slice the column before converting it to a list, so only the rows that are
# actually embedded get materialised rather than every plot in the frame.
plots = df["Plot"].iloc[:total_embeddings].tolist()
embeddings = model.encode(plots)  # entry i is the embedding of df.iloc[i]["Plot"]

In [201]:
# Write the plot embeddings to a .npy file in the current working directory.
EMBEDDINGS_FILE = "movie_embeddings.npy"
np.save(EMBEDDINGS_FILE, embeddings)

In [202]:
# Rank the embedded plots against a free-text query and list the matching titles.
k = 20
query = "there is a bean"
emb = model.encode(query)
result = np.matmul(embeddings, emb)  # dot-product score of every plot vs. the query
topk = np.flip(np.argsort(result))[:k]  # indices by descending score, cut to k
print("top k results:", topk)

print()
print(f"top {k} results for query: '{query}'")
titles = df["Title"]
for match in topk:
    # Row i of `embeddings` was built from row i of `df`, so positional lookup works.
    print("   -", titles.iloc[match])

top k results: [ 4  5  3 17 13  8 15 11  1  6  2  7 18 14  0 16 12 10  9 19]

top 20 results for query: 'there is a bean'
   - Jack and the Beanstalk
   - Alice in Wonderland
   - Terrible Teddy, the Grizzly King
   - The Black Viper
   - Daniel Boone
   - The Little Train Robbery
   - Laughing Gas
   - From Leadville to Aspen: A Hold-Up in the Rockies
   - Love by the Light of the Moon
   - The Great Train Robbery
   - The Martyred Presidents
   - The Suburbanite
   - A Calamitous Elopement
   - How Brown Saw the Baseball Game
   - Kansas Saloon Smashers
   - The Adventures of Dollie
   - Kathleen Mavourneen
   - Dream of a Rarebit Fiend
   - The Night Before Christmas
   - The Call of the Wild


In [203]:
# Sanity check on the embedding matrix; the output below shows (20, 384).
embeddings.shape

(20, 384)

In [14]:
import numpy as np
# Scratch check: length of the longest string in a list.
lst = ["1", "12", "123", "1"]


def longest(x):
    """Return the length of ``x``."""
    return len(x)


# Computed once; `default=0` keeps an empty list from raising ValueError.
arr = max(map(longest, lst), default=0)
arr

3