## Load data

In [1]:
import pandas as pd
import pandas as pd
import numpy as np
import warnings
import time
import os
import openai
from sklearn.manifold import TSNE
import plotly.express as px
from helpers import load_data, get_embedding
import tqdm
import plotly.graph_objects as go

warnings.filterwarnings("ignore")

DATA_PATH = 'data/'

# Load the data
loaded_data = load_data(DATA_PATH)

character_metadata = loaded_data['character_metadata']
movie_metadata = loaded_data['movie_metadata']
plot_summaries = loaded_data['plot_summaries']
embeddings = loaded_data['embeddings']
combined_plot_summaries = loaded_data['combined_plot_summaries']

## Similarity search

In [2]:
wikipedia_movie_id = 3746

# Find the embedding of the movie with the wikipedia_movie_id
embedding = combined_plot_summaries.loc[combined_plot_summaries['Wikipedia movie ID'] == wikipedia_movie_id]['embedding'].values[0]

# Calculate the cosine similarity between the embedding of the movie with the wikipedia_movie_id and all other embeddings
cosine_similarities = np.dot(embeddings, embedding)

# Find the top 10 most similar movies
similar_indices = cosine_similarities.argsort()[:-20:-1]

wikipedia_movie_ids = combined_plot_summaries.loc[similar_indices]['Wikipedia movie ID'].values

# Print the movie name of the movie with the wikipedia_movie_id
print("Most similar movies to: \"" + movie_metadata.loc[movie_metadata['Wikipedia movie ID'] == wikipedia_movie_id]['Movie name'].values[0] + "\"")
print("Similar movies:")
print("")

similar_movies = movie_metadata.loc[movie_metadata['Wikipedia movie ID'].isin(wikipedia_movie_ids)]

similarity_of_closest = []

for movie in similar_movies.to_dict('records'):
    similarity_of_closest.append(cosine_similarities[combined_plot_summaries.loc[combined_plot_summaries['Wikipedia movie ID'] == movie['Wikipedia movie ID']].index[0]])

similar_movies['Similarity'] = similarity_of_closest

similar_movies = similar_movies.sort_values(by='Similarity', ascending=False)

for movie in similar_movies.to_dict('records'):
    print(movie['Movie name'] + " - " + str(movie['Similarity']))

Most similar movies to: "Blade Runner"
Similar movies:

Blade Runner - 1.0000001950264656
The Secret of Santa Vittoria - 0.8648360924208145
7 Seconds - 0.8613130933664959
Replicant - 0.8515000793442092
The Sea Inside - 0.8504307359201224
The Boxer - 0.8496721249815664
Taxi 2 - 0.8483836570553862
That Forsyte Woman - 0.8470240714455639
Quadrophenia - 0.846921800338462
Puppet Master 5: The Final Chapter - 0.84654547881106
Last Tango in Paris - 0.8463689760393782
Doom - 0.8452793406749852
Wild Things: Foursome - 0.8449324207068024
Contraband - 0.8448401877431744
High Hopes - 0.8439061403344252
Terminal Voyage - 0.8417892291299902
Terminator Salvation - 0.8414827401562219
The New Adventures of Tarzan - 0.8398815001123681
Predator 2 - 0.8390135524258946


## Querying

In [4]:
search_term = "Romantic movies"

search_embedding = get_embedding(search_term)

cosine_similarities = np.dot(embeddings, search_embedding)

similar_indices = cosine_similarities.argsort()[:-20:-1]

wikipedia_movie_ids = combined_plot_summaries.loc[similar_indices]['Wikipedia movie ID'].values

print("Search results for: \"" + search_term + "\"")
print("Similar movies:")
print("")

similar_movies = movie_metadata.loc[movie_metadata['Wikipedia movie ID'].isin(wikipedia_movie_ids)]

similarity_of_closest = []

for movie in similar_movies.to_dict('records'):
    similarity_of_closest.append(cosine_similarities[combined_plot_summaries.loc[combined_plot_summaries['Wikipedia movie ID'] == movie['Wikipedia movie ID']].index[0]])

similar_movies['Similarity'] = similarity_of_closest

similar_movies = similar_movies.sort_values(by='Similarity', ascending=False)

for movie in similar_movies.to_dict('records'):
    print(movie['Movie name'] + " - " + str(movie['Similarity']))

Search results for: "Romantic movies"
Similar movies:

Detention: The Siege at Johnson High - 0.8508188601073823
Chattaniki Kallu Levu - 0.8422551758332317
Midnight Son - 0.8412166848096354
Adiós problemas - 0.8411263822689758
Panique - 0.8385504751930782
Mere Naam Hai Mohabbat - 0.8364439465152772
While Parents Sleep - 0.8349711907773558
The Tin Drum - 0.8340832869753669
The Umbrellas of Cherbourg - 0.8325119648767937
Office Lady Sisters: I Want to Sleep with You - 0.8324287957668389
The Gingerdead Man - 0.8321649946202401
Dance Flick - 0.8313681341477678
The Water Margin - 0.8306093796222924
Barbarella - 0.8300114781358661
Tinker Bell and the Pixie Hollow Games - 0.8294585979312624
God Grew Tired of Us - 0.8292485204271566
The Living and the Dead - 0.8285825173320386
Pinky - 0.8284589433080742
Ennodishtam Koodamo - 0.8273394910996203


## T-SNE

In [5]:
tsne = TSNE(n_components=2, random_state=0)
tsne_obj = tsne.fit_transform(embeddings)

In [6]:
def movie_id_to_name(wikipedia_movie_id):
    if wikipedia_movie_id is None:
        return None
    
    names = movie_metadata.loc[movie_metadata['Wikipedia movie ID'] == wikipedia_movie_id]['Movie name']

    if names.empty:
        return "Unknown"
    return names.values[0]

movie_names = combined_plot_summaries['Wikipedia movie ID'].apply(movie_id_to_name)

In [7]:
search_term = "pink" # Interesting searches with cool clusters: pink, sherlock holmes

search_embedding = get_embedding(search_term)
cosine_similarities = np.dot(embeddings, search_embedding)

tsne_df = pd.DataFrame({'X':tsne_obj[:,0],
                        'Y':tsne_obj[:,1],
                        'Movie': movie_names,
                        })

fig = px.scatter(tsne_df, x='X', y='Y', width=1000, height=1000, title='T-SNE plot of movie embeddings for search of ' + search_term, hover_name='Movie', color=cosine_similarities, color_continuous_scale='RdBu')

fig.show()

In [11]:
# histogram of cosine similarities
fig = go.Figure(data=[go.Histogram(x=cosine_similarities)])
fig.update_layout(title_text='Histogram of cosine similarities for search of ' + search_term)
fig.show()

# Cities

In [18]:
cities_and_countries = [
    'Paris', 'Berlin', 'Bucharest', 'Kiev', 'Amsterdam', 'Zurich', 'Lausanne',
    'London', 'Madrid', 'Lisbon', 'Rome', 'Athens', 'Oslo', 'Stockholm', 'Helsinki',
    'Copenhagen', 'Dublin', 'Brussels', 'Warsaw', 'Prague', 'Vienna', 'Budapest',
    'Belgrade', 'Sofia', 'Tirana', 'Reykjavik', 'Luxembourg', 'Monaco', 'Vaduz',
    'San Marino', 'Andorra la Vella', 'Moscow', 'Saint Petersburg', 'Istanbul',
    'Ankara', 'Edinburgh', 'Glasgow', 'Cardiff', 'Belfast', 'Munich', 'Frankfurt',
    'Hamburg', 'Cologne', 'Stuttgart', 'Düsseldorf', 'Nuremberg', 'Leipzig',
    'Dresden', 'Hannover', 'Bremen', 'Bonn', 'Marseille', 'Lyon', 'Toulouse',
     'Nantes', 'Strasbourg', 'Montpellier', 'Bordeaux', 'Lille', 'Rennes',
    'Reims', 'Le Havre', 'Saint-Étienne', 'Toulon', 'Grenoble', 'Dijon', 'Angers',
    'Nîmes', 'Clermont-Ferrand', 'Saint-Denis', 'La Rochelle', 'Le Mans', 'Aix-en-Provence',
    'Brest', 'Limoges', 'Tours', 'Amiens', 'Perpignan', 'Metz', 'Besançon', 'Caen',
    'Orléans', 'Mulhouse', 'Rouen', 'Boulogne-Billancourt', 'Nancy', 'Argenteuil',
    'Montreuil', 'Saint-Paul', 'Avignon', 'Saint-Denis', 'Versailles', 'Nanterre',
    'Pau', 'Courbevoie', 'Vitry-sur-Seine', 'Colombes', 'Asnières-sur-Seine',
    'Aulnay-sous-Bois', 'Rueil-Malmaison', 'Antibes', 'Saint-Maur-des-Fossés',
    'Calais', 'Bezons', 'Dunkerque', 'Aubervilliers',
    # Capitals in the Americas
    'Washington D.C.', 'Ottawa', 'Mexico City', 'Buenos Aires', 'Brasília', 'Santiago',
    'Lima', 'Bogotá', 'Caracas', 'Quito', 'Montevideo', 'Havana', 'Kingston',
    'San José', 'Panama City', 'Tegucigalpa', 'San Salvador', 'Guatemala City',
    'Managua', 'Port-au-Prince', 'Santo Domingo',
    # Capitals in Asia
    'Beijing', 'Tokyo', 'New Delhi', 'Seoul', 'Jakarta', 'Bangkok', 'Manila',
    'Kuala Lumpur', 'Singapore', 'Islamabad', 'Dhaka', 'Astana', 'Ulaanbaatar',
    'Riyadh', 'Ankara', 'Tehran', 'Baghdad', 'Jerusalem', 'Doha', 'Dubai', 'Abu Dhabi',
    'Kabul', 'Yerevan', 'Baku', 'Tbilisi',
    # Capitals in Africa
    'Cairo', 'Nairobi', 'Pretoria', 'Algiers', 'Rabat', 'Lagos', 'Addis Ababa',
    'Accra', 'Dakar', 'Tunis', 'Tripoli', 'Khartoum', 'Luanda', 'Harare', 'Kigali',
    'Kampala', 'Mogadishu', 'Bamako', 'Niamey'
]


embedded_cities_and_countries = []

for city_country in tqdm.tqdm(cities_and_countries):
    embedded_cities_and_countries.append(get_embedding(city_country))
    

100%|██████████| 171/171 [00:43<00:00,  3.96it/s]


In [19]:
movies_in_cities_and_countries = { city_country: [] for city_country in cities_and_countries }

for city_country, city_embedding in tqdm.tqdm(zip(cities_and_countries, embedded_cities_and_countries)):
    cosine_similarities = np.dot(embeddings, city_embedding)
    
    # We take 3 stds above the mean movies to get a good sample of movies
    similarity_mean = np.mean(cosine_similarities)
    similarity_std = np.std(cosine_similarities)
    
    similar_indices = np.where(cosine_similarities > similarity_mean + 3 * similarity_std)[0]

    wikipedia_movie_ids = combined_plot_summaries.iloc[similar_indices]['Wikipedia movie ID'].values
    movies_in_cities_and_countries[city_country] = wikipedia_movie_ids


24it [00:00, 38.40it/s]

171it [00:04, 36.43it/s]


In [20]:
embeddings_of_movies_in_cities_and_countries = { city_country: [] for city_country in cities_and_countries }

for city_country in cities_and_countries:
    embeddings_of_movies_in_cities_and_countries[city_country] = np.array(combined_plot_summaries.loc[combined_plot_summaries['Wikipedia movie ID'].isin(movies_in_cities_and_countries[city_country])]['embedding'].values.tolist())

In [21]:
general_terms = ['Drugs', 'Love', 'War', 'Poverty', 'Comedy', 'Happiness', 'Sadness', 'Gang', 'Hippies', 'Guns']
embeddings_of_general_terms = { general_term: get_embedding(general_term) for general_term in general_terms }
similarity_movie_to_term = { city : { general_term: 0.0 for general_term in general_terms } for city in cities_and_countries }

embedded_cities_and_countries = np.array(embedded_cities_and_countries)

for city_or_country in cities_and_countries:

    for term, term_embedding in embeddings_of_general_terms.items():
        
        cosine_similarities = np.dot(np.array(embeddings_of_movies_in_cities_and_countries[city_or_country]).reshape(-1, 1536), term_embedding)
        
        similarity_movie_to_term[city_or_country][term] = np.mean(cosine_similarities)

In [22]:
import plotly.graph_objects as go

data_pairs = [(city, similarity_movie_to_term[city]['Guns']) for city in cities_and_countries]
data_pairs.sort(key=lambda x: x[1])

sorted_cities = [pair[0] for pair in data_pairs]
similarity_scores = [pair[1] for pair in data_pairs]

hover_text = [f"{city}: {score:.2f}" for city, score in zip(sorted_cities, similarity_scores)]

fig = go.Figure(data=[go.Scatter(
    x=similarity_scores, 
    y=[1] * len(similarity_scores), 
    mode='markers',
    text=hover_text,
    hoverinfo='text'
)])

fig.update_layout(
    title_text='Similarity to the Word "Guns" for Cities and Countries',
    xaxis=dict(
        title='Similarity Score',
        tickvals=similarity_scores,
        ticktext=sorted_cities
    ),
    yaxis=dict(
        showgrid=False,
        zeroline=False,
        showticklabels=False,
    ),
    hovermode='closest',
    showlegend=False
)

fig.update_xaxes(rangeslider=dict(visible=True))

fig.show()


In [23]:
import googlemaps
import pandas as pd
import plotly.express as px
from sklearn.preprocessing import MinMaxScaler

gmaps = googlemaps.Client(key=os.environ['GOOGLE_MAPS_API_KEY'])


coordinates = []
for city in  [pair[0] for pair in data_pairs]:
    geocode_result = gmaps.geocode(city)
    if geocode_result:
        lat = geocode_result[0]["geometry"]["location"]["lat"]
        lng = geocode_result[0]["geometry"]["location"]["lng"]
        coordinates.append((lat, lng))
    else:
        coordinates.append((None, None))

df = pd.DataFrame({
    'City':  [pair[0] for pair in data_pairs],
    'Latitude': [coord[0] for coord in coordinates],
    'Longitude': [coord[1] for coord in coordinates],
    'Similarity': [pair[1] for pair in data_pairs]
})

scaler = MinMaxScaler()
df['Similarity_Scaled'] = scaler.fit_transform(df[['Similarity']])

# Creating the map
fig = px.scatter_geo(df,
                     lat='Latitude',
                     lon='Longitude',
                     color='Similarity_Scaled',
                     hover_name='City',
                     hover_data={'Latitude': False, 'Longitude': False, 'Similarity_Scaled': False, 'Similarity': True},
                     projection="natural earth",
                     title="Map for Similarity to the Word 'Guns' for Cities and Countries")

fig.show()
