In [None]:
import pandas as pd
import pandas as pd
import numpy as np
import warnings
from sklearn.manifold import TSNE
import plotly.express as px
from helpers import load_data, get_embedding
import plotly.graph_objects as go

# Silence all warnings emitted from this point on.
warnings.filterwarnings("ignore")

DATA_PATH = 'data/'

# Load everything in one call; the result is indexed by dataset name below.
loaded_data = load_data(DATA_PATH)

# Pull out the individual datasets, in the same order as the names on the left.
(
    character_metadata,
    movie_metadata,
    plot_summaries,
    embeddings,
    combined_plot_summaries,
) = (
    loaded_data[dataset_name]
    for dataset_name in (
        'character_metadata',
        'movie_metadata',
        'plot_summaries',
        'embeddings',
        'combined_plot_summaries',
    )
)

In [None]:
# Reduce the embeddings to two components so they can be drawn as a scatter
# plot. The estimator is seeded (random_state=0) so reruns use the same seed.
tsne = TSNE(
    n_components=2,
    random_state=0,
)
tsne_obj = tsne.fit_transform(embeddings)

In [None]:
def movie_id_to_name(wikipedia_movie_id, metadata=None):
    """Look up a movie title by its Wikipedia movie ID.

    Parameters
    ----------
    wikipedia_movie_id
        Value compared (with ``==``) against the 'Wikipedia movie ID' column.
    metadata : pandas.DataFrame, optional
        Table holding the 'Wikipedia movie ID' and 'Movie name' columns.
        When omitted, the notebook-level ``movie_metadata`` frame is used,
        so existing single-argument callers behave exactly as before.

    Returns
    -------
    ``None`` when ``wikipedia_movie_id`` is ``None``; the string "Unknown"
    when no row matches; otherwise the 'Movie name' of the first matching row.
    """
    if wikipedia_movie_id is None:
        return None

    # Resolve the global lazily so an explicit table can be supplied instead.
    if metadata is None:
        metadata = movie_metadata

    # One .loc[rows, column] call instead of chained indexing.
    id_matches = metadata['Wikipedia movie ID'] == wikipedia_movie_id
    names = metadata.loc[id_matches, 'Movie name']

    if names.empty:
        return "Unknown"
    # If several rows share the ID, the first one wins.
    return names.iloc[0]

# Translate the ID of every combined plot summary into a movie title.
# NOTE(review): movie_id_to_name filters the metadata table on each call, so
# the cost grows with (number of summaries x number of movies).
summary_movie_ids = combined_plot_summaries['Wikipedia movie ID']
movie_names = summary_movie_ids.apply(movie_id_to_name)

In [None]:
# Query to highlight in the plot.
# Interesting searches with cool clusters: pink, sherlock holmes
search_term = "pink"

# Score every movie embedding against the embedding of the search term.
# NOTE(review): a plain dot product equals cosine similarity only if the
# embeddings are unit-length — confirm against get_embedding / load_data.
search_embedding = get_embedding(search_term)
cosine_similarities = np.dot(embeddings, search_embedding)

# One row per movie: its 2-D t-SNE coordinates plus the title shown on hover.
tsne_x = tsne_obj[:, 0]
tsne_y = tsne_obj[:, 1]
tsne_df = pd.DataFrame({'X': tsne_x, 'Y': tsne_y, 'Movie': movie_names})

plot_title = 'T-SNE plot of movie embeddings for search of ' + search_term

# Points are coloured by their similarity to the search term.
fig = px.scatter(
    tsne_df,
    x='X',
    y='Y',
    color=cosine_similarities,
    color_continuous_scale='RdBu',
    hover_name='Movie',
    title=plot_title,
    width=1000,
    height=1000,
)

fig.show()