Loading data

In [None]:
import concurrent.futures
import multiprocessing
import os
import threading
import time
import warnings

import numpy as np
import openai
import pandas as pd
from tqdm import tqdm

# Silence every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

# Directory holding the raw corpus files read in this cell.
DATA_PATH = 'data/MovieSummaries/'

# Column labels applied to character.metadata.tsv, in file order.
character_columns = [
    'Wikipedia movie ID',
    'Freebase movie ID',
    'Movie release date',
    'Character name',
    'Actor date of birth',
    'Actor gender',
    'Actor height (in meters)',
    'Actor ethnicity (Freebase ID)',
    'Actor name',
    'Actor age at movie release',
    'Freebase character/actor map ID',
    'Freebase character ID',
    'Freebase actor ID',
]

# Tab-separated file read with the explicit column labels above.
character_metadata = pd.read_csv(
    DATA_PATH + 'character.metadata.tsv',
    sep='\t',
    names=character_columns,
)

# Movie-level metadata, read as a tab-separated file with explicit column labels.
# `header=0` was removed: together with `names=` it makes pandas treat the first
# line of the file as a header and throw it away, which silently loses the first
# movie. The two sibling reads in this cell treat their files as headerless.
# NOTE(review): assumes movie.metadata.tsv has no header line, like the other
# corpus files read here -- confirm against the file on disk.
movie_metadata = pd.read_csv(DATA_PATH + 'movie.metadata.tsv', sep='\t',
                             names=['Wikipedia movie ID',
                                    'Freebase movie ID',
                                    'Movie name',
                                    'Movie release date',
                                    'Movie box office revenue',
                                    'Movie runtime',
                                    'Movie languages (Freebase ID:name tuples)',
                                    'Movie countries (Freebase ID:name tuples)',
                                    'Movie genres (Freebase ID:name tuples)'
                                    ])

# One row per movie: its Wikipedia ID and the plot summary text (tab-separated).
plot_summaries = pd.read_csv(
    DATA_PATH + 'plot_summaries.txt',
    sep='\t',
    names=['Wikipedia movie ID', 'Summary'],
)

In [None]:
# All plot summaries as a plain Python list, indexed by position.
texts = plot_summaries['Summary'].tolist()

# Resume from a previously saved embedding file when one exists; otherwise
# start from an empty frame with the two expected columns.
embedding_df = (
    pd.read_csv('data/embedded_summaries.csv')
    if os.path.exists('data/embedded_summaries.csv')
    else pd.DataFrame(columns=['Summary', 'embedding'])
)

In [None]:
# Show (rows, columns) of the embedding frame, i.e. how many summaries it currently holds.
print(embedding_df.shape)

In [None]:
def get_embedding(text, model="text-embedding-ada-002"):
    """Request an embedding for ``text`` via ``openai.Embedding.create``.

    Newlines, tabs, carriage returns, vertical tabs and form feeds in ``text``
    are each replaced by a single space before the request is made.

    Args:
        text: The string to embed.
        model: Name of the embedding model passed to the API.

    Returns:
        The ``embedding`` entry of the first item in the response, or ``None``
        when that entry is ``None`` or when the call raises
        ``openai.error.OpenAIError``. In the error case the function sleeps
        for 60 seconds before returning.
    """
    for whitespace_char in ("\n", "\t", "\r", "\x0b", "\x0c"):
        text = text.replace(whitespace_char, " ")

    try:
        response = openai.Embedding.create(input=[text], model=model)
        embedding = response['data'][0]['embedding']
    except openai.error.OpenAIError as e:
        print(f"Failed to process text: {text}. Error: {str(e)}")
        # Pause before handing control back so the next request is delayed.
        time.sleep(60)
        return None

    if embedding is None:
        print(f"Failed to process text: {text}. Error: embedding is None")
        return None
    return embedding

# Guards every read-modify-write of the shared `embedding_df`. process_texts is
# submitted to a thread pool, and rebinding the global from two threads without
# a lock lets one thread overwrite the row another thread just added.
_EMBEDDING_DF_LOCK = threading.Lock()


def process_texts(start, end):
    """Embed ``texts[start:end]`` and record the results in the global ``embedding_df``.

    A text is skipped when ``embedding_df`` already has a row for it whose
    embedding is not NaN. If a row exists with a NaN embedding it is filled in;
    otherwise a new row is appended. A failed request (``get_embedding``
    returning ``None``) is stored as a missing embedding so a later run can
    retry it.

    Args:
        start: Index of the first text to process (inclusive).
        end: Index one past the last text to process; clamped to ``len(texts)``.

    Returns:
        None. The function rebinds / mutates the module-level ``embedding_df``.
    """
    global embedding_df
    for i in range(start, min(end, len(texts))):
        if (i - start) % 500 == 0 and i - start > 0:
            print(f"Processing {i-start}th text")
        text = texts[i]

        # Skip texts that already have a usable (non-NaN) embedding.
        with _EMBEDDING_DF_LOCK:
            cached = embedding_df.loc[embedding_df['Summary'] == text, 'embedding']
            if len(cached) > 0 and not cached.isna().iloc[0]:
                continue

        # Slow network call: made without holding the lock so threads overlap.
        embedding = get_embedding(text)

        with _EMBEDDING_DF_LOCK:
            # Look the row up again: the frame may have changed while we waited.
            row_labels = embedding_df.index[embedding_df['Summary'] == text]
            if len(row_labels) > 0:
                # A failed request leaves whatever is already stored untouched.
                if embedding is not None:
                    # The cell must be able to hold a Python list.
                    if embedding_df['embedding'].dtype != object:
                        embedding_df['embedding'] = embedding_df['embedding'].astype(object)
                    # `.loc[mask, col] = <list>` would try to spread the list
                    # over the selected rows and raise; `.at` stores the list
                    # as one cell value.
                    for label in row_labels:
                        embedding_df.at[label, 'embedding'] = embedding
            else:
                # DataFrame.append no longer exists in pandas 2.x; concat works everywhere.
                new_row = pd.DataFrame([{'Summary': text, 'embedding': embedding}])
                embedding_df = pd.concat([embedding_df, new_row], ignore_index=True)


Calculate all embeddings

In [None]:
# One worker thread per CPU core reported by the OS.
num_cores = multiprocessing.cpu_count()

# Ceiling division so the final chunk absorbs the remainder. Floor division left
# the last `len(texts) % num_cores` texts (and every text when there are fewer
# texts than cores) without an embedding. max(1, ...) keeps the range step valid
# when `texts` is empty.
chunk_size = max(1, -(-len(texts) // num_cores))
chunks = [
    (start, min(start + chunk_size, len(texts)))
    for start in range(0, len(texts), chunk_size)
]

with concurrent.futures.ThreadPoolExecutor(max_workers=num_cores) as executor:
    futures = [executor.submit(process_texts, start, end) for start, end in chunks]
    for future in concurrent.futures.as_completed(futures):
        # result() re-raises anything the worker raised; wait() alone would
        # let a crashed worker go unnoticed.
        future.result()

Save to file

In [76]:
# Write the Summary/embedding frame to the CSV that the loading cell reads back on later runs.
# index=False keeps the row index out of the file.
embedding_df.to_csv('data/embedded_summaries.csv', index=False)

Compress

In [None]:
# Reload the saved embeddings and attach them to the plot summaries by matching
# the summary text. The default inner join keeps only summaries present in both.
embeddings = pd.read_csv('data/embedded_summaries.csv')
combined_plot_summaries = plot_summaries.merge(embeddings, on='Summary')

In [None]:
import numpy as np

def convert_embedding_to_numpy(embedding_str):
    """Parse a bracketed, comma-separated string of numbers into a NumPy array.

    The first and last characters (the enclosing brackets) are stripped, the
    remainder is split on commas and every piece is converted with ``float``.
    """
    inner = embedding_str[1:-1]
    values = list(map(float, inner.split(',')))
    return np.array(values)

tqdm.pandas(desc="Converting embeddings")
# A summary whose embedding request failed is stored without a value, so it comes
# back from the CSV as NaN. NaN is a float and cannot be sliced or split, so those
# rows are dropped before parsing instead of crashing the conversion.
combined_plot_summaries = combined_plot_summaries.dropna(subset=['embedding'])
combined_plot_summaries['embedding'] = combined_plot_summaries['embedding'].progress_apply(convert_embedding_to_numpy)

Remove outliers

In [None]:
# Outlier removal: sample movie embeddings, average them, and keep only the
# movies whose dot product with that average exceeds a threshold.
OUTLIER_SAMPLE_SIZE = 1000      # number of rows drawn (with replacement)
OUTLIER_MIN_SIMILARITY = 0.2    # dot-product threshold a movie must exceed to be kept
OUTLIER_SAMPLE_SEED = 42        # fixed seed so the filtered set is the same on every run

# A seeded generator makes the sample, and therefore the saved output, reproducible.
outlier_rng = np.random.default_rng(OUTLIER_SAMPLE_SEED)
sample = outlier_rng.choice(combined_plot_summaries.index, OUTLIER_SAMPLE_SIZE)
sample_embeddings = combined_plot_summaries.loc[sample, 'embedding'].values
sample_mean = np.mean(sample_embeddings, axis=0)


def remove_outliers(embedding):
    """Return True when ``embedding`` should be kept.

    An embedding is kept when its dot product with ``sample_mean`` is greater
    than ``OUTLIER_MIN_SIMILARITY``. Despite the name, the function itself only
    produces the keep/drop flag used for filtering.
    """
    return np.dot(embedding, sample_mean) > OUTLIER_MIN_SIMILARITY


combined_plot_summaries = combined_plot_summaries[combined_plot_summaries['embedding'].apply(remove_outliers)]


In [None]:
# Save the Wikipedia movie ID together with its embedding as a single .npy file.
id_and_embedding = combined_plot_summaries[['Wikipedia movie ID', 'embedding']]
np.save('data/embeddings.npy', id_and_embedding.values)