In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer

# Sentence-embedding model, instantiated once for the whole notebook.
# The checkpoint name is kept in a constant so it is easy to spot and swap.
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the movie metadata, keep only rows that have an overview, and
# normalise the two columns used downstream:
#   - `id` is cast to int (per the original author's note it holds the tmdbId)
#   - `overview` is cast to pandas' dedicated 'string' dtype
movies_metadata = (
    pd.read_csv('../movies_metadata.csv')
    .dropna(subset=['overview'])
    .astype({'id': int, 'overview': 'string'})
)

In [3]:
# Read links.csv into a DataFrame (no filtering or type conversion here).
links = pd.read_csv(filepath_or_buffer='../links.csv')

In [4]:
# Read ratings.csv into a DataFrame (raw rows; aggregation happens later).
ratings_data = pd.read_csv(filepath_or_buffer='../ratings.csv')

In [5]:
# Mean rating per movieId, then attach the columns of `links` through an
# inner join on movieId (movies without a row in `links` are dropped).
computed_ratings = (
    ratings_data
    .groupby('movieId')['rating']
    .mean()
    .reset_index()
    .merge(links, on='movieId', how='inner')
)

In [6]:
# Join the metadata to the per-movie average ratings.
# The key is movies_metadata.id == computed_ratings.tmdbId (not movieId);
# the inner join keeps only movies present on both sides.
merged_data = movies_metadata.merge(
    computed_ratings,
    left_on='id',
    right_on='tmdbId',
    how='inner',
)
print(merged_data.shape)

(43864, 28)


In [7]:
# Embed every overview with the sentence-transformer model.
# encode() is given a plain list of str rather than the pandas Series: it looks
# sentences up by integer position, which on a Series is a label lookup and is
# only correct while merged_data keeps its default 0..n-1 index.
overview_texts = merged_data['overview'].tolist()
overview_embeddings = model.encode(overview_texts)

# One embedding per row, in row order, is what the pairing step relies on.
assert len(overview_embeddings) == len(merged_data), "embedding count does not match merged_data rows"

In [9]:
# Assemble one row per merged movie: its movieId, the overview embedding
# (stored as a Python list of floats) and the average rating.
pair_columns = {
    'movieId': merged_data['movieId'],
    'overview_embedding': overview_embeddings.tolist(),
    'average_rating': merged_data['rating'],
}
paired_data = pd.DataFrame(pair_columns)

# Print both shapes so the row counts can be compared by eye.
for frame in (merged_data, paired_data):
    print(frame.shape)

# Persist the paired table as CSV, without the index column.
paired_data.to_csv(path_or_buf='../paired_data.csv', index=False)

(43864, 28)
(43864, 3)
