In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer

# Name of the pretrained sentence-embedding checkpoint to load.
embedding_model_name = 'all-MiniLM-L6-v2'

# Build the encoder once up front; the embedding cell further down calls model.encode().
model = SentenceTransformer(embedding_model_name)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load movies metadata from movies_metadata.csv.
# low_memory=False parses every column in a single pass, so a column that mixes
# numbers and text ends up with one consistent dtype instead of a
# chunk-dependent mix.
movies_raw = pd.read_csv('../movies_metadata.csv', low_memory=False)

# Coerce ids to numbers; values that cannot be parsed become NaN instead of
# making a bare astype(int) raise ValueError on the whole column.
# NOTE(review): public exports of this dataset are reported to contain a few
# malformed rows whose `id` holds a date string — confirm for this file.
numeric_ids = pd.to_numeric(movies_raw['id'], errors='coerce')

# Build the cleaned frame in one chain from the untouched raw frame so that
# re-running this cell always gives the same result:
#   * rows without an overview cannot be embedded, so they are dropped;
#   * rows whose id could not be parsed cannot be joined to ratings, so they
#     are dropped too;
#   * id becomes an integer join key and overview a pandas string column.
movies_metadata = (
    movies_raw
    .assign(id=numeric_ids)
    .dropna(subset=['overview', 'id'])
    .astype({'id': int, 'overview': 'string'})
)

In [4]:
# Read the raw ratings table from disk.
# The aggregation cell below relies on its `movieId` and `rating` columns.
ratings_csv_path = '../ratings.csv'
ratings_data = pd.read_csv(filepath_or_buffer=ratings_csv_path)

In [5]:
# Collapse the individual ratings into one row per movie.
#   rating  -> exact mean of all ratings for that movieId
#   average -> the same mean rounded to two decimals (convenience column)
rating_by_movie = ratings_data.groupby('movieId')['rating']
computed_ratings = (
    rating_by_movie.mean()
    .reset_index()
    .assign(average=lambda frame: frame['rating'].round(2))
)

       movieId    rating  average
0            1  3.888157     3.89
1            2  3.236953     3.24
2            3  3.175550     3.18
3            4  2.875713     2.88
4            5  3.079565     3.08
...        ...       ...      ...
45110   176267  4.000000     4.00
45111   176269  3.500000     3.50
45112   176271  5.000000     5.00
45113   176273  1.000000     1.00
45114   176275  3.000000     3.00

[45115 rows x 3 columns]


In [6]:
# Attach each movie's computed rating to its metadata row.
# Inner join: only movies whose `id` equals some rated `movieId` are kept.
# NOTE(review): this assumes metadata `id` and ratings `movieId` share one id
# space. If these files come from the Kaggle "The Movies Dataset" export, `id`
# is presumably a TMDB id while `movieId` is a MovieLens id (bridged by a
# separate links file) — confirm, otherwise overviews are paired with the
# wrong ratings.
merged_data = movies_metadata.merge(
    computed_ratings,
    how='inner',
    left_on='id',
    right_on='movieId',
)

In [7]:
# Process overview text and obtain embeddings.
# Hand the encoder a plain list of strings (its documented input type) rather
# than a pandas Series: integer indexing on a Series is label-based, so any
# consumer that indexes its input by position would only behave correctly
# while the Series index happens to be exactly 0..n-1.
overview_texts = merged_data['overview'].tolist()

# One embedding per overview, in the same order as the rows of merged_data.
overview_embeddings = model.encode(overview_texts)

In [8]:
# Assemble the export table: one row per merged movie, holding its id, its
# overview embedding and its mean rating.
# The embeddings are converted to nested Python lists so that each cell of the
# `overview_embedding` column holds one whole vector.
embedding_rows = overview_embeddings.tolist()
paired_columns = {
    'movieId': merged_data['movieId'],
    'overview_embedding': embedding_rows,
    # `rating` is the unrounded per-movie mean (not the rounded `average` column).
    'average_rating': merged_data['rating'],
}
paired_data = pd.DataFrame(paired_columns)

# Write the table next to the input files, without the DataFrame index.
# NOTE(review): each embedding list is presumably written as text inside a
# single CSV cell, so readers will have to parse it back into numbers — verify.
paired_data.to_csv('../paired_data.csv', index=False)