In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer

# Build the sentence-embedding model once for the whole notebook; the string
# below is the model identifier handed to SentenceTransformer.
model = SentenceTransformer(
    'all-MiniLM-L6-v2',
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load movies metadata from movies_metadata.csv and keep only the rows that
# can be used downstream (an overview to embed, and a usable integer id to
# join on).
movies_metadata = (
    pd.read_csv('movies_metadata.csv')
    # A row without an overview has nothing to embed.
    .dropna(subset=['overview'])
    # Parse ids defensively: a single non-numeric value in the column would
    # make a direct astype(int) raise ValueError and abort the cell. Values
    # that cannot be parsed become NaN and are dropped on the next step.
    .assign(id=lambda df: pd.to_numeric(df['id'], errors='coerce'))
    .dropna(subset=['id'])
    .assign(
        # Safe now that every remaining id is numeric.
        id=lambda df: df['id'].astype(int),
        # Use pandas' dedicated string dtype for the overview text.
        overview=lambda df: df['overview'].astype('string'),
    )
)
movies_metadata.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173


In [3]:
# Read the per-user ratings table from ratings.csv and preview the first rows
# as the cell output.
ratings_data = pd.read_csv(
    'ratings.csv',
)
ratings_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556


In [4]:
# Mean rating per movie, indexed by movieId.
mean_by_movie = ratings_data.groupby('movieId')['rating'].mean()

# Turn movieId back into a regular column, keep the exact mean in 'rating',
# and add a copy rounded to two decimals as 'average_rating'.
average_ratings = mean_by_movie.reset_index().assign(
    average_rating=lambda means: means['rating'].round(2)
)

# Show the per-movie table (movieId, rating, average_rating).
print(average_ratings)

       movieId    rating  average_rating
0            1  3.888157            3.89
1            2  3.236953            3.24
2            3  3.175550            3.18
3            4  2.875713            2.88
4            5  3.079565            3.08
...        ...       ...             ...
45110   176267  4.000000            4.00
45111   176269  3.500000            3.50
45112   176271  5.000000            5.00
45113   176273  1.000000            1.00
45114   176275  3.000000            3.00

[45115 rows x 3 columns]


In [5]:
# Merge movies metadata with average ratings.
#
# The two tables do not appear to share an id space: the metadata `id` for
# Toy Story is 862, while the ratings table numbers movies from movieId 1.
# Joining `id` directly against `movieId` therefore attaches ratings to
# unrelated movies. The join is routed through links.csv, which maps each
# ratings `movieId` to the metadata id (`tmdbId`).
# NOTE(review): assumes the files follow the layout of Kaggle's "The Movies
# Dataset" and that links.csv sits next to ratings.csv - confirm.
movie_links = (
    pd.read_csv('links.csv', usecols=['movieId', 'tmdbId'])
    # Entries without a tmdbId cannot be matched to the metadata table.
    .dropna(subset=['tmdbId'])
    # tmdbId is read as float when the column has gaps; match the int `id`.
    .astype({'tmdbId': int})
)

# Attach the metadata-side id to every per-movie rating row.
ratings_by_tmdb_id = average_ratings.merge(movie_links, on='movieId', how='inner')

# Inner join: keep only movies that have both metadata and ratings. The result
# still carries `movieId`, `rating` and `average_rating`, plus `tmdbId`.
merged_data = pd.merge(
    movies_metadata,
    ratings_by_tmdb_id,
    left_on='id',
    right_on='tmdbId',
    how='inner',
)

In [6]:
# Process overview text and obtain embeddings.
#
# encode() is given a plain list of str instead of the pandas Series: integer
# indexing on a Series is a label lookup, so passing the Series only works
# while its index happens to be a fresh 0..n-1 range. A list keeps the
# embeddings in row order regardless of how merged_data was built.
overview_texts = merged_data['overview'].tolist()
overview_embeddings = model.encode(overview_texts)

In [7]:
# Build the export table: one row per merged movie, holding its movieId, its
# overview embedding and its rating.
# - each embedding is stored as a Python list, which to_csv writes out in its
#   string form;
# - the 'average_rating' column is filled from merged_data['rating'].
paired_data = merged_data[['movieId']].assign(
    overview_embedding=overview_embeddings.tolist(),
    average_rating=merged_data['rating'],
)

# Persist the table without the DataFrame index.
paired_data.to_csv('paired_data.csv', index=False)

: 