## CS 533 Group Project

In [1]:
# Import the required tools and libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the movie catalogue (movieId, title, genres) from the ml-25m folder
# and preview the first few rows.
movies_path = './ml-25m/movies.csv'
movies_df = pd.read_csv(movies_path)
movies_df.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
# Read the per-user ratings (userId, movieId, rating, timestamp) from the
# ml-25m folder and preview the first few rows.
ratings_path = './ml-25m/ratings.csv'
ratings_df = pd.read_csv(ratings_path)
ratings_df.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [4]:
# Read the user-applied tags (userId, movieId, tag, timestamp) from the
# ml-25m folder and preview the first few rows.
tags_path = './ml-25m/tags.csv'
tags_df = pd.read_csv(tags_path)
tags_df.head(n=5)

Unnamed: 0,userId,movieId,tag,timestamp
0,3,260,classic,1439472355
1,3,260,sci-fi,1439472256
2,4,1732,dark comedy,1573943598
3,4,1732,great dialogue,1573943604
4,4,7569,so bad it's good,1573943455


Merging the ratings and movies datasets row by row would not work: the ratings dataset is too large, and the result would contain far too many rating–movie combinations for the scope of this project.

To merge movies and ratings, we first need a more concise summary of the ratings that can be added to the movies dataset.

We can do this by grouping the ratings by movie ID, then getting some statistics from the groups and ratings such as mean, median, mode, min, max, the quartiles, and more.


In [21]:
# Summary statistics over every numeric column of the raw ratings table,
# used to check the overall size of the data and the range of `rating`
# before aggregating per movie.
ratings_df.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,25000100.0,25000100.0,25000100.0,25000100.0
mean,81189.28,21387.98,3.533854,1215601000.0
std,46791.72,39198.86,1.060744,226875800.0
min,1.0,1.0,0.5,789652000.0
25%,40510.0,1196.0,3.0,1011747000.0
50%,80914.0,2947.0,3.5,1198868000.0
75%,121557.0,8623.0,4.0,1447205000.0
max,162541.0,209171.0,5.0,1574328000.0


In [38]:
# Build one row of descriptive rating statistics per movie.
#
# The ratings are grouped by movieId once and that groupby object is reused
# for every statistic. Each intermediate result stays indexed by movieId, so
# columns are combined by index alignment rather than by row position.
ratings_by_movie = ratings_df.groupby('movieId')['rating']

# Core statistics. `std` and `var` are NaN for movies with a single rating.
ratings_stats = ratings_by_movie.agg(['mean', 'median', 'min', 'max', 'std', 'var', 'count'])

# All three quartiles in a single pass; unstack() turns the quantile level
# (0.25, 0.50, 0.75) into one column per quantile.
rating_quartiles = ratings_by_movie.quantile([0.25, 0.50, 0.75]).unstack()
rating_quartiles.columns = ['q25', 'q50', 'q75']
ratings_stats = ratings_stats.join(rating_quartiles)

# Series.mode() returns the modes in ascending order, so .iat[0] picks the
# smallest rating when several values tie for most frequent.
ratings_stats['mode'] = ratings_by_movie.agg(lambda x: x.mode().iat[0])

# Prefix every statistic with 'ratings_' so the columns stay identifiable
# after joining onto the movies table. movieId is still the index here, so
# it is not renamed; reset_index() then restores it as the first column.
rename_cols_dict = {col: 'ratings_' + str(col) for col in ratings_stats.columns}
ratings_stats = ratings_stats.rename(columns=rename_cols_dict).reset_index()
ratings_stats

Unnamed: 0,movieId,ratings_mean,ratings_median,ratings_min,ratings_max,ratings_std,ratings_var,ratings_count,ratings_q25,ratings_q50,ratings_q75,ratings_mode
0,1,3.893708,4.0,0.5,5.0,0.921552,0.849258,57309,3.5,4.0,4.5,4.0
1,2,3.251527,3.0,0.5,5.0,0.959851,0.921315,24228,3.0,3.0,4.0,3.0
2,3,3.142028,3.0,0.5,5.0,1.008443,1.016957,11804,3.0,3.0,4.0,3.0
3,4,2.853547,3.0,0.5,5.0,1.108531,1.228841,2523,2.0,3.0,4.0,3.0
4,5,3.058434,3.0,0.5,5.0,0.996611,0.993234,11714,2.5,3.0,4.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...
59042,209157,1.500000,1.5,1.5,1.5,,,1,1.5,1.5,1.5,1.5
59043,209159,3.000000,3.0,3.0,3.0,,,1,3.0,3.0,3.0,3.0
59044,209163,4.500000,4.5,4.5,4.5,,,1,4.5,4.5,4.5,4.5
59045,209169,3.000000,3.0,3.0,3.0,,,1,3.0,3.0,3.0,3.0


Now that we have some basic descriptive stats for ratings, we can join these with the movies dataset on movie ID.

In [39]:
# Attach the per-movie rating statistics to the movies table.
#
# This cell overwrites movies_df, so any statistic columns left over from a
# previous run are dropped first. Without that, re-running the cell would
# merge the statistics a second time and produce '_x' / '_y' suffixed
# duplicates. errors='ignore' makes the drop a no-op on the first run.
#
# how="inner" keeps only movies that have at least one rating.
ratings_stat_cols = [col for col in ratings_stats.columns if col != 'movieId']
movies_df = pd.merge(
    movies_df.drop(columns=ratings_stat_cols, errors='ignore'),
    ratings_stats,
    on="movieId",
    how="inner",
)
movies_df.head()

Unnamed: 0,movieId,title,genres,ratings_mean,ratings_median,ratings_min,ratings_max,ratings_std,ratings_var,ratings_count,ratings_q25,ratings_q50,ratings_q75,ratings_mode
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.893708,4.0,0.5,5.0,0.921552,0.849258,57309,3.5,4.0,4.5,4.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,3.251527,3.0,0.5,5.0,0.959851,0.921315,24228,3.0,3.0,4.0,3.0
2,3,Grumpier Old Men (1995),Comedy|Romance,3.142028,3.0,0.5,5.0,1.008443,1.016957,11804,3.0,3.0,4.0,3.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,2.853547,3.0,0.5,5.0,1.108531,1.228841,2523,2.0,3.0,4.0,3.0
4,5,Father of the Bride Part II (1995),Comedy,3.058434,3.0,0.5,5.0,0.996611,0.993234,11714,2.5,3.0,4.0,3.0
