### Importing Libraries

In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

pd.set_option('display.max_colwidth', 150)

### Loading Datasets

Loading movies.csv and ratings.csv datasets from MovieLens small dataset

In [8]:
movies_df = pd.read_csv("ml-latest/movies.csv")
movies_df = movies_df[movies_df['genres'] != '(no genres listed)']
movies_df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
86532,288967,State of Siege: Temple Attack (2021),Action|Drama
86533,288971,Ouija Japan (2021),Action|Horror
86534,288975,The Men Who Made the Movies: Howard Hawks (1973),Documentary
86535,288977,Skinford: Death Sentence (2023),Crime|Thriller


In [5]:
ratings_df = pd.read_csv("ml-latest/ratings.csv")

# Dropping timestamp
ratings_df.drop(columns='timestamp', inplace=True) 

#ratings_df.to_csv('ratings_df.csv', index=False)

ratings_df = ratings_df.sample(frac=0.2)
ratings_df.to_csv('csv_files/ratings_df_big.csv', index=False)
ratings_df

Unnamed: 0,userId,movieId,rating
25573816,249359,1272,4.0
32510145,317773,3190,1.0
45241,461,191359,3.5
29497186,288134,122892,5.0
18502502,181125,2174,2.5
...,...,...,...
28904890,282413,628,4.0
10999913,108430,153,5.0
3670346,35692,6377,5.0
10361472,101974,169864,3.0


### Movies Descriptive Dataframe

In [None]:
# Feature Engeneering
# creating year column
movies_df['year'] = movies_df['title'].str.strip().apply(lambda x:x[-5:-1])
movies_without_date = movies_df.loc[movies_df['year'].str.isdigit() == False]
movies_without_date.loc[:,'year'] = '2006'
movies_df = movies_df[movies_df['year'].str.isdigit()]
movies_df = pd.concat([movies_df, movies_without_date], axis=0)
movies_df.loc[:,'year'] = movies_df['year'].astype('int')

# Adding num_ratings and avg_movie_rate
num_ratings = ratings_df.groupby('movieId').size().to_frame()
avg_rating = ratings_df.groupby('movieId')['rating'].mean().round(2).to_frame()
avg_rating.rename(columns={'rating':'avg_movie_rating'}, inplace=True)

movies_df = pd.merge(movies_df, num_ratings, on='movieId')
movies_df = pd.merge(movies_df, avg_rating, on='movieId')
movies_df.rename(columns={0:'num_ratings'}, inplace=True)

# genres one hot encoding
movies_df['genres'] = movies_df['genres'].str.split('|')

mlb = MultiLabelBinarizer()
mlb.fit(movies_df['genres'])

genres_one_hot = pd.DataFrame(mlb.transform(movies_df['genres']), columns = mlb.classes_, index=movies_df.index)
movies_encoded_by_genre = pd.concat([movies_df[['movieId','title','num_ratings','avg_movie_rating','year']],genres_one_hot], axis=1)
movies_encoded_by_genre.drop(columns=['Film-Noir', 'IMAX'], inplace=True)

movies_encoded_by_genre

Unnamed: 0,movieId,title,num_ratings,avg_movie_rating,year,Action,Adventure,Animation,Children,Comedy,...,Drama,Fantasy,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),15521,3.90,1995,0,1,1,1,1,...,0,1,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),6149,3.29,1995,0,1,0,1,0,...,0,1,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),3103,3.17,1995,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),610,2.90,1995,0,0,0,0,1,...,1,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),3129,3.08,1995,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50874,281920,When It Melts,1,3.50,2006,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
50875,282085,Spoonful of Sugar,1,4.00,2006,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
50876,283477,Baby Driver 2,1,2.50,2006,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
50877,283571,Limbo,1,4.00,2006,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0


In [7]:
movies_encoded_by_genre.to_csv('csv_files/movies_encoded_by_genre_big.csv', index=False)

### Users Descriptive Dataframe

In [None]:
ratings_encoded = pd.merge(ratings_df,movies_encoded_by_genre.drop(columns=['title','num_ratings','avg_movie_rating','year']),on='movieId')

for genre in mlb.classes_:
    ratings_encoded[genre] = ratings_encoded[genre]*ratings_encoded['rating']
ratings_encoded.drop(columns='movieId',inplace=True)

user_ratings_by_genre = ratings_encoded.groupby('userId').agg('mean').round(2)
user_ratings_by_genre.rename(columns={'rating':'avg_user_rating'},inplace=True)
user_ratings_by_genre.drop(columns=['Film-Noir', 'IMAX'], inplace=True)
user_ratings_by_genre = user_ratings_by_genre.reset_index()

user_ratings_by_genre

Unnamed: 0,userId,avg_user_rating,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,3.73,1.55,0.68,0.00,0.00,1.00,0.55,0.00,2.09,...,0.0,0.45,0.45,0.00,0.73,1.00,1.55,0.95,0.36,0.00
1,2,3.68,1.53,1.00,0.21,0.21,1.21,0.63,0.00,1.89,...,0.0,0.00,0.00,0.21,0.00,1.00,0.32,0.79,0.68,0.26
2,3,5.00,2.50,0.00,0.00,0.00,0.00,2.50,0.00,5.00,...,0.0,0.00,0.00,0.00,0.00,0.00,0.00,2.50,1.25,1.25
3,4,4.00,0.80,2.40,2.40,2.40,2.40,0.80,0.00,2.60,...,0.0,0.00,0.80,0.00,0.80,1.70,0.80,0.80,0.00,0.00
4,5,3.56,1.06,0.00,0.00,0.19,1.12,0.50,0.00,2.56,...,0.0,0.38,0.00,0.19,0.25,0.25,0.00,0.88,0.25,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305692,330971,4.36,1.86,2.05,0.45,0.45,0.82,1.32,0.00,2.09,...,0.0,1.14,0.00,0.00,0.00,0.27,0.77,1.64,0.00,0.00
305693,330972,3.11,0.21,0.32,0.00,0.32,1.89,0.47,0.00,1.16,...,0.0,0.26,0.26,0.00,0.00,1.37,0.21,0.53,0.26,0.00
305694,330973,2.00,0.00,0.00,0.00,2.00,2.00,0.00,0.00,0.00,...,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
305695,330974,3.53,1.68,2.15,0.35,0.35,1.71,0.74,0.00,0.85,...,0.0,0.00,0.06,0.29,0.26,0.29,1.71,1.09,0.24,0.15


In [11]:
user_ratings_by_genre.to_csv('csv_files/user_ratings_by_genre_big.csv', index=False)