In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from datetime import datetime
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [3]:
# Load the three MovieLens tables from the working directory, in the order
# movies, ratings, tags.
movies, ratings, tags = map(
    pd.read_csv,
    ('movies.csv', 'ratings.csv', 'tags.csv'),
)

In [4]:
# Preview the movies table: movieId, title and pipe-separated genres.
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [5]:
# Preview the ratings table: userId, movieId, rating and timestamp.
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [6]:
# Preview the tags table: userId, movieId, free-text tag and timestamp.
tags

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200
...,...,...,...,...
3678,606,7382,for katie,1171234019
3679,606,7936,austere,1173392334
3680,610,3265,gun fu,1493843984
3681,610,3265,heroic bloodshed,1493843978


In [7]:
# One row per tag event. Left joins keep every tag row: first attach the same
# user's rating of that movie (NaN when the user tagged without rating), then
# the movie's title and genres. Both inputs of the first join carry a
# `timestamp` column, so pandas suffixes them as timestamp_x / timestamp_y.
data = (
    tags
    .merge(ratings, how='left', on=['movieId', 'userId'])
    .merge(movies, how='left', on='movieId')
)

In [8]:
# Discard the two merged timestamps and the title; the remaining columns are
# userId, movieId, tag, rating and genres.
cleaned_data = data.drop(['timestamp_x', 'timestamp_y', 'title'], axis='columns')

In [35]:
# Per-user rating statistics, indexed by userId, each a one-column DataFrame
# named `rating`. The rating column is selected explicitly because
# cleaned_data also holds the string columns `tag` and `genres`: aggregating
# the whole frame raises TypeError on pandas >= 2.0 (older versions silently
# dropped those columns). NaN ratings are ignored by mean/median.
mean_user_rating = cleaned_data.groupby('userId')[['rating']].mean()
median_user_rating = cleaned_data.groupby('userId')[['rating']].median()

In [9]:
def change_string(s):
    """Turn a pipe-separated label string into space-separated single tokens.

    Each '|'-delimited part has its spaces and hyphens removed so that a
    multi-word label (e.g. 'Sci-Fi', 'will ferrell') becomes one token; the
    parts are then joined with single spaces.
    """
    compact_parts = [
        part.replace(' ', '').replace('-', '')
        for part in s.split('|')
    ]
    return ' '.join(compact_parts)

In [10]:
def tfidf_transform(d):
    """Fit a default TfidfVectorizer on the documents in ``d``.

    Returns a 2-tuple: the fitted TF-IDF matrix for ``d`` and the vectorizer's
    feature names (from ``get_feature_names_out``), in that order.
    """
    vectorizer = TfidfVectorizer()
    weights = vectorizer.fit_transform(d)
    feature_names = vectorizer.get_feature_names_out()
    return weights, feature_names

In [11]:
def rmse(X, y, test_size=.2, random_state=42):
    """Fit a linear regression on a train split and return its hold-out RMSE.

    Parameters
    ----------
    X : feature matrix accepted by ``train_test_split`` / ``LinearRegression``.
    y : target values aligned row-for-row with ``X``.
    test_size : passed to ``train_test_split``; defaults to the previously
        hard-coded 0.2.
    random_state : seed passed to ``train_test_split``; defaults to the
        previously hard-coded 42, so existing calls give the same split.

    Returns
    -------
    The square root of the mean squared error between the held-out targets
    and the model's predictions for them.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    model = LinearRegression().fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))

In [12]:
# Collapse every tag into a single token string, one entry per row of
# cleaned_data.
tags_ = list(map(change_string, cleaned_data.tag.values))

In [13]:
# Number of cleaned tag strings (one per row of cleaned_data).
len(tags_)

3683

In [14]:
# Convert each row's pipe-separated genre list into space-separated tokens,
# one entry per row of cleaned_data.
genres_ = list(map(change_string, cleaned_data.genres.values))

In [15]:
# Number of cleaned genre strings (one per row of cleaned_data).
len(genres_)

3683

In [16]:
# Fit TF-IDF on the cleaned tag strings: returns the weight matrix and the
# feature names used as column labels below.
tags_tfidf, tag_names = tfidf_transform(tags_)

In [17]:
# Densify the tag TF-IDF matrix into a DataFrame with one column per term.
# NOTE(review): this rebinds tags_tfidf from the matrix to a DataFrame, so
# re-running only this cell fails (a DataFrame has no .toarray()); re-run the
# tfidf_transform(tags_) cell first.
tags_tfidf = pd.DataFrame(tags_tfidf.toarray(), columns=tag_names)

In [37]:
# Inspect the tag TF-IDF frame: one row per tag event, one column per term.
tags_tfidf

Unnamed: 0,06oscarnominatedbestmovieanimation,1900s,1920s,1950s,1960s,1970s,1980s,1990s,2001like,2danimation,...,worldwari,worldwarii,writing,wrongfulimprisonment,wry,youngermen,zither,zoekazan,zombies,zooeydeschanel
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3678,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3679,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3680,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3681,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# Fit TF-IDF on the cleaned genre strings: returns the weight matrix and the
# feature names used as column labels below.
genres_tfidf, genre_name = tfidf_transform(genres_)

In [20]:
# Densify the genre TF-IDF matrix into a DataFrame with one column per genre.
# NOTE(review): this rebinds genres_tfidf from the matrix to a DataFrame, so
# re-running only this cell fails (a DataFrame has no .toarray()); re-run the
# tfidf_transform(genres_) cell first.
genres_tfidf = pd.DataFrame(genres_tfidf.toarray(), columns=genre_name) 

In [38]:
# Inspect the genre TF-IDF frame: one row per tag event, one column per genre.
genres_tfidf

Unnamed: 0,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,filmnoir,horror,imax,musical,mystery,nogenreslisted,romance,scifi,thriller,war,western
0,0.000000,0.0,0.0,0.0,1.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0
1,0.000000,0.0,0.0,0.0,1.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0
2,0.000000,0.0,0.0,0.0,1.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0
3,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,1.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0
4,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,1.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3678,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.371290,0.0,0.0,0.0,0.0,0.0,0.766292,0.0,0.0,0.0,0.524348,0.000000,0.0
3679,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.355506,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.934674,0.0
3680,0.556682,0.0,0.0,0.0,0.0,0.566816,0.0,0.350958,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.495634,0.000000,0.0
3681,0.556682,0.0,0.0,0.0,0.0,0.566816,0.0,0.350958,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.495634,0.000000,0.0


In [22]:
# Target: the rating attached to each tag row.
# NOTE(review): rows whose (userId, movieId) pair has no rating come out of
# the left merge as NaN and are turned into a target of 0 here, which the
# model is then trained and scored on. Confirm whether those rows should be
# dropped from both X and y instead.
y = cleaned_data.rating.fillna(0)

In [51]:
# Feature matrix: userId, the per-row tag and genre TF-IDF weights (aligned on
# the row index) and each user's mean / median rating; remaining NaNs become 0.
# NOTE(review): the per-user statistics are computed from the same
# cleaned_data rows whose rating is the target, before rmse() splits off the
# test set, so they leak target information. TODO: confirm whether they should
# be derived from the training split only.
X = (
    cleaned_data.drop(columns=['rating', 'movieId', 'tag', 'genres'])
    .merge(tags_tfidf, how='left', left_index=True, right_index=True)
    .merge(genres_tfidf, how='left', left_index=True, right_index=True)
    # Name each statistic before merging instead of renaming the automatic
    # rating_x / rating_y suffixes afterwards: those suffixes only appear when
    # exactly the two statistics collide, so a TF-IDF column that happened to
    # be called 'rating' would silently shift the labels.
    .merge(mean_user_rating.rename(columns={'rating': 'mean_rating'}), how='left', on='userId')
    .merge(median_user_rating.rename(columns={'rating': 'median_rating'}), how='left', on='userId')
    .fillna(0)
)

## RMSE

In [52]:
# Hold-out RMSE of a linear regression trained on the assembled features.
rmse(X, y)

1.0837729214615714