In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
%matplotlib inline
import copy
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor

In [2]:
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [3]:
movies = pd.merge(ratings, movies, on='movieId', how='left')
movies['genres'] = movies.apply(lambda k: ' '.join(k['genres'].split('|')), axis=1)
movies['tags'] = movies.apply(lambda k: ' '.join(tags[tags['movieId']==k['movieId']]['tag'].values), axis=1)
movies.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,tags
0,1,1,4.0,964982703,Toy Story (1995),Adventure Animation Children Comedy Fantasy,pixar pixar fun
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy Romance,moldy old
2,1,6,4.0,964982224,Heat (1995),Action Crime Thriller,
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery Thriller,mystery twist ending serial killer
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime Mystery Thriller,mindfuck suspense thriller tricky twist ending...


In [4]:
drop_col = ['userId','movieId','timestamp','title','rating']
count_vec = CountVectorizer()
tfidf = TfidfTransformer()
sc = StandardScaler()

In [5]:
def get_tfidf(data, feature):
    X_train_count = count_vec.fit_transform(data[feature])
    X_train_tfidf = tfidf.fit_transform(X_train_count).toarray()
    col = [col for col in range(len(count_vec.vocabulary_))]
    for k in count_vec.vocabulary_:
        col[count_vec.vocabulary_[k]] = k
    df_tfidf = pd.DataFrame(X_train_tfidf, columns=col)
    return pd.concat((data.drop(feature, 1), df_tfidf), axis=1)

def get_rec_for_user(df, model_for_rec, user_id):
    y = df[df['userId']==user_id]['rating']
    X = df[df['userId']==user_id].drop(drop_col, axis=1, inplace=False)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
    X_train = sc.fit_transform(X_train)
    model = model_for_rec
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print('USER_ID: {}\nMODEL: {}\nRMSE: {}'.format(user_id, str(model_for_rec).split('(')[0], 
                                                    np.sqrt(mean_squared_error(y_test, y_pred))))
    
    return X_test, y_pred

In [6]:
movies_tfidf = get_tfidf(movies, 'genres')
movies_tfidf = get_tfidf(movies_tfidf, 'tags')
movies_tfidf.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,action,adventure,animation,children,comedy,...,york,you,younger,your,zellweger,zither,zoe,zombie,zombies,zooey
0,1,1,4.0,964982703,Toy Story (1995),0.0,0.363885,0.549735,0.508407,0.291944,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,3,4.0,964981247,Grumpier Old Men (1995),0.0,0.0,0.0,0.0,0.582902,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,6,4.0,964982224,Heat (1995),0.515013,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Поскольку задача — предсказать оценку, которую поставит пользователь, выберем пользователя с самым большим количеством оценок и обучим модель. Далее эту модель можно использовать для конкретного пользователя на выборке.

In [7]:
USER_ID = movies.groupby(['userId'])['rating'].count().sort_values(ascending=False).index[0]

In [8]:
df_for_pred, y_pred = get_rec_for_user(movies_tfidf, DecisionTreeRegressor(), USER_ID)

df_with_pred = pd.concat([df_for_pred.reset_index(), pd.DataFrame(columns=['predicted_rating'], 
            data=y_pred)], axis=1)
df = pd.merge(movies_tfidf[['movieId', 'title', 'rating']].reset_index(),
                    df_with_pred[['index', 'predicted_rating']], how='inner', on='index')
df.drop(['index'], axis=1).head()

USER_ID: 414
MODEL: DecisionTreeRegressor
RMSE: 1.0954047624636893


Unnamed: 0,movieId,title,rating,predicted_rating
0,1,Toy Story (1995),4.0,2.5
1,5,Father of the Bride Part II (1995),2.0,2.5
2,23,Assassins (1995),2.0,4.0
3,24,Powder (1995),3.0,3.0
4,36,Dead Man Walking (1995),3.0,4.0


Добавим средние оценки пользователя и фильма.

In [9]:
user_rating = ratings.groupby('userId')['rating'].agg(['mean','median','std','var']).reset_index()
movie_rating = ratings.groupby('movieId')['rating'].agg(['mean','median','std','var']).reset_index()

In [10]:
movies_tfidf_with_mean = pd.merge(movies_tfidf, user_rating, on='userId')

movies_tfidf_with_mean.rename(columns={
    'mean': ' user_mean',
    'median': 'user_median',
    'std': 'user_std',
    'var': 'user_var'
}, inplace=True)

movies_tfidf_with_mean = pd.merge(movies_tfidf_with_mean, movie_rating, on='movieId')

movies_tfidf_with_mean.rename(columns={
    'mean': ' movie_mean',
    'median': 'movie_median',
    'std': 'movie_std',
    'var': 'movie_var'
}, inplace=True)

movies_tfidf_with_mean.fillna(0, inplace=True)

In [11]:
movies_tfidf_with_mean.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,action,adventure,animation,children,comedy,...,zombies,zooey,user_mean,user_median,user_std,user_var,movie_mean,movie_median,movie_std,movie_var
0,1,1,4.0,964982703,Toy Story (1995),0.0,0.363885,0.549735,0.508407,0.291944,...,0.0,0.0,4.366379,5.0,0.800048,0.640077,3.92093,4.0,0.834859,0.69699
1,5,1,4.0,847434962,Toy Story (1995),0.0,0.363885,0.549735,0.508407,0.291944,...,0.0,0.0,3.636364,4.0,0.990441,0.980973,3.92093,4.0,0.834859,0.69699
2,7,1,4.5,1106635946,Toy Story (1995),0.0,0.363885,0.549735,0.508407,0.291944,...,0.0,0.0,3.230263,3.5,1.329594,1.76782,3.92093,4.0,0.834859,0.69699
3,15,1,2.5,1510577970,Toy Story (1995),0.0,0.363885,0.549735,0.508407,0.291944,...,0.0,0.0,3.448148,3.5,1.133404,1.284605,3.92093,4.0,0.834859,0.69699
4,17,1,4.5,1305696483,Toy Story (1995),0.0,0.363885,0.549735,0.508407,0.291944,...,0.0,0.0,4.209524,4.0,0.50849,0.258562,3.92093,4.0,0.834859,0.69699


In [12]:
df_for_pred, y_pred = get_rec_for_user(movies_tfidf_with_mean, DecisionTreeRegressor(), USER_ID)

df_with_pred = pd.concat([df_for_pred.reset_index(), pd.DataFrame(columns=['predicted_rating'], 
            data=y_pred)], axis=1)
df = pd.merge(movies_tfidf_with_mean[['movieId', 'title', 'rating']].reset_index(),
                    df_with_pred[['index', 'predicted_rating']], how='inner', on='index')
df.drop(['index'], axis=1).head()

USER_ID: 414
MODEL: DecisionTreeRegressor
RMSE: 1.0654506958021028


Unnamed: 0,movieId,title,rating,predicted_rating
0,47,Seven (a.k.a. Se7en) (1995),4.0,3.0
1,110,Braveheart (1995),5.0,3.0
2,223,Clerks (1994),5.0,3.0
3,296,Pulp Fiction (1994),5.0,3.0
4,441,Dazed and Confused (1993),4.0,3.0


RMSE улучшился.