# ДЗ Рекомендательные системы на основе содержания

Использовать dataset MovieLens

Построить рекомендации (регрессия, предсказываем оценку) на фичах:

+ TF-IDF на тегах и жанрах
+ Средние оценки (+ median, variance, etc.) пользователя и фильма
+ Оценить RMSE на тестовой выборке

In [282]:
import pandas as pd
import numpy as np
from datetime import datetime

from tqdm.notebook import tqdm


from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
import statsmodels.api as sm

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import r2_score

import matplotlib.pyplot as plt

%matplotlib inline

In [283]:
links = pd.read_csv('data/movies/links.csv')
movies = pd.read_csv('data/movies/movies.csv')
ratings = pd.read_csv('data/movies/ratings.csv')
tags = pd.read_csv('data/movies/tags.csv')

Список пользователей, которые чаще всего оценивали фильм

In [284]:
ratings['userId'].value_counts().head(5)

414    2698
599    2478
474    2108
448    1864
274    1346
Name: userId, dtype: int64

In [285]:
tags['userId'].value_counts().head(5)

474    1507
567     432
62      370
599     323
477     280
Name: userId, dtype: int64

In [288]:
movies_with_tags = movies.join(tags.set_index('movieId'), on='movieId')

In [325]:
movies_with_tags.head(5)

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336.0,pixar,1139046000.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474.0,pixar,1137207000.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567.0,fun,1525286000.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,fantasy,1528844000.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,magic board game,1528844000.0


In [290]:
movies_with_tags.tag.unique().shape

(1590,)

In [291]:
movies_with_tags.dropna(inplace=True)

In [292]:
movies_with_tags.title.unique().shape

(1572,)

In [293]:
movies_all = pd.merge(movies_with_tags, ratings,  how='left', left_on=['movieId','userId'], right_on = ['movieId','userId'])

In [295]:
movies_all.dropna(inplace=True)
movies_all = movies_all.reset_index(drop=True)

Объединили фильмы, теги и рейтинги в один датафрейм и удалили строки, содержащие пустые значения. 



### Построение модели
Для сравнения построим несколько моделей  на жанрах и тегах

In [296]:
y = movies_all['rating'].astype('int')
X = movies_all

X = X.drop(['rating','timestamp_x','timestamp_y','title'], axis = 1) 


In [297]:
X.head(3)

Unnamed: 0,movieId,genres,userId,tag
0,1,Adventure|Animation|Children|Comedy|Fantasy,336.0,pixar
1,1,Adventure|Animation|Children|Comedy|Fantasy,474.0,pixar
2,1,Adventure|Animation|Children|Comedy|Fantasy,567.0,fun


In [298]:
le = LabelEncoder()
le.fit( X['genres'] )
X['genres'] = le.transform(X['genres'] )
le.fit( X['tag'] )
X['tag'] = le.transform( X['tag'] )

In [299]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=800)

In [300]:
model = linear_model.LinearRegression()
model.fit( X_train, y_train )

y_predict_ln = model.predict(X_test)
score_ln = model.score(X_test, y_test)
mse_ln = mean_squared_error(y_test, y_predict_ln)
sqrt_mse_ln = np.sqrt(mse_ln)
print("LinearRegression")
print("MSE: %.5f" % mse_ln)
print("SQ MSE: %.5f" % sqrt_mse_ln)
print("Score: %.5f" % score_ln)
print('R-squared: %.5f' % model.score(X_test, y_test))
print('Предсказанные оценки:', y_predict_ln)

LinearRegression
MSE: 0.85800
SQ MSE: 0.92628
Score: -0.00061
R-squared: -0.00061
Предсказанные оценки: [3.7371764  3.84099459 3.78109534 ... 3.67053703 3.79994205 3.8182997 ]


In [301]:
clf_rf = RandomForestClassifier(n_estimators=10, max_depth=10, min_samples_leaf=20, max_features=0.5, n_jobs=-1)
clf_rf.fit(X_train, y_train)   
y_predict_rf = clf_rf.predict(X_test)
print("RandomForestClassifier")
print("Значимость признаков:", clf_rf.feature_importances_)  

mse_rf = mean_squared_error(y_test, y_predict_rf)
sqrt_mse_rf = np.sqrt(mse_rf)


print("MSE: %.5f" % mse_rf)
print("SQ MSE: %.5f" % sqrt_mse_rf)
print('R-squared: %.5f' % clf_rf.score(X_test, y_test))

RandomForestClassifier
Значимость признаков: [0.28594174 0.15461298 0.49266359 0.06678169]
MSE: 0.70853
SQ MSE: 0.84174
R-squared: 0.63950


Добавить новые фичи - среднюю, медиану и дисперсию оценки для каждого пользователя  и каждого фильма

In [302]:
mean_user = movies_all.groupby(['userId']).mean().rating
mean_movies = movies_all.groupby(['movieId']).mean().rating

med_user = movies_all.groupby(['userId']).median().rating
med_movies = movies_all.groupby(['movieId']).median().rating

var_user = movies_all.groupby(['userId']).var().rating
var_movies = movies_all.groupby(['movieId']).var().rating

In [303]:
movies_st = movies_all.merge(mean_user, on='userId', suffixes=('', '_meanuser'))
movies_st = movies_st.merge(med_user, on='userId', suffixes=('', '_meduser'))
movies_st = movies_st.merge(var_user, on='userId', suffixes=('', '_varuser'))
movies_st = movies_st.merge(mean_movies, on='movieId', suffixes=('', '_meanmov'))
movies_st = movies_st.merge(med_movies, on='movieId', suffixes=('', '_medmov'))
movies_st = movies_st.merge(var_movies, on='movieId', suffixes=('', '_varmov'))


In [304]:
movies_st['rating_varuser'] = movies_st['rating_varuser'].fillna(0)
movies_st['rating_varmov'] = movies_st['rating_varmov'].fillna(0)

In [305]:
movies_st.head(5)

Unnamed: 0,movieId,title,genres,userId,tag,timestamp_x,rating,timestamp_y,rating_meanuser,rating_meduser,rating_varuser,rating_meanmov,rating_medmov,rating_varmov
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336.0,pixar,1139046000.0,4.0,1122227000.0,3.777778,4.0,0.444444,3.833333,4.0,0.083333
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474.0,pixar,1137207000.0,4.0,978575800.0,3.701909,4.0,0.666033,3.833333,4.0,0.083333
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567.0,fun,1525286000.0,3.5,1525286000.0,3.917824,4.0,0.649264,3.833333,4.0,0.083333
3,552,"Three Musketeers, The (1993)",Action|Adventure|Comedy|Romance,336.0,knights,1139046000.0,3.0,1120568000.0,3.777778,4.0,0.444444,3.0,3.0,0.0
4,1246,Dead Poets Society (1989),Drama,336.0,highschool,1139047000.0,4.5,1139047000.0,3.777778,4.0,0.444444,4.5,4.5,0.0


Добавим к новым фичам TF IDF по тегам и жанрам

In [306]:
tag_strings = []
ganre_strings = []
movies_list = []

for movie, group in tqdm(movies_st.groupby('title')):

    tag_strings.append(' '.join([str(s).replace(' ', '').replace('-', '').replace('|', '') for s in group.tag.values]))
    ganre_strings.append(' '.join([str(s).replace(' ', '').replace('-', '') for s in group.genres.values]))

    movies_list.append(movie)

HBox(children=(IntProgress(value=0, max=1464), HTML(value='')))




Матрица тегов

In [307]:
count_vect = CountVectorizer()
X_tag_counts = count_vect.fit_transform(tag_strings)
tfidf_transformer = TfidfTransformer()
X_tag_tfidf = tfidf_transformer.fit_transform(X_tag_counts)

Матрица жанров

In [308]:
count_vect = CountVectorizer()
X_ganre_counts = count_vect.fit_transform(ganre_strings)
tfidf_transformer = TfidfTransformer()
X_ganre_tfidf = tfidf_transformer.fit_transform(X_ganre_counts)

Объединяем все фичи в один датасет

In [309]:
from scipy.sparse import coo_matrix, hstack
tag_ganre_df = hstack([X_tag_tfidf,X_ganre_tfidf]).toarray()


In [310]:
movie_list_df = pd.DataFrame(movies_list, columns = ['title'])

In [311]:
newDf = pd.concat([movie_list_df, pd.DataFrame(tag_ganre_df)], axis = 1)

In [312]:
X_df = pd.merge(movies_st, newDf,  how='left', 
                      left_on=['title'], 
                      right_on = ['title'])

In [313]:
X_df.head(5)

Unnamed: 0,movieId,title,genres,userId,tag,timestamp_x,rating,timestamp_y,rating_meanuser,rating_meduser,...,1446,1447,1448,1449,1450,1451,1452,1453,1454,1455
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336.0,pixar,1139046000.0,4.0,1122227000.0,3.777778,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474.0,pixar,1137207000.0,4.0,978575800.0,3.701909,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567.0,fun,1525286000.0,3.5,1525286000.0,3.917824,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,552,"Three Musketeers, The (1993)",Action|Adventure|Comedy|Romance,336.0,knights,1139046000.0,3.0,1120568000.0,3.777778,4.0,...,0.0,0.0,0.0,0.0,0.0,0.474665,0.0,0.0,0.0,0.0
4,1246,Dead Poets Society (1989),Drama,336.0,highschool,1139047000.0,4.5,1139047000.0,3.777778,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [314]:
y = X_df['rating'].astype('int')
X = X_df
X = X.drop(['rating','timestamp_x','timestamp_y','title','genres','tag'], axis = 1) 

In [315]:
X.head(5)

Unnamed: 0,movieId,userId,rating_meanuser,rating_meduser,rating_varuser,rating_meanmov,rating_medmov,rating_varmov,0,1,...,1446,1447,1448,1449,1450,1451,1452,1453,1454,1455
0,1,336.0,3.777778,4.0,0.444444,3.833333,4.0,0.083333,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,474.0,3.701909,4.0,0.666033,3.833333,4.0,0.083333,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,567.0,3.917824,4.0,0.649264,3.833333,4.0,0.083333,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,552,336.0,3.777778,4.0,0.444444,3.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.474665,0.0,0.0,0.0,0.0
4,1246,336.0,3.777778,4.0,0.444444,4.5,4.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Стоим модель на новом датасете со всеми новыми фичами

In [316]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=800)

print('*********')

model = linear_model.LinearRegression()
model.fit( X_train, y_train )

y_predict_ln = model.predict(X_test)
score_ln = model.score(X_test, y_test)
mse_ln = mean_squared_error(y_test, y_predict_ln)
sqrt_mse_ln = np.sqrt(mse_ln)

print("LinearRegression")
print("MSE: %.5f" % mse_ln)
print("SQ MSE: %.5f" % sqrt_mse_ln)
print("Score: %.5f" % score_ln)
print('R-squared: %.5f' %  r2_score(y_test, y_predict_ln))
print('Предсказанные оценки:', y_predict_ln)

print('*********')
print('LinearRegression Ridge L1')

model = linear_model.Ridge(alpha=0.87)
model.fit( X_train, y_train )

y_predict_ln = model.predict(X_test)
score_ln = model.score(X_test, y_test)
mse_ln = mean_squared_error(y_test, y_predict_ln)
sqrt_mse_ln = np.sqrt(mse_ln)


print("MSE: %.5f" % mse_ln)
print("SQ MSE: %.5f" % sqrt_mse_ln)
print("Score: %.5f" % score_ln)
print('R-squared: %.5f' % r2_score(y_test, y_predict_ln))
print('Предсказанные оценки:', y_predict_ln)


print('*********')
print('LinearRegression Lasso L2')

model = linear_model.Lasso(alpha=0.05)
model.fit( X_train, y_train )

y_predict_ln = model.predict(X_test)
score_ln = model.score(X_test, y_test)
mse_ln = mean_squared_error(y_test, y_predict_ln)
sqrt_mse_ln = np.sqrt(mse_ln)


print("MSE: %.5f" % mse_ln)
print("SQ MSE: %.5f" % sqrt_mse_ln)
print("Score: %.5f" % score_ln)
print('R-squared: %.5f' % r2_score(y_test, y_predict_ln))
print('Предсказанные оценки:', y_predict_ln)



*********
LinearRegression
MSE: 1087319688710.89832
SQ MSE: 1042746.22450
Score: -1173470646402.57593
R-squared: -1173470646402.57593
Предсказанные оценки: [2.21203521 3.00000003 3.99999999 ... 4.82917252 4.00000001 3.17590617]
*********
LinearRegression Ridge L1
MSE: 0.10765
SQ MSE: 0.32810
Score: 0.88382
R-squared: 0.88382
Предсказанные оценки: [2.2861887  3.05068596 3.88846527 ... 4.70255784 4.06128264 3.1702038 ]
*********
LinearRegression Lasso L2
MSE: 0.12416
SQ MSE: 0.35236
Score: 0.86600
R-squared: 0.86600
Предсказанные оценки: [2.38077743 3.27212232 3.75767974 ... 4.32653    4.25864867 3.18192721]


Сравение моделей на новом наборе показывает, что необходимо использовать регуляризацию.  

### Рекомендации 

Для рекомендации фильмов пользователям выборку делим на две части: по тем фильмам, которые пользователь оценил - обучаем модель, по тем, которые он не оценивал - строим предсказание оценки. 

Отбираем фильмы с самой лучшей оценкой и рекомендуем пользователю. 

In [317]:
def Rec_mov(df,user,top):
    X1_train = df[df.loc[:,'userId'] == user]   # обучаем по фильмам, которые 474 уже оценил
    X1_test = df[df.loc[:,'userId'] != user]    # предсказываем по фильмам, которые еще не видел (не оценил)

    y1_train = X1_train['rating'].astype('int')
    y1_test = X1_test['rating'].astype('int')

    X1_train = X1_train.drop(['rating','timestamp_x','timestamp_y','title','genres','tag'], axis = 1) 

    X1_test = X1_test.drop(['rating','timestamp_x','timestamp_y','title','genres','tag'], axis = 1) 

    model = linear_model.Ridge(alpha=0.7)
    model.fit( X1_train, y1_train )

    y1_predict_ln = model.predict(X1_test)
    score_ln = model.score(X1_test, y1_test)
    mse_ln = mean_squared_error(y1_test, y1_predict_ln)
    sqrt_mse_ln = np.sqrt(mse_ln)

    print("MSE: %.5f" % mse_ln)
    print("SQ MSE: %.5f" % sqrt_mse_ln)
    print("Score: %.5f" % score_ln)
    print('R-squared: %.5f' % r2_score(y1_test, y1_predict_ln))
    print('Предсказанные оценки:', y1_predict_ln)

    rez = np.argsort(y1_predict_ln)[-top:]
    recomend = []


    for i in reversed(rez):
        recomend.append([X_df['title'][i], np.round(X_df['rating_meanmov'][i]), np.round(y1_predict_ln[i],2)])

    recomend = pd.DataFrame(recomend)               
    recomend.columns = ['title', 'mean rating', 'predict user rating']
    recomend = recomend.drop_duplicates()
    
    return recomend

Построим список для пользователя 474

In [318]:
Rec_mov(X_df,474,20)

MSE: 0.38965
SQ MSE: 0.62422
Score: 0.53308
R-squared: 0.53308
Предсказанные оценки: [3.82915856 3.82915856 2.80000099 ... 4.74025456 4.74025456 4.74025456]


Unnamed: 0,title,mean rating,predict user rating
0,"Birds, The (1963)",4.0,5.08
1,"Shining, The (1980)",4.0,5.06
3,"Treasure of the Sierra Madre, The (1948)",4.0,5.06
4,Better Off Dead... (1985),3.0,5.06
5,Chinatown (1974),4.0,5.06
6,"Bridge on the River Kwai, The (1957)",4.0,5.06
7,"Femme Nikita, La (Nikita) (1990)",4.0,5.06
15,Stand by Me (1986),4.0,5.06
16,"Day the Earth Stood Still, The (1951)",4.0,5.06
17,"Lord of the Rings: The Return of the King, The...",5.0,5.04


#### Проверим: пользователю рекомендован фильм Birds, The (1963). Сравним жанры и теги рекомендованного фильма с фильмами, которым пользователь поставил максимальную оценку.

In [319]:
movies_all[movies_all.loc[:,'title'] == 'Birds, The (1963)']

Unnamed: 0,movieId,title,genres,userId,tag,timestamp_x,rating,timestamp_y
904,1333,"Birds, The (1963)",Horror|Thriller,474.0,birds,1137203000.0,4.5,1090874000.0


In [320]:
movies_all[movies_all.loc[:,'userId'] == 474].sort_values(by=['rating'])[-10:]

Unnamed: 0,movieId,title,genres,userId,tag,timestamp_x,rating,timestamp_y
589,912,Casablanca (1942),Drama|Romance,474.0,start of a beautiful friendship,1137202000.0,5.0,983032200.0
918,1356,Star Trek: First Contact (1996),Action|Adventure|Sci-Fi|Thriller,474.0,Borg,1137203000.0,5.0,974667400.0
1294,2745,"Mission, The (1986)",Drama,474.0,Missionary,1137203000.0,5.0,1046979000.0
2363,8636,Spider-Man 2 (2004),Action|Adventure|Sci-Fi|IMAX,474.0,Doc Ock,1137203000.0,5.0,1092012000.0
903,1307,When Harry Met Sally... (1989),Comedy|Romance,474.0,New York,1137203000.0,5.0,978575800.0
1295,2745,"Mission, The (1986)",Drama,474.0,Priest,1137191000.0,5.0,1046979000.0
1307,2762,"Sixth Sense, The (1999)",Drama|Horror|Mystery,474.0,ghosts,1137205000.0,5.0,979179900.0
647,928,Rebecca (1940),Drama|Mystery|Romance|Thriller,474.0,Mrs. DeWinter,1137203000.0,5.0,983032200.0
387,318,"Shawshank Redemption, The (1994)",Crime|Drama,474.0,Stephen King,1137181000.0,5.0,979179800.0
1159,2171,Next Stop Wonderland (1998),Comedy|Drama|Romance,474.0,Boston,1137203000.0,5.0,974668800.0


Построим список для пользователя 567

In [321]:
Rec_mov(X_df,567,10)

MSE: 0.30595
SQ MSE: 0.55313
Score: 0.65337
R-squared: 0.65337
Предсказанные оценки: [3.15101261 3.15101261 2.54147314 ... 4.6317915  4.6317915  4.6317915 ]


Unnamed: 0,title,mean rating,predict user rating
0,"Hello, Dolly! (1969)",2.0,4.97
1,My Life Without Me (2003),4.0,4.97
2,Luther (2003),4.0,4.97
3,Secondhand Lions (2003),4.0,4.97
4,Once Bitten (1985),2.0,4.97
5,Paper Moon (1973),4.0,4.97
6,Fantastic Four: Rise of the Silver Surfer (2007),3.0,4.9


#### Проверим: пользователю рекомендован фильм Paper Moon (1973). Сравним жанры и теги рекомендованного фильма с фильмами, которым пользователь поставил максимальную оценку.

In [322]:
movies_all[movies_all.loc[:,'title'] == 'Paper Moon (1973)']

Unnamed: 0,movieId,title,genres,userId,tag,timestamp_x,rating,timestamp_y
2113,6724,Paper Moon (1973),Comedy|Crime|Drama,474.0,In Netflix queue,1137202000.0,3.5,1181348000.0


In [323]:
movies_all[movies_all.loc[:,'userId'] == 567].sort_values(by=['rating'])[-10:]

Unnamed: 0,movieId,title,genres,userId,tag,timestamp_x,rating,timestamp_y
2912,71899,Mary and Max (2009),Animation|Comedy|Drama,567.0,friendship,1525283000.0,5.0,1525282000.0
2913,71899,Mary and Max (2009),Animation|Comedy|Drama,567.0,loneliness,1525283000.0,5.0,1525282000.0
2914,71899,Mary and Max (2009),Animation|Comedy|Drama,567.0,mental illness,1525283000.0,5.0,1525282000.0
1478,3266,Man Bites Dog (C'est arrivé près de chez vous)...,Comedy|Crime|Drama|Thriller,567.0,black comedy,1525283000.0,5.0,1525282000.0
1479,3266,Man Bites Dog (C'est arrivé près de chez vous)...,Comedy|Crime|Drama|Thriller,567.0,crazy,1525283000.0,5.0,1525282000.0
1480,3266,Man Bites Dog (C'est arrivé près de chez vous)...,Comedy|Crime|Drama|Thriller,567.0,dark,1525283000.0,5.0,1525282000.0
1481,3266,Man Bites Dog (C'est arrivé près de chez vous)...,Comedy|Crime|Drama|Thriller,567.0,dark comedy,1525283000.0,5.0,1525282000.0
2915,71899,Mary and Max (2009),Animation|Comedy|Drama,567.0,philosophical,1525283000.0,5.0,1525282000.0
2583,40491,"Match Factory Girl, The (Tulitikkutehtaan tytt...",Comedy|Drama,567.0,depression,1525282000.0,5.0,1525282000.0
3101,99764,It's Such a Beautiful Day (2012),Animation|Comedy|Drama|Fantasy|Sci-Fi,567.0,weird,1525282000.0,5.0,1525282000.0
