1) Использовать dataset MovieLens

2) Построить рекомендации (регрессия, предсказываем оценку) на фичах:

. TF-IDF на тегах и жанрах

. Средние оценки (+ median, variance, etc.) пользователя и фильма

3) Оценить RMSE на тестовой выборке

In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

%matplotlib inline

In [163]:
p=print

In [2]:
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [18]:
movies.head(1)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [19]:
ratings.head(1)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703


In [None]:
# добавим несколько фич

In [47]:
# количество оценок по фильмам
movies_rating_count = ratings.groupby('movieId')[['rating']].count().reset_index()

In [43]:
# средняя оценка фильма по пользователям
movies_avg_rating = ratings.groupby('movieId')[['rating']].mean().reset_index()

In [52]:
# медианная оценка фильма по пользователям
movies_rating_med = ratings.groupby('movieId', sort=False)[['rating']].median().reset_index()

In [58]:
# количество тэгов для каждого фильма
movies_tag_count = tags.groupby('movieId')[['tag']].count().reset_index()

In [72]:
# количество всех тегов
join_tags = tags.groupby('movieId')[['tag']].agg(' '.join).reset_index()

In [81]:
def pull_years(row):
    try:
        return int(row[-5:-1])
    except:
        return 0

In [None]:
# заполним датасет фичами

In [85]:
movies['year'] = movies['title'].apply(pull_years)

In [45]:
movies['mean_rating'] = movies_avg_rating['rating']

In [48]:
movies['rating_count'] = movies_rating_count['rating']

In [54]:
movies['median_rating'] = movies_rating_med['rating']

In [60]:
movies['tag_count'] = movies_tag_count['tag']

In [73]:
movies['tags'] = join_tags['tag']

In [91]:
movies['genres'] = movies['genres'].str.lower().str.replace('|', ' ')

In [None]:
# проведем обучение над теми фильмами , которые содержат теги

In [110]:
movies_main = movies.copy()

In [111]:
movies_main.dropna(inplace=True)

In [112]:
movies_main.head()

Unnamed: 0,movieId,title,genres,mean_rating,rating_count,median_rating,tag_count,tags,year
0,1,Toy Story (1995),adventure animation children comedy fantasy,3.92093,215.0,4.0,3.0,pixar pixar fun,1995
1,2,Jumanji (1995),adventure children fantasy,3.431818,110.0,3.0,4.0,fantasy magic board game Robin Williams game,1995
2,3,Grumpier Old Men (1995),comedy romance,3.259615,52.0,4.0,2.0,moldy old,1995
3,4,Waiting to Exhale (1995),comedy drama romance,2.357143,7.0,4.0,2.0,pregnancy remake,1995
4,5,Father of the Bride Part II (1995),comedy,3.071429,49.0,4.5,1.0,remake,1995


In [113]:
# выделим два корпуса и применим преобразование TF*IDF
korpus_1 = movies_main['tags']
korpus_2 = movies_main['genres']

vect1 = CountVectorizer()
vect2 = CountVectorizer()


tag_counts = vect1.fit_transform(korpus_1)
genre_counts = vect2.fit_transform(korpus_2)


tfidf_transformer1 = TfidfTransformer()
tfidf_transformer2 = TfidfTransformer()


tag_tfidf = tfidf_transformer1.fit_transform(tag_counts)
genre_tfidf = tfidf_transformer2.fit_transform(genre_counts)


In [116]:
genres = pd.DataFrame(genre_tfidf.toarray(), columns=vect2.get_feature_names())

In [120]:
tags = pd.DataFrame(tag_tfidf.toarray(), columns=vect1.get_feature_names())

In [209]:
movies_main.shape,genres.shape,tags.shape

((1572, 9), (1572, 21), (1572, 1744))

In [258]:
movies_final = pd.concat([movies_main,genres],axis=1)
movies_final.shape

(1572, 30)

In [259]:
movies_final.shape

(1572, 30)

In [260]:
movies_final = pd.concat([movies_final,tags], axis=1)

In [261]:
movies_final.drop(columns=['title','genres','tags'],inplace=True)

In [262]:
movies_final.head(1)

Unnamed: 0,movieId,mean_rating,rating_count,median_rating,tag_count,year,action,adventure,animation,children,...,york,you,younger,your,zellweger,zither,zoe,zombie,zombies,zooey
0,1,3.92093,215.0,4.0,3.0,1995,0.0,0.409764,0.574977,0.425991,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [263]:
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor

In [264]:
y = movies_final['mean_rating']

In [265]:
X = movies_final.drop(columns='mean_rating')

In [266]:
from sklearn.model_selection import train_test_split

In [267]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = .2)

In [268]:
lasso = Lasso()
lasso.fit(X_train, y_train)

random_f_reg = RandomForestRegressor(n_estimators=100)
random_f_reg.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [269]:
y_pred_lasso = lasso.predict(X_test)
y_pred_rfr = random_f_reg.predict(X_test)

In [270]:
RMSE_lasso = np.sqrt(mean_squared_error(y_test, y_pred_lasso))
RMSE_random_f_reg = np.sqrt(mean_squared_error(y_test, y_pred_rfr))

In [271]:
p(f'RMSE Lasso = {RMSE_lasso:.2f}')

RMSE Lasso = 0.65


In [272]:
p(f'RMSE Random Forest = {RMSE_random_f_reg:.2f}')

RMSE Random Forest = 0.63


In [273]:
# что имеем
pd.DataFrame({'y_test' : y_test, 'y_Random_F' : y_pred_rfr,'y_Lasso':y_pred_lasso}).head(20)

Unnamed: 0,y_test,y_Random_F,y_Lasso
377,2.75,2.364714,3.20256
960,4.127907,3.888497,3.36362
744,4.1,3.806985,3.450532
839,4.1,3.675699,3.270692
831,3.405405,3.621086,3.31001
762,3.0,2.864667,3.169634
330,2.761905,3.44355,3.251694
759,3.0,2.855301,3.158703
1081,4.5,3.242208,3.146762
1178,4.5,2.994965,3.136449


In [None]:
# попробуем XGBRegressor

In [274]:
from xgboost import XGBRegressor

In [275]:
xgbreg = XGBRegressor(max_depth=4,learning_rate=0.2,n_estimators=1000)

In [None]:
# xgboost не работвет с дубликатами

In [277]:
movies_final_xgb = movies_final.copy()

In [278]:
movies_final_xgb = movies_final_xgb.loc[:,~movies_final_xgb.columns.duplicated()]

In [279]:
movies_final_xgb.columns.duplicated().sum()

0

In [280]:
y = movies_final_xgb['mean_rating']

In [281]:
X = movies_final_xgb.drop(columns='mean_rating')

In [282]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = .2)

In [283]:
xgbreg.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.2, max_delta_step=0,
       max_depth=4, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [284]:
y_pred_xgbreg = xgbreg.predict(X_test)

In [285]:
RMSE_xgbreg = np.sqrt(mean_squared_error(y_test, y_pred_xgbreg))

In [286]:
p(f'RMSE XGBRegressor = {RMSE_xgbreg:.2f}')

RMSE XGBRegressor = 0.68


In [287]:
pd.DataFrame({'y_test' : y_test, 'y_XGBRegressor' : y_pred_xgbreg}).head(10)

Unnamed: 0,y_test,y_XGBRegressor
1363,1.944444,3.328979
1539,5.0,3.404861
139,2.833333,3.251162
774,2.9,2.900802
1098,3.0,2.789423
769,2.9375,3.313422
683,3.0,3.742869
5,3.946078,3.634551
761,2.5,3.131827
1204,2.461538,3.143906


In [199]:
# из всех моделей лучший результат показал RandomForestRegressor