In [41]:
import pandas as pd
import numpy as np

from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import KNeighborsRegressor

%matplotlib inline

In [2]:
path = '/Users/mac/Desktop/Netology/Pychon/10. Продвинутый pandas/Python_2_join/ml-latest-small'

In [3]:
links = pd.read_csv(path + '/links.csv')
movies = pd.read_csv(path + '/movies.csv')
ratings = pd.read_csv(path + '/ratings.csv')
tags = pd.read_csv(path + '/tags.csv')

In [4]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
def change_string(row):
    return ' '.join(row.replace(' ', '').replace('-','').split('|'))

In [9]:
movie_genres = movies['genres'].apply(change_string)
movie_genres = pd.DataFrame(movie_genres)
movie_genres.head()

Unnamed: 0,genres
0,Adventure Animation Children Comedy Fantasy
1,Adventure Children Fantasy
2,Comedy Romance
3,Comedy Drama Romance
4,Comedy


In [8]:
# найдем дублирующиеся строки с фильмами
dup = movies[movies.title.duplicated()==True]['title'].to_list()
movies[movies.title.isin(dup)].sort_values(by='title')

Unnamed: 0,movieId,title,genres
2872,3598,Hamlet (2000),Crime|Drama|Romance|Thriller
7151,65665,Hamlet (2000),Drama
6172,34048,War of the Worlds (2005),Action|Adventure|Sci-Fi|Thriller
7127,64997,War of the Worlds (2005),Action|Sci-Fi


In [10]:
# удалим менее информативные строки с жанрами для дублирующихся фильмов
movies.drop([7151, 7127], inplace=True)

In [30]:
# добавим тэги
movies_tags = movies.join(tags.set_index('movieId'), on='movieId')
movies_tags['genres'] = movies_tags['genres'].apply(change_string)
movies_tags = movies_tags.drop(['userId', 'timestamp'],1)
movies_tags.head()

Unnamed: 0,movieId,title,genres,tag
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,Pixar
1,2,Jumanji (1995),Adventure Children Fantasy,
2,3,Grumpier Old Men (1995),Comedy Romance,
3,4,Waiting to Exhale (1995),Comedy Drama Romance,
4,5,Father of the Bride Part II (1995),Comedy,steve martin


In [35]:
ratings = ratings.drop(['userId','timestamp'],axis=1)

In [36]:
# Получим средний и медианный рейтинг фильма
mean_ratings = ratings.groupby('movieId').mean()
mean_ratings.rename(columns = {'rating':'mean_ratings'}, inplace = True)

median_ratings = ratings.groupby('movieId').median()
median_ratings.rename(columns = {'rating':'med_ratings'}, inplace = True)

variance_ratings = ratings.groupby('movieId').var()
variance_ratings.rename(columns = {'rating':'var_ratings'}, inplace = True)

# добавим колонку со средним и медианным рейтингом фильма
movies_tags_ratings = movies_tags.join(mean_ratings, on='movieId')
movies_tags_ratings = movies_tags_ratings.join(median_ratings, on='movieId')
movies_tags_ratings = movies_tags_ratings.join(variance_ratings, on='movieId')

In [37]:
movies_tags_ratings.head()

Unnamed: 0,movieId,title,genres,tag,mean_ratings,med_ratings,var_ratings
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,Pixar,3.87247,4.0,0.919646
1,2,Jumanji (1995),Adventure Children Fantasy,,3.401869,3.0,0.775657
2,3,Grumpier Old Men (1995),Comedy Romance,,3.161017,3.0,1.322764
3,4,Waiting to Exhale (1995),Comedy Drama Romance,,2.384615,3.0,0.88141
4,5,Father of the Bride Part II (1995),Comedy,steve martin,3.267857,3.0,0.899675


### Построим рекомендацию по жанрам

In [65]:
# получим список жанров
genres = movies_tags_ratings['genres'].to_list()

In [66]:
CV = CountVectorizer()
genres_CV = CV.fit_transform(genres)

Tf = TfidfTransformer()
genres_Tf = Tf.fit_transform(X_train_counts)

model = KNeighborsRegressor(n_neighbors=7, n_jobs=-1, metric='euclidean')
model.fit(genres_CV, genres_Tf)

KNeighborsRegressor(metric='euclidean', n_jobs=-1, n_neighbors=7)

In [68]:
# проверим на тесте

test = change_string('Adventure|Comedy|Fantasy|Crime')

# test = G_test.apply(change_string)

predict_genres = CV.transform([test])
predict_genres = Tf.transform(predict_genres)

predicted_movies = model.kneighbors(predict_genres, return_distance=True)
predicted_movies

(array([[0.88767951, 0.88767951, 0.88767951, 0.88767951, 0.88767951,
         0.88767951, 0.88767951]]),
 array([[4095, 7806, 1846, 9332, 6050, 4563, 4633]]))

In [60]:
# выведем список фильмов в порядке убавания рейтинга
movies_tags_ratings.iloc[predicted_movies[1][0]].sort_values(by=['mean_ratings'], ascending=False)

Unnamed: 0,movieId,title,genres,tag,mean_ratings,med_ratings,var_ratings
4155,5473,Fox and His Friends (Faustrecht der Freiheit) ...,Drama,,5.0,5.0,
4165,5498,Red Beard (Akahige) (1965),Drama,,4.75,4.75,0.125
4136,5440,She Wore a Yellow Ribbon (1949),Western,,4.5,4.5,
4162,5483,"Kid Stays in the Picture, The (2002)",Documentary,,3.6,4.0,0.425
4163,5489,Nosferatu the Vampyre (Nosferatu: Phantom der ...,Horror,,3.416667,3.25,0.741667
4141,5447,Sunshine State (2002),Drama,,3.166667,3.0,0.583333
4161,5481,Austin Powers in Goldmember (2002),Comedy,mike myers,2.911111,3.0,1.321465


### Построим рекомендацию по тегам

In [70]:
# проверим кол-во уникальных тегов
movies_tags_ratings.tag.unique().shape[0]

583

In [71]:
# удалим фильмы с отсутствующими тегами
movies_tags_ratings = movies_tags_ratings.dropna()

In [72]:
# получим список тегов
tag = movies_tags_ratings.tag.to_list()

# обучим модель
tag_CV = CV.fit_transform(tag)

tag_Tf = Tf.fit_transform(tag_CV)

model = KNeighborsRegressor(n_neighbors=10, n_jobs=-1, metric='manhattan')
model.fit(tag_CV, tag_Tf)

KNeighborsRegressor(metric='manhattan', n_jobs=-1, n_neighbors=10)

In [74]:
# возьмем теги 'scifi timetravel fun'
test_tags = 'scifi timetravel fun'

predict_tags = CV.transform([test_tags])
predict_tags = Tf.transform(predict_tags)

predicted_teg = model.kneighbors(predict_tags, return_distance=True)
predicted_teg

(array([[2., 2., 2., 2., 2., 2., 2., 2., 2., 2.]]),
 array([[709, 707, 716, 715, 714, 710, 718, 702, 704, 719]]))

In [75]:
# выведем список фильмов в порядке убавания рейтинга
movies_tags_ratings.iloc[predicted_teg[1][0]].sort_values(by=['mean_ratings'], ascending=False)

Unnamed: 0,movieId,title,genres,tag,mean_ratings,med_ratings,var_ratings
6539,48516,"Departed, The (2006)",Crime Drama Thriller,toplist06,4.202381,4.5,0.488669
6556,48780,"Prestige, The (2006)",Drama Mystery SciFi Thriller,nonlinear,4.125,4.5,0.76348
6556,48780,"Prestige, The (2006)",Drama Mystery SciFi Thriller,complicated,4.125,4.5,0.76348
6556,48780,"Prestige, The (2006)",Drama Mystery SciFi Thriller,psychological,4.125,4.5,0.76348
6601,50068,Letters from Iwo Jima (2006),Drama War,holes00s,4.045455,4.0,0.422727
6552,48738,"Last King of Scotland, The (2006)",Drama Thriller,toplist06,4.033333,4.0,0.195238
6652,51540,Zodiac (2007),Crime Drama Thriller,toplist07,3.925,4.0,0.480921
6669,52281,Grindhouse (2007),Action Crime Horror SciFi Thriller,hdtv,3.666667,3.5,0.441176
6669,52281,Grindhouse (2007),Action Crime Horror SciFi Thriller,holes00s,3.666667,3.5,0.441176
6591,49824,Dreamgirls (2006),Drama Musical,toplist06,3.25,3.25,1.416667


### Оценить RMSE на тестовой выборке

In [76]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [79]:
# разделим на train, test
X_train, X_test, y_train, y_test = train_test_split(movies_tags_ratings.drop(['mean_ratings'], axis=1), 
                                                    movies_tags_ratings.mean_ratings, test_size=0.3, 
                                                    random_state=42)
train_df = pd.concat([X_train, y_train], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)

In [80]:
# обучим train

train_genres = train_df.genres.to_list()

X_train_CV = CV.fit_transform(train_genres)

X_train_Tf = Tf.fit_transform(X_train_CV)

model.fit(X_train_CV, train_df.mean_ratings)

KNeighborsRegressor(metric='manhattan', n_jobs=-1, n_neighbors=10)

In [81]:
# предскажем test
test_genres = test_df.genres.to_list()

X_test_CV = CV.transform(test_genres)
X_test_Tf = Tf.transform(X_test_CV)

predicted = model.predict(X_test_CV)

In [82]:
# получим RMSE на тестовой выборке
mean_squared_error(test_df.mean_ratings, predicted)

0.1635053421837346