# Загрузка данных

In [3]:
import numpy as np 
import pandas as pd 

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

from scipy.stats import pearsonr
from sklearn.model_selection import train_test_split

import tqdm
from tqdm import tqdm

**tag** [userId, movieId, tag, timestamp]

**rating** [userId, movieId, rating, timestamp] 

**movies** [movieId, title, ganres]

Tag genome - это структура данных, которая описывает релевантность тэгов по отношению к фильму. 

**genome_scores** [movieId, tagId, relevance] - сгенерированная релевантность соответствия тега фильму

**genome_tag** [tagId, tag] 

**link** [movieId, imdbId, tmdbId] 

movieId - идентификатор фильма на https://movielens.org. Toy Story - https://movielens.org/movies/1.
imdbId - идентификатор фильма на http://www.imdb.com. Toy Story - http://www.imdb.com/title/tt0114709/.
tmdbId is - идентификатор фильма на https://www.themoviedb.org. Toy Story - https://www.themoviedb.org/movie/862.

In [4]:
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [5]:
downloaded = drive.CreateFile({'id':"1AFJdxwFCSOyCVFmtmlaUzY9OunwbAVVZ"}) 
downloaded.GetContentFile('tags.csv')
tag=pd.read_csv('tags.csv')

In [6]:
downloaded = drive.CreateFile({'id':"1PZZUhKWPmRCzgwJP9yX5g7ZycV273oVk"}) 
downloaded.GetContentFile('rating.csv')
rating=pd.read_csv('rating.csv')

In [7]:
downloaded = drive.CreateFile({'id':"1Wgw78UNg1uPV238DH0Oj3QYLfgiU7qpg"}) 
downloaded.GetContentFile('movies.csv')
movies=pd.read_csv('movies.csv')

In [8]:
downloaded = drive.CreateFile({'id':"1k4ZwrBeCO80LHThymYzHfD_H-gGCaMr8"}) 
downloaded.GetContentFile('genome-scores.csv')
genome_scores=pd.read_csv('genome-scores.csv')

In [9]:
downloaded = drive.CreateFile({'id':"12lE3TfDja2VPrqbHpXc92m2r0Sdf0Sog"}) 
downloaded.GetContentFile('genome-tags.csv')
genome_tag=pd.read_csv('genome-tags.csv')

In [10]:
downloaded = drive.CreateFile({'id':"1kx_nH4Sv9h1h9fXoLN6531YZWsDxXqO-"}) 
downloaded.GetContentFile('links.csv')
link=pd.read_csv('links.csv')
del(downloaded)

In [11]:
print("movies shape \t\t", movies.shape)
print("tag shape \t\t", tag.shape)
print("rating shape \t\t", rating.shape)
print("genome_scores shape \t", genome_scores.shape)
print("genome_tag shape \t", genome_tag.shape)
print("link shape \t\t", link.shape)

movies shape 		 (62423, 3)
tag shape 		 (1093360, 4)
rating shape 		 (25000095, 4)
genome_scores shape 	 (15584448, 3)
genome_tag shape 	 (1128, 2)
link shape 		 (62423, 3)


**movies** [movieId, title (year), genres] - поименованный список фильмов, где жанры перечислены с разделителем "|" 

**tag** [userId, movieId, tag, timestamp] - список тегов, присвоенных в качестве жанров в таблице movies (возможно частично). Теги - это сгенерированные пользователями метаданные о фильмах. Каждый тэг обычно представляет собой слово или коротку фразу. Значение, ценность и цель каждого тэга определяется каждым пользователем.

**rating** [userId, movieId, rating, timestamp] - оценки пользователей фильмам -- целевая переменная 

**genome_scores** [movieId, tagId, relevance] - релевантность тэгов по отношению к фильму. Заданы 1128 тэгов, и по каждому фильму указаны значения релевантности для каждого тэга.
Описание [в статье][genome-paper]. Tag genome был рассчитан с помощью алгоритма на основе пользовательского контента, включая тэги, ретинги и текстовые описания.

**genome_tag** [tagId, tag] - расшифровка идентификаторов тегов из genome_scores

pd.merge(movies,rating) by users

# Табличные преобразования

In [12]:
def count_by_sep(s):
  if s.count("|")>0 or s.count("(")==0:
    return s.count("|")+1
  else:
      return 0

In [13]:
movies["year"] = movies.title.str[-5:-1]
movies.title = movies.title.str[:-7]
# movies["genres_str_len"] = movies.genres.str.len()
movies["genres_count"] = movies.genres.map(lambda x: count_by_sep(x))

In [14]:
#movies[movies.genres_count == movies.sort_values(by="genres_str_len", ascending=True).genres_count.max()]

In [15]:
# s = movies.genres[movies.genres_count == movies.sort_values(by="genres_str_len", ascending=True).genres_count.max()].values[0]
# len(s), s.count("|"), movies.genres_count.max()

In [16]:
df = pd.merge(movies, link)
genome = pd.merge(genome_scores, genome_tag)

In [17]:
# genome[genome.movieId==1].sort_values(by="relevance", ascending=False)

In [18]:
s = genome.groupby(by="movieId").count()
s[s.tag!=1128]

Unnamed: 0_level_0,tagId,relevance,tag
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


In [19]:
genome[genome.movieId==1].tag.tolist() == genome[genome.movieId==2].tag.tolist()

True

In [20]:
genome_p = genome.pivot(index="movieId", columns="tag", values="relevance")

In [21]:
genome_p.columns = genome[genome.movieId==1].tag.tolist()
genome_p.shape

(13816, 1128)

In [22]:
genome_p.reset_index(inplace=True)

Жанры фильма записываются в одном поле через разделитель |. Список допустимых жанров:

    Action, Adventure, Animation, Children's, Comedy, Crime,  Documentary, Drama, Fantasy, Film-Noir, Horror, Musical, Mystery, Romance, Sci-Fi, Thriller, War, Western, (no genres listed)

In [23]:
s = movies.genres.str.cat(sep="")
s

'Adventure|Animation|Children|Comedy|FantasyAdventure|Children|FantasyComedy|RomanceComedy|Drama|RomanceComedyAction|Crime|ThrillerComedy|RomanceAdventure|ChildrenActionAction|Adventure|ThrillerComedy|Drama|RomanceComedy|HorrorAdventure|Animation|ChildrenDramaAction|Adventure|RomanceCrime|DramaDrama|RomanceComedyComedyAction|Comedy|Crime|Drama|ThrillerComedy|Crime|ThrillerCrime|Drama|Horror|Mystery|ThrillerAction|Crime|ThrillerDrama|Sci-FiDrama|RomanceDramaChildren|DramaDrama|RomanceAdventure|Drama|Fantasy|Mystery|Sci-FiCrime|DramaDramaMystery|Sci-Fi|ThrillerAdventure|Romance|IMAXChildren|DramaDrama|RomanceCrime|DramaDocumentary|IMAXChildren|ComedyComedy|RomanceDramaDrama|WarAction|Crime|DramaDramaAction|Adventure|FantasyComedy|Drama|ThrillerDrama|RomanceMystery|ThrillerAnimation|Children|Drama|Musical|RomanceDrama|RomanceCrime|Mystery|ThrillerAction|Drama|ThrillerComedy|Drama|RomanceAdventure|DramaChildren|ComedyDramaAdventure|Children|Comedy|FantasyDramaComedy|Drama|RomanceDrama|Myst

In [24]:
genres_tags_l = ["Action", "Adventure", "Animation", "Children's", "Children", "Comedy", "Crime", "Documentary", "Drama", "IMAX", "IMAXChildren", "Fantasy", "Film-Noir", "Horror", "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western", "(no genres listed)"]
genres_tags = pd.DataFrame(genres_tags_l, columns=["tag"])
# movies[movies.genres.str.find("IMAX")!=-1] # вспомогательная проверка
genres_tags

Unnamed: 0,tag
0,Action
1,Adventure
2,Animation
3,Children's
4,Children
5,Comedy
6,Crime
7,Documentary
8,Drama
9,IMAX


In [25]:
res = s.replace("|", "")
for i in genres_tags_l:
  res = res.replace(i, "")
print(res)
print("пустая строка означает, что список l - полный список жанров")


пустая строка означает, что список l - полный список жанров


In [26]:
df[:2]

Unnamed: 0,movieId,title,genres,year,genres_count,imdbId,tmdbId
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,5,114709,862.0
1,2,Jumanji,Adventure|Children|Fantasy,1995,3,113497,8844.0


In [27]:
res = {}
res["movieId"] = movies.movieId.values
for key in genres_tags_l:
  res[key] = [1 if movies.genres[i].find(key)!=-1 else 0 for i in range(df.shape[0])]

In [28]:
df = pd.merge(df, pd.DataFrame(res), how="inner", on="movieId")
del(res)
df[-2:]

Unnamed: 0,movieId,title,genres,year,genres_count,imdbId,tmdbId,Action,Adventure,Animation,Children's,Children,Comedy,Crime,Documentary,Drama,IMAX,IMAXChildren,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,(no genres listed)
62421,209169,A Girl Thing,(no genres listed),2001,0,249603,162892.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
62422,209171,Women of Devil's Island,Action|Adventure|Drama,1962,3,55323,79513.0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [29]:
print(df.shape)
pd.concat([df[df.movieId==205425], df[df.movieId==206499], df[-2:]])

(62423, 29)


Unnamed: 0,movieId,title,genres,year,genres_count,imdbId,tmdbId,Action,Adventure,Animation,Children's,Children,Comedy,Crime,Documentary,Drama,IMAX,IMAXChildren,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,(no genres listed)
61271,205425,Dave Chappelle: Sticks & Stones,Comedy,2019,1,10810424,624932.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
61660,206499,Between Two Ferns: The Movie,Comedy,2019,1,9398640,584962.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
62421,209169,A Girl Thing,(no genres listed),2001,0,249603,162892.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
62422,209171,Women of Devil's Island,Action|Adventure|Drama,1962,3,55323,79513.0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [30]:
df2 = pd.merge(df, genome_p, on="movieId")
print(df2.shape)
df2[-2:]

(13816, 1157)


Unnamed: 0,movieId,title,genres,year,genres_count,imdbId,tmdbId,Action,Adventure,Animation,Children's,Children,Comedy,Crime,Documentary,Drama,IMAX,IMAXChildren,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,(no genres listed),007,007 (series),18th century,1920s,1930s,1950s,1960s,1970s,1980s,19th century,3d,...,visceral,visual,visually appealing,visually stunning,visuals,voodoo,voyeurism,war,war movie,wartime,waste of time,watch the credits,weapons,wedding,weed,weird,werewolf,werewolves,western,whimsical,wilderness,wine,wistful,witch,witches,witty,wizards,women,working class,workplace,world politics,world war i,world war ii,writer's life,writers,writing,wuxia,wwii,zombie,zombies
13814,205425,Dave Chappelle: Sticks & Stones,Comedy,2019,1,10810424,624932.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.04525,0.04125,0.0425,0.07425,0.1155,0.105,0.08275,0.13575,0.16125,0.05875,0.03875,...,0.3085,0.3065,0.55125,0.3545,0.09975,0.03825,0.065,0.07625,0.04875,0.17675,0.035,0.17475,0.47425,0.056,0.50825,0.739,0.06075,0.01625,0.061,0.18075,0.0665,0.03525,0.1005,0.04525,0.086,0.5135,0.08175,0.10775,0.06,0.0585,0.25075,0.0455,0.01425,0.03925,0.217,0.06,0.0725,0.015,0.1105,0.0285
13815,206499,Between Two Ferns: The Movie,Comedy,2019,1,9398640,584962.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.1005,0.09325,0.02225,0.0455,0.21125,0.06125,0.10425,0.2975,0.1245,0.067,0.0525,...,0.176,0.15375,0.384,0.19675,0.12075,0.023,0.0815,0.05025,0.06175,0.17525,0.02175,0.14525,0.43075,0.04175,0.35375,0.3695,0.0405,0.01125,0.06,0.15675,0.0965,0.181,0.0275,0.022,0.0255,0.23025,0.04375,0.3285,0.03375,0.17775,0.1275,0.0275,0.0225,0.042,0.175,0.11,0.0485,0.01325,0.14025,0.0335


In [31]:
'''
df_2 = pd.merge(df, genome_p, how="left", on="movieId")
print(df_2.shape)
df_2[-2:]
'''

'\ndf_2 = pd.merge(df, genome_p, how="left", on="movieId")\nprint(df_2.shape)\ndf_2[-2:]\n'

In [32]:
# df.drop("genres", axis=1, inplace=True)
df2.drop("genres", axis=1, inplace=True)
# df_2.drop("genres", axis=1, inplace=True)

In [33]:
del(df)

Проверим сопостовимость tag и genome

In [34]:
genome_uniq = pd.DataFrame(genome.tag.values).drop_duplicates()
genome_uniq.columns = ["tag"]

In [35]:
# pd.merge(tag, genome_uniq, how="inner", left_on="tag", right_on="tag").sort_values(by="tag")

все теги всех датасетов перечислены в tag.

Выделены датасеты:

**ganres_tag** - список тегов из жанров movies

**genome_uniq** - спосок уникальных тегов, использованных для оценки

**genome_n_genres** - объединение тегов из genome и movies.ganres 


In [36]:
len(tag[tag.tag.isin(genome_uniq)==False].tag.unique()), len(genome_uniq), len(genres_tags)

(73050, 1128, 22)

In [37]:
genome_n_genres_tags = pd.concat([genome_uniq, genres_tags])
genome_n_genres_tags.shape

(1150, 1)

# Рекомендательная система

Нужно добавить оценки K пользователей.

Зная, что в задании 2.1 один пользователь показал высокую степень соседства с другими и высоку точность, выберу его снова, чтобы сосредоточиться на более трудоемком и сильно привязанном к конкретной предметной области методе, основанном на описании объектов, которые требуется рекомендовать. 

In [68]:
k=8
movies_leave_percent = 0.3
test_data_percent = 0.3
user = 160670
neares_users = [6867, 26248, 136106, 67587, 111705, 42518, 151986, 5078]
all_users = neares_users + [user]
# all_users = rating['userId'].value_counts()

In [45]:
dfu = pd.merge(df2, rating[rating.userId.isin(all_users)])

In [46]:
train, test = train_test_split(dfu, test_size=test_data_percent)

In [74]:
train

Unnamed: 0,movieId,title,year,genres_count,imdbId,tmdbId,Action,Adventure,Animation,Children's,Children,Comedy,Crime,Documentary,Drama,IMAX,IMAXChildren,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,(no genres listed),007,007 (series),18th century,1920s,1930s,1950s,1960s,1970s,1980s,19th century,3d,70mm,...,visually stunning,visuals,voodoo,voyeurism,war,war movie,wartime,waste of time,watch the credits,weapons,wedding,weed,weird,werewolf,werewolves,western,whimsical,wilderness,wine,wistful,witch,witches,witty,wizards,women,working class,workplace,world politics,world war i,world war ii,writer's life,writers,writing,wuxia,wwii,zombie,zombies,userId,rating,timestamp
943,8874,Shaun of the Dead,2004,2,365748,747.0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0.04550,0.03825,0.05900,0.05550,0.09575,0.07600,0.07425,0.23450,0.17400,0.05000,0.01825,0.03275,...,0.17600,0.08850,0.20275,0.03125,0.09925,0.06425,0.18100,0.00950,0.25800,0.57425,0.03050,0.27975,0.70525,0.18525,0.15850,0.02075,0.37725,0.24500,0.10350,0.08500,0.06900,0.16575,0.73200,0.03700,0.11925,0.25675,0.37925,0.07500,0.03225,0.02050,0.06850,0.26775,0.09225,0.05025,0.01725,0.94775,0.98100,42518,4.5,1421493709
373,1966,Metropolitan,1990,1,100142,15389.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.02625,0.03050,0.04650,0.10625,0.12950,0.09775,0.05325,0.44225,0.15850,0.14850,0.02225,0.01600,...,0.10400,0.07200,0.00975,0.06125,0.02250,0.03300,0.09425,0.01050,0.03700,0.19800,0.02400,0.03725,0.58200,0.04075,0.00925,0.01175,0.34075,0.07950,0.34450,0.23100,0.02200,0.05650,0.84125,0.02675,0.23625,0.12250,0.26925,0.09575,0.03675,0.01475,0.06700,0.50625,0.38125,0.02150,0.01150,0.09475,0.02200,26248,5.0,1240369477
932,8677,Flash Gordon Conquers the Universe,1940,2,32475,68064.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0.03800,0.04300,0.14775,0.09700,0.25975,0.12350,0.04725,0.09150,0.06425,0.21950,0.09375,0.16850,...,0.12150,0.05900,0.00675,0.07100,0.30550,0.04500,0.35900,0.00375,0.01050,0.31175,0.01575,0.01925,0.28300,0.05575,0.00850,0.49650,0.13200,0.05300,0.03500,0.04475,0.04500,0.15800,0.08200,0.18525,0.24200,0.02725,0.05550,0.15050,0.27150,0.07200,0.02350,0.14900,0.04550,0.02275,0.06125,0.05100,0.01050,151986,3.0,1230683146
1123,51991,"Italian, The (Italianetz)",2005,1,450450,17388.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.08275,0.10125,0.60450,0.06925,0.17025,0.10300,0.14750,0.27225,0.01175,0.06550,0.01750,0.01275,...,0.11500,0.03250,0.01125,0.09000,0.38575,0.11050,0.37975,0.00400,0.01025,0.13750,0.00750,0.02500,0.34000,0.03800,0.00975,0.01675,0.26475,0.18200,0.06650,0.05875,0.01400,0.04700,0.15450,0.04075,0.08600,0.19100,0.19375,0.14800,0.05250,0.09925,0.17000,0.18425,0.05725,0.03350,0.16325,0.07350,0.01800,6867,4.5,1213017534
188,930,Notorious,1946,3,38787,303.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0.05825,0.05325,0.12725,0.24300,0.22400,0.22600,0.14925,0.19475,0.03900,0.10925,0.02275,0.04075,...,0.25875,0.17425,0.01100,0.18650,0.16200,0.09075,0.69925,0.00550,0.02050,0.28600,0.07625,0.02300,0.48100,0.03800,0.00450,0.01175,0.29650,0.10200,0.64925,0.09625,0.03925,0.15200,0.54200,0.04000,0.22150,0.03475,0.14075,0.14950,0.19400,0.51550,0.03500,0.17025,0.14075,0.03375,0.36625,0.07325,0.02025,26248,4.5,1240369625
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
793,6333,X2: X-Men United,2003,4,290334,36658.0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0.04300,0.03575,0.03475,0.05575,0.06600,0.03850,0.03950,0.14050,0.03900,0.03450,0.09700,0.06350,...,0.59025,0.29925,0.01425,0.04550,0.31125,0.04700,0.09200,0.03125,0.48250,0.73675,0.07850,0.03500,0.27075,0.15750,0.10300,0.01650,0.18625,0.04350,0.10250,0.03250,0.04250,0.06500,0.46100,0.12700,0.15525,0.05625,0.10375,0.12625,0.02375,0.02200,0.03325,0.28425,0.13025,0.04075,0.01950,0.17625,0.03625,111705,4.0,1450735459
656,4718,American Pie 2,2001,1,252866,2770.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.05500,0.05050,0.02575,0.03950,0.05175,0.02975,0.02850,0.10300,0.04250,0.02450,0.03300,0.02825,...,0.10000,0.05575,0.02725,0.15450,0.05775,0.03750,0.07500,0.04400,0.19125,0.21300,0.19175,0.23775,0.25200,0.06300,0.02625,0.02375,0.11200,0.06825,0.06200,0.02100,0.02050,0.02650,0.13200,0.03675,0.33550,0.02275,0.10000,0.06750,0.01700,0.01675,0.02200,0.24975,0.04725,0.02525,0.01525,0.11550,0.01875,42518,2.0,1421492740
29,110,Braveheart,1995,3,112573,197.0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0.04075,0.03425,0.42425,0.15625,0.20150,0.16625,0.11825,0.32350,0.22100,0.41900,0.02200,0.25275,...,0.69250,0.16250,0.01625,0.03250,0.89725,0.47175,0.23300,0.00900,0.10175,0.49225,0.03500,0.05150,0.25425,0.07775,0.02125,0.04050,0.13300,0.12575,0.02950,0.02750,0.02000,0.04600,0.26775,0.06400,0.12825,0.03000,0.03925,0.06525,0.13625,0.06400,0.04425,0.10250,0.10975,0.06825,0.15775,0.10525,0.02500,67587,3.0,1458856168
689,4993,"Lord of the Rings: The Fellowship of the Ring,...",2001,2,120737,120.0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0.03600,0.02800,0.14450,0.09975,0.11050,0.04925,0.03525,0.23950,0.14275,0.11525,0.03350,0.24550,...,0.89875,0.60325,0.01675,0.03800,0.72600,0.13650,0.09450,0.01725,0.05250,0.45150,0.02825,0.03925,0.42775,0.12075,0.07725,0.02375,0.34650,0.12875,0.04050,0.07100,0.16250,0.36350,0.40250,0.99375,0.10125,0.04150,0.06125,0.09525,0.01950,0.02000,0.05400,0.13150,0.13300,0.04475,0.01750,0.11700,0.03475,42518,5.0,1421489216


In [113]:
train_mtrx = train[["movieId", "userId", "rating"]].pivot(index="userId", columns="movieId", values="rating").fillna(0)
test_mtrx = test[["movieId", "userId", "rating"]].pivot(index="userId", columns="movieId", values="rating").fillna(0)

In [106]:
s= pd.DataFrame(train_mtrx.count(), columns=["c"])
movie = s[s.c==s.c.max()].index.values[0]
movie

4993

In [161]:
similar_movies = train_mtrx.corrwith(train_mtrx[movie])

In [184]:
corr_df = pd.DataFrame(similar_movies, columns=["corr_coef"])
corr_df = corr_df[corr_df.corr_coef>0.5].sort_values(by="corr_coef", ascending=False)
corr_df

Unnamed: 0_level_0,corr_coef
movieId,Unnamed: 1_level_1
4993,1.0
5816,0.534207
56174,0.531326
260,0.52071
47,0.504262


In [185]:
corr_movies = list(corr_df.index)
corr_movies

[4993, 5816, 56174, 260, 47]

In [186]:
movies_stat = train.groupby("movieId").agg({"rating": [np.size, np.mean]})
#popular_movies = list(movies_stat[movies_stat["rating"]["size"] >= 4].index)
popular_movies = movies_stat["rating"]["size"] >= 4
movies_stat[popular_movies].sort_values([("rating", "mean")], ascending=False)

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
movieId,Unnamed: 1_level_2,Unnamed: 2_level_2
7153,5.0,5.0
5952,6.0,5.0
7143,4.0,4.875
5816,4.0,4.875
4993,7.0,4.857143
356,4.0,4.75
4027,4.0,4.75
2959,6.0,4.666667
3578,4.0,4.625
56174,4.0,4.625


In [187]:
df = movies_stat[popular_movies].join(corr_df)
df.columns = ["rating_size", "rating_mean", "similarity"]
df.dropna(inplace=True)
df



Unnamed: 0_level_0,rating_size,rating_mean,similarity
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
47,4.0,3.625,0.504262
260,4.0,4.0,0.52071
4993,7.0,4.857143,1.0
5816,4.0,4.875,0.534207
56174,4.0,4.625,0.531326


In [188]:
user_rating = train.pivot_table(index="userId", columns="movieId", values="rating")
corr_mtrx = user_rating.corr(method="pearson")

In [189]:
user_rating

movieId,1,2,6,17,19,32,34,39,44,47,48,70,110,141,150,153,158,161,162,165,180,208,223,231,260,266,273,293,296,306,316,318,327,344,353,356,357,364,367,370,...,99114,101864,102716,103249,103810,103883,104243,104374,104841,106072,106487,106489,106782,106785,106916,106918,109487,109578,111659,111759,111781,112171,112852,115713,119141,120825,122882,122886,122904,122912,129354,129937,130634,134130,134853,162590,192803,193944,197201,197203
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
5078,,3.5,,,2.5,,,,,,,,,4.0,,,,,,,,,,,,,,,,,,,,4.0,,5.0,,,3.5,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
6867,2.0,,,,,,,,,,,,5.0,,3.5,3.0,,,,2.5,,4.0,,3.5,,,4.0,5.0,4.0,4.5,,4.5,,3.5,,5.0,4.5,,4.0,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
26248,,,4.5,,,3.5,,,,,,,,,4.0,,,4.0,4.0,,5.0,,5.0,,,,,,5.0,,,4.0,,,,,,,,,...,,,,,,,,,,,,,,,4.0,,,,,,,,,,,,,,,,,,,,,,,,,
42518,,,,,3.5,,,,,2.5,,,4.5,,,,,,,,,,4.5,,3.0,,,,,,,,,,4.0,,,,,,...,4.5,,,3.5,,,,4.0,,,5.0,4.5,3.5,,,4.5,,,4.0,,,,,,3.0,2.0,,,,,,,,,,,,,,
67587,3.0,5.0,,5.0,,,1.0,3.0,,4.0,2.0,2.0,3.0,,5.0,,2.0,,3.0,,,,,2.0,5.0,4.5,,,,,,,,,,5.0,,,,4.0,...,,,,,,,,,,4.0,,,,,,,5.0,,,,,,,,,,,,4.0,,,,,,,5.0,,,,
111705,,,,,,,,,,5.0,,,5.0,,,,,,,,,,,,4.0,,,,,,,5.0,,,,,,,,,...,,5.0,5.0,,5.0,3.5,5.0,,,3.0,,,,4.0,,,,5.0,,5.0,5.0,5.0,,,,,4.0,3.0,,,5.0,4.0,5.0,,4.5,,,,,
136106,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3.0,,,,,,,,,...,,,,,,,,,,,,,,,,,5.0,,,,,,5.0,,,,,,4.0,5.0,,,,5.0,,,,,1.0,1.0
151986,,,,,,4.5,,,3.0,3.0,2.0,,,,,2.0,,,,2.5,,,,,4.0,,,,5.0,,3.5,5.0,3.5,,,,,5.0,2.5,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
160670,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4.0,,,,,...,,,,,,,,,4.5,,,,,,,,4.0,,,,,,,4.0,,,,,,,,,,4.0,,,4.5,4.0,,


In [183]:
corr_mtrx[corr_mtrx.index.isin(corr_movies)][corr_movies]

movieId,5816,56174,260,47
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
47,1.0,0.654654,0.552345,1.0
260,1.0,1.0,1.0,0.552345
5816,1.0,1.0,1.0,1.0
56174,1.0,1.0,1.0,0.654654


In [192]:
my_rating = user_rating.loc[user].dropna()
my_rating

movieId
356       4.0
541       5.0
2028      4.0
2571      4.0
2712      4.0
2959      4.0
3578      4.0
4993      4.0
8665      4.0
30707     4.0
30749     4.0
48385     3.5
49530     4.0
51255     4.0
54286     4.0
58559     3.5
60069     4.0
69640     3.0
72998     3.5
79132     4.0
97938     4.0
104841    4.5
109487    4.0
115713    4.0
134130    4.0
192803    4.5
193944    4.0
Name: 160670, dtype: float64

In [199]:
sim_candidates = pd.Series()
for i in my_rating.index:
  sims= corr_mtrx[i].dropna()
  sims = sims.map(lambda x: x*my_rating[i])
  sim_candidates = sim_candidates.append(sims)
sim_candidates.sort_values(inplace=True, ascending=True)

  """Entry point for launching an IPython kernel.


In [232]:
recommendation_df = pd.DataFrame(sim_candidates).reset_index()
recommendation_df.columns = ["movieId", "pred"]
recommendation_df.sort_values(by="movieId",inplace=True)
recommendation_df.drop_duplicates(inplace=True)
recommendation_df.shape

(221, 2)

In [230]:
recommendation_df[recommendation_df.movieId.isin( train[train.userId==user][["movieId"]] ) == False]

Unnamed: 0,movieId,pred
345,1,4.000000
309,19,4.000000
379,32,4.000000
113,47,-3.500000
124,47,-0.755929
...,...,...
396,84152,4.000000
11,85414,-4.000000
226,92507,4.000000
234,109487,4.000000


In [241]:
recommendation_df = recommendation_df[recommendation_df.pred>0]

In [242]:
check_df = pd.merge(test, recommendation_df, on='movieId')
check_df

Unnamed: 0,movieId,title,year,genres_count,imdbId,tmdbId,Action,Adventure,Animation,Children's,Children,Comedy,Crime,Documentary,Drama,IMAX,IMAXChildren,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,(no genres listed),007,007 (series),18th century,1920s,1930s,1950s,1960s,1970s,1980s,19th century,3d,70mm,...,visuals,voodoo,voyeurism,war,war movie,wartime,waste of time,watch the credits,weapons,wedding,weed,weird,werewolf,werewolves,western,whimsical,wilderness,wine,wistful,witch,witches,witty,wizards,women,working class,workplace,world politics,world war i,world war ii,writer's life,writers,writing,wuxia,wwii,zombie,zombies,userId,rating,timestamp,pred
0,54286,"Bourne Ultimatum, The",2007,3,440963,2503.0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0.51175,0.16825,0.04175,0.06250,0.08500,0.06500,0.06650,0.15525,0.04975,0.05800,0.03250,0.02800,...,0.27350,0.01800,0.13275,0.10175,0.04525,0.07650,0.02400,0.04850,0.59600,0.04125,0.02225,0.24475,0.05800,0.01400,0.01425,0.12550,0.0740,0.06725,0.02650,0.01825,0.04425,0.31425,0.02975,0.13675,0.03775,0.05050,0.09125,0.02500,0.01225,0.02700,0.11850,0.08550,0.04925,0.01300,0.09675,0.02250,151986,5.0,1230687704,4.0
1,54286,"Bourne Ultimatum, The",2007,3,440963,2503.0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0.51175,0.16825,0.04175,0.06250,0.08500,0.06500,0.06650,0.15525,0.04975,0.05800,0.03250,0.02800,...,0.27350,0.01800,0.13275,0.10175,0.04525,0.07650,0.02400,0.04850,0.59600,0.04125,0.02225,0.24475,0.05800,0.01400,0.01425,0.12550,0.0740,0.06725,0.02650,0.01825,0.04425,0.31425,0.02975,0.13675,0.03775,0.05050,0.09125,0.02500,0.01225,0.02700,0.11850,0.08550,0.04925,0.01300,0.09675,0.02250,151986,5.0,1230687704,3.5
2,54286,"Bourne Ultimatum, The",2007,3,440963,2503.0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0.51175,0.16825,0.04175,0.06250,0.08500,0.06500,0.06650,0.15525,0.04975,0.05800,0.03250,0.02800,...,0.27350,0.01800,0.13275,0.10175,0.04525,0.07650,0.02400,0.04850,0.59600,0.04125,0.02225,0.24475,0.05800,0.01400,0.01425,0.12550,0.0740,0.06725,0.02650,0.01825,0.04425,0.31425,0.02975,0.13675,0.03775,0.05050,0.09125,0.02500,0.01225,0.02700,0.11850,0.08550,0.04925,0.01300,0.09675,0.02250,6867,5.0,1212948554,4.0
3,54286,"Bourne Ultimatum, The",2007,3,440963,2503.0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0.51175,0.16825,0.04175,0.06250,0.08500,0.06500,0.06650,0.15525,0.04975,0.05800,0.03250,0.02800,...,0.27350,0.01800,0.13275,0.10175,0.04525,0.07650,0.02400,0.04850,0.59600,0.04125,0.02225,0.24475,0.05800,0.01400,0.01425,0.12550,0.0740,0.06725,0.02650,0.01825,0.04425,0.31425,0.02975,0.13675,0.03775,0.05050,0.09125,0.02500,0.01225,0.02700,0.11850,0.08550,0.04925,0.01300,0.09675,0.02250,6867,5.0,1212948554,3.5
4,3081,Sleepy Hollow,1999,4,162661,2668.0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,0,0,0,0,0.05150,0.02475,0.84200,0.17950,0.13850,0.07725,0.04100,0.14525,0.06725,0.49725,0.03850,0.03575,...,0.76750,0.09000,0.02775,0.06350,0.05050,0.09050,0.01100,0.06425,0.20500,0.03025,0.04100,0.78025,0.18425,0.14450,0.02525,0.57625,0.1830,0.07000,0.04975,0.80325,0.60850,0.35425,0.10750,0.16825,0.01800,0.07500,0.05975,0.02050,0.01950,0.03525,0.22500,0.08550,0.06700,0.01400,0.19550,0.03600,67587,4.0,1458856800,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,79132,Inception,2010,7,1375666,27205.0,1,0,0,0,0,0,1,0,1,1,0,0,0,0,0,1,0,1,1,0,0,0,0.04850,0.02650,0.08950,0.11600,0.10875,0.14925,0.06525,0.20925,0.04950,0.14325,0.12700,0.12050,...,0.68850,0.01100,0.07400,0.07925,0.05050,0.06250,0.05325,0.15450,0.41200,0.02000,0.03850,0.85225,0.05100,0.00625,0.01200,0.38600,0.1010,0.06375,0.04725,0.00900,0.03675,0.39200,0.07900,0.10000,0.02250,0.08275,0.11025,0.01275,0.02225,0.05050,0.23450,0.33825,0.04575,0.02075,0.07200,0.02175,67587,5.0,1458856441,4.0
107,79132,Inception,2010,7,1375666,27205.0,1,0,0,0,0,0,1,0,1,1,0,0,0,0,0,1,0,1,1,0,0,0,0.04850,0.02650,0.08950,0.11600,0.10875,0.14925,0.06525,0.20925,0.04950,0.14325,0.12700,0.12050,...,0.68850,0.01100,0.07400,0.07925,0.05050,0.06250,0.05325,0.15450,0.41200,0.02000,0.03850,0.85225,0.05100,0.00625,0.01200,0.38600,0.1010,0.06375,0.04725,0.00900,0.03675,0.39200,0.07900,0.10000,0.02250,0.08275,0.11025,0.01275,0.02225,0.05050,0.23450,0.33825,0.04575,0.02075,0.07200,0.02175,67587,5.0,1458856441,3.5
108,7147,Big Fish,2003,3,319061,587.0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0.02600,0.02275,0.12475,0.14775,0.11075,0.11925,0.05575,0.22650,0.07875,0.10425,0.04150,0.02725,...,0.65400,0.02250,0.04850,0.06675,0.06175,0.23925,0.01575,0.07675,0.14300,0.04575,0.10950,0.83575,0.16825,0.11150,0.02425,0.92700,0.1670,0.08925,0.32825,0.46775,0.23025,0.33800,0.10925,0.16825,0.04850,0.09000,0.05600,0.01400,0.04000,0.13675,0.29150,0.15775,0.10100,0.01925,0.08925,0.02000,67587,5.0,1458856785,4.0
109,2174,Beetlejuice,1988,2,94721,4011.0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0.02575,0.02400,0.08925,0.07175,0.12225,0.05725,0.07200,0.17150,0.41775,0.06025,0.02900,0.14800,...,0.50600,0.07825,0.05100,0.02525,0.04275,0.09200,0.00725,0.04175,0.30125,0.09625,0.03025,0.95275,0.14275,0.06450,0.01400,0.57500,0.1090,0.22425,0.11000,0.28325,0.44975,0.55250,0.07150,0.14025,0.03750,0.24100,0.05800,0.01900,0.01425,0.04475,0.25400,0.05625,0.03075,0.01050,0.22950,0.07125,5078,4.0,1159971980,4.0


In [237]:
from sklearn.metrics import mean_squared_error
from math import sqrt

In [244]:
mse = sqrt(mean_squared_error(check_df.rating, check_df.pred))
print(mse, "- RMSE (Root Mean Squared Error)")

1.2719576113563786 - RMSE (Root Mean Squared Error)


Не самое точное предсказание.