In [2]:
import sys
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cosine

In [69]:
ratings = pd.read_json('ratings.jsonl', lines = True)
ratings.head()

Unnamed: 0,UserId,ItemId,Timestamp,Rating
0,c4ca4238a0,91766eac45,2013-10-05 22:00:50,8
1,c81e728d9d,5c739554f7,2013-08-17 16:26:38,9
2,c81e728d9d,48f6d7ce7c,2013-08-17 13:28:27,8
3,c81e728d9d,e9318d627a,2013-06-15 15:38:09,1
4,a87ff679a2,17e6357973,2014-01-31 23:27:59,8


In [70]:
content = pd.read_json('content.jsonl', lines = True)
content.head()

Unnamed: 0,ItemId,Title,Year,Rated,Released,Runtime,Genre,Director,Writer,Actors,...,Type,DVD,BoxOffice,Production,Website,Response,totalSeasons,Season,Episode,seriesID
0,c9f0f895fb,Edison Kinetoscopic Record of a Sneeze,1894,,09 Jan 1894,1 min,"Documentary, Short",William K.L. Dickson,,Fred Ott,...,movie,,,,,True,,,,
1,d3d9446802,Leaving the Factory,1895,Not Rated,22 Mar 1895,1 min,"Documentary, Short",Louis Lumière,,,...,movie,,,,,True,,,,
2,c20ad4d76f,The Arrival of a Train,1896,Not Rated,25 Jan 1896,1 min,"Documentary, Short","Auguste Lumière, Louis Lumière",,"Madeleine Koehler, Marcel Koehler, Mrs. August...",...,movie,,,,,True,,,,
3,8e296a067a,The Oxford and Cambridge University Boat Race,1895,,,,"Short, News, Sport",Birt Acres,,,...,movie,,,,,True,,,,
4,54229abfcf,The House of the Devil,1896,Not Rated,24 Dec 1896,3 min,"Short, Horror",Georges Méliès,Georges Méliès,"Jehanne d'Alcy, Jules-Eugène Legris, Georges M...",...,movie,,,,,True,,,,


In [71]:
targets = pd.read_csv('targets.csv')
targets.head()

Unnamed: 0,UserId,ItemId
0,0006246bee,01d2404d4c
1,0006246bee,03d43fdf92
2,0006246bee,0808a9666b
3,0006246bee,0a5d7dd6f6
4,0006246bee,0bab4a8104


# Data Treatment

In [72]:
content = content.drop(columns = ['Plot','Year','Released','Runtime','Director','Writer','Actors','Language','Country','Awards','Poster','Ratings','Type','DVD','BoxOffice','Production','Website','Response','Rated','totalSeasons','Season','Episode','seriesID'])
print(content.head())

       ItemId                                          Title  \
0  c9f0f895fb         Edison Kinetoscopic Record of a Sneeze   
1  d3d9446802                            Leaving the Factory   
2  c20ad4d76f                         The Arrival of a Train   
3  8e296a067a  The Oxford and Cambridge University Boat Race   
4  54229abfcf                         The House of the Devil   

                Genre Metascore imdbRating imdbVotes  
0  Documentary, Short       N/A        5.5     1,980  
1  Documentary, Short       N/A        6.9     6,633  
2  Documentary, Short       N/A        7.5    11,407  
3  Short, News, Sport       N/A        4.2        39  
4       Short, Horror       N/A        6.7     3,268  


In [73]:
content['Genre'] = content['Genre'].str.replace('Sci-Fi','SciFi')
content['Genre'] = content['Genre'].str.replace('Film-Noir','Noir')
# create an object for TfidfVectorizer
tfidf_vector = TfidfVectorizer(stop_words='english')
# apply the object to the genres column
term_item_matrix = tfidf_vector.fit_transform(content['Genre'])
print(list(enumerate(tfidf_vector.get_feature_names())))
term_item_matrix = term_item_matrix.T.toarray()
print(term_item_matrix)

[(0, 'action'), (1, 'adult'), (2, 'adventure'), (3, 'animation'), (4, 'biography'), (5, 'comedy'), (6, 'crime'), (7, 'documentary'), (8, 'drama'), (9, 'family'), (10, 'fantasy'), (11, 'game'), (12, 'history'), (13, 'horror'), (14, 'music'), (15, 'musical'), (16, 'mystery'), (17, 'news'), (18, 'noir'), (19, 'reality'), (20, 'romance'), (21, 'scifi'), (22, 'short'), (23, 'sport'), (24, 'talk'), (25, 'thriller'), (26, 'tv'), (27, 'war'), (28, 'western')]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [74]:
content['imdbVotes'] = content['imdbVotes'].str.replace(",","")
content['imdbVotes'] = content['imdbVotes'].str.replace('N/A','0')
content['imdbVotes'] = content['imdbVotes'].astype(float)
content['imdbVotes'] = (content['imdbVotes']-content['imdbVotes'].min())/(content['imdbVotes'].max()-content['imdbVotes'].min())

In [75]:
term_item_matrix = np.vstack((term_item_matrix,20*content['imdbVotes'].T.astype(float)))
print(term_item_matrix)

[[0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 ...
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [1.58166420e-02 5.29857506e-02 9.11214319e-02 ... 1.42110132e-02
  3.99410151e-05 0.00000000e+00]]


In [76]:
user_dict = ratings.groupby('UserId')[['ItemId',"Rating"]].apply(lambda g: g.values.tolist()).to_dict()

mapping_items = pd.Series(content.index,index = content['ItemId'])

In [77]:
def get_user_atributes_df(term_item_matrix, user_dict):

   user_atributes = {}
   for user in user_dict.items():
      item_rating = np.array(user[1]).T
      item_atributes = term_item_matrix[:,mapping_items[item_rating[0]]]
      user_ratings = item_rating[1]
      matrix_final = item_atributes.astype(float)*user_ratings.astype(float)
      user_column = np.mean(matrix_final,axis=1)
      user_atributes[user[0]] = user_column
   
   user_atributes_df = pd.DataFrame(user_atributes)
   return user_atributes_df

In [78]:
def get_scores(targets, term_item_matrix, user_dict):

   user_atributes_df = get_user_atributes_df(term_item_matrix, user_dict) 
   scores = [] 
   for target in targets.itertuples(index=False):
      user_target = target[0]
      item_target = target[1]
      scores.append(cosine(user_atributes_df[user_target].values.astype(float), term_item_matrix[:,mapping_items[item_target]].astype(float)) * 10)

   return scores

In [79]:
targets["Prediction"] = get_scores(targets, term_item_matrix, user_dict)

  dist = 1.0 - uv / np.sqrt(uu * vv)


In [80]:
targets = targets.merge(content, on='ItemId')
targets = targets[['UserId','Title', 'Prediction']]
print(targets.head(5))

       UserId   Title  Prediction
0  0006246bee  Romeos    9.352431
1  34470a05eb  Romeos    8.323801
2  54bff62713  Romeos    9.096139
3  5f9491903a  Romeos    9.230947
4  6ad4ba33df  Romeos    9.738317


In [82]:
targets = targets[['UserId','Title', 'Prediction']]
print(targets.sample(50))

            UserId                                  Title  Prediction
283055  7e90c4b14a                                   Dads    9.836210
347990  7fcc48d228                 Gambling, Gods and LSD    9.954868
585064  a37a3f28f2                                   Seek    9.627263
276815  c58af5aba6              7 pistole per i MacGregor    9.985986
462465  f51f6d9168                              Human Zoo    9.432401
14281   50f677a316                            Deep Throat    9.999669
210567  bb14f8fa48                              Novitiate    6.548840
320075  9454893cb1                             Warsaw '44    8.952283
377515  92d79ee90a    UFC Fight Night: Poirier vs. Pettis    9.996989
27040   409cc06019                      Avengers: Endgame    0.105988
40191   d3d9022eea                                 Asylum    9.229349
491849  c252b9390b                              Blackfish    5.311056
293093  8d9a6e908e                              Stray Dog    8.067783
306395  3c3c9c9207  