### Configuration and data load

In [30]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [22]:
# Read the movie catalogue, the user ratings and the user tags from the CSV
# files in the sibling data directory (presumably a MovieLens export - confirm).
movies, ratings, tags = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/movies.csv',
        '../data/ratings.csv',
        '../data/tags.csv',
    )
)

In [23]:
# Preview the first rows of each table to sanity-check the columns.
display(movies.head(), tags.head(), ratings.head())

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


Unnamed: 0,userId,movieId,tag,timestamp
0,22,26479,Kevin Kline,1583038886
1,22,79592,misogyny,1581476297
2,22,247150,acrophobia,1622483469
3,34,2174,music,1249808064
4,34,2174,weird,1249808102


Unnamed: 0,userId,movieId,rating,timestamp
0,1,17,4.0,944249077
1,1,25,1.0,944250228
2,1,29,2.0,943230976
3,1,30,5.0,944249077
4,1,32,5.0,943228858


### Data preparation

#### Aggregate tags into one string per movie

In [24]:
# Collapse all tags of a movie into one space-separated string.
# Rows with a missing tag are dropped first: casting NaN to str would inject
# the literal token "nan" into the text that is later fed to TF-IDF.
# astype(str) is kept so that purely numeric tags can still be joined.
movies_tags = (
    tags.dropna(subset=['tag'])
    .groupby('movieId')['tag']
    .apply(lambda movie_tags: ' '.join(movie_tags.astype(str)))
    .reset_index()
)
print(movies_tags.head())

   movieId                                                tag
0        1  children Disney animation children Disney Disn...
1        2  Robin Williams fantasy Robin Williams time tra...
2        3  comedinha de velhinhos engraÃƒÂ§ada comedinha ...
3        4  characters slurs based on novel or book chick ...
4        5  Fantasy pregnancy remake family Steve Martin s...


In [25]:
# Attach the aggregated tag string to every movie; movies without tags get NaN.
# The recommender indexes the TF-IDF matrix with row positions taken from
# `movies`, so this join must keep exactly one row per movie.
# validate='one_to_one' raises if a duplicated movieId on either side would
# break that alignment.
movies_merged = movies.merge(movies_tags, on='movieId', how='left', validate='one_to_one')
display(movies_merged.head())

Unnamed: 0,movieId,title,genres,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,children Disney animation children Disney Disn...
1,2,Jumanji (1995),Adventure|Children|Fantasy,Robin Williams fantasy Robin Williams time tra...
2,3,Grumpier Old Men (1995),Comedy|Romance,comedinha de velhinhos engraÃƒÂ§ada comedinha ...
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,characters slurs based on novel or book chick ...
4,5,Father of the Bride Part II (1995),Comedy,Fantasy pregnancy remake family Steve Martin s...


In [26]:
# Report how many movies ended up without / with a tag string after the join.
print(f"Filmy bez tagów: {int(movies_merged['tag'].isna().sum())}")
print(f"Filmy z tagami: {int(movies_merged['tag'].notna().sum())}")

# Use an empty string for untagged movies so the later string concatenation
# does not propagate NaN.
movies_merged['tag'] = movies_merged['tag'].fillna(value='')


Filmy bez tagów: 36262
Filmy z tagami: 51323


#### Combine genres and tags into a single text feature

In [29]:
# Turn the pipe-delimited genre list into space-separated words.
# regex=False makes '|' a literal character on every pandas version; as a
# regular expression '|' is the alternation operator, and the default value of
# `regex` differs between pandas releases.
movies_merged['genres'] = movies_merged['genres'].str.replace('|', ' ', regex=False)

# One text document per movie: genre words followed by the movie's tags.
movies_merged['features'] = movies_merged['genres'] + ' ' + movies_merged['tag']

dataset = movies_merged[['movieId', 'title', 'features']]
display(dataset.head())

Unnamed: 0,movieId,title,features
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy ch...
1,2,Jumanji (1995),Adventure Children Fantasy Robin Williams fant...
2,3,Grumpier Old Men (1995),Comedy Romance comedinha de velhinhos engraÃƒÂ...
3,4,Waiting to Exhale (1995),Comedy Drama Romance characters slurs based on...
4,5,Father of the Bride Part II (1995),Comedy Fantasy pregnancy remake family Steve M...


#### Transform using TfidfVectorizer

In [31]:
# Vectorise the per-movie text: English stop words are removed and terms that
# occur in fewer than two documents are ignored (min_df=2).
vectorizer = TfidfVectorizer(min_df=2, stop_words='english')
tfid_matrix = vectorizer.fit_transform(dataset['features'])

# Matrix dimensions: (number of movies, number of retained terms).
print(tfid_matrix.shape)

(87585, 28306)


### Recommendation system

In [47]:
def build_user_profile(user_id, tfid_matrix, threshold=4.0):
    """
        Build a taste profile for one user: the mean feature vector of the
        movies that user rated at or above `threshold`.

        user_id     - value matched against ratings.userId
        tfid_matrix - feature matrix (sparse or dense); row i is expected to
                      describe the movie stored in row i of `movies`
        threshold   - only movies the user rated >= threshold contribute

        returns: array of shape (1, n_features); all zeros when the user has
                 no qualifying rating for a movie present in `movies`
    """
    liked_ids = ratings.loc[
        (ratings.userId == user_id) & (ratings.rating >= threshold), 'movieId'
    ]

    # Build the movieId -> row position lookup once instead of scanning the
    # whole catalogue for every rated movie. If a movieId is duplicated, the
    # first occurrence wins.
    movie_ids = movies.movieId.to_numpy()
    row_of_movie = pd.Series(np.arange(len(movie_ids)), index=movie_ids)
    row_of_movie = row_of_movie[~row_of_movie.index.duplicated(keep='first')]

    # Rated movies that are missing from the catalogue come back as NaN and
    # are skipped instead of raising.
    movies_idxs = (
        row_of_movie.reindex(liked_ids.to_numpy()).dropna().astype(int).to_numpy()
    )

    # Nothing to average: return a neutral profile rather than NaN.
    if movies_idxs.size == 0:
        return np.zeros((1, tfid_matrix.shape[1]))

    # mean() yields a (1, n) matrix for sparse matrices but a flat (n,) array
    # for dense input; normalise to a single 2-D row in both cases.
    user_vec = np.asarray(tfid_matrix[movies_idxs].mean(axis=0)).reshape(1, -1)
    return user_vec

In [48]:
def recommend_for_user(user_id, tfid_matrix, top_n=10, threshold=4.0):
    """
        Recommend movies the user has not rated yet, ranked by the cosine
        similarity between each movie's features and the user's profile.

        user_id     - value matched against ratings.userId
        tfid_matrix - feature matrix; row i is expected to describe the movie
                      stored in row i of `movies`
        top_n       - maximum number of movies to return
        threshold   - forwarded to build_user_profile: minimum rating for a
                      movie to contribute to the user's profile

        returns: rows of `movies` (columns movieId, title, genres), at most
                 top_n of them, ordered from most to least similar; movies
                 with equal scores keep their catalogue order
    """
    user_vec = build_user_profile(user_id, tfid_matrix, threshold=threshold)
    sim_scores = cosine_similarity(user_vec, tfid_matrix).flatten()

    # Row positions of movies the user has not rated at all (any rating value).
    seen = ratings.loc[ratings.userId == user_id, 'movieId']
    unseen_idxs = np.flatnonzero(~movies.movieId.isin(seen).to_numpy())

    # A stable sort on the negated scores gives descending order while leaving
    # equal scores in their original (catalogue) order.
    order = np.argsort(-sim_scores[unseen_idxs], kind='stable')
    top_idxs = unseen_idxs[order[:top_n]]
    return movies.iloc[top_idxs][['movieId', 'title', 'genres']]

In [50]:
# Example run: show the top recommendations for a single user.
user_id = 5
top = 10

# Use `top` in the message so the text stays correct when the value is changed.
print(f"{top} recommendations for user {user_id}: ")
display(recommend_for_user(user_id, tfid_matrix, top))

10 recommendations for user 5: 


Unnamed: 0,movieId,title,genres
3163,3256,Patriot Games (1992),Action|Crime|Drama|Thriller
1168,1198,Raiders of the Lost Ark (Indiana Jones and the...,Action|Adventure
1656,1722,Tomorrow Never Dies (1997),Action|Adventure|Thriller
1864,1953,"French Connection, The (1971)",Action|Crime|Thriller
1013,1036,Die Hard (1988),Action|Crime|Thriller
1550,1608,Air Force One (1997),Action|Thriller
899,920,Gone with the Wind (1939),Drama|Romance|War
1939,2028,Saving Private Ryan (1998),Action|Drama|War
1869,1958,Terms of Endearment (1983),Comedy|Drama
3525,3623,Mission: Impossible II (2000),Action|Adventure|Thriller
