Movie Recommender System

Steps:

1. import packages and data
2. train hybrid model on MovieLens data

    2a. Content-based for cold start

    2b. Collaborative filtering afterwards
    
3. evaluate model
4. deploy model using flask/etc.
    4a. Api calls to collect user events
    4b. Retrain model with new batch every night

User Perspective:

Simple website that requests an account signup. The user will input 10 movies and their ratings. When they watch movies they will update their profile. They will be provided with 10 recommended movies.



In [6]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix

In [7]:
small = 'data/ml-latest-small'
big = 'data/ml-32m'
folder = small

ratings = pd.read_csv(f'{folder}/ratings.csv')


movies = pd.read_csv(f'{folder}/movies.csv')

In [8]:
# Generates a sparse utility matrix
def create_X(df):
    """
    Args:
        df: pandas dataframe containing 3 columns (userId, movieId, rating)
    
    Returns:
        X: sparse matrix
        user_mapper: dict that maps user id's to user indices
        user_inv_mapper: dict that maps user indices to user id's
        movie_mapper: dict that maps movie id's to movie indices
        movie_inv_mapper: dict that maps movie indices to movie id's
    """
    M = df['userId'].nunique()
    N = df['movieId'].nunique()

    user_mapper = dict(zip(np.unique(df["userId"]), list(range(M))))
    movie_mapper = dict(zip(np.unique(df["movieId"]), list(range(N))))
    
    user_inv_mapper = dict(zip(list(range(M)), np.unique(df["userId"])))
    movie_inv_mapper = dict(zip(list(range(N)), np.unique(df["movieId"])))
    
    user_index = [user_mapper[i] for i in df['userId']]
    item_index = [movie_mapper[i] for i in df['movieId']]

    X = csr_matrix((df["rating"], (user_index,item_index)), shape=(M,N))
    
    return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper

In [9]:
# item-based collaborative filtering
from sklearn.neighbors import NearestNeighbors


def find_similar_movies(movie_id, X, movie_mapper, movie_inv_mapper, k, metric=
'cosine'):
    """
    Finds k-nearest neighbours for a given movie id.
    
    Args:
        movie_id: id of the movie of interest
        X: user-item utility matrix
        k: number of similar movies to retrieve
        metric: distance metric for kNN calculations
    
    Output: returns list of k similar movie ID's
    """
    X = X.T
    neighbour_ids = []
    
    movie_ind = movie_mapper[movie_id]
    movie_vec = X[movie_ind]
    if isinstance(movie_vec, (np.ndarray)):
        movie_vec = movie_vec.reshape(1,-1)
    # use k+1 since kNN output includes the movieId of interest
    kNN = NearestNeighbors(n_neighbors=k+1, algorithm="brute", metric=metric)
    kNN.fit(X)
    neighbour = kNN.kneighbors(movie_vec, return_distance=False)
    for i in range(0,k):
        n = neighbour.item(i)
        neighbour_ids.append(movie_inv_mapper[n])
    neighbour_ids.pop(0)
    return neighbour_ids

In [10]:
X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = create_X(ratings)

In [11]:
# item-based collaborative filtering
from sklearn.decomposition import TruncatedSVD
X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = create_X(ratings)
similar_movies = find_similar_movies(1, X, movie_mapper, movie_inv_mapper, k=10)

svd = TruncatedSVD(n_components=20, n_iter=10)
Q = svd.fit_transform(X.T)
Q.shape

movie_titles = dict(zip(movies['movieId'], movies['title']))
movie_id = 1
similar_movies = find_similar_movies(movie_id, Q.T, movie_mapper, movie_inv_mapper, metric='cosine', k=10)
movie_title = movie_titles[movie_id]

print(f"Because you watched {movie_title}:")
for i in similar_movies:
    print(movie_titles[i])

Because you watched Toy Story (1995):
Home Alone (1990)
Jurassic Park (1993)
Aladdin (1992)
Willy Wonka & the Chocolate Factory (1971)
Back to the Future (1985)
Forrest Gump (1994)
Groundhog Day (1993)
Star Wars: Episode IV - A New Hope (1977)
Princess Bride, The (1987)


In [None]:
# all-to-all item similarities (top k for each item)

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
import pickle

k = 15

svd = TruncatedSVD(n_components=20, n_iter=10)
Q = svd.fit_transform(X.T)
Q.shape
S = cosine_similarity(Q) # similarity matrix

seed_movies = [(1, 5.0), (2, 3.5), (3, 5.0),(4, 2.5), (5, 4.0),
               (6, 1.5),  (7, 1.0),  (8, 3.0),  (9, 2.5),  (10, 2.5),
    (11, 2.0), (12, 1.5), (13, 5.0), (14, 1.5), (15, 4.0),
    (16, 1.0), (17, 1.0), (18, 1.5), (19, 2.5), (20, 2.5),
    (21, 5.0), (22, 2.5), (23, 2.5), (24, 3.0), (25, 3.0),
    (26, 4.0), (27, 2.0), (28, 4.5), (29, 1.5), (30, 2.5),
    (31, 4.5), (32, 3.5), (33, 2.0), (34, 1.5), (35, 1.0),
    (36, 2.0), (37, 3.0), (38, 1.0), (39, 4.5), (40, 2.0),
    (41, 3.0), (42, 3.5), (43, 3.0), (44, 1.5), (45, 1.5),
    (46, 3.0), (47, 1.5), (48, 3.5), (49, 2.0), (50, 1.0),
    (51, 2.5), (52, 1.5), (53, 3.5), (54, 1.5), (55, 3.0),
    (57, 3.5), (58, 4.5), (59, 2.5), (60, 3.5),
    (61, 1.5), (62, 3.0), (63, 3.5), (64, 1.0), (65, 3.0),
    (66, 4.5), (67, 2.5), (68, 2.0), (69, 5.0), (70, 4.0),
    (71, 2.5), (72, 1.0), (73, 1.0), (74, 4.0), (75, 3.5),
    (76, 3.0), (77, 3.5), (78, 2.0), (79, 4.0), (80, 3.0),
    (81, 2.0), (82, 4.5), (83, 5.0), (85, 2.0),
    (86, 3.5), (87, 1.5), (88, 2.0), (89, 1.0), (90, 3.5),
    (91, 2.5), (92, 4.5), (93, 4.0), (94, 4.5), (95, 2.0),
    (96, 3.0), (97, 2.0), (98, 2.5), (99, 5.0), (100, 5.0),
    (101, 3.0), (102, 4.0), (103, 4.0), (104, 3.5), (105, 2.5),]

# take all movies rated higher than 4 stars
# if none, take top 3 movies
def find_highly_rated_movies(seed_movies):
    sorted_movies = sorted(seed_movies, key=lambda x: x[1], reverse=True)
    output = []
    for i in range(len(sorted_movies)):
        movie = sorted_movies[i]
        if movie[1] < 4.0 and len(output) > 3:
            break
        output.append(movie[0])
    return output


alpha = 10

# count corated users
def co_rating_count(i_index, j_index):
    col_i = X[:, i_index]
    col_j = X[:, j_index]
    common = col_i.multiply(col_j)
    return common.count_nonzero()


# Get top k movies based on weight of movie
# returns [(movie_id, weighted_sim, raw_sim, co_cnt, TESTING: og_movie_id)]
def topk_from_matrix_weighted(movie_id, k=15):
    i = movie_mapper[movie_id]
    row = S[i]  # raw SVD-cosine similarities to all items

    # find top (k+1) indices by raw cosine
    idxs = np.argpartition(-row, k+1)[: (k+1)]
    idxs = [j for j in idxs if j != i]
    idxs = sorted(idxs, key=lambda j: -row[j])[:k]

    results = []
    for j in idxs:
        raw_sim = row[j]
        co_cnt  = co_rating_count(i, j)
        shrink  = co_cnt / (co_cnt + alpha)
        if co_cnt < 10:
            weighted_sim = raw_sim * shrink
        else:
            weighted_sim = raw_sim
        results.append((movie_inv_mapper[j], weighted_sim, raw_sim, co_cnt, movie_id))
    # sort final list by descending weighted_sim
    results.sort(key=lambda tup: -tup[1])
    return results

# print(topk_from_matrix_weighted(1))


# helper function to produce sorted recommendation list
def insert_rec(all_recs, rec):
    score = rec[1]
    low = 0
    high = len(all_recs)
    while low < high:
        mid = (low + high) // 2

        if score > all_recs[mid][1]:
            high = mid
        else:
            low = mid + 1
    all_recs.insert(low, rec)



highly_rated_movies = find_highly_rated_movies(seed_movies)

all_recs = []
for i in range(len(highly_rated_movies)):
    recs = topk_from_matrix_weighted(highly_rated_movies[i])
    for rec in recs:
        insert_rec(all_recs, rec)

for rec in all_recs:
    # print(f'{movie_titles[rec[0]]} || {rec[1]}|| {movie_titles[rec[-1]]} ')
    print(rec)

"""
    Rec is of the form
    (movie_id, similarity)
"""


# topk_idx = np.argsort(-S, axis=1)[:,1:k+1]
# neighbors = {
#     movie_inv_mapper[i]: [movie_inv_mapper[j] for j in topk_idx[i]]
#     for i in range(S.shape[0])
# }
# with open('item_neighbors.pkl','wb') as f:
#     pickle.dump(neighbors, f)

Dave (1993) || 0.9251490911768534|| Get Shorty (1995) 
Cube (1997) || 0.915735064819071|| From Dusk Till Dawn (1996) 
Home Alone (1990) || 0.9143524104612629|| Toy Story (1995) 
Aladdin (1992) || 0.9059640678682034|| Toy Story (1995) 
Jurassic Park (1993) || 0.9055931334817263|| Toy Story (1995) 
I.Q. (1994) || 0.9025229529429379|| Dangerous Minds (1995) 
Dolores Claiborne (1995) || 0.9018603064347831|| Dangerous Minds (1995) 
In the Line of Fire (1993) || 0.9015753682464347|| Get Shorty (1995) 
Willy Wonka & the Chocolate Factory (1971) || 0.9003735245093182|| Toy Story (1995) 
Back to the Future (1985) || 0.8970020022007816|| Toy Story (1995) 
Tombstone (1993) || 0.8966216646050291|| Grumpier Old Men (1995) 
Groundhog Day (1993) || 0.896140511877725|| Toy Story (1995) 
Sabrina (1995) || 0.8953051214667411|| Dangerous Minds (1995) 
Philadelphia (1993) || 0.8947848545588678|| Dangerous Minds (1995) 
While You Were Sleeping (1995) || 0.894009432809737|| Father of the Bride Part II (1995

'\n    Rec is of the form\n    (movie_id, similarity)\n'

In [None]:
# print('S', S)
# movie_titles = dict(zip(movies['movieId'], movies['title']))
# movie_titles_inv = dict(zip(movies['title'], movies['movieId']))



# movie = 673
# title = movie_titles[movie]

# similar_movies = neighbors.get(movie)[:10]
# # movie_titles = [movie_titles[mov] for mov in similar_movies]
# # print(f'Similar movies to {title}:', movie_titles) 

S [[ 1.          0.83438553  0.63196475 ...  0.07649183  0.07649183
   0.19202011]
 [ 0.83438553  1.          0.70275044 ...  0.09481015  0.09481015
   0.09781401]
 [ 0.63196475  0.70275044  1.         ... -0.14120941 -0.14120941
  -0.04933272]
 ...
 [ 0.07649183  0.09481015 -0.14120941 ...  1.          1.
   0.3618736 ]
 [ 0.07649183  0.09481015 -0.14120941 ...  1.          1.
   0.3618736 ]
 [ 0.19202011  0.09781401 -0.04933272 ...  0.3618736   0.3618736
   1.        ]]


In [None]:
# later: 
# with open('item_neighbors.pkl','rb') as f:
#     neighbors = pickle.load(f)
# then neighbors[movie_id] gives your k similar movies directly