Movie Recommender System

Steps:

1. import packages and data
2. train hybrid model on MovieLens data

    2a. Content-based for cold start

    2b. Collaborative filtering afterwards
    
3. evaluate model
4. deploy model using flask/etc.
    4a. Api calls to collect user events
    4b. Retrain model with new batch every night

User Perspective:

Simple website that requests an account signup. The user will input 10 movies and their ratings. When they watch movies they will update their profile. They will be provided with 10 recommended movies.



In [2]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from dotenv import load_dotenv
import os
load_dotenv()
POSTGRES_USER = os.getenv("POSTGRES_USER")
POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD")



In [3]:
from sqlalchemy import create_engine, text
url = f"postgresql+psycopg2://{POSTGRES_USER}:{POSTGRES_PASSWORD}@localhost:5432/movie_db"
engine = create_engine(url)
with engine.connect() as conn:
    print(conn.execute(text("SELECT 1")).scalar())



1


In [4]:
small = 'data/ml-latest-small'
big = 'data/ml-32m'
folder = big

ratings = pd.read_csv(f'{folder}/ratings.csv')


movies = pd.read_csv(f'{folder}/movies.csv')

In [5]:
# Generates a sparse utility matrix
def create_X(df):
    """
    Args:
        df: pandas dataframe containing 3 columns (userId, movieId, rating)
    
    Returns:
        X: sparse matrix
        user_mapper: dict that maps user id's to user indices
        user_inv_mapper: dict that maps user indices to user id's
        movie_mapper: dict that maps movie id's to movie indices
        movie_inv_mapper: dict that maps movie indices to movie id's
    """
    M = df['userId'].nunique()
    N = df['movieId'].nunique()

    user_mapper = dict(zip(np.unique(df["userId"]), list(range(M))))
    movie_mapper = dict(zip(np.unique(df["movieId"]), list(range(N))))
    
    user_inv_mapper = dict(zip(list(range(M)), np.unique(df["userId"])))
    movie_inv_mapper = dict(zip(list(range(N)), np.unique(df["movieId"])))
    
    user_index = [user_mapper[i] for i in df['userId']]
    item_index = [movie_mapper[i] for i in df['movieId']]

    X = csr_matrix((df["rating"], (user_index,item_index)), shape=(M,N))
    
    return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper

In [6]:

import time
X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = create_X(ratings)
movie_titles = dict(zip(movies['movieId'], movies['title']))


In [18]:
import numpy as np
import scipy.sparse as sp
from sklearn.neighbors import NearestNeighbors

# X is (n_users × n_items) csr_matrix already loaded.

X_csc = X.tocsc()  # shape = (n_users, n_items)

n_items = X_csc.shape[1]
# supports[i] = set of user‐indices who rated item i
supports = []
for i in range(n_items):
    # nonzero()[0] gives the row‐indices of nonzero entries in column i
    users_who_rated_i = set(X_csc[:, i].nonzero()[0])
    supports.append(users_who_rated_i)

# fit NearestNeighbors on the (n_items × n_users) transpose:
item_features = X_csc.T  # now shape = (n_items, n_users), still sparse

K = 15
nn = NearestNeighbors(
    n_neighbors=K + 1,
    metric="cosine",
    algorithm="brute",
    n_jobs=-1,
)
nn.fit(item_features)

distances, indices = nn.kneighbors(item_features, return_distance=True)

alpha = 10  # shrinkage parameter
anchor_ids = []
neighbor_ids = []
raw_sims   = []
co_counts  = []
weighted_sims = []

for i in range(n_items):
    anchor_id = movie_inv_mapper[i]
    for rank in range(1, K+1):
        j = indices[i][rank]
        neighbor_id = movie_inv_mapper[j]

        if anchor_id < neighbor_id:
            raw_sim = 1.0 - distances[i][rank]
            co_cnt  = len(supports[i] & supports[j])
            shrink  = co_cnt / (co_cnt + alpha)
            w_sim   = raw_sim * shrink

            anchor_ids.append(anchor_id)
            neighbor_ids.append(neighbor_id)
            raw_sims.append(raw_sim)
            co_counts.append(co_cnt)
            weighted_sims.append(w_sim)
# FORM (anchor_ids, neighbor_ids, raw_sims, co_counts, weighted_sims)

In [8]:

print(neighbor_ids[0])
print(type(raw_sims[0]))
print(co_counts[0])
print(weighted_sims[0])

3114
<class 'float'>
26553
0.5745980425227315


In [None]:
# from models import MovieSimilarity, Movie
# from db import SessionLocal

# def insert_movies():
#     session = SessionLocal()
#     batch = [
#         Movie(id=row.movieId, title=row.title, genres=row.genres)
#         for row in movies.itertuples()
#     ]
#     session.bulk_save_objects(batch)
#     session.commit()
#     session.close()

# def insert_all_similarities():
#     session = SessionLocal()
#     batch = []
#     # anchor_ids, neighbor_ids, raw_sims, co_counts, weighted_sims
#     for (a_id, n_id, r_sim, c_cnt, w_sim) in zip(
#         anchor_ids, neighbor_ids, raw_sims, co_counts, weighted_sims
#     ):
#         batch.append(
#             MovieSimilarity(
#                 movie_id=a_id,
#                 neighbor_id=n_id,
#                 raw_sim=r_sim,
#                 co_count=c_cnt,
#                 weighted_sim=w_sim,
#             # )
#         )

#     # Bulk‐save
#     session.bulk_save_objects(batch)
#     session.commit()
#     session.close()
# # insert_movies()
# # insert_all_similarities()


# need to update db
## TODO: Change functions to delete and repopulate tables


IntegrityError: (psycopg2.errors.UniqueViolation) duplicate key value violates unique constraint "movies_pkey"
DETAIL:  Key (id)=(1) already exists.

[SQL: INSERT INTO movies (id, title, genres) VALUES (%(id__0)s, %(title__0)s, %(genres__0)s), (%(id__1)s, %(title__1)s, %(genres__1)s), (%(id__2)s, %(title__2)s, %(genres__2)s), (%(id__3)s, %(title__3)s, %(genres__3)s), (%(id__4)s, %(title__4)s, %(genres__ ... 47364 characters truncated ... 7)s), (%(id__998)s, %(title__998)s, %(genres__998)s), (%(id__999)s, %(title__999)s, %(genres__999)s)]
[parameters: {'genres__0': 'Adventure|Animation|Children|Comedy|Fantasy', 'id__0': 1, 'title__0': 'Toy Story (1995)', 'genres__1': 'Adventure|Children|Fantasy', 'id__1': 2, 'title__1': 'Jumanji (1995)', 'genres__2': 'Comedy|Romance', 'id__2': 3, 'title__2': 'Grumpier Old Men (1995)', 'genres__3': 'Comedy|Drama|Romance', 'id__3': 4, 'title__3': 'Waiting to Exhale (1995)', 'genres__4': 'Comedy', 'id__4': 5, 'title__4': 'Father of the Bride Part II (1995)', 'genres__5': 'Action|Crime|Thriller', 'id__5': 6, 'title__5': 'Heat (1995)', 'genres__6': 'Comedy|Romance', 'id__6': 7, 'title__6': 'Sabrina (1995)', 'genres__7': 'Adventure|Children', 'id__7': 8, 'title__7': 'Tom and Huck (1995)', 'genres__8': 'Action', 'id__8': 9, 'title__8': 'Sudden Death (1995)', 'genres__9': 'Action|Adventure|Thriller', 'id__9': 10, 'title__9': 'GoldenEye (1995)', 'genres__10': 'Comedy|Drama|Romance', 'id__10': 11, 'title__10': 'American President, The (1995)', 'genres__11': 'Comedy|Horror', 'id__11': 12, 'title__11': 'Dracula: Dead and Loving It (1995)', 'genres__12': 'Adventure|Animation|Children', 'id__12': 13, 'title__12': 'Balto (1995)', 'genres__13': 'Drama', 'id__13': 14, 'title__13': 'Nixon (1995)', 'genres__14': 'Action|Adventure|Romance', 'id__14': 15, 'title__14': 'Cutthroat Island (1995)', 'genres__15': 'Crime|Drama', 'id__15': 16, 'title__15': 'Casino (1995)', 'genres__16': 'Drama|Romance', 'id__16': 17 ... 2900 parameters truncated ... 'id__983': 1006, 'title__983': 'Chamber, The (1996)', 'genres__984': 'Children|Comedy|Western', 'id__984': 1007, 'title__984': 'Apple Dumpling Gang, The (1975)', 'genres__985': 'Adventure|Western', 'id__985': 1008, 'title__985': 'Davy Crockett, King of the Wild Frontier (1955)', 'genres__986': 'Adventure|Children|Fantasy', 'id__986': 1009, 'title__986': 'Escape to Witch Mountain (1975)', 'genres__987': 'Children|Comedy', 'id__987': 1010, 'title__987': 'Love Bug, The (1969)', 'genres__988': 'Children|Comedy|Fantasy|Romance', 'id__988': 1011, 'title__988': 'Herbie Rides Again (1974)', 'genres__989': 'Children|Drama', 'id__989': 1012, 'title__989': 'Old Yeller (1957)', 'genres__990': 'Children|Comedy|Romance', 'id__990': 1013, 'title__990': 'Parent Trap, The (1961)', 'genres__991': 'Children|Comedy|Drama', 'id__991': 1014, 'title__991': 'Pollyanna (1960)', 'genres__992': 'Adventure|Children|Drama', 'id__992': 1015, 'title__992': 'Homeward Bound: The Incredible Journey (1993)', 'genres__993': 'Children|Comedy', 'id__993': 1016, 'title__993': 'Shaggy Dog, The (1959)', 'genres__994': 'Adventure|Children', 'id__994': 1017, 'title__994': 'Swiss Family Robinson (1960)', 'genres__995': 'Children|Comedy|Mystery', 'id__995': 1018, 'title__995': 'That Darn Cat! (1965)', 'genres__996': 'Adventure|Drama|Sci-Fi', 'id__996': 1019, 'title__996': '20,000 Leagues Under the Sea (1954)', 'genres__997': 'Comedy', 'id__997': 1020, 'title__997': 'Cool Runnings (1993)', 'genres__998': 'Children|Comedy', 'id__998': 1021, 'title__998': 'Angels in the Outfield (1994)', 'genres__999': 'Animation|Children|Fantasy|Musical|Romance', 'id__999': 1022, 'title__999': 'Cinderella (1950)'}]
(Background on this error at: https://sqlalche.me/e/20/gkpj)

In [None]:
# insert_all_similarities()

In [10]:
from db import SessionLocal
from models import MovieSimilarity

session = SessionLocal()

# Query the first 10 rows
examples = (
    session.query(MovieSimilarity)
           .limit(10)
           .all()
)
print(examples)
for row in examples:
    print(
        f"movie_id={row.movie_id}, "
        f"neighbor_id={row.neighbor_id}, "
        f"raw_sim={row.raw_sim:.4f}, "
        f"co_count={row.co_count}, "
        f"weighted_sim={row.weighted_sim:.4f}"
    )

session.close()

[<models.MovieSimilarity object at 0x3754234d0>, <models.MovieSimilarity object at 0x467e99450>, <models.MovieSimilarity object at 0x467e99310>, <models.MovieSimilarity object at 0x37542b100>, <models.MovieSimilarity object at 0x37542b230>, <models.MovieSimilarity object at 0x386ae20f0>, <models.MovieSimilarity object at 0x375496690>, <models.MovieSimilarity object at 0x3754967a0>, <models.MovieSimilarity object at 0x467e6b850>, <models.MovieSimilarity object at 0x467e6b950>]
movie_id=292737, neighbor_id=289301, raw_sim=1.0000, co_count=1, weighted_sim=0.0909
movie_id=292737, neighbor_id=279890, raw_sim=1.0000, co_count=1, weighted_sim=0.0909
movie_id=292737, neighbor_id=290215, raw_sim=1.0000, co_count=1, weighted_sim=0.0909
movie_id=292753, neighbor_id=292753, raw_sim=1.0000, co_count=1, weighted_sim=0.0909
movie_id=292753, neighbor_id=253662, raw_sim=1.0000, co_count=1, weighted_sim=0.0909
movie_id=292753, neighbor_id=230665, raw_sim=0.9615, co_count=1, weighted_sim=0.0874
movie_id=

In [17]:
# test with seed movies

# grab all similar movies to movie 
from sqlalchemy import or_
import heapq

def topk_movies(movie_id, k):
    session = SessionLocal()
    rows = (
        session.query(MovieSimilarity)
            .filter(
                or_(
                    MovieSimilarity.movie_id   == movie_id,
                    MovieSimilarity.neighbor_id == movie_id
                )
                
            )
            .order_by(MovieSimilarity.weighted_sim.desc())
            .all()
    )
    # for r in rows:
    #     print(r.movie_id, r.neighbor_id, r.raw_sim, r.co_count, r.weighted_sim)
    session.close()
    return rows[:k]

seed_movies = [(1, 5.0), (2, 3.5), (3, 5.0),(4, 2.5), (5, 4.0),
               (6, 1.5),  (7, 1.0),  (8, 3.0),  (9, 2.5),  (10, 2.5),
    (11, 2.0), (12, 1.5), (13, 5.0), (14, 1.5), (15, 4.0),
    (16, 1.0), (17, 1.0), (18, 1.5), (19, 2.5), (20, 2.5),
    (21, 5.0), (22, 2.5), (23, 2.5), (24, 3.0), (25, 3.0),
    (26, 4.0), (27, 2.0), (28, 4.5), (29, 1.5), (30, 2.5),
    (31, 4.5), (32, 3.5), (33, 2.0), (34, 1.5), (35, 1.0),
    (36, 2.0), (37, 3.0), (38, 1.0), (39, 4.5), (40, 2.0),
    (41, 3.0), (42, 3.5), (43, 3.0), (44, 1.5), (45, 1.5),
    (46, 3.0), (47, 1.5), (48, 3.5), (49, 2.0), (50, 1.0),
    (51, 2.5), (52, 1.5), (53, 3.5), (54, 1.5), (55, 3.0),
    (57, 3.5), (58, 4.5), (59, 2.5), (60, 3.5),
    (61, 1.5), (62, 3.0), (63, 3.5), (64, 1.0), (65, 3.0),
    (66, 4.5), (67, 2.5), (68, 2.0), (69, 5.0), (70, 4.0),
    (71, 2.5), (72, 1.0), (73, 1.0), (74, 4.0), (75, 3.5),
    (76, 3.0), (77, 3.5), (78, 2.0), (79, 4.0), (80, 3.0),
    (81, 2.0), (82, 4.5), (83, 5.0), (85, 2.0),
    (86, 3.5), (87, 1.5), (88, 2.0), (89, 1.0), (90, 3.5),
    (91, 2.5), (92, 4.5), (93, 4.0), (94, 4.5), (95, 2.0),
    (96, 3.0), (97, 2.0), (98, 2.5), (99, 5.0), (100, 5.0),
    (101, 3.0), (102, 4.0), (103, 4.0), (104, 3.5), (105, 2.5),]


minimum_seed_count = 3
# take all movies rated higher than 4 stars
# if none, take top 3 movies
def find_highly_rated_movies(seed_movies):
    sorted_movies = sorted(seed_movies, key=lambda x: x[1], reverse=True)
    output = []
    for i in range(len(sorted_movies)):
        movie = sorted_movies[i]
        if movie[1] < 4.0 and len(output) > minimum_seed_count:
            break
        output.append(movie[0])
    return output

highly_rated_movies = find_highly_rated_movies(seed_movies)



def find_recommended_movies(rated_movies):
    heap = []
    lists = []

    for i in range(len(rated_movies)):
        list = topk_movies(rated_movies[i], 10)
        lists.append(list)
        elem = list[0]
        score = -elem.weighted_sim # negative score so its descending order
        heap.append((score, i, 0, elem))
    heapq.heapify(heap)


    result = []
    while heap:
        score, i, j, elem = heapq.heappop(heap)
        result.append(elem.movie_id)
        result.append(elem.neighbor_id)

        # advance in list i
        if j + 1 < len(lists[i]):
            nxt = lists[i][j + 1]
            nxt_score = -nxt.weighted_sim
            heapq.heappush(heap, (nxt_score, i, j + 1, nxt))
    return result

print(find_recommended_movies(highly_rated_movies))

[1, 3114, 3114, 1, 260, 1, 1, 260, 1, 1270, 1, 356, 1, 480, 1, 364, 364, 1, 1, 1210, 21, 457, 21, 380, 21, 300, 300, 21, 39, 597, 21, 592, 39, 539, 21, 377, 39, 357, 357, 39, 21, 349, 39, 500, 357, 21, 21, 357, 39, 377, 39, 588, 34, 39, 39, 34, 39, 339, 21, 590, 5, 3, 3, 5, 3, 5, 5, 3, 2167, 70, 70, 2167, 70, 1676, 3, 736, 163, 70, 70, 163, 62, 5, 5, 62, 1320, 70, 70, 1320, 788, 3, 3, 788, 1215, 70, 70, 1215, 7, 5, 5, 7, 70, 1573, 3, 62, 62, 3, 5, 736, 3, 494, 494, 3, 79, 494, 494, 79, 3, 7, 140, 74, 74, 140, 58, 17, 17, 58, 58, 265, 265, 58, 5, 141, 5, 788, 5, 708, 58, 36, 36, 58, 61, 79, 79, 61, 58, 52, 52, 58, 140, 79, 79, 140, 58, 25, 28, 17, 58, 1183, 376, 79, 79, 376, 28, 838, 838, 28, 640, 79, 79, 640, 31, 185, 31, 350, 100, 79, 79, 100, 31, 256, 100, 14, 14, 100, 31, 292, 100, 707, 707, 100, 31, 186, 74, 708, 74, 7, 3177, 69, 69, 3177, 31, 22, 31, 339, 31, 587, 31, 168, 100, 494, 74, 64, 64, 74, 31, 454, 100, 640, 640, 100, 69, 180, 1753, 69, 69, 1753, 94, 708, 100, 628, 69, 44

In [None]:
# import numpy as np
# from sklearn.metrics.pairwise import cosine_similarity
# from sklearn.decomposition import TruncatedSVD
# import pickle
# import time



# start = time.time()
# X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = create_X(ratings)
# end = time.time()
# print(end-start)

# start = time.time()
# svd = TruncatedSVD(n_components=20, n_iter=10)
# Q = svd.fit_transform(X.T)
# Q.shape
# end = time.time()
# print(end-start)

# start = time.time()
# S = cosine_similarity(Q) # similarity matrix
# end = time.time()
# print(end-start)

# start = time.time()
# # Save S to database
# with open('database/similarity_matrix.pkl','wb') as f:
#     pickle.dump(S, f)
# end = time.time()
# print(end - start)

In [10]:


# print('S', S)
# movie_titles = dict(zip(movies['movieId'], movies['title']))
# movie_titles_inv = dict(zip(movies['title'], movies['movieId']))



# movie = 673
# title = movie_titles[movie]

# similar_movies = neighbors.get(movie)[:10]
# # movie_titles = [movie_titles[mov] for mov in similar_movies]
# # print(f'Similar movies to {title}:', movie_titles) 

In [11]:
# later: 
# with open('item_neighbors.pkl','rb') as f:
#     neighbors = pickle.load(f)
# then neighbors[movie_id] gives your k similar movies directly