Movie Recommender System

Steps:

1. import packages and data
2. train hybrid model on MovieLens data

    2a. Content-based for cold start

    2b. Collaborative filtering afterwards
    
3. evaluate model
4. deploy the model using Flask (or a similar web framework)
    4a. API calls to collect user events
    4b. Retrain the model every night on the newly collected batch

User Perspective:

A simple website that requires an account signup. On signup, the user enters 10 movies along with their ratings. As they watch more movies, they update their profile. In return, they are shown 10 recommended movies.



In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from dotenv import load_dotenv
import os
# Load environment settings via python-dotenv, then read the Postgres
# credentials; each value is None when the variable is not set.
load_dotenv()
POSTGRES_USER = os.environ.get("POSTGRES_USER")
POSTGRES_PASSWORD = os.environ.get("POSTGRES_PASSWORD")



In [2]:
from sqlalchemy import create_engine, text
from urllib.parse import quote_plus

# Fail early with a clear message instead of connecting as the literal
# user "None" when the credentials were never loaded.
if POSTGRES_USER is None or POSTGRES_PASSWORD is None:
    raise RuntimeError(
        "POSTGRES_USER and POSTGRES_PASSWORD must be set in the environment"
    )

# Percent-encode the credentials: characters such as '@', ':' or '/' in a
# password would otherwise be parsed as URL delimiters.
url = (
    "postgresql+psycopg2://"
    f"{quote_plus(POSTGRES_USER)}:{quote_plus(POSTGRES_PASSWORD)}"
    "@localhost:5432/movie_db"
)
engine = create_engine(url)

# Smoke test: open a connection and run a trivial query.
with engine.connect() as conn:
    print(conn.execute(text("SELECT 1")).scalar())



1


In [3]:
# Two dataset directories are available; `folder` selects the one to load.
small = 'data/ml-latest-small'
big = 'data/ml-32m'
folder = big

# Read ratings.csv and movies.csv from the selected directory.
ratings, movies = (
    pd.read_csv(f'{folder}/{table}.csv') for table in ('ratings', 'movies')
)

In [4]:
# Generates a sparse utility matrix
def create_X(df):
    """
    Build the sparse user-item rating matrix and its id <-> index lookups.

    Args:
        df: pandas dataframe containing 3 columns (userId, movieId, rating)

    Returns:
        A 5-tuple, in this order:
        X: sparse csr_matrix of shape (n_users, n_movies) holding the ratings
        user_mapper: dict that maps user id's to user indices
        movie_mapper: dict that maps movie id's to movie indices
        user_inv_mapper: dict that maps user indices to user id's
        movie_inv_mapper: dict that maps movie indices to movie id's
    """
    # np.unique returns the sorted distinct ids and, with return_inverse,
    # the position of every row's id in that sorted array -- i.e. exactly
    # the row/column index of each rating, without per-row dict lookups.
    user_ids, user_index = np.unique(
        df["userId"].to_numpy(), return_inverse=True
    )
    movie_ids, item_index = np.unique(
        df["movieId"].to_numpy(), return_inverse=True
    )
    M = len(user_ids)
    N = len(movie_ids)

    user_mapper = dict(zip(user_ids, range(M)))
    movie_mapper = dict(zip(movie_ids, range(N)))

    user_inv_mapper = dict(zip(range(M), user_ids))
    movie_inv_mapper = dict(zip(range(N), movie_ids))

    X = csr_matrix(
        (df["rating"].to_numpy(), (user_index, item_index)), shape=(M, N)
    )

    return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper

In [5]:

import time
# Build the rating matrix together with its four id/index lookup dicts.
(
    X,
    user_mapper,
    movie_mapper,
    user_inv_mapper,
    movie_inv_mapper,
) = create_X(ratings)

# movieId -> title lookup.
movie_titles = {
    movie_id: title
    for movie_id, title in zip(movies['movieId'], movies['title'])
}


In [6]:
import numpy as np
import scipy.sparse as sp
from sklearn.neighbors import NearestNeighbors

# X is (n_users × n_items) csr_matrix already loaded.

X_csc = X.tocsc()  # shape = (n_users, n_items)

n_items = X_csc.shape[1]
# supports[i] = set of user-indices who rated item i
# (nonzero()[0] gives the row-indices of the nonzero entries in column i)
supports = [set(X_csc[:, i].nonzero()[0]) for i in range(n_items)]

# fit NearestNeighbors on the (n_items × n_users) transpose:
item_features = X_csc.T  # now shape = (n_items, n_users), still sparse

K = 15
# Ask for K + 1 neighbours because each item is also a candidate neighbour
# of itself when the training matrix is used as the query.
nn = NearestNeighbors(
    n_neighbors=K + 1,
    metric="cosine",
    algorithm="brute",
    n_jobs=-1,
)
nn.fit(item_features)

distances, indices = nn.kneighbors(item_features, return_distance=True)

alpha = 10  # shrinkage parameter
anchor_ids = []
neighbor_ids = []
raw_sims   = []
co_counts  = []
weighted_sims = []

for i in range(n_items):
    anchor_id = movie_inv_mapper[i]

    # Drop the anchor itself by index rather than by assuming it is at
    # rank 0: items with tied distances (e.g. identical rating columns) can
    # be returned ahead of the anchor, which would otherwise emit a
    # self-pair and lose a genuine neighbour.
    candidates = [
        (j, dist)
        for j, dist in zip(indices[i], distances[i])
        if j != i
    ][:K]

    for j, dist in candidates:
        neighbor_id = movie_inv_mapper[j]

        # cosine similarity = 1 - cosine distance
        raw_sim = 1.0 - dist

        # set intersection for efficient co_count computation
        co_cnt = len(supports[i] & supports[j])
        # Shrink similarities backed by few co-rating users towards 0.
        shrink = co_cnt / (co_cnt + alpha)
        w_sim = raw_sim * shrink

        anchor_ids.append(int(anchor_id))
        neighbor_ids.append(int(neighbor_id))
        raw_sims.append(float(raw_sim))
        co_counts.append(co_cnt)
        weighted_sims.append(float(w_sim))

# FORM (anchor_ids, neighbor_ids, raw_sims, co_counts, weighted_sims)

In [7]:

# Spot-check the first generated pair, one value per line.
print(
    neighbor_ids[0],
    type(raw_sims[0]),
    co_counts[0],
    weighted_sims[0],
    sep="\n",
)

3114
<class 'float'>
26553
0.5745980425227315


In [None]:
from models import MovieSimilarity, Movie
from db import SessionLocal

def insert_movies():
    """Bulk-save one ``Movie`` per row of the ``movies`` DataFrame.

    Each ``Movie`` is built from the row's ``movieId``, ``title`` and
    ``genres`` columns. The whole batch is committed in one transaction; on
    any error the transaction is rolled back and the exception re-raised.
    The session is always closed.
    """
    session = SessionLocal()
    try:
        batch = [
            Movie(id=row.movieId, title=row.title, genres=row.genres)
            for row in movies.itertuples()
        ]
        session.bulk_save_objects(batch)
        session.commit()
    except Exception:
        # Leave no half-applied transaction behind before propagating.
        session.rollback()
        raise
    finally:
        session.close()

def insert_all_similarities():
    """Bulk-save one ``MovieSimilarity`` per computed (anchor, neighbour) pair.

    Reads the parallel lists ``anchor_ids``, ``neighbor_ids``, ``raw_sims``,
    ``co_counts`` and ``weighted_sims`` and commits all rows in a single
    transaction. On any error the transaction is rolled back and the
    exception re-raised. The session is always closed.
    """
    session = SessionLocal()
    try:
        # anchor_ids, neighbor_ids, raw_sims, co_counts, weighted_sims
        batch = [
            MovieSimilarity(
                movie_id=a_id,
                neighbor_id=n_id,
                raw_sim=r_sim,
                co_count=c_cnt,
                weighted_sim=w_sim,
            )
            for a_id, n_id, r_sim, c_cnt, w_sim in zip(
                anchor_ids, neighbor_ids, raw_sims, co_counts, weighted_sims
            )
        ]

        # Bulk-save
        session.bulk_save_objects(batch)
        session.commit()
    except Exception:
        # Leave no half-applied transaction behind before propagating.
        session.rollback()
        raise
    finally:
        session.close()
# Save every row of `movies` through insert_movies().
# NOTE(review): re-running this cell saves the same rows again; whether that
# fails or duplicates depends on the table's constraints -- confirm.
insert_movies()

In [12]:
# Save all computed (anchor, neighbour) similarity rows.
# NOTE(review): presumably requires the referenced movies to exist already
# (insert_movies() is run first in this notebook) -- confirm against the models.
insert_all_similarities()

In [13]:
from db import SessionLocal
from models import MovieSimilarity

session = SessionLocal()
try:
    # Query the first 10 rows
    examples = (
        session.query(MovieSimilarity)
               .limit(10)
               .all()
    )
    print(examples)
    for row in examples:
        print(
            f"movie_id={row.movie_id}, "
            f"neighbor_id={row.neighbor_id}, "
            f"raw_sim={row.raw_sim:.4f}, "
            f"co_count={row.co_count}, "
            f"weighted_sim={row.weighted_sim:.4f}"
        )
finally:
    # Close the session even if the query or the formatting raises.
    session.close()

[<models.MovieSimilarity object at 0x4582a9d50>, <models.MovieSimilarity object at 0x4582a9dd0>, <models.MovieSimilarity object at 0x4582aa0d0>, <models.MovieSimilarity object at 0x4582ab4d0>, <models.MovieSimilarity object at 0x4582abd50>, <models.MovieSimilarity object at 0x4582ab2d0>, <models.MovieSimilarity object at 0x4582ab650>, <models.MovieSimilarity object at 0x4582abe50>, <models.MovieSimilarity object at 0x4582abbd0>, <models.MovieSimilarity object at 0x4582ab3d0>]
movie_id=1, neighbor_id=3114, raw_sim=0.5748, co_count=26553, weighted_sim=0.5746
movie_id=1, neighbor_id=260, raw_sim=0.5617, co_count=44291, weighted_sim=0.5616
movie_id=1, neighbor_id=1270, raw_sim=0.5455, co_count=36826, weighted_sim=0.5454
movie_id=1, neighbor_id=356, raw_sim=0.5416, co_count=47506, weighted_sim=0.5415
movie_id=1, neighbor_id=480, raw_sim=0.5399, co_count=40580, weighted_sim=0.5398
movie_id=1, neighbor_id=364, raw_sim=0.5367, co_count=32569, weighted_sim=0.5366
movie_id=1, neighbor_id=1210, r

In [None]:
# test with seed movies
# Hand-written list of (id, rating) pairs, shaped like the `seed_movies`
# argument of find_highly_rated_movies. Ids run from 1 to 105, with 56 and 84
# absent.
# NOTE(review): ids are presumably MovieLens movieId values -- confirm.

seed_movies = [(1, 5.0), (2, 3.5), (3, 5.0),(4, 2.5), (5, 4.0),
               (6, 1.5),  (7, 1.0),  (8, 3.0),  (9, 2.5),  (10, 2.5),
    (11, 2.0), (12, 1.5), (13, 5.0), (14, 1.5), (15, 4.0),
    (16, 1.0), (17, 1.0), (18, 1.5), (19, 2.5), (20, 2.5),
    (21, 5.0), (22, 2.5), (23, 2.5), (24, 3.0), (25, 3.0),
    (26, 4.0), (27, 2.0), (28, 4.5), (29, 1.5), (30, 2.5),
    (31, 4.5), (32, 3.5), (33, 2.0), (34, 1.5), (35, 1.0),
    (36, 2.0), (37, 3.0), (38, 1.0), (39, 4.5), (40, 2.0),
    (41, 3.0), (42, 3.5), (43, 3.0), (44, 1.5), (45, 1.5),
    (46, 3.0), (47, 1.5), (48, 3.5), (49, 2.0), (50, 1.0),
    (51, 2.5), (52, 1.5), (53, 3.5), (54, 1.5), (55, 3.0),
    (57, 3.5), (58, 4.5), (59, 2.5), (60, 3.5),
    (61, 1.5), (62, 3.0), (63, 3.5), (64, 1.0), (65, 3.0),
    (66, 4.5), (67, 2.5), (68, 2.0), (69, 5.0), (70, 4.0),
    (71, 2.5), (72, 1.0), (73, 1.0), (74, 4.0), (75, 3.5),
    (76, 3.0), (77, 3.5), (78, 2.0), (79, 4.0), (80, 3.0),
    (81, 2.0), (82, 4.5), (83, 5.0), (85, 2.0),
    (86, 3.5), (87, 1.5), (88, 2.0), (89, 1.0), (90, 3.5),
    (91, 2.5), (92, 4.5), (93, 4.0), (94, 4.5), (95, 2.0),
    (96, 3.0), (97, 2.0), (98, 2.5), (99, 5.0), (100, 5.0),
    (101, 3.0), (102, 4.0), (103, 4.0), (104, 3.5), (105, 2.5),]


# Smallest number of seeds to return when too few movies reach 4 stars.
minimum_seed_count = 3


def find_highly_rated_movies(seed_movies):
    """Return the ids of the user's best-rated movies, highest rating first.

    Every movie rated 4.0 or higher is kept. If fewer than
    ``minimum_seed_count`` movies reach that rating, the next best-rated
    movies are added until ``minimum_seed_count`` ids are collected (or the
    input runs out).

    Args:
        seed_movies: iterable of (movie id, rating) pairs.

    Returns:
        List of movie ids ordered by descending rating; ties keep their
        input order.
    """
    ranked = sorted(seed_movies, key=lambda pair: pair[1], reverse=True)
    output = []
    for movie in ranked:
        # Ratings are sorted descending, so once one falls below 4.0 all
        # later ones do too; stop as soon as the minimum is satisfied.
        # `>=` (not `>`) so the fallback yields exactly minimum_seed_count.
        if movie[1] < 4.0 and len(output) >= minimum_seed_count:
            break
        output.append(movie[0])
    return output

In [None]:
# import numpy as np
# from sklearn.metrics.pairwise import cosine_similarity
# from sklearn.decomposition import TruncatedSVD
# import pickle
# import time



# start = time.time()
# X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = create_X(ratings)
# end = time.time()
# print(end-start)

# start = time.time()
# svd = TruncatedSVD(n_components=20, n_iter=10)
# Q = svd.fit_transform(X.T)
# Q.shape
# end = time.time()
# print(end-start)

# start = time.time()
# S = cosine_similarity(Q) # similarity matrix
# end = time.time()
# print(end-start)

# start = time.time()
# # Save S to database
# with open('database/similarity_matrix.pkl','wb') as f:
#     pickle.dump(S, f)
# end = time.time()
# print(end - start)

In [10]:


# print('S', S)
# movie_titles = dict(zip(movies['movieId'], movies['title']))
# movie_titles_inv = dict(zip(movies['title'], movies['movieId']))



# movie = 673
# title = movie_titles[movie]

# similar_movies = neighbors.get(movie)[:10]
# # movie_titles = [movie_titles[mov] for mov in similar_movies]
# # print(f'Similar movies to {title}:', movie_titles) 

In [11]:
# later: 
# with open('item_neighbors.pkl','rb') as f:
#     neighbors = pickle.load(f)
# then neighbors[movie_id] gives your k similar movies directly