In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from typing import Tuple
from tensorflow.keras.datasets import cifar10
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Concatenate, Dense, Lambda
from urllib.request import urlretrieve
import zipfile

In [None]:
# Download the MovieLens 100k archive into the working directory.
urlretrieve("http://files.grouplens.org/datasets/movielens/ml-100k.zip", "movielens.zip")

# Open the archive in a context manager so the file handle is closed even if
# extraction or the info read raises.
with zipfile.ZipFile('movielens.zip', "r") as zip_ref:
    zip_ref.extractall()
    dataset_info = zip_ref.read('ml-100k/u.info')

print("Done. Dataset contains:")
print(dataset_info)

Done. Dataset contains:
b'943 users\n1682 items\n100000 ratings\n'


In [None]:
# Read the pipe-delimited user table and the tab-delimited ratings table.
# Neither file has a header row, so the column names are supplied here.
users_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
ratings_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

users = pd.read_csv(
    'ml-100k/u.user',
    sep='|',
    names=users_cols,
    encoding='latin-1',
)
ratings = pd.read_csv(
    'ml-100k/u.data',
    sep='\t',
    names=ratings_cols,
    encoding='latin-1',
)

In [None]:
# Each row of the movies file carries five descriptive fields followed by one
# binary feature per genre.
genre_cols = [
    "genre_unknown",
    "Action",
    "Adventure",
    "Animation",
    "Children",
    "Comedy",
    "Crime",
    "Documentary",
    "Drama",
    "Fantasy",
    "Film-Noir",
    "Horror",
    "Musical",
    "Mystery",
    "Romance",
    "Sci-Fi",
    "Thriller",
    "War",
    "Western",
]

movies_cols = [
    'movie_id',
    'title',
    'release_date',
    "video_release_date",
    "imdb_url",
    *genre_cols,
]

movies = pd.read_csv(
    'ml-100k/u.item',
    sep='|',
    names=movies_cols,
    encoding='latin-1',
)

In [None]:
# Basic statistics about the data: distinct users, distinct movies, and the
# total number of rating rows.
print(
    f"\nNumber of users: {users['user_id'].nunique()}",
    f"Number of movies: {movies['movie_id'].nunique()}",
    f"Number of ratings: {len(ratings)}",
    sep="\n",
)


Number of users: 943
Number of movies: 1682
Number of ratings: 100000


Implementing the unweighted regularized Matrix Factorization Collaborative Filtering algorithm to build a recommendation system.

In [None]:
# Loading the ratings (a fresh read of u.data, so this section starts from the
# four raw columns only).
ratings_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv(
    'ml-100k/u.data',
    sep='\t',
    names=ratings_cols,
    encoding='latin-1',
)

In [None]:
# Mapping the user and movie IDs to continuous indices.
# Indices are assigned in order of first appearance in the ratings table.
user_map = {
    raw_id: position
    for position, raw_id in enumerate(ratings['user_id'].unique())
}
movie_map = {
    raw_id: position
    for position, raw_id in enumerate(ratings['movie_id'].unique())
}

# Attach the dense indices as extra columns next to the raw IDs.
ratings['user_idx'] = ratings['user_id'].map(user_map)
ratings['movie_idx'] = ratings['movie_id'].map(movie_map)

In [None]:
# Preparing the training data: the matrix sizes come from the ID maps, the
# remaining values are the hyperparameters of the SGD training loop.
num_users, num_movies = len(user_map), len(movie_map)
num_features = 15  # number of latent features per user / movie
lr = 0.01          # SGD step size
reg = 0.1          # coefficient of the regularisation term in each update
epochs = 20        # full passes over the ratings table

In [None]:
# Initialising the latent features with small zero-mean Gaussian noise.
# The seed is fixed so both matrices are identical on every run.
np.random.seed(0)
factor_std = 1. / num_features
P = np.random.normal(scale=factor_std, size=(num_users, num_features))   # one row per user
Q = np.random.normal(scale=factor_std, size=(num_movies, num_features))  # one row per movie

In [None]:
# Stochastic Gradient Descent: one update of P[u] and Q[m] per observed rating,
# repeated for `epochs` passes over the ratings table.
#
# The index/rating columns are pulled out once as NumPy arrays so that neither
# the update loop nor the per-epoch evaluation builds a namedtuple per row.
user_idx_arr = ratings['user_idx'].to_numpy()
movie_idx_arr = ratings['movie_idx'].to_numpy()
rating_arr = ratings['rating'].to_numpy()

for epoch in range(epochs):
    for u, m, r in zip(user_idx_arr, movie_idx_arr, rating_arr):
        # The error is measured against the prediction clipped to [1, 5].
        pred = np.clip(np.dot(P[u], Q[m]), 1, 5)
        err = r - pred

        # Snapshot the user vector first: the movie update must use the same
        # P[u] that produced `err`, not the value written by the line below.
        p_u_old = P[u].copy()
        P[u] += lr * (err * Q[m] - reg * P[u])
        Q[m] += lr * (err * p_u_old - reg * Q[m])

    # Training MSE on clipped predictions, evaluated for all ratings at once
    # (row-wise dot product of the matching user and movie vectors).
    epoch_preds = np.clip(np.sum(P[user_idx_arr] * Q[movie_idx_arr], axis=1), 1, 5)
    mse = np.mean((rating_arr - epoch_preds) ** 2)
    print(f"Epoch {epoch+1}, MSE: {mse:.4f}")

Epoch 1, MSE: 6.8464
Epoch 2, MSE: 1.4525
Epoch 3, MSE: 1.0556
Epoch 4, MSE: 0.9610
Epoch 5, MSE: 0.9218
Epoch 6, MSE: 0.8987
Epoch 7, MSE: 0.8814
Epoch 8, MSE: 0.8666
Epoch 9, MSE: 0.8532
Epoch 10, MSE: 0.8410
Epoch 11, MSE: 0.8300
Epoch 12, MSE: 0.8203
Epoch 13, MSE: 0.8116
Epoch 14, MSE: 0.8036
Epoch 15, MSE: 0.7961
Epoch 16, MSE: 0.7889
Epoch 17, MSE: 0.7819
Epoch 18, MSE: 0.7750
Epoch 19, MSE: 0.7681
Epoch 20, MSE: 0.7613


Using the trained factors to recommend movies that a user has not rated yet:

In [None]:
def recommend_movies(user_id, top_n=5):
    """Recommend movies the user has not rated, using the factor matrices P and Q.

    Args:
        user_id: Raw user ID as it appears in ``ratings['user_id']``.
        top_n: Maximum number of recommendations to return.

    Returns:
        A list of ``(title, predicted_rating)`` tuples, best first. The
        predicted rating is clipped to [1, 5] and rounded to two decimals.

    Raises:
        KeyError: If ``user_id`` is not a key of ``user_map``.
    """
    u_idx = user_map[user_id]
    raw_scores = np.dot(P[u_idx], Q.T)

    # A set keeps the "already rated" filter linear in the number of movies.
    watched = set(ratings.loc[ratings['user_id'] == user_id, 'movie_id'])

    # movie_map was filled in index order, so its keys line up with the rows of Q.
    candidates = [
        (mid, score)
        for mid, score in zip(movie_map.keys(), raw_scores)
        if mid not in watched
    ]

    # Rank on the raw scores: clipping first would make every movie predicted
    # above 5 tie, and the ranking among the best candidates would be lost.
    top_movies = sorted(candidates, key=lambda pair: -pair[1])[:top_n]

    # Clip only the value that is reported back to the caller.
    recommended_titles = [
        (
            movies[movies['movie_id'] == movie_id]['title'].values[0],
            float(round(np.clip(score, 1, 5), 2)),
        )
        for movie_id, score in top_movies
    ]

    return recommended_titles

In [None]:
# Top-5 matrix-factorization recommendations for user 284.
recommend_movies(user_id=284, top_n=5)

[('Pather Panchali (1955)', 4.81),
 ('Wrong Trousers, The (1993)', 4.67),
 ("Schindler's List (1993)", 4.62),
 ('Rear Window (1954)', 4.53),
 ('Close Shave, A (1995)', 4.5)]

Building a deep neural network using Tensorflow to implement a recommendation system using the same dataset.

In [None]:
# Encoding the user and movie IDs as categorical indices
for encoded_col, id_col, id_map in (
    ('user', 'user_id', user_map),
    ('movie', 'movie_id', movie_map),
):
    ratings[encoded_col] = ratings[id_col].map(id_map)

# Features are (user index, movie index) pairs; the target is the rating.
X = ratings[['user', 'movie']].to_numpy()
y = ratings['rating'].to_numpy()

# Hold out 20% of the ratings; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
num_users = len(user_map)
num_movies = len(movie_map)
embedding_dim = 20

# Fix the Python, NumPy and TensorFlow seeds before any layer is created so
# that weight initialisation (and the later batch shuffling in `fit`) is
# repeatable on a fresh kernel.
tf.keras.utils.set_random_seed(42)

# User input: a single integer index looked up in a learned embedding table.
user_input = Input(shape=(1,))
user_embedding = Embedding(input_dim=num_users, output_dim=embedding_dim)(user_input)
user_vec = Flatten()(user_embedding)

# Movie input: same structure as the user branch, with its own embedding table.
movie_input = Input(shape=(1,))
movie_embedding = Embedding(input_dim=num_movies, output_dim=embedding_dim)(movie_input)
movie_vec = Flatten()(movie_embedding)

# Concatenating and feeding to the MLP
concat = Concatenate()([user_vec, movie_vec])
dense = Dense(64, activation='relu')(concat)
dense = Dense(32, activation='relu')(dense)
# Single linear unit: the predicted rating is an unconstrained real number.
output = Dense(1, activation='linear')(dense)

model = Model([user_input, movie_input], output)
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

model.summary()

In [None]:
# The model takes the user-index column and the movie-index column as two
# separate inputs, so each feature matrix is split column-wise.
train_inputs = [X_train[:, 0], X_train[:, 1]]
val_inputs = [X_test[:, 0], X_test[:, 1]]

history = model.fit(
    x=train_inputs,
    y=y_train,
    batch_size=256,
    epochs=10,
    validation_data=(val_inputs, y_test),
)

Epoch 1/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - loss: 5.5178 - mae: 1.8485 - val_loss: 0.9061 - val_mae: 0.7497
Epoch 2/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - loss: 0.8859 - mae: 0.7438 - val_loss: 0.8954 - val_mae: 0.7476
Epoch 3/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - loss: 0.8622 - mae: 0.7344 - val_loss: 0.8917 - val_mae: 0.7467
Epoch 4/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.8620 - mae: 0.7323 - val_loss: 0.8900 - val_mae: 0.7505
Epoch 5/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.8518 - mae: 0.7292 - val_loss: 0.8818 - val_mae: 0.7431
Epoch 6/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - loss: 0.8253 - mae: 0.7165 - val_loss: 0.8672 - val_mae: 0.7359
Epoch 7/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - 

In [None]:
def deep_recommend_movies(user_id, top_n=5):
    """Recommend movies the user has not rated, scored by the neural model.

    Args:
        user_id: Raw user ID as it appears in ``ratings['user_id']``.
        top_n: Maximum number of recommendations to return.

    Returns:
        A list of ``(title, predicted_rating)`` tuples, best first, with the
        prediction rounded to two decimals. Returns an empty list when the
        user has already rated every movie in ``movie_map``.

    Raises:
        KeyError: If ``user_id`` is not a key of ``user_map``.
    """
    user_idx = user_map[user_id]

    # A set keeps the "already rated" filter linear in the number of movies.
    watched = set(ratings.loc[ratings['user_id'] == user_id, 'movie_id'])
    unwatched = [m for m in movie_map if m not in watched]
    if not unwatched:
        # Nothing left to score; do not call predict on empty inputs.
        return []

    # One (user, movie) pair per candidate movie, with the user index repeated.
    movie_indices = np.array([movie_map[m] for m in unwatched])
    user_indices = np.full(len(unwatched), user_idx)

    predictions = model.predict([user_indices, movie_indices], verbose=0).flatten()

    # argsort is ascending, so reverse it before taking the first top_n positions.
    top_indices = np.argsort(predictions)[::-1][:top_n]

    return [
        (
            movies[movies['movie_id'] == unwatched[i]]['title'].values[0],
            round(float(predictions[i]), 2),
        )
        for i in top_indices
    ]

In [None]:
# Top-5 neural-network recommendations for user 345.
deep_recommend_movies(user_id=345, top_n=5)

[('Pather Panchali (1955)', 4.96),
 ('Santa with Muscles (1996)', 4.95),
 ('Whole Wide World, The (1996)', 4.84),
 ('Casablanca (1942)', 4.8),
 ('Anna (1996)', 4.77)]

**My Design Choices**

1. **Embeddings for Users and Movies**: Instead of one-hot encoding user and movie IDs (which would result in sparse high-dimensional vectors), I used embedding layers to learn dense vector representations:

  * User Embedding Layer: `Embedding(input_dim = num_users, output_dim = 20)`

  * Movie Embedding Layer: `Embedding(input_dim = num_movies, output_dim = 20)`

  I did this because embeddings allow the model to learn latent features about users (e.g., preference for certain genres) and movies (e.g., appeal to certain audiences) directly from the data. I chose a dimensionality of 20, which balances expressive power with model simplicity. Larger embedding sizes can capture more detail but risk overfitting on small datasets.

2. **Hidden Layers (Multi-Layer Perceptron)**: After concatenating the user and movie embeddings, I passed the result through two fully connected (dense) layers:

  `dense = Dense(64, activation = 'relu')(concat)`

  `dense = Dense(32, activation = 'relu')(dense)`

  I used two layers to introduce non-linearity and enable the model to learn more complex interactions between users and movies. The first layer has 64 neurons and the second has 32, following the common design pattern of progressively reducing dimensions. I chose ReLU activation for its computational efficiency and because it is less prone to vanishing gradients than sigmoid or tanh.

3. **Output Layer**: `output = Dense(1, activation = 'linear')(dense)`
  
  The model outputs a single predicted rating as a continuous value.
  
  I used a linear activation so that the output is not artificially constrained during learning: the model can produce any real number, which is useful early in training. If needed, predictions can be clipped to the [1, 5] range afterwards, but the linear activation gives the network flexibility to explore.

4. **Loss Function and Optimization**: `model.compile(optimizer = 'adam', loss = 'mse', metrics = ['mae'])`

  I used Mean Squared Error (MSE) as the loss function because we are predicting real-valued ratings (regression). MSE penalizes larger errors more strongly, which encourages the model to make precise predictions. I also included Mean Absolute Error (MAE) as a metric to track performance in a more interpretable way (average absolute deviation from actual ratings). Something new I learnt about is the Adam optimizer, a strong default choice for deep learning models that combines momentum and adaptive learning rates for faster convergence.

5. **Train-Test Split and Evaluation**: I split the data into 80% training and 20% test so that I can measure how well the model generalizes and detect overfitting. The test split is passed to `fit` as validation data, so I also monitor validation loss after every epoch to verify that learning is progressing as expected.