# Content Based Filtering
Using the MovieLens small dataset I will implement a movie recommender based on the movies ratings

In [1]:
import numpy as np
import numpy.ma as ma
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import tabulate

2025-03-23 18:03:55.715818: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
ratings = pd.read_csv("data/raw/ratings.csv")
movies = pd.read_csv("data/raw/movies.csv")

In [3]:
movies.head(6)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller


In [4]:
ratings.head(6)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
5,1,70,3.0,964982400


Now we want to merge both sets to make predictions about the movies based on their ratings. Also this will simplify the search of missing values

In [5]:
df = ratings.merge(movies, on="movieId")
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [6]:
print(df.isnull().sum())

userId       0
movieId      0
rating       0
timestamp    0
title        0
genres       0
dtype: int64


In [7]:
df_missing = df[df.isnull().any(axis=1)]
print(df_missing)

Empty DataFrame
Columns: [userId, movieId, rating, timestamp, title, genres]
Index: []


In [8]:
df.info

<bound method DataFrame.info of         userId  movieId  rating   timestamp                           title  \
0            1        1     4.0   964982703                Toy Story (1995)   
1            1        3     4.0   964981247         Grumpier Old Men (1995)   
2            1        6     4.0   964982224                     Heat (1995)   
3            1       47     5.0   964983815     Seven (a.k.a. Se7en) (1995)   
4            1       50     5.0   964982931      Usual Suspects, The (1995)   
...        ...      ...     ...         ...                             ...   
100831     610   166534     4.0  1493848402                    Split (2017)   
100832     610   168248     5.0  1493850091   John Wick: Chapter Two (2017)   
100833     610   168250     5.0  1494273047                  Get Out (2017)   
100834     610   168252     5.0  1493846352                    Logan (2017)   
100835     610   170875     3.0  1493846415  The Fate of the Furious (2017)   

                   

In [9]:
df.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
title         object
genres        object
dtype: object

The columns datatypes are wrong. I will transform them so I can make the feature engineering needed. Mainly the title and genres datatypes which I want to be strings

In [10]:
print(df["title"].dtype)
print(df["title"].nunique())
print(df["title"].unique()[:10])

print(df["genres"].dtype)
print(df["genres"].nunique())
print(df["genres"].unique()[:10]) 


object
9719
['Toy Story (1995)' 'Grumpier Old Men (1995)' 'Heat (1995)'
 'Seven (a.k.a. Se7en) (1995)' 'Usual Suspects, The (1995)'
 'From Dusk Till Dawn (1996)' 'Bottle Rocket (1996)' 'Braveheart (1995)'
 'Rob Roy (1995)' 'Canadian Bacon (1995)']
object
951
['Adventure|Animation|Children|Comedy|Fantasy' 'Comedy|Romance'
 'Action|Crime|Thriller' 'Mystery|Thriller' 'Crime|Mystery|Thriller'
 'Action|Comedy|Horror|Thriller' 'Adventure|Comedy|Crime|Romance'
 'Action|Drama|War' 'Action|Drama|Romance|War' 'Comedy|War']


In [11]:
df["title"] = df["title"].astype(str)
df["genres"] = df["genres"].astype(str)

df["genres"] = df["genres"].str.split("|")
df = df.explode("genres")
df.head(6)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure
0,1,1,4.0,964982703,Toy Story (1995),Animation
0,1,1,4.0,964982703,Toy Story (1995),Children
0,1,1,4.0,964982703,Toy Story (1995),Comedy
0,1,1,4.0,964982703,Toy Story (1995),Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy


Now we clean the df a little bit

In [12]:
df.drop("timestamp", axis=1, inplace=True)

df = df.drop_duplicates()
df = df.reset_index(drop=True)
print(df.head())

   userId  movieId  rating             title     genres
0       1        1     4.0  Toy Story (1995)  Adventure
1       1        1     4.0  Toy Story (1995)  Animation
2       1        1     4.0  Toy Story (1995)   Children
3       1        1     4.0  Toy Story (1995)     Comedy
4       1        1     4.0  Toy Story (1995)    Fantasy


In [13]:
print(df["genres"].nunique())
print(df["genres"].unique())

20
['Adventure' 'Animation' 'Children' 'Comedy' 'Fantasy' 'Romance' 'Action'
 'Crime' 'Thriller' 'Mystery' 'Horror' 'Drama' 'War' 'Western' 'Sci-Fi'
 'Musical' 'Film-Noir' 'IMAX' 'Documentary' '(no genres listed)']


In [14]:
filtered_df = df[df["genres"] == "(no genres listed)"]

print(filtered_df)

        userId  movieId  rating  \
10172       21   122896     4.0   
20159       50   114335     3.0   
20298       50   174403     2.5   
25113       62   122896     3.5   
25222       62   172591     5.0   
25234       62   176601     5.0   
38470       89   155589     3.0   
46293      105   147250     5.0   
46347      105   171749     5.0   
46375      105   173535     4.5   
48571      111   122896     3.5   
48623      111   134861     2.5   
48692      111   159161     4.5   
48763      111   171631     1.0   
48766      111   171891     3.5   
53392      125   142456     4.5   
74636      184   181413     4.5   
82701      210   159779     4.0   
83666      212   122896     3.5   
100101     248   122896     4.0   
103406     252   122896     3.0   
121967     296   169034     4.5   
136406     318   142456     3.5   
136455     318   171495     5.0   
159620     380   172497     4.0   
177688     414   166024     4.5   
177699     414   167570     4.0   
192315     448   129

In [15]:
print(filtered_df.shape)
print("\n")
print(df["movieId"].nunique())

(47, 5)


9724


The movies that doesn't have a genre list don't add any information considering a genre based recommender, so I will proceed to delete those movies (great movies to be honest, so it hurts).

In [16]:
df = df[df["genres"] != "(no genres listed)"]
df.reset_index(drop=True, inplace=True)


print(df["movieId"].nunique())
print(df.shape)

9690
(274433, 5)


In [17]:
df_genres = pd.get_dummies(df, columns=["genres"], dtype=int)
print(df_genres.head())

   userId  movieId  rating             title  genres_Action  genres_Adventure  \
0       1        1     4.0  Toy Story (1995)              0                 1   
1       1        1     4.0  Toy Story (1995)              0                 0   
2       1        1     4.0  Toy Story (1995)              0                 0   
3       1        1     4.0  Toy Story (1995)              0                 0   
4       1        1     4.0  Toy Story (1995)              0                 0   

   genres_Animation  genres_Children  genres_Comedy  genres_Crime  ...  \
0                 0                0              0             0  ...   
1                 1                0              0             0  ...   
2                 0                1              0             0  ...   
3                 0                0              1             0  ...   
4                 0                0              0             0  ...   

   genres_Film-Noir  genres_Horror  genres_IMAX  genres_Musical  \
0

In [18]:
print(f"Number of ratings: {df_genres.shape[0]}")
print(f"\nUnique users: {df_genres['userId'].nunique()}")
print(f"\nUnique movies: {df_genres['movieId'].nunique()}")

Number of ratings: 274433

Unique users: 610

Unique movies: 9690


In [19]:
print(df_genres.head(1))
df_genres.head(5)

   userId  movieId  rating             title  genres_Action  genres_Adventure  \
0       1        1     4.0  Toy Story (1995)              0                 1   

   genres_Animation  genres_Children  genres_Comedy  genres_Crime  ...  \
0                 0                0              0             0  ...   

   genres_Film-Noir  genres_Horror  genres_IMAX  genres_Musical  \
0                 0              0            0               0   

   genres_Mystery  genres_Romance  genres_Sci-Fi  genres_Thriller  genres_War  \
0               0               0              0                0           0   

   genres_Western  
0               0  

[1 rows x 23 columns]


Unnamed: 0,userId,movieId,rating,title,genres_Action,genres_Adventure,genres_Animation,genres_Children,genres_Comedy,genres_Crime,...,genres_Film-Noir,genres_Horror,genres_IMAX,genres_Musical,genres_Mystery,genres_Romance,genres_Sci-Fi,genres_Thriller,genres_War,genres_Western
0,1,1,4.0,Toy Story (1995),0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,4.0,Toy Story (1995),0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,1,4.0,Toy Story (1995),0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,4.0,Toy Story (1995),0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1,4.0,Toy Story (1995),0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
df_grouped = df_genres.groupby(['userId', 'movieId'], as_index=False).agg('max')

print(df_grouped.info)

<bound method DataFrame.info of         userId  movieId  rating                           title  \
0            1        1     4.0                Toy Story (1995)   
1            1        3     4.0         Grumpier Old Men (1995)   
2            1        6     4.0                     Heat (1995)   
3            1       47     5.0     Seven (a.k.a. Se7en) (1995)   
4            1       50     5.0      Usual Suspects, The (1995)   
...        ...      ...     ...                             ...   
100784     610   166534     4.0                    Split (2017)   
100785     610   168248     5.0   John Wick: Chapter Two (2017)   
100786     610   168250     5.0                  Get Out (2017)   
100787     610   168252     5.0                    Logan (2017)   
100788     610   170875     3.0  The Fate of the Furious (2017)   

        genres_Action  genres_Adventure  genres_Animation  genres_Children  \
0                   0                 1                 1                1   
1      

In [21]:
user_one_hot = pd.get_dummies(df_grouped['userId'], prefix='user')

df_grouped = pd.concat([df_grouped, user_one_hot], axis=1)

In [22]:
scaler = MinMaxScaler()

df_grouped['rating_scaled'] = scaler.fit_transform(df_grouped[['rating']])

In [23]:
print(df_grouped[['rating', 'rating_scaled']].head())
      
print(f"\n")

print(df_grouped['rating_scaled'].min(), df_grouped['rating_scaled'].max())

   rating  rating_scaled
0     4.0       0.777778
1     4.0       0.777778
2     4.0       0.777778
3     5.0       1.000000
4     5.0       1.000000


0.0 1.0


In [24]:
X = df_grouped.drop(['rating', 'userId', 'title'], axis=1)

y = df_grouped['rating_scaled']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)

In [25]:
model = Sequential([
    Dense(128, activation='relu', input_dim=X_train.shape[1]),
    Dense(64, activation='relu'),  # Capa oculta
    Dense(32, activation='relu'),  # Capa oculta
    Dense(1)
])

model.compile(optimizer='adam', loss='mean_squared_error')

2025-03-23 18:04:03.586968: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


In [26]:
X_train = X_train.astype('float32')
y_train = y_train.astype('float32')

X_val = X_val.astype('float32')
y_val = y_val.astype('float32')

print(X_train.isnull().sum())
print(y_train.isnull().sum())

movieId             0
genres_Action       0
genres_Adventure    0
genres_Animation    0
genres_Children     0
                   ..
user_607            0
user_608            0
user_609            0
user_610            0
rating_scaled       0
Length: 631, dtype: int64
0


In [27]:
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [28]:
loss = model.evaluate(X_val, y_val)
print(f"Loss on valuation set: {loss}")

Loss on valuation set: 0.05338836833834648


In [None]:
import matplotlib.pyplot as plt

plt.plot(history.history['loss'], label='Pérdida de entrenamiento')
plt.plot(history.history['val_loss'], label='Pérdida de validación')
plt.xlabel('Épocas')
plt.ylabel('Pérdida')
plt.legend()
plt.show()


In [35]:
predictions = model.predict(X_val)

predictions_rescaled = scaler.inverse_transform(predictions)

# Comparar algunas predicciones con las calificaciones reales
for i in range(5):
    print(f"Real rating: {y_val.iloc[i]}, Predicted rating: {predictions[i][0]}")

Real rating: 0.6666666865348816, Predicted rating: 0.6600615978240967
Real rating: 0.6666666865348816, Predicted rating: 0.6577098369598389
Real rating: 0.6666666865348816, Predicted rating: 0.6617114543914795
Real rating: 0.8888888955116272, Predicted rating: 0.6488292217254639
Real rating: 0.7777777910232544, Predicted rating: 0.6555964946746826
