<a href="https://colab.research.google.com/github/lmassaron/ml4dummies_3ed/blob/main/ML4D3E_18_recommending_products_and_movies.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
URL = "http://files.grouplens.org/datasets/"
URL += "movielens/ml-1m.zip"
!wget {URL}
!unzip ml-1m.zip

--2025-08-17 18:13:55--  http://files.grouplens.org/datasets/movielens/ml-1m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5917549 (5.6M) [application/zip]
Saving to: ‘ml-1m.zip’


2025-08-17 18:13:55 (17.2 MB/s) - ‘ml-1m.zip’ saved [5917549/5917549]

Archive:  ml-1m.zip
   creating: ml-1m/
  inflating: ml-1m/movies.dat        
  inflating: ml-1m/ratings.dat       
  inflating: ml-1m/README            
  inflating: ml-1m/users.dat         


In [2]:
import pandas as pd

# Load the three "::"-delimited MovieLens tables (none has a header row)
# and join them into a single frame with one row per rating.
# pandas parses multi-character separators with its python engine.
dat_options = dict(sep="::", header=None, engine="python")

users_column_names = [
    "user_id", "gender", "age", "occupation", "zip"]
ratings_column_names = [
    "user_id", "movie_id", "rating", "timestamp"]
movies_column_names = ["movie_id", "title", "genres"]

users = pd.read_table(
    "ml-1m/users.dat",
    names=users_column_names,
    **dat_options)
ratings = pd.read_table(
    "ml-1m/ratings.dat",
    names=ratings_column_names,
    **dat_options)
# movies.dat is the only file read with an explicit latin-1 encoding.
movies = pd.read_table(
    "ml-1m/movies.dat",
    names=movies_column_names,
    encoding="latin-1",
    **dat_options)

# Attach the user attributes first, then the movie attributes.
ratings_with_users = pd.merge(ratings, users, on="user_id")
movie_lens = pd.merge(
    ratings_with_users, movies, on="movie_id")

In [3]:
# Preview the first rows of the merged ratings/users/movies frame.
first_rows = movie_lens.head()
print(first_rows)

   user_id  movie_id  rating  timestamp gender  age  occupation    zip  \
0        1      1193       5  978300760      F    1          10  48067   
1        1       661       3  978302109      F    1          10  48067   
2        1       914       3  978301968      F    1          10  48067   
3        1      3408       4  978300275      F    1          10  48067   
4        1      2355       5  978824291      F    1          10  48067   

                                    title                        genres  
0  One Flew Over the Cuckoo's Nest (1975)                         Drama  
1        James and the Giant Peach (1996)  Animation|Children's|Musical  
2                     My Fair Lady (1964)               Musical|Romance  
3                  Erin Brockovich (2000)                         Drama  
4                    Bug's Life, A (1998)   Animation|Children's|Comedy  


In [4]:
# Tally how many ratings were given for each rating value.
ratings_by_score = movie_lens.groupby("rating")
rating_counts = ratings_by_score["user_id"].count()
print(rating_counts)

rating
1     56174
2    107557
3    261197
4    348971
5    226310
Name: user_id, dtype: int64


In [5]:
# Number of ratings submitted by each user, and the mean across users.
by_user = movie_lens.groupby("user_id")
reviews_per_user = by_user["title"].count()

print("Average movie reviews per user:", end=" ")
print(f"{reviews_per_user.mean():.1f}")

# Number of ratings received by each movie title.
by_title = movie_lens.groupby("title")
reviews_per_movie = by_title["movie_id"].count()

print("\nNumber of Reviews Per Movie:")
print(reviews_per_movie)

Average movie reviews per user: 165.6

Number of Reviews Per Movie:
title
$1,000,000 Duck (1971)                         37
'Night Mother (1986)                           70
'Til There Was You (1997)                      52
'burbs, The (1989)                            303
...And Justice for All (1979)                 199
                                             ... 
Zed & Two Noughts, A (1985)                    29
Zero Effect (1998)                            301
Zero Kelvin (Kjærlighetens kjøtere) (1995)      2
Zeus and Roxanne (1997)                        23
eXistenZ (1999)                               410
Name: movie_id, Length: 3706, dtype: int64


In [6]:
# Summarise the ratings of one movie, selected by its movie_id.
is_selected = movie_lens["movie_id"].eq(260)
selected_movie = movie_lens.loc[is_selected]

# Each row is one rating, so the row count is the number of ratings.
num_ratings_for_movie = selected_movie.shape[0]
average_rating_for_movie = selected_movie["rating"].mean()

print(
    f"{num_ratings_for_movie} users gave an "
    f"average rating of {average_rating_for_movie:.2f}")

2991 users gave an average rating of 4.45


In [7]:
# Keep only ratings of 3 or higher and drop every column except the
# user id, the rating, and the movie title.
columns_to_drop = [
    'movie_id', 'timestamp', 'genres', 'gender',
    'age', 'occupation', 'zip']
is_positive_rating = movie_lens["rating"] >= 3.0
reduced_movie_data = (movie_lens
    .loc[is_positive_rating]
    .drop(columns=columns_to_drop))

print(reduced_movie_data.head(), "\n")

# Compare the frame size before and after the reduction.
original_shape, new_shape = (
    movie_lens.shape, reduced_movie_data.shape)

print(
    f"Original Shape: {original_shape}, "
    f"New Shape: {new_shape}")

   user_id  rating                                   title
0        1       5  One Flew Over the Cuckoo's Nest (1975)
1        1       3        James and the Giant Peach (1996)
2        1       3                     My Fair Lady (1964)
3        1       4                  Erin Brockovich (2000)
4        1       5                    Bug's Life, A (1998) 

Original Shape: (1000209, 10), New Shape: (836478, 3)


In [8]:
# Keep only the titles that still have more than 1000 ratings.
# transform('size') repeats each title's rating count on every one of
# its rows, so the comparison yields a row-aligned boolean mask.
ratings_by_title = (
    reduced_movie_data.groupby('title')['rating'])
title_total_ratings = ratings_by_title.transform('size')
is_frequently_rated = title_total_ratings.gt(1000)
reduced_movie_data = reduced_movie_data.loc[
    is_frequently_rated]

# Show the least-rated surviving titles to confirm the cut-off.
least_rated_titles = (reduced_movie_data
    .groupby('title')['rating']
    .count()
    .sort_values()
    .head())
print(least_rated_titles, "\n")

new_shape = reduced_movie_data.shape
print(f"New shape: {new_shape}")

title
Few Good Men, A (1992)    1003
My Cousin Vinny (1992)    1003
Boogie Nights (1997)      1004
Sneakers (1992)           1009
Witness (1985)            1009
Name: rating, dtype: int64 

New shape: (237212, 3)


In [9]:
# Reshape to a user x title matrix of ratings; combinations with no
# rating are left as NaN because no fill value is given.
user_rating_pivot = reduced_movie_data.pivot_table(
    values='rating',
    index='user_id',
    columns='title')

print(user_rating_pivot.head())

title    2001: A Space Odyssey (1968)  Abyss, The (1989)  \
user_id                                                    
1                                 NaN                NaN   
2                                 NaN                NaN   
3                                 NaN                NaN   
4                                 NaN                NaN   
5                                 NaN                NaN   

title    African Queen, The (1951)  Airplane! (1980)  Aladdin (1992)  \
user_id                                                                
1                              NaN               4.0             4.0   
2                              NaN               NaN             NaN   
3                              NaN               NaN             NaN   
4                              NaN               NaN             NaN   
5                              NaN               NaN             NaN   

title    Alien (1979)  Aliens (1986)  Amadeus (1984)  American Beauty (199

In [10]:
# Pull out the ratings column of the movie to compare against.
target_movie_title = 'Young Frankenstein (1974)'
yf_ratings = user_rating_pivot.loc[:, target_movie_title]

# Show only the users who actually rated it.
users_who_rated = yf_ratings.dropna()
print(users_who_rated.head())

user_id
10    5.0
11    3.0
19    5.0
28    5.0
33    3.0
Name: Young Frankenstein (1974), dtype: float64


In [11]:
# Correlate every title's rating column with the target movie's
# ratings, then list the strongest correlations first.
yf_correlations = user_rating_pivot.corrwith(yf_ratings)

strongest_first = yf_correlations.sort_values(
    ascending=False)
print(strongest_first.head())

title
Young Frankenstein (1974)                       1.000000
Blazing Saddles (1974)                          0.412395
Alien (1979)                                    0.297567
Willy Wonka and the Chocolate Factory (1971)    0.272574
M*A*S*H (1970)                                  0.259304
dtype: float64


In [12]:
import numpy as np
from scipy.linalg import svd

# Worked example: decompose a small 3x3 matrix with SVD, then rebuild
# it from the three factors.
original_matrix = np.arange(1, 10).reshape(3, 3)
print("Original Matrix:")
print(original_matrix)

u_matrix, singular_values, vt_matrix = svd(original_matrix)

# Show each factor under its own heading.
for heading, factor in (
        ("\nU matrix:", u_matrix),
        ("\nSingular values (s vector):", singular_values),
        ("\nVT matrix (V transposed):", vt_matrix)):
    print(heading)
    print(factor)

# svd returns the singular values as a vector; place them on a
# diagonal so they can take part in the matrix product below.
sigma_matrix = np.diag(singular_values)
print("\nSigma matrix (for reconstruction):")
print(sigma_matrix)

reconstructed_matrix = (
    u_matrix.dot(sigma_matrix).dot(vt_matrix))
print("\nReconstructed matrix:")
print(reconstructed_matrix)

Original Matrix:
[[1 2 3]
 [4 5 6]
 [7 8 9]]

U matrix:
[[-0.21483724  0.88723069  0.40824829]
 [-0.52058739  0.24964395 -0.81649658]
 [-0.82633754 -0.38794278  0.40824829]]

Singular values (s vector):
[1.68481034e+01 1.06836951e+00 4.41842475e-16]

VT matrix (V transposed):
[[-0.47967118 -0.57236779 -0.66506441]
 [-0.77669099 -0.07568647  0.62531805]
 [-0.40824829  0.81649658 -0.40824829]]

Sigma matrix (for reconstruction):
[[1.68481034e+01 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 1.06836951e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 4.41842475e-16]]

Reconstructed matrix:
[[1. 2. 3.]
 [4. 5. 6.]
 [7. 8. 9.]]


In [13]:
# Build the full Sigma matrix with the same shape as the original
# matrix, which is the form needed to multiply U @ Sigma @ VT when the
# original matrix is not square.
num_rows, num_columns = original_matrix.shape
sigma_matrix = np.zeros((num_rows, num_columns))

# svd returns min(rows, columns) singular values, so size the diagonal
# block by their count. Sizing it by the column count only works for
# square or tall matrices and raises a shape error for wide ones.
num_singular_values = len(singular_values)
singular_values_diag = np.diag(singular_values)

sigma_matrix[:num_singular_values,
             :num_singular_values] = singular_values_diag

print(sigma_matrix)

[[1.68481034e+01 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 1.06836951e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 4.41842475e-16]]


In [14]:
# User x title rating matrix built from the full merged data set;
# combinations with no rating are filled with 0 instead of NaN.
ratings_pivot_df = pd.pivot_table(
    movie_lens,
    index='user_id',
    columns='title',
    values='rating',
    fill_value=0)

# Keep the column labels so a matrix position can be mapped to a title.
movie_titles_index = ratings_pivot_df.columns

In [15]:
from sklearn.decomposition import TruncatedSVD

# Compress each movie's ratings into 15 latent features; the fixed
# random_state keeps the result repeatable.
svd_model = TruncatedSVD(n_components=15, random_state=101)

# Transpose so that each row is a movie (title) and each column a user.
ratings_values_transposed = ratings_pivot_df.to_numpy().T
item_latent_features = svd_model.fit_transform(
    ratings_values_transposed)

In [16]:
# Locate the target movie's row in the latent-feature matrix.
target_movie_title = (
    'Star Wars: Episode V - '
    'The Empire Strikes Back (1980)'
)

# Ask the pandas Index for the position directly instead of copying all
# titles into a Python list and scanning it. The titles are the pivot
# table's column labels, so each appears once and get_loc returns a
# single integer position.
movie_idx = movie_titles_index.get_loc(
    target_movie_title)

print(f"Movie index: {movie_idx}")

# The row at that position holds the movie's latent features.
latent_features_for_movie = item_latent_features[
    movie_idx]
print(latent_features_for_movie)

Movie index: 3154
[184.72254552 -17.77612872  47.33450866 -51.4664494  -47.92058216
 -17.65033116 -14.3574635  -12.82219207 -17.51347857  -5.46888807
   7.5430805   -0.57117869 -30.74032355   2.4088565  -22.50368497]


In [17]:
import numpy as np

# Correlate the latent-feature rows with each other; row i of the
# result holds movie i's correlation with every movie.
item_similarity_matrix = np.corrcoef(
    item_latent_features)

target_movie_correlations = item_similarity_matrix[
    movie_idx]

# Select the movies whose correlation with the target exceeds 0.985.
is_highly_correlated = (
    target_movie_correlations > 0.985)
# Exclude the target movie by position rather than by testing for a
# correlation below 1.0: floating-point rounding can leave the
# self-correlation marginally under 1.0 (so the target would list
# itself), and the float test would also discard a different movie
# that happens to correlate perfectly.
is_highly_correlated[movie_idx] = False

similar_movie_titles = list(
    movie_titles_index[is_highly_correlated])
print(similar_movie_titles)

['Star Wars: Episode IV - A New Hope (1977)', 'Star Wars: Episode VI - Return of the Jedi (1983)']
