In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split


In [None]:
!wget -q https://files.grouplens.org/datasets/movielens/ml-100k.zip
!unzip -q ml-100k.zip

In [None]:
# Ratings file: tab-separated; column names are supplied explicitly, so the
# first line of the file is read as data rather than as a header.
ratings = pd.read_csv(
    "ml-100k/u.data",
    sep="\t",
    names=["user_id", "item_id", "rating", "timestamp"],
)

# Movie metadata: pipe-separated and latin-1 encoded; only the first two
# columns are read and exposed as item_id / title.
movies = pd.read_csv(
    "ml-100k/u.item",
    sep="|",
    encoding="latin-1",
    header=None,
    usecols=[0, 1],
    names=["item_id", "title"],
)

# Preview the ratings. This must be the last expression of the cell: a bare
# expression in the middle of a cell is evaluated and silently discarded.
ratings.head()

In [None]:
# Hold out 20% of the ratings for evaluation; the fixed random_state makes the
# partition reproducible across runs.
train_data, test_data = train_test_split(ratings, test_size=0.2, random_state=42)

print(f"Train size: {len(train_data)}")
print(f"Test size: {len(test_data)}")

Train size: 80000
Test size: 20000


In [None]:
# The train/test split is already created by the preceding cell with the same
# arguments and seed, so repeating it here would only rebuild identical frames.
# Instead, verify that the existing split is a clean partition of `ratings`.
assert len(train_data) + len(test_data) == len(ratings)
assert train_data.index.intersection(test_data.index).empty

In [None]:
# User x item rating matrix built from the training split only.
# (user, item) pairs without a training rating come out of the pivot as NaN
# and are replaced by 0; later cells use `> 0` / `== 0` on this matrix to
# tell rated items from unrated ones.
train_matrix = (
    train_data
    .pivot(index="user_id", columns="item_id", values="rating")
    .fillna(0)
)

train_matrix.shape


(943, 1653)

In [None]:
# Item-item cosine similarity. Transposing puts items on the rows, so every
# item is compared through its vector of user ratings.
item_similarity = cosine_similarity(train_matrix.T)

# Wrap the raw array in a DataFrame labelled by item_id on both axes so that
# similarities can be looked up by label instead of by position.
item_similarity_df = pd.DataFrame(
    data=item_similarity,
    index=train_matrix.columns,
    columns=train_matrix.columns,
)


In [None]:
def predict_rating(user_id, item_id, k=5):
    """Estimate a user's rating for an item from the items they already rated.

    The estimate is the similarity-weighted average of the user's training
    ratings over the (at most) ``k`` rated items most similar to ``item_id``.

    Reads the notebook-level ``train_matrix`` and ``item_similarity_df``.

    Args:
        user_id: Row label in ``train_matrix``.
        item_id: Column label in ``item_similarity_df``.
        k: Maximum number of neighbouring items to use.

    Returns:
        The predicted rating, or ``np.nan`` when the user or the item is
        unknown, the user has no positive ratings, or the selected
        similarities sum to zero.
    """
    if user_id not in train_matrix.index or item_id not in item_similarity_df.columns:
        return np.nan

    # Keep only the items this user actually rated (0 marks "not rated").
    row = train_matrix.loc[user_id]
    seen = row[row > 0]
    if seen.empty:
        return np.nan

    # Similarity of the target item to each rated item, strongest first.
    neighbours = (
        item_similarity_df.loc[seen.index, item_id]
        .sort_values(ascending=False)
        .head(k)
    )

    weight_total = neighbours.sum()
    if weight_total == 0:
        # No neighbour carries any weight, so the weighted mean is undefined.
        return np.nan

    return np.dot(neighbours, seen[neighbours.index]) / weight_total


In [None]:
# Score every held-out (user, item) pair with the item-based predictor.
scored_pairs = [
    (row.rating, predict_rating(row.user_id, row.item_id))
    for _, row in test_data.iterrows()
]

# Pairs for which the model returned NaN are left out of the evaluation.
y_true = [actual for actual, estimate in scored_pairs if not np.isnan(estimate)]
y_pred = [estimate for _, estimate in scored_pairs if not np.isnan(estimate)]

rmse = np.sqrt(mean_squared_error(y_true, y_pred))
print("RMSE:", rmse)


RMSE: 0.9836048610606758


In [None]:
def recommend_movies(user_id, top_n=5):
    """Print a user's favourite training movies and the model's top picks.

    Two ranked lists are printed: the ``top_n`` movies the user rated highest
    in the training split, and the ``top_n`` unrated movies with the highest
    predicted rating from ``predict_rating``.

    Reads the notebook-level ``train_matrix`` and ``movies`` and calls
    ``predict_rating``.

    Args:
        user_id: Row label in ``train_matrix``.
        top_n: Number of movies to show in each list.

    Returns:
        None. If ``user_id`` is not in ``train_matrix`` a message is printed
        and the function returns early.
    """
    if user_id not in train_matrix.index:
        print("User không tồn tại trong train set")
        return

    print(f"\n USER {user_id}")

    # --- Movies the user rated highest in the training split ---
    user_ratings = train_matrix.loc[user_id]
    top_rated = (
        user_ratings[user_ratings > 0]
        .sort_values(ascending=False)
        .head(top_n)
        .reset_index()
    )
    top_rated.columns = ["item_id", "rating"]
    top_rated = top_rated.merge(movies, on="item_id")

    print(f"\n Top {top_n} movies by user:")
    # Number the rows by display position, not by DataFrame index label.
    for rank, row in enumerate(top_rated.itertuples(index=False), start=1):
        print(f"{rank}. {row.title} — Rating: {row.rating}")

    # --- Model recommendations among items the user has not rated ---
    unseen_items = train_matrix.columns[user_ratings == 0]

    recommendations = []
    for item_id in unseen_items:
        pred = predict_rating(user_id, item_id)
        if not np.isnan(pred):
            recommendations.append((item_id, pred))

    rec_df = (
        pd.DataFrame(recommendations, columns=["item_id", "pred_rating"])
        .merge(movies, on="item_id")
        .sort_values("pred_rating", ascending=False)
        .head(top_n)
    )

    print(f"\n Top {top_n} by model:")
    # After sort_values the index still holds pre-sort positions, so using it
    # as the rank would print arbitrary numbers; enumerate gives 1..top_n.
    for rank, row in enumerate(rec_df.itertuples(index=False), start=1):
        print(f"{rank}. {row.title} — Predicted rating: {row.pred_rating:.2f}")


In [None]:
recommend_movies(user_id=1)


 USER 1

 Top 5 movies by user:
1. Usual Suspects, The (1995) — Rating: 5.0
2. Dead Man Walking (1995) — Rating: 5.0
3. Postino, Il (1994) — Rating: 5.0
4. Antonia's Line (1995) — Rating: 5.0
5. French Twist (Gazon maudit) (1995) — Rating: 5.0

 Top 5 by model:
78. English Patient, The (1996) — Predicted rating: 5.00
111. Everyone Says I Love You (1996) — Predicted rating: 5.00
28. Maya Lin: A Strong Clear Vision (1994) — Predicted rating: 5.00
113. Mother (1996) — Predicted rating: 5.00
966. Welcome To Sarajevo (1997) — Predicted rating: 5.00
