In [2]:
# ======================================
# 0. Imports
# ======================================
import ast

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler

# Seed NumPy's global random number generator.
np.random.seed(42)

# ======================================
# 1. Load data from dataset folder
# ======================================
movies = pd.read_csv("dataset/movies_metadata.csv", low_memory=False)
ratings = pd.read_csv("dataset/ratings.csv")

# ---- clean movie ids ----
# Casting to str first keeps the mask strictly boolean: a missing id becomes
# the text "nan" (non-numeric) instead of a NaN entry that cannot be used for
# boolean indexing, and the .str accessor still works if pandas parsed the
# column as integers.
id_is_numeric = movies["id"].astype(str).str.isnumeric()
# .copy() detaches the filtered rows so the assignment below writes into a
# real DataFrame rather than a slice of the original (no chained assignment).
movies = movies[id_is_numeric].copy()
movies["id"] = movies["id"].astype(int)

# ---- drop duplicates ----
# reset_index gives a 0..n-1 index, so row labels equal row positions.
movies = movies.drop_duplicates(subset="id").reset_index(drop=True)
# NOTE(review): ratings["movieId"] is matched directly against the metadata
# "id" column. If the two files use different id schemes (for example, a
# separate links table maps one onto the other), this pairs ratings with the
# wrong movies -- confirm against the dataset documentation.
ratings = ratings[ratings["movieId"].isin(movies["id"])]

# ======================================
# 2. Movie features
# ======================================
def parse_genres(x):
    """Extract genre names from a serialized list of genre records.

    Args:
        x: Text holding a Python-literal list of dicts that each have a
            "name" key, e.g. "[{'id': 18, 'name': 'Drama'}]". Any other
            value (missing data, malformed text) is tolerated.

    Returns:
        list: The "name" value of every record, in order, or an empty list
        when the value cannot be parsed into that shape.
    """
    if not isinstance(x, str):
        return []
    try:
        # literal_eval accepts only Python literals, so text read from the
        # CSV is parsed as data and can never run as code (unlike eval).
        records = ast.literal_eval(x)
        return [record["name"] for record in records]
    except (ValueError, SyntaxError, TypeError, KeyError,
            MemoryError, RecursionError):
        # Best-effort parsing: a malformed cell yields "no genres".
        return []

# Decode the serialized genre column into plain lists of genre names.
movies["genres_list"] = movies["genres"].map(parse_genres)

# Missing descriptions become empty strings so the two text columns can be
# joined into one document per movie.
movies = movies.fillna({"overview": "", "tagline": ""})
movies["text"] = movies["overview"].str.cat(movies["tagline"], sep=" ")

# ---- Text features ----
# TF-IDF over unigrams and bigrams, capped at 5000 terms.
tfidf = TfidfVectorizer(
    max_features=5000,
    stop_words="english",
    ngram_range=(1, 2),
)
X_text = tfidf.fit_transform(movies["text"])

# ---- Genres ----
# One indicator column per distinct genre name.
mlb = MultiLabelBinarizer()
X_genres = csr_matrix(mlb.fit_transform(movies["genres_list"]))

# ---- Numeric ----
# Unparseable or missing numbers are coerced to NaN and then replaced by 0
# before standardisation.
num_cols = ["popularity", "runtime", "vote_average", "vote_count"]
movies[num_cols] = (
    movies[num_cols]
    .apply(pd.to_numeric, errors="coerce")
    .fillna(0)
)
scaler = StandardScaler()
X_num = csr_matrix(scaler.fit_transform(movies[num_cols]))

# ---- Item feature space ----
# Column-wise concatenation: text | genres | numeric.
X_item_raw = hstack([X_text, X_genres, X_num])
print("Raw item feature space:", X_item_raw.shape)

# ======================================
# 3. Item embeddings
# ======================================
# Reduce the sparse item features to 128 dense components per movie.
svd = TruncatedSVD(random_state=42, n_components=128)
X_item = svd.fit_transform(X_item_raw)
print("Item embedding space:", X_item.shape)

# ---- Safe mapping ----
# Movie id -> position of that movie in the iteration order of movies["id"].
movie_id_to_idx = {movie_id: row for row, movie_id in enumerate(movies["id"])}

# ======================================
# 4. Ratings → indices
# ======================================
# Attach the embedding row of each rated movie; assign() returns a new frame,
# so the previously filtered ratings object is left untouched.
ratings = ratings.assign(movie_idx=ratings["movieId"].map(movie_id_to_idx))
# Ratings whose movie has no entry in the mapping come back as NaN: drop them,
# then restore an integer dtype so the column can be used for array indexing.
ratings = ratings.dropna(subset=["movie_idx"])
ratings = ratings.astype({"movie_idx": int})

# ======================================
# 5. User embeddings (weighted average of rated movies)
# ======================================
# One vector per user: the average of the embeddings of the movies the user
# rated, weighted by the rating value given to each movie.
# NOTE(review): np.average raises ZeroDivisionError when a user's weights sum
# to zero -- confirm ratings are always strictly positive.
user_embeddings = {
    user_id: np.average(
        X_item[user_ratings["movie_idx"].to_numpy()],
        axis=0,
        weights=user_ratings["rating"].to_numpy(),
    )
    for user_id, user_ratings in ratings.groupby("userId")
}

print("Number of users with embeddings:", len(user_embeddings))

# ======================================
# 6. Train classifier in batches (memory-efficient)
# ======================================
batch_size = 50000
n_epochs = 5  # several passes over the ratings
class_labels = np.array([0, 1])
model = SGDClassifier(loss="log_loss", max_iter=1, learning_rate="optimal")

# Build every lookup table once as a NumPy array so each batch is assembled
# by array indexing. This avoids turning every rating into a Python tuple
# and stacking samples one row at a time.
user_index = pd.Index(list(user_embeddings))
user_matrix = np.array(list(user_embeddings.values()), dtype=np.float32)
item_matrix = X_item.astype(np.float32)

# Row of each rating's user in user_matrix; -1 marks a user without an
# embedding, which is skipped below.
user_rows = user_index.get_indexer(ratings["userId"])
movie_rows = ratings["movie_idx"].to_numpy()
# Binary target: 1 when the rating is at least 4.0, otherwise 0.
labels = (ratings["rating"].to_numpy() >= 4.0).astype(int)

n_ratings = len(ratings)
# NOTE(review): batches are visited in file order on every epoch; shuffling
# rows between epochs is usually recommended for SGD -- consider adding it.
for epoch in range(n_epochs):
    for start in range(0, n_ratings, batch_size):
        stop = start + batch_size
        batch_users = user_rows[start:stop]
        keep = batch_users >= 0
        if not keep.any():
            continue
        # Each sample is [user embedding | item embedding].
        X_batch = np.hstack([
            user_matrix[batch_users[keep]],
            item_matrix[movie_rows[start:stop][keep]],
        ])
        y_batch = labels[start:stop][keep]
        # Passing the class list on every call keeps the first real call
        # valid even if an earlier batch was skipped as empty.
        model.partial_fit(X_batch, y_batch, classes=class_labels)
    print(f"Epoch {epoch+1} done")

# ======================================
# 7. Recommendation function
# ======================================
def recommend_for_user(user_id, top_n=10):
    """Rank the movies a user has not rated by predicted class-1 probability.

    Args:
        user_id: Key into the module-level ``user_embeddings`` mapping.
        top_n: Maximum number of rows to return.

    Returns:
        A DataFrame with the columns "title", "score" and "genres_list",
        sorted by descending score and limited to ``top_n`` rows, or None
        when ``user_id`` has no entry in ``user_embeddings``.
    """
    user_vec = user_embeddings.get(user_id)
    if user_vec is None:
        return None

    # Pair the user's vector with every item vector: one row per movie,
    # laid out as [user embedding | item embedding].
    n_items = X_item.shape[0]
    features = np.hstack([np.tile(user_vec, (n_items, 1)), X_item])
    positive_proba = model.predict_proba(features)[:, 1]

    # Ids of movies this user already rated; they are excluded from the result.
    rated_ids = ratings.loc[ratings["userId"] == user_id, "movieId"].values

    candidates = movies.copy()
    candidates["score"] = positive_proba
    unseen = candidates[~candidates["id"].isin(rated_ids)]
    ranked = unseen.sort_values("score", ascending=False)
    return ranked[["title", "score", "genres_list"]].head(top_n)

# ======================================
# 8. Example
# ======================================
# Print a header line, then the recommendation table for user id 1.
print("\nRecommendations for user 1:", recommend_for_user(1), sep="\n")


Raw item feature space: (45433, 5024)
Item embedding space: (45433, 128)
Number of users with embeddings: 265917
Epoch 1 done
Epoch 2 done
Epoch 3 done
Epoch 4 done
Epoch 5 done

Recommendations for user 1:
                                              title     score  \
30677                                       Minions  0.840955   
16481                               The Fern Flower  0.820101   
39650                            Komunaris chibukhi  0.801518   
35903                               Poil de Carotte  0.774752   
3529                        City of the Living Dead  0.768445   
40297                                A Simple Story  0.761717   
42164               Mia trelli... trelli oikogeneia  0.760367   
18198             Extremely Loud & Incredibly Close  0.759424   
2097                                    Jamaica Inn  0.749888   
21191  Return of the Living Dead: Rave to the Grave  0.749526   

                                  genres_list  
30677  [Family, Animation, Ad