Pipeline with a one-rating-per-movie test split, using CSR for the rating matrix
1. Load raw data
2. Split events into train/test (one test rating per movie)
3. Build sparse movie×user rating matrix on TRAIN events
4. Generate per-movie features from TRAIN
5. Cluster movies (dimensionality reduction)
6. Build event-level feature arrays & model

In [None]:
import numpy as np
from datetime import datetime
from scipy.sparse import coo_matrix
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import (
    mean_squared_error,
    accuracy_score,
    classification_report,
    silhouette_score,
    calinski_harabasz_score,
    davies_bouldin_score
)
import subprocess

# One seed drives both the NumPy generator and the estimators that accept
# `random_state`, so repeated runs give the same split and clusters.
random_state = 13119140
rng = np.random.default_rng(seed=random_state)

# %% 1) LOAD RAW DATA
#
# The ratings file is parsed as a sequence of sections: a header line
# "<movie_id>:" followed by rows "<user_id>,<rating>,<date>" that belong to
# that movie. The result is four parallel arrays with one entry per rating.

movie_ids = []
user_ids  = []
ratings   = []
dates     = []

with open("/Users/serenahan/Downloads/dataSet/data.txt", "r", encoding="utf-8") as f:
    current_mid = None
    for line_no, line in enumerate(f, start=1):
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline) carry no data.
            continue
        if line.endswith(":"):
            current_mid = int(line[:-1])
            continue
        if current_mid is None:
            # Fail here with a clear message instead of later, when the
            # None entry cannot be converted to an integer array.
            raise ValueError(
                f"line {line_no}: rating row found before any 'movie_id:' header"
            )
        # A row with the wrong number of fields raises ValueError on purpose:
        # malformed data should not be silently dropped.
        u, r, d = line.split(",")
        movie_ids.append(current_mid)
        user_ids.append(int(u))
        ratings.append(int(r))
        dates.append(np.datetime64(d))

movie_ids = np.array(movie_ids, dtype=int)
user_ids  = np.array(user_ids,  dtype=int)
ratings   = np.array(ratings,   dtype=int)
dates     = np.array(dates,     dtype="datetime64[D]")

# Load movie titles & release years
#
# Each usable line has the form "<movie_id>,<YYYY-MM-DD>,<title>". The split
# is limited to two commas so that commas inside the title are preserved.
# Lines with fewer than three fields or a non-integer id are skipped.
mids, rel_years, titles = [], [], []
with open("/Users/serenahan/Downloads/dataSet/movieTitles.csv", "r", encoding="utf-8") as f:
    for line in f:
        parts = line.rstrip("\n").split(",", 2)
        if len(parts) != 3:
            continue
        mid_str, date_str, title_str = parts
        try:
            mid = int(mid_str)
        except ValueError:
            continue
        try:
            yr = datetime.strptime(date_str, "%Y-%m-%d").year
        except ValueError:
            # Unparseable date: keep the movie but mark the year as missing.
            yr = np.nan
        mids.append(mid)
        rel_years.append(yr)
        titles.append(title_str.strip())

mids      = np.array(mids,      dtype=int)
rel_years = np.array(rel_years, dtype=float)
titles    = np.array(titles,    dtype=object)

# release-year lookup: movie id -> release year (NaN when unknown)
year_map = dict(zip(mids, rel_years))

# %% 2) SPLIT EVENTS INTO TRAIN/TEST (one test rating per movie)

# inv_mid[i] is the position of event i's movie within uniq_mids.
uniq_mids, inv_mid = np.unique(movie_ids, return_inverse=True)

# Group the events by movie with a single stable sort instead of scanning
# every event once per movie. After sorting, the events of movie g occupy
# order[group_start[g] : group_start[g] + events_per_movie[g]].
order            = np.argsort(inv_mid, kind="stable")
events_per_movie = np.bincount(inv_mid, minlength=uniq_mids.size)
group_start      = np.cumsum(events_per_movie) - events_per_movie

# One uniform draw per movie, all taken in a single call; every movie in
# uniq_mids has at least one event, so each upper bound is >= 1.
pick_offset    = rng.integers(0, events_per_movie)
test_event_idx = order[group_start + pick_offset].astype(int)

all_event_idx = np.arange(movie_ids.size)
mask_test     = np.zeros(all_event_idx.shape, dtype=bool)
mask_test[test_event_idx] = True

# Both index arrays come out in ascending event order.
train_event_idx = all_event_idx[~mask_test]
test_event_idx  = all_event_idx[ mask_test]

# %% 3) BUILD CSR MOVIE×USER RATING MATRIX ON TRAIN

uniq_uids = np.unique(user_ids)
n_movies  = uniq_mids.size
n_users   = uniq_uids.size

# id -> row/column position lookups, kept for any code that maps single ids
mid_to_idx = {mid: i for i, mid in enumerate(uniq_mids)}
uid_to_idx = {uid: j for j, uid in enumerate(uniq_uids)}

# prepare arrays for COO construction
# Every training id is present in its sorted unique array, so a binary search
# returns exactly the position the dictionaries above would give, without a
# Python-level lookup per event.
# NOTE: relies on uniq_mids being the sorted unique movie ids (np.unique output).
rows = np.searchsorted(uniq_mids, movie_ids[train_event_idx])
cols = np.searchsorted(uniq_uids, user_ids[train_event_idx])
data = ratings[train_event_idx]

rating_sparse_train = coo_matrix(
    (data, (rows, cols)),
    shape=(n_movies, n_users)
).tocsr()

# %% 4) GENERATE PER-MOVIE FEATURES (ON TRAIN DATA)

# 4a) rating count per movie (number of stored entries in each row)
rating_counts = rating_sparse_train.getnnz(axis=1)

# 4b) sum of ratings per movie, then average
# np.asarray(...).ravel() flattens the (n_movies, 1) row sums to 1-D.
sum_ratings = np.asarray(rating_sparse_train.sum(axis=1)).ravel()

# A movie can end up with zero training ratings (its only rating may have
# been moved to the test split). Dividing would give NaN there, so those
# movies fall back to the overall training mean instead.
total_count = rating_counts.sum()
global_avg  = sum_ratings.sum() / total_count if total_count else 0.0
avg_ratings = np.full(rating_counts.shape, global_avg, dtype=float)
np.divide(sum_ratings, rating_counts, out=avg_ratings, where=rating_counts > 0)

# 4c) release year per movie, impute missing
release_years = np.array([year_map.get(mid, np.nan) for mid in uniq_mids],
                         dtype=float)
has_year      = np.isfinite(release_years)
valid_years   = release_years[has_year]
# Median of the known years; current year only if no year is known at all.
year_med      = np.median(valid_years) if valid_years.size else datetime.now().year
release_years = np.where(has_year, release_years, year_med)

# stack into (n_movies × 3): [avg_rating, rating_count, release_year]
movie_feats = np.column_stack([avg_ratings, rating_counts, release_years])

# %% 5) CLUSTERING FOR DIMENSIONALITY REDUCTION
# prepare the 2D input (avg_rating, count), standardized per column
cluster_input = StandardScaler().fit_transform(movie_feats[:, :2])

cluster_algos = {
    "KMeans":            KMeans(n_clusters=30, random_state=random_state),
    "Agglomerative":     AgglomerativeClustering(n_clusters=30),
    "DBSCAN":            DBSCAN(eps=0.5, min_samples=5),
    "GaussianMixture":   GaussianMixture(n_components=30, random_state=random_state)
}

# Labels from each fitted algorithm, kept so the chosen one can be reused
# below without fitting a second time.
labels_by_algo = {}

print("\nClustering performance on movie_feats:")
for name, algo in cluster_algos.items():
    # All four estimators provide fit_predict.
    labels = algo.fit_predict(cluster_input)
    labels_by_algo[name] = labels

    # Count clusters, not counting the label -1 (used by DBSCAN for noise).
    # The quality metrics need at least two clusters, so skip otherwise.
    distinct   = np.unique(labels)
    n_clusters = int(np.count_nonzero(distinct != -1))
    if n_clusters <= 1:
        print(f" - {name}: only {n_clusters} cluster(s), skipping metrics")
        continue

    sil  = silhouette_score(cluster_input, labels)
    ch   = calinski_harabasz_score(cluster_input, labels)
    db   = davies_bouldin_score(cluster_input, labels)
    print(f" - {name:15s}  clusters={n_clusters:2d}  "
          f"Silhouette={sil:.3f}  CH={ch:.1f}  DB={db:.3f}")

# choose KMeans labels for downstream (already computed in the loop above):
clusters = labels_by_algo["KMeans"]

# %% 6) BUILD EVENT-LEVEL FEATURE ARRAYS & MODEL

# Row position (into the per-movie arrays) of the movie behind each event.
train_movie_idx = inv_mid[train_event_idx]
test_movie_idx  = inv_mid[test_event_idx]

# Targets are the raw ratings of the corresponding events.
y_train = ratings[train_event_idx]
y_test  = ratings[test_event_idx]

# Cluster id of each event's movie, as a single-column 2-D array.
train_clusters_event = clusters[train_movie_idx][:, np.newaxis]
test_clusters_event  = clusters[test_movie_idx][:, np.newaxis]

# One-hot encode the cluster ids; the encoder is fitted on train events only.
ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
ohe.fit(train_clusters_event)
cluster_ohe_tr = ohe.transform(train_clusters_event)
cluster_ohe_te = ohe.transform(test_clusters_event)

# Release year per event, standardized with statistics from train events.
ry_train  = release_years[train_movie_idx][:, np.newaxis]
ry_test   = release_years[test_movie_idx][:, np.newaxis]
scaler_yr = StandardScaler()
scaler_yr.fit(ry_train)
ry_tr_scaled = scaler_yr.transform(ry_train)
ry_te_scaled = scaler_yr.transform(ry_test)

# Design matrices: one-hot cluster columns followed by the scaled year.
X_train = np.concatenate((cluster_ohe_tr, ry_tr_scaled), axis=1)
X_test  = np.concatenate((cluster_ohe_te, ry_te_scaled), axis=1)

#%% 7) fit model

# linear regression on the event-level design matrix; quality is reported as
# root-mean-squared error on the held-out test events.
lr    = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
rmse  = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"Linear Regression RMSE: {rmse:.4f}")

# Spoken "finished" notification. This is best-effort: the `say` executable
# may not exist on the machine running the script, and a missing notifier
# must not turn a completed run into a failure.
try:
    subprocess.run(["say", "Yo  come look at these results!! script has finished running"])
except OSError:
    pass
# NOTE: the triple-quoted string below is never executed. It holds a disabled
# experiment that treats the integer ratings as class labels and fits two
# linear classifiers (LinearSVC and SGDClassifier with hinge loss) on the same
# X_train / X_test matrices, printing accuracy and a classification report.
# To run it, remove the surrounding quotes.
'''
# use the original integer ratings as class labels
y_train_cls = y_train
y_test_cls  = y_test

# 6a) Linear SVM
svc = LinearSVC(
    random_state=random_state,
    max_iter=1000,      # increase if needed
    tol=1e-4
)
svc.fit(X_train, y_train_cls)
y_pred_svc = svc.predict(X_test)
acc_svc    = accuracy_score(y_test_cls, y_pred_svc)
print(f"LinearSVC Accuracy: {acc_svc:.4f}")
print(classification_report(y_test_cls, y_pred_svc))

# 6b) SGDClassifier (hinge loss = linear SVM)
sgd = SGDClassifier(
    loss="hinge",
    random_state=random_state,
    max_iter=1000,
    tol=1e-4
)
sgd.fit(X_train, y_train_cls)
y_pred_sgd = sgd.predict(X_test)
acc_sgd    = accuracy_score(y_test_cls, y_pred_sgd)
print(f"SGDClassifier Accuracy: {acc_sgd:.4f}")
print(classification_report(y_test_cls, y_pred_sgd))

subprocess.run(["say", "Yo  come look at these results!! script has finished running"])
'''

import nbformat

# Round-trip the notebook through nbformat: load it as a version-4 notebook
# object, then serialize that object to a new file.
# NOTE(review): this presumably requires the input to be parseable notebook
# JSON — confirm how malformed content is handled before relying on it.
source_notebook = 'FMLcapstone.ipynb'
fixed_notebook  = 'FMLcapstone_fixed.ipynb'

nb = nbformat.read(source_notebook, as_version=4)
nbformat.write(nb, fixed_notebook)
