# 1. Imports & Data Loading

In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import pickle

In [2]:
# Read the three parquet inputs in one pass: the interaction log, the track table and the name catalog.
interactions, tracks, catalog = (
    pd.read_parquet(path)
    for path in ("interactions_new.parquet", "tracks.parquet", "catalog_names.parquet")
)

In [3]:
# Report how many rows each loaded frame contains.
row_count_lines = (
    f"Interactions: {len(interactions)} rows",
    f"Tracks:       {len(tracks)} rows",
    f"Catalog:      {len(catalog)} rows",
)
print("\n".join(row_count_lines))

Interactions: 200000 rows
Tracks:       1000000 rows
Catalog:      1812471 rows


In [4]:
# Preview the last rows of the interaction log (columns: user_id, track_id, track_seq, started_at).
interactions.tail()

Unnamed: 0,user_id,track_id,track_seq,started_at
9,1147,60164742,10,2022-12-10
10,1147,60441330,11,2022-12-13
11,1147,60715380,12,2022-12-19
12,1147,61693507,13,2022-12-21
13,1147,61835292,14,2022-12-21


In [5]:
# Preview the first rows of the track table; albums, artists and genres hold lists of ids per track.
tracks.head()

Unnamed: 0,track_id,albums,artists,genres
0,26,"[3, 2490753]",[16],"[11, 21]"
1,38,"[3, 2490753]",[16],"[11, 21]"
2,135,"[12, 214, 2490809]",[84],[11]
3,136,"[12, 214, 2490809]",[84],[11]
4,138,"[12, 214, 322, 72275, 72292, 91199, 213505, 24...",[84],[11]


In [6]:
# Preview the first rows of the catalog, which maps an (id, type) pair to a display name.
catalog.head()

Unnamed: 0,id,type,name
0,3,album,Taller Children
1,12,album,Wild Young Hearts
2,13,album,Lonesome Crow
3,17,album,Graffiti Soul
4,26,album,Blues Six Pack


In [7]:
# Print the per-column null counts of every loaded frame, each under its own header line.
for header, frame in (
    ("\nMissing values per column in interactions_new:", interactions),
    ("\nMissing values per column in tracks:", tracks),
    ("\nMissing values per column in catalog_names:", catalog),
):
    print(header)
    print(frame.isnull().sum())


Missing values per column in interactions_new:
user_id       0
track_id      0
track_seq     0
started_at    0
dtype: int64

Missing values per column in tracks:
track_id    0
albums      0
artists     0
genres      0
dtype: int64

Missing values per column in catalog_names:
id      0
type    0
name    0
dtype: int64


# 2. Select Top-N Tracks & Build Sparse Interaction Matrix

In [8]:
# Number of most frequently played tracks kept as matrix columns
top_N = 15000

# Count interactions per track (value_counts sorts descending) and keep the first top_N ids
track_popularity = interactions['track_id'].value_counts()
top_tracks = track_popularity.index[:top_N].tolist()

# Restrict the interaction log to those tracks
is_top_track = interactions['track_id'].isin(top_tracks)
df_small = interactions.loc[is_top_track].copy()

# Raw ids in matrix order: rows follow first appearance in df_small, columns follow popularity rank
user_ids  = df_small['user_id'].unique()
track_ids = top_tracks

# Lookup tables from raw id to row / column position
user_map  = dict(zip(user_ids, range(len(user_ids))))
track_map = dict(zip(track_ids, range(len(track_ids))))

# Assemble the CSR matrix X (users × tracks); every interaction row contributes a 1.0
rows = df_small['user_id'].map(user_map)
cols = df_small['track_id'].map(track_map)
data = np.ones(len(df_small), dtype=np.float32)

X = csr_matrix((data, (rows, cols)), shape=(len(user_ids), len(track_ids)))
print(f"Built interaction matrix X with shape {X.shape}")


Built interaction matrix X with shape (1129, 15000)


# 3. Leave-One-Out Split & EASE Training

In [9]:
def loo_split(X, random_state=None):
    """Leave-one-out split of a user × item interaction matrix.

    For every row of ``X`` that has at least one non-zero entry, one of those
    entries is picked uniformly at random, zeroed in the training copy and
    set to 1 in the test matrix. Rows without non-zero entries are left
    untouched in both outputs.

    Parameters
    ----------
    X : scipy.sparse matrix
        Interaction matrix; each row is treated as one user.
    random_state : int or None, optional
        Seed for the hold-out choice. With ``None`` (the default) the draw
        uses NumPy's global random state, so calling ``np.random.seed``
        beforehand still controls the result. With an integer, a private
        ``RandomState`` is used and the split is repeatable regardless of
        global state.

    Returns
    -------
    X_train, X_test : scipy.sparse.csr_matrix
        Both have the shape of ``X``. ``X_test`` is float32 and has at most
        one non-zero entry per row.
    """
    # A private generator makes the split reproducible without touching global state.
    if random_state is None:
        choose = np.random.choice
    else:
        choose = np.random.RandomState(random_state).choice

    # LIL format is used for the element-wise assignments below.
    X_train = X.copy().tolil()
    X_test  = csr_matrix(X.shape, dtype=np.float32).tolil()
    for u in range(X.shape[0]):
        items = X[u].nonzero()[1]
        if len(items) == 0:
            continue
        hold = choose(items)
        X_train[u, hold] = 0
        X_test[u, hold]  = 1
    return X_train.tocsr(), X_test.tocsr()

def train_ease(X_train, l2_reg=300.0):
    """Compute the EASE item-item weight matrix in closed form.

    Parameters
    ----------
    X_train : scipy.sparse matrix
        Training interactions, users × items.
    l2_reg : float, optional
        Value added to every diagonal entry of the item-item Gram matrix
        before it is inverted.

    Returns
    -------
    numpy.ndarray
        Dense items × items matrix with a zero diagonal.
    """
    gram = X_train.T.dot(X_train).toarray()
    n_items = gram.shape[0]
    # Stepping through the flattened matrix by n_items + 1 visits exactly the diagonal.
    gram.flat[:: n_items + 1] += l2_reg
    precision = np.linalg.inv(gram)
    # Broadcasting over the last axis divides column j by precision[j, j].
    weights = np.negative(precision) / np.diag(precision)
    # An item must not be used to predict itself.
    np.fill_diagonal(weights, 0)
    return weights


# 4. Evaluation Metrics

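This section defines the ranking metrics used for evaluation: Precision@K, Recall@K, MAP@K, F1@K and NDCG@K. Precision, Recall, MAP and NDCG are averaged over the users that have at least one held-out item; F1 is the harmonic mean of the averaged Precision@K and Recall@K.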


In [10]:
def precision_at_k(recs, X_test, k=10):
    """Mean Precision@k over users that have at least one held-out item.

    Parameters
    ----------
    recs : sequence of sequences (e.g. 2-D array)
        ``recs[u]`` is the ranked list of recommended item indices for row ``u``.
    X_test : scipy.sparse matrix
        Held-out interactions; non-zero columns of row ``u`` are the relevant items.
    k : int, optional
        Cut-off; only the first ``k`` entries of ``recs[u]`` are scored.

    Returns
    -------
    float
        Average of ``|top-k ∩ relevant| / k`` over users with relevant items,
        or 0.0 when no user has any.
    """
    hits, users = 0, 0
    for u in range(X_test.shape[0]):
        true = set(X_test[u].nonzero()[1])
        if not true:
            continue
        # Only the first k recommendations count, even when recs holds more columns.
        pred = set(recs[u][:k])
        hits += len(true & pred) / k
        users += 1
    # No user with held-out items: report 0.0 instead of dividing by zero.
    return hits / users if users else 0.0

def recall_at_k(recs, X_test, k=10):
    """Mean Recall@k over users that have at least one held-out item.

    Parameters
    ----------
    recs : sequence of sequences (e.g. 2-D array)
        ``recs[u]`` is the ranked list of recommended item indices for row ``u``.
    X_test : scipy.sparse matrix
        Held-out interactions; non-zero columns of row ``u`` are the relevant items.
    k : int, optional
        Cut-off; only the first ``k`` entries of ``recs[u]`` are scored.

    Returns
    -------
    float
        Average of ``|top-k ∩ relevant| / min(|relevant|, k)`` over users with
        relevant items, or 0.0 when no user has any.
    """
    hits, users = 0, 0
    for u in range(X_test.shape[0]):
        true = set(X_test[u].nonzero()[1])
        if not true:
            continue
        # Only the first k recommendations count, even when recs holds more columns.
        pred = set(recs[u][:k])
        hits += len(true & pred) / min(len(true), k)
        users += 1
    # No user with held-out items: report 0.0 instead of dividing by zero.
    return hits / users if users else 0.0

def map_at_k(recs, X_test, k=10):
    """Mean Average Precision@k over users that have at least one held-out item.

    Parameters
    ----------
    recs : sequence of sequences (e.g. 2-D array)
        ``recs[u]`` is the ranked list of recommended item indices for row ``u``.
    X_test : scipy.sparse matrix
        Held-out interactions; non-zero columns of row ``u`` are the relevant items.
    k : int, optional
        Cut-off; only the first ``k`` entries of ``recs[u]`` are scored.

    Returns
    -------
    float
        Average over users of the sum of precision-at-hit-rank values within
        the top k, divided by ``min(|relevant|, k)``; 0.0 when no user has
        relevant items.
    """
    total_map, users = 0, 0
    for u in range(X_test.shape[0]):
        true = set(X_test[u].nonzero()[1])
        if not true:
            continue
        score, hits = 0, 0
        # Ranks are 1-based; entries past position k are ignored.
        for rank, it in enumerate(recs[u][:k], 1):
            if it in true:
                hits += 1
                score += hits / rank
        total_map += score / min(len(true), k) if hits > 0 else 0
        users += 1
    # No user with held-out items: report 0.0 instead of dividing by zero.
    return total_map / users if users else 0.0

def f1_at_k(recs, X_test, k=10):
    """Harmonic mean of the averaged Precision@k and Recall@k.

    Both inputs are the user-averaged values returned by ``precision_at_k``
    and ``recall_at_k``. Returns 0 when their sum is not positive.
    """
    precision = precision_at_k(recs, X_test, k)
    recall = recall_at_k(recs, X_test, k)
    denominator = precision + recall
    if denominator > 0:
        return 2 * precision * recall / denominator
    return 0

def ndcg_at_k(recs, X_test, k=10):
    """Mean NDCG@k (binary relevance) over users that have at least one held-out item.

    Parameters
    ----------
    recs : sequence of sequences (e.g. 2-D array)
        ``recs[u]`` is the ranked list of recommended item indices for row ``u``.
    X_test : scipy.sparse matrix
        Held-out interactions; non-zero columns of row ``u`` are the relevant items.
    k : int, optional
        Cut-off; only the first ``k`` entries of ``recs[u]`` are scored.

    Returns
    -------
    float
        Average of DCG@k / IDCG@k over users with relevant items, where a hit
        at 1-based rank ``r`` contributes ``1 / log2(r + 1)`` and IDCG assumes
        ``min(|relevant|, k)`` hits at the top ranks; 0.0 when no user has
        relevant items.
    """
    total, users = 0, 0
    for u in range(X_test.shape[0]):
        true = set(X_test[u].nonzero()[1])
        if not true:
            continue
        dcg = 0
        # Entries past position k are ignored so DCG and IDCG use the same cut-off.
        for rank, it in enumerate(recs[u][:k], 1):
            if it in true:
                dcg += 1 / np.log2(rank + 1)
        idcg = sum(1 / np.log2(i + 1) for i in range(1, min(len(true), k) + 1))
        total += (dcg / idcg if idcg > 0 else 0)
        users += 1
    # No user with held-out items: report 0.0 instead of dividing by zero.
    return total / users if users else 0.0


# 5. Training

In [11]:
# Seed NumPy's global RNG so the random hold-out drawn inside loo_split is
# repeatable and the metrics below can be reproduced on a fresh kernel.
np.random.seed(42)
X_train, X_test = loo_split(X)

# Train an EASE model
l2_reg = 200.0
B = train_ease(X_train, l2_reg)

# Score & recommend: mask items already present in the training row, then keep the 10 best per user
scores = X_train.dot(B)
scores[X_train.nonzero()] = -np.inf
recs   = np.argsort(-scores, axis=1)[:, :10]

# 6. Computing metrics

In [12]:
# Score the top-10 recommendations against the held-out items.
prec = precision_at_k(recs, X_test, k=10)
rec  = recall_at_k(recs, X_test, k=10)
mapk = map_at_k(recs, X_test, k=10)
f1   = f1_at_k(recs, X_test, k=10)
ndcg = ndcg_at_k(recs, X_test, k=10)

# Collect the report first, then emit it with a single print call.
report_lines = [
    f"Results for l2 = {l2_reg}:",
    f"  Precision@10: {prec:.4f}",
    f"  Recall@10:    {rec:.4f}",
    f"  MAP@10:       {mapk:.4f}",
    f"  F1@10:        {f1:.4f}",
    f"  NDCG@10:      {ndcg:.4f}",
]
print("\n".join(report_lines))

Results for l2 = 200.0:
  Precision@10: 0.0081
  Recall@10:    0.0806
  MAP@10:       0.0323
  F1@10:        0.0147
  NDCG@10:      0.0436


# 7. Example recommendations for a single user

In [17]:
# Lookup from catalog id to display name, restricted to catalog rows of type 'track'
track_rows = catalog[catalog['type'] == 'track']
track_name_map = track_rows.set_index('id')['name'].to_dict()

# Choose the user by row position in the matrix (any valid index into user_ids can be used here)
user_idx = 123
user_id = user_ids[user_idx]

# Column indices of the held-out item(s), mapped back to raw track ids
held_idx = X_test[user_idx].nonzero()[1]
held_track_ids = [track_ids[col] for col in held_idx]

# Column indices of the recommended items, mapped back to raw track ids
pred_idx = recs[user_idx]
pred_track_ids = [track_ids[col] for col in pred_idx]

# Print the ranked recommendations; ids missing from the name lookup are shown as "Unknown"
print(f"Top-10 recommendations for user {user_id}")
for rank, tid in enumerate(pred_track_ids, start=1):
    name = track_name_map.get(tid, "Unknown")
    print(f"{rank:2d}. [{tid}] {name}")

Top-10 recommendations for user 124
 1. [24692821] Way Down We Go
 2. [78426489] Rampampam
 3. [29569939] Tuesday
 4. [24663745] Ocean Drive
 5. [19200822] Imagination
 6. [17902234] Prayer in C
 7. [40330534] In the End
 8. [6679078] Intro
 9. [83698212] Rampampam
10. [55291388] Улетай на крыльях ветра



# 8. Save the final model

In [18]:
# Bundle the learned weight matrix with the id lookups, then pickle the bundle to disk.
model_artifacts = {
    'B': B,
    'user_map':  user_map,
    'track_map': track_map,
    'track_ids': track_ids
}
with open('ease_model.pkl', 'wb') as f:
    pickle.dump(model_artifacts, f)
print("Model saved to ease_model.pkl")

Model saved to ease_model.pkl
