# Alternating Least Squares
Last Updated: Thursday, November 20, 7:08PM

NDCG@20 (on 50%): 0.0372

## 1. Import and explore data

In [72]:
import numpy as np
import pandas as pd

# Load the interaction log from disk.
df = pd.read_csv("data/train_2_long.csv")

# Report the dimensions, then per-column dtypes and non-null counts.
print("Dataframe shape:", df.shape)
df.info(show_counts=True)

Dataframe shape: (2380730, 2)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2380730 entries, 0 to 2380729
Data columns (total 2 columns):
 #   Column   Non-Null Count    Dtype
---  ------   --------------    -----
 0   user_id  2380730 non-null  int64
 1   item_id  2380730 non-null  int64
dtypes: int64(2)
memory usage: 36.3 MB


In [73]:
# Drop fully duplicated rows and report the row count on either side.
len_before = len(df)
df = df.drop_duplicates()
len_after = len(df)

print(f"Number of entries before dropping duplicates: {len_before}")
print(f"Number of entries after dropping duplicates: {len_after}")

Number of entries before dropping duplicates: 2380730
Number of entries after dropping duplicates: 2380730


In [74]:
# Number of distinct values in each ID column.
n_unique_users = df["user_id"].nunique()
n_unique_items = df["item_id"].nunique()

print(f"There are {n_unique_users} unique users.")
print(f"There are {n_unique_items} unique items.")

There are 52643 unique users.
There are 91599 unique items.


In [75]:
# Interactions per user: value_counts() yields one count per distinct user_id.
user_interactions = df["user_id"].value_counts()

mean_items_per_user = int(user_interactions.mean())  # truncated, not rounded
user_stats = user_interactions.describe()

print(f"The average user interacted with {mean_items_per_user} items.\n")
print(f"User interactions stats:\n{user_stats}")

The average user interacted with 45 items.

User interactions stats:
count    52643.000000
mean        45.224056
std         77.958253
min         16.000000
25%         19.000000
50%         26.000000
75%         45.000000
max      10682.000000
Name: count, dtype: float64


In [76]:
# Interactions per item: value_counts() yields one count per distinct item_id.
item_interactions = df["item_id"].value_counts()
print(f"The average item has {int(item_interactions.mean())} interactions.\n")
# These are per-item statistics, so the label must say "Item", not "User".
print(f"Item interactions stats:\n{item_interactions.describe()}")

The average item has 25 interactions.

User interactions stats:
count    91599.000000
mean        25.990786
std         38.397318
min          1.000000
25%         10.000000
50%         15.000000
75%         28.000000
max       1741.000000
Name: count, dtype: float64


In [77]:
# The user with the highest interaction count, followed by all of that user's rows.
most_active_user = user_interactions.idxmax()
df.loc[df["user_id"].eq(most_active_user)]

Unnamed: 0,user_id,item_id
18999,150,9566
19000,150,60
19001,150,5357
19002,150,13171
19003,150,13172
...,...,...
29676,150,21470
29677,150,21471
29678,150,552
29679,150,21472


## 2. Prepare the data

### 2.1. Encoding user and items
`implicit` requires zero-based IDs

In [79]:
# Check ID ranges
for label, column in (("User", "user_id"), ("Item", "item_id")):
    id_min, id_max = df[column].min(), df[column].max()
    print(f"{label} ID min: {id_min}, max: {id_max}")

User ID min: 0, max: 52642
Item ID min: 6, max: 91604


In [80]:
# Map IDs to zero-based
# Sorting first makes the assignment deterministic: the smallest raw ID gets index 0.
sorted_user_ids = sorted(df["user_id"].unique())
sorted_item_ids = sorted(df["item_id"].unique())

user_map = dict(zip(sorted_user_ids, range(len(sorted_user_ids))))
item_map = dict(zip(sorted_item_ids, range(len(sorted_item_ids))))

### 2.2. Function to convert to sparse matrices

In [84]:
from scipy.sparse import coo_matrix

def df_to_sparse(df, user_index=None, item_index=None):
    """Convert an interaction dataframe into a (users x items) sparse matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``user_id`` and ``item_id`` columns; each row is one
        interaction.
    user_index, item_index : mapping, optional
        Raw ID -> zero-based row / column index. When omitted, the
        notebook-level ``user_map`` / ``item_map`` are used.

    Returns
    -------
    scipy.sparse.coo_matrix
        Matrix of shape ``(len(user_index), len(item_index))`` holding a
        float64 ``1.0`` for every row of ``df``.

    Raises
    ------
    KeyError
        If ``df`` contains a user or item ID that is absent from the mapping.
    """
    if user_index is None:
        user_index = user_map
    if item_index is None:
        item_index = item_map

    rows = df["user_id"].map(user_index)
    cols = df["item_id"].map(item_index)

    # Series.map yields NaN for IDs missing from the mapping; fail loudly here
    # instead of handing NaN coordinates to the sparse constructor.
    if rows.isna().any() or cols.isna().any():
        raise KeyError("df contains user_id/item_id values missing from the ID maps")

    vals = np.ones(len(df))

    matrix = coo_matrix(
        (vals, (rows.to_numpy(dtype=np.int64), cols.to_numpy(dtype=np.int64))),
        shape=(len(user_index), len(item_index)),
    )

    return matrix

## 3. Split data into training and testing
using `sklearn.model_selection.train_test_split`

* Split user-item interactions into training and testing.
* Ensures a more realistic evaluation when generalizing to new users or items.

For every user:
* Perform `train_test_split` on the items it has interacted with.

In [94]:
from sklearn.model_selection import train_test_split

# Group the interactions by user so the hold-out is drawn per user.
user_groups = df.groupby("user_id")

# Accumulators for the per-user pieces.
train_pairs = []
test_pairs = []

for _, user_rows in user_groups:
    # Fixed seed: the same 80/20 partition of this user's rows on every run.
    kept, held_out = train_test_split(user_rows, test_size=0.2, random_state=17)
    train_pairs.append(kept)
    test_pairs.append(held_out)

In [100]:
# Stack the per-user pieces back into two dataframes.
train_df, test_df = pd.concat(train_pairs), pd.concat(test_pairs)

# Report both shapes.
print(train_df.shape, test_df.shape)

# Sanity check: the two parts together must account for every row of df.
total_rows = len(train_df) + len(test_df)
print(total_rows == len(df))

(1881725, 2) (499005, 2)
True


## 4. Train ALS model
Alternating Least Squares (ALS) is a matrix factorization algorithm which typically works well for implicit feedback data, such as user-item interactions, as seen in this [Netflix Prize and SVD](http://buzzard.ups.edu/courses/2014spring/420projects/math420-UPS-spring-2014-gower-netflix-SVD.pdf) paper.

### 4.1. Create csr_matrix representations for train and test set

In [104]:
# Training interactions: build the COO matrix, then convert it to CSR.
train_coo = df_to_sparse(train_df)
train_csr = train_coo.tocsr()
train_csr

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 1881725 stored elements and shape (52643, 91599)>

In [109]:
# Held-out interactions: build the COO matrix, then convert it to CSR.
test_coo = df_to_sparse(test_df)
test_csr = test_coo.tocsr()
test_csr

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 499005 stored elements and shape (52643, 91599)>

### 4.2. Train model
Hyperparameters used for initial training:
* `factors` = 50: the number of latent factors in the decomposition
* `regularization` = 0.1: the regularization parameter ($\lambda$)
* `iterations` = 20: the number of training iterations
* `alpha` = 1.0: the weights given for positive interactions
* `use_cg` = True: use the faster conjugate-gradient solver



In [None]:
from implicit.als import AlternatingLeastSquares

# Initial (untuned) hyperparameters, collected in one place.
als_params = dict(
    factors=50,                     # number of latent factors
    regularization=0.1,             # regularization strength (lambda)
    iterations=20,                  # number of training iterations
    alpha=1.0,                      # weight given to positive interactions
    use_cg=True,                    # use the faster solver
    calculate_training_loss=True,
    random_state=17,                # fixed seed
)

model = AlternatingLeastSquares(**als_params)
model.fit(train_csr, show_progress=True)

  0%|          | 0/20 [00:00<?, ?it/s]

In [226]:
# Check matrix decomposition shapes
for label, factor_matrix in (("User", model.user_factors), ("Item", model.item_factors)):
    print(f"{label} factors shape:", factor_matrix.shape)

User factors shape: (52643, 50)
Item factors shape: (91599, 50)


## 5. Recommend top 20 items for users in test set

In [None]:
K = 20  # cut-off used for the recommendation lists and for NDCG@K

# Top-K recommendations for every row of train_csr, asking the model to
# filter out items the user already interacted with in training.
recommended_items = model.recommend_all(
    train_csr,
    N=K,
    filter_already_liked_items=True,
)

In [None]:
# One recommendation list is expected per distinct user in the data.
n_expected_users = df["user_id"].nunique()
len(recommended_items) == n_expected_users

True

## 6. Evaluate with NDCG@20
For recommender systems:
1. Create recommendations using `train_csr`
2. Compare the recommendations (predictions) against `test_csr` (hold-out/validation set)
3. Perform hyperparameter tuning on the model using the same steps above to find the best parameters for the model.
4. Re-train the model with the entire dataset given the best hyperparameter values
5. Generate recommendations from this fully-trained model.

In [229]:
import numpy as np
from sklearn.metrics import ndcg_score

def batched_ndcg_at_k(recommended_items, test_matrix, k=20, batch_size=1000):
    """Mean NDCG@k over all users (rows of ``test_matrix``), computed in batches.

    Parameters
    ----------
    recommended_items : sequence of integer sequences
        ``recommended_items[u]`` holds the recommended item (column) indices
        for user row ``u``, best first. Only the first ``k`` are used.
    test_matrix : sparse matrix
        Held-out relevance, one row per user and one column per item. Row
        slices are densified with ``.toarray()``, ``batch_size`` rows at a time.
    k : int
        Ranking cut-off.
    batch_size : int
        Number of users densified at once (bounds peak memory).

    Returns
    -------
    float
        Per-user average NDCG@k; ``nan`` when ``test_matrix`` has no rows.
    """
    num_users = test_matrix.shape[0]
    if num_users == 0:
        return float("nan")

    total_ndcg = 0.0

    for start in range(0, num_users, batch_size):
        end = min(start + batch_size, num_users)

        # Extract true relevance for the batch
        true_batch = test_matrix[start:end].toarray()

        # Always a float buffer, so the fractional rank weights below are not
        # truncated to zero when the test matrix has an integer dtype.
        pred_batch = np.zeros(true_batch.shape, dtype=np.float64)
        for i, user_idx in enumerate(range(start, end)):
            top_items = np.asarray(recommended_items[user_idx], dtype=np.intp)[:k]
            # Size the weights from the actual list length, so a user with
            # fewer than k recommendations does not trigger a shape mismatch.
            rank_weights = 1.0 / np.arange(1, len(top_items) + 1)
            # A negative index would silently wrap around to the last columns;
            # skip such entries (presumably padding — confirm with the recommender).
            valid = top_items >= 0
            pred_batch[i, top_items[valid]] = rank_weights[valid]

        # ndcg_score returns the batch mean; re-weight it by the batch size so
        # the smaller final batch does not get the same weight as a full one.
        batch_ndcg = ndcg_score(true_batch, pred_batch, k=k)
        total_ndcg += batch_ndcg * (end - start)

    return total_ndcg / num_users

# Score the held-out interactions against the recommendation lists.
mean_ndcg = batched_ndcg_at_k(recommended_items, test_csr, k=K, batch_size=1000)
# Take the cut-off in the label from K so the printed metric name cannot
# drift from the cut-off that was actually evaluated.
print(f"Mean NDCG@{K} = {mean_ndcg:.4f}")


Mean NDCG@20 = 0.0460


## 7. Hyperparameter tuning
Tune the hyperparameters introduced in Section 4.2.

In [232]:
import itertools

from joblib import Parallel, delayed

In [236]:
def evaluate_model(factors, reg, iters, alpha, show_progress=False):
    """Fit ALS with the given hyperparameters and score it with NDCG@K.

    The model is trained on the notebook-level ``train_csr`` and evaluated
    against ``test_csr`` using the notebook-level cut-off ``K``.

    Parameters
    ----------
    factors, reg, iters, alpha
        Passed to ``AlternatingLeastSquares`` as ``factors``,
        ``regularization``, ``iterations`` and ``alpha``.
    show_progress : bool, optional
        Forwarded to ``model.fit``. Off by default: when many evaluations run
        in parallel, their progress bars interleave into unreadable output.

    Returns
    -------
    tuple
        ``(factors, reg, iters, alpha, ndcg)``.
    """
    model = AlternatingLeastSquares(
        factors=factors,
        regularization=reg,
        iterations=iters,
        alpha=alpha,
        use_cg=True,
        random_state=17,        # same seed for every candidate so runs are comparable
    )

    model.fit(train_csr, show_progress=show_progress)

    recs = model.recommend_all(train_csr, N=K, filter_already_liked_items=True)
    ndcg = batched_ndcg_at_k(recs, test_csr, k=K, batch_size=1000)

    return (factors, reg, iters, alpha, ndcg)

In [237]:
# Search space: one list of candidate values per ALS hyperparameter.
param_grid = dict(
    factors=[64, 128, 256],
    regularization=[0.01, 0.05],
    iterations=[15, 30, 50, 100],
    alpha=[20, 40, 50],
)

# Cartesian product, with each tuple ordered as
# (factors, regularization, iterations, alpha).
grid_keys = ("factors", "regularization", "iterations", "alpha")
param_combos = list(itertools.product(*(param_grid[key] for key in grid_keys)))


In [238]:
# Parallel grid search for parameters
# One delayed evaluate_model call per hyperparameter combination.
search_jobs = (delayed(evaluate_model)(*combo) for combo in param_combos)
results = Parallel(n_jobs=-1, verbose=10)(search_jobs)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 10 concurrent workers.
100%|██████████| 15/15 [00:15<00:00,  1.06s/it]]
100%|██████████| 15/15 [00:15<00:00,  1.06s/it]
100%|██████████| 15/15 [00:16<00:00,  1.09s/it]]
100%|██████████| 30/30 [00:32<00:00,  1.09s/it]]
100%|██████████| 30/30 [00:33<00:00,  1.10s/it]]
100%|██████████| 30/30 [00:33<00:00,  1.11s/it]
100%|██████████| 50/50 [01:01<00:00,  1.23s/it]]
100%|██████████| 50/50 [01:01<00:00,  1.23s/it]
100%|██████████| 50/50 [01:01<00:00,  1.23s/it]]
100%|██████████| 100/100 [02:13<00:00,  1.34s/it]
100%|██████████| 15/15 [00:19<00:00,  1.30s/it]]
 36%|███▌      | 36/100 [00:48<01:15,  1.18s/it][Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed: 24.6min
100%|██████████| 15/15 [00:18<00:00,  1.22s/it]]
100%|██████████| 15/15 [00:18<00:00,  1.21s/it]]
100%|██████████| 30/30 [00:34<00:00,  1.17s/it]]
100%|██████████| 30/30 [00:36<00:00,  1.21s/it]]
100%|██████████| 30/30 [00:36<00:00,  1.20s/it]]
100%|██████████| 50/50 [00:59<0

In [None]:
# Display table of grid search
# Column order mirrors the tuples returned by evaluate_model.
result_columns = ["factors", "reg", "iterations", "alpha", "ndcg"]
pd.DataFrame(results, columns=result_columns)

In [241]:
# Get best parameters
# Each result is (factors, reg, iterations, alpha, ndcg); keep the one with the highest ndcg.
best_params = max(results, key=lambda result: result[4])
best_factors, best_reg, best_iters, best_alpha, best_ndcg = best_params

print("Best parameters:")
print(f"factors={best_factors}, reg={best_reg}, iterations={best_iters}, alpha={best_alpha}")
print(f"Best NDCG@20={best_ndcg:.4f}")


Best parameters:
factors=256, reg=0.05, iterations=50, alpha=40
Best NDCG@20=0.1107


## 9. Train model on full dataset with the best hyperparameters.

In [None]:
# Best hyperparameters, copied from the grid-search result printed above
# (factors, regularization, iterations, alpha).
FACTORS, REGULARIZATION, ITERATIONS, ALPHA = 256, 0.05, 50, 40

In [247]:
# Create full dataset csr_matrix (train + test interactions together)
full_csr = df_to_sparse(df).tocsr()

# Create model with best hyperparameters
model = AlternatingLeastSquares(
    factors=FACTORS,
    regularization=REGULARIZATION,
    iterations=ITERATIONS,
    alpha=ALPHA,
    use_cg=True,
    calculate_training_loss=True,
    random_state=17,    # same seed as the validation runs, so the final output is reproducible
)

# Train model
model.fit(full_csr, show_progress=True)

# Generate the top-K recommendations for every user row of full_csr
recommendations = model.recommend_all(full_csr, N=K, filter_already_liked_items=True)

  0%|          | 0/50 [00:00<?, ?it/s]

## 10. Write output file

In [248]:
# Create reverse maps for user_idx and item_idx
# Invert each ID map: zero-based index -> original ID.
reverse_user_map = dict(zip(user_map.values(), user_map.keys()))
reverse_item_map = dict(zip(item_map.values(), item_map.keys()))

In [249]:
output_file = "als_tuned.txt"

# One line per user: the original user ID followed by the recommended
# original item IDs, all separated by single spaces.
with open(output_file, "w") as f:
    for user_idx, item_indices in enumerate(recommendations):
        user_id = reverse_user_map[user_idx]
        item_ids = " ".join(str(reverse_item_map[i]) for i in item_indices)
        f.write(str(user_id) + " " + item_ids + "\n")