# Optimized ItemKNN Recommender with NDCG Evaluation

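This notebook implements an item-based k-nearest-neighbour (ItemKNN) recommender built on cosine item-item similarity, evaluates it with NDCG@20 on a leave-last-item-out validation split, and writes the top-20 recommendations for every user to a file.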

In [None]:
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.preprocessing import normalize
import time

## 1. Data Loading

In [None]:
def load_data(file_path):
    """Load an interaction file into a sparse user-item matrix.

    Each non-empty line holds whitespace-separated integers: a user id
    followed by the ids of the items that user interacted with. Lines that
    contain only a user id (no items) are skipped.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A tuple ``(X, user_history)`` where ``X`` is a CSR matrix of shape
        ``(max_user_id + 1, max_item_id + 1)`` with one stored value per
        distinct (user, item) pair, and ``user_history`` maps each user id to
        the list of its item ids in file order.

    Raises:
        ValueError: If a token on a line is not an integer.
    """
    print("Loading data...")
    rows, cols = [], []
    user_history = {}
    max_user_id, max_item_id = 0, 0

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = [int(token) for token in line.split()]
            # Need at least a user id and one item id to record anything.
            if len(parts) < 2:
                continue
            user_id, items = parts[0], parts[1:]

            # Append rather than overwrite: if a user id shows up on several
            # lines the matrix receives all of them, so the history must too.
            user_history.setdefault(user_id, []).extend(items)
            max_user_id = max(max_user_id, user_id)
            max_item_id = max(max_item_id, *items)
            rows.extend([user_id] * len(items))
            cols.extend(items)

    # One unit of weight per listed interaction; csr_matrix sums repeated
    # (user, item) pairs into a single stored entry.
    data = [1] * len(rows)
    X = csr_matrix((data, (rows, cols)), shape=(max_user_id + 1, max_item_id + 1))
    print(f"Data loaded. Shape: {X.shape}, Non-zeros: {X.nnz}")
    return X, user_history

# Absolute path to the training interactions file.
# NOTE(review): this path is machine-specific — adjust it before running elsewhere.
train_file = '/Users/riteshsingh/Documents/SJSU/Recommender System/projectrec/train-2.txt'
# Build the full user-item matrix and the per-user item lists from that file.
X_full, user_history_full = load_data(train_file)

## 2. Train/Validation Split

In [None]:
def create_train_val_split(user_history):
    """Split each user's item list into a training part and a held-out item.

    Users with at least two items have their last item moved to the
    validation set; users with fewer keep everything in training and get an
    empty validation list.

    Args:
        user_history: Mapping from user id to that user's list of item ids.

    Returns:
        A tuple ``(X_train, train_hist, val_hist)``: the CSR matrix built from
        the training items, the per-user training lists, and the per-user
        validation lists.
    """
    train_hist, val_hist = {}, {}
    for user, items in user_history.items():
        can_hold_out = len(items) >= 2
        train_hist[user] = items[:-1] if can_hold_out else items
        val_hist[user] = [items[-1]] if can_hold_out else []

    # Flatten the training histories into parallel coordinate lists.
    interactions = [
        (user, item) for user, items in train_hist.items() for item in items
    ]
    rows = [user for user, _ in interactions]
    cols = [item for _, item in interactions]
    data = [1] * len(interactions)

    # Dimensions follow the largest ids seen, never dropping below 1 x 1.
    max_user = max([0, *train_hist])
    max_item = max([0, *cols])

    X_train = csr_matrix((data, (rows, cols)), shape=(max_user + 1, max_item + 1))
    return X_train, train_hist, val_hist

# Hold out the last item of every user with at least two items; the remaining
# items form the training matrix used for validation runs.
X_train, train_hist, val_hist = create_train_val_split(user_history_full)
print(f"Train: {X_train.shape}, nnz: {X_train.nnz}")

## 3. Compute Similarity Matrix

In [None]:
def _keep_top_k_per_row(S, k):
    """Return a CSR copy of ``S`` that keeps only the ``k`` largest stored values in each row."""
    pruned = S.tocsr(copy=True)
    indptr, data = pruned.indptr, pruned.data
    for row in range(pruned.shape[0]):
        lo, hi = indptr[row], indptr[row + 1]
        n_drop = (hi - lo) - k
        if n_drop > 0:
            # Slice is a view into ``data``, so zeroing it edits the matrix.
            row_vals = data[lo:hi]
            row_vals[np.argpartition(row_vals, n_drop)[:n_drop]] = 0
    pruned.eliminate_zeros()
    return pruned


def compute_similarity(X, k=200):
    """Compute a top-``k`` cosine item-item similarity matrix.

    Item columns of ``X`` are L2-normalised, so the product of the normalised
    matrix with itself yields cosine similarities between items. The diagonal
    (self-similarity) is removed, and for every item only its ``k`` most
    similar items are kept.

    Args:
        X: Sparse user-item interaction matrix (users in rows, items in
            columns).
        k: Number of neighbours to keep per item. ``None`` or a non-positive
            value disables pruning and returns the full similarity matrix.

    Returns:
        A sparse CSR item-item matrix whose column ``j`` holds the retained
        neighbours of item ``j``; ``X.dot(Sim)[u, j]`` therefore sums the
        similarities between ``j`` and those of its neighbours that appear in
        user ``u``'s row.
    """
    print("Computing similarity matrix...")
    start_time = time.time()

    X_norm = normalize(X, norm='l2', axis=0)
    Sim = X_norm.T.dot(X_norm).tocsr()
    Sim.setdiag(0)
    # setdiag leaves explicit zeros behind; drop them so they neither waste
    # memory nor count as neighbours during pruning.
    Sim.eliminate_zeros()

    if k is not None and k > 0:
        # The unpruned matrix is symmetric, so pruning rows and transposing
        # gives the per-column (per target item) neighbourhoods.
        Sim = _keep_top_k_per_row(Sim, k).T.tocsr()

    print(f"Similarity computed in {time.time() - start_time:.2f}s")
    return Sim

# Item-item similarity computed from the training split only, so the held-out
# validation items do not influence it.
Sim_train = compute_similarity(X_train)

## 4. NDCG Calculation

In [None]:
def dcg_at_k(relevances, k):
    """Return the discounted cumulative gain of the first ``k`` relevances.

    The gain at 1-based rank ``r`` is divided by ``log2(r + 1)``. An empty
    (or fully truncated) relevance list yields ``0.0``.
    """
    gains = np.array(relevances)[:k]
    n_ranked = gains.size
    if n_ranked == 0:
        return 0.0
    # Ranks 1..n map to discounts log2(2)..log2(n + 1).
    discounts = np.log2(np.arange(2, n_ranked + 2))
    return np.sum(gains / discounts)

def ndcg_at_k(recommended, relevant, k=20):
    """Return NDCG@k of a ranked recommendation list with binary relevance.

    Args:
        recommended: Ranked sequence of recommended item ids (best first).
        relevant: Iterable of item ids considered relevant.
        k: Cut-off rank.

    Returns:
        DCG of the first ``k`` recommendations divided by the DCG of an ideal
        ranking that places all relevant items first, or ``0.0`` when the
        ideal DCG is zero.
    """
    relevant_items = set(relevant)

    # Binary gain per recommended position: 1 for a hit, 0 otherwise.
    hits = [int(item in relevant_items) for item in recommended[:k]]

    # Ideal ranking: as many hits as fit in k positions, padded with misses.
    n_ideal_hits = min(len(relevant_items), k)
    ideal = [1] * n_ideal_hits + [0] * (k - n_ideal_hits)

    ideal_dcg = dcg_at_k(ideal, k)
    if ideal_dcg == 0:
        return 0.0
    return dcg_at_k(hits, k) / ideal_dcg

def evaluate_ndcg(X, Sim, train_hist, val_hist, k=20):
    """Compute the mean NDCG@k over all users that have validation items.

    Scores are ``X.dot(Sim)``. For each evaluated user the training items are
    masked out, the ``k`` highest-scoring items are taken as recommendations,
    and NDCG@k is computed against that user's validation items.

    Args:
        X: Sparse user-item matrix used to score items (users in rows).
        Sim: Sparse item-item similarity matrix.
        train_hist: Mapping from user id to the item ids to exclude from that
            user's recommendations.
        val_hist: Mapping from user id to the held-out item ids.
        k: Cut-off rank.

    Returns:
        The mean NDCG@k over evaluated users, or ``0.0`` when no user has
        validation items.
    """
    print(f"Evaluating NDCG@{k}...")

    n_users = X.shape[0]
    scores = X.dot(Sim)

    ndcg_scores = []

    for user in range(n_users):
        if user not in val_hist or len(val_hist[user]) == 0:
            continue

        user_scores = scores[user].toarray().flatten()

        # Items already seen in training must never be recommended.
        if user in train_hist:
            user_scores[train_hist[user]] = -np.inf

        # Indices of the k largest scores, best first.
        top_k_items = np.argsort(user_scores)[-k:][::-1]

        ndcg_scores.append(ndcg_at_k(top_k_items, val_hist[user], k))

    # np.mean of an empty list is NaN (with a RuntimeWarning); report 0.0
    # instead when there was nobody to evaluate.
    avg_ndcg = np.mean(ndcg_scores) if ndcg_scores else 0.0
    print(f"NDCG@{k}: {avg_ndcg:.4f} (evaluated {len(ndcg_scores)} users)")
    return avg_ndcg

# Validation NDCG@20 of the similarity model fitted on the training split.
ndcg_itemknn = evaluate_ndcg(X_train, Sim_train, train_hist, val_hist, k=20)

## 5. Train on Full Data and Generate Recommendations

In [None]:
# Refit the item-item similarity on the full interaction matrix (including the
# items that were held out for validation) before producing recommendations.
Sim_full = compute_similarity(X_full)

In [None]:
def generate_recommendations(X, Sim, user_history, output_file, top_k=20):
    """Write the best unseen items for every user to ``output_file``.

    Users are scored in batches as ``X[batch].dot(Sim)``. Items a user already
    has in ``X`` are masked out, and the remaining items are ranked by score.
    One line is written per user row of ``X``: the user index followed by up
    to ``top_k`` item ids, best first, separated by single spaces.

    Args:
        X: Sparse user-item matrix (users in rows, items in columns).
        Sim: Sparse item-item similarity matrix.
        user_history: Unused; kept so existing callers keep working.
        output_file: Path of the text file to write.
        top_k: Maximum number of items to recommend per user. Fewer are
            written when a user has fewer unseen items than ``top_k``.
    """
    print("Generating recommendations...")

    n_users = X.shape[0]
    batch_size = 1000

    with open(output_file, 'w', encoding='utf-8') as f:
        for start_idx in range(0, n_users, batch_size):
            end_idx = min(start_idx + batch_size, n_users)

            user_batch = X[start_idx:end_idx]
            scores_dense = user_batch.dot(Sim).toarray()
            # Integer score arrays cannot hold -inf, so promote them first.
            if not np.issubdtype(scores_dense.dtype, np.floating):
                scores_dense = scores_dense.astype(np.float64)

            # Mask items the user already interacted with.
            mask = user_batch.toarray() > 0
            scores_dense[mask] = -np.inf

            n_rows, n_items = scores_dense.shape
            # argpartition raises when asked for more entries than a row has,
            # so never request more than the number of items.
            k_eff = max(0, min(top_k, n_items))

            if k_eff > 0:
                rows = np.arange(n_rows)[:, None]
                # Unordered top-k first, then sort only those k per row.
                top_items = np.argpartition(scores_dense, -k_eff, axis=1)[:, -k_eff:]
                top_scores = scores_dense[rows, top_items]
                sort_ind = np.argsort(top_scores, axis=1)[:, ::-1]
                final_recs = top_items[rows, sort_ind]
                final_scores = top_scores[rows, sort_ind]
            else:
                final_recs = np.empty((n_rows, 0), dtype=np.intp)
                final_scores = np.empty((n_rows, 0))

            for i, u_id in enumerate(range(start_idx, end_idx)):
                # Drop masked entries that only made the cut because the user
                # has fewer than k_eff unseen items.
                recs = final_recs[i][final_scores[i] > -np.inf]
                f.write(f"{u_id} {' '.join(map(str, recs))}\n")

            if start_idx % 5000 == 0:
                print(f"Processed {start_idx} users...")

# Destination of the recommendation file.
# NOTE(review): this path is machine-specific — adjust it before running elsewhere.
output_file = '/Users/riteshsingh/Documents/SJSU/Recommender System/projectrec/output.txt'
# Score every user with the model fitted on the full data and write the result.
generate_recommendations(X_full, Sim_full, user_history_full, output_file)
print("Done!")

## 6. Results Summary

In [None]:
# Final report: the validation score measured on the train/validation split
# and the location of the recommendation file written above.
print("="*60)
print("ITEMKNN RECOMMENDER RESULTS")
print("="*60)
print(f"Validation NDCG@20: {ndcg_itemknn:.4f}")
print(f"Recommendations saved to: {output_file}")
print("="*60)