# Recommender System (ItemKNN)

This notebook implements an Item-based K-Nearest Neighbors recommender system.

In [1]:
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.preprocessing import normalize
import sys
import time

## Data Loading
Function to load the dataset into a sparse CSR matrix.

In [2]:
def load_data(file_path):
    print("Loading data...")
    rows = []
    cols = []
    data = []
    
    max_user_id = 0
    max_item_id = 0
    
    with open(file_path, 'r') as f:
        for line in f:
            parts = list(map(int, line.strip().split()))
            if not parts:
                continue
            user_id = parts[0]
            items = parts[1:]
            
            if not items:
                continue
                
            max_user_id = max(max_user_id, user_id)
            for item_id in items:
                rows.append(user_id)
                cols.append(item_id)
                data.append(1)
                max_item_id = max(max_item_id, item_id)
                
    # Create CSR matrix
    # Shape is (max_user_id + 1, max_item_id + 1)
    X = csr_matrix((data, (rows, cols)), shape=(max_user_id + 1, max_item_id + 1))
    print(f"Data loaded. Shape: {X.shape}, Non-zeros: {X.nnz}")
    return X

## Similarity Computation
Compute item-item cosine similarity.

In [3]:
def compute_similarity(X, k=200):
    print("Computing similarity matrix...")
    # Normalize rows of X^T (which are items) to compute cosine similarity
    # Cosine similarity between item i and j is (X_i . X_j) / (|X_i| |X_j|)
    # This is equivalent to normalizing columns of X, then computing X^T X
    
    # We want item-item similarity.
    # X is User x Item.
    # Normalize columns (items) to unit norm
    X_norm = normalize(X, norm='l2', axis=0) # Normalize columns
    
    start_time = time.time()
    # Transpose X_norm to get (I, U)
    X_T = X_norm.T
    
    # Sim = X_T * X_norm
    # This computes cosine similarity
    Sim = X_T.dot(X_norm)
    
    # Zero out diagonal
    Sim.setdiag(0)
    
    print(f"Similarity computed. Shape: {Sim.shape}, Non-zeros: {Sim.nnz}")
    print(f"Time taken: {time.time() - start_time:.2f}s")
    
    return Sim

## Recommendation Generation
Generate top-k recommendations for each user.

In [4]:
def generate_recommendations(X, Sim, output_file, top_k=20):
    print("Generating recommendations...")
    # Scores = X * Sim
    # X is (U, I), Sim is (I, I) -> Scores is (U, I)
    
    n_users = X.shape[0]
    batch_size = 1000
    
    with open(output_file, 'w') as f:
        for start_idx in range(0, n_users, batch_size):
            end_idx = min(start_idx + batch_size, n_users)
            
            # Get user batch
            user_batch = X[start_idx:end_idx]
            
            # Compute scores
            scores_batch = user_batch.dot(Sim)
            
            # Convert to dense for sorting (batch is small enough)
            scores_dense = scores_batch.toarray()
            
            # Mask out items already interacted with
            mask = user_batch.toarray() > 0
            scores_dense[mask] = -np.inf
            
            # Get top K
            top_items = np.argpartition(scores_dense, -top_k, axis=1)[:, -top_k:]
            
            # The top_items are not sorted, so we need to sort them
            rows = np.arange(scores_dense.shape[0])[:, None]
            top_scores = scores_dense[rows, top_items]
            
            # Sort indices based on scores descending
            sort_ind = np.argsort(top_scores, axis=1)[:, ::-1]
            
            final_recs = top_items[rows, sort_ind]
            
            # Write to file
            for i, u_id in enumerate(range(start_idx, end_idx)):
                recs = final_recs[i]
                # Format: UserID item1 item2 ... item20
                f.write(f"{u_id} {' '.join(map(str, recs))}\n")
                
            if start_idx % 5000 == 0:
                print(f"Processed {start_idx} users...")

## Execution
Run the pipeline.

In [None]:
input_file = 'train-2.txt'
output_file = 'output_KNN.txt'

X = load_data(input_file)
Sim = compute_similarity(X)
generate_recommendations(X, Sim, output_file)
print("Done!")

Loading data...
Data loaded. Shape: (52643, 91605), Non-zeros: 2380730
Computing similarity matrix...
Similarity computed. Shape: (91605, 91605), Non-zeros: 330335859
Time taken: 9.70s
Generating recommendations...
Processed 0 users...
Processed 5000 users...
Processed 10000 users...
Processed 15000 users...
Processed 20000 users...
Processed 25000 users...
Processed 30000 users...
Processed 35000 users...
Processed 40000 users...
Processed 45000 users...
Processed 50000 users...
Done!


## Verification
Check the output file for correctness.

In [6]:
def verify_output(output_file, train_file):
    print("Verifying output...")
    
    # Load training data to check for duplicates
    train_interactions = {}
    with open(train_file, 'r') as f:
        for line in f:
            parts = list(map(int, line.strip().split()))
            if not parts: continue
            u_id = parts[0]
            items = set(parts[1:])
            train_interactions[u_id] = items
            
    num_users = 0
    with open(output_file, 'r') as f:
        for line in f:
            parts = list(map(int, line.strip().split()))
            if not parts: continue
            
            u_id = parts[0]
            recs = parts[1:]
            
            if len(recs) != 20:
                print(f"Error: User {u_id} has {len(recs)} recommendations instead of 20.")
                return
            
            if len(set(recs)) != 20:
                print(f"Error: User {u_id} has duplicate recommendations.")
                return
                
            # Check if recommended items are in training set
            if u_id in train_interactions:
                train_items = train_interactions[u_id]
                for item in recs:
                    if item in train_items:
                        print(f"Error: User {u_id} recommended item {item} which is in training set.")
                        return
            
            num_users += 1
            
    print(f"Verification successful! Verified {num_users} users.")

verify_output(output_file, input_file)

Verifying output...
Verification successful! Verified 52643 users.
