# Music Playlist Continuation Recommender System 
## CMPE 256 Group Project


In [None]:
import pandas as pd
import numpy as np
import os
import time
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from gensim.models import Word2Vec
from collections import Counter

## 1. Data Preprocessing

In [None]:
def load_and_process_data(input_path, output_dir):
    """Parse raw playlists, clean the interactions and write an 80/20 split.

    Every non-empty line of ``input_path`` is split on whitespace; the first
    token is used as the user id and each remaining token becomes one
    (user, item) interaction row.

    Cleaning, in order:
      1. convert both id columns to ``int`` (all-or-nothing, see below),
      2. drop repeated (user, item) pairs, keeping the first occurrence,
      3. drop rows with missing values,
      4. drop rows whose ``item_id`` equals ``0``.

    Three CSV files are written into ``output_dir`` (created if missing):
    ``interactions_full.csv``, ``train_interactions.csv`` and
    ``val_interactions.csv``.

    The split is per user and order based: the first ``int(0.8 * count)``
    interactions of a user go to train, the rest to validation. A user with
    a single interaction therefore ends up in validation only.

    Args:
        input_path: Path of the whitespace-separated raw interaction file.
        output_dir: Directory that receives the three CSV files.

    Returns:
        Tuple ``(train_df, val_df)`` with columns ``user_id`` and ``item_id``.
    """
    print(f"Loading data from {input_path}...")

    data = []
    with open(input_path, 'r') as f:
        for line in f:
            parts = line.split()
            if not parts:
                continue
            user_id = parts[0]
            data.extend([user_id, track_id] for track_id in parts[1:])

    df = pd.DataFrame(data, columns=['user_id', 'item_id'])

    print(f"Raw data shape: {df.shape}")

    # Convert both columns in a single astype call: it returns a new frame,
    # so a failure on either column leaves *both* as strings instead of
    # producing a frame with integer users and string items.
    try:
        df = df.astype({'user_id': int, 'item_id': int})
    except ValueError:
        print("Warning: Could not convert IDs to integers. Keeping as strings.")

    original_count = len(df)
    df = df.drop_duplicates(subset=['user_id', 'item_id'], keep='first')
    print(f"Removed {original_count - len(df)} duplicate interactions.")

    df = df.dropna()

    df = df[df['item_id'] != 0]

    print(f"Cleaned data shape: {df.shape}")

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    full_path = os.path.join(output_dir, 'interactions_full.csv')
    df.to_csv(full_path, index=False)
    print(f"Saved full interactions to {full_path}")

    print("Creating Train/Validation split...")

    # Work with standalone masks rather than temporary helper columns so the
    # cleaned frame is never mutated and nothing has to be dropped afterwards.
    per_user = df.groupby('user_id')
    rank = per_user.cumcount() + 1
    split_point = (per_user['item_id'].transform('count') * 0.8).astype(int)
    in_train = rank <= split_point

    train_df = df[in_train].copy()
    val_df = df[~in_train].copy()

    print(f"Train shape: {train_df.shape}")
    print(f"Validation shape: {val_df.shape}")

    train_path = os.path.join(output_dir, 'train_interactions.csv')
    val_path = os.path.join(output_dir, 'val_interactions.csv')

    train_df.to_csv(train_path, index=False)
    val_df.to_csv(val_path, index=False)
    print(f"Saved train split to {train_path}")
    print(f"Saved validation split to {val_path}")

    return train_df, val_df


INPUT_FILE = "train-2.txt"
OUTPUT_DIR = "data/interim"


# Prefer rebuilding the splits from the raw file; otherwise fall back to
# splits already present in OUTPUT_DIR. If neither is available, train_df
# and val_df are left undefined.
if not os.path.exists(INPUT_FILE):
    print(f"File {INPUT_FILE} not found. Assuming data is already processed in {OUTPUT_DIR}.")
    cached_train_csv = os.path.join(OUTPUT_DIR, 'train_interactions.csv')
    if os.path.exists(cached_train_csv):
        train_df = pd.read_csv(cached_train_csv)
        val_df = pd.read_csv(os.path.join(OUTPUT_DIR, 'val_interactions.csv'))
        print("Loaded processed data.")
else:
    train_df, val_df = load_and_process_data(INPUT_FILE, OUTPUT_DIR)

## 2. Model Implementation

In [None]:
class Recommender:
    """Interface implemented by every recommender in this notebook.

    Both methods raise ``NotImplementedError`` here and are meant to be
    overridden by concrete subclasses.
    """

    def fit(self, train_df):
        """Learn model state from the interaction frame ``train_df``."""
        raise NotImplementedError()

    def recommend(self, user_id, n=20, already_seen=None):
        """Return up to ``n`` item ids for ``user_id``, skipping ``already_seen``."""
        raise NotImplementedError()

class PopularityRecommender(Recommender):
    """Non-personalised baseline: recommend the most frequently seen items.

    Args:
        exclude_top_percent: Despite the name this is a *fraction* of the
            distinct items (``0.01`` removes the top 1%). That many items are
            cut from the head of the popularity ranking before recommending.
    """

    def __init__(self, exclude_top_percent=0.0):
        self.popular_items = []
        self.exclude_top_percent = exclude_top_percent

    def fit(self, train_df):
        """Rank items by how many rows of ``train_df`` mention them."""
        item_counts = train_df['item_id'].value_counts().reset_index()
        # Assign names explicitly: the labels produced by reset_index() on a
        # value_counts() result differ between pandas versions.
        item_counts.columns = ['item_id', 'count']
        item_counts = item_counts.sort_values('count', ascending=False)

        if self.exclude_top_percent > 0:
            num_exclude = int(len(item_counts) * self.exclude_top_percent)
            item_counts = item_counts.iloc[num_exclude:]

        self.popular_items = item_counts['item_id'].tolist()

    def recommend(self, user_id, n=20, already_seen=None):
        """Return the ``n`` most popular items not contained in ``already_seen``.

        ``user_id`` is accepted for interface compatibility only; the ranking
        is identical for every user. A non-positive ``n`` yields ``[]``.
        """
        # Without this guard the ``len(recs) == n`` stop test below can never
        # fire for n <= 0 and the entire catalogue would be returned.
        if n <= 0:
            return []
        if already_seen is None:
            already_seen = set()

        recs = []
        for item in self.popular_items:
            if item in already_seen:
                continue
            recs.append(item)
            if len(recs) >= n:
                break
        return recs

class ItemCFRecommender(Recommender):
    """Item-based collaborative filtering over a sparse user-item matrix.

    ``fit`` builds a user x item interaction matrix, computes an item x item
    similarity matrix, zeroes its diagonal and, when ``k_neighbors`` is a
    positive number, keeps only the ``k_neighbors`` largest similarities per
    item row. A user's score for an item is the product of the user's
    interaction row with the similarity matrix.

    Args:
        similarity_metric: ``'cosine'`` or ``'jaccard'``.
        k_neighbors: Neighbours kept per item; ``None`` or a non-positive
            value disables pruning (the similarity matrix then stays dense).

    NOTE: the similarity matrix is materialised densely (n_items x n_items)
    during ``fit`` before pruning, so memory grows quadratically with the
    number of distinct items.
    """

    _SUPPORTED_METRICS = ('cosine', 'jaccard')

    def __init__(self, similarity_metric='cosine', k_neighbors=50):
        self.similarity_metric = similarity_metric
        self.k_neighbors = k_neighbors
        self.item_sim_matrix = None
        self.user_item_matrix = None
        self.item_to_idx = {}
        self.idx_to_item = {}
        self.user_to_idx = {}

    def fit(self, train_df):
        """Build the interaction matrix and the (pruned) item similarities.

        Raises:
            ValueError: If ``similarity_metric`` is not supported.
        """
        # Fail early with a clear message; otherwise an unknown metric would
        # leave the similarity matrix as None and crash in fill_diagonal.
        if self.similarity_metric not in self._SUPPORTED_METRICS:
            raise ValueError(
                f"Unsupported similarity_metric {self.similarity_metric!r}; "
                f"expected one of {self._SUPPORTED_METRICS}"
            )

        users = train_df['user_id'].unique()
        items = train_df['item_id'].unique()

        self.user_to_idx = {u: i for i, u in enumerate(users)}
        self.item_to_idx = {item: i for i, item in enumerate(items)}
        self.idx_to_item = {i: item for item, i in self.item_to_idx.items()}

        rows = train_df['user_id'].map(self.user_to_idx)
        cols = train_df['item_id'].map(self.item_to_idx)
        data = np.ones(len(train_df))

        self.user_item_matrix = csr_matrix((data, (rows, cols)), shape=(len(users), len(items)))
        item_user_matrix = self.user_item_matrix.T

        if self.similarity_metric == 'cosine':
            sim = cosine_similarity(item_user_matrix, dense_output=True)
        else:
            from sklearn.metrics.pairwise import pairwise_distances
            # The jaccard metric is SciPy-backed and rejects sparse input,
            # so the boolean matrix has to be densified first.
            item_user_bool = item_user_matrix.toarray().astype(bool)
            sim = 1 - pairwise_distances(item_user_bool, metric='jaccard', n_jobs=-1)

        # An item must not count as its own neighbour.
        np.fill_diagonal(sim, 0)
        self.item_sim_matrix = sim

        if self.k_neighbors is not None and self.k_neighbors > 0:
            n_items = sim.shape[0]
            if self.k_neighbors < n_items:
                for row in sim:
                    # argpartition places the k largest values in the last k
                    # slots; every index before them is zeroed in place.
                    weakest = np.argpartition(row, -self.k_neighbors)[:-self.k_neighbors]
                    row[weakest] = 0
            self.item_sim_matrix = csr_matrix(sim)

    @staticmethod
    def _as_dense_row(scores):
        """Return ``scores`` (sparse or dense, 1-D or 1 x n) as a flat ndarray."""
        if hasattr(scores, 'toarray'):
            scores = scores.toarray()
        return np.asarray(scores).ravel()

    def _top_unseen(self, scores, seen, n):
        """Return up to ``n`` item ids with the highest scores, skipping ``seen``."""
        if n <= 0:
            return []
        # n + len(seen) candidates always contain n unseen items when enough
        # items exist; the extra 50 is kept as slack from the original code.
        k_cand = min(n + len(seen) + 50, len(scores))
        if k_cand == 0:
            return []
        candidates = np.argpartition(scores, -k_cand)[-k_cand:]
        best_first = candidates[np.argsort(scores[candidates])[::-1]]

        recs = []
        for idx in best_first:
            item = self.idx_to_item[idx]
            if item in seen:
                continue
            recs.append(item)
            if len(recs) >= n:
                break
        return recs

    def recommend(self, user_id, n=20, already_seen=None):
        """Return up to ``n`` recommendations for one user.

        Users that were not present in the training data get ``[]``.
        """
        if already_seen is None:
            already_seen = set()
        if user_id not in self.user_to_idx:
            return []
        u_idx = self.user_to_idx[user_id]
        # Use the sparse row's own .dot(): calling ndarray.dot() with a SciPy
        # sparse matrix is not a matrix product and broke this code path
        # whenever the similarity matrix had been pruned to CSR.
        scores = self.user_item_matrix[u_idx].dot(self.item_sim_matrix)
        return self._top_unseen(self._as_dense_row(scores), already_seen, n)

    def recommend_batch(self, user_ids, n=20, train_interactions=None):
        """Return ``{user_id: recommendations}`` for all known ``user_ids``.

        Args:
            user_ids: Users to score; unknown users are omitted from the result.
            n: Number of items per user.
            train_interactions: Optional mapping user_id -> collection of
                items to exclude for that user.
        """
        valid_users = [u for u in user_ids if u in self.user_to_idx]
        if not valid_users:
            return {}
        u_indices = [self.user_to_idx[u] for u in valid_users]
        scores = self.user_item_matrix[u_indices].dot(self.item_sim_matrix)

        results = {}
        for i, user_id in enumerate(valid_users):
            seen = train_interactions.get(user_id, set()) if train_interactions else set()
            results[user_id] = self._top_unseen(self._as_dense_row(scores[i]), seen, n)
        return results

class SVDRecommender(Recommender):
    """Latent-factor recommender based on a truncated SVD of the user-item matrix.

    ``fit`` factorises the sparse user x item interaction matrix with
    ``TruncatedSVD``; a user's score for an item is the dot product of the
    user's projected vector with the item's component vector.

    Args:
        n_components: Number of latent dimensions passed to ``TruncatedSVD``.
    """

    def __init__(self, n_components=50):
        self.n_components = n_components
        self.user_vecs = None
        self.item_vecs = None
        self.user_to_idx = {}
        self.item_to_idx = {}
        self.idx_to_item = {}

    def fit(self, train_df):
        """Build the interaction matrix and compute user / item factors."""
        users = train_df['user_id'].unique()
        items = train_df['item_id'].unique()
        self.user_to_idx = {u: i for i, u in enumerate(users)}
        self.item_to_idx = {item: i for i, item in enumerate(items)}
        self.idx_to_item = {i: item for item, i in self.item_to_idx.items()}
        rows = train_df['user_id'].map(self.user_to_idx)
        cols = train_df['item_id'].map(self.item_to_idx)
        data = np.ones(len(train_df))
        user_item_matrix = csr_matrix((data, (rows, cols)), shape=(len(users), len(items)))
        svd = TruncatedSVD(n_components=self.n_components, random_state=42)
        self.user_vecs = svd.fit_transform(user_item_matrix)
        # components_ is (n_components, n_items); transpose to one row per item.
        self.item_vecs = svd.components_.T

    def _top_unseen(self, scores, seen, n):
        """Return up to ``n`` item ids with the highest scores, skipping ``seen``.

        Shared by ``recommend`` and ``recommend_batch`` so that both paths
        select and order candidates in exactly the same way.
        """
        if n <= 0:
            return []
        k_cand = min(n + len(seen) + 50, len(scores))
        if k_cand == 0:
            return []
        candidates = np.argpartition(scores, -k_cand)[-k_cand:]
        best_first = candidates[np.argsort(scores[candidates])[::-1]]

        recs = []
        for idx in best_first:
            item = self.idx_to_item[idx]
            if item in seen:
                continue
            recs.append(item)
            if len(recs) >= n:
                break
        return recs

    def recommend(self, user_id, n=20, already_seen=None):
        """Return up to ``n`` recommendations for one user.

        Users that were not present in the training data get ``[]``.
        """
        if already_seen is None:
            already_seen = set()
        if user_id not in self.user_to_idx:
            return []
        u_idx = self.user_to_idx[user_id]
        scores = self.user_vecs[u_idx].dot(self.item_vecs.T)
        return self._top_unseen(scores, already_seen, n)

    def recommend_batch(self, user_ids, n=20, train_interactions=None):
        """Return ``{user_id: recommendations}`` for all known ``user_ids``.

        Args:
            user_ids: Users to score; unknown users are omitted from the result.
            n: Number of items per user.
            train_interactions: Optional mapping user_id -> collection of
                items to exclude for that user.
        """
        valid_users = [u for u in user_ids if u in self.user_to_idx]
        if not valid_users:
            return {}
        u_indices = [self.user_to_idx[u] for u in valid_users]
        scores = self.user_vecs[u_indices].dot(self.item_vecs.T)

        results = {}
        for i, user_id in enumerate(valid_users):
            seen = train_interactions.get(user_id, set()) if train_interactions else set()
            results[user_id] = self._top_unseen(scores[i], seen, n)
        return results

class Item2VecRecommender(Recommender):
    """Recommender that embeds items with Word2Vec trained on user histories.

    Each user's item list is treated as one "sentence" whose tokens are the
    stringified item ids. Recommendations are the items most similar to the
    set of items the user has already seen.

    Args:
        vector_size: Embedding dimensionality passed to ``Word2Vec``.
        window: Context window passed to ``Word2Vec``.
        min_count: Minimum token frequency passed to ``Word2Vec``.
    """

    def __init__(self, vector_size=100, window=5, min_count=1):
        self.vector_size = vector_size
        self.window = window
        self.min_count = min_count
        self.model = None
        # Maps a Word2Vec token back to the item id it was built from.
        self.token_to_item = {}

    def fit(self, train_df):
        """Train the Word2Vec model on one item sequence per user."""
        items_str = train_df['item_id'].astype(str)
        users = train_df['user_id']
        # Remember the original id behind every token so recommend() can
        # return ids in their original type instead of assuming integers.
        self.token_to_item = dict(zip(items_str.tolist(), train_df['item_id'].tolist()))
        temp_df = pd.DataFrame({'user': users, 'item': items_str})
        sentences = temp_df.groupby('user')['item'].apply(list).tolist()
        print(f"Training Word2Vec on {len(sentences)} playlists...")
        self.model = Word2Vec(sentences=sentences,
                              vector_size=self.vector_size,
                              window=self.window,
                              min_count=self.min_count,
                              workers=4,
                              sg=1,
                              seed=42)

    def recommend(self, user_id, n=20, already_seen=None):
        """Return up to ``n`` items similar to the user's ``already_seen`` items.

        ``user_id`` itself is not used: the query is built purely from
        ``already_seen``. Returns ``[]`` when ``n`` is not positive or when
        none of the seen items is in the model vocabulary.
        """
        if n <= 0:
            return []
        if already_seen is None:
            already_seen = set()
        user_history = [str(item) for item in already_seen if str(item) in self.model.wv]
        if not user_history:
            return []
        recs_tuples = self.model.wv.most_similar(positive=user_history, topn=n + len(already_seen) + 20)
        recs = []
        for item_str, _score in recs_tuples:
            if item_str in self.token_to_item:
                item = self.token_to_item[item_str]
            else:
                # Token unknown to the mapping (e.g. model attached without
                # fit()): keep the previous integer-id behaviour.
                item = int(item_str)
            if item in already_seen:
                continue
            recs.append(item)
            if len(recs) >= n:
                break
        return recs

class HybridRecommender(Recommender):
    """Blend several recommenders by weighted reciprocal-rank voting.

    ``models_with_weights`` is an iterable of ``(model, weight)`` pairs. Each
    sub-model proposes ``5 * n`` candidates; an item at 0-based rank ``r`` of
    a model's list earns ``weight / (r + 1)`` and the ``n`` items with the
    highest accumulated total are returned.
    """

    def __init__(self, models_with_weights):
        self.models_with_weights = models_with_weights

    def fit(self, train_df):
        """Fit every sub-model on the same training frame."""
        for model, _weight in self.models_with_weights:
            print(f"Fitting sub-model {type(model).__name__}...")
            model.fit(train_df)

    @staticmethod
    def _fuse(weighted_rankings, n):
        """Combine ``(ranking, weight)`` pairs and return the best ``n`` items."""
        totals = {}
        for ranking, weight in weighted_rankings:
            for position, item in enumerate(ranking, start=1):
                totals[item] = totals.get(item, 0.0) + weight * (1.0 / position)
        # sorted() is stable, so ties keep the order in which items first appeared.
        return sorted(totals, key=totals.get, reverse=True)[:n]

    def recommend(self, user_id, n=20, already_seen=None):
        """Return the fused top-``n`` list for a single user."""
        pool_size = n * 5
        proposals = (
            (model.recommend(user_id, n=pool_size, already_seen=already_seen), weight)
            for model, weight in self.models_with_weights
        )
        return self._fuse(proposals, n)

    def recommend_batch(self, user_ids, n=20, train_interactions=None):
        """Return ``{user_id: fused top-n list}`` for every id in ``user_ids``.

        Sub-models exposing ``recommend_batch`` are queried once for the whole
        batch; the others are queried user by user via ``recommend``.
        """
        pool_size = n * 5
        per_model = []
        for model, weight in self.models_with_weights:
            if not hasattr(model, 'recommend_batch'):
                print(f"Sequential predicting with {type(model).__name__}...")
                ranked = {}
                for uid in user_ids:
                    history = train_interactions.get(uid, set()) if train_interactions else set()
                    ranked[uid] = model.recommend(uid, n=pool_size, already_seen=history)
            else:
                print(f"Batch predicting with {type(model).__name__}...")
                ranked = model.recommend_batch(user_ids, n=pool_size, train_interactions=train_interactions)
            per_model.append((ranked, weight))

        fused = {}
        for user_id in user_ids:
            fused[user_id] = self._fuse(
                ((ranked.get(user_id, []), weight) for ranked, weight in per_model),
                n,
            )
        return fused

## 3. Evaluation

In [None]:
def precision_at_k(recommended, actual, k=20):
    """Share of the ``k`` recommendation slots that hold a relevant item.

    The denominator is always ``k``, even when fewer than ``k`` items were
    recommended. Returns ``0.0`` when ``actual`` is empty.
    """
    if actual:
        hits = set(actual).intersection(recommended[:k])
        return len(hits) / k
    return 0.0

def recall_at_k(recommended, actual, k=20):
    """Fraction of the distinct relevant items found in the top ``k``.

    Args:
        recommended: Ranked list of recommended item ids.
        actual: Collection of relevant item ids (duplicates are ignored).
        k: Cut-off applied to ``recommended``.

    Returns:
        A value in ``[0, 1]``; ``0.0`` when ``actual`` is empty.
    """
    if not actual:
        return 0.0
    # De-duplicate the ground truth as well: the hit count is a set size, so
    # dividing by len(actual) would under-report recall on repeated ids.
    relevant_items = set(actual)
    hits = relevant_items.intersection(recommended[:k])
    return len(hits) / len(relevant_items)

def ndcg_at_k(recommended, actual, k=20):
    """Binary-relevance NDCG of the top ``k`` recommendations.

    A hit at 0-based position ``i`` contributes ``1 / log2(i + 2)``. The
    ideal DCG assumes the first ``min(#distinct relevant, k)`` positions are
    all hits.

    Args:
        recommended: Ranked list of recommended item ids.
        actual: Collection of relevant item ids (duplicates are ignored).
        k: Cut-off applied to ``recommended``.

    Returns:
        A value in ``[0, 1]``; ``0.0`` when ``actual`` is empty or ``k`` is 0.
    """
    if not actual:
        return 0.0
    actual_set = set(actual)

    dcg = 0.0
    credited = set()
    for i, item in enumerate(recommended[:k]):
        # Credit each relevant item once; a repeated recommendation must not
        # be rewarded again, otherwise the score could exceed 1.
        if item in actual_set and item not in credited:
            credited.add(item)
            dcg += 1.0 / np.log2(i + 2)

    idcg = 0.0
    for i in range(min(len(actual_set), k)):
        idcg += 1.0 / np.log2(i + 2)

    return dcg / idcg if idcg > 0 else 0.0

def evaluate_model(model, train_df, val_df, model_name):
    """Fit ``model`` on ``train_df`` and score it against ``val_df``.

    Precision, recall and NDCG are computed at cut-off 20 for each validation
    user and then averaged. At most the first 10,000 validation users are
    evaluated. Items a user has in ``train_df`` are excluded from that user's
    recommendations. Models exposing ``recommend_batch`` are queried in
    batches of 1000 users; a user missing from a batch result is scored
    against an empty recommendation list.

    Returns:
        Dict with keys ``Model``, ``Precision@20``, ``Recall@20``,
        ``NDCG@20``, ``Train Time`` and ``Eval Time`` (times in seconds).
    """
    print(f"Training {model_name}...")
    fit_started = time.time()
    model.fit(train_df)
    train_time = time.time() - fit_started
    print(f"Training took {train_time:.2f}s")

    print(f"Evaluating {model_name}...")
    val_grouped = val_df.groupby('user_id')['item_id'].apply(list).to_dict()
    train_grouped = train_df.groupby('user_id')['item_id'].apply(set).to_dict()

    precisions, recalls, ndcgs = [], [], []

    def record(recs, truth):
        # Append one user's three metric values to the running lists.
        precisions.append(precision_at_k(recs, truth, k=20))
        recalls.append(recall_at_k(recs, truth, k=20))
        ndcgs.append(ndcg_at_k(recs, truth, k=20))

    users_to_eval = list(val_grouped.keys())
    if len(users_to_eval) > 10000:
        print(f"Limiting evaluation to 10,000 users (out of {len(users_to_eval)}) for speed.")
        users_to_eval = users_to_eval[:10000]

    eval_started = time.time()
    if not hasattr(model, 'recommend_batch'):
        for i, user_id in enumerate(users_to_eval):
            if i % 1000 == 0:
                print(f"Evaluated {i}/{len(users_to_eval)} users...")
            history = train_grouped.get(user_id, set())
            record(model.recommend(user_id, n=20, already_seen=history), val_grouped[user_id])
    else:
        print("Using batch recommendation...")
        batch_size = 1000
        for i in range(0, len(users_to_eval), batch_size):
            batch_users = users_to_eval[i:i + batch_size]
            batch_recs = model.recommend_batch(batch_users, n=20, train_interactions=train_grouped)
            for user_id in batch_users:
                record(batch_recs.get(user_id, []), val_grouped[user_id])
            if i % 5000 == 0:
                print(f"Evaluated {i}/{len(users_to_eval)} users...")
    eval_time = time.time() - eval_started

    metrics = {
        'Model': model_name,
        'Precision@20': np.mean(precisions),
        'Recall@20': np.mean(recalls),
        'NDCG@20': np.mean(ndcgs),
        'Train Time': train_time,
        'Eval Time': eval_time,
    }
    print(f"Results for {model_name}:")
    print(metrics)
    return metrics


# Evaluate the item-based CF model for each neighbourhood size and collect
# one metrics dict per run.
results = []
results.extend(
    evaluate_model(
        ItemCFRecommender(similarity_metric='cosine', k_neighbors=neighbors),
        train_df,
        val_df,
        label,
    )
    for neighbors, label in (
        (5, "ItemCF (Cosine, k=5)"),
        (10, "ItemCF (Cosine, k=10)"),
    )
)

results_df = pd.DataFrame(results)
print("\nFinal Evaluation Results:")
print(results_df)

## 4. Visualization

In [None]:
# Figure 1: horizontal bars of NDCG@20 per model, each annotated with its value.
ndcg_fig, ndcg_ax = plt.subplots(figsize=(12, 6))
bars = ndcg_ax.barh(results_df['Model'], results_df['NDCG@20'], color='skyblue')
ndcg_ax.set_xlabel('NDCG@20')
ndcg_ax.set_title('Model Comparison - NDCG@20')
ndcg_ax.grid(axis='x', linestyle='--', alpha=0.7)
for bar in bars:
    width = bar.get_width()
    mid_height = bar.get_y() + bar.get_height()/2
    ndcg_ax.text(width, mid_height, f'{width:.4f}', ha='left', va='center', fontsize=10)
ndcg_fig.tight_layout()
plt.show()

# Figure 2: precision and recall side by side for every model.
pr_fig, pr_ax = plt.subplots(figsize=(12, 6))
x = range(len(results_df))
width = 0.35
left_positions = [i - width/2 for i in x]
right_positions = [i + width/2 for i in x]
pr_ax.bar(left_positions, results_df['Precision@20'], width, label='Precision@20', color='lightgreen')
pr_ax.bar(right_positions, results_df['Recall@20'], width, label='Recall@20', color='salmon')
pr_ax.set_xlabel('Model')
pr_ax.set_ylabel('Score')
pr_ax.set_title('Model Comparison - Precision & Recall')
pr_ax.set_xticks(x)
pr_ax.set_xticklabels(results_df['Model'], rotation=45, ha='right')
pr_ax.legend()
pr_ax.grid(axis='y', linestyle='--', alpha=0.7)
pr_fig.tight_layout()
plt.show()

## 5. Final Submission Generation

In [None]:
def generate_submission(model_name, output_file='submission_improved.txt',
                        full_data_path='data/interim/interactions_full.csv'):
    """Train the chosen model family on all interactions and write top-20 lists.

    The model family is picked by keyword in ``model_name`` ('Popularity',
    'ItemCF', 'SVD', 'Item2Vec'); anything else falls back to item-based CF.
    NOTE: the hyperparameters used here are fixed in this function; any
    setting encoded in ``model_name`` beyond the family keyword is ignored.

    Output format: one header-less CSV row per user, ``user_id`` followed by
    20 item columns. Users with fewer than 20 recommendations get empty
    trailing fields.

    Args:
        model_name: Name used to select the model family.
        output_file: Destination path; missing parent directories are created.
        full_data_path: CSV with ``user_id`` / ``item_id`` columns to train on.
    """
    print(f"Generating recommendations using {model_name}...")
    full_df = pd.read_csv(full_data_path)

    if 'Popularity' in model_name:
        model = PopularityRecommender(exclude_top_percent=0.01)
    elif 'ItemCF' in model_name:
        model = ItemCFRecommender(similarity_metric='cosine', k_neighbors=5)
    elif 'SVD' in model_name:
        model = SVDRecommender(n_components=100)
    elif 'Item2Vec' in model_name:
        model = Item2VecRecommender(vector_size=100, window=10)
    else:
        model = ItemCFRecommender(similarity_metric='cosine', k_neighbors=5)

    print("Training on full dataset...")
    model.fit(full_df)

    users = full_df['user_id'].unique()
    user_history = full_df.groupby('user_id')['item_id'].apply(set).to_dict()

    print(f"Generating for {len(users)} users...")
    results = []

    if hasattr(model, 'recommend_batch'):
        batch_size = 1000
        users_list = list(users)
        for i in range(0, len(users_list), batch_size):
            batch_users = users_list[i:i+batch_size]
            batch_recs = model.recommend_batch(batch_users, n=20, train_interactions=user_history)
            for user_id, recs in batch_recs.items():
                results.append([user_id] + recs)
            if i % 10000 == 0:
                print(f"Processed {i}/{len(users)} users...")
    else:
        for i, user_id in enumerate(users):
            seen = user_history.get(user_id, set())
            recs = model.recommend(user_id, n=20, already_seen=seen)
            results.append([user_id] + recs)
            if i % 5000 == 0:
                print(f"Processed {i}/{len(users)} users...")

    cols = ['user_id'] + [f'item_{i+1}' for i in range(20)]
    max_len = len(cols)
    # Pad short rows so every row has exactly one value per column.
    cleaned_results = [row + [None] * (max_len - len(row)) for row in results]

    # dtype=object keeps ids exactly as produced: with inferred dtypes a
    # single padded None would upcast an integer column to float and the ids
    # would be written as e.g. "123.0".
    res_df = pd.DataFrame(cleaned_results, columns=cols, dtype=object)

    # to_csv does not create directories, so make sure the target exists.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    res_df.to_csv(output_file, index=False, header=False)
    print(f"Saved recommendations to {output_file}")


# Pick the model name from the row with the highest NDCG@20 and build the
# submission file with that model family.
best_model = results_df.loc[
    results_df['NDCG@20'].idxmax(),
    'Model',
]
print(f"Best model found: {best_model}")
generate_submission(best_model, output_file='output/submission_improved.txt')