# Project 3: Recommendation System

### Question 13

In [18]:
from sklearn.datasets import load_svmlight_file
from sklearn.metrics import ndcg_score
import numpy as np

# Load the dataset for one fold
def load_one_file(data_path):
    X_train, y_train, qid_train = load_svmlight_file(str(data_path + 'train.txt'), query_id=True)
    X_test, y_test, qid_test = load_svmlight_file(str(data_path + 'test.txt'), query_id=True)
    y_train = y_train.astype(int)
    y_test = y_test.astype(int)
    _, group_train = np.unique(qid_train, return_counts=True)
    _, group_test = np.unique(qid_test, return_counts=True)
    return X_train, y_train, qid_train, group_train, X_test, y_test, qid_test, group_test

def ndcg_single_query(y_score, y_true, k):
    order = np.argsort(y_score)[::-1]
    y_true = np.take(y_true, order[:k])

    gain = 2 ** y_true - 1

    discounts = np.log2(np.arange(len(y_true)) + 2)
    return np.sum(gain / discounts)

# calculate NDCG score given a trained model 
def compute_ndcg_all(model, X_test, y_test, qids_test, k=10):
    unique_qids = np.unique(qids_test)
    ndcg_ = list()
    for i, qid in enumerate(unique_qids):
        y = y_test[qids_test == qid]

        if np.sum(y) == 0:
            continue

        p = model.predict(X_test[qids_test == qid])

        idcg = ndcg_single_query(y, y, k=k)
        ndcg_.append(ndcg_single_query(p, y, k=k) / idcg)
    return np.mean(ndcg_)

# get importance of features
def get_feature_importance(model, importance_type='gain'):
    return model.feature_importance(importance_type=importance_type)

In [8]:
import os
import numpy as np

data_dir = "MSLR-WEB10K/"

total_unique_queries = 0
total_relevance_label_distribution_train = np.zeros(5)
total_relevance_label_distribution_test = np.zeros(5)

for fold_num in range(1, 6):
    fold_path = os.path.join(data_dir, f"Fold{fold_num}/")
    # Load and preprocess the data for the current fold
    X_train, y_train, qid_train, group_train, X_test, y_test, qid_test, group_test = load_one_file(fold_path)

    unique_queries_train = np.unique(qid_train)
    unique_queries_test = np.unique(qid_test)
    total_unique_queries_fold = len(np.unique(np.concatenate((qid_train, qid_test))))
    print(f"Total number of unique queries in fold {fold_num}: {total_unique_queries_fold}")
    total_unique_queries += total_unique_queries_fold

    # distribution of relevance labels for the current fold
    relevance_label_distribution_train = np.bincount(y_train)
    relevance_label_distribution_test = np.bincount(y_test)
    print(f"Distribution of relevance labels in training data for fold {fold_num}: {relevance_label_distribution_train}")
    print(f"Distribution of relevance labels in test data for fold {fold_num}: {relevance_label_distribution_test}")

    # relevance label distributions across folds
    total_relevance_label_distribution_train += relevance_label_distribution_train
    total_relevance_label_distribution_test += relevance_label_distribution_test

# Print Results
print("\nAggregated results across all folds:")
print("Total number of unique queries:", total_unique_queries)
print("Total distribution of relevance labels in training data across all folds:", total_relevance_label_distribution_train)
print("Total distribution of relevance labels in test data across all folds:", total_relevance_label_distribution_test)

Total number of unique queries in fold 1: 8000
Distribution of relevance labels in training data for fold 1: [377957 232569  95082  12658   5146]
Distribution of relevance labels in test data for fold 1: [124784  77896  32459   4450   1932]
Total number of unique queries in fold 2: 8000
Distribution of relevance labels in training data for fold 2: [373029 230368  95117  12814   5355]
Distribution of relevance labels in test data for fold 2: [126450  78016  31875   4053   1594]
Total number of unique queries in fold 3: 8000
Distribution of relevance labels in training data for fold 3: [371725 232302  96663  12903   5518]
Distribution of relevance labels in test data for fold 3: [126088  75962  30913   4361   1769]
Total number of unique queries in fold 4: 8000
Distribution of relevance labels in training data for fold 4: [372756 231727  96244  12712   5329]
Distribution of relevance labels in test data for fold 4: [125419  78591  32294   4244   1783]
Total number of unique queries in fo

### Question 14

In [12]:
import lightgbm as lgb

data_dir = "MSLR-WEB10K/"

ndcg_3_scores = []
ndcg_5_scores = []
ndcg_10_scores = []

for fold_dir in os.listdir(data_dir):
    if os.path.isdir(os.path.join(data_dir, fold_dir)):
        # Load and preprocess the data for the current fold
        fold_data_dir = os.path.join(data_dir, f"Fold{fold_num}/")
        X_train, y_train, qid_train, group_train, X_test, y_test, qid_test, group_test = load_one_file(fold_data_dir)

        # Train a LightGBM model using the 'lambdarank' objective
        lgb_train = lgb.Dataset(X_train, label = y_train, group = group_train)
        lgb_test = lgb.Dataset(X_test, label = y_test, group = group_test)
        
        params = {
            'objective': 'lambdarank',
            'metric': 'ndcg',
            'ndcg_at': [3, 5, 10],
        }

        num_round = 100
        model = lgb.train(params, lgb_train, num_round, valid_sets = [lgb_test])

        # nDCG@3, nDCG@5, and nDCG@10
        ndcg_3 = compute_ndcg_all(model, X_test, y_test, qid_test, k = 3)
        ndcg_5 = compute_ndcg_all(model, X_test, y_test, qid_test, k = 5)
        ndcg_10 = compute_ndcg_all(model, X_test, y_test, qid_test, k = 10)

        ndcg_3_scores.append(ndcg_3)
        ndcg_5_scores.append(ndcg_5)
        ndcg_10_scores.append(ndcg_10)

# Print average nDCG scores across all folds
print("Average nDCG@3 across all folds:", np.mean(ndcg_3_scores))
print("Average nDCG@5 across all folds:", np.mean(ndcg_5_scores))
print("Average nDCG@10 across all folds:", np.mean(ndcg_10_scores))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.072168 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 25501
[LightGBM] [Info] Number of data points in the train set: 722602, number of used features: 136
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.073466 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 25501
[LightGBM] [Info] Number of data points in the train set: 722602, number of used features: 136
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.071583 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 25501
[LightGBM] [Info] Number of

### Question 15

In [19]:
data_dir = "MSLR-WEB10K/"

top_features_per_fold = []

# Loop through each fold directory
for fold_num in range(1, 6):
    fold_data_dir = os.path.join(data_dir, f"Fold{fold_num}/")
    
    # Load the dataset for the current fold
    X_train, y_train, qid_train, group_train, X_test, y_test, qid_test, group_test = load_one_file(fold_data_dir)
    
    # Train the LightGBM model with 'lambdarank' objective
    lgb_train = lgb.Dataset(X_train, label = y_train, group = group_train, free_raw_data = False)
    lgb_test = lgb.Dataset(X_test, label = y_test, group = group_test, free_raw_data = False)
    params = {
        'objective': 'lambdarank'
    }
    model = lgb.train(params, lgb_train, valid_sets=[lgb_test], valid_names = ['test'])
    importance_scores = get_feature_importance(model, importance_type = 'gain')

    # Sort
    sorted_indices = np.argsort(importance_scores)[::-1]
    
    # Top 5 most important features
    top_features = sorted_indices[:5]
    top_features_per_fold.append(top_features)

# Print the top 5 most important features for each fold
for fold_num, top_features in enumerate(top_features_per_fold, start = 1):
    print(f"Fold {fold_num}: Top 5 most important features")
    for i, feature_idx in enumerate(top_features, start = 1):
        print(f"   {i}. Feature {feature_idx}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.071146 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 25637
[LightGBM] [Info] Number of data points in the train set: 723412, number of used features: 136
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.083791 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 25623
[LightGBM] [Info] Number of data points in the train set: 716683, number of used features: 136
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.070493 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 25659
[LightGBM] [Info] Number of

### Question 16

In [44]:
from sklearn.metrics import ndcg_score
from sklearn.datasets import load_svmlight_file

params = {
    'objective': 'binary',
    'metric': 'ndcg',
    'verbosity': -1
}

data_dir = "MSLR-WEB10K/"

ndcg_scores = []

for fold_num in range(1, 6):
    fold_path = os.path.join(data_dir, f"Fold{fold_num}/")

    # Load the data for the current fold
    X_train, y_train, qid_train = load_svmlight_file(os.path.join(fold_path, 'train.txt'), query_id = True)
    X_test, y_test, qid_test = load_svmlight_file(os.path.join(fold_path, 'test.txt'), query_id = True)
    
    feature_importance_scores = get_feature_importance(model)

    # Filter out top feature indices
    top_feature_indices = [idx for idx in top_feature_indices if idx < X_train.shape[1]]

    X_train_csc = X_train.tocsc()
    X_test_csc = X_test.tocsc()
    
    # Remove top features
    X_train_filtered_csc = X_train_csc[:, [i for i in range(X_train_csc.shape[1]) if i not in top_feature_indices]]
    X_test_filtered_csc = X_test_csc[:, [i for i in range(X_test_csc.shape[1]) if i not in top_feature_indices]]
    
    X_train_filtered = X_train_filtered_csc.tocsr()
    X_test_filtered = X_test_filtered_csc.tocsr()

    group_train = np.bincount(qid_train)
    group_test = np.bincount(qid_test)
    
    # Create LightGBM datasets
    lgb_train = lgb.Dataset(X_train_filtered, y_train, group = group_train)
    lgb_test = lgb.Dataset(X_test_filtered, y_test, reference = lgb_train, group = group_test)

    # Train LightGBM model
    model = lgb.train(params, lgb_train, num_boost_round = 100)

    y_pred = model.predict(X_test_filtered)

    # Calculate nDCG score for this fold
    ndcg = ndcg_score([y_test], [y_pred], k = 10) 
    ndcg_scores.append(ndcg)

# Calculate average nDCG score across all folds
average_ndcg = np.mean(ndcg_scores)
print("Average nDCG score across all folds after removing top 20 features:", average_ndcg)

Type of X_train: <class 'scipy.sparse._csr.csr_matrix'>
Type of X_test: <class 'scipy.sparse._csr.csr_matrix'>
Type of X_train: <class 'scipy.sparse._csr.csr_matrix'>
Type of X_test: <class 'scipy.sparse._csr.csr_matrix'>
Type of X_train: <class 'scipy.sparse._csr.csr_matrix'>
Type of X_test: <class 'scipy.sparse._csr.csr_matrix'>
Type of X_train: <class 'scipy.sparse._csr.csr_matrix'>
Type of X_test: <class 'scipy.sparse._csr.csr_matrix'>
Type of X_train: <class 'scipy.sparse._csr.csr_matrix'>
Type of X_test: <class 'scipy.sparse._csr.csr_matrix'>
Average nDCG score across all folds after removing top 20 features: 0.522838100527449


In [48]:
from sklearn.metrics import ndcg_score
from sklearn.datasets import load_svmlight_file


params = {
    'objective': 'binary',
    'metric': 'ndcg',
    'verbosity': -1
}

data_dir = "MSLR-WEB10K/"

ndcg_scores = []

for fold_num in range(1, 6):
    fold_path = os.path.join(data_dir, f"Fold{fold_num}/")

    # Load the data for the current fold
    X_train, y_train, qid_train = load_svmlight_file(os.path.join(fold_path, 'train.txt'), query_id = True)
    X_test, y_test, qid_test = load_svmlight_file(os.path.join(fold_path, 'test.txt'), query_id = True)
        
    print("Type of X_train:", type(X_train))
    print("Type of X_test:", type(X_test))
    
    feature_importance_scores = get_feature_importance(model)

    # Filter out the least important feature indices
    least_important_indices = np.argsort(feature_importance_scores)[:60]

    X_train_csc = X_train.tocsc()
    X_test_csc = X_test.tocsc()
    
    # Remove least important features
    X_train_filtered_csc = X_train_csc[:, [i for i in range(X_train_csc.shape[1]) if i not in least_important_indices]]
    X_test_filtered_csc = X_test_csc[:, [i for i in range(X_test_csc.shape[1]) if i not in least_important_indices]]
    
    X_train_filtered = X_train_filtered_csc.tocsr()
    X_test_filtered = X_test_filtered_csc.tocsr()

    group_train = np.bincount(qid_train)
    group_test = np.bincount(qid_test)
    
    # Create LightGBM datasets 
    lgb_train = lgb.Dataset(X_train_filtered, y_train, group = group_train)
    lgb_test = lgb.Dataset(X_test_filtered, y_test, reference = lgb_train, group = group_test)

    # Train LightGBM model
    model = lgb.train(params, lgb_train, num_boost_round = 100) 
    y_pred = model.predict(X_test_filtered)

    # Calculate nDCG score for this fold
    ndcg = ndcg_score([y_test], [y_pred], k = 10) 
    ndcg_scores.append(ndcg)

# Calculate average nDCG score across all folds
average_ndcg = np.mean(ndcg_scores)
print("Average nDCG score across all folds after removing the least important 60 features:", average_ndcg)


Type of X_train: <class 'scipy.sparse._csr.csr_matrix'>
Type of X_test: <class 'scipy.sparse._csr.csr_matrix'>
Type of X_train: <class 'scipy.sparse._csr.csr_matrix'>
Type of X_test: <class 'scipy.sparse._csr.csr_matrix'>
Type of X_train: <class 'scipy.sparse._csr.csr_matrix'>
Type of X_test: <class 'scipy.sparse._csr.csr_matrix'>
Type of X_train: <class 'scipy.sparse._csr.csr_matrix'>
Type of X_test: <class 'scipy.sparse._csr.csr_matrix'>
Type of X_train: <class 'scipy.sparse._csr.csr_matrix'>
Type of X_test: <class 'scipy.sparse._csr.csr_matrix'>
Average nDCG score across all folds after removing the least important 60 features: 0.6634693900898593
