In [1]:
# Import the required libraries
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle

In [2]:
# Load the data into pandas dataframes. The data has to be manually saved to a folder called 'data'.
# Note: the data is quite large, so this may take a while (~40 seconds) if the data is read as a csv. To speed up further file reading, it is converted to pickle format.
if (os.path.exists(os.path.join('data', 'training_set_VU_DM.pickle'))
    & os.path.exists((os.path.join('data', 'test_set_VU_DM.pickle')))):
    train_df = pd.read_pickle(os.path.join('data', 'training_set_VU_DM.pickle'))
    test_df = pd.read_pickle(os.path.join('data', 'test_set_VU_DM.pickle'))
else:
    train_df = pd.read_csv(os.path.join('data', 'training_set_VU_DM.csv'))
    test_df = pd.read_csv(os.path.join('data', 'test_set_VU_DM.csv'))
    train_df.to_pickle(os.path.join('data', 'training_set_VU_DM.pickle'))
    test_df.to_pickle(os.path.join('data', 'test_set_VU_DM.pickle'))

# train_df = pd.read_csv(os.path.join('data', 'dummy_training.csv'))
# test_df = pd.read_csv(os.path.join('data', 'dummy_testing.csv'))

In [3]:
print(f"Period of data collection: {pd.to_datetime(train_df['date_time']).min().strftime('%Y/%m/%d')} - {pd.to_datetime(train_df['date_time']).max().strftime('%Y/%m/%d')}")
print(f"Train data contains {train_df.shape[0]:,} rows and {train_df.shape[1]} columns")
print(f"Test data contains {test_df.shape[0]:,} rows and {test_df.shape[1]} columns")
print()
print(f"Train data:")
print(f"Number of unique search IDs: {len(train_df['srch_id'].unique()):,}")
print(f"Number of unique property IDs: {len(train_df['prop_id'].unique()):,}")
print(f"Number of clicks per search: avg. {train_df['click_bool'].sum() / len(train_df['srch_id'].unique()):.2f}, std. {train_df['click_bool'].std():.2f}")
print(f"Number of bookings per search: avg. {train_df['booking_bool'].sum() / len(train_df['srch_id'].unique()):.2f}, std. {train_df['booking_bool'].std():.2f}")
print()
print(f"Test data:")
print(f"Number of unique search IDs: {len(test_df['srch_id'].unique()):,}")
print(f"Number of unique property IDs: {len(test_df['prop_id'].unique()):,}")

Period of data collection: 2012/11/01 - 2013/06/30
Train data contains 4,958,347 rows and 54 columns
Test data contains 4,959,183 rows and 50 columns

Train data:
Number of unique search IDs: 199,795
Number of unique property IDs: 129,113
Number of clicks per search: avg. 1.11, std. 0.21
Number of bookings per search: avg. 0.69, std. 0.16

Test data:
Number of unique search IDs: 199,549
Number of unique property IDs: 129,438


In [4]:
def create_features(df):
    """
    Create the following new features: has_starrating, has_review_score, traveling_abroad, srch_prop_country_match, month, and day_of_week
    """
    # has_starrating: boolean whether prop_starrating is 0 or null
    df["has_starrating"] = df["prop_starrating"].isnull()
    df["has_starrating"] = df["has_starrating"].astype(int)
    df.loc[df["prop_starrating"] == 0, "has_starrating"] = 1

    # has_review_score: boolean whether prop_review_score is 0 or null
    df["has_review_score"] = df["prop_review_score"].isnull()
    df["has_review_score"] = df["has_review_score"].astype(int)
    df.loc[df["prop_review_score"] == 0, "has_review_score"] = 1

    # traveling_abroad: boolean whether visitor_location_country_id != prop_country_id
    df["traveling_abroad"] = df["visitor_location_country_id"] != df["prop_country_id"]
    df["traveling_abroad"] = df["traveling_abroad"].astype(int)

    # srch_prop_country_match: boolean whether srch_destination_id == prop_country_id
    df["srch_prop_country_match"] = df["srch_destination_id"] == df["prop_country_id"]
    df["srch_prop_country_match"] = df["srch_prop_country_match"].astype(int)

    # month: month of the search, one-hot encoded
    df["date_time"] = pd.to_datetime(df["date_time"])
    df["month"] = df["date_time"].dt.month
    df["month"] = df["month"].map({1: "jan", 2: "feb", 3: "mar", 4: "apr", 5: "may", 6: "jun", 7: "jul", 8: "aug", 9: "sep", 10: "oct", 11: "nov", 12: "dec"})
    df = pd.get_dummies(df, columns=["month"], dtype=int)
    for col in ["month_jan", "month_feb", "month_mar", "month_apr", "month_may", "month_jun", "month_jul", "month_aug", "month_sep", "month_oct", "month_nov", "month_dec"]:
        if col not in df.columns:
            df[col] = 0

    # day_of_week: day of the week of the search
    df["day_of_week"] = df["date_time"].dt.dayofweek
    df["day_of_week"] = df["day_of_week"].map({0: "mon", 1: "tue", 2: "wed", 3: "thu", 4: "fri", 5: "sat", 6: "sun"})
    df = pd.get_dummies(df, columns=["day_of_week"], dtype=int)
    for col in ["day_of_week_mon", "day_of_week_tue", "day_of_week_wed", "day_of_week_thu", "day_of_week_fri", "day_of_week_sat", "day_of_week_sun"]:
        if col not in df.columns:
            df[col] = 0

    return df

In [5]:
def drop_nan_columns(df):
    """
    Train data shows that "visitor_hist_starrating", "visitor_hist_adr_usd", "srch_query_affinity_score", and "compx_rate_percent_diff" have >90% NaN values. These values cannot be imputed accurately, so we drop these columns.
    """
    cols = ["visitor_hist_starrating", "visitor_hist_adr_usd", "srch_query_affinity_score"] + [f"comp{i}_rate_percent_diff" for i in range(1, 9)]
    df.drop(columns=cols, inplace=True)
    return df

In [6]:
def impute_missing_values(df):
    """
    Impute missing values for the following columns: "prop_starrating", "prop_review_score", "compx_rate", and "compx_inv".

    For "prop_starrating" and "prop_review_score", we replace 0 values with NaN and then impute the NaN values with the mean per srch_id. Remaining NaN values are then filled with 0.
    For "compx_rate" and "compx_inv", we assume that missing data means that Expedia has the same price and equal availability as its competitors. We therefore impute the NaN values with 0.
    """
    # Replace 0 values with NaN
    df["prop_starrating"] = df["prop_starrating"].replace(0, np.nan)
    df["prop_review_score"] = df["prop_review_score"].replace(0, np.nan)
    # Impute NaN values with mean per srch_id
    df["prop_starrating"] = df.groupby("srch_id")["prop_starrating"].transform(lambda x: x.fillna(x.mean()))
    df["prop_review_score"] = df.groupby("srch_id")["prop_review_score"].transform(lambda x: x.fillna(x.mean()))
    # Fill remaining NaN values with 0
    df["prop_starrating"] = df["prop_starrating"].fillna(0)
    df["prop_review_score"] = df["prop_review_score"].fillna(0)

    # Impute NaN values with 0
    for i in range(1, 9):
        df[f"comp{i}_rate"] = df[f"comp{i}_rate"].fillna(0)
        df[f"comp{i}_inv"] = df[f"comp{i}_inv"].fillna(0)

    return df

In [7]:
def compute_aggregated_values(df):
    """
    Compute the mean, median and standard deviation for the following columns:
    "visitor_hist_starrating", "visitor_hist_adr_usd", "prop_starrating", "prop_review_score", "prop_location_score1", "prop_log_historical_price", "price_usd"
    """
    numerical_cols = ["prop_starrating", "prop_review_score", "prop_location_score1", "prop_log_historical_price", "price_usd"]
    # srch_length_of_stay, srch_booking_window, srch_adults_count, srch_children_count, and srch_room_count are also numerical variables, but it has no use aggregating these values over prop_id.

    agg_df = df.groupby("prop_id").agg({col: ["mean", "std", "median"] for col in numerical_cols})
    agg_df.columns = ["_".join(col) for col in agg_df.columns]
    agg_df.fillna(0, inplace=True)  # Fill standard deviation NaN values with 0
    for col in agg_df.columns:
        df[col] = df["prop_id"].map(agg_df[col])
    return df

In [8]:
def compute_relative_values(df):
    """
    Subtract the mean per srch_id from the following columns and make them into new columns:
    "prop_starrating", "prop_review_score", "prop_location_score1", "prop_log_historical_price", "price_usd"

    This is done so that the model can learn the relative values of these columns per srch_id.
    """
    cols = ["prop_starrating", "prop_review_score", "prop_location_score1", "prop_log_historical_price", "price_usd"]
    grouper = df.groupby('srch_id')
    for col in cols:
        df[f"relative_{col}"] = df[col] - grouper[col].transform('mean')
    return df

In [9]:
def drop_columns(df):
    """
    Train data shows that for "orig_destination_distance" over 75% of the data with a calculated value lower than 0.95 of the largest distance was lower than 130, meaning that the distance per srch_id is roughly the same. We assume therefore that this is not a deciding factor for a customer in their booking process and drop this column.

    Features were created from "date_time" and the column will not be used anymore, so we drop this column as well.

    Columns containing IDs ("site_id", "visitor_location_country_id", "prop_country_id", "prop_id", and "srch_destination_id") are not used in the model, so we drop these columns as well. "srch_id" and "prop_id" will remain in the columns for now for later use.

    If the supplied dataframe is the training dataframe, drop the unused target columns as well.

    # TODO:
    I don't really know what to do with "prop_location_score2" yet, so I'll drop it for now.
    """
    df.drop(columns=["orig_destination_distance"], inplace=True)
    df.drop(columns=["date_time"], inplace=True)
    df.drop(columns=["site_id", "visitor_location_country_id", "prop_country_id", "srch_destination_id"], inplace=True)
    df.drop(columns=["prop_location_score2"], inplace=True)
    for col in ["position", "gross_bookings_usd"]:
        if col in df.columns:
            df.drop(columns=[col], inplace=True)
    return df

In [10]:
def process_data(train_df: pd.DataFrame, test_df: pd.DataFrame, target_value: str = "booking_bool", train_frac=0.9):
    """
    Process the dataframes for training and testing the model.
    :param train_df: The training dataframe
    :param test_df: The test dataframe
    :param target_value: The target value to use for training the model. Either "booking_bool" or "both".
    :return: The processed dataframes
    """
    # Create features
    train_df = create_features(train_df)
    test_df = create_features(test_df)

    # Drop columns with NaN values
    train_df = drop_nan_columns(train_df)
    test_df = drop_nan_columns(test_df)

    # Impute missing values
    train_df = impute_missing_values(train_df)
    test_df = impute_missing_values(test_df)

    # Compute aggregated values
    train_df = compute_aggregated_values(train_df)
    test_df = compute_aggregated_values(test_df)

    # Compute relative values
    train_df = compute_relative_values(train_df)
    test_df = compute_relative_values(test_df)

    # Drop columns
    train_df = drop_columns(train_df)
    test_df = drop_columns(test_df)

    # Split train data into features and target
    if target_value == "booking_bool":
        train_df["target"] = train_df["booking_bool"]
    elif target_value == "both":
        train_df["target"] = 5*train_df["booking_bool"] + train_df["click_bool"]
    else:
        raise ValueError("target_value must be either 'booking_bool' or 'both'")

    train_df.drop(columns=["booking_bool", "click_bool"], inplace=True)

    train_df = train_df[:int(train_frac * len(train_df))]
    val_df = train_df[int(train_frac * len(train_df)):]

    return train_df, val_df, test_df

In [11]:
def generate_sequences(df: pd.DataFrame, kind: str = "train") -> dict:
    """
    Generate sequences from the dataframe.
    :param df: The dataframe to take the sequences from.
    :param kind: The kind of dataframe. Either "train", "val" or "test".
    :return: Dictionary containing the sequences and targets for the training data.
    """
    sequences = []
    if kind == "train":
        idx_dict = df.to_dict("index")
        df = df.drop(columns=["prop_id", "srch_id", "target"])
        for i, idx in enumerate(tqdm(idx_dict, desc="Train DataLoader")):
            sequences.append({"srch_id": idx_dict[idx]["srch_id"], "prop_id": idx_dict[idx]["prop_id"], "sequence": df.iloc[i].values, "target": idx_dict[idx]["target"]})
    elif kind == "val":
        idx_dict = df.to_dict("index")
        df = df.drop(columns=["prop_id", "srch_id", "target"])
        for i, idx in enumerate(tqdm(idx_dict, desc="Validation DataLoader")):
            sequences.append({"srch_id": idx_dict[idx]["srch_id"], "prop_id": idx_dict[idx]["prop_id"], "sequence": df.iloc[i].values, "target": idx_dict[idx]["target"]})
    elif kind == "test":
        idx_dict = df.to_dict("index")
        df = df.drop(columns=["prop_id", "srch_id"])
        for i, idx in enumerate(tqdm(idx_dict, desc="Test DataLoader")):
            sequences.append({"srch_id": idx_dict[idx]["srch_id"], "prop_id": idx_dict[idx]["prop_id"], "sequence": df.iloc[i].values})
    else:
        raise ValueError("kind must be either 'train', 'val' or 'test'")
    return sequences

In [12]:
def generate_lgb_sequences(train_df, test_df, force_data_processing=False, train_frac=0.9):
    """
    Generate the train, validation and test dictionaries
    """
    if os.path.exists(os.path.join('data', f'train_sequences_both.pickle')) \
            & os.path.exists(os.path.join('data', f'test_sequences_both.pickle')) \
            & os.path.exists(os.path.join('data', f'val_sequences_both.pickle')) \
            & (not force_data_processing):
        print(f"Loading train and test sequences from pickle files...")
        with open(os.path.join('data', f'train_sequences_both.pickle'), 'rb') as f:
            train_sequences = pickle.load(f)
        with open(os.path.join('data', f'val_sequences_both.pickle'), 'rb') as f:
            val_sequences = pickle.load(f)
        with open(os.path.join('data', f'test_sequences_both.pickle'), 'rb') as f:
            test_sequences = pickle.load(f)
    else:
        processed_train_df, processed_val_df, processed_test_df = process_data(train_df, test_df, target_value="both", train_frac=train_frac)
        train_sequences = generate_sequences(processed_train_df, kind="train")
        with open(os.path.join('data', f'train_sequences_both.pickle'), 'wb') as f:
            pickle.dump(train_sequences, f)
        val_sequences = generate_sequences(processed_val_df, kind="val")
        with open(os.path.join('data', f'val_sequences_both.pickle'), 'wb') as f:
            pickle.dump(val_sequences, f)    
        test_sequences = generate_sequences(processed_test_df, kind="test")
        with open(os.path.join('data', f'test_sequences_both.pickle'), 'wb') as f:
            pickle.dump(test_sequences, f)

    return train_sequences, val_sequences, test_sequences

def generate_lgb_data(sequences, kind="train"):
    x_data = np.array([seq["sequence"] for seq in sequences])
    
    query_to_properties = {}
    query_to_properties_to_target = {}

    for seq in sequences:
        
        srch_id = seq["srch_id"]
        prop_id = seq["prop_id"]
        # Add the search ID to the dictionary if it isn't in already
        if srch_id not in query_to_properties.keys():
            query_to_properties[srch_id] = []
            query_to_properties_to_target[srch_id] = {}
        
        query_to_properties[srch_id].append(prop_id)
        
        if kind =="train" or kind=="val":
            query_to_properties_to_target[srch_id][prop_id] = seq["target"]
        
    # When generating train data, return also targets
    if kind=="train" or kind=="val":
        y_data = np.array([seq["target"] for seq in sequences])

        return x_data, y_data, query_to_properties, query_to_properties_to_target

    # When generating test data, return only features and query information
    elif kind=="test":

        return x_data, query_to_properties

    else:
        raise ValueError("Invalid dataset creation")

**Train the LambdaMART model**

*Load training data*

In [13]:
import lightgbm as lgb
import time

train_frac = 0.9

# Assume X_train and y_train are the training features and target values, respectively
print("Generating sequences...")
t = time.time()
lgb_train_sequences, lgb_val_sequences, lgb_test_sequences = generate_lgb_sequences(train_df, test_df, force_data_processing=False, train_frac=train_frac)
print(f"Done generating sequences. This took {time.time() - t:.2f}s")

t = time.time()
print("Generating train features...")
train_x, train_y, train_queries, train_props2targets = generate_lgb_data(lgb_train_sequences, kind="train")
print(f"Done generating train features. This took {time.time() - t:.2f}s")

t = time.time()
print("Generating validation features...")
val_x, val_y, val_queries, val_props2targets = generate_lgb_data(lgb_val_sequences, kind="val")
print(f"Done generating validation features. This took {time.time() - t:.2f}s")

t = time.time()
print("Generating test features...")
test_x, test_queries = generate_lgb_data(lgb_test_sequences, kind="test")
print(f"Done generating test features. This took {time.time() - t:.2f}s")

t = time.time()
print("Transforming train features into appropriate dataset...")
# Create a LightGBM dataset from the training data
train_data = lgb.Dataset(train_x, label=train_y, group=[len(elem) for elem in train_queries.values()])
print(f"Done generating dataset. This took {time.time() - t:.2f}s")

Generating sequences...


Train DataLoader: 100%|██████████| 4462512/4462512 [09:10<00:00, 8108.94it/s]
Validation DataLoader: 100%|██████████| 446252/446252 [00:50<00:00, 8915.22it/s]
Test DataLoader: 100%|██████████| 4959183/4959183 [10:29<00:00, 7881.31it/s]


Done generating sequences. This took 1866.71s
Generating train features...
Done generating train features. This took 71.91s
Generating validation features...
Done generating validation features. This took 3.04s
Generating test features...
Done generating test features. This took 60.69s
Transforming train features into appropriate dataset...
Done generating dataset. This took 0.35s


In [14]:
def test_and_write(fn, predictions, queries):
    """
    Function to write a model's predictions to disk, where queries is a dictionary mapping
    a query ID to a list of property IDs, and predictions is an (ordered!) list containing
    the output of the model. 
    """
    srch_to_ranks = {}
    
    i=0
    for query_id, property_ids in tqdm(queries.items()):
        
        # Retrieve all probabilities for a certain query ID (assuming the data is still ordered)
        ranks = predictions[i:i+len(property_ids)]

        # Keep track of the probabilities
        srch_to_ranks[query_id] = ranks
        i+=len(property_ids)
    
    # Write predictions to file
    with open(fn, "w") as f:
        f.write("srch_id,prop_id\n")

        # Iterate over all different queries
        for srch_id, srch_probs in srch_to_ranks.items():
            
            # Get the order of the hotels based on the predicted probabilities
            sorted_indices = np.array(srch_probs).argsort()[::-1]
            property_ids_sorted = np.array(queries[srch_id])[np.array(sorted_indices)]

            for s_id, p_id in zip([srch_id for _ in range(len(property_ids_sorted))],property_ids_sorted):
                f.write(f"{s_id},{p_id}\n")

In [32]:
# Calculate NDCG@k score from file

def NDCG_from_file(fn, prop2targets, k):
    """
    Function to calculate the NDCG@k score from a txt file in the format specified in the assignment (two 
    columns, the left one containing the search IDs and the right one containing the PropertyIDs, sorted by
    relevance)
    """
    query_to_properties = {}

    # Fill the query-to-properties dictionary
    with open(fn) as f:
        lines = f.readlines()[1:] # Remove the header
        for line in lines:
            query_id, prop_id = line.split(",")
            query_id, prop_id = int(query_id), int(prop_id)
            if query_id not in query_to_properties.keys():
                query_to_properties[query_id] = [prop_id]
            else:
                query_to_properties[query_id].append(prop_id)

    NDCG_score = 0
    for query_id, prop_ids in query_to_properties.items():
        # Get all scores in order how they are scored by the model
        all_scores = np.array([prop2targets[query_id][prop_id] for prop_id in prop_ids])

        # Take only the first k elements
        true_scores = all_scores[:k]

        # Calculate DCG@k (Discounted Cumulative Gain at k)
        dcg = np.sum(true_scores / np.log2(np.arange(2, k+2)))
        
        # Sort the true scores in descending order
        true_sorted_scores = np.sort(all_scores)[:k][::-1]
        
        # Calculate ideal DCG@k
        ideal_dcg = np.sum(true_sorted_scores[:k] / np.log2(np.arange(2, k+2)))
        
        # Calculate NDCG@k
        NDCG_score += (dcg / ideal_dcg) if ideal_dcg > 0 else 0.0

    return NDCG_score / len(query_to_properties.items())

In [16]:
# Set the parameters for the LambdaMART model
params = {
    'objective': 'lambdarank',
    'metric': 'ndcg',
    'ndcg_eval_at': [5],  # Evaluation at NDCG@5
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0,
    'device': 'gpu' # Change to GPU if available
}

# TODO: check what hyperparameters winners used and optimize
gbm_params = {
    'objective': 'lambdarank',
    'metric': 'ndcg',
    'ndcg_eval_at': [5],
    'boosting_type': 'gbdt',
    'num_leaves': 32,
    'learning_rate': 0.05,
    'feature_fraction': 0.7,
    'bagging_fraction': 0.7,
    'bagging_freq': 5,
    'verbose': -1,
    'device': 'gpu'
}

num_rounds = 100

In [17]:
# # Train the LambdaMART model

# t = time.time()
# print("Training LambdaMART model...")
# vanilla_mart_model = lgb.train(params, train_data, num_rounds)
# print(f"Done training LambdaMART model. This took {time.time() - t:.2f}s")

# # Save the trained model
# vanilla_mart_model.save_model('lambda_mart_model.txt')

In [18]:
# print("Training GBM model...")
# GBM_model = lgb.train(gbm_params, train_data, num_rounds)
# print(f"Done training GBM model. This took {time.time() - t:.2f}s")

# # Save the trained model
# GBM_model.save_model('gbm_model.txt')

**Training of the ensemble model**

This one should work the best but I kept the code above just in case

In [19]:
if not os.path.isdir("test_output/GBM"):
    os.makedirs("test_output/GBM")

# Define the number of models in the ensemble
num_models = 5

# Train the ensemble of GBM models
models = []
print(f"Training ensemble of GBM models")
for i in range(num_models):
    # Set a random seed for reproducibility
    np.random.seed(42 + i)
    
    # Create a random subsample of the training data
    subsample_indices = np.random.choice(len(train_x), size=len(train_x), replace=True)
    subsample_data = train_data.subset(subsample_indices)
    
    # Train an individual GBM model
    print(f"Training model {i}")
    model = lgb.train(gbm_params, subsample_data, num_boost_round=100)
    
    # Add the trained model to the ensemble
    models.append(model)

# Make predictions using the ensemble of models
ensemble_predictions_val = np.zeros(len(val_x))
ensemble_predictions_test = np.zeros(len(test_x))

for i, model in enumerate(models):
    predictions_val = model.predict(val_x)
    predictions_test = model.predict(test_x)

    test_and_write(f"test_output/GBM/GBM_{i}_val.csv", predictions_val, val_queries)
    test_and_write(f"test_output/GBM/GBM_{i}_test.csv", predictions_test, test_queries)

    ensemble_predictions_val += predictions_val
    ensemble_predictions_test += predictions_test

# Average the predictions from the ensemble
ensemble_predictions_val /= num_models
ensemble_predictions_test /= num_models

test_and_write(f"test_output/GBM/GBM_ensemble_val.csv", predictions_val, val_queries)
test_and_write(f"test_output/GBM/GBM_ensemble_test.csv", predictions_test, test_queries)

Training ensemble of GBM models
Training model 0




Training model 1
Training model 2
Training model 3
Training model 4


100%|██████████| 18026/18026 [00:00<00:00, 21606.40it/s]
100%|██████████| 199549/199549 [00:06<00:00, 31261.20it/s]
100%|██████████| 18026/18026 [00:00<00:00, 759132.13it/s]
100%|██████████| 199549/199549 [00:00<00:00, 1450947.95it/s]
100%|██████████| 18026/18026 [00:00<00:00, 941515.56it/s]
100%|██████████| 199549/199549 [00:00<00:00, 1522589.86it/s]
100%|██████████| 18026/18026 [00:00<00:00, 1282509.90it/s]
100%|██████████| 199549/199549 [00:00<00:00, 1551681.36it/s]
100%|██████████| 18026/18026 [00:00<00:00, 1324919.37it/s]
100%|██████████| 199549/199549 [00:00<00:00, 1551713.01it/s]
100%|██████████| 18026/18026 [00:00<00:00, 1313616.72it/s]
100%|██████████| 199549/199549 [00:00<00:00, 1516023.29it/s]


In [33]:
# Calculate NDCG for validation set on all individual models and ensemble model

for i, model in enumerate(models):
    fn = f"test_output/GBM/GBM_{i}_val.csv"
    NDGC_val_score = NDCG_from_file(fn, val_props2targets, 5)
    print(f"GBM model number {i} has a validation NDCG@5 score of {NDGC_val_score}")


fn = f"test_output/GBM/GBM_ensemble_val.csv"
NDGC_val_score = NDCG_from_file(fn, val_props2targets, 5)
print(f"The GBM ensemble model has a validation NDCG@5 score of {NDGC_val_score}")

GBM model number 0 has a validation NDCG@5 score of 0.01906336173976899
GBM model number 1 has a validation NDCG@5 score of 0.019303880565172276
GBM model number 2 has a validation NDCG@5 score of 0.019087860280537328
GBM model number 3 has a validation NDCG@5 score of 0.0191777761769073
GBM model number 4 has a validation NDCG@5 score of 0.019283069929021956
The GBM ensemble model has a validation NDCG@5 score of 0.019283069929021956


**k-NN implementation as a stupid baseline, just because we needed to add something that's in the slides >:(**

In [21]:
from sklearn.neighbors import KNeighborsClassifier

if not os.path.isdir("test_output/kNN"):
    os.makedirs("test_output/kNN")

# Create a kNN classifier object
k = 5  # Number of neighbors to consider
knn = KNeighborsClassifier(n_neighbors=k)

# Train the kNN classifier
t = time.time()
print("Training the k-NN model...")
knn.fit(train_x, train_y)
print(f"Finished training the k-NN model in {time.time() - t}s")

# Predict the labels for the validation set
knn_pred_val = knn.predict(val_x)
knn_pred_test = knn.predict(test_x)

test_and_write(f"test_output/kNN/kNN_val.csv", knn_pred_val, val_queries)
test_and_write(f"test_output/kNN/kNN_test.csv", knn_pred_test, test_queries)

Training the k-NN model...
Finished training the k-NN model in 10.506479740142822s


KeyboardInterrupt: 