In [1]:
# Important! In order for this file to run without any import errors and correct file paths:
# the weekly test set .csv (`test_set_week_x.csv`) should be in a folder named 'weekly_test_sets' under 'IML.HUJI'.

In [10]:
# Imports and general configurations

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn import model_selection
from sklearn.metrics import confusion_matrix, accuracy_score, roc_curve, auc, f1_score
from sklearn.ensemble import BaggingRegressor, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.feature_selection import SelectKBest, chi2

import re
import numpy as np
import pandas as pd
import os

STUDENTS = ['313234940', '207906306', '204841423']

week = 7
WEEKLY_TEST_SET = os.path.join('weekly_test_sets', f'week_{week}_test_set.csv')

In [11]:
# Data columns (by type)

ID_COLS = ['h_booking_id', 'hotel_id', 'hotel_country_code', 'h_customer_id']

DATETIME_COLS = ['booking_datetime', 'checkin_date', 'checkout_date', 'hotel_live_date']

CODES_COLS = ['origin_country_code', 'hotel_area_code', 'hotel_city_code']

# (dropped) 'hotel_brand_code', 'hotel_chain_code' (have ~43K nulls!)

CATEGORICAL_COLS = ['accommadation_type_name', 'charge_option', 'customer_nationality',
                    'guest_nationality_country_name', 'language', 'original_payment_method',
                    'original_payment_type', 'original_payment_currency']

NUMERICAL_COLS = ['hotel_star_rating', 'no_of_adults',  'no_of_children', 'no_of_extra_bed', 'no_of_room',
                  'original_selling_amount']

SHOULD_BE_BOOLEAN_COLS = ['guest_is_not_the_customer', 'request_nonesmoke', 'request_latecheckin',
                          'request_highfloor', 'request_largebed', 'request_twinbeds', 'request_airport',
                          'request_earlycheckin']

# The following columns have 25040 nulls, we will treat null here as 0 (False):
# request_nonesmoke, request_latecheckin, request_highfloor, request_largebed, request_twinbeds,
# request_airport, request_earlycheckin
# remove them - or fill in nan with 0

BOOLEAN_COLS = ['is_user_logged_in', 'is_first_booking']

LABEL_COL = 'cancellation_datetime'

NO_SHOW_PATTERN = '_(\d+)(N|P)'
POLICY_PATTERN = '(\d+)D(\d+)(N|P)'

In [12]:

# Data parsing functions

def parse_all_cancellation_policy(data):
    counter = 0
    for i, row in data.iterrows():       
        if row["cancellation_policy_code"] == "UNKNOWN":
            row["cancellation_policy_code"] = '0D100P_100P'
            counter += 1

        policy = row["cancellation_policy_code"]
        n_nights = row["n_nights"]

        no_show = re.findall(NO_SHOW_PATTERN, policy)
        cancel_policy = re.findall(POLICY_PATTERN, policy)
        worse_policy = cancel_policy[-1]
        basic_policy = cancel_policy[-2] if len(cancel_policy) > 1 else worse_policy
        if no_show:
            no_show_as_int = int(no_show[0][0])
            if no_show[0][1] == "N":
                no_show_days = no_show_as_int
                no_show_percent = no_show_as_int / n_nights * 100
            else:
                no_show_days = no_show_as_int * n_nights / 100
                no_show_percent = no_show_as_int

        else:
            worse_policy_without_no_show = re.findall(POLICY_PATTERN, policy)[-1]

            if worse_policy_without_no_show[-1] == "N":
                no_show_days = int(worse_policy_without_no_show[1])
                no_show_percent = int(worse_policy_without_no_show[1]) / n_nights * 100
            else:
                no_show_days = int(worse_policy_without_no_show[1]) * n_nights / 100
                no_show_percent = int(worse_policy_without_no_show[1])

        
        worse_policy_as_int = int(worse_policy[1])
        basic_policy_as_int = int(basic_policy[1])
        if worse_policy[2] == "N":
            #nights = worse_policy_as_int
            percent = worse_policy_as_int / n_nights * 100
        else:
            #nights = worse_policy_as_int * n_nights / 100
            percent = worse_policy_as_int
        if basic_policy[2] == "N":
            #basic_by_nights = worse_policy_as_int
            basic_percent = worse_policy_as_int / n_nights * 100
        else:
            #basic_by_nights = int(basic_policy_as_int) * n_nights / 100
            basic_percent = int(basic_policy_as_int)
        

        data.loc[i, "no_show_days"] = no_show_days
        data.loc[i, "no_show_percentage"] = no_show_percent

        
        days = int(worse_policy[0])
        basic_days = int(basic_policy[0])
        data.loc[i, "basic_charge_percentage"] = basic_percent
        #data.loc[i, "basic_charge_by_nights"] = basic_by_nights
        data.loc[i, "basic_charge_days"] = basic_days
        #data.loc[i, "basic_charge_days_times_nights"] = basic_days * basic_by_nights
        #data.loc[i, "basic_charge_days_times_percentage"] = basic_days * basic_percent
        data.loc[i, "charge_percentage"] = percent
        #data.loc[i, "charge_by_nights"] = nights
        data.loc[i, "charge_days"] = days
        #data.loc[i, "charge_days_times_nights"] = days * nights
        #data.loc[i, "charge_days_times_percentage"] = days * percent

    return data

def parse_dates(data):
    data = data.copy()
    for col_name in DATETIME_COLS:
        col_date_obj = data[col_name]
        data[f"{col_name}_year"] = col_date_obj.dt.year
        data[f"{col_name}_month"] = col_date_obj.dt.month
        data[f"{col_name}_day_in_week"] = col_date_obj.dt.day_of_week
        data = pd.get_dummies(data, columns=[f"{col_name}_year", f"{col_name}_month", f"{col_name}_day_in_week"], drop_first=True)
        
    data["n_nights"] = (data["checkout_date"] - data["checkin_date"]).dt.days
    data["n_days_from_booking_to_checkin"] = (data["checkin_date"] - data["booking_datetime"]).dt.days

    data = data.drop(DATETIME_COLS, axis=1)

    return data


In [19]:
def parse_data(full_data, test=False, cols=None):
    # choose some of the original columns
    if test:
        data = full_data[NUMERICAL_COLS + DATETIME_COLS + SHOULD_BE_BOOLEAN_COLS + BOOLEAN_COLS +
                         CATEGORICAL_COLS + ["cancellation_policy_code"]]
    else:
        data = full_data[NUMERICAL_COLS + DATETIME_COLS + SHOULD_BE_BOOLEAN_COLS + BOOLEAN_COLS +
                         CATEGORICAL_COLS + ["cancellation_policy_code"] + [LABEL_COL]]

    data = data.copy()
        
    # edit labels
    if not test:
        booking_date_diff = (pd.to_datetime(data[LABEL_COL]) - data["booking_datetime"]).dt.days.between(7, 44) 
        checkin_diff = (pd.to_datetime(data[LABEL_COL]) - data["checkin_date"]).dt.days.between(2, 9) 
        
        data[LABEL_COL] = (booking_date_diff) | (checkin_diff)
        data = data.copy()

    # fill in 0 instead of None's
    for col_name in SHOULD_BE_BOOLEAN_COLS:
        data.loc[:, col_name] = data.loc[:, col_name].fillna(0)

    # handle datetime cols
    data = parse_dates(data)
    
    data = parse_all_cancellation_policy(data)
    data = data.drop(['cancellation_policy_code'], axis=1)
    
    
    for col_name in CATEGORICAL_COLS:
        options = data[col_name].unique()
        for option in options:
            if len(data[data[col_name] == option]) / len(data) < 0.0005:
                data.loc[data[col_name] == option, col_name] = "OTHER"
    
    # replace categorical features with their dummies
    data = pd.get_dummies(data, columns=CATEGORICAL_COLS, drop_first=True)
    
    data = data.copy()
    if cols is not None:
        missing_cols = set(cols) - set(data.columns)
        for col in missing_cols:
            data.loc[:, col] = 0
        data = data.loc[:, cols] 

    data = data.copy()
    
    if not test:
        data = data.dropna()

    return data



In [20]:
def load_data(filename: str, test=False, cols=None):
    """
    Load Agoda booking cancellation dataset
    Parameters
    ----------
    filename: str
        Path to house prices dataset

    Returns
    -------
    Design matrix and response vector in either of the following formats:
    1) Single dataframe with last column representing the response
    2) Tuple of pandas.DataFrame and Series
    3) Tuple of ndarray of shape (n_samples, n_features) and ndarray of shape (n_samples,)
    """
    original_train_data = pd.read_csv(filename, parse_dates=DATETIME_COLS)  

    if test:
        return parse_data(original_train_data, test, cols)
    original_train_data = original_train_data.drop_duplicates()
    parsed_data = parse_data(original_train_data)
    features, labels = parsed_data.loc[:, parsed_data.columns != LABEL_COL], parsed_data[LABEL_COL]
    return features, labels


In [21]:
def choose_cutoff(estimator, X, y_true):
    predictions_proba = estimator.predict_proba(X)[:,1]
    f1_scores = []
    for cutoff in range(0, 100, 5):
        cutoff /= 100
        predictions = pd.DataFrame([1 if i > cutoff else 0 for i in predictions_proba])
        f1_scores.append({"cutoff": cutoff, "score": f1_score(y_true, predictions, average='macro')})
    return sorted(f1_scores, key=lambda d: d['score'])[-1]["cutoff"]
    

def evaluate(extimator, X, y_true):
#     chosen_cutoff = choose_cutoff(estimator, X, y_true)
#     print("chosen_cutoff", chosen_cutoff)
#     predictions = pd.DataFrame([1 if i > chosen_cutoff else 0 for i in  estimator.predict_proba(X)[:,1]])
    
    predictions = estimator.predict(X)
    print("F1:", f1_score(y_true, predictions, average='macro'))
#     return chosen_cutoff

In [22]:
# Handle exporting data

def evaluate_and_export(estimator, cutoff, X: np.ndarray, filename: str):
    """
    Export to specified file the prediction results of given estimator on given testset.

    File saved is in csv format with a single column named 'predicted_values' and n_samples rows containing
    predicted values.

    Parameters
    ----------
    estimator: BaseEstimator or any object implementing predict() method as in BaseEstimator (for example sklearn)
        Fitted estimator to use for prediction

    X: ndarray of shape (n_samples, n_features)
        Test design matrix to predict its responses

    filename:
        path to store file at

    """
#     predictions = pd.DataFrame([1 if i > cutoff else 0 for i in estimator.predict_proba(X)[:,1]]).to_numpy()
    predictions = estimator.predict(X).astype(int)
    pd.DataFrame(predictions, columns=["predicted_values"]).to_csv(filename, index=False)



In [23]:
def previous_test_sets(features_columns):
    all_features = pd.DataFrame({}, columns=features_columns)
    all_labels = pd.DataFrame({}, columns=['h_booking_id', 'cancel'])
    
    for week in [1, 2, 3, 4, 5, 6]:
        testset_filename = os.path.join('weekly_test_sets', f'week_{week}_test_data.csv')
        labels_filename = os.path.join('labels', f'week_{week}_labels.csv')
        
        features_week = load_data(testset_filename, test=True, cols=features_columns)
        labels_week = pd.read_csv(labels_filename)
        
        all_features = pd.concat([all_features, features_week])
        all_labels = pd.concat([all_labels, labels_week])
        
    return all_features, all_labels.rename({"cancel": LABEL_COL}, axis=1)[LABEL_COL].astype(bool)

In [24]:
if __name__ == '__main__':
    np.random.seed(0)

    print("Loading and preprocessing training data...")
    features, labels = load_data("datasets/agoda_cancellation_train.csv")

    prev_weeks_features, prev_weeks_labels = previous_test_sets(features.columns)

#     all_features, all_labels = features, labels
    all_features = pd.concat([features, prev_weeks_features])
    all_labels = pd.concat([labels, prev_weeks_labels])
    
#     best_features = SelectKBest(k=100)
        
    train_X, test_X, train_y, test_y = model_selection.train_test_split(all_features, all_labels, random_state=0)

    # Fit model over data
    print("Fitting the model over the preprocesed training data...")
#     estimator = RandomForestClassifier().fit(all_features, all_labels)
    estimator = RandomForestClassifier().fit(train_X, train_y)
#     estimator = CatBoostClassifier(iterations=200).fit(train_X, train_y, verbose=False)
    
    print(f"Loading and preprocessing the weekly test data ({WEEKLY_TEST_SET})...")
    parsed_data = load_data(WEEKLY_TEST_SET, test=True, cols=features.columns)
    parsed_data = parsed_data.copy()
    parsed_data = parsed_data.reindex(columns=parsed_data.columns, fill_value=0)
    
    print("Evaluating...")
#     chosen_cutoff = evaluate(estimator, test_X, test_y)
    evaluate(estimator, test_X, test_y.astype(int))
    chosen_cutoff = None
     
    print("Predicting and exporting over the weekly test data...")
    evaluate_and_export(estimator, chosen_cutoff, parsed_data, "{}_{}_{}.csv".format(*STUDENTS))
    
#     week = 5  # remember to remove it from the prev
#     y_pred = pd.read_csv("313234940_207906306_204841423.csv")["predicted_values"]    
#     y_true = pd.read_csv(os.path.join('labels', f'week_{week}_labels.csv')).rename({"cancel": LABEL_COL}, axis=1)[LABEL_COL].astype(int)
#     print(f"f1_score of week {week}:", f1_score(y_true, y_pred, average='macro'))

    print("Done :)")


Loading and preprocessing training data...
Fitting the model over the preprocesed training data...
Loading and preprocessing the weekly test data (weekly_test_sets\week_7_test_set.csv)...
Evaluating...
F1: 0.5426875485570273
Predicting and exporting over the weekly test data...
Done :)


In [11]:
# Next things to try:
# - Maybe try regular regression to predict the actual date of cancellation.
# - Maybe try Deep Learning approach
# - Try different approach to the cancel policy feature.
# - Try different classifiers
# - Try to understand how to make the model more sensitive (predict more Positives)
# new:
# - try to fit over the original set with the new weeks
# - check if there's a cutoff in this model + other parameters we can config
# - we can also try to use the BaggingReggresor wuth different base estimators (nead to convert the prediction to binary)