In [57]:
# Important! In order for this file to run without any import errors, it needs to be directly within the "IML.HUJI" directory

In [33]:
# Imports and general configurations

from sklearn.linear_model import LogisticRegression
from sklearn import model_selection
from sklearn.metrics import confusion_matrix, accuracy_score, roc_curve, auc

import re
import numpy as np
import pandas as pd

# Identifiers of the submitting students (presumably student IDs); they are
# joined with underscores to build the name of the exported predictions file.
STUDENTS = [
    '313234940',
    '207906306',
    '204841423',
]

# Weekly test set to predict on. The CSV should be in the same folder as this notebook.
# Previously used: './test_set_week_1.csv', './test_set_week_2.csv'
WEEKLY_TEST_SET = './test_set_week_3.csv'


In [34]:
# Data columns (by type)

# Identifier columns (listed for reference; not selected by parse_data).
ID_COLS = ['h_booking_id', 'hotel_id', 'hotel_country_code', 'h_customer_id']

# Columns parsed as datetimes when the CSV is read.
DATETIME_COLS = ['booking_datetime', 'checkin_date', 'checkout_date', 'hotel_live_date']

# Code-valued columns (listed for reference; not selected by parse_data).
CODES_COLS = ['origin_country_code', 'hotel_area_code', 'hotel_city_code']

# (dropped) 'hotel_brand_code', 'hotel_chain_code' (have ~43K nulls!)

# Columns that are one-hot encoded.
CATEGORICAL_COLS = ['accommadation_type_name', 'charge_option', 'customer_nationality',
                    'guest_nationality_country_name', 'language', 'original_payment_method',
                    'original_payment_type', 'original_payment_currency']

NUMERICAL_COLS = ['hotel_star_rating', 'no_of_adults',  'no_of_children', 'no_of_extra_bed', 'no_of_room',
                  'original_selling_amount']

# 0/1 flags that may contain nulls; nulls are filled with 0 during parsing.
SHOULD_BE_BOOLEAN_COLS = ['guest_is_not_the_customer', 'request_nonesmoke', 'request_latecheckin',
                          'request_highfloor', 'request_largebed', 'request_twinbeds', 'request_airport',
                          'request_earlycheckin']

# The following columns have 25040 nulls, we will treat null here as 0 (False):
# request_nonesmoke, request_latecheckin, request_highfloor, request_largebed, request_twinbeds,
# request_airport, request_earlycheckin
# remove them - or fill in nan with 0

BOOLEAN_COLS = ['is_user_logged_in', 'is_first_booking']

LABEL_COL = 'cancellation_datetime'

# Regular expressions for the cancellation policy code. Raw strings are used so that
# "\d" reaches the regex engine as written instead of being an invalid string escape.
# No-show segment: an underscore followed by "<amount><N|P>".
NO_SHOW_PATTERN = r'_(\d+)(N|P)'
# Cancellation segment: "<days>D<amount><N|P>".
POLICY_PATTERN = r'(\d+)D(\d+)(N|P)'

In [35]:

# Data parsing functions

def _charge_as_percent(amount, unit, n_nights):
    """Return a charge as a percentage of the stay; unit "N" means `amount` is in nights."""
    if unit == "N":
        return amount / n_nights * 100
    return amount


def _charge_as_nights(amount, unit, n_nights):
    """Return a charge as a number of nights; any unit other than "N" means `amount` is a percentage."""
    if unit == "N":
        return amount
    return amount * n_nights / 100


def parse_all_cancellation_policy(data):
    """
    Turn the textual cancellation policy code of every row into numeric columns.

    Parameters
    ----------
    data: pandas.DataFrame
        Must contain the columns "cancellation_policy_code" and "n_nights".
        The frame is modified in place.

    Returns
    -------
    The same DataFrame, with these columns written for every row:
    "no_show_days", "no_show_percentage", "basic_charge_percentage",
    "basic_charge_days", "charge_percentage", "charge_days".

    Raises
    ------
    IndexError
        If a policy code contains no "<days>D<amount><N|P>" segment.
    """
    for i, row in data.iterrows():
        policy = row["cancellation_policy_code"]
        if policy == "UNKNOWN":
            # Unknown policies are treated as a full charge from day 0 with a full no-show charge.
            policy = '0D100P_100P'
        n_nights = row["n_nights"]

        no_show = re.findall(NO_SHOW_PATTERN, policy)
        cancel_policy = re.findall(POLICY_PATTERN, policy)
        # "worse" is the last cancellation segment, "basic" the one before it
        # (or the same segment when the code has only one).
        worse_policy = cancel_policy[-1]
        basic_policy = cancel_policy[-2] if len(cancel_policy) > 1 else worse_policy

        if no_show:
            no_show_amount, no_show_unit = int(no_show[0][0]), no_show[0][1]
        else:
            # Without an explicit no-show segment, fall back to the last cancellation segment.
            no_show_amount, no_show_unit = int(worse_policy[1]), worse_policy[2]

        data.loc[i, "no_show_days"] = _charge_as_nights(no_show_amount, no_show_unit, n_nights)
        data.loc[i, "no_show_percentage"] = _charge_as_percent(no_show_amount, no_show_unit, n_nights)

        # The basic charge is derived from the basic segment's own amount; it was
        # previously computed from the worse segment's amount when given in nights.
        data.loc[i, "basic_charge_percentage"] = _charge_as_percent(int(basic_policy[1]), basic_policy[2], n_nights)
        data.loc[i, "basic_charge_days"] = int(basic_policy[0])
        data.loc[i, "charge_percentage"] = _charge_as_percent(int(worse_policy[1]), worse_policy[2], n_nights)
        data.loc[i, "charge_days"] = int(worse_policy[0])

    return data

def parse_dates(data):
    """
    Replace the datetime columns with one-hot date parts and two duration features.

    For every column in DATETIME_COLS the year, month and day of week are
    one-hot encoded (first level dropped). Two integer features are then added:
    "n_nights" (checkout minus check-in, in days) and
    "n_days_from_booking_to_checkin". The original datetime columns are removed.

    The input frame is not modified; a new DataFrame is returned.
    """
    frame = data.copy()

    for name in DATETIME_COLS:
        stamps = frame[name].dt
        parts = {
            f"{name}_year": stamps.year,
            f"{name}_month": stamps.month,
            f"{name}_day_in_week": stamps.day_of_week,
        }
        # NOTE(review): drop_first removes whichever level sorts first in *this*
        # frame, so a different dataset may lose a different level - confirm
        # this is acceptable for the weekly test sets.
        frame = pd.get_dummies(frame.assign(**parts), columns=list(parts), drop_first=True)

    stay_length = frame["checkout_date"] - frame["checkin_date"]
    frame["n_nights"] = stay_length.dt.days
    lead_time = frame["checkin_date"] - frame["booking_datetime"]
    frame["n_days_from_booking_to_checkin"] = lead_time.dt.days

    return frame.drop(DATETIME_COLS, axis=1)


In [36]:
def parse_data(full_data, test=False, cols=None):
    """
    Select, clean and encode the raw booking columns into a numeric design matrix.

    Parameters
    ----------
    full_data: pandas.DataFrame
        Raw bookings with the datetime columns already parsed as datetimes.

    test: bool
        When True the label column is not read and rows containing NaN are kept.
        When False the label column is converted to a boolean target and rows
        containing NaN are dropped.

    cols: sequence of column names, optional
        Columns (and order) the result must have, e.g. the training feature
        columns. Columns missing from the encoded frame are added and filled
        with 0; columns not listed are discarded.

    Returns
    -------
    pandas.DataFrame with the encoded features (plus the label column when it
    was read and survives the `cols` alignment).
    """
    # choose some of the original columns
    selected = (NUMERICAL_COLS + DATETIME_COLS + SHOULD_BE_BOOLEAN_COLS + BOOLEAN_COLS +
                CATEGORICAL_COLS + ["cancellation_policy_code"])
    if not test:
        selected = selected + [LABEL_COL]
    data = full_data[selected].copy()

    # edit labels
    if not test:
        # The target is True when the cancellation falls 7-44 days after booking
        # or 2-9 days after check-in (both bounds inclusive); missing
        # cancellation dates yield False.
        # NOTE(review): the windows are taken as-is from the original code - confirm their origin.
        cancelled_at = pd.to_datetime(data[LABEL_COL])
        days_after_booking = (cancelled_at - data["booking_datetime"]).dt.days
        days_after_checkin = (cancelled_at - data["checkin_date"]).dt.days
        data[LABEL_COL] = days_after_booking.between(7, 44) | days_after_checkin.between(2, 9)

    # fill in 0 instead of None's
    data = data.fillna(dict.fromkeys(SHOULD_BE_BOOLEAN_COLS, 0))

    # handle datetime cols
    data = parse_dates(data)

    # The raw policy code is not used as a feature.
    data = data.drop(['cancellation_policy_code'], axis=1)

    # Replace categorical features with their dummies. The first level is dropped
    # only when no target column set is given: when aligning to `cols`, the first
    # level of *this* frame may be a real column of the reference frame, and
    # dropping it would zero that column for every row.
    # NOTE(review): parse_dates still encodes date parts with drop_first=True.
    data = pd.get_dummies(data, columns=CATEGORICAL_COLS, drop_first=cols is None)

    if cols is not None:
        # One reindex adds the missing columns as 0, discards extras and fixes the order.
        data = data.reindex(columns=cols, fill_value=0)

    if not test:
        data = data.dropna()

    return data



In [37]:
def load_data(filename: str, test=False, cols=None):
    """
    Read an Agoda booking CSV and preprocess it with parse_data.

    Parameters
    ----------
    filename: str
        Path to the bookings CSV file

    test: bool
        When True the file is treated as an unlabeled test set

    cols:
        Forwarded to parse_data in test mode only; ignored otherwise

    Returns
    -------
    test=True: a single pandas.DataFrame with the parsed features.
    test=False: tuple (features, labels) of pandas.DataFrame and Series, computed
    after exact duplicate rows are removed from the raw file.
    """
    raw = pd.read_csv(filename, parse_dates=DATETIME_COLS)

    if not test:
        parsed = parse_data(raw.drop_duplicates())
        return parsed.drop(columns=[LABEL_COL]), parsed[LABEL_COL]

    return parse_data(raw, test, cols)


In [38]:
def evaluate(extimator, X, y_true):
    """
    Print the confusion matrix, accuracy and TN/FP/FN/TP counts of an estimator.

    Parameters
    ----------
    extimator: object implementing predict()
        Fitted estimator to evaluate. The misspelled parameter name is kept so
        that the signature stays unchanged.

    X:
        Design matrix to predict on

    y_true:
        True labels for X. The confusion matrix is unpacked into four counts,
        so exactly two classes are expected.

    Returns
    -------
    None; results are printed.
    """
    # Use the estimator that was passed in; the body used to read a global
    # named `estimator` and ignore this argument.
    predictions = extimator.predict(X)
    conf_matrix = confusion_matrix(y_true, predictions)
    print(conf_matrix)
    tn, fp, fn, tp = conf_matrix.ravel()

    print("Accuracy", accuracy_score(y_true, predictions))
    print(f"True negative: {tn}, False positive: {fp}, False Negative: {fn}, True Positive: {tp}")

In [41]:
# Handle exporting data

def evaluate_and_export(estimator, X: np.ndarray, filename: str):
    """
    Predict with the given estimator on X and write the predictions to a csv file.

    The file has a single column named 'predicted_values' and one row per
    sample; predictions are cast to int before being written.

    Parameters
    ----------
    estimator: BaseEstimator or any object implementing predict() method as in BaseEstimator (for example sklearn)
        Fitted estimator to use for prediction

    X: ndarray of shape (n_samples, n_features)
        Test design matrix to predict its responses

    filename:
        path to store file at

    """
    predicted = estimator.predict(X)
    # Cast through pandas so that booleans and floats become plain integers.
    as_integers = pd.DataFrame(predicted).astype(int)
    output = pd.DataFrame(as_integers.to_numpy(), columns=["predicted_values"])
    output.to_csv(filename, index=False)



In [42]:
if __name__ == '__main__':
    np.random.seed(0)

    # Load and preprocess the labeled training data
    print("Loading and preprocessing training data...")
    features, cancellation_labels = load_data("datasets/agoda_cancellation_train.csv")

    # Hold out part of the labeled data for a local evaluation
    train_X, test_X, train_y, test_y = model_selection.train_test_split(features, cancellation_labels, random_state=0)
    print(train_X.shape, train_y.shape)

    # Fit model over data
    print("Fitting the model over the preprocesed training data...")
    estimator = LogisticRegression(solver='liblinear', max_iter=500)
    estimator.fit(train_X, train_y)

    # Parse the weekly test set; passing the training columns makes load_data
    # return a frame with exactly those columns, so no further alignment is needed.
    print(f"Loading and preprocessing the weekly test data ({WEEKLY_TEST_SET})...")
    parsed_data = load_data(WEEKLY_TEST_SET, test=True, cols=train_X.columns)
    print(parsed_data.shape)

    # Report performance on the held-out split
    print("Evaluating...")
    evaluate(estimator, test_X, test_y)

    # (A commented-out manual accuracy check against hard-coded week-1 labels
    # used to live here; it was removed because the label array had spilled
    # onto an uncommented line.)

    # Store model predictions over the weekly test set
    print("Predicting and exporting over the weekly test data...")
    evaluate_and_export(estimator, parsed_data, "{}_{}_{}.csv".format(*STUDENTS))

    print("Done :)")


Loading and preprocessing training data...
(43994, 525) (43994,)
Fitting the model over the preprocesed training data...
Loading and preprocessing the weekly test data (./test_set_week_3.csv)...
(700, 525)
Evaluating...


  self.obj[key] = value


[[13430    50]
 [ 1154    31]]
Accuracy 0.9178997613365155
True negative: 13430, False positive: 50, False Negative: 1154, True Positive: 31
Predicting and exporting over the weekly test data...
Done :)


In [31]:
# Next things to try:
# - Maybe try regular regression to predict the actual date of cancellation.
# - Maybe try Deep Learning approach
# - Try different approach to the cancel policy feature.
# - Collapse uncommon categorical values (e.g. original_payment_currency=UAH) into a single "other" category.