In [123]:
# Important! In order for this file to run without any import errors, it needs to be directly within the "IML.HUJI" directory

In [124]:
# the Estimator class

from __future__ import annotations
from typing import NoReturn
from IMLearn.base import BaseEstimator
import numpy as np
from sklearn.linear_model import LogisticRegression


class AgodaCancellationEstimator(BaseEstimator):
    """
    An estimator for solving the Agoda Cancellation challenge

    Wraps a scikit-learn ``LogisticRegression`` (liblinear solver) behind the
    ``BaseEstimator`` hooks ``_fit`` / ``_predict`` / ``_loss``.
    """

    def __init__(self) -> AgodaCancellationEstimator:
        """
        Instantiate an estimator for solving the Agoda Cancellation challenge

        Attributes
        ----------
        coef_ : ndarray or None
            Coefficients copied from the fitted logistic regression; None until `_fit` runs

        intercept_ : ndarray or None
            Intercept copied from the fitted logistic regression; None until `_fit` runs

        logistic_regression_obj : LogisticRegression
            The underlying scikit-learn model used for fitting and predicting

        model : object or None
            Whatever `logistic_regression_obj.fit` returned; None until `_fit` runs

        fitted_ : bool
            Whether `_fit` has been run on this instance
        """
        super().__init__()
        self.coef_ = None
        self.intercept_ = None
        self.logistic_regression_obj = LogisticRegression(solver='liblinear', max_iter=500)
        self.model = None
        # Same flag name that `_fit` flips to True (previously a differently named
        # `fitted` attribute was set here and never updated).
        self.fitted_ = False

    def _fit(self, X: np.ndarray, y: np.ndarray) -> NoReturn:
        """
        Fit an estimator for given samples

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            Input data to fit an estimator for

        y : ndarray of shape (n_samples, )
            Responses of input data to fit to

        Notes
        -----
        Stores the fitted model's `coef_` and `intercept_` on this instance and
        marks the instance as fitted.
        """
        self.model = self.logistic_regression_obj.fit(X, y)
        self.coef_ = self.model.coef_
        self.intercept_ = self.model.intercept_
        self.fitted_ = True

    def _predict(self, X: np.ndarray) -> np.ndarray:
        """
        Predict responses for given samples using fitted estimator

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            Input data to predict responses for

        Returns
        -------
        responses : ndarray of shape (n_samples, )
            Predicted responses of given samples
        """
        return self.logistic_regression_obj.predict(X)

    def _loss(self, X: np.ndarray, y: np.ndarray) -> float:
        """
        Evaluate performance under the misclassification-error loss function

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            Test samples

        y : ndarray of shape (n_samples, )
            True labels of test samples

        Returns
        -------
        loss : float
            Fraction of samples whose predicted label differs from the true label
        """
        # np.asarray strips any pandas index so the comparison is purely positional
        mismatches = self._predict(X) != np.asarray(y)
        return float(np.mean(mismatches))


In [125]:
# Imports and general configurations

# from challenge.agoda_cancellation_estimator import AgodaCancellationEstimator
from sklearn import model_selection
from sklearn.metrics import confusion_matrix, accuracy_score

import re
import numpy as np
import pandas as pd

# ID numbers of the submitting students; they are joined into the name of the exported predictions file
STUDENTS = ["313234940", "207906306", "204841423"]

# The file should be in the same folder as this notebook
WEEKLY_TEST_SET = "./test_set_week_1.csv"


In [126]:
# Data columns (by type)

ID_COLS = ['h_booking_id', 'hotel_id', 'hotel_country_code', 'h_customer_id']

DATETIME_COLS = ['booking_datetime', 'checkin_date', 'checkout_date', 'hotel_live_date']

CODES_COLS = ['origin_country_code', 'hotel_area_code', 'hotel_city_code']

# (dropped) 'hotel_brand_code', 'hotel_chain_code' (have ~43K nulls!)

CATEGORICAL_COLS = ['accommadation_type_name', 'charge_option', 'customer_nationality',
                    'guest_nationality_country_name', 'language', 'original_payment_method',
                    'original_payment_type', 'original_payment_currency']

NUMERICAL_COLS = ['hotel_star_rating', 'no_of_adults',  'no_of_children', 'no_of_extra_bed', 'no_of_room',
                  'original_selling_amount']

SHOULD_BE_BOOLEAN_COLS = ['guest_is_not_the_customer', 'request_nonesmoke', 'request_latecheckin',
                          'request_highfloor', 'request_largebed', 'request_twinbeds', 'request_airport',
                          'request_earlycheckin']

# The following columns have 25040 nulls:
# request_nonesmoke, request_latecheckin, request_highfloor, request_largebed, request_twinbeds,
# request_airport, request_earlycheckin
# remove them - or fill in nan with 0

BOOLEAN_COLS = ['is_user_logged_in', 'is_first_booking']

LABEL_COL = 'cancellation_datetime'

# Regular expressions for the cancellation policy code. Raw strings are used so that
# `\d` reaches the regex engine verbatim instead of being an invalid string escape.
# No-show clause: an underscore followed by "<amount><N|P>".
NO_SHOW_PATTERN = r'_(\d+)(N|P)'
# Policy stage: "<days>D<amount><N|P>".
POLICY_PATTERN = r'(\d+)D(\d+)(N|P)'

In [134]:

# Data parsing functions

def _charge_as_nights_and_percent(amount, unit, n_nights):
    """Express one policy charge both as a number of nights and as a percentage of the stay."""
    amount = int(amount)
    if unit == "N":
        # charge given in nights -> derive the percentage of the whole stay
        return amount, amount / n_nights * 100
    # charge given in percent -> derive the equivalent number of nights
    return amount * n_nights / 100, amount


def parse_all_cancellation_policy(data):
    """
    Derive numeric cancellation-policy features from the policy code of every row.

    Parameters
    ----------
    data : DataFrame
        Must contain the columns "cancellation_policy_code" (string) and "n_nights".
        "n_nights" must be non-zero, since charges given in nights are converted to
        percentages by dividing by it.

    Returns
    -------
    data : DataFrame
        The same object that was passed in, with eight float columns added:
        no_show_days, no_show_percentage, basic_charge_percentage, basic_charge_by_nights,
        basic_charge_days, charge_percentage, charge_by_nights, charge_days.

    Raises
    ------
    ValueError
        If a policy code contains no "<days>D<amount><N|P>" stage at all.

    Notes
    -----
    * The code "UNKNOWN" is interpreted as '0D100P_100P'; the stored code itself is left untouched.
    * The last stage found in the code feeds the "charge_*" columns, the one before it
      (or the same one, if there is only a single stage) feeds the "basic_charge_*" columns.
    * When the code has no explicit no-show clause, the last stage's charge is used for it.
    * Values are collected first and written as whole columns, by position, so the result
      does not depend on the index labels being unique.
    """
    feature_names = ["no_show_days", "no_show_percentage",
                     "basic_charge_percentage", "basic_charge_by_nights", "basic_charge_days",
                     "charge_percentage", "charge_by_nights", "charge_days"]
    collected = {name: [] for name in feature_names}

    for policy, n_nights in zip(data["cancellation_policy_code"], data["n_nights"]):
        if policy == "UNKNOWN":
            policy = '0D100P_100P'

        no_show = re.findall(NO_SHOW_PATTERN, policy)
        stages = re.findall(POLICY_PATTERN, policy)
        if not stages:
            raise ValueError(f"Unrecognised cancellation policy code: {policy!r}")

        worse_policy = stages[-1]
        basic_policy = stages[-2] if len(stages) > 1 else worse_policy

        nights, percent = _charge_as_nights_and_percent(worse_policy[1], worse_policy[2], n_nights)
        basic_by_nights, basic_percent = _charge_as_nights_and_percent(basic_policy[1], basic_policy[2],
                                                                       n_nights)
        if no_show:
            no_show_days, no_show_percent = _charge_as_nights_and_percent(no_show[0][0], no_show[0][1],
                                                                          n_nights)
        else:
            # no explicit no-show clause: fall back to the last stage's charge
            no_show_days, no_show_percent = nights, percent

        row_values = (no_show_days, no_show_percent,
                      basic_percent, basic_by_nights, int(basic_policy[0]),
                      percent, nights, int(worse_policy[0]))
        for name, value in zip(feature_names, row_values):
            collected[name].append(value)

    # one vectorised write per column, in the same column order as before
    for name in feature_names:
        data[name] = np.asarray(collected[name], dtype=float)

    return data


def parse_data(full_data, test=False, cols=None):
    """
    Turn the raw bookings table into the feature table used by the estimator.

    Parameters
    ----------
    full_data : DataFrame
        Raw bookings. The columns listed in DATETIME_COLS must already hold datetimes.

    test : bool, default False
        When True the label column is not read and rows with missing values are kept.
        When False the label column is converted to a boolean "was cancelled" flag and
        rows that still contain missing values are dropped.

    cols : sequence of column names, optional
        If given, the result has exactly these columns in this order; columns that the
        parsed data does not have are added and filled with 0.

    Returns
    -------
    data : DataFrame
        Parsed features (plus the boolean label column when `test` is False).
    """
    # choose only numerical, dates, boolean, categorical, policy code (and the label for training data):
    selected = (NUMERICAL_COLS + DATETIME_COLS + SHOULD_BE_BOOLEAN_COLS + BOOLEAN_COLS +
                CATEGORICAL_COLS + ["cancellation_policy_code"])
    if not test:
        selected = selected + [LABEL_COL]
    # explicit copy: every assignment below goes to our own frame, never to the caller's
    data = full_data.loc[:, selected].copy()

    # Should we drop duplicates again? if so, we need to make sure we keep a unique identifier (either id or datetime?)

    # todo alternative label: cancellation happened between 2 and 9 days before check-in
    #   (pd.to_datetime(data["checkin_date"]) - pd.to_datetime(data[LABEL_COL])).dt.days.between(2, 9)

    # Columns are replaced with `data[col] = ...` rather than `data.loc[:, col] = ...`:
    # the latter writes into the existing column in place on recent pandas versions and
    # would leave the new boolean values stored under the old column dtype.
    if not test:
        data[LABEL_COL] = data[LABEL_COL].notnull()

    # missing requests count as "not requested"
    for col_name in SHOULD_BE_BOOLEAN_COLS:
        data[col_name] = data[col_name].fillna(0) == 1

    # split every datetime column into calendar components
    for col_name in DATETIME_COLS:
        as_datetime = data[col_name]
        data[col_name + "_year"] = as_datetime.dt.year
        data[col_name + "_month"] = as_datetime.dt.month
        data[col_name + "_day"] = as_datetime.dt.day
        data[col_name + "_day_in_week"] = as_datetime.dt.day_of_week

    data["n_nights"] = (data["checkout_date"] - data["checkin_date"]).dt.days
    data["n_days_from_booking_to_checkin"] = (data["checkin_date"] - data["booking_datetime"]).dt.days

    data = data.drop(DATETIME_COLS, axis=1)

    data = parse_all_cancellation_policy(data)
    data = data.drop(['cancellation_policy_code'], axis=1)

    # Replace categorical features with their dummies.
    # Without a reference layout the first level of each feature is dropped. With `cols`
    # every level is kept and the reference layout decides which ones survive; dropping
    # "the first level present in this data" could otherwise remove a column that the
    # reference layout expects.
    data = pd.get_dummies(data, columns=CATEGORICAL_COLS, drop_first=cols is None)
    if cols is not None:
        # keep exactly `cols`, in order; columns unseen in this data become all-zero
        data = data.reindex(columns=cols, fill_value=0)

    # test rows are kept even when they contain missing values
    # NOTE(review): presumably because every test row needs a prediction - confirm
    if not test:
        data = data.dropna()

    return data



In [128]:
def load_data(filename: str, test=False, cols=None):
    """
    Load and preprocess an Agoda booking cancellation dataset

    Parameters
    ----------
    filename: str
        Path to the csv file holding the bookings

    test: bool, default False
        Whether the file is a test set. A test set is parsed as-is (no duplicate
        removal) and yields features only.

    cols: optional
        Forwarded to `parse_data` together with `test`; only used when `test` is True

    Returns
    -------
    When `test` is True: the DataFrame returned by `parse_data`.
    Otherwise: tuple of (features DataFrame, labels Series), where the labels are the
    LABEL_COL column of the parsed data and the features are all remaining columns.
    """
    raw_bookings = pd.read_csv(filename, parse_dates=DATETIME_COLS)

    if not test:
        # training data: duplicates are removed before parsing
        parsed = parse_data(raw_bookings.drop_duplicates())
        is_label = parsed.columns == LABEL_COL
        return parsed.loc[:, ~is_label], parsed[LABEL_COL]

    return parse_data(raw_bookings, test, cols)


In [129]:
# Handle exporting data

def evaluate_and_export(estimator: AgodaCancellationEstimator, X: np.ndarray, filename: str, y_true=None):
    """
    Either report the given estimator's performance on a labelled set, or export its
    predictions for an unlabelled set to a csv file.

    The exported file has a single column named 'predicted_values' and n_samples rows
    containing the predicted values as integers.

    Parameters
    ----------
    estimator: BaseEstimator or any object implementing predict() method as in BaseEstimator (for example sklearn)
        Fitted estimator to use for prediction

    X: ndarray of shape (n_samples, n_features)
        Design matrix to predict responses for

    filename:
        path to store file at (only written when `y_true` is None)

    y_true: optional
        True labels of `X`. When given, the accuracy and the confusion-matrix counts are
        printed and nothing is written to `filename`.
    """
    predicted = pd.DataFrame(estimator.predict(X))

    if y_true is None:
        # the weekly test: no labels to compare against, so export the predictions
        as_integers = predicted.astype(int).to_numpy()
        export_frame = pd.DataFrame(as_integers, columns=["predicted_values"])
        export_frame.to_csv(filename, index=False)
        return

    # labelled data: report performance only, do not touch the file
    tn, fp, fn, tp = confusion_matrix(y_true, predicted).ravel()
    print("Accuracy", accuracy_score(y_true, predicted))
    print(f"True negative: {tn}, False positive: {fp}, False Negative: {fn}, True Positive: {tp}")



In [None]:
if __name__ == '__main__':
    np.random.seed(0)

    # the predictions file is named after the students' IDs
    export_filename = "{}_{}_{}.csv".format(*STUDENTS)

    # Load data
    print("Loading and preprocessing training data...")
    df, cancellation_labels = load_data("datasets/agoda_cancellation_train.csv")

    # hold out part of the labelled data for a local evaluation
    train_X, test_X, train_y, test_y = model_selection.train_test_split(df, cancellation_labels, random_state=0)

    # Fit model over data
    print("Fitting the model over the preprocesed training data...")
    estimator = AgodaCancellationEstimator()
    estimator.fit(train_X, train_y)

    print("Loading and preprocessing the weekly test data...")
    # Passing cols makes load_data return exactly the training columns (missing ones
    # filled with 0), so no further re-indexing of the result is needed here.
    parsed_data = load_data(WEEKLY_TEST_SET, test=True, cols=train_X.columns)

    # Labels are supplied here, so this call only prints metrics and writes no file
    print("Evaluating...")
    evaluate_and_export(estimator, test_X, export_filename, test_y)

    # Store model predictions over the weekly test set
    print("Predicting and exporting over the weekly test data...")
    evaluate_and_export(estimator, parsed_data, export_filename)

    print("Done :)")


Loading and preprocessing training data...
