In [131]:
# imports
import subprocess
import numpy as np
import zipfile
import pandas as pd
import os
import gzip
import sys
import shutil
import urllib.request
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import product
from multiprocessing import Pool

In [3]:
# if youre on a mac and want to run the notebook without letting youre compouter fall asleep, remember to terminate
caffein = subprocess.Popen("caffeinate", shell=True)

## 0. Load  / (remove) data

In [202]:
# config
path = os.getcwd()  # path to project directory
csvs = 'train.csv', 'test.csv', 'destinations.csv'  #  our names for the different csvs
gzs = '2013-2014.gz', '2015.gz', 'destinations.csv.gz'  # names of the gz files which we be unpacked from expedia.zip
url = "https://www.dropbox.com/s/gwrc8o99xcj6r72/Expedia.zip?dl=1" # dl=1
is_development = True
data_path = path + '/data/'
clear_hard_drive = True

In [203]:
# loads the dataset files into a passed directory otherwise current location
def load_files(url, gzs, csvs, data_path, path=os.getcwd()):
    os.chdir(path)

    if all(os.path.isfile(data_path + csv) for csv in csvs):
        print('Data already loaded')
    else:
        # if the Expedia file is not there yet init download it and write it as zip
        if not os.path.isfile('Expedia.zip'):
            print('Downloading Expedia.zip from Dropbox...')
            u = urllib.request.urlopen(url)
            data = u.read()
            u.close()
            
            total = 0
            with open('Expedia.zip', 'wb') as f:
                f.write(data)
                total += 1
                if total % 10000000 == 0:
                    print('Read {} lines...'.format(total))
            print('Finished downloading Expedia.zip from Dropbox')

        # Extract gz files from zip
        with zipfile.ZipFile("Expedia.zip", 'r') as zip_ref:
            print('Unzipping Expedia.zip')
            zip_ref.extractall(path)
            print('Finished unzipping Expedia.zip...')

        os.chdir(path + '/all')

        # extract the csvs from the gzs
        for csv, gz in zip(csvs, gzs):   
            with gzip.open(gz, 'rb') as f_in: 
                with open(csv, 'wb') as f_out:
                    print(f'writing {csv}...')
                    shutil.copyfileobj(f_in, f_out)
                    print(f'Finished writing {csv}')

        # clean up directory
        for f in gzs:
            os.remove(f)
            
        os.chdir(path)  # return to parent directory
        if os.path.exists(data_path): # if the data directory already exists probably empty, remove it
            shutil.rmtree(data_path)
        os.rename(path + '/all', data_path)
        os.remove('Expedia.zip')
        
        print(f'Finished.')

In [204]:
# laoding data into memory
def load_data(is_development, data_path, csvs, write_sample=False):
    print('Loading data into memory / dataframe')
    if not is_development:
        df_train = pd.read_csv(data_path + csvs[0])
        df_test = pd.read_csv(data_path + csvs[1])
        df_destination = pd.read_csv(data_path + csvs[2])  # not even needed. its enough to have the destination id as feature
    else:
        df_train = pd.read_csv(data_path + csvs[0], nrows=1000)
        df_test = pd.read_csv(data_path + csvs[1], nrows=1000)
        df_destination = pd.read_csv(data_path + csvs[2])
        
        if write_sample:
            print('Writing sample files...')
            df_train.to_csv(data_path + 'sample' + csvs[0], index=False)
            df_test.to_csv(data_path + 'sample' + csvs[1], index=False)
            print('Finished writing sample files...')
            
    print('Finished loading data into memory / dataframe')
    return df_train, df_test, df_destination

In [205]:
def remove_data(data_path, csvs, clear_hard_drive=False):
    if not clear_hard_drive:
        return
    print('Removing files from hard drive...')
    cwd = os.getcwd()
    os.chdir(data_path)
    for csv in csvs:
        try:
            print('Removing ', csv, '...')
            os.remove(csv)  
        except EnvironmentError:
            print(csv, ' not found.')
        try: 
            print('Removing sample ', csv, '...')
            os.remove('sample' + csv)
        except EnvironmentError:
            print('sample', csv, ' not found.')
    os.chdir(cwd)
    print('Finished removing files from hard drive...')

In [206]:
load_files(url, gzs, csvs, data_path, path)

Data already loaded


In [253]:
df_train, df_test, df_destination = load_data(is_development, data_path, csvs, write_sample=True)

Loading data into memory / dataframe
Writing sample files...
Finished writing sample files...
Finished loading data into memory / dataframe


In [254]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 24 columns):
date_time                    1000 non-null object
site_name                    1000 non-null int64
posa_continent               1000 non-null int64
user_location_country        1000 non-null int64
user_location_region         1000 non-null int64
user_location_city           1000 non-null int64
orig_destination_distance    268 non-null float64
user_id                      1000 non-null int64
is_mobile                    1000 non-null int64
is_package                   1000 non-null int64
channel                      1000 non-null int64
srch_ci                      1000 non-null object
srch_co                      1000 non-null object
srch_adults_cnt              1000 non-null int64
srch_children_cnt            1000 non-null int64
srch_rm_cnt                  1000 non-null int64
srch_destination_id          1000 non-null int64
srch_destination_type_id     1000 non-null int64
is_booki

## 1. What is the challenge?
Expedia is interested in predicting which hotel group a user is going to book.
* Does that mean in test set every row we look at eventually is a booking and the feature is the hotel class (multi class problem)
* Hence, easiest approach, drop all rows on other 'clicking events' and simply consider rows where is_booking
* But beforehadn construct feature over all rows
* Could you alternatively do, is the even going to be a booking? 1 / 0

## 2. Clean the data / engineer features

In [255]:
# find nans
def checkna(df):
    for col in df.columns:
        if df[col].isnull().any():
            print(col, '| Nan:', (df[col].isnull()).sum())

In [256]:
def fillna_convert(df):
    """Fill empty data"""
    df['orig_destination_distance'].fillna(df['orig_destination_distance'].median(), inplace=True)
    
    # covert data forma
    for col in df[['date_time', 'srch_ci', 'srch_co']].columns:
        df[col] = pd.to_datetime(df[col])

In [257]:
def counted(func):
    """A decorater to keep track of a function call count"""
    def wrapped(*args, **kwargs):
        wrapped.calls += 1
        if wrapped.calls % 10000 == 0:
            print(f'{func.__name__}: {wrapped.calls} rows...')
        return func(*args, **kwargs)
    wrapped.calls = 0
    return wrapped

In [258]:
def engineer_features(df, df_full):

        
    # length of trip
    df['trip_length'] = (df['srch_co'] - df['srch_co']).astype('timedelta64[D]')
    
    # solo trip
    df['solo_trip'] = np.where(((df['srch_adults_cnt'] == 1) & (df['srch_children_cnt'] == 0)), 1, 0)
    
    # aggregate a mean booking rate
    @counted
    def aggregated_booking_rate(instance):
        if instance['is_booking'] == 0:
            return np.nan
        instance_date = instance['date_time']
        instance_id = instance['user_id']
        mean_booking_rate = df_full[(df_full['date_time'] <= instance_date) & (df_full['user_id'] == instance_id)]['is_booking'].mean()
        return mean_booking_rate
        
    df['booking_rate'] = df.apply(aggregated_booking_rate, axis=1)
    
    # aggregate previous bookings & clicks by hotel cluster
    @counted
    def aggregated_previous_cluster(instance, hotel_cluster):
        if instance['is_booking'] == 0:
            return np.nan, np.nan
        instance_date = instance['date_time']
        instance_id = instance['user_id']
        count_booked = len(df_full[(df_full['date_time'] <= instance_date) & (df_full['user_id'] == instance_id) & (df_full['hotel_cluster'] == hotel_cluster) & (df_full['is_booking'] == 1)])
        count_not_booked = len(df_full[(df_full['date_time'] <= instance_date) & (df_full['user_id'] == instance_id) & (df_full['hotel_cluster'] == hotel_cluster) & (df_full['is_booking'] == 0)])
        return count_booked, count_not_booked
            
    for hotel_cluster in df_full['hotel_cluster'].unique():
         df['booked_cluster' + str(hotel_cluster)], df['not_booked_cluster' + str(hotel_cluster)]  = zip(*df.apply(lambda instance: aggregated_previous_cluster(instance, hotel_cluster), axis=1))

    return df

def parallel_feature_engineering(df_full, func=engineer_features, n_cores=4):
    df_splits = np.array_split(df_full, n_cores)
    args = [[df_split, df_full] for df_split in df_splits]
    pool = Pool(n_cores)
    df = pd.concat(pool.starmap(func, args))
    pool.close()
    pool.join()
    return df

In [259]:
def finalize(df):
    """Date times need to be removed before a model is trained."""
    for col in ['date_time', 'srch_ci', 'srch_co', 'user_id']:
        try:
            df.drop(col, axis=1, inplace=True)
        except KeyError:
            pass

In [260]:
# TODO: DOES IT REPRESENT LEAKAGE TO KNOW THE DISTANCE TO THE BOOKED HOTEL BEFOREHAND? I GUESS SO?
#def handle_leakage(df):
    # If its a booking can I already know the distance to the hotel?
    # df.drop(['orig_destination_distance'], axis=1, inplace=True)

In [261]:
fillna_convert(df_train)
fillna_convert(df_test)
df_train_copy = df_train.copy(deep=True)

In [262]:
df_train = parallel_feature_engineering(df_train)

aggregated_previous_cluster: 10000 rows...
aggregated_previous_cluster: 10000 rows...
aggregated_previous_cluster: 20000 rows...
aggregated_previous_cluster: 10000 rows...
aggregated_previous_cluster: 20000 rows...
aggregated_previous_cluster: 20000 rows...
aggregated_previous_cluster: 10000 rows...
aggregated_previous_cluster: 20000 rows...


In [265]:
df_train_copy['is_Train'] = True
df_test['is_Train'] = False
df_test['is_booking'] = 1
df_test_full_history = df_train_copy.append(df_test)
print(df_test_full_history.shape)
df_test_full_history = parallel_feature_engineering(df_test_full_history)
print(df_test_full_history.shape)
df_test = df_test_full_history[df_test_full_history['is_Train'] == False]
df_test = df_test.drop(['is_booking', 'is_Train', 'hotel_cluster', 'id'], axis=1)

(2000, 26)
aggregated_previous_cluster: 10000 rows...
aggregated_previous_cluster: 10000 rows...
aggregated_previous_cluster: 20000 rows...
aggregated_previous_cluster: 20000 rows...
aggregated_previous_cluster: 30000 rows...
aggregated_previous_cluster: 40000 rows...
aggregated_previous_cluster: 30000 rows...
aggregated_previous_cluster: 40000 rows...
aggregated_previous_cluster: 10000 rows...
aggregated_previous_cluster: 10000 rows...
aggregated_previous_cluster: 20000 rows...
aggregated_previous_cluster: 20000 rows...
aggregated_previous_cluster: 30000 rows...
aggregated_previous_cluster: 30000 rows...
aggregated_previous_cluster: 40000 rows...
aggregated_previous_cluster: 40000 rows...
(2000, 221)


In [268]:
# current strategy is to only use rows where booking is one and aggregate all info
len_before = len(df_train)
df_train = df_train[df_train['is_booking'] == 1]
df_train = df_train.drop('is_booking', axis=1)
print(f'Dropped {len_before - len(df_train)} rows which did not represent a booking in df train.')

Dropped 936 rows which did not represent a booking in df train.


In [270]:
pd.set_option('display.max_columns', None)
df_test.head(10)

Unnamed: 0,channel,cnt,date_time,hotel_continent,hotel_country,hotel_market,is_mobile,is_package,orig_destination_distance,posa_continent,site_name,srch_adults_cnt,srch_children_cnt,srch_ci,srch_co,srch_destination_id,srch_destination_type_id,srch_rm_cnt,user_id,user_location_city,user_location_country,user_location_region,trip_length,solo_trip,booking_rate,booked_cluster1.0,not_booked_cluster1.0,booked_cluster80.0,not_booked_cluster80.0,booked_cluster21.0,not_booked_cluster21.0,booked_cluster92.0,not_booked_cluster92.0,booked_cluster41.0,not_booked_cluster41.0,booked_cluster69.0,not_booked_cluster69.0,booked_cluster70.0,not_booked_cluster70.0,booked_cluster98.0,not_booked_cluster98.0,booked_cluster10.0,not_booked_cluster10.0,booked_cluster18.0,not_booked_cluster18.0,booked_cluster28.0,not_booked_cluster28.0,booked_cluster25.0,not_booked_cluster25.0,booked_cluster2.0,not_booked_cluster2.0,booked_cluster16.0,not_booked_cluster16.0,booked_cluster94.0,not_booked_cluster94.0,booked_cluster77.0,not_booked_cluster77.0,booked_cluster24.0,not_booked_cluster24.0,booked_cluster58.0,not_booked_cluster58.0,booked_cluster36.0,not_booked_cluster36.0,booked_cluster82.0,not_booked_cluster82.0,booked_cluster46.0,not_booked_cluster46.0,booked_cluster30.0,not_booked_cluster30.0,booked_cluster29.0,not_booked_cluster29.0,booked_cluster57.0,not_booked_cluster57.0,booked_cluster85.0,not_booked_cluster85.0,booked_cluster5.0,not_booked_cluster5.0,booked_cluster59.0,not_booked_cluster59.0,booked_cluster62.0,not_booked_cluster62.0,booked_cluster81.0,not_booked_cluster81.0,booked_cluster6.0,not_booked_cluster6.0,booked_cluster53.0,not_booked_cluster53.0,booked_cluster8.0,not_booked_cluster8.0,booked_cluster73.0,not_booked_cluster73.0,booked_cluster26.0,not_booked_cluster26.0,booked_cluster95.0,not_booked_cluster95.0,booked_cluster13.0,not_booked_cluster13.0,booked_cluster42.0,not_booked_cluster42.0,booked_cluster9.0,not_booked_cluster9.0,booked_cluster55.0,not_booked_cluster55.0,booked_cluster91.0,not_booked_cluster91.0,booked_cluster72.0,not_booked_cluster72.0,booked_cluster65.0,not_booked_cluster65.0,booked_cluster56.0,not_booked_cluster56.0,booked_cluster68.0,not_booked_cluster68.0,booked_cluster75.0,not_booked_cluster75.0,booked_cluster38.0,not_booked_cluster38.0,booked_cluster67.0,not_booked_cluster67.0,booked_cluster78.0,not_booked_cluster78.0,booked_cluster43.0,not_booked_cluster43.0,booked_cluster37.0,not_booked_cluster37.0,booked_cluster99.0,not_booked_cluster99.0,booked_cluster35.0,not_booked_cluster35.0,booked_cluster20.0,not_booked_cluster20.0,booked_cluster90.0,not_booked_cluster90.0,booked_cluster50.0,not_booked_cluster50.0,booked_cluster22.0,not_booked_cluster22.0,booked_cluster64.0,not_booked_cluster64.0,booked_cluster60.0,not_booked_cluster60.0,booked_cluster11.0,not_booked_cluster11.0,booked_cluster97.0,not_booked_cluster97.0,booked_cluster89.0,not_booked_cluster89.0,booked_cluster83.0,not_booked_cluster83.0,booked_cluster14.0,not_booked_cluster14.0,booked_cluster51.0,not_booked_cluster51.0,booked_cluster15.0,not_booked_cluster15.0,booked_cluster40.0,not_booked_cluster40.0,booked_cluster45.0,not_booked_cluster45.0,booked_cluster84.0,not_booked_cluster84.0,booked_cluster54.0,not_booked_cluster54.0,booked_cluster76.0,not_booked_cluster76.0,booked_cluster17.0,not_booked_cluster17.0,booked_cluster49.0,not_booked_cluster49.0,booked_cluster32.0,not_booked_cluster32.0,booked_cluster7.0,not_booked_cluster7.0,booked_cluster88.0,not_booked_cluster88.0,booked_cluster79.0,not_booked_cluster79.0,booked_cluster4.0,not_booked_cluster4.0,booked_cluster39.0,not_booked_cluster39.0,booked_cluster47.0,not_booked_cluster47.0,booked_cluster19.0,not_booked_cluster19.0,booked_cluster33.0,not_booked_cluster33.0,booked_cluster48.0,not_booked_cluster48.0,booked_cluster93.0,not_booked_cluster93.0,booked_cluster0.0,not_booked_cluster0.0,booked_cluster34.0,not_booked_cluster34.0,booked_cluster63.0,not_booked_cluster63.0,booked_cluster96.0,not_booked_cluster96.0,booked_cluster44.0,not_booked_cluster44.0,booked_cluster3.0,not_booked_cluster3.0,booked_cluster12.0,not_booked_cluster12.0,booked_cluster86.0,not_booked_cluster86.0,booked_cluster71.0,not_booked_cluster71.0,booked_cluster61.0,not_booked_cluster61.0,booked_cluster66.0,not_booked_cluster66.0,booked_cluster52.0,not_booked_cluster52.0,booked_clusternan,not_booked_clusternan
0,3,,2015-09-03 17:09:54,6,204,27,1,0,5539.0567,3,2,2,0,2016-05-19,2016-05-23,12243,6,1,1,37449,66,174,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,10,,2015-09-24 17:38:35,6,204,1540,1,0,5873.2923,3,2,2,0,2016-05-12,2016-05-15,14474,7,1,1,37449,66,174,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,,2015-06-07 15:53:02,2,50,699,0,0,3975.9776,3,2,4,0,2015-07-26,2015-07-27,11353,1,1,20,17440,66,142,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,10,,2015-09-14 14:49:10,2,50,628,0,1,1508.5975,3,2,2,0,2015-09-14,2015-09-16,8250,1,1,28,34156,66,258,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,,2015-07-17 09:32:04,2,50,538,0,0,66.7913,3,2,2,0,2015-07-22,2015-07-23,11812,1,1,50,36345,66,467,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0,,2015-07-21 11:58:45,2,50,447,0,0,359.8521,3,2,4,0,2015-07-22,2015-07-24,11827,1,2,51,48189,66,311,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,10,,2015-07-29 07:58:39,2,50,696,0,0,237.3465,3,2,2,0,2015-08-02,2015-08-03,8271,1,1,51,48189,66,311,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,10,,2015-08-01 20:13:15,2,50,191,0,0,216.5785,3,2,2,0,2015-08-03,2015-08-04,8291,1,1,51,24811,66,348,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0,,2015-11-07 12:29:09,2,50,628,0,0,2337.6754,3,2,2,0,2015-12-30,2015-12-31,8250,1,1,51,48189,66,311,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,10,,2015-11-08 16:21:37,2,50,364,0,0,2539.7995,3,2,2,0,2016-01-02,2016-01-03,9145,1,1,51,48189,66,311,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
finalize(df_train)
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 64 entries, 1 to 998
Columns: 213 entries, booked_cluster0 to is_Train
dtypes: bool(1), float64(193), int64(19)
memory usage: 106.6 KB


## 3. Data exploration

In [None]:
# What are the most common hotel clusters?
df_train.hotel_cluster.value_counts().head(10)

In [None]:
# plot hotel clusters
fig, ax = plt.subplots(1, 1, figsize=(14, 4))
x = df_train['hotel_cluster'].value_counts().reset_index()
ax.bar(x['index'].to_numpy(), x['hotel_cluster'].to_numpy())

In [None]:
fig, ax = plt.subplots(figsize=(14, 10))
# sns.heatmap(df_train.corr(),ax=ax,annot=True,linewidths=2)

## 3. Models

In [23]:
# pepare dfs for training
X_train = df_train.drop(['hotel_cluster'], axis=1)
y_train = df_train['hotel_cluster']

In [24]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import numpy as np
import warnings

warnings.simplefilter("ignore")

forest = RandomForestClassifier(n_jobs=-1)
forest.fit(X_train, y_train)

np.mean(cross_val_score(forest, X_train, y_train, cv=10, scoring='accuracy'))

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [None]:
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier()
tree.fit(X_train, y_train)

np.mean(cross_val_score(tree, X_train, y_train, cv=10, scoring='accuracy'))

In [None]:
# feature importance in random forest
feature_importances = pd.DataFrame(forest.feature_importances_, index = X_train.columns,
                                   columns=['importance']).sort_values('importance', ascending=False)
feature_importances

In [None]:
# feature importance in decision tree
feature_importances = pd.DataFrame(tree.feature_importances_, index = X_train.columns,
                                   columns=['importance']).sort_values('importance', ascending=False)
feature_importances

In [None]:
import xgboost as xgb
from xgboost import XGBClassifier

xgb = XGBClassifier(n_jobs=-1, max_depth=2, eta=1)

X_train_xgb = X_train.loc[:,feature_importances.index[:50]]
xgb.fit(X_train_xgb, y_train)

np.mean(cross_val_score(xgb, X_train, y_train, cv=10, scoring='accuracy'))

## 4. Remove data / kill subproccess

In [None]:
remove_data(data_path, csvs, clear_hard_drive)

In [None]:
caffein.kill()