In [64]:
# imports
import zipfile
import pandas as pd
import os
import gzip
import sys
import shutil
import urllib.request

In [65]:
# config
path = '/Users/max/code/maxwrf/NOVA/machine-learning/project'  # path to project directory
csvs = 'train.csv', 'test.csv', 'destinations.csv'  #  our names for the different csvs
gzs = '2013-2014.gz', '2015.gz', 'destinations.csv.gz'  # names of the gz files which we be unpacked from expedia.zip
url = "https://www.dropbox.com/s/gwrc8o99xcj6r72/Expedia.zip?dl=1" # dl=1
is_development = True
data_path = path + '/data/'
clear_hard_drive = True

In [66]:
# loads the dataset files into a passed directory otherwise current location
def load_files(url, gzs, csvs, data_path, path=os.getcwd()):
    os.chdir(path)

    if all(os.path.isfile(data_path + csv) for csv in csvs):
        print('Data already loaded')
    else:
        # if the Expedia file is not there yet init download it and write it as zip
        if not os.path.isfile('Expedia.zip'):
            print('Downloading Expedia.zip from Dropbox...')
            u = urllib.request.urlopen(url)
            data = u.read()
            u.close()
            
            with open('Expedia.zip', 'wb') as f:
                f.write(data)
            print('Finished downloading Expedia.zip from Dropbox')

        # Extract gz files from zip
        with zipfile.ZipFile("Expedia.zip", 'r') as zip_ref:
            print('Unzipping Expedia.zip')
            zip_ref.extractall(path)
            print('Finished unzipping Expedia.zip...')

        os.chdir(path + '/all')

        # extract the csvs from the gzs
        for csv, gz in zip(csvs, gzs):   
            with gzip.open(gz, 'rb') as f_in: 
                with open(csv, 'wb') as f_out:
                    print(f'writing {csv}...')
                    shutil.copyfileobj(f_in, f_out)
                    print(f'Finished writing {csv}')

        # clean up directory
        for f in gzs:
            os.remove(f)
            
        os.chdir(path)  # return to parent directory
        if os.path.exists(data_path): # if the data directory already exists probably empty, remove it
            shutil.rmtree(data_path)
        os.rename(path + '/all', data_path)
        os.remove('Expedia.zip')
        
        print(f'Finished.')

In [67]:
# laoding data into memory
def load_data(is_development, data_path, csvs, write_sample=False):
    print('Loading data into memory / dataframe')
    if not is_development:
        df_train = pd.read_csv(data_path + csvs[0])
        df_test = pd.read_csv(data_path + csvs[1])
        df_destination = pd.read_csv(data_path + csvs[2])  # not even needed. its enough to have the destination id as feature
    else:
        df_train = pd.read_csv(data_path + csvs[0], nrows=100000)
        df_test = pd.read_csv(data_path + csvs[1], nrows=10000)
        df_destination = pd.read_csv(data_path + csvs[2])
        
        if write_sample:
            print('Writing sample files...')
            df_train.to_csv(data_path + 'sample' + csvs[0], index=False)
            df_test.to_csv(data_path + 'sample' + csvs[1], index=False)
            print('Finished writing sample files...')
            
    print('Finished loading data into memory / dataframe')
    return df_train, df_test, df_destination

In [68]:
def remove_data(data_path, csvs, clear_hard_drive=False):
    if not clear_hard_drive:
        return
    print('Removing files from hard drive...')
    cwd = os.getcwd()
    os.chdir(data_path)
    for csv in csvs:
        try:
            print('Removing ', csv, '...')
            os.remove(csv)  
        except EnvironmentError:
            print(csv, ' not found.')
        try: 
            print('Removing sample ', csv, '...')
            os.remove('sample' + csv)
        except EnvironmentError:
            print('sample', csv, ' not found.')
    os.chdir(cwd)
    print('Finished removing files from hard drive...')

In [69]:
load_files(url, gzs, csvs, data_path, path)

Downloading Expedia.zip from Dropbox...
Finished downloading Expedia.zip from Dropbox
Unzipping Expedia.zip
Finished unzipping Expedia.zip...
writing train.csv...
Finished writing train.csv
writing test.csv...
Finished writing test.csv
writing destinations.csv...
Finished writing destinations.csv
Finished.


In [70]:
df_train, df_test, df_destination = load_data(is_development, data_path, csvs, write_sample=True)

Loading data into memory / dataframe
Writing sample files...
Finished writing sample files...
Finished loading data into memory / dataframe


In [71]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 24 columns):
date_time                    100000 non-null object
site_name                    100000 non-null int64
posa_continent               100000 non-null int64
user_location_country        100000 non-null int64
user_location_region         100000 non-null int64
user_location_city           100000 non-null int64
orig_destination_distance    63078 non-null float64
user_id                      100000 non-null int64
is_mobile                    100000 non-null int64
is_package                   100000 non-null int64
channel                      100000 non-null int64
srch_ci                      99929 non-null object
srch_co                      99929 non-null object
srch_adults_cnt              100000 non-null int64
srch_children_cnt            100000 non-null int64
srch_rm_cnt                  100000 non-null int64
srch_destination_id          100000 non-null int64
srch_destination_type

In [72]:
df_train.head()

Unnamed: 0,date_time,site_name,posa_continent,user_location_country,user_location_region,user_location_city,orig_destination_distance,user_id,is_mobile,is_package,...,srch_children_cnt,srch_rm_cnt,srch_destination_id,srch_destination_type_id,is_booking,cnt,hotel_continent,hotel_country,hotel_market,hotel_cluster
0,2014-08-11 07:46:59,2,3,66,348,48862,2234.2641,12,0,1,...,0,1,8250,1,0,3,2,50,628,1
1,2014-08-11 08:22:12,2,3,66,348,48862,2234.2641,12,0,1,...,0,1,8250,1,1,1,2,50,628,1
2,2014-08-11 08:24:33,2,3,66,348,48862,2234.2641,12,0,0,...,0,1,8250,1,0,1,2,50,628,1
3,2014-08-09 18:05:16,2,3,66,442,35390,913.1932,93,0,0,...,0,1,14984,1,0,1,2,50,1457,80
4,2014-08-09 18:08:18,2,3,66,442,35390,913.6259,93,0,0,...,0,1,14984,1,0,1,2,50,1457,21


In [73]:
df = df_train.groupby(['srch_destination_id', 'hotel_cluster'])['is_booking'].agg(['sum','count']).reset_index()
# count represents a click
# sum represents a booking
df.rename({'sum': 'clicks', 'count': 'bookings'}, axis=1, inplace=True)
df['score'] = df['clicks'] * 0.05 + df['bookings']
df.groupby(['srch_destination_id']).apply(lambda g: g.nlargest(5, ['score']))

Unnamed: 0_level_0,Unnamed: 1_level_0,srch_destination_id,hotel_cluster,clicks,bookings,score
srch_destination_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
8,0,8,32,1,2,2.05
8,2,8,77,1,2,2.05
8,1,8,60,0,1,1.00
11,3,11,94,1,2,2.05
14,4,14,20,1,3,3.05
...,...,...,...,...,...,...
64986,25827,64986,91,0,1,1.00
64999,25829,64999,54,0,1,1.00
65035,25830,65035,10,1,7,7.05
65035,25831,65035,35,0,1,1.00


In [76]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import numpy as np
import warnings

warnings.simplefilter("ignore")

# Simplest option --> create a df which only has the rows where booking == 1 -> Huge data leakage
df_train_simple = df_train[df_train['is_booking'] == 1]
# df_test_simple = df_test[df_test['is_booking'] == 1]

# only use numeric columns and only columns which do not aggregate information based on clicking events 
train_cols = ['site_name', 'user_location_region', 'is_package', 'srch_adults_cnt', 'srch_children_cnt', 'srch_destination_id', 'hotel_market', 'hotel_country', 'hotel_cluster']
df_train_simple = df_train_simple[train_cols]
df_test_simple = df_train_simple[train_cols]

X_train = df_train_simple.drop(['hotel_cluster'], axis=1).values
y_train = df_train_simple['hotel_cluster'].values

clf = RandomForestClassifier()
clf.fit(X_train, y_train)

np.mean(cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy'))

0.08887545344619105

In [77]:
remove_data(data_path, csvs, clear_hard_drive)

Removing files from hard drive...
Removing  train.csv ...
Removing sample  train.csv ...
Removing  test.csv ...
Removing sample  test.csv ...
Removing  destinations.csv ...
Removing sample  destinations.csv ...
sample destinations.csv  not found.
Finished removing files from hard drive...
