In [225]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.linear_model import LogisticRegression
# from sklearn.cross_validation import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn import metrics

In [226]:
# From Kaggle Competition. Expedia Hotel Recommendations
# https://www.kaggle.com/c/expedia-hotel-recommendations

## Load the data from expedia kaggle Challenge
destinations = pd.read_csv('../Final_Project/data/destinations.csv') 
# Read data into pandas and explore
# expedia = pd.read_csv('../Final_Project/data/train.csv') # takes a LONG time to load the full dataset! 
expedia = pd.read_csv('../Final_Project/data/train.csv',nrows=10000) # test code with smaller set first for faster code check. 
# other dataset provided: testing dataset
# df_test = pd.read_csv('../Final_Project/data/test.csv'); # will read this is later when/if needed. 

In [227]:
print 'Expedia Site Dataframe Shape'
print expedia.shape
expedia.head(2)
print 'Destinations Dataframe Shape'
print destinations.shape
destinations.head(2)

Expedia Site Dataframe Shape
(10000, 24)
Destinations Dataframe Shape
(62106, 150)


Unnamed: 0,srch_destination_id,d1,d2,d3,d4,d5,d6,d7,d8,d9,...,d140,d141,d142,d143,d144,d145,d146,d147,d148,d149
0,0,-2.198657,-2.198657,-2.198657,-2.198657,-2.198657,-1.897627,-2.198657,-2.198657,-1.897627,...,-2.198657,-2.198657,-2.198657,-2.198657,-2.198657,-2.198657,-2.198657,-2.198657,-2.198657,-2.198657
1,1,-2.18169,-2.18169,-2.18169,-2.082564,-2.18169,-2.165028,-2.18169,-2.18169,-2.031597,...,-2.165028,-2.18169,-2.165028,-2.18169,-2.18169,-2.165028,-2.18169,-2.18169,-2.18169,-2.18169


Explaination of Columns from https://www.kaggle.com/c/expedia-hotel-recommendations/data 

date_time - Timestamp - string
site_name - ID of the Expedia point of sale - int
posa_continent - ID of continent associated with site_name	- int
user_location_country - The ID of the country the customer is located - int
user_location_region - The ID of the region the customer is located - int
user_location_city - The ID of the city the customer is located - int
orig_destination_distance - Physical distance between a hotel and a customer at the time of search. A null means the distance could not be calculated- double
user_id - ID of user - int
is_mobile - 1 when a user connected from a mobile device, 0 otherwise - tinyint
is_package - 1 if the click/booking was generated as a part of a package, 0 otherwise - int
channel  - 	ID of a marketing channel
srch_ci - Checkin date - string
srch_co - Checkout date - string
srch_adults_cnt - The number of adults specified in the hotel room - int
srch_children_cnt - The number of (extra occupancy) children specified in the hotel room - int
srch_rm_cnt - The number of hotel rooms specified in the search - int
srch_destination_id - ID of the destination where the hotel search was performed - int
srch_destination_type_id - Type of destination - int
hotel_continent - Hotel continent - int
hotel_country - Hotel country - int
hotel_market - Hotel market - int
is_booking - 1 if a booking, 0 if a click - tinyint
cnt - Numer of similar events in the context of the same user session  -bigint
hotel_cluster - ID of a hotel cluster - int

In [228]:
# Print some info about dataset

# user IDs. 
print 'Dataset Stats'
print 'Size of Dataframe', expedia.shape

# unique User counts. 
user = expedia.groupby('user_id').user_id.count()
muser = user.mean()
print 'Unique Users: ', len(user)
print 'Mean entries per user: ', muser

# Number of bookings vs. other entries. 
lbook = len(expedia.loc[(expedia['is_booking']  == 1)])
num_book = expedia.groupby('is_booking').is_booking.count()
print 'Number of entries where users book', lbook, 'total', len(expedia)

# unique hotel cluster counts. 
hc = expedia.groupby('hotel_cluster').hotel_cluster.count()
print 'Unique Hotel Cluter IDs: ', len(hc)

Dataset Stats
Size of Dataframe (10000, 24)
Unique Users:  339
Mean entries per user:  29.4985250737
Number of entries where users book 777 total 10000
Unique Hotel Cluter IDs:  100


In [229]:
# add a column for the epoch time to dataset. Need to do this before time is changed from a string to a date time
import time, os
epochs = []
d = expedia.date_time
p='%Y-%m-%d %H:%M:%S'
for dts in d:
    epoch = int(time.mktime(time.strptime(dts,p)))
    epochs.append(epoch)
expedia['time_epoch'] = epochs
# expedia.head(2)

In [230]:
# define function to find the day of the week. 
# Monday = 1, Sunday = 7
def find_dow(date_col):
    dow = []
    for d in date_col: 
        dow.append(d.isoweekday()) 
    return dow

In [231]:
# Adding Time Feature columns to Dataset 
# TODO replace NAN in check-in and check-out columns

expedia['date_time'] = pd.to_datetime(expedia['date_time'])
expedia['year'] = expedia['date_time'].dt.year
expedia['month'] = expedia['date_time'].dt.month
expedia['day'] = expedia['date_time'].dt.day

# Adding Day of the week the search occured
# Monday = 1, Sunday = 7
date_time = expedia.date_time
dow = find_dow(date_time)
expedia['dow_search'] = dow

# Adding Day of the week check-in date
expedia['srch_ci'] = pd.to_datetime(expedia['srch_ci'])
date_chi = expedia.srch_ci
dow = find_dow(date_chi)
expedia['dow_ci'] = dow

# Adding Day of the week check-out date
expedia['srch_co'] = pd.to_datetime(expedia['srch_co'])
date_co = expedia.srch_co
dow = find_dow(date_co)
expedia['dow_co'] = dow


expedia.head(2)

Unnamed: 0,date_time,site_name,posa_continent,user_location_country,user_location_region,user_location_city,orig_destination_distance,user_id,is_mobile,is_package,...,hotel_country,hotel_market,hotel_cluster,time_epoch,year,month,day,dow_search,dow_ci,dow_co
0,2014-08-11 07:46:59,2,3,66,348,48862,2234.2641,12,0,1,...,50,628,1,1407768419,2014,8,11,1,3.0,7.0
1,2014-08-11 08:22:12,2,3,66,348,48862,2234.2641,12,0,1,...,50,628,1,1407770532,2014,8,11,1,5.0,2.0


In [232]:
# Adding a feature for is_weekend_trip column
# definition: Check-in between Wednesday - Friday and Checkout Sunday - Tuesday  will be a weekend trip. 
# 1 = weekend, 0 = during week trip. 
# dow_ci = expedia.dow_ci
# dow_co = expedia.dow_co
# wt = np.zeros(len(dow_ci))

# for i in wt:
    # if dow_ci >= 3 and dow_ci <= 6:
    #     if dow_co <= 6 and dow_co >= 2:
    #         i = 1

# expedia['is_weekend_trip'] = wt 

 feature ideas: 
 number or clicks before purchase
 repeat clicks? user visits same hotel cluster again 
 
 if user books more than one hotel - is it in the same cluster, same location? 
 time on site searching column (from time_epoch)
 
 add dictionary look up for user.
 Recommendation engine? 

In [233]:
# Adding feature columns to dataset. 

# if there are children - making a binary column of family
is_family = np.where(expedia.srch_children_cnt >= 1, 1, 0)
expedia['is_family'] = is_family

# Adding column for search and is booking. 
# booking cluster  = 100 + cluster ID if booked,  0 if not booking
expedia['booking_clust'] = np.where(expedia.is_booking == 1,(100+ expedia.hotel_cluster) ,0)

expedia.head(2)

Unnamed: 0,date_time,site_name,posa_continent,user_location_country,user_location_region,user_location_city,orig_destination_distance,user_id,is_mobile,is_package,...,hotel_cluster,time_epoch,year,month,day,dow_search,dow_ci,dow_co,is_family,booking_clust
0,2014-08-11 07:46:59,2,3,66,348,48862,2234.2641,12,0,1,...,1,1407768419,2014,8,11,1,3.0,7.0,0,0
1,2014-08-11 08:22:12,2,3,66,348,48862,2234.2641,12,0,1,...,1,1407770532,2014,8,11,1,5.0,2.0,0,101


In [234]:
# expedia.isnull().sum()
# orig_destination_distance,  srch_ci, srch_co,  contains null values
# expedia.loc[pd.isnull(expedia['orig_destination_distance'])] = np.nan
# expedia.orig_destination_distance
expedia.isnull().sum()

date_time                       0
site_name                       0
posa_continent                  0
user_location_country           0
user_location_region            0
user_location_city              0
orig_destination_distance    3729
user_id                         0
is_mobile                       0
is_package                      0
channel                         0
srch_ci                         7
srch_co                         7
srch_adults_cnt                 0
srch_children_cnt               0
srch_rm_cnt                     0
srch_destination_id             0
srch_destination_type_id        0
is_booking                      0
cnt                             0
hotel_continent                 0
hotel_country                   0
hotel_market                    0
hotel_cluster                   0
time_epoch                      0
year                            0
month                           0
day                             0
dow_search                      0
dow_ci        

In [235]:
# Predict if a person is booking or not. 
# We do not care what hotel cluster a person is looking at unless they book. 

# print expedia.hotel_cluster.max()
# print expedia.hotel_cluster.min()
# expedia.corr()


# labels = expedia.columns # features to train with 
# full list
# X = [ 
#     { 'date_time', 'site_name', 'posa_continent', 'user_location_country',
#        'user_location_region', 'user_location_city',
#        'orig_destination_distance', 'user_id', 'is_mobile', 'is_package',
#        'channel', 'srch_ci', 'srch_co', 'srch_adults_cnt',
#        'srch_children_cnt', 'srch_rm_cnt', 'srch_destination_id',
#        'srch_destination_type_id', 'is_booking', 'cnt', 'hotel_continent',
#        'hotel_country', 'hotel_market', 'hotel_cluster'
#      }]

X = expedia.drop(['is_booking','date_time','srch_ci','srch_co','orig_destination_distance'], axis = 1) # features
feature_cols = X.columns # get colnames for use later
# np.array(expedia) 
target = expedia.is_booking # y 

In [236]:
# add a null model for comparison of baseline 

mo_target = target.mode() # guess 0 - they do not book all the time. 
# pred = np.zero(len(expdedia),1)

In [237]:
# train a classification model with existing columns
# trying KNN and log reg to start. 
# Note: do not need to test_train_split because Kaggle provides a separate traing set. 

# knn
# k = 3
# knn = KNeighborsClassifier(n_neighbors=k)
# knn.fit(X, target)
# knn.score(X, target)

# from sklearn.grid_search import GridSearchCV
# k_range = range(2, 30, 1)
# param_grid = dict(n_neighbors=k_range)
# grid = GridSearchCV(knn, param_grid, cv=5, scoring='accuracy')
# grid.fit(X_train, y_train)
# grid.grid_scores_
# grid_mean_scores = [result[1] for result in grid.grid_scores_] # get scores from grid.grid_scores_
# visualize results
# plt.figure()
# plt.plot(k_range, grid_mean_scores)

# retraining with added column
# knn
k = 3
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X, target)
knn.score(X, target)
preds_knn = knn.predict(X)
print 'Accuracy for KNN model'
print accuracy_score(target, preds_knn)
print 'KNN Confusion Matrix'
print metrics.confusion_matrix(target, preds_knn)
target_names = ['Did not Book 0','Booked 1']
print(classification_report(target, preds_knn, target_names=target_names))

# test set 
# preds_knn = knn.predict(X_test)
# print 'Accuracy for KNN model'
# print accuracy_score(y_test, preds_knn)
# print 'KNN Confusion Matrix'
# print metrics.confusion_matrix(y_test, preds_knn)
# print(classification_report(y_test, preds_knn, target_names=target_names))


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [12]:
# LogReg
logreg = LogisticRegression(C=1e9)
logreg.fit(X, target)
assorted_pred_class = logreg.predict(X)
logregm = zip(feature_cols,logreg.coef_[0])
preds_logreg = logreg.predict(X)
print 'Accuracy for Logistic Regression model'
print accuracy_score(target, preds_logreg)
print 'Logistic Regression Confusion Matrix'
print metrics.confusion_matrix(target, preds_logreg)
print(classification_report(target, preds_logreg, target_names=target_names))

Accuracy for Logistic Regression model
0.9223
Logistic Regression Confusion Matrix
[[9223    0]
 [ 777    0]]
                precision    recall  f1-score   support

Did not Book 0       0.92      1.00      0.96      9223
      Booked 1       0.00      0.00      0.00       777

   avg / total       0.85      0.92      0.89     10000



  'precision', 'predicted', average, warn_for)


Switching to predicting which cluster 
Assuming we can use first part to 

In [13]:
# for predicting which hotel cluster the person will book. 

# Select df only when the user is booking the hotel. 
# only look at when a person is booking. 
df_book = expedia.loc[expedia.is_booking == 1]
print df_book.shape
# df_book.corr()

X = df_book.drop(['hotel_cluster','date_time','srch_ci','srch_co','orig_destination_distance'], axis = 1) # features
target = df_book.hotel_cluster

(777, 24)


In [14]:
# Null model

In [15]:
k = 3
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X, target)
knn.score(X, target)
preds_knn = knn.predict(X)
print 'Accuracy for KNN model'
print accuracy_score(target, preds_knn)
print 'KNN Confusion Matrix'
print metrics.confusion_matrix(target, preds_knn)
# target_names = ['Did not Book 0','Booked 1']
# print(classification_report(target, preds_knn, target_names=target_names))
print(classification_report(target, preds_knn))

Accuracy for KNN model
0.418275418275
KNN Confusion Matrix
[[ 2  0  0 ...,  1  0  0]
 [ 0 11  0 ...,  0  0  0]
 [ 0  1  5 ...,  0  0  0]
 ..., 
 [ 0  0  0 ...,  5  0  0]
 [ 0  0  0 ...,  0  0  0]
 [ 0  0  0 ...,  0  0  2]]
             precision    recall  f1-score   support

          0       0.29      0.67      0.40         3
          1       0.31      1.00      0.47        11
          2       0.29      0.83      0.43         6
          3       0.57      1.00      0.73         4
          4       0.42      1.00      0.59         5
          5       0.31      0.71      0.43         7
          6       0.41      1.00      0.58        16
          7       0.31      0.92      0.47        12
          8       0.40      0.50      0.44         4
          9       0.50      0.40      0.44         5
         10       0.38      1.00      0.55         6
         11       0.45      0.83      0.59         6
         12       0.64      0.82      0.72        11
         13       0.52      0.79  

In [16]:
logreg = LogisticRegression(C=1e9)
logreg.fit(X, target)
assorted_pred_class = logreg.predict(X)
logregm = zip(feature_cols,logreg.coef_[0])
preds_logreg = logreg.predict(X)
print 'Accuracy for Logistic Regression model'
print accuracy_score(target, preds_logreg)
print 'Logistic Regression Confusion Matrix'
print metrics.confusion_matrix(target, preds_logreg)
print(classification_report(target, preds_logreg))

Accuracy for Logistic Regression model
0.189189189189
Logistic Regression Confusion Matrix
[[0 0 0 ..., 0 0 0]
 [0 7 0 ..., 0 0 0]
 [0 0 2 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 2 0 1]
 [0 0 0 ..., 1 0 0]
 [0 0 0 ..., 1 0 4]]
             precision    recall  f1-score   support

          0       0.00      0.00      0.00         3
          1       0.26      0.64      0.37        11
          2       0.29      0.33      0.31         6
          3       0.00      0.00      0.00         4
          4       0.00      0.00      0.00         5
          5       0.33      0.14      0.20         7
          6       0.19      0.19      0.19        16
          7       0.11      0.08      0.10        12
          8       0.00      0.00      0.00         4
          9       0.14      0.20      0.17         5
         10       0.00      0.00      0.00         6
         11       0.00      0.00      0.00         6
         12       0.11      0.09      0.10        11
         13       0.44      0.50      

In [18]:
# test Full model
# combination of best first classification + predicting hotel cluster. 

