In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import metrics
from datetime import date

# import modelling from sklearn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import cross_val_score
from sklearn.tree import export_graphviz
from sklearn.cross_validation import train_test_split

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

## Import Data and Data Prep

In [3]:
# Data source: Kaggle competition "Expedia Hotel Recommendations"
# https://www.kaggle.com/c/expedia-hotel-recommendations

# Locations of the two CSV files read by this notebook.
destinations_csv = '../Final_Project/data/destinations.csv'
train_csv = '../Final_Project/data/train.csv'

# Per-destination table (joined later on srch_destination_id).
destinations = pd.read_csv(destinations_csv)

# Full training log.  This read is slow; while iterating on the code,
# pass nrows=10000 to pd.read_csv to work on a small sample instead.
expedia = pd.read_csv(train_csv)

# The competition also provides test.csv; it is not loaded here.

In [4]:
# Report the (rows, columns) shape of both raw frames.
for title, frame in (('Expedia Site Dataframe Shape', expedia),
                     ('Destinations Dataframe Shape', destinations)):
    print(title)
    print(frame.shape)

Expedia Site Dataframe Shape
(37670293, 24)
Destinations Dataframe Shape
(62106, 150)


In [5]:
# Keep only the rows that represent an actual booking (is_booking == 1);
# every later cell works on this booking-only frame.
book = expedia['is_booking'].eq(1)
expedia = expedia.loc[book]

print('Expedia Training when isbooking = 1')
print(expedia.shape)

Expedia Training when isbooking = 1
(3000693, 24)


Explanation of columns, from https://www.kaggle.com/c/expedia-hotel-recommendations/data

date_time - Timestamp - string
site_name - ID of the Expedia point of sale - int
posa_continent - ID of continent associated with site_name	- int
user_location_country - The ID of the country the customer is located - int
user_location_region - The ID of the region the customer is located - int
user_location_city - The ID of the city the customer is located - int
orig_destination_distance - Physical distance between a hotel and a customer at the time of search. A null means the distance could not be calculated - double
user_id - ID of user - int
is_mobile - 1 when a user connected from a mobile device, 0 otherwise - tinyint
is_package - 1 if the click/booking was generated as a part of a package, 0 otherwise - int
channel  - 	ID of a marketing channel
srch_ci - Checkin date - string
srch_co - Checkout date - string
srch_adults_cnt - The number of adults specified in the hotel room - int
srch_children_cnt - The number of (extra occupancy) children specified in the hotel room - int
srch_rm_cnt - The number of hotel rooms specified in the search - int
srch_destination_id - ID of the destination where the hotel search was performed - int
srch_destination_type_id - Type of destination - int
hotel_continent - Hotel continent - int
hotel_country - Hotel country - int
hotel_market - Hotel market - int
is_booking - 1 if a booking, 0 if a click - tinyint
cnt - Number of similar events in the context of the same user session - bigint
hotel_cluster - ID of a hotel cluster - int

In [None]:
# Print some info about dataset

# user IDs. 
print 'Dataset Stats'
print 'Size of Dataframe', expedia.shape

# unique User counts. 
user = expedia.groupby('user_id').user_id.count()
muser = user.mean()
print 'Unique Users: ', len(user)
print 'Mean entries per user: ', muser

# Number of bookings vs. other entries. 
lbook = len(expedia.loc[(expedia['is_booking']  == 1)])
num_book = expedia.groupby('is_booking').is_booking.count()
print 'Number of entries where users book', lbook, 'total', len(expedia)

# unique hotel cluster counts. 
hc = expedia.groupby('hotel_cluster').hotel_cluster.count()
print 'Unique Hotel Cluter IDs: ', len(hc)

Dataset Stats
Size of Dataframe (3000693, 24)
Unique Users:  813985
Mean entries per user:  3.68642296848
Number of entries where users book 3000693 total 3000693
Unique Hotel Cluter IDs:  100


In [None]:
# Prep Expedia train dataset: add the search time as epoch seconds and the
# per-user elapsed time ("time on site").
# This cell must run while date_time is still a string, i.e. before the
# later cell that converts it with pd.to_datetime.
import time, os

# Seconds since the Unix epoch for every search timestamp.
# time.mktime interprets the parsed struct as local time, so the absolute
# values depend on the machine's timezone.
p = '%Y-%m-%d %H:%M:%S'
expedia['search_time_epoch'] = [int(time.mktime(time.strptime(dts, p)))
                                for dts in expedia.date_time]

# Time on site: seconds elapsed since the same user's earliest row
# (t0 = that user's first entry, so the first entry gets 0).
# groupby/transform broadcasts each user's minimum back onto that user's
# rows in a single pass; the earlier per-row loop rescanned the whole frame
# once for every row, which is quadratic in the number of rows.
first_seen = expedia.groupby('user_id')['search_time_epoch'].transform('min')
expedia['tos'] = expedia['search_time_epoch'] - first_seen

# Epoch columns for srch_ci / srch_co are intentionally not created:
# those fields carry a date only, so a time of day would have to be assumed.

expedia.head(2)

In [None]:
# Day-of-week helper.
# Numbering follows isoweekday(): Monday = 1, Sunday = 7
def find_dow(date_col):
    """Return the ``isoweekday()`` value of every entry in ``date_col``.

    date_col: iterable whose elements expose an ``isoweekday()`` method
        (for example ``datetime.date`` objects).
    Returns a new list with one result per element, in the same order.
    """
    return [entry.isoweekday() for entry in date_col]

In [None]:
# Adding time feature columns to the dataset.
# All derived columns use vectorised .dt accessors rather than per-row
# Python loops, which matters on a frame with millions of rows.

# Missing check-in / check-out values are replaced with 0 so that the
# pd.to_datetime calls below receive no nulls.
# NOTE(review): a 0 presumably parses to 1970-01-01, which gives those rows
# an arbitrary weekday and stay length -- confirm this is acceptable.
expedia['srch_ci'] = expedia['srch_ci'].fillna(0)
expedia['srch_co'] = expedia['srch_co'].fillna(0)

# Calendar parts of the search timestamp.
expedia['date_time'] = pd.to_datetime(expedia['date_time'])
expedia['year'] = expedia['date_time'].dt.year
expedia['month'] = expedia['date_time'].dt.month
expedia['day'] = expedia['date_time'].dt.day

# -------------------
# Day of the week the search occurred.
# dt.dayofweek counts from Monday = 0, so adding 1 gives the same
# Monday = 1 ... Sunday = 7 numbering as isoweekday().
date_time = expedia['date_time']
expedia['dow_search'] = date_time.dt.dayofweek + 1

# -------------------
# Day of the week of the check-in date.
expedia['srch_ci'] = pd.to_datetime(expedia['srch_ci'])
date_ci = expedia['srch_ci']
expedia['dow_ci'] = date_ci.dt.dayofweek + 1

# -------------------
# Day of the week of the check-out date.
expedia['srch_co'] = pd.to_datetime(expedia['srch_co'])
date_co = expedia['srch_co']
expedia['dow_co'] = date_co.dt.dayofweek + 1

# -------------------
# Length of stay in whole days, stored as float like the original column.
delta = date_co - date_ci
expedia['length_of_stay'] = delta.dt.days.astype(float)

# -------------------
# TODO: an 'is_business_trip' flag (check-in Sunday-Thursday, check-out
# Monday-Friday of the same week, 1 = business trip) was sketched here but
# is disabled; no such column is created.

 feature ideas: 
 number of clicks before purchase
 repeat clicks? user visits same hotel cluster again 
 
 if user books more than one hotel - is it in the same cluster, same location? 
 time on site searching column (from time_epoch)
 
 add dictionary look up for user.
 Recommendation engine? 

In [None]:
# Derived indicator columns.

# is_family: 1 when the search lists at least one child, otherwise 0.
has_children = expedia['srch_children_cnt'] >= 1
is_family = has_children.astype(int).values
expedia['is_family'] = is_family

# booking_clust: 100 + hotel_cluster for booking rows, 0 for every other row.
booked = expedia['is_booking'] == 1
expedia['booking_clust'] = np.where(booked, expedia['hotel_cluster'] + 100, 0)

In [None]:
# Prep destinations dataset: summarise each destination's latent columns
# with mean, median, std, max, min, range and a sign flag.
summary_cols = ['mean_latent', 'median_latent', 'std_latent',
                'mean_p_std_latent', 'max_latent', 'min_latent',
                'is_positive_review', 'range_latent']

# Statistics are computed over the latent columns only.  The id column is
# excluded, and so are summary columns left over from an earlier run of this
# cell, so re-running the cell gives the same result.
already_added = [col for col in summary_cols if col in destinations.columns]
d = destinations.drop(['srch_destination_id'] + already_added, axis=1)

destinations['mean_latent'] = np.nanmean(d, axis=1)
destinations['median_latent'] = np.nanmedian(d, axis=1)
# Computed from d, not destinations: the full frame also holds the id and
# the summary columns added above, which would swamp the spread.
destinations['std_latent'] = np.nanstd(d, axis=1)
destinations['mean_p_std_latent'] = abs(destinations.mean_latent) + destinations.std_latent
destinations['max_latent'] = np.nanmax(d, axis=1)
destinations['min_latent'] = np.nanmin(d, axis=1)
destinations['is_positive_review'] = np.where(destinations.mean_latent > 0, 1, 0)
destinations['range_latent'] = destinations.max_latent - destinations.min_latent

# Only the id and the summary columns are carried into training.
use = ['srch_destination_id'] + summary_cols
destinations_use = destinations[use]

In [None]:
# Attach the per-destination summary columns to every expedia row.
# Left join on srch_destination_id: every expedia row is kept, and rows
# whose destination has no entry in destinations_use get NaN in the
# added columns.
expedia = pd.merge(expedia, destinations_use,
                   how='left', on='srch_destination_id')

In [None]:
# Replace missing values in the destination summary columns with 0.
latent_summary_cols = ['mean_latent', 'median_latent', 'std_latent',
                       'mean_p_std_latent', 'max_latent', 'min_latent',
                       'is_positive_review', 'range_latent']
for col in latent_summary_cols:
    expedia[col] = expedia[col].fillna(0)

# Some summaries can be negative; add absolute-value copies (abs_<name>)
# for the naive Bayes model.
for col in ['mean_latent', 'median_latent', 'max_latent', 'min_latent']:
    expedia['abs_' + col] = expedia[col].abs()

print(expedia.shape)
expedia.head(2)

In [None]:
# Number of missing values remaining in each column.
null_counts = expedia.isnull().sum()
print(null_counts)

## Predicting Which Hotel Cluster the User Will Book

## Setting Target and Predictors

In [None]:
# For predicting which hotel cluster the person will book. 
features = ['site_name', 'posa_continent', 'user_location_country',\
            'user_location_region', 'user_location_city',\
            'user_id', 'is_mobile', 'is_package',\
            'channel', 'srch_adults_cnt',\
            'srch_children_cnt', 'srch_rm_cnt', 'srch_destination_id',\
            'srch_destination_type_id', 'cnt', 'hotel_continent',\
            'hotel_country', 'hotel_market', 'hotel_cluster',\
            'search_time_epoch', 'tos', 'year', 'month', 'day', 'dow_search',\
            'dow_ci', 'dow_co', 'length_of_stay',\
            'is_family', 'mean_latent', 'median_latent',\
            'std_latent', 'mean_p_std_latent', 'max_latent',\
            'min_latent', 'is_positive_review', 'range_latent',\
            'abs_mean_latent','abs_median_latent', 'abs_max_latent','abs_min_latent']

# 'is_business_trip',
df_book = expedia
print 'Traing dataset', df_book.shape

# df_book.corr()
ignore = ['hotel_cluster', 'date_time', 'srch_ci', 'srch_co',\
          'booking_clust', 'orig_destination_distance']
X = df_book.drop(ignore, axis = 1) # features
target = df_book.hotel_cluster

## Null Model

In [None]:
# Null model
# Compare your best RMSE on testing set with the RMSE for the "null model", which is the model that ignores
#   all features and simply predicts the mean rating in the training set for all observations in the testing set.

# Cross-Validation by
cvn = 10

predv = (df_book.hotel_cluster.mode()[0])
null_pred = np.ones(len(df_book.hotel_cluster)) * predv
df_book['prediction'] = null_pred
# y_train['prediction'] = np.ones(len(y_train.hotel_cluster)) * predv
# print df_book.isnull().sum()

# calculate RMSE for those predictions
rmse_null = np.sqrt(metrics.mean_squared_error(df_book.hotel_cluster, df_book.prediction))

# rmse_null_test = np.sqrt(metrics.mean_squared_error(y_test.hotel_cluster, y_train.prediction))
print 'Hypthesis Value: ', predv 
print 'Null Hypothesis RMSE: ', rmse_null


## KNN 

In [None]:
# find best K to use Grid Search 2:50 by 1
k = 3
knn = KNeighborsClassifier(n_neighbors=k)

k_range = range(2, 50, 1)
param_grid = dict(n_neighbors=k_range)
grid = GridSearchCV(knn, param_grid, cv=5, scoring='accuracy')
grid.fit(X, target)
grid.grid_scores_
grid_mean_scores = [result[1] for result in grid.grid_scores_] # get scores from grid.grid_scores_
# visualize results
plt.figure()
plt.plot(k_range, grid_mean_scores)

print 'best grid score', grid.best_score_ 
best_k = grid.best_params_['n_neighbors']

print 'Using K = ', best_k
# knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X, target)
knn.score(X, target)
preds_knn = knn.predict(X)
knn_model = grid.best_estimator_ 

print 'Accuracy for KNN model'
print accuracy_score(target, preds_knn)
# print 'KNN Confusion Matrix'
# print metrics.confusion_matrix(target, preds_knn)
rmse_knn = np.sqrt(metrics.mean_squared_error(target, preds_knn)) 
print 'KNN RMSE', rmse_knn

scores = cross_val_score(knn, X, target, cv=cvn, scoring='mean_squared_error')
rmse_knn_cv = np.mean(np.sqrt(-scores))
print 'KNN RMSE CV', rmse_knn_cv

## Logistic Regression

In [None]:
logreg = LogisticRegression(C=1e9)
logreg.fit(X, target)
assorted_pred_class = logreg.predict(X)
assorted_pred_prob = logreg.predict_proba(X)[:, 1]
intercept = logreg.intercept_
logregm = zip(X,logreg.coef_[0]) # examine coeff
preds_logreg = logreg.predict(X)

logodds = logreg.intercept_
odds = np.exp(logodds)
prob = odds/(1 + odds)
prob

print 'Accuracy for Logistic Regression model'
print accuracy_score(target, preds_logreg)
# print 'Logistic Regression Confusion Matrix'
# print metrics.confusion_matrix(target, preds_logreg)
# print(classification_report(target, preds_logreg))
rmse_logreg = np.sqrt(metrics.mean_squared_error(target, preds_logreg)) 
print 'Log Reg RMSE', rmse_logreg

scores = cross_val_score(logreg, X, target, cv=cvn, scoring='mean_squared_error')
rmse_logreg_cv = np.mean(np.sqrt(-scores))
print 'Log Reg RMSE CV', rmse_logreg_cv


## Naive Bayes

In [None]:
# ignoring negative channels for nb. 
ignore_nb = ['hotel_cluster', 'date_time', 'srch_ci', 'srch_co',\
             'booking_clust', 'orig_destination_distance',\
             'mean_latent','median_latent', 'max_latent','min_latent']
X_nb = df_book.drop(ignore_nb, axis = 1) # features

nb = MultinomialNB()
nb.fit(X_nb, target)

# three paramaters you can set: 
# alpha=1.0, 
# class_prior=None, 
# fit_prior=True - estimates the likelihood function - counting the probability for each. 

preds_nb = nb.predict(X_nb)
print metrics.accuracy_score(target, preds_nb)
# print metrics.confusion_matrix(target, preds_nb)

rmse_nb = np.sqrt(metrics.mean_squared_error(target, preds_nb)) 
print 'RMSE for NB', rmse_nb

scores = cross_val_score(nb, X_nb, target, cv=cvn, scoring='mean_squared_error')
rmse_nb_cv = np.mean(np.sqrt(-scores))
print 'CV RMSE for NB', rmse_nb_cv

# probs_nb = nb.predict_proba(X)[:, 1]
# probs_nb
# print metrics.roc_auc_score(target, probs_nb)

In [None]:
# Trying differnt values for Alpha 
# three paramaters you can set: 
# alpha=1.0, 
# class_prior=None, 
# fit_prior=True - estimates the likelihood function - counting the probability for each. 
MSE_scores = []
RMSE_scores = []
alpha_range = range(1,30,2)
for a in alpha_range:
    nb = MultinomialNB(alpha = a)
    nb.fit(X_nb, target)
    MSE_scores = cross_val_score(nb, X_nb, target, cv=cvn, scoring='mean_squared_error')
    RMSE_scores.append(np.mean(np.sqrt(-MSE_scores)))

plt.plot(alpha_range, RMSE_scores)
plt.xlabel('Alpha')
plt.ylabel('RMSE (lower is better)')

feature_scores = sorted(zip(RMSE_scores, alpha_range))[0]
best_alpha = feature_scores[1]

print feature_scores
print 'Best Alpha to use', best_alpha

In [None]:
# Trying differnt values for Alpha, fit_prior = false
# three paramaters you can set: 
# alpha=1.0, 
# class_prior=None, 
# fit_prior=True - estimates the likelihood function - counting the probability for each. 
MSE_scores = []
RMSE_scores = []
alpha_range = range(1,30,2)
for a in alpha_range:
    nb = MultinomialNB(alpha = a, fit_prior = False)
    nb.fit(X_nb, target)
    MSE_scores = cross_val_score(nb, X_nb, target, cv=cvn, scoring='mean_squared_error')
    RMSE_scores.append(np.mean(np.sqrt(-MSE_scores)))

plt.plot(alpha_range, RMSE_scores)
plt.xlabel('Alpha')
plt.ylabel('RMSE (lower is better)')

feature_scores = sorted(zip(RMSE_scores, alpha_range))[0]
best_alpha = feature_scores[1]

print feature_scores
print 'Best Alpha to use', best_alpha

## Decision Tree

In [None]:
# Decision Tree
# TODO grid search for depth and

treeclf = DecisionTreeClassifier(max_depth=3, random_state=1)
treeclf.fit(X, target)
preds_dtree = treeclf.predict(X)
predic_proba = treeclf.predict_proba(X)
sc = treeclf.score(X,target)

# export_graphviz(treeclf, out_file='tree_titanic.dot', feature_names=features)
# At the command line, run this to convert to PNG:
#   dot -Tpng tree_titanic.dot -o tree_titanic.png

# compute the feature importances
# importaces are on a scale on 0 to 1
pd.DataFrame({'feature':features, 'importance':treeclf.feature_importances_})

print 'Mean Accuracy on Test Data and Labels', sc
rmse_dtree = np.sqrt(metrics.mean_squared_error(target, preds_dtree))
print 'RMSE DTREE', rmse_dtree

scores = cross_val_score(treeclf, X, target, cv=cvn, scoring='mean_squared_error')
rmse_dtree_cv = np.mean(np.sqrt(-scores))
print 'CV RMSE DTREE', rmse_dtree_cv


In [None]:
# Search For better settings for Decision Tree
# searching for max_depth and min_samples_leaf

# search max_depth
max_depth_range = range(1,30,1)
MSE_scores = []
RMSE_scores= []
for md in max_depth_range: 
    treeclf = DecisionTreeClassifier(max_depth=md, random_state=1)
    treeclf.fit(X, target)
    MSE_scores = cross_val_score(treeclf, X, target, cv=cvn, scoring='mean_squared_error')
    RMSE_scores.append(np.mean(np.sqrt(-MSE_scores)))

plt.plot(max_depth_range, RMSE_scores)
plt.xlabel('Max Depth Allowed')
plt.ylabel('RMSE (lower is better)')

depth_scores = sorted(zip(RMSE_scores, max_depth_range))[0]
best_depth = depth_scores[1]
print 'Best Max Depth Score', depth_scores
print 'Best Max Depth to use', best_depth


# search min_samples_leaf
min_samples_leaf_range = range(1,215,5)
MSE_scores = []
RMSE_scores= []
for msl in min_samples_leaf_range: 
    treeclf = DecisionTreeClassifier(min_samples_leaf=msl, random_state=1)
    treeclf.fit(X, target)
    MSE_scores = cross_val_score(treeclf, X, target, cv=cvn, scoring='mean_squared_error')
    RMSE_scores.append(np.mean(np.sqrt(-MSE_scores)))


plt.plot(min_samples_leaf_range, RMSE_scores)
plt.xlabel('Max Depth Allowed')
plt.ylabel('RMSE (lower is better)')

min_samp_scores = sorted(zip(RMSE_scores, min_samples_leaf_range))[0]
best_min_samp = min_samp_scores[1]
print 'Best Min Samples Score', min_samp_scores
print 'Best Min Samples to use', best_min_samp


# combine best settings discovered in two separate searches 
treeclf = DecisionTreeClassifier(max_depth=best_depth, min_samples_leaf = best_min_samp, random_state=1)
treeclf.fit(X, target)
preds_dtree = treeclf.predict(X)
# predic_proba = treeclf.predict_proba(X)
sc = treeclf.score(X,target)

# compute the feature importances
# importaces are on a scale on 0 to 1
pd.DataFrame({'feature':features, 'importance':treeclf.feature_importances_})

print 'Mean Accuracy on Test Data and Labels', sc
rmse_dtree_best_set = np.sqrt(metrics.mean_squared_error(target, preds_dtree))
print 'RMSE DTREE Best Set', rmse_dtree_best_set

scores = cross_val_score(treeclf, X, target, cv=cvn, scoring='mean_squared_error')
rmse_dtree_best_set_cv = np.mean(np.sqrt(-scores))
print 'CV RMSE DTREE Best Set', rmse_dtree_best_set_cv


## RANDOM FOREST

In [None]:
# Random forest: tune n_estimators.
# Each candidate forest size is scored with 5-fold cross-validation and the
# average RMSE across folds is recorded (WARNING: SLOW!).

# candidate values for n_estimators
estimator_range = range(10, 310, 10)

# average RMSE for each candidate, in estimator_range order
RMSE_scores = []

for estimator in estimator_range:
    rfreg = RandomForestClassifier(random_state=1, n_estimators=estimator)
    MSE_scores = cross_val_score(rfreg, X, target, scoring='mean_squared_error', cv=5)
    # the scorer returns negated MSE, so flip the sign before the root
    fold_rmse = np.sqrt(-MSE_scores)
    RMSE_scores.append(fold_rmse.mean())

plt.plot(estimator_range, RMSE_scores)
plt.xlabel('n_estimators')
plt.ylabel('RMSE (lower is better)')

In [None]:
best_n = 175
# Tuning max Features
# list of values to try for max_features
feature_range = range(1, len(features)+1)

# list to store the average RMSE for each value of max_features
RMSE_scores = []

# use 10-fold cross-validation with each value of max_features (WARNING: SLOW!)
for feature in feature_range:
    rfreg = RandomForestClassifier(n_estimators=best_n, max_features=feature, random_state=1)
    MSE_scores = cross_val_score(rfreg, X, target, cv=10, scoring='mean_squared_error')
    RMSE_scores.append(np.mean(np.sqrt(-MSE_scores)))
    
plt.plot(feature_range, RMSE_scores)
plt.xlabel('max_features')
plt.ylabel('RMSE (lower is better)')

feature_scores = sorted(zip(RMSE_scores, feature_range))[0]
best_max_feat = feature_scores[1]

print feature_scores
print 'Best Max Features to use', best_max_feat

In [None]:
# Fitting Random Forest with best parameters
# max_features=8 is best and n_estimators=150 is sufficiently large
rfclass = RandomForestClassifier(n_estimators = best_n, max_features = best_max_feat, oob_score=True, random_state=1)
rfclass.fit(X, target)

# compute feature importances
pd.DataFrame({'feature':features, 'importance':rfclass.feature_importances_}).sort('importance')

# compute the out-of-bag R-squared score
rfclass.oob_score_
preds_rf = rfclass.predict(X)
rmse_rf = np.sqrt(metrics.mean_squared_error(target, preds_rf))
print 'RMSE RF', rmse_rf

scores = cross_val_score(rfclass, X, target, cv=10, scoring='mean_squared_error')
rmse_rf_cv = np.mean(np.sqrt(-scores))
print 'CV RMSE RF', rmse_rf_cv

In [None]:
print 'RESULTS and SCORES SUMMARY'
print 'Null Hypothesis RMSE', rmse_null
print ''
print 'KNN RMSE', rmse_knn
print 'Log Reg RMSE', rmse_logreg
print 'NB RMSE', rmse_nb
print 'DTREE RMSE', rmse_dtree
print 'Random Forest RMSE', rmse_rf
# print 'Random Forest Important Features RMSE', rmse_rf_imp

print ''
print 'CV KNN RMSE', rmse_knn_cv
print 'CV Log Reg RMSE', rmse_logreg_cv
print 'CV NB RMSE', rmse_nb_cv
print 'CV DTREE RMSE', rmse_dtree_cv
print 'CV Random Forest RMSE', rmse_rf_cv
# print 'CV Random Forest Important Features RMSE', rmse_rf_imp_cv

In [None]:
# Final Test Full model With completely unseen test data. 
# combination of best first classification + predicting hotel cluster. 
# df_test = pd.read_csv('../Final_Project/data/test.csv'); 