In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn import model_selection, preprocessing, ensemble
import xgboost as xgb

# Pull in the shared feature-engineering helpers.
# Add your own feature-engineering functions to scripts/features.py and import them here.
# The only rule: each function must take a DataFrame and return a DataFrame (with your new features added).
from scripts.features import *
from scripts.utils import *

# Seaborn's current colour palette; not referenced by any cell visible in this notebook.
color = sns.color_palette()
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [13]:
# Load the train/test frames from the drace_* JSON files
# (presumably already feature-engineered -- confirm with whoever produced them).
train_path, test_path = "../data/drace_train.json", "../data/drace_test.json"
df_train, df_test = pd.read_json(train_path), pd.read_json(test_path)

# df_train_raw = pd.read_json("../data/train.json")
# df_test_raw = pd.read_json("../data/test.json")

In [14]:
# Keep only the columns the test set also has, plus the target column.
# The target is compared with `!=` on purpose: `col not in 'interest_level'`
# is a *substring* test on that string, so a train-only column whose name is
# a substring of it (e.g. 'level') would silently be kept, and a non-string
# column label would raise TypeError.
train_only_cols = [col for col in df_train.columns
                   if col not in df_test.columns and col != 'interest_level']
# Reassign rather than mutate in place so the cell has no hidden side effects
# on other references to the original frame.
df_train = df_train.drop(train_only_cols, axis=1)

In [4]:
# Sanity check: (rows, columns) of the training frame.
df_train.shape

(49338, 65)

In [5]:
# Sanity check: (rows, columns) of the test frame.
df_test.shape

(74659, 64)

### PRE-PROCESSING

In [15]:
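# # Disabled: feature-engineering pipeline built from the functions in features.py.
# # NOTE(review): as written, every pass calls func on the *raw* frames, so unless the
# # functions mutate their input in place only the last function's output would survive;
# # chain the steps (df = func(df)) if this is ever re-enabled.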
# scrub_and_engineer = [
#                     scrub,
#                     basic_numeric_features,
#                     n_log_price,
#                     n_expensive,
#                     count_caps,
#                     scrub_features,
#                     dist_to_nearest_tube,
#                     dist_to_nearest_college,
#                     add_neighbor_features_72,
#                       ]
# for func in scrub_and_engineer:
#     try:
#         df_train = func(df_train_raw)
#         df_test = func(df_test_raw)
#     except Exception as e:
#         print e
#         continue

In [16]:
# Columns that are never used as model inputs, regardless of their dtype.
exclude = [
    'price',
    'manager_skill',
    'manager_skill_bool',
    'price_vs_median_72',
    'building_id',
    'manager_id',
    'listing_id',
]

# Walk the training columns in order and keep those that are neither
# object ('O') nor datetime64[ns] ('<M8[ns]') typed and are not excluded.
feats_to_train = []
for col in df_train.columns.tolist():
    col_dtype = df_train[col].dtype
    if col_dtype in ['O', '<M8[ns]']:
        continue
    if col in exclude:
        continue
    feats_to_train.append(col)
feats_to_train

[u'0_per_72',
 u'100_per_72',
 u'10_per_72',
 u'20_per_72',
 u'30_per_72',
 u'40_per_72',
 u'50_per_72',
 u'60_per_72',
 u'70_per_72',
 u'80_per_72',
 u'90_per_72',
 u'BB_ratio',
 u'Price_P_Room',
 u'allow_pets',
 u'amount_of_caps',
 u'available',
 u'bathroom_listed',
 u'bathrooms',
 u'bedrooms',
 u'buzzword',
 u'created',
 u'created_day',
 u'created_hour',
 u'created_month',
 u'created_year',
 u'dishwash',
 u'dist_to_nearest_college',
 u'dist_to_nearest_tube',
 u'distance_from_midtown',
 u'doorman',
 u'fitness',
 u'furnished',
 u'hardwood',
 u'has_phone',
 u'is_studio',
 u'large_space',
 u'latitude',
 u'laundry',
 u'longitude',
 u'luxurious',
 u'mean_72',
 u'median_72',
 u'n_log_price',
 u'n_num_keyfeat_score',
 u'nofee',
 u'num_description_words',
 u'num_features',
 u'num_photos',
 u'preWar',
 u'price_vs_median_72_new',
 u'quiet_nei',
 u'space_desc',
 u'subway',
 u'weekday_created']

In [17]:
# Integer class ids for the target label.
num_map = dict(high=0, medium=1, low=2)

# Feature matrices as plain numpy arrays, columns in feats_to_train order.
train_X, test_X = (np.array(frame[feats_to_train]) for frame in (df_train, df_test))

# Encode the target through num_map; an unknown label raises KeyError.
train_y = np.array(df_train['interest_level'].apply(num_map.__getitem__))

### Train XGB

In [23]:
def _predict_at_best_iteration(model, dmatrix):
    """Predict using only the trees up to the early-stopped best iteration.

    Falls back to a plain ``predict`` when the booster carries no
    ``best_iteration`` (i.e. early stopping was not used).
    """
    best_iteration = getattr(model, 'best_iteration', None)
    if best_iteration is None:
        return model.predict(dmatrix)
    try:
        # Newer xgboost API: half-open range of boosting rounds to use.
        return model.predict(dmatrix, iteration_range=(0, best_iteration + 1))
    except TypeError:
        # Older xgboost releases do not accept `iteration_range`; they take
        # a tree count through `ntree_limit` instead.
        ntree_limit = getattr(model, 'best_ntree_limit', best_iteration + 1)
        return model.predict(dmatrix, ntree_limit=ntree_limit)


def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0, num_rounds=10000):
    """Train a 3-class softprob XGBoost model and predict on ``test_X``.

    Parameters
    ----------
    train_X, train_y : training features and integer class labels.
    test_X : features to predict on.
    test_y : optional labels for ``test_X``. When given, ``test_X`` is used as
        a validation set and training stops once its mlogloss has not improved
        for 20 rounds; predictions are then made at the best iteration rather
        than with every tree that was trained.
    feature_names : optional list of column names attached to both DMatrix
        objects. ``None`` keeps xgboost's default names.
    seed_val : value for the xgboost ``seed`` parameter.
    num_rounds : maximum number of boosting rounds.

    Returns
    -------
    (pred_test_y, model) : the predicted class probabilities for ``test_X``
        and the trained booster.
    """
    param = {
        'objective': 'multi:softprob',
        'eta': 0.1,
        'max_depth': 4,
        'silent': 1,
        'num_class': 3,
        'eval_metric': "mlogloss",
        'min_child_weight': 5,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'seed': seed_val
    }

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y, feature_names=feature_names)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y, feature_names=feature_names)
        # The last watchlist entry is the one used for early stopping.
        watchlist = [(xgtrain, 'train'), (xgtest, 'test')]
        model = xgb.train(plst, xgtrain, num_rounds,
                          watchlist, early_stopping_rounds=20)
        # Without this, older xgboost would score with the extra rounds
        # trained after the best iteration.
        pred_test_y = _predict_at_best_iteration(model, xgtest)
    else:
        xgtest = xgb.DMatrix(test_X, feature_names=feature_names)
        model = xgb.train(plst, xgtrain, num_rounds)
        pred_test_y = model.predict(xgtest)

    return pred_test_y, model

In [24]:
# 5-fold shuffled split with a fixed seed. Only the first fold is actually
# evaluated, because the loop exits after a single pass.
cv_scores = []
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2016)
for dev_index, val_index in kf.split(train_X):
    dev_X = train_X[dev_index, :]
    val_X = train_X[val_index, :]
    dev_y = train_y[dev_index]
    val_y = train_y[val_index]
    # Passing val_y makes runXGB early-stop on the validation fold.
    preds, model = runXGB(dev_X, dev_y, val_X, val_y)
    fold_loss = log_loss(val_y, preds)
    cv_scores.append(fold_loss)
    print(cv_scores)
    break

[0]	train-mlogloss:1.0433	test-mlogloss:1.04297
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 20 rounds.
[1]	train-mlogloss:0.994547	test-mlogloss:0.993938
[2]	train-mlogloss:0.953981	test-mlogloss:0.953129
[3]	train-mlogloss:0.91842	test-mlogloss:0.91735
[4]	train-mlogloss:0.88777	test-mlogloss:0.886581
[5]	train-mlogloss:0.860882	test-mlogloss:0.859901
[6]	train-mlogloss:0.837747	test-mlogloss:0.836812
[7]	train-mlogloss:0.817146	test-mlogloss:0.81614
[8]	train-mlogloss:0.799067	test-mlogloss:0.798048
[9]	train-mlogloss:0.78295	test-mlogloss:0.78202
[10]	train-mlogloss:0.768292	test-mlogloss:0.767319
[11]	train-mlogloss:0.75583	test-mlogloss:0.755044
[12]	train-mlogloss:0.744383	test-mlogloss:0.743725
[13]	train-mlogloss:0.733915	test-mlogloss:0.733477
[14]	train-mlogloss:0.724762	test-mlogloss:0.724618
[15]	train-mlogloss:0.716641	test-mlogloss:0.716671
[16]	train-mlogloss:0.70901	test-mlog

### Send model to pickle

In [33]:
# NOTE(review): the saved output for this cell is a float32 probability array, not a
# trained booster, and this cell's execution count (33) is later than the pickle/dump
# cells that follow it (25, 32) -- check what `model` is bound to before persisting it.
model

array([[ 0.02541094,  0.11743679,  0.85715228],
       [ 0.15600571,  0.46461314,  0.37938112],
       [ 0.00597454,  0.05324494,  0.94078052],
       ..., 
       [ 0.01753724,  0.09461581,  0.88784695],
       [ 0.20901863,  0.45418754,  0.33679381],
       [ 0.02020315,  0.1151106 ,  0.86468619]], dtype=float32)

In [25]:
# Serialise the current `model` object to a pickle file.
import pickle

# The `with` block guarantees the file handle is flushed and closed even if
# pickle.dump raises; a bare open(...) passed inline is never closed explicitly.
with open("pickles/XGBoost-Tom.p", "wb") as pickle_file:
    pickle.dump(model, pickle_file)

In [32]:
# Write a text dump of the model to disk via its dump_model method.
model.dump_model('pickles/XGboost.txt')

### Clean up and output a submission file

In [None]:
# Retrain on the full training set for a fixed 400 rounds; no test_y is passed,
# so runXGB trains without a validation set or early stopping.
preds, model = runXGB(train_X, train_y, test_X, num_rounds=400)

# One probability column per class, then the listing id for each test row.
out_df = pd.DataFrame(preds)
out_df.columns = ["high", "medium", "low"]
out_df["listing_id"] = df_test["listing_id"].values

out_df.to_csv("../submissions/xgb-Tom.csv", index=False)