In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, accuracy_score
import xgboost as xgb

# pull feature engineering notebook
# add your own feature engineering functions to features.py and import them
# only rule is the function must take a dataframe and return a dataframe (with your new features)
from scripts.features import *
from scripts.utils import *

color = sns.color_palette()
%matplotlib inline

In [3]:
df_train = pd.read_json("../data/drace_train.json")
df_test = pd.read_json("../data/drace_test.json")
# df_train_raw = pd.read_json("../data/train.json")
# df_test_raw = pd.read_json("../data/test.json")

In [24]:
df_train.drop([i for i in df_train.columns if i not in df_test.columns if i not in 'interest_level'], 
              axis = 1, 
              inplace=True)

In [33]:
df_train.shape

(49338, 65)

In [11]:
df_test.shape

(74659, 64)

### PRE-PROCESSING

In [None]:
%%timeit
scrub_and_engineer = [
                    scrub,
                    basic_numeric_features,
                    n_log_price,
                    n_expensive,
                    count_caps,
                    scrub_features,
                    dist_to_nearest_tube,
                    dist_to_nearest_college,
                    add_neighbor_features_72,
                      ]

for func in scrub_and_engineer:
    try:
        df_train = func(df_train_raw)
        df_test = func(df_test_raw)
    except Exception as e:
        print e
        continue

In [34]:
exclude = ['price', 
           'manager_skill', 
           'manager_skill_bool', 
           'price_vs_median_72',
           'building_id',
           'manager_id',
           'listing_id']
feats_to_train = [x for x in df_train.columns.tolist() if df_train[x].dtype not in ['O', '<M8[ns]'] and x not in exclude]
feats_to_train

[u'0_per_72',
 u'100_per_72',
 u'10_per_72',
 u'20_per_72',
 u'30_per_72',
 u'40_per_72',
 u'50_per_72',
 u'60_per_72',
 u'70_per_72',
 u'80_per_72',
 u'90_per_72',
 u'BB_ratio',
 u'Price_P_Room',
 u'allow_pets',
 u'amount_of_caps',
 u'available',
 u'bathroom_listed',
 u'bathrooms',
 u'bedrooms',
 u'buzzword',
 u'created',
 u'created_day',
 u'created_hour',
 u'created_month',
 u'created_year',
 u'dishwash',
 u'dist_to_nearest_college',
 u'dist_to_nearest_tube',
 u'distance_from_midtown',
 u'doorman',
 u'fitness',
 u'furnished',
 u'hardwood',
 u'has_phone',
 u'is_studio',
 u'large_space',
 u'latitude',
 u'laundry',
 u'longitude',
 u'luxurious',
 u'mean_72',
 u'median_72',
 u'n_log_price',
 u'n_num_keyfeat_score',
 u'nofee',
 u'num_description_words',
 u'num_features',
 u'num_photos',
 u'preWar',
 u'price_vs_median_72_new',
 u'quiet_nei',
 u'space_desc',
 u'subway',
 u'weekday_created']

### Hyperparameters

In [28]:
# Cross val
test_size=0.20

# Random Forest
n_estimators=1000

### Baseline training

In [29]:
base_features = ["bathrooms", "bedrooms", "latitude", "longitude", "price",
             "num_photos", "num_features", "num_description_words",
             "created_year", "created_month", "created_day"]
X = df_train[base_features]
y = df_train["interest_level"]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=test_size)

clf = RandomForestClassifier(n_estimators=n_estimators)
clf.fit(X_train, y_train)
y_val_pred = clf.predict_proba(X_val)
baseline_logloss = log_loss(y_val, y_val_pred)

In [35]:
baseline_logloss

0.64919408743315676

### Training the new model

In [36]:
# filter out any object/string + timestamp variables and train the random forest on numerical columns
# feats_to_train = [x for x in df_train.columns.tolist() if df_train[x].dtype not in ['O', '<M8[ns]']] 
X = df_train[feats_to_train]
y = df_train["interest_level"]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=test_size)

test_clf = RandomForestClassifier(
    n_estimators=n_estimators,
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1)
test_clf.fit(X_train, y_train)
y_val_pred = test_clf.predict_proba(X_val)
test_logloss = log_loss(y_val, y_val_pred)

In [37]:
test_logloss

0.60853896264808638

### Testing for raw improvement

In [None]:
if baseline_logloss - test_logloss > 0:
    print "Model improved, save and submit"
else:
    print "Use baseline model, did not improve"

In [None]:
# test RF accuracy
accuracy_score(test_clf.predict(X_val), y_val)

In [None]:
# test RF feature importance
plot_feature_importance(test_clf, feats_to_train)

In [None]:
# baseline RF accuracy
accuracy_score(clf.predict(X_val), y_val)

In [None]:
# baseline RF feature importance
plot_feature_importance(clf, base_features)

### Send model to pickle

In [38]:
# Save a dictionary into a pickle file.
import pickle
pickle.dump( test_clf, open( "pickles/rf-Tom.p", "wb" ) )

### Clean up and output a submission file

In [None]:
sub = output(df_test, clf, feats_to_train)

In [None]:
sub.to_csv("rf-Tom.csv", index=False)

In [None]:
# submission = pd.read_csv('submission_rf.csv')
# len(submission)