In [1]:
import json

import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import log_loss

%matplotlib inline
sns.set(style="whitegrid")

import rental_utils; reload(rental_utils)

<module 'rental_utils' from 'rental_utils.pyc'>

In [2]:
train_raw = pd.read_json('Data/train.json')
test_raw = pd.read_json('Data/test.json')

In [3]:
print(train_raw.shape)
print(test_raw.shape)

(49352, 15)
(74659, 14)


In [4]:
train_raw.head()

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,interest_level,latitude,listing_id,longitude,manager_id,photos,price,street_address
10,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],medium,40.7145,7211212,-73.9425,5ba989232d0489da1b5f2c45f6688adc,[https://photos.renthop.com/2/7211212_1ed4542e...,3000,792 Metropolitan Avenue
10000,1.0,2,c5c8a357cba207596b04d1afd1e4f130,2016-06-12 12:19:27,,Columbus Avenue,"[Doorman, Elevator, Fitness Center, Cats Allow...",low,40.7947,7150865,-73.9667,7533621a882f71e25173b27e3139d83d,[https://photos.renthop.com/2/7150865_be3306c5...,5465,808 Columbus Avenue
100004,1.0,1,c3ba40552e2120b0acfc3cb5730bb2aa,2016-04-17 03:26:41,"Top Top West Village location, beautiful Pre-w...",W 13 Street,"[Laundry In Building, Dishwasher, Hardwood Flo...",high,40.7388,6887163,-74.0018,d9039c43983f6e564b1482b273bd7b01,[https://photos.renthop.com/2/6887163_de85c427...,2850,241 W 13 Street
100007,1.0,1,28d9ad350afeaab8027513a3e52ac8d5,2016-04-18 02:22:02,Building Amenities - Garage - Garden - fitness...,East 49th Street,"[Hardwood Floors, No Fee]",low,40.7539,6888711,-73.9677,1067e078446a7897d2da493d2f741316,[https://photos.renthop.com/2/6888711_6e660cee...,3275,333 East 49th Street
100013,1.0,4,0,2016-04-28 01:32:41,Beautifully renovated 3 bedroom flex 4 bedroom...,West 143rd Street,[Pre-War],low,40.8241,6934781,-73.9493,98e13ad4b495b9613cef886d79a6291f,[https://photos.renthop.com/2/6934781_1fa4b41a...,3350,500 West 143rd Street


In [14]:
[features, feature_names] = rental_utils.get_features(10, train_raw)

train = rental_utils.add_features(train_raw, features, feature_names)
test = rental_utils.add_features(test_raw, features, feature_names)

train = rental_utils.add_region(train)
test = rental_utils.add_region(test)

train = rental_utils.add_variables(train, train)
test = rental_utils.add_variables(test, train)

In [15]:
dv_county = rental_utils.vectorizer('County', train)
train = rental_utils.one_hot_encode(dv_county, train, 'County')
test = rental_utils.one_hot_encode(dv_county, test, 'County')

dv_name = rental_utils.vectorizer('Name', train)
train = rental_utils.one_hot_encode(dv_name, train, 'Name')
test = rental_utils.one_hot_encode(dv_name, test, 'Name')

dv_region = rental_utils.vectorizer('RegionID', train)
train = rental_utils.one_hot_encode(dv_region, train, 'RegionID')
test = rental_utils.one_hot_encode(dv_region, test, 'RegionID')

In [16]:
independent = (['bathrooms', 'bedrooms', 'price'] + feature_names + 
    ['description_length', 'n_features', 'n_photos', 'month'] +
    [x for x in train.columns.values if 'County' in x] +
    [x for x in train.columns.values if 'Name' in x] +
    [x for x in train.columns.values if 'Region' in x]
    )

In [17]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(train[independent], train['interest_level'], test_size=0.33)

model = RandomForestClassifier(n_estimators=100,
                               max_features=None)
model.fit(X_train, y_train)
probs = model.predict_proba(X_val)

In [18]:
preds_train, probs_train = rental_utils.predict(model, X_train)
preds, probs = rental_utils.predict(model, X_val)

ct = pd.crosstab(preds, y_val.values, margins=False)
print(ct)

print('')
ct_perc = ct.apply(lambda x: x/sum(x), axis=1)
print(ct_perc)

print('')
accuracy_train = float(np.sum(preds_train==y_train))/len(preds_train)
accuracy_val = float(np.sum(preds==y_val))/len(preds)
print('Training Accuracy: ' + str(round(100*accuracy_train, 1)))
print('Validation Accuracy: ' + str(round(100*accuracy_val, 1)))
print('Log loss: ' + str(round(log_loss(y_val, probs), 1)))

print('')
high_accuracy = ct_perc.iloc[0,0]
print('High Accuracy: ' + str(round(100*high_accuracy, 1)))
med_accuracy = ct_perc.iloc[2,2]
print('Medium Accuracy: ' + str(round(100*med_accuracy, 1)))
low_accuracy = ct_perc.iloc[1,1]
print('Low Accuracy: ' + str(round(100*low_accuracy, 1)))


col_0   high    low  medium
row_0                      
high     339    127     256
low      483  10170    2371
medium   477    952    1112

col_0       high       low    medium
row_0                               
high    0.469529  0.175900  0.354571
low     0.037085  0.780866  0.182049
medium  0.187721  0.374656  0.437623

Training Accuracy: 99.0
Validation Accuracy: 71.4
Log loss: 0.8

High Accuracy: 47.0
Medium Accuracy: 43.8
Low Accuracy: 78.1


In [19]:
low_accuracy = 1
med_accuracy = 1
high_accuracy = 1

preds, probs = rental_utils.predict(model, X_val, [low_accuracy, med_accuracy, high_accuracy])
print('New log loss: ' + str(round(log_loss(y_val, probs), 1)))

New log loss: 0.8


In [20]:
submission = test[['listing_id']]
prreds, probs = rental_utils.predict(model, test[independent], [low_accuracy, med_accuracy, high_accuracy])
submission = pd.concat([submission.reset_index(drop=True), pd.DataFrame(probs, columns=model.classes_)], axis=1)
submission = submission[['listing_id', 'high', 'medium', 'low']]

print(submission.head())

timestamp = str(datetime.datetime.now())[:16]
submission_name = 'Submissions/submission ' + timestamp + '.csv'
submission_name = submission_name.replace(' ', '_').replace(':','')
submission.to_csv(submission_name, index=False)

   listing_id  high  medium   low
0     7142618  0.00    0.31  0.69
1     7210040  0.67    0.05  0.28
2     7174566  0.00    0.05  0.95
3     7191391  0.27    0.35  0.38
4     7171695  0.00    0.09  0.91
