In [1]:
import json

import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

%matplotlib inline
sns.set(style="whitegrid")

import rental_utils; reload(rental_utils)

<module 'rental_utils' from 'rental_utils.pyc'>

In [2]:
train_raw = pd.read_json('Data/train.json')
test_raw = pd.read_json('Data/test.json')

In [3]:
print(train_raw.shape)
print(test_raw.shape)

(49352, 15)
(74659, 14)


In [4]:
train_raw.head()

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,interest_level,latitude,listing_id,longitude,manager_id,photos,price,street_address
10,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],medium,40.7145,7211212,-73.9425,5ba989232d0489da1b5f2c45f6688adc,[https://photos.renthop.com/2/7211212_1ed4542e...,3000,792 Metropolitan Avenue
10000,1.0,2,c5c8a357cba207596b04d1afd1e4f130,2016-06-12 12:19:27,,Columbus Avenue,"[Doorman, Elevator, Fitness Center, Cats Allow...",low,40.7947,7150865,-73.9667,7533621a882f71e25173b27e3139d83d,[https://photos.renthop.com/2/7150865_be3306c5...,5465,808 Columbus Avenue
100004,1.0,1,c3ba40552e2120b0acfc3cb5730bb2aa,2016-04-17 03:26:41,"Top Top West Village location, beautiful Pre-w...",W 13 Street,"[Laundry In Building, Dishwasher, Hardwood Flo...",high,40.7388,6887163,-74.0018,d9039c43983f6e564b1482b273bd7b01,[https://photos.renthop.com/2/6887163_de85c427...,2850,241 W 13 Street
100007,1.0,1,28d9ad350afeaab8027513a3e52ac8d5,2016-04-18 02:22:02,Building Amenities - Garage - Garden - fitness...,East 49th Street,"[Hardwood Floors, No Fee]",low,40.7539,6888711,-73.9677,1067e078446a7897d2da493d2f741316,[https://photos.renthop.com/2/6888711_6e660cee...,3275,333 East 49th Street
100013,1.0,4,0,2016-04-28 01:32:41,Beautifully renovated 3 bedroom flex 4 bedroom...,West 143rd Street,[Pre-War],low,40.8241,6934781,-73.9493,98e13ad4b495b9613cef886d79a6291f,[https://photos.renthop.com/2/6934781_1fa4b41a...,3350,500 West 143rd Street


In [5]:
[features, feature_names] = rental_utils.get_features(50, train_raw)

train = rental_utils.add_features(train_raw, features, feature_names)
test = rental_utils.add_features(test_raw, features, feature_names)

train = rental_utils.add_region(train)
test = rental_utils.add_region(test)

train = rental_utils.add_variables(train, train)
test = rental_utils.add_variables(test, train)

In [6]:
dv_county = rental_utils.vectorizer('County', train)
train = rental_utils.one_hot_encode(dv_county, train, 'County')
test = rental_utils.one_hot_encode(dv_county, test, 'County')

dv_name = rental_utils.vectorizer('Name', train)
train = rental_utils.one_hot_encode(dv_name, train, 'Name')
test = rental_utils.one_hot_encode(dv_name, test, 'Name')

In [7]:
dv_region = rental_utils.vectorizer('RegionID', train)
train = rental_utils.one_hot_encode(dv_region, train, 'RegionID')
test = rental_utils.one_hot_encode(dv_region, test, 'RegionID')

In [8]:
independent = (['bathrooms', 'bedrooms', 'price'] + feature_names + 
    ['description_length', 'n_features', 'n_photos', 'month'] +
    [x for x in train.columns.values if 'County' in x] +
    [x for x in train.columns.values if 'Name' in x] +
    [x for x in train.columns.values if 'Region' in x]
    )
print(independent)

['bathrooms', 'bedrooms', 'price', u'elevator', u'hardwood_floors', u'cats_allowed', u'dogs_allowed', u'doorman', u'dishwasher', u'laundry_in_building', u'no_fee', u'fitness_center', u'laundry_in_unit', u'pre-war', u'roof_deck', u'outdoor_space', u'dining_room', u'high_speed_internet', u'balcony', u'swimming_pool', u'new_construction', u'terrace', u'exclusive', u'loft', u'garden/patio', u'wheelchair_access', u'prewar', u'common_outdoor_space', u'hardwood', u'fireplace', u'simplex', u'lowrise', u'garage', u'laundry_room', u'reduced_fee', u'furnished', u'multi-level', u'high_ceilings', u'private_outdoor_space', u'publicoutdoor', u'parking_space', u'roof-deck', u'live_in_super', u'renovated', u'pool', u'on-site_laundry', u'laundry', u'green_building', u'storage', u'high_ceiling', u'washer_in_unit', u'dryer_in_unit', u'stainless_steel_appliances', 'description_length', 'n_features', 'n_photos', 'month', u'County=Bronx', u'County=Kings', u'County=New York', u'County=None', u'County=Queens',

In [9]:
model = RandomForestClassifier()
model.fit(train[independent], train['interest_level'])
preds = model.predict(train[independent])
probs = model.predict_proba(train[independent])

In [10]:
ct = pd.crosstab(preds, train['interest_level'].values, margins=False)
print(ct)

print('')
ct_perc = ct.apply(lambda x: x/sum(x), axis=1)
print(ct_perc)

print('')
accuracy = float(np.sum(preds==train['interest_level'].values))/len(preds)
print('Total Accuracy: ' + str(accuracy))

print('')
high_accuracy = ct_perc.iloc[0,0]
print('High Accuracy: ' + str(high_accuracy))
med_accuracy = ct_perc.iloc[1,1]
print('Medium Accuracy: ' + str(med_accuracy))
low_accuracy = ct_perc.iloc[2,2]
print('Low Accuracy: ' + str(low_accuracy))


col_0   high    low  medium
row_0                      
high    3633     33     119
low       82  33985     904
medium   124    266   10206

col_0       high       low    medium
row_0                               
high    0.959841  0.008719  0.031440
low     0.002345  0.971805  0.025850
medium  0.011703  0.025104  0.963194

Total Accuracy: 0.969038742098

High Accuracy: 0.959841479524
Medium Accuracy: 0.971805210031
Low Accuracy: 0.963193657984


In [11]:
submission = test[['listing_id']]
probs = model.predict_proba(test[independent])
submission = pd.concat([submission.reset_index(drop=True), pd.DataFrame(probs, columns=model.classes_)], axis=1)
submission = submission[['listing_id', 'high', 'medium', 'low']]

submission.loc[submission['high'] > high_accuracy, 'high'] = high_accuracy
submission.loc[submission['high'] < 1 - high_accuracy, 'high'] = 1 - high_accuracy
submission.loc[submission['medium'] > med_accuracy, 'medium'] = med_accuracy
submission.loc[submission['medium'] < 1 - med_accuracy, 'medium'] = 1 - med_accuracy
submission.loc[submission['low'] > low_accuracy, 'low'] = low_accuracy
submission.loc[submission['low'] < 1 - low_accuracy, 'low'] = 1 - low_accuracy

print(submission.head())

timestamp = str(datetime.datetime.now())[:16]
submission_name = 'Submissions/submission ' + timestamp + '.csv'
submission_name = submission_name.replace(' ', '_').replace(':','')
submission.to_csv(submission_name, index=False)

   listing_id      high    medium       low
0     7142618  0.040159  0.500000  0.500000
1     7210040  0.040159  0.200000  0.800000
2     7174566  0.040159  0.028195  0.963194
3     7191391  0.040159  0.400000  0.600000
4     7171695  0.040159  0.200000  0.800000
