In [6]:
import json

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [25]:
# Read the labelled training listings and the unlabelled test listings
# from their JSON dumps (paths are relative to the notebook's working dir).
train, test = (
    pd.read_json(json_path)
    for json_path in ('Data/train.json', 'Data/test.json')
)

In [150]:
# Preview the first rows of the training frame.
# NOTE(review): this cell's execution count (150) is higher than the cell that
# adds the amenity-flag columns (149), and the saved output already shows those
# columns. On a fresh top-to-bottom run they will not be present yet here.
train.head()

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,interest_level,latitude,listing_id,...,elevator,hardwood_floors,cats_allowed,dogs_allowed,doorman,dishwasher,laundry_in_building,no_fee,fitness_center,laundry_in_unit
10,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],medium,40.7145,7211212,...,False,False,False,False,False,False,False,False,False,False
10000,1.0,2,c5c8a357cba207596b04d1afd1e4f130,2016-06-12 12:19:27,,Columbus Avenue,"[Doorman, Elevator, Fitness Center, Cats Allow...",low,40.7947,7150865,...,True,False,True,True,True,False,False,False,True,False
100004,1.0,1,c3ba40552e2120b0acfc3cb5730bb2aa,2016-04-17 03:26:41,"Top Top West Village location, beautiful Pre-w...",W 13 Street,"[Laundry In Building, Dishwasher, Hardwood Flo...",high,40.7388,6887163,...,False,True,False,False,False,True,True,False,False,False
100007,1.0,1,28d9ad350afeaab8027513a3e52ac8d5,2016-04-18 02:22:02,Building Amenities - Garage - Garden - fitness...,East 49th Street,"[Hardwood Floors, No Fee]",low,40.7539,6888711,...,False,True,False,False,False,False,False,True,False,False
100013,1.0,4,0,2016-04-28 01:32:41,Beautifully renovated 3 bedroom flex 4 bedroom...,West 143rd Street,[Pre-War],low,40.8241,6934781,...,False,False,False,False,False,False,False,False,False,False


In [148]:
import itertools
from collections import Counter

# Lower-case every tag so that counting ignores capitalisation differences.
feature_list = [[tag.lower() for tag in tags] for tags in train['features'].values]

# Tally how many listings mention each tag across the whole training set.
totals = Counter(itertools.chain.from_iterable(feature_list))

# Keep the n_features most frequent tags; column names use underscores
# instead of spaces.
n_features = 10
top_features = totals.most_common(n_features)
print(top_features)
features = [tag for tag, count in top_features]
feature_names = [tag.replace(' ', '_') for tag in features]

[(u'elevator', 26273), (u'hardwood floors', 23558), (u'cats allowed', 23540), (u'dogs allowed', 22035), (u'doorman', 20967), (u'dishwasher', 20806), (u'laundry in building', 18944), (u'no fee', 18079), (u'fitness center', 13257), (u'laundry in unit', 9435)]


In [149]:
def add_features(df, labels=None, columns=None):
    """Add one boolean flag column per feature label to ``df``.

    Each row of ``df['features']`` is expected to be an iterable of strings.
    The strings are lower-cased, and for every ``(label, column)`` pair a
    column named ``column`` is written that is True where ``label`` appears
    among that row's lower-cased strings.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'features'`` column. It is modified in place.
    labels : sequence of str, optional
        Lower-case labels to look for. Defaults to the notebook-level
        ``features`` list.
    columns : sequence of str, optional
        Column name to write for each label, in the same order. Defaults to
        the notebook-level ``feature_names`` list.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the flag columns added or overwritten.

    Raises
    ------
    ValueError
        If ``labels`` and ``columns`` have different lengths.
    """
    # Fall back to the notebook globals only when the caller gave nothing,
    # so existing ``add_features(df)`` calls behave as before.
    if labels is None:
        labels = features
    if columns is None:
        columns = feature_names
    if len(labels) != len(columns):
        raise ValueError('labels and columns must have the same length')

    # Normalise each row once; a set makes every membership test O(1).
    row_tags = [set(tag.lower() for tag in tags) for tags in df['features'].values]

    for label, column in zip(labels, columns):
        df[column] = [label in tags for tags in row_tags]
    return df

# Attach the flag columns to both data sets (training frame first).
train, test = add_features(train), add_features(test)

In [152]:
# Columns fed to the model: three listing columns followed by every column
# named in feature_names.
independent = ['bathrooms', 'bedrooms', 'price'] + feature_names

In [153]:
# Mean of every model input within each interest level.
summary_columns = independent + ['interest_level']
interest_averages = train[summary_columns].groupby('interest_level').mean()
print(interest_averages)

                bathrooms  bedrooms        price  elevator  hardwood_floors  \
interest_level                                                                
high             1.116176  1.546496  2700.293045  0.471737         0.518364   
low              1.238741  1.514759  4176.599142  0.533660         0.430901   
medium           1.163906  1.622050  3158.767388  0.549025         0.605130   

                cats_allowed  dogs_allowed   doorman  dishwasher  \
interest_level                                                     
high                0.417036      0.378484  0.289138    0.396718   
low                 0.497812      0.469111  0.447847    0.396482   
medium              0.433877      0.400659  0.400926    0.506546   

                laundry_in_building    no_fee  fitness_center  laundry_in_unit  
interest_level                                                                  
high                       0.396197  0.460797        0.190675         0.170617  
low                 

In [154]:
from sklearn.ensemble import RandomForestClassifier

In [155]:
# Fit a random forest on the columns listed in `independent`.
# random_state fixes the forest's internal randomness so that re-running the
# notebook yields the same model, the same accuracy and the same submission.
model = RandomForestClassifier(random_state=42)
model.fit(train[independent], train['interest_level'])

# NOTE: these predictions are made on the very rows the model was fitted on,
# so anything computed from `preds` is an in-sample figure.
preds = model.predict(train[independent])
probs = model.predict_proba(train[independent])

In [158]:
# Show the first few true labels above the matching predictions.
n_preview = 10
print(train['interest_level'].values[:n_preview])
print(preds[:n_preview])

[u'medium' u'low' u'high' u'low' u'low' u'medium' u'low' u'low' u'medium'
 u'low']
[u'medium' u'low' u'low' u'low' u'low' u'low' u'low' u'low' u'medium'
 u'low']


In [165]:
# Confusion table as a percentage of all rows (rows: predicted, columns: actual),
# followed by the share of rows where the prediction equals the label.
actual = train['interest_level'].values
n_rows = len(preds)

ct = 100 * pd.crosstab(preds, actual, margins=True) / n_rows
print(ct)

n_correct = np.sum(preds == actual)
accuracy = float(n_correct) / n_rows
print('Total Accuracy: ' + str(accuracy))

col_0       high        low     medium         All
row_0                                             
high    4.670530   0.581537   0.836846    6.088912
low     1.854028  65.934511   7.205382   74.993921
medium  1.254255   2.952261  14.710650   18.917166
All     7.778813  69.468309  22.752877  100.000000
Total Accuracy: 0.8531569136


In [169]:
# Class probabilities for the test listings, one column per class label.
listing_ids = test[['listing_id']].reset_index(drop=True)
probs = model.predict_proba(test[independent])
prob_frame = pd.DataFrame(probs, columns=model.classes_)

submission = pd.concat([listing_ids, prob_frame], axis=1)
submission = submission[['listing_id', 'high', 'medium', 'low']]

# Bound every probability to [1 - accuracy, accuracy]; the upper bound is
# applied before the lower bound for each column.
upper_bound = accuracy
lower_bound = 1 - accuracy
for label in ('high', 'medium', 'low'):
    submission.loc[submission[label] > upper_bound, label] = upper_bound
    submission.loc[submission[label] < lower_bound, label] = lower_bound

print(submission.head())
submission.to_csv('Submissions/submission2.csv', index=False)

   listing_id      high    medium       low
0     7142618  0.146843  0.800000  0.200000
1     7210040  0.146843  0.146843  0.853157
2     7103890  0.146843  0.146843  0.853157
3     7143442  0.146843  0.308333  0.691667
4     6860601  0.146843  0.146843  0.853157
