# Classification

In this notebook, we take a different approach. Rather than trying to predict the price for a specfic listing, we first segment (bin) listings by key price metrics (e.g. price mean). We label each listing with a categorical variable (e.g. "luxury listing") and then we build a classification model (supervised) to see whether if we can predict which class a given listing (hypothetically a new one) is more likely to belong to.

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
import sklearn as sk
from sklearn.linear_model import LinearRegression

In [2]:
master = pd.read_parquet('master.parquet',engine='fastparquet')

NB: we have noticed hosts only set prices for 2-3 months in the future. So, among the basic data preparations, we also discard data that does not fall within this time range.

In [3]:
mask = (master['date'] > '2017-07-01') & (master['date'] <= '2017-10-07')

In [4]:
new = master.loc[mask]

In [5]:
new.price_y.describe()

count    9875.000000
mean      247.469165
std       168.676700
min        35.000000
25%       126.000000
50%       199.000000
75%       300.000000
max       999.000000
Name: price_y, dtype: float64

In [6]:
new = new.reset_index()

In [7]:
new = new.drop(columns='index')

In [8]:
new['price_bin'] = pd.cut(new.price_y, 3, right=True, labels=None, retbins=False, precision=3, include_lowest=False)

In [9]:
new['price_bin'] = new['price_bin'].cat.add_categories([1])

In [10]:
new['price_bin'] = new['price_bin'].fillna(1)

In [11]:
y = new['price_bin']

In [12]:
X = new.drop('price_y', axis = 1)

In [13]:
X = X.select_dtypes(include='float64')

In [14]:
X = X.replace([np.inf, -np.inf], np.nan)

In [15]:
X = X.fillna(X.mean())

In [16]:
from sklearn.base import ClassifierMixin
from sklearn.utils.testing import all_estimators
from sklearn.model_selection import train_test_split

In [17]:
classifiers=[est for est in all_estimators() if issubclass(est[1], ClassifierMixin)]
print(classifiers)



[('AdaBoostClassifier', <class 'sklearn.ensemble.weight_boosting.AdaBoostClassifier'>), ('BaggingClassifier', <class 'sklearn.ensemble.bagging.BaggingClassifier'>), ('BernoulliNB', <class 'sklearn.naive_bayes.BernoulliNB'>), ('CalibratedClassifierCV', <class 'sklearn.calibration.CalibratedClassifierCV'>), ('DecisionTreeClassifier', <class 'sklearn.tree.tree.DecisionTreeClassifier'>), ('ExtraTreeClassifier', <class 'sklearn.tree.tree.ExtraTreeClassifier'>), ('ExtraTreesClassifier', <class 'sklearn.ensemble.forest.ExtraTreesClassifier'>), ('GaussianNB', <class 'sklearn.naive_bayes.GaussianNB'>), ('GaussianProcessClassifier', <class 'sklearn.gaussian_process.gpc.GaussianProcessClassifier'>), ('GradientBoostingClassifier', <class 'sklearn.ensemble.gradient_boosting.GradientBoostingClassifier'>), ('KNeighborsClassifier', <class 'sklearn.neighbors.classification.KNeighborsClassifier'>), ('LabelPropagation', <class 'sklearn.semi_supervised.label_propagation.LabelPropagation'>), ('LabelSpreadi

Trying Decision Tree Classifier:

In [18]:
y = y.astype(str)

In [19]:
#cols = [c for c in new.columns if c not in ['date','price_y']]

In [20]:
#X_dt = X[cols]

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

Using less features to avoid problems - will need to revisit

In [23]:
features = ['review_scores_rating', 'reviews_per_month', 'transitTextWordsPerc', 'bathrooms', 'bedrooms','square_feet']

Simple Decision Tree Classifier

In [24]:
from sklearn import tree
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
DT = DecisionTreeClassifier(random_state=0)

In [25]:
y = y.astype(str)

In [28]:
DT = DT.fit(X_train[features], y_train)

In [29]:
DT.predict(X_test[features])

array(['(34.036, 356.333]', '(34.036, 356.333]', '(34.036, 356.333]', ...,
       '(356.333, 677.667]', '(356.333, 677.667]', '(356.333, 677.667]'], dtype=object)

In [30]:
DT.predict_proba(X_test[features])

array([[ 1.        ,  0.        ,  0.        ,  0.        ],
       [ 1.        ,  0.        ,  0.        ,  0.        ],
       [ 0.88888889,  0.11111111,  0.        ,  0.        ],
       ..., 
       [ 0.        ,  1.        ,  0.        ,  0.        ],
       [ 0.        ,  1.        ,  0.        ,  0.        ],
       [ 0.        ,  1.        ,  0.        ,  0.        ]])

In [31]:
DT.score(X_test[features],y_test)

0.93891491022638562

In [32]:
import graphviz 

In [33]:
dot_data = tree.export_graphviz(DT, out_file=None, 
                         feature_names=X_test[features].columns,  
                         class_names='Price Bin',  
                         filled=True, rounded=True,  
                         special_characters=True) 
graph = graphviz.Source(dot_data) 

In [34]:
#tree is too large - printing to pdf
graph.render("decisiontree") 

'decisiontree.pdf'

Bernoulli Naive Bayes

In [35]:
from sklearn.naive_bayes import BernoulliNB
BNV = BernoulliNB()
BNV.fit(X_train[features], y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [36]:
print(BNV.predict(X_test[features]))

['(34.036, 356.333]' '(34.036, 356.333]' '(34.036, 356.333]' ...,
 '(34.036, 356.333]' '(34.036, 356.333]' '(34.036, 356.333]']


In [37]:
BNV.score(X_test[features],y_test)

0.78103044496487117

In [38]:
BNV.predict_proba(X_test[features])

array([[ 0.7580471 ,  0.16412616,  0.0390194 ,  0.03880734],
       [ 0.7580471 ,  0.16412616,  0.0390194 ,  0.03880734],
       [ 0.7580471 ,  0.16412616,  0.0390194 ,  0.03880734],
       ..., 
       [ 0.7580471 ,  0.16412616,  0.0390194 ,  0.03880734],
       [ 0.7580471 ,  0.16412616,  0.0390194 ,  0.03880734],
       [ 0.7580471 ,  0.16412616,  0.0390194 ,  0.03880734]])

KNN Classifier

In [39]:
from sklearn.neighbors import KNeighborsClassifier

In [40]:
KNN = KNeighborsClassifier(n_neighbors=3)
KNN.fit(X_test[features],y_test)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')

In [41]:
print KNN.predict(X_test[features])

['(34.036, 356.333]' '(34.036, 356.333]' '(34.036, 356.333]' ...,
 '(356.333, 677.667]' '(356.333, 677.667]' '(356.333, 677.667]']


In [42]:
print KNN.predict_proba(X_test[features]) 

[[ 1.          0.          0.          0.        ]
 [ 1.          0.          0.          0.        ]
 [ 1.          0.          0.          0.        ]
 ..., 
 [ 0.          1.          0.          0.        ]
 [ 0.33333333  0.66666667  0.          0.        ]
 [ 0.33333333  0.66666667  0.          0.        ]]


In [45]:
print 'Accuracy Score for KNN: '
KNN.score(X_test[features],y_test)

Accuracy Score for KNN: 


0.93247462919594071

C-Support Vector Classification

In [43]:
from sklearn.svm import SVC

In [46]:
svc = SVC(C=1.0)
svc.fit(X_train[features], y_train) 

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [47]:
svc.predict(X_test[features])

array(['(34.036, 356.333]', '(34.036, 356.333]', '(34.036, 356.333]', ...,
       '(356.333, 677.667]', '(34.036, 356.333]', '(34.036, 356.333]'], dtype=object)

In [48]:
print 'Accuracy for SVC with C=1: '
svc.score(X_test[features],y_test)

Accuracy for SVC with C=1: 


0.83879781420765032

Gains in Accuracy with greater C

In [49]:
for i in [1,2,3,4]:
    svcs = SVC(C=i)
    svcs.fit(X_train[features], y_train)
    print 'Accuracy for SVC with C=', str(i)
    print svcs.score(X_test[features],y_test)

Accuracy for SVC with C= 1
0.838797814208
Accuracy for SVC with C= 2
0.843286494926
Accuracy for SVC with C= 3
0.844067135051
Accuracy for SVC with C= 4
0.849726775956


In [None]:
"""x1train = X_train[['square_feet','review_scores_rating']]
x1test = X_test[['square_feet','review_scores_rating']]"""

In [None]:
"""h = 5
x_min, x_max = x1train[:].min() - .5, x1train[:].max() + .5
y_min, y_max = x1train[:].min() - .5, x1train[:].max() + .5"""

In [None]:
"""len(X_train)"""

In [None]:
"""#ill need to make this work
xx, yy = np.meshgrid(np.arange(0, 5123, h), np.arange(0, 5123, h))
Z = KNN.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)
pl.figure(1, figsize=(4, 3))
pl.set_cmap(pl.cm.Paired)
pl.pcolormesh(xx, yy, Z)

# Plot also the training points
pl.scatter(x1train[:], x1train[:],c=Y )
pl.xlabel('Square Feet')
pl.ylabel('Review Scores Rating')

pl.xlim(xx.min(), xx.max())
pl.ylim(yy.min(), yy.max())
pl.xticks(())
pl.yticks(())

pl.show()"""

Multinomial Naive Bayes

Will need to use this for text features (discrete features such as word counts). Applying to continuous numerical features not ideal

In [50]:
from sklearn.naive_bayes import MultinomialNB

In [51]:
MNB = MultinomialNB()
MNB.fit(X_train[features], y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [52]:
MNB.predict_proba(X_test[features])

array([[ 0.80267484,  0.17184394,  0.01299744,  0.01248378],
       [ 0.93669722,  0.05871156,  0.00287844,  0.00171278],
       [ 0.8974144 ,  0.09068234,  0.00825187,  0.00365139],
       ..., 
       [ 0.21418762,  0.50430347,  0.08640213,  0.19510677],
       [ 0.8547345 ,  0.12699049,  0.00902908,  0.00924594],
       [ 0.77277654,  0.19760189,  0.0152181 ,  0.01440346]])

In [53]:
print 'Accuracy Score for Multinomial Naive Bayes: '
MNB.score(X_test[features], y_test)

Accuracy Score for Multinomial Naive Bayes: 


0.80718188914910227

Nearest Centroid

In [54]:
from sklearn.neighbors.nearest_centroid import NearestCentroid

In [56]:
NC = NearestCentroid()
NC.fit(X_train[features], y_train)

NearestCentroid(metric='euclidean', shrink_threshold=None)

In [57]:
print 'Accuracy Score for Nearest Centroid: '
NC.score(X_test[features], y_test)

Accuracy Score for Multinomial Naive Bayes: 


0.17349726775956284

In [62]:
for i in [1,2,3,4,5,6]:
    NC = NearestCentroid(shrink_threshold=i)
    NC.fit(X_train[features], y_train)
    print 'Accuracy for Nearest Centroid with Threshold for Shrinking Centroids to Remove Features=', str(i)
    print NC.score(X_test[features],y_test)

Accuracy for Nearest Centroid with Threshold for Shrinking Centroids to Remove Features= 1
0.302302888368
Accuracy for Nearest Centroid with Threshold for Shrinking Centroids to Remove Features= 2
0.559914129586
Accuracy for Nearest Centroid with Threshold for Shrinking Centroids to Remove Features= 3
0.693013270882
Accuracy for Nearest Centroid with Threshold for Shrinking Centroids to Remove Features= 4
0.716237314598
Accuracy for Nearest Centroid with Threshold for Shrinking Centroids to Remove Features= 5
0.642661982826
Accuracy for Nearest Centroid with Threshold for Shrinking Centroids to Remove Features= 6
0.629195940671
