In [1]:
# Common modules
import numpy as np
import pandas as pd
import os
from matplotlib import pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, auc, roc_curve

# Decision tree specific modules
from sklearn import tree

In [126]:
BASE_PATH = os.path.join(os.getcwd(), os.pardir)
DATA_PATH = os.path.join(BASE_PATH, 'data')

In [150]:
train_df = pd.read_json(os.path.join(BASE_PATH, '01-milestone1', 'imputed_train.json'))
X_test =  pd.read_json(os.path.join(DATA_PATH, 'test.json.zip'))

In [151]:
X = train_df.drop(columns=['interest_level'])
y = train_df['interest_level']

Some features cause difficulty with the Decision Tree model in scikit-learn. Let's temporarily remove these.

In [152]:
X_test.head()

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,latitude,listing_id,longitude,manager_id,photos,price,street_address
0,1.0,1,79780be1514f645d7e6be99a3de696c5,2016-06-11 05:29:41,Large with awesome terrace--accessible via bed...,Suffolk Street,"[Elevator, Laundry in Building, Laundry in Uni...",40.7185,7142618,-73.9865,b1b1852c416d78d7765d746cb1b8921f,[https://photos.renthop.com/2/7142618_1c45a2c8...,2950,99 Suffolk Street
1,1.0,2,0,2016-06-24 06:36:34,Prime Soho - between Bleecker and Houston - Ne...,Thompson Street,"[Pre-War, Dogs Allowed, Cats Allowed]",40.7278,7210040,-74.0,d0b5648017832b2427eeb9956d966a14,[https://photos.renthop.com/2/7210040_d824cc71...,2850,176 Thompson Street
2,1.0,0,0,2016-06-17 01:23:39,Spacious studio in Prime Location. Cleanbuildi...,Sullivan Street,"[Pre-War, Dogs Allowed, Cats Allowed]",40.726,7174566,-74.0026,e6472c7237327dd3903b3d6f6a94515a,[https://photos.renthop.com/2/7174566_ba3a35c5...,2295,115 Sullivan Street
3,1.0,2,f9c826104b91d868e69bd25746448c0c,2016-06-21 05:06:02,For immediate access call Bryan.<br /><br />Bo...,Jones Street,"[Hardwood Floors, Dogs Allowed, Cats Allowed]",40.7321,7191391,-74.0028,41735645e0f8f13993c42894023f8e58,[https://photos.renthop.com/2/7191391_8c2f2d49...,2900,23 Jones Street
5,1.0,1,81062936e12ee5fa6cd2b965698e17d5,2016-06-16 07:24:27,Beautiful TRUE 1 bedroom in a luxury building ...,Exchange Place,"[Roof Deck, Doorman, Elevator, Fitness Center,...",40.7054,7171695,-74.0095,a742cf7dd3b2627d83417bc3a1b3ec96,[https://photos.renthop.com/2/7171695_089ffee2...,3254,20 Exchange Place


In [155]:
# For submission purposes, we need to store the listing_id of X_test
X_test_listing_ids = X_test.listing_id

In [154]:
dropped_cols = ['building_id', 'manager_id', 'description', 'created', 'display_address', 'features', 
                    'street_address', 'listing_id', 'photos']

In [133]:
X = X.drop(columns=dropped_cols)

In [137]:
X_test = X_test.drop(columns=dropped_cols)

## Training the model

We first instantiate a decision tree model and train it naively using all the features.

In [7]:
X.head()

Unnamed: 0,bathrooms,bedrooms,latitude,longitude,price
10,1.5,3,40.7145,-73.9425,3000
10000,1.0,2,40.7947,-73.9667,5465
100004,1.0,1,40.7388,-74.0018,2850
100007,1.0,1,40.7539,-73.9677,3275
100013,1.0,4,40.8241,-73.9493,3350


In [8]:
# Convert labels from {low, medium high} -> {0, 1, 2}
le = preprocessing.LabelEncoder()
y = le.fit_transform(y)

## Train-test split

We need to split the training data into a training set and a validation set (to evaluate the performance of the model).

For now we do 5-fold cross_validation.

In [12]:
seed = 36201431
from sklearn.model_selection import cross_val_score, cross_validate

In [48]:
criterions = ['gini', 'entropy']
for criterion in criterions:
    model = tree.DecisionTreeClassifier(criterion=criterion)
    scores = cross_validate(model, X, y, scoring='neg_log_loss', cv=5)
    # Convert negative log loss to log loss
    test_scores = -1 * scores['test_score']
    print("Log loss score using {0} as the criterion:".format(criterion))
    print(test_scores.mean().round(4))

Log loss score using gini as the criterion:
9.6802
Log loss score using entropy as the criterion:
9.7472


Now we can try training on the entire train set. We will obtain a model, train it on the test dataset, then submit to kaggle for evaluation.

In [138]:
model = tree.DecisionTreeClassifier(criterion='gini')
model.fit(X, y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [139]:
y_pred = model.predict_proba(X_test)

In [140]:
y_pred

array([[0.        , 0.        , 1.        ],
       [1.        , 0.        , 0.        ],
       [0.        , 0.83333333, 0.16666667],
       ...,
       [0.        , 1.        , 0.        ],
       [0.        , 0.        , 1.        ],
       [0.125     , 0.75      , 0.125     ]])

In [68]:
le.classes_

array(['high', 'low', 'medium'], dtype=object)

Since the order is not correct (low corresponds to the label 1, but in the csv they expect the low to be the third column), we must swap columns for y_pred.

In [160]:
def create_submission_csv(X_test, y_pred):
    df = pd.DataFrame(y_pred, columns=le.classes_)
    df.index = X_test_listing_ids
    df.index.name = 'listing_id'
    return df

In [165]:
output = create_submission_csv(X_test, y_pred)
output.to_csv('decision_tree_predictions_1.csv')

# Score:
9.18820

Some additional things to try:

- entropy for information gain
- limiting max_depth to like 30 or so
- class_weight = balanced
- ccp_alpha = ??? 