In [1]:
# Common modules
import numpy as np
import pandas as pd
import os
from matplotlib import pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, auc, roc_curve
from sklearn.model_selection import cross_val_score, cross_validate, GridSearchCV
import scipy.sparse as sp
from sklearn.feature_selection import SelectKBest, mutual_info_classif

# knn specific modules
from sklearn.ensemble import RandomForestClassifier

In [2]:
BASE_PATH = os.path.join(os.getcwd(), os.pardir)
DATA_PATH = os.path.join(BASE_PATH, 'data')

In [28]:
train_df = pd.read_json(os.path.join(BASE_PATH, '01-milestone1', 'imputed_train.json'))
test_df = pd.read_json(os.path.join(DATA_PATH, 'test.json.zip'))

In [29]:
X_sparse = sp.load_npz(os.path.join(DATA_PATH, 'training_feats.npz'))

X_test = sp.load_npz(os.path.join(DATA_PATH, 'test_feats.npz'))

In [30]:
# X = train_df.drop(columns=['interest_level'])
y = train_df['interest_level']
y

10        medium
10000        low
100004      high
100007       low
100013       low
           ...  
9999      medium
99991        low
99992        low
99993        low
99994        low
Name: interest_level, Length: 49308, dtype: object

In [31]:
X_test

<74659x35522 sparse matrix of type '<class 'numpy.float64'>'
	with 3939246 stored elements in Compressed Sparse Row format>

In [32]:
X_sparse

<49308x35522 sparse matrix of type '<class 'numpy.float64'>'
	with 2626763 stored elements in Compressed Sparse Row format>

## Numerical features

In [33]:
X_org = train_df.drop(columns=['interest_level', 'building_id', 'created', 'description', 'display_address', 'features', 'manager_id', 'photos','street_address', 'listing_id'])

In [34]:
X_org

Unnamed: 0,bathrooms,bedrooms,latitude,longitude,price
10,1.5,3,40.7145,-73.9425,3000
10000,1.0,2,40.7947,-73.9667,5465
100004,1.0,1,40.7388,-74.0018,2850
100007,1.0,1,40.7539,-73.9677,3275
100013,1.0,4,40.8241,-73.9493,3350
...,...,...,...,...,...
9999,1.0,2,40.7426,-73.9790,3200
99991,1.0,1,40.7102,-74.0163,3950
99992,1.0,1,40.7601,-73.9900,2595
99993,1.0,0,40.7066,-74.0101,3350


In [35]:
test_df_num = test_df.drop(columns=['building_id', 'created', 'description', 'display_address', 'features', 'manager_id', 'photos','street_address','listing_id'])

In [36]:
test_df_num

Unnamed: 0,bathrooms,bedrooms,latitude,longitude,price
0,1.0,1,40.7185,-73.9865,2950
1,1.0,2,40.7278,-74.0000,2850
2,1.0,0,40.7260,-74.0026,2295
3,1.0,2,40.7321,-74.0028,2900
5,1.0,1,40.7054,-74.0095,3254
...,...,...,...,...,...
124003,1.0,1,40.7925,-73.9454,1700
124005,1.0,2,40.7456,-73.9797,4195
124006,1.0,0,40.7416,-73.9829,2400
124007,2.0,2,40.7485,-73.9800,6895


### Preprocessing

In [37]:
# Convert labels from {low, medium high} -> {0, 1, 2}
le = preprocessing.LabelEncoder()
y = le.fit_transform(y)

### Train-test split

We will do 5-fold cross-validation

In [13]:
seed = 36201431

In [107]:
model = RandomForestClassifier(n_estimators=100)

In [108]:
scores = cross_validate(model, X_org, y, cv=5, scoring=['neg_log_loss','accuracy'], return_train_score=True)

In [109]:
scores

{'fit_time': array([4.64845991, 4.79385495, 4.71716094, 4.58241701, 4.43578506]),
 'score_time': array([0.53607202, 0.55963421, 0.61079788, 0.545578  , 0.50530195]),
 'test_neg_log_loss': array([-1.24902489, -1.24716193, -1.36752084, -1.27430131, -1.32006799]),
 'train_neg_log_loss': array([-0.23591144, -0.23914936, -0.23757033, -0.23638905, -0.23617148]),
 'test_accuracy': array([0.68356484, 0.70029403, 0.69590347, 0.69432049, 0.69127789]),
 'train_accuracy': array([0.91813918, 0.91755609, 0.91786239, 0.91832286, 0.91827216])}

In [110]:
scores['test_neg_log_loss'].mean() * -1

1.2916153943156696

In [111]:
model.fit(X_org, y)
# model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [112]:
y_pred = model.predict_proba(test_df_num)

## Improving performance

In [60]:
model = RandomForestClassifier(n_estimators=100,  ccp_alpha=1e-8, min_samples_leaf=5)

In [61]:
scores = cross_validate(model, X_org, y, cv=5, scoring=['neg_log_loss','accuracy'], return_train_score=True)

In [62]:
scores
# test_scores = -1 * scores['test_neg_log_loss']
# train_scores = -1 * scores['train_neg_log_loss']
# test_score = test_scores.mean()
# train_score = train_scores.mean()
# print("Mean log loss test  score: {0}".format(test_score))
# print("Mean log loss train score: {0}".format(train_score))

{'fit_time': array([4.10911107, 4.05192113, 4.14326692, 5.94924092, 4.50490785]),
 'score_time': array([0.37757707, 0.36337304, 0.43988109, 0.68976116, 0.36226726]),
 'test_neg_log_loss': array([-0.66255971, -0.65183408, -0.6594183 , -0.65050443, -0.66383306]),
 'train_neg_log_loss': array([-0.47909528, -0.48264254, -0.4783762 , -0.48221527, -0.47724318]),
 'test_accuracy': array([0.70908538, 0.72196309, 0.71669033, 0.71047561, 0.71453199]),
 'train_accuracy': array([0.7949095 , 0.79204482, 0.79516301, 0.79303876, 0.79567521])}

In [63]:
model.fit(X_org, y)
# model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=1e-08, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [64]:
y_pred = model.predict_proba(test_df_num)

## Categorical & Numerical features

### Preprocessing

In [26]:
# Convert labels from {low, medium high} -> {0, 1, 2}
le = preprocessing.LabelEncoder()
y = le.fit_transform(y)

In [15]:
#feature selection
kbest = SelectKBest(mutual_info_classif, k=100).fit(X_sparse,y)
X_fseltd = kbest.transform(X_sparse) 

In [16]:
X_fseltd

<49308x100 sparse matrix of type '<class 'numpy.float64'>'
	with 1310267 stored elements in Compressed Sparse Row format>

In [17]:
X_test = kbest.transform(X_test)


In [18]:
X_test

<74659x100 sparse matrix of type '<class 'numpy.float64'>'
	with 1979721 stored elements in Compressed Sparse Row format>

### Train-test split

We will do 5-fold cross-validation

In [19]:
seed = 36201431

In [20]:
model = RandomForestClassifier(n_estimators=200, min_samples_leaf=5, ccp_alpha=1e-8)

In [21]:
# scores = cross_validate(model, X_fseltd, y, cv=5, scoring=['neg_log_loss','accuracy'])
scores = cross_validate(model, X_fseltd, y, cv=5, scoring=['neg_log_loss','accuracy'], return_train_score=True)

In [22]:
scores
# test_scores = -1 * scores['test_neg_log_loss']
# train_scores = -1 * scores['train_neg_log_loss']
# test_score = test_scores.mean()
# train_score = train_scores.mean()
# print("Mean log loss test  score: {0}".format(test_score))
# print("Mean log loss train score: {0}".format(train_score))

{'fit_time': array([ 96.22595406,  96.64196396,  96.42534423, 101.17776513,
         97.45826602]),
 'score_time': array([0.9138732 , 0.9485991 , 0.96629572, 1.01070213, 0.97349691]),
 'test_neg_log_loss': array([-0.62605621, -0.61654552, -0.6120547 , -0.61310772, -0.62214949]),
 'train_neg_log_loss': array([-0.43333027, -0.43478267, -0.43461833, -0.4351947 , -0.43345594]),
 'test_accuracy': array([0.72226729, 0.72612046, 0.72449807, 0.72142785, 0.72325322]),
 'train_accuracy': array([0.83896973, 0.83907114, 0.83848806, 0.83854285, 0.83763024])}

In [23]:
model.fit(X_fseltd, y)

RandomForestClassifier(bootstrap=True, ccp_alpha=1e-08, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [24]:
y_pred = model.predict_proba(X_test)

In [38]:
le.classes_

array(['high', 'low', 'medium'], dtype=object)

## Make submission file for kaggle

In [44]:
n = 12

In [46]:
def create_submission_csv(y_pred, X_test_indexes):
    df = pd.DataFrame(y_pred, columns=le.classes_)
    df.index = X_test_indexes
    df.index.name = 'listing_id'
    return df

In [65]:
X_test_indexes = test_df['listing_id']
output = create_submission_csv(y_pred, X_test_indexes)
csv_name = 'rf_predictions_{}.csv'.format(n)
output.to_csv(csv_name)
n += 1