In [2]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer
from nltk.tokenize import RegexpTokenizer
from preprocess import preprocess
import matplotlib.pyplot as plt

train_data = pd.read_csv(os.path.join("data", "train.csv"))
test_data = pd.read_csv(os.path.join("data", "test.csv"))
train_data.dropna(inplace=True)
X, y = train_data.drop(["price"], axis=1), train_data["price"]
y = y.to_numpy().astype(int)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, random_state=42)
X_test = test_data

X_train = preprocess(X_train).to_numpy().astype(float)
X_val = preprocess(X_val).to_numpy().astype(float)

In [3]:
from collections import Counter
from pprint import pprint
pprint(Counter(list(train_data["neighbourhood_cleansed"])))
    

Counter({'Venice': 727,
         'Santa Monica': 558,
         'Long Beach': 549,
         'West Hollywood': 535,
         'Hollywood': 515,
         'Beverly Hills': 514,
         'Downtown': 397,
         'Hollywood Hills West': 339,
         'Hollywood Hills': 287,
         'Malibu': 283,
         'Glendale': 270,
         'Pasadena': 218,
         'Beverly Grove': 207,
         'Culver City': 203,
         'Woodland Hills': 190,
         'Sherman Oaks': 183,
         'Silver Lake': 178,
         'Mid-Wilshire': 172,
         'Topanga': 171,
         'Manhattan Beach': 169,
         'Westwood': 165,
         'Marina del Rey': 165,
         'Burbank': 164,
         'Studio City': 162,
         'Redondo Beach': 159,
         'Hermosa Beach': 148,
         'Pico-Robertson': 148,
         'Koreatown': 141,
         'Mid-City': 139,
         'Inglewood': 135,
         'Avalon': 134,
         'North Hollywood': 126,
         'Westlake': 125,
         'East Hollywood': 120,
         'Echo 

In [13]:
scaler = RobustScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)

In [31]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

hyperparameters = {
    'n_estimators': [300],
    'class_weight' : [None], 
    'max_depth': [30, 35, 40, 45, 50, 55],
    # 'min_samples_split': [5, 10],
    # 'min_samples_leaf': [1, 2, 4],
    # 'max_features': ['log2', 'sqrt'], 
    'criterion': ["gini", "entropy", "log_loss"]
}
clf = GridSearchCV(rf, hyperparameters, cv=5, verbose=3, n_jobs=20, scoring="accuracy")

best_rf = clf.fit(X_train,y_train)

print('Best Parameter Estimates:', best_rf.best_estimator_.get_params())

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 2/5] END class_weight=None, criterion=gini, max_depth=40, n_estimators=300;, score=0.544 total time=   6.7s
[CV 4/5] END class_weight=None, criterion=gini, max_depth=35, n_estimators=300;, score=0.554 total time=   6.8s
[CV 3/5] END class_weight=None, criterion=gini, max_depth=45, n_estimators=300;, score=0.562 total time=   6.8s
[CV 5/5] END class_weight=None, criterion=gini, max_depth=40, n_estimators=300;, score=0.538 total time=   6.9s
[CV 4/5] END class_weight=None, criterion=gini, max_depth=40, n_estimators=300;, score=0.557 total time=   6.8s
[CV 4/5] END class_weight=None, criterion=gini, max_depth=45, n_estimators=300;, score=0.545 total time=   6.8s
[CV 2/5] END class_weight=None, criterion=gini, max_depth=30, n_estimators=300;, score=0.552 total time=   6.8s
[CV 5/5] END class_weight=None, criterion=gini, max_depth=45, n_estimators=300;, score=0.549 total time=   6.8s
[CV 1/5] END class_weight=None, criterion=g

In [32]:
y_pred = best_rf.predict(X_val)
print(classification_report(y_val, y_pred, digits=5))

              precision    recall  f1-score   support

           0    0.72603   0.85697   0.78608       804
           1    0.49378   0.49910   0.49643       557
           2    0.46588   0.30545   0.36898       514
           3    0.47393   0.58366   0.52310       514
           4    0.49492   0.46497   0.47947       314
           5    0.84810   0.57759   0.68718       232

    accuracy                        0.58058      2935
   macro avg    0.58377   0.54795   0.55687      2935
weighted avg    0.57717   0.58058   0.57139      2935



In [9]:
X_test = pd.read_csv(os.path.join("data", "test.csv"))
X_test = preprocess(X_test).to_numpy().astype(float)
X_test = scaler.transform(X_test)
test_predictions = best_rf.predict(X_test)

submission = {
    "id" : list(range(len(test_predictions))), 
    "price" : list(test_predictions.astype(float))
}

submission = pd.DataFrame.from_dict(submission)

submission.to_csv(os.path.join("submissions/", "bahng_rf3.csv"), index=False)