In [4]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer
from nltk.tokenize import RegexpTokenizer
from preprocess import preprocess
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

train_data = pd.read_csv(os.path.join("data", "train.csv"))
test_data = pd.read_csv(os.path.join("data", "test.csv"))
train_data["description"].fillna("", inplace=True)
train_data["host_is_superhost"].fillna("f", inplace=True)
train_data["beds"].fillna("1", inplace=True)
train_data.dropna(inplace=True)
X, y = train_data.drop(["price"], axis=1), train_data["price"]
y = y.astype(int)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, random_state=42)
X_test = test_data

X_train, X_val, X_test = preprocess(X_train), preprocess(X_val), preprocess(X_test)


In [5]:
scaler = RobustScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)

In [6]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

hyperparameters = {
    'n_estimators': [100, 200, 300, 400, 500],
    'class_weight' : [None], 
    'max_depth': [20, 30, 40, 50],
    'min_samples_split': [5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['log2', 'sqrt'], 
    'criterion': ["gini", "entropy", "log_loss"]
}
# clf = GridSearchCV(rf, hyperparameters, cv=5, verbose=3, n_jobs=20, scoring="accuracy")
clf = GridSearchCV(rf, hyperparameters, cv=5, verbose=3, n_jobs=20)
best_rf = clf.fit(X_train,y_train)

print('Best Parameter Estimates:', best_rf.best_estimator_.get_params())

Fitting 5 folds for each of 720 candidates, totalling 3600 fits
[CV 1/5] END class_weight=None, criterion=gini, max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100;, score=0.519 total time=   1.2s
[CV 4/5] END class_weight=None, criterion=gini, max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100;, score=0.537 total time=   1.3s
[CV 2/5] END class_weight=None, criterion=gini, max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100;, score=0.525 total time=   1.3s
[CV 3/5] END class_weight=None, criterion=gini, max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100;, score=0.525 total time=   1.3s
[CV 5/5] END class_weight=None, criterion=gini, max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100;, score=0.524 total time=   1.7s
[CV 4/5] END class_weight=None, criterion=gini, max_depth=20, max_features=l

IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer

In [10]:
from pprint import pprint
pprint(best_rf.best_estimator_.get_params())
y_pred = best_rf.predict(X_val).astype(int)
print(classification_report(y_val.astype(int), y_pred, digits=5))

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 40,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 500,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}
              precision    recall  f1-score   support

           0    0.69587   0.86809   0.77250       796
           1    0.48968   0.44013   0.46359       593
           2    0.43478   0.27833   0.33939       503
           3    0.43411   0.59574   0.50224       564
           4    0.59259   0.41995   0.49155       381
           5    0.84103   0.65600   0.73708       250

    accuracy                        0.56754      3087
   macro avg    0.58134   0.54304   0.55106      3087
weighted avg    0.56490   0.56754   0.55567      3087



In [5]:
X_test_input = X_test.to_numpy().astype(float)
X_test_input = scaler.transform(X_test_input)
test_predictions = best_rf.predict(X_test_input)

submission = {
    "id" : list(range(len(test_predictions))), 
    "price" : list(test_predictions.astype(float))
}

submission = pd.DataFrame.from_dict(submission)

submission.to_csv(os.path.join("submissions/", "bahng_rf6.csv"), index=False)

In [50]:
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.ensemble import StackingClassifier


# param_grid = {
#     'n_estimators' : [100, 200, 300, 400, 500, 1000],
#     'learning_rate' : [0.1, 0.5, 1.0], 
#     'algorithm' : ['SAMME.R']
# }

# # Adaboost with DecisionTree as base estimator
# ada_clf = AdaBoostClassifier(
#     estimator=DecisionTreeClassifier(max_depth = 5)
# )

# gscv = GridSearchCV(ada_clf, param_grid, cv=5, verbose=3, n_jobs=20, scoring="accuracy")
# gscv.fit(X_train, y_train)
# ada_clf = gscv.best_estimator_
# ada_val_accuracy = ada_clf.score(X_val, y_val)
# print("Adaboost Validation Accuracy:", ada_val_accuracy)

# estimators = [('ridge', DecisionTreeClassifier()),
#               ('lasso', LinearSVC()),
#               ('knr', KNeighborsClassifier(n_neighbors=20,
                                        #   metric='euclidean'))]


reg = AdaBoostClassifier(
estimator=RandomForestClassifier(), n_estimators=50)

# reg = StackingClassifier(
    # estimators=estimators,
    # final_estimator=final_estimator)

reg.fit(X_train, y_train)

ada_val_accuracy = reg.score(X_val, y_val)
print("Adaboost Validation Accuracy:", ada_val_accuracy)

Adaboost Validation Accuracy: 0.5706984667802385


In [39]:
X_test_input = X_test.to_numpy().astype(float)
X_test_input = scaler.transform(X_test_input)
test_predictions = reg.predict(X_test_input)

submission = {
    "id" : list(range(len(test_predictions))), 
    "price" : list(test_predictions.astype(float).astype(int).astype(float))
}

submission = pd.DataFrame.from_dict(submission)

submission.to_csv(os.path.join("submissions/", "bahng_ada2.csv"), index=False)