In [4]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer
from nltk.tokenize import RegexpTokenizer
from preprocess import preprocess
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

train_data = pd.read_csv(os.path.join("data", "train.csv"))
test_data = pd.read_csv(os.path.join("data", "test.csv"))
train_data.dropna(inplace=True)
X, y = train_data.drop(["price"], axis=1), train_data["price"]
y = y.astype(int)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, random_state=42)
X_test = test_data


In [5]:
scaler = RobustScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)

In [8]:
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
rf = RandomForestRegressor()

hyperparameters = {
    # 'n_estimators': [300, 400],
    # 'class_weight' : [None], 
    # 'max_depth': [20],
    # 'min_samples_split': [5, 10],
    # 'min_samples_leaf': [1, 2, 4],
    # 'max_features': ['log2', 'sqrt'], 
    # 'criterion': ["gini"]
}
# clf = GridSearchCV(rf, hyperparameters, cv=5, verbose=3, n_jobs=20, scoring="accuracy")
clf = GridSearchCV(rf, hyperparameters, cv=5, verbose=3, n_jobs=20)
best_rf = clf.fit(X_train,y_train)

print('Best Parameter Estimates:', best_rf.best_estimator_.get_params())

y_pred = np.log(best_rf.predict(X_val)).astype(int)
print(classification_report(np.log(y_val).astype(int), y_pred, digits=5))

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 5/5] END ..................................., score=0.624 total time=  29.3s
[CV 4/5] END ..................................., score=0.641 total time=  29.3s
[CV 2/5] END ..................................., score=0.666 total time=  29.5s
[CV 3/5] END ..................................., score=0.637 total time=  29.6s
[CV 1/5] END ..................................., score=0.643 total time=  29.8s
Best Parameter Estimates: {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


ValueError: Classification metrics can't handle a mix of continuous and multiclass targets

In [5]:
X_test_input = X_test.to_numpy().astype(float)
X_test_input = scaler.transform(X_test_input)
test_predictions = best_rf.predict(X_test_input)

submission = {
    "id" : list(range(len(test_predictions))), 
    "price" : list(test_predictions.astype(float))
}

submission = pd.DataFrame.from_dict(submission)

submission.to_csv(os.path.join("submissions/", "bahng_rf6.csv"), index=False)

In [50]:
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.ensemble import StackingClassifier


# param_grid = {
#     'n_estimators' : [100, 200, 300, 400, 500, 1000],
#     'learning_rate' : [0.1, 0.5, 1.0], 
#     'algorithm' : ['SAMME.R']
# }

# # Adaboost with DecisionTree as base estimator
# ada_clf = AdaBoostClassifier(
#     estimator=DecisionTreeClassifier(max_depth = 5)
# )

# gscv = GridSearchCV(ada_clf, param_grid, cv=5, verbose=3, n_jobs=20, scoring="accuracy")
# gscv.fit(X_train, y_train)
# ada_clf = gscv.best_estimator_
# ada_val_accuracy = ada_clf.score(X_val, y_val)
# print("Adaboost Validation Accuracy:", ada_val_accuracy)

# estimators = [('ridge', DecisionTreeClassifier()),
#               ('lasso', LinearSVC()),
#               ('knr', KNeighborsClassifier(n_neighbors=20,
                                        #   metric='euclidean'))]


reg = AdaBoostClassifier(
estimator=RandomForestClassifier(), n_estimators=50)

# reg = StackingClassifier(
    # estimators=estimators,
    # final_estimator=final_estimator)

reg.fit(X_train, y_train)

ada_val_accuracy = reg.score(X_val, y_val)
print("Adaboost Validation Accuracy:", ada_val_accuracy)

Adaboost Validation Accuracy: 0.5706984667802385


In [39]:
X_test_input = X_test.to_numpy().astype(float)
X_test_input = scaler.transform(X_test_input)
test_predictions = reg.predict(X_test_input)

submission = {
    "id" : list(range(len(test_predictions))), 
    "price" : list(test_predictions.astype(float).astype(int).astype(float))
}

submission = pd.DataFrame.from_dict(submission)

submission.to_csv(os.path.join("submissions/", "bahng_ada2.csv"), index=False)