In [1]:
import os
import re
import ast
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from preprocess import preprocess
import matplotlib.pyplot as plt

# Load the raw CSVs from the local "data" directory. Training rows that
# contain any missing value are discarded; the test frame is kept as read.
train_data = pd.read_csv(os.path.join("data", "train.csv")).dropna()
test_data = pd.read_csv(os.path.join("data", "test.csv"))

# Split the "price" target (cast to an int array) from the feature columns.
y = train_data["price"].to_numpy().astype(int)
X = train_data.drop(columns=["price"])

# Hold out 20% of the training rows for validation, with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.20, random_state=42
)
X_test = test_data

In [2]:
# Run the project's preprocessing on each split (train first, then
# validation) and convert the results to float arrays.
X_train, X_val = (
    preprocess(split).to_numpy().astype(float) for split in (X_train, X_val)
)

# The scaler's statistics come from the training split only; the same fitted
# scaler is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_val = scaler.transform(X_train), scaler.transform(X_val)

In [4]:
from sklearn.ensemble import GradientBoostingClassifier

# Seed the booster: without a fixed random_state the fitted model (and every
# metric derived from it) can differ between runs of this cell.
gbm = GradientBoostingClassifier(random_state=42)

# Candidate hyperparameters for the search. Only one value is listed, so the
# grid search cross-validates a single configuration.
n_estimators = [150]
hyperparameters = {"n_estimators": n_estimators}
clf = GridSearchCV(gbm, hyperparameters, cv=5)

# NOTE: fit() returns the GridSearchCV object itself, so `best_gbm` is the
# fitted search rather than a bare estimator; the selected model is exposed
# as `best_gbm.best_estimator_`.
best_gbm = clf.fit(X_train, y_train)

print('Number of trees:', best_gbm.best_estimator_.get_params()['n_estimators'])

Number of trees: 150


In [5]:
# Predict on the held-out validation split and print the per-class
# precision / recall / F1 summary against the true labels.
y_pred = best_gbm.predict(X_val)
print(classification_report(y_true=y_val, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.72      0.84      0.78       804
           1       0.45      0.49      0.47       557
           2       0.39      0.22      0.28       514
           3       0.43      0.51      0.47       514
           4       0.44      0.44      0.44       314
           5       0.76      0.57      0.66       232

    accuracy                           0.54      2935
   macro avg       0.53      0.51      0.51      2935
weighted avg       0.53      0.54      0.53      2935



In [11]:
# Re-read the raw test set and push it through the same pipeline as the
# training features: preprocess -> float ndarray -> fitted scaler.
X_test = pd.read_csv(os.path.join("data", "test.csv"))
# Convert to a float array before scaling. The scaler was fitted on a float
# ndarray, so passing the DataFrame directly would skip the float cast and
# make scikit-learn warn about feature names it never saw during fit.
X_test = scaler.transform(preprocess(X_test).to_numpy().astype(float))
test_predictions = best_gbm.predict(X_test)

# NOTE(review): ids are positional row numbers (0..n-1), not values read from
# test.csv -- confirm this matches the expected submission format.
submission = pd.DataFrame(
    {
        "id": list(range(len(test_predictions))),
        "price": list(test_predictions.astype(float)),
    }
)

submission.to_csv("bahng_2.csv", index=False)