In [2]:
import os
import re
import ast
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from nltk.tokenize import RegexpTokenizer
from preprocess import preprocess
import matplotlib.pyplot as plt

# Load the raw train/test tables from the local data directory.
train_data = pd.read_csv(os.path.join("data", "train.csv"))
test_data = pd.read_csv(os.path.join("data", "test.csv"))

# Discard every training row that has at least one missing value.
train_data = train_data.dropna()

# Features are all columns except the target; the target is cast to an
# integer array (the classification report further down treats it as class labels).
X = train_data.drop(columns=["price"])
y = train_data["price"].to_numpy().astype(int)

# Hold out 20% of the training rows for validation; the seed keeps the split fixed.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.20, random_state=42
)
X_test = test_data


In [3]:
# Run the project-level feature pipeline on both splits and cast the result
# to float arrays so the scaler and the model receive plain numeric matrices.
X_train = preprocess(X_train).to_numpy().astype(float)
X_val = preprocess(X_val).to_numpy().astype(float)

# Fit the scaler on the training split only, then apply the same fitted
# transform to both splits so no validation statistics leak into training.
scaler = RobustScaler().fit(X_train)
X_train, X_val = scaler.transform(X_train), scaler.transform(X_val)

In [4]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so the grid search gives the same result on every
# Restart & Run All; 42 matches the seed used for the train/validation split.
rf = RandomForestClassifier(random_state=42)

# Hyperparameter grid explored by the cross-validated search.
hyperparameters = {
    'n_estimators': [300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    # 'auto' was dropped: for classifiers it was only an alias of 'sqrt'
    # (deprecated in scikit-learn 1.1, removed in 1.3), so listing both
    # doubled the number of fits without adding a distinct candidate and
    # makes those fits fail on current scikit-learn releases.
    'max_features': ['sqrt'],
}

# 5-fold cross-validation over the grid; the candidate fits are independent,
# so run them on all available cores.
clf = GridSearchCV(rf, hyperparameters, cv=5, n_jobs=-1)

# NOTE: fit() returns the GridSearchCV object itself, so `best_rf` is the
# fitted search (not a bare forest). Its predict() delegates to the estimator
# refitted with the best parameters, available as `best_rf.best_estimator_`.
best_rf = clf.fit(X_train, y_train)

print('Number of trees:', best_rf.best_estimator_.get_params()['n_estimators'])

Number of trees: 300


In [5]:
# Score the tuned model on the held-out validation split and print
# per-class precision / recall / F1.
y_pred = best_rf.predict(X_val)
validation_report = classification_report(y_val, y_pred)
print(validation_report)

              precision    recall  f1-score   support

           0       0.74      0.86      0.79       804
           1       0.51      0.50      0.50       557
           2       0.43      0.30      0.35       514
           3       0.46      0.58      0.51       514
           4       0.48      0.46      0.47       314
           5       0.83      0.56      0.67       232

    accuracy                           0.58      2935
   macro avg       0.57      0.54      0.55      2935
weighted avg       0.57      0.58      0.57      2935



In [None]:
# Reload the raw test table so this cell can be re-run on its own.
X_test = pd.read_csv(os.path.join("data", "test.csv"))

# Push the test features through exactly the same steps as the train and
# validation splits: preprocess -> float ndarray -> fitted scaler. The scaler
# was fitted on a plain ndarray, so handing it a DataFrame here would trigger
# scikit-learn's feature-name mismatch warning and skip the float cast.
X_test = preprocess(X_test).to_numpy().astype(float)
X_test = scaler.transform(X_test)
test_predictions = best_rf.predict(X_test)

# One row per test sample: a positional id (0..n-1) and the predicted
# class written as a float.
submission = pd.DataFrame(
    {
        "id": list(range(len(test_predictions))),
        "price": list(test_predictions.astype(float)),
    }
)

submission.to_csv("bahng_3.csv", index=False)