In [1]:
import os
import re
import ast
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer
from nltk.tokenize import RegexpTokenizer
from preprocess import preprocess
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

train_data = pd.read_csv(os.path.join("data", "train.csv"))
test_data = pd.read_csv(os.path.join("data", "test.csv"))

train_data["description"].fillna("", inplace=True)
train_data["host_is_superhost"].fillna("f", inplace=True)
train_data["beds"].fillna("1", inplace=True)
train_data.dropna(inplace=True)


X, y = train_data.drop(["price"], axis=1), train_data["price"]
y = y.to_numpy().astype(int)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, random_state=42)
X_test = test_data

X_train = preprocess(X_train).to_numpy().astype(float)
X_val = preprocess(X_val).to_numpy().astype(float)
X_test = preprocess(X_test)

# scale
scaler =  RobustScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)

X_train.shape

(12346, 347)

In [2]:
import xgboost as xgb

param_grid = {
    'n_estimators' : [10, 30, 50, 70],
    'learning_rate' : [0.1, 0.3, 1.0], 
    'eval_metric' : ["mlogloss", "merror"], 
    'max_depth': [3, 4, 5],
    'subsample': [0.6, 0.8, 1.0]
}

# XGBoost Classifier
xgb_clf = xgb.XGBClassifier(
    objective="multi:softmax"
)
gscv2 = GridSearchCV(xgb_clf, param_grid, cv=5, verbose=3, n_jobs=20, scoring="accuracy")
gscv2.fit(X_train, y_train)
xgb_clf = gscv2.best_estimator_

# Test accuracy
xgb_test_accuracy = xgb_clf.score(X_val, y_val)
print("XGBoost Val Accuracy:", xgb_test_accuracy)

Fitting 5 folds for each of 432 candidates, totalling 2160 fits
[CV 5/5] END eval_metric=mlogloss, learning_rate=0.1, max_depth=3, n_estimators=10, subsample=0.8;, score=0.457 total time=   0.4s
[CV 3/5] END eval_metric=mlogloss, learning_rate=0.1, max_depth=3, n_estimators=10, subsample=1.0;, score=0.470 total time=   0.4s
[CV 2/5] END eval_metric=mlogloss, learning_rate=0.1, max_depth=3, n_estimators=10, subsample=0.8;, score=0.459 total time=   0.4s
[CV 1/5] END eval_metric=mlogloss, learning_rate=0.1, max_depth=3, n_estimators=10, subsample=1.0;, score=0.448 total time=   0.5s
[CV 1/5] END eval_metric=mlogloss, learning_rate=0.1, max_depth=3, n_estimators=10, subsample=0.6;, score=0.449 total time=   0.4s
[CV 3/5] END eval_metric=mlogloss, learning_rate=0.1, max_depth=3, n_estimators=10, subsample=0.8;, score=0.465 total time=   0.5s
[CV 1/5] END eval_metric=mlogloss, learning_rate=0.1, max_depth=3, n_estimators=10, subsample=0.8;, score=0.450 total time=   0.4s
[CV 4/5] END eval_m

In [12]:
print(classification_report(y_val, xgb_clf.predict(X_val), digits=5))

              precision    recall  f1-score   support

           0    0.74836   0.85930   0.80000       796
           1    0.45685   0.45531   0.45608       593
           2    0.38084   0.30815   0.34066       503
           3    0.45008   0.50355   0.47531       564
           4    0.52199   0.46719   0.49307       381
           5    0.82759   0.67200   0.74172       250

    accuracy                        0.56333      3087
   macro avg    0.56428   0.54425   0.55114      3087
weighted avg    0.55646   0.56333   0.55717      3087



In [7]:
xgb_test_accuracy = xgb_clf.score(X_val, y_val)
print("XGBoost Val Accuracy:", xgb_test_accuracy)

XGBoost Val Accuracy: 0.5633300939423388


In [6]:
print(f"n_estimators : {xgb_clf.n_estimators}")
print(f"learning_rate : {xgb_clf.learning_rate}")
print(f"eval_metric : {xgb_clf.eval_metric}")
print(f"max_depth : {xgb_clf.max_depth}")
print(f"subsample : {xgb_clf.subsample}")

n_estimators : 70
learning_rate : 0.3
eval_metric : mlogloss
max_depth : 5
subsample : 1.0


In [4]:
X_test_input = X_test.to_numpy().astype(float)
X_test_input = scaler.transform(X_test_input)
test_predictions = xgb_clf.predict(X_test_input)

submission = {
    "id" : list(range(len(test_predictions))), 
    "price" : list(test_predictions.astype(float).astype(int).astype(float))
}

submission = pd.DataFrame.from_dict(submission)

submission.to_csv(os.path.join("submissions/", "bahng_xgb5.csv"), index=False)