In [1]:
import os
import re
import ast
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer
from nltk.tokenize import RegexpTokenizer
from preprocess import preprocess
import matplotlib.pyplot as plt

train_data = pd.read_csv(os.path.join("data", "train.csv"))
test_data = pd.read_csv(os.path.join("data", "test.csv"))
train_data.dropna(inplace=True)
X, y = train_data.drop(["price"], axis=1), train_data["price"]
y = y.to_numpy().astype(int)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, random_state=42)
X_test = test_data

X_train = preprocess(X_train).to_numpy().astype(float)
X_val = preprocess(X_val).to_numpy().astype(float)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/muchang/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/muchang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/muchang/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
scaler = RobustScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)

In [3]:
from sklearn.svm import SVC
svc = SVC()

hyperparameters = {
    "cache_size" : [70, 80, 90, 100], 
    "C": [1.0, 2.0], 
    "gamma" : ["auto"]
}
clf = GridSearchCV(svc, hyperparameters, cv=5, verbose=0, n_jobs=20)

best_svc = clf.fit(X_train,y_train)

print('Best hyperparameters:', best_svc.best_estimator_.get_params())

Best hyperparameters: {'C': 2.0, 'break_ties': False, 'cache_size': 70, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'auto', 'kernel': 'rbf', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}


In [4]:
y_pred = best_svc.predict(X_val)
print(classification_report(y_val, y_pred))



              precision    recall  f1-score   support

           0       0.70      0.84      0.76       804
           1       0.43      0.48      0.45       557
           2       0.41      0.24      0.30       514
           3       0.44      0.52      0.48       514
           4       0.45      0.40      0.42       314
           5       0.79      0.50      0.61       232

    accuracy                           0.54      2935
   macro avg       0.54      0.50      0.50      2935
weighted avg       0.53      0.54      0.52      2935



In [None]:
X_test = pd.read_csv(os.path.join("data", "test.csv"))
X_test = scaler.transform(preprocess(X_test))
test_predictions = best_rf.predict(X_test)

submission = {
    "id" : list(range(len(test_predictions))), 
    "price" : list(test_predictions.astype(float))
}

submission = pd.DataFrame.from_dict(submission)

submission.to_csv(os.path.join("submissions/", "bahng_rf1.csv"), index=False)