In [51]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import root_mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Ridge
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import optuna
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
from optuna.trial import Trial
from sklearn.model_selection import KFold
import time, pickle, os


# Notebook-wide settings: silence warnings, show every DataFrame column,
# and apply the plotting defaults used by all figures below.
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
plt.rcParams.update({
    'axes.unicode_minus': False,
    'figure.figsize': (21, 9),
    'figure.dpi': 300,
})
plt.style.use('dark_background')

In [52]:
# Seed passed to the KFold splitters and the tuned LightGBM models.
SEED = 42

# English names assigned positionally to the feature columns of the CSVs
# once the `ID` column has been dropped.
COLUMNS = [
    'manufacturer', 'model', 'vehicle_condition',
    'battery_capacity', 'drivetrain', 'mileage',
    'warranty_period', 'accident_history', 'year_of_manufacture',
]

In [53]:
# Training set: drop the ID column, rename the remaining columns positionally
# (features followed by the target `y`), and rewrite the manufacturer suffix.
train = pd.read_csv('./data/train.csv').drop(columns=['ID'])
train.columns = [*COLUMNS, 'y']
train['manufacturer'] = train['manufacturer'].str.replace('사', '_corp')

# Test set: same treatment, without a target column.
test = pd.read_csv('./data/test.csv').drop(columns=['ID'])
test.columns = list(COLUMNS)
test['manufacturer'] = test['manufacturer'].str.replace('사', '_corp')
test_X = test  # alias of the same DataFrame object, not a copy

In [54]:
# Replace missing battery capacities with 0 in both splits.
for _frame in (train, test):
    _frame['battery_capacity'] = _frame['battery_capacity'].fillna(0)

In [55]:
# Standardise the numeric feature columns. The scaler is fitted on train only
# and the same fitted statistics are applied to test. The target `y` is left
# unscaled. (StandardScaler is already imported in the first cell.)
x_scaler = StandardScaler()

# Resolve the numeric columns once, from train, and reuse the exact same list
# for test so both splits are always scaled over identical columns in
# identical order (a per-frame select_dtypes could disagree between splits).
num_cols = list(train.select_dtypes(include='number').columns.drop('y'))

# Whole-column assignment replaces the columns with the float results.
# Writing floats through `.loc[:, cols] = ...` into integer columns relies on
# silent upcasting, which pandas has deprecated.
train[num_cols] = x_scaler.fit_transform(train[num_cols])
test[num_cols] = x_scaler.transform(test[num_cols])

In [56]:
# Stack train on top of test with a fresh 0..n-1 index; test has no `y`
# column, so its rows hold NaN there after the concat.
combined = pd.concat((train, test), ignore_index=True)

# Every column except the target is a feature.
FEATURES = combined.drop(columns='y').columns

# Filled in by the feature-inspection loop in the next cell.
CATS, HIGH_CARDINALITY = [], []

In [57]:
# Inspect every feature, integer-encode the object-typed (categorical) ones on
# the combined frame so train and test share one code mapping, and record
# which features are categorical / high-cardinality.
print(f"THE {len(FEATURES)} BASIC FEATURES ARE:")
for c in FEATURES:
    ftype = "numerical"
    if combined[c].dtype == "object":
        ftype = "categorical"
        if c not in CATS:
            CATS.append(c)
        # Missing values become their own "NAN" category, so factorize()
        # never emits its -1 sentinel and the codes already start at 0.
        codes, _ = combined[c].fillna("NAN").factorize()
        combined[c] = codes

    n = combined[c].nunique()
    print(f"{c} ({ftype}) with {n} unique values")
    # Membership check keeps the list free of duplicates if this cell is re-run.
    if n >= 25 and c not in HIGH_CARDINALITY:
        HIGH_CARDINALITY.append(c)

# Split the combined frame back into its original halves; test keeps the
# all-NaN `y` column introduced by the concat.
n_train = len(train)
train = combined.iloc[:n_train].copy()
test = combined.iloc[n_train:].reset_index(drop=True).copy()

THE 9 BASIC FEATURES ARE:
manufacturer (categorical) with 7 unique values
model (categorical) with 21 unique values
vehicle_condition (categorical) with 3 unique values
battery_capacity (numerical) with 203 unique values
drivetrain (categorical) with 3 unique values
mileage (numerical) with 7633 unique values
warranty_period (numerical) with 11 unique values
accident_history (categorical) with 2 unique values
year_of_manufacture (numerical) with 3 unique values


In [62]:
# 5-fold SVR baseline: `oof` receives out-of-fold predictions for train and
# `predictions` the fold-averaged predictions for test.
base_line_model = SVR(C=10, epsilon=0.1, kernel='rbf', gamma='scale')

kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
oof = np.zeros(len(train))
predictions = np.zeros(len(test))

# One-hot encode train and test together, once. Encoding each fold (and test)
# separately yields different dummy columns whenever a category is absent
# from one of the subsets, so the model could be fitted and queried on
# mismatched feature matrices. Both frames are selected through FEATURES so
# they always carry the same input columns.
n_train = len(train)
encoded = pd.get_dummies(
    pd.concat([train[FEATURES], test[FEATURES]], axis=0, ignore_index=True),
    columns=CATS,
)
train_x = encoded.iloc[:n_train]
test_x = encoded.iloc[n_train:]

for tr_idx, va_idx in kf.split(train_x):
    tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
    tr_y, va_y = train['y'].iloc[tr_idx], train['y'].iloc[va_idx]

    base_line_model.fit(tr_x, tr_y)
    oof[va_idx] = base_line_model.predict(va_x)
    # Each fold contributes an equal share to the averaged test prediction.
    predictions += base_line_model.predict(test_x) / kf.n_splits


In [63]:
# Spot check: display the first fold-averaged baseline prediction for test.
predictions[0]

130.67418311977372

In [59]:
# Put the SVR baseline predictions into the sample submission's price column
# and write the result to submission.csv (no index column).
submission = pd.read_csv('./data/sample_submission.csv').assign(
    **{'가격(백만원)': predictions}
)
submission.to_csv('submission.csv', index=False)

In [45]:
# Expose the baseline as a feature: out-of-fold predictions for train,
# fold-averaged predictions for test.
for _frame, _values in ((train, oof), (test, predictions)):
    _frame['base_line'] = _values

In [46]:
# Display the names of the columns that were detected as categorical.
CATS

['manufacturer',
 'model',
 'vehicle_condition',
 'drivetrain',
 'accident_history']

In [47]:
# One-hot encode the categorical columns of train and test in a single pass.
# Encoding the two frames independently drops any dummy column whose category
# is missing from one of them, and later cells select test columns by the
# train column names (which would then raise a KeyError).
n_train = len(train)
encoded = pd.get_dummies(
    pd.concat(
        [train.drop(columns=['y']), test.drop(columns=['y'])],
        axis=0,
        ignore_index=True,
    ),
    columns=CATS,
)

# Re-attach `y` as the last column of each frame, as before.
train = pd.concat(
    [encoded.iloc[:n_train].reset_index(drop=True), train['y'].reset_index(drop=True)],
    axis=1,
)
test = pd.concat(
    [encoded.iloc[n_train:].reset_index(drop=True), test['y'].reset_index(drop=True)],
    axis=1,
)

In [48]:
# 20-fold CV of a default LightGBM regressor (validation fold passed as
# eval_set). Prints the mean per-fold RMSE; `oof` holds out-of-fold
# predictions and `test_preds` the fold-averaged test predictions.
FOLD = 20
kf = KFold(n_splits=FOLD, shuffle=True, random_state=SEED)
oof = np.zeros(len(train))
test_preds = np.zeros(len(test))
all_score = 0

# Loop-invariant: the feature column list and the test feature matrix.
feature_cols = train.columns.drop('y')
test_features = test[feature_cols]

for trn_idx, val_idx in kf.split(train):
    _train = train.iloc[trn_idx]
    _valid = train.iloc[val_idx]

    model = LGBMRegressor(verbose=0)
    model.fit(
        _train[feature_cols], _train.y,
        eval_set=[(_valid[feature_cols], _valid.y)],
    )
    # Predict the validation fold once and reuse it for the fold score.
    oof[val_idx] = model.predict(_valid[feature_cols])
    test_preds += model.predict(test_features)
    score = root_mean_squared_error(_valid.y, oof[val_idx])
    all_score += score
test_preds /= FOLD
print(all_score / FOLD)

1.502569520233469


In [20]:
# 20-fold CV of a default LightGBM regressor, fitted without an eval_set.
# Prints the mean per-fold RMSE; `oof` holds out-of-fold predictions and
# `test_preds` the fold-averaged test predictions.
FOLD = 20
kf = KFold(n_splits=FOLD, shuffle=True, random_state=SEED)
oof = np.zeros(len(train))
test_preds = np.zeros(len(test))
all_score = 0

# Loop-invariant: the feature column list and the test feature matrix.
feature_cols = train.columns.drop('y')
test_features = test[feature_cols]

for trn_idx, val_idx in kf.split(train):
    _train = train.iloc[trn_idx]
    _valid = train.iloc[val_idx]

    model = LGBMRegressor(verbose=0)
    model.fit(_train[feature_cols], _train.y)
    # Predict the validation fold once and reuse it for the fold score.
    oof[val_idx] = model.predict(_valid[feature_cols])
    test_preds += model.predict(test_features)
    score = root_mean_squared_error(_valid.y, oof[val_idx])
    all_score += score
test_preds /= FOLD
print(all_score / FOLD)

1.3496811992980629


In [10]:
# Number of KFold splits read by the Optuna objective defined below.
# This rebinds the FOLD value set by the earlier CV cells.
FOLD = 100

In [11]:
def objective(trial: optuna.Trial):
    """Optuna objective: out-of-fold RMSE of a LightGBM regressor.

    Samples one hyper-parameter configuration from ``trial``, fits the model
    on every training split of a ``FOLD``-way shuffled KFold over the global
    ``train`` frame, and scores the concatenated out-of-fold predictions.

    Args:
        trial: The Optuna trial used to sample hyper-parameters.

    Returns:
        RMSE between ``train.y`` and the out-of-fold predictions (lower is
        better).
    """
    # suggest_float replaces the deprecated suggest_uniform and
    # suggest_loguniform (log=True gives the log-uniform ranges).
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'num_leaves': trial.suggest_int('num_leaves', 5, 100),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        # NOTE(review): LightGBM documents bagging (`subsample`) as active
        # only when `subsample_freq` > 0; it is left at its default here, so
        # confirm this parameter actually influences training.
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 100.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 100.0, log=True),
        'random_state': SEED,
        'n_jobs': -1,
        'verbose': -1
    }

    # Loop-invariant feature matrix and target, built once per trial.
    features = train.drop(columns=['y'])
    target = train['y']

    kf = KFold(n_splits=FOLD, shuffle=True, random_state=SEED)
    model = LGBMRegressor(**params)
    oof = np.zeros(len(train))
    for trn_idx, val_idx in kf.split(features):
        model.fit(features.iloc[trn_idx], target.iloc[trn_idx])
        oof[val_idx] = model.predict(features.iloc[val_idx])
    return root_mean_squared_error(target, oof)

In [12]:
# Minimise the out-of-fold RMSE over 100 trials. The sampler is seeded so the
# sequence of sampled hyper-parameters is repeatable across runs, in line
# with the SEED used for the CV splits and models.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study.optimize(objective, n_trials=100)

[I 2025-01-27 18:54:24,700] A new study created in memory with name: no-name-ddc988b8-4ff8-4169-9801-5130ea4a4308
[I 2025-01-27 18:54:30,616] Trial 0 finished with value: 1.749095458057816 and parameters: {'n_estimators': 267, 'learning_rate': 0.0706502958619026, 'max_depth': 3, 'num_leaves': 11, 'min_child_samples': 22, 'subsample': 0.9975924061205617, 'colsample_bytree': 0.9290193449590198, 'reg_alpha': 0.011351449032250382, 'reg_lambda': 1.1718414574044518}. Best is trial 0 with value: 1.749095458057816.
[I 2025-01-27 18:54:37,003] Trial 1 finished with value: 1.3739115157873565 and parameters: {'n_estimators': 265, 'learning_rate': 0.24985941045825252, 'max_depth': 3, 'num_leaves': 89, 'min_child_samples': 54, 'subsample': 0.5700322416550678, 'colsample_bytree': 0.7978781393936755, 'reg_alpha': 10.400721175078193, 'reg_lambda': 0.12357732095894818}. Best is trial 1 with value: 1.3739115157873565.
[I 2025-01-27 18:55:02,089] Trial 2 finished with value: 1.3933279114871513 and parame

In [24]:
# Display the hyper-parameters of the best trial found by the study.
study.best_params

{'n_estimators': 287,
 'learning_rate': 0.1227047898956043,
 'max_depth': 6,
 'num_leaves': 48,
 'min_child_samples': 52,
 'subsample': 0.977801968846171,
 'colsample_bytree': 0.8679243554521052,
 'reg_alpha': 0.12243143591814588,
 'reg_lambda': 2.5660014471233006}

In [19]:
# Refit LightGBM on the full training set with the best trial's parameters.
# `study.best_params` contains only the sampled values, so the fixed settings
# used during tuning (seed, threads, verbosity) are passed again explicitly;
# otherwise the final model would not match the configuration that was scored.
best_model = LGBMRegressor(
    **study.best_params,
    random_state=SEED,
    n_jobs=-1,
    verbose=-1,
)
best_model.fit(train.drop(columns=['y']), train.y)

# Predictions for the test set, as a Series with a default 0..n-1 index.
test_preds = pd.Series(best_model.predict(test.drop(columns=['y'])))

In [21]:

# 5-nearest-neighbour regressor fitted on every training column except `y`.
# NOTE: this rebinds `test_preds`, replacing whatever an earlier cell stored
# under that name.
_knn_train_features = train.drop(columns=['y'])
_knn_test_features = test.drop(columns=['y'])
knn = KNeighborsRegressor(n_neighbors=5).fit(_knn_train_features, train.y)
test_preds = pd.Series(knn.predict(_knn_test_features))

In [22]:
# Display the current test predictions.
test_preds

0      130.470
1       80.576
2       64.740
3       34.568
4       47.776
        ...   
841    151.412
842     38.814
843     38.798
844     58.816
845     22.822
Length: 846, dtype: float64

In [17]:
# Write the current `test_preds` into the sample submission's price column
# and save it as submission.csv (no index column).
submission = pd.read_csv('./data/sample_submission.csv')
# `test_preds` may be a pandas Series. Assigning a Series aligns on index and
# would silently produce NaN for unmatched labels. Converting to an array
# makes the assignment positional, so a length mismatch raises instead.
submission['가격(백만원)'] = np.asarray(test_preds)
submission.to_csv('submission.csv', index=False)

In [50]:
# Overwrite submission.csv with the SVR baseline `predictions` placed in the
# sample submission's price column (no index column).
_price_column = '가격(백만원)'
submission = pd.read_csv('./data/sample_submission.csv')
submission.loc[:, _price_column] = predictions
submission.to_csv('submission.csv', index=False)