# import

In [20]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import root_mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import optuna
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
from optuna.trial import Trial
from sklearn.model_selection import KFold
import time, pickle, os


warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['figure.figsize'] = (21, 9)
plt.rcParams['figure.dpi'] = 300
plt.style.use('dark_background')

# read data

In [21]:
COLUMNS=[
    'manufacturer',
    'model',
    'vehicle_condition',
    'battery_capacity',
    'drivetrain',
    'mileage',
    'warranty_period',
    'accident_history',
    'year_of_manufacture',
]
SEED = 42

In [22]:
train = pd.read_csv('./data/train.csv')
train = train.drop(columns=['ID'])
train.columns = COLUMNS + ['y']
train.manufacturer = train.manufacturer.str.replace('사', '_corp')

test = pd.read_csv('./data/test.csv')
test = test.drop(columns=['ID'])
test.columns = COLUMNS
test_X = test
test_X.manufacturer = test_X.manufacturer.str.replace('사', '_corp')

In [17]:
train

Unnamed: 0,manufacturer,model,vehicle_condition,battery_capacity,drivetrain,mileage,warranty_period,accident_history,year_of_manufacture,y
0,P_corp,TayGTS,Nearly New,86.077,AWD,13642,0,No,2,159.66
1,K_corp,Niro,Nearly New,56.000,FWD,10199,6,No,0,28.01
2,A_corp,eT,Brand New,91.200,AWD,2361,7,No,0,66.27
3,A_corp,RSeTGT,Nearly New,,AWD,21683,3,No,0,99.16
4,B_corp,i5,Pre-Owned,61.018,AWD,178205,1,No,0,62.02
...,...,...,...,...,...,...,...,...,...,...
7492,H_corp,ION5,Brand New,,AWD,3773,10,No,0,35.95
7493,B_corp,i3,Pre-Owned,46.000,RWD,135411,2,No,0,23.40
7494,P_corp,TayCT,Brand New,,AWD,1363,2,No,0,120.00
7495,B_corp,i3,Nearly New,56.000,RWD,39445,6,No,2,24.00


### Fillna

In [23]:
train['battery_capacity'] = train['battery_capacity'].fillna(0)
test['battery_capacity'] = test['battery_capacity'].fillna(0)

## Scaling

In [19]:
from sklearn.preprocessing import StandardScaler
x_scaler = StandardScaler()
# y_scaler = StandardScaler()

# select numerical columns
x_scaler.fit(train[train.select_dtypes(include='number').columns.drop('y')])
train.loc[:, train.select_dtypes(include='number').columns.drop('y')] = x_scaler.transform(train[train.select_dtypes(include='number').columns.drop('y')])
# train.loc[:, 'y'] = y_scaler.fit_transform(train[['y']])
test.loc[:, test.select_dtypes(include='number').columns] = x_scaler.transform(test[test.select_dtypes(include='number').columns])

In [11]:
# def inverse_y(y):
#     if isinstance(y, pd.Series):
#         y = y.values
#     return y_scaler.inverse_transform(y.reshape(-1, 1)).reshape(-1)

In [24]:
combined = pd.concat([train,test], axis=0, ignore_index=True)
FEATURES = combined.columns.drop("y")
CATS = []
HIGH_CARDINALITY = []

In [25]:
print(f"THE {len(FEATURES)} BASIC FEATURES ARE:")
for c in FEATURES:
    ftype = "numerical"
    if combined[c].dtype=="object":
        CATS.append(c)
        combined[c] = combined[c].fillna("NAN")
        combined[c],_ = combined[c].factorize()
        combined[c] -= combined[c].min()
        ftype = "categorical"
    if combined[c].dtype=="int64":
        combined[c] = combined[c].astype("int32")
    elif combined[c].dtype=="float64":
        combined[c] = combined[c].astype("float32")
        
    n = combined[c].nunique()
    print(f"{c} ({ftype}) with {n} unique values")
    if n>=25: HIGH_CARDINALITY.append(c)
    
train = combined.iloc[:len(train)].copy()
test = combined.iloc[len(train):].reset_index(drop=True).copy()

THE 9 BASIC FEATURES ARE:
manufacturer (categorical) with 7 unique values
model (categorical) with 21 unique values
vehicle_condition (categorical) with 3 unique values
battery_capacity (numerical) with 203 unique values
drivetrain (categorical) with 3 unique values
mileage (numerical) with 7633 unique values
warranty_period (numerical) with 11 unique values
accident_history (categorical) with 2 unique values
year_of_manufacture (numerical) with 3 unique values


### Simple model with Catboost baseline

In [26]:
def target_encode(_train, _valid, col, target="y", kfold=5, smooth=20, agg="mean"):
    train = _train.copy()
    valid = _valid.copy()
    train['kfold'] = ((train.index) % kfold)
    col_name = '_'.join(col)
    train[f'TE_{agg.upper()}_' + col_name] = 0.
    for i in range(kfold):
        
        df_tmp = train[train['kfold']!=i]
        if agg=="mean": mn = train[target].mean()
        elif agg=="median": mn = train[target].median()
        elif agg=="min": mn = train[target].min()
        elif agg=="max": mn = train[target].max()
        elif agg=="nunique": mn = 0
        elif agg=='std': mn = train[target].std()
        elif agg=='skew': mn = train[target].skew()
        df_tmp = df_tmp[col + [target]].groupby(col).agg([agg, 'count']).reset_index()
        df_tmp.columns = col + [agg, 'count']
        if agg=="nunique":
            df_tmp['TE_tmp'] = df_tmp[agg] / df_tmp['count']
        else:
            df_tmp['TE_tmp'] = ((df_tmp[agg]*df_tmp['count'])+(mn*smooth)) / (df_tmp['count']+smooth)
        df_tmp_m = train[col + ['kfold', f'TE_{agg.upper()}_' + col_name]].merge(df_tmp, how='left', left_on=col, right_on=col)
        df_tmp_m.loc[df_tmp_m['kfold']==i, f'TE_{agg.upper()}_' + col_name] = df_tmp_m.loc[df_tmp_m['kfold']==i, 'TE_tmp']
        train[f'TE_{agg.upper()}_' + col_name] = df_tmp_m[f'TE_{agg.upper()}_' + col_name].fillna(mn).values  
    
    df_tmp = train[col + [target]].groupby(col).agg([agg, 'count']).reset_index()
    if agg=="mean": mn = train[target].mean()
    elif agg=="median": mn = train[target].median()
    elif agg=="min": mn = train[target].min()
    elif agg=="max": mn = train[target].max()
    elif agg=="nunique": mn = 0
    elif agg=='std': mn = train[target].std()
    elif agg=='skew': mn = train[target].skew()
    df_tmp.columns = col + [agg, 'count']
    if agg=="nunique":
        df_tmp['TE_tmp'] = df_tmp[agg] / df_tmp['count']
    else:
        df_tmp['TE_tmp'] = ((df_tmp[agg]*df_tmp['count'])+(mn*smooth)) / (df_tmp['count']+smooth)
        
    df_tmp_m = valid[col].merge(df_tmp, how='left', left_on=col, right_on=col)
    valid[f'TE_{agg.upper()}_' + col_name] = df_tmp_m['TE_tmp'].fillna(mn).values
    valid[f'TE_{agg.upper()}_' + col_name] = valid[f'TE_{agg.upper()}_' + col_name].astype("float32")

    # df_tmp_m = test[col].merge(df_tmp, how='left', left_on=col, right_on=col)
    # test[f'TE_{agg.upper()}_' + col_name] = df_tmp_m['TE_tmp'].fillna(mn).values
    # test[f'TE_{agg.upper()}_' + col_name] = test[f'TE_{agg.upper()}_' + col_name].astype("float32")

    train = train.drop('kfold', axis=1)
    train[f'TE_{agg.upper()}_' + col_name] = train[f'TE_{agg.upper()}_' + col_name].astype("float32")

    return train, valid#, test

def target_encode_w_t(_train, _valid, test, col, target="y", kfold=5, smooth=20, agg="mean"):
    train = _train.copy()
    valid = _valid.copy()
    train['kfold'] = ((train.index) % kfold)
    col_name = '_'.join(col)
    train[f'TE_{agg.upper()}_' + col_name] = 0.
    for i in range(kfold):
        
        df_tmp = train[train['kfold']!=i]
        if agg=="mean": mn = train[target].mean()
        elif agg=="median": mn = train[target].median()
        elif agg=="min": mn = train[target].min()
        elif agg=="max": mn = train[target].max()
        elif agg=="nunique": mn = 0
        elif agg=='std': mn = train[target].std()
        elif agg=='skew': mn = train[target].skew()
        df_tmp = df_tmp[col + [target]].groupby(col).agg([agg, 'count']).reset_index()
        df_tmp.columns = col + [agg, 'count']
        if agg=="nunique":
            df_tmp['TE_tmp'] = df_tmp[agg] / df_tmp['count']
        else:
            df_tmp['TE_tmp'] = ((df_tmp[agg]*df_tmp['count'])+(mn*smooth)) / (df_tmp['count']+smooth)
        df_tmp_m = train[col + ['kfold', f'TE_{agg.upper()}_' + col_name]].merge(df_tmp, how='left', left_on=col, right_on=col)
        df_tmp_m.loc[df_tmp_m['kfold']==i, f'TE_{agg.upper()}_' + col_name] = df_tmp_m.loc[df_tmp_m['kfold']==i, 'TE_tmp']
        train[f'TE_{agg.upper()}_' + col_name] = df_tmp_m[f'TE_{agg.upper()}_' + col_name].fillna(mn).values  
    
    df_tmp = train[col + [target]].groupby(col).agg([agg, 'count']).reset_index()
    if agg=="mean": mn = train[target].mean()
    elif agg=="median": mn = train[target].median()
    elif agg=="min": mn = train[target].min()
    elif agg=="max": mn = train[target].max()
    elif agg=="nunique": mn = 0
    elif agg=='std': mn = train[target].std()
    elif agg=='skew': mn = train[target].skew()
    df_tmp.columns = col + [agg, 'count']
    if agg=="nunique":
        df_tmp['TE_tmp'] = df_tmp[agg] / df_tmp['count']
    else:
        df_tmp['TE_tmp'] = ((df_tmp[agg]*df_tmp['count'])+(mn*smooth)) / (df_tmp['count']+smooth)
        
    df_tmp_m = valid[col].merge(df_tmp, how='left', left_on=col, right_on=col)
    valid[f'TE_{agg.upper()}_' + col_name] = df_tmp_m['TE_tmp'].fillna(mn).values
    valid[f'TE_{agg.upper()}_' + col_name] = valid[f'TE_{agg.upper()}_' + col_name].astype("float32")

    # if f'TE_{agg.upper()}_' + col_name not in test.columns:
    df_tmp_m = test[col].merge(df_tmp, how='left', left_on=col, right_on=col)
    test[f'TE_{agg.upper()}_' + col_name] = df_tmp_m['TE_tmp'].fillna(mn).values
    test[f'TE_{agg.upper()}_' + col_name] = test[f'TE_{agg.upper()}_' + col_name].astype("float32")

    train = train.drop('kfold', axis=1)
    train[f'TE_{agg.upper()}_' + col_name] = train[f'TE_{agg.upper()}_' + col_name].astype("float32")

    return train, valid, test

def count_encoding(_X: pd.DataFrame, col):
    global combined
    nm = f"CE_" + "_".join(col)
    X = _X.copy()

    tmp: pd.Series = combined.groupby(col).size()
    tmp.name = nm
    if nm in X.columns:
        X = X.drop(columns=nm)
    X = X.merge(tmp, how='left', left_on=col, right_index=True)
    X[nm] = X[nm].fillna(0)
    return X

In [27]:
_train, _valid = train_test_split(train, test_size=0.2, random_state=SEED)
# model = RandomForestRegressor(random_state=SEED)
# model = CatBoostRegressor(verbose=0, random_state=SEED)
# model = LGBMRegressor(random_state=SEED, verbose=0)
model = XGBRegressor(
      n_estimators=1000,  # 충분히 큰 값으로 설정
      learning_rate=0.05,
      objective='reg:squarederror',
      eval_metric='rmse',  # 반드시 지정!
      early_stopping_rounds=100,  # 50회 동안 개선 없으면 중지
      verbose=-1
      )
model.fit(_train.drop(columns=['y']), _train.y, 
    eval_set=[(_valid.drop(columns=['y']), _valid.y)], verbose=0
#   eval_set=(_valid.drop(columns=['y']), _valid.y)
    )
default = root_mean_squared_error((_valid.y), (model.predict(_valid.drop(columns=['y']))))
print(default)

1.5493326412786441


In [33]:
best = default
lists = []
CHECKED = set()

In [None]:
for _ in range(100_000_000):
    _train, _valid = train_test_split(train, test_size=0.2, random_state=SEED)

    l = np.random.choice([
        1,2,3,4,5,
        6,7,8,9
    ], 1)[0]
    c = np.sort(np.random.choice(train.columns.drop(['y']), l, replace=False)).tolist()
    # agg = np.random.choice([
    #     "mean", "median", 
    #     "count",
    #     # 'std', 'skew',
    #     # "min", "max", "nunique"
    # ], 1)[0]
    if tuple(c) in CHECKED:
        print('already checked'.ljust(300), end='\r')
        time.sleep(0.01)
        continue
    print(f"{c}".ljust(300), end='\r')
    new = 0
    temp_lists = lists + [c]
    for c in temp_lists:
        _train, _valid = target_encode(_train, _valid, c, target="y", kfold=5, smooth=20, agg='mean')
        _train = count_encoding(_train, c)
        _valid = count_encoding(_valid, c)
        
    CHECKED.add(tuple(c))
    # model = LGBMRegressor(verbose=0)
    # model.fit(_train.drop(columns=['y']), _train.y, eval_set=(_valid.drop(columns=['y']), _valid.y))
    model = XGBRegressor(
      n_estimators=1000,  # 충분히 큰 값으로 설정
      learning_rate=0.05,
      objective='reg:squarederror',
      eval_metric='rmse',  # 반드시 지정!
      early_stopping_rounds=100,  # 50회 동안 개선 없으면 중지
      verbose=-1
      )
    model.fit(_train.drop(columns=['y']), _train.y, 
              eval_set=[(_valid.drop(columns=['y']), _valid.y)], verbose=0
            #   eval_set=(_valid.drop(columns=['y']), _valid.y)
              )
    new = root_mean_squared_error((_valid.y), (model.predict(_valid.drop(columns=['y']))))
    
    if new < best:
        best = new
        print('\n', best)
        lists.append(c)

In [34]:
# fold te selection

FOLD = 10
kf = KFold(n_splits=FOLD, shuffle=True, random_state=SEED)
for _ in range(100_000_000):
    l = np.random.choice([
        1,2,3,4,5,
        6,7,8,9
    ], 1)[0]
    c = np.sort(np.random.choice(train.columns.drop(['y']), l, replace=False)).tolist()
    # agg = np.random.choice([
    #     "mean", "median", 
    #     "count",
    #     # 'std', 'skew',
    #     # "min", "max", "nunique"
    # ], 1)[0]
    if tuple(c) in CHECKED:
        print('already checked'.ljust(300), end='\r')
        time.sleep(0.01)
        continue
    print(f"{c}".ljust(300), end='\r')
    new = 0
    for trn_idx, val_idx in kf.split(train):
        _train = train.iloc[trn_idx].copy()
        _valid = train.iloc[val_idx].copy()
        
        temp_lists = lists + [c]
        for c in temp_lists:
            __train, __valid = target_encode(_train, _valid, c, target="y", kfold=5, smooth=20, agg='mean')
            __train = count_encoding(_train, c)
            __valid = count_encoding(_valid, c)
        CHECKED.add(tuple(c))
        model = XGBRegressor()
        model.fit(__train.drop(columns=['y']), __train.y, 
                #   eval_set=((__valid.drop(columns=['y']).values, __valid.y.values)), 
                  verbose=0)
        new += root_mean_squared_error(__valid.y, model.predict(__valid.drop(columns=['y'])))
    new /= FOLD
    
    
    if new < best * 0.99:
        best = new
        print('\n', best)
        lists.append(c)


['accident_history', 'battery_capacity', 'drivetrain', 'mileage', 'model', 'vehicle_condition', 'year_of_manufacture']                                                                                                                                                                                      
 1.4996223316947133
['accident_history', 'battery_capacity', 'drivetrain', 'manufacturer', 'model']                                                                                                                                                                                                                             
 1.4827580218880119
['accident_history', 'drivetrain', 'model', 'vehicle_condition']                                                                                                                                                                                                                                            
 1.4599758947057564
already checked                      

KeyboardInterrupt: 

In [12]:
pickle.dump(lists, open('lists.pkl', 'wb'))

In [None]:

lists = pickle.load(open("lists.pkl", "rb"))

In [23]:
c

['battery_capacity', 'drivetrain', 'manufacturer', 'model']

In [29]:
train

Unnamed: 0,manufacturer,model,vehicle_condition,battery_capacity,drivetrain,mileage,warranty_period,accident_history,year_of_manufacture,y
0,0,0,0,1.176470,0,-0.555177,-1.572333,0,3.122858,2.656020
1,1,1,0,0.329432,1,-0.617550,0.329331,0,-0.390885,-0.936624
2,2,2,1,1.320746,0,-0.759541,0.646275,0,-0.390885,0.107467
3,2,3,0,-1.247658,0,-0.409508,-0.621501,0,-0.390885,1.005014
4,3,4,2,0.470751,0,2.426016,-1.255389,0,-0.390885,-0.008513
...,...,...,...,...,...,...,...,...,...,...
7492,4,19,1,-1.247658,0,-0.733962,1.597107,0,-0.390885,-0.719947
7493,3,18,2,0.047809,2,1.650768,-0.938445,0,-0.390885,-1.062428
7494,0,10,1,-1.247658,0,-0.777621,-0.938445,0,-0.390885,1.573724
7495,3,18,0,0.329432,2,-0.087735,0.329331,0,3.122858,-1.046054


In [78]:
_train

Unnamed: 0,manufacturer,model,vehicle_condition,battery_capacity,drivetrain,mileage,warranty_period,accident_history,year_of_manufacture,y,TE_MEAN_battery_capacity_drivetrain_manufacturer_mileage_model_vehicle_condition_warranty_period_year_of_manufacture,CE_battery_capacity_drivetrain_manufacturer_mileage_model_vehicle_condition_warranty_period_year_of_manufacture,TE_MEAN_accident_history_battery_capacity_drivetrain_mileage_model_vehicle_condition_warranty_period,CE_accident_history_battery_capacity_drivetrain_mileage_model_vehicle_condition_warranty_period,TE_MEAN_drivetrain,CE_drivetrain,TE_MEAN_accident_history_drivetrain_manufacturer_mileage_vehicle_condition_warranty_period_year_of_manufacture,CE_accident_history_drivetrain_manufacturer_mileage_vehicle_condition_warranty_period_year_of_manufacture,TE_MEAN_drivetrain_manufacturer_mileage_vehicle_condition_warranty_period,CE_drivetrain_manufacturer_mileage_vehicle_condition_warranty_period,TE_MEAN_accident_history_battery_capacity_model_vehicle_condition_year_of_manufacture,CE_accident_history_battery_capacity_model_vehicle_condition_year_of_manufacture
0,0,0,0,1.176470,0,-0.555177,-1.572333,0,3.122858,159.66,62.33675,1,62.33675,1,72.428131,5747,62.33675,1,62.33675,1,75.051086,5
1,1,1,0,0.329432,1,-0.617550,0.329331,0,-0.390885,28.01,62.33675,1,62.33675,1,28.818651,1411,62.33675,1,62.33675,1,36.620937,91
2,2,2,1,1.320746,0,-0.759541,0.646275,0,-0.390885,66.27,62.33675,1,62.33675,1,72.210945,5747,62.33675,1,62.33675,1,64.724876,97
3,2,3,0,-1.247658,0,-0.409508,-0.621501,0,-0.390885,99.16,62.33675,1,62.33675,1,72.068718,5747,62.33675,1,62.33675,1,83.865936,44
5,4,5,2,0.390319,0,1.065428,-0.621501,0,-0.390885,37.02,62.33675,1,62.33675,1,72.428131,5747,62.33675,1,62.33675,1,48.976021,33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7492,4,19,1,-1.247658,0,-0.733962,1.597107,0,-0.390885,35.95,62.33675,1,62.33675,1,72.210945,5747,62.33675,1,62.33675,1,40.578934,142
7493,3,18,2,0.047809,2,1.650768,-0.938445,0,-0.390885,23.40,62.33675,1,62.33675,1,54.913116,1185,62.33675,1,62.33675,1,29.040983,157
7494,0,10,1,-1.247658,0,-0.777621,-0.938445,0,-0.390885,120.00,62.33675,1,62.33675,1,72.404419,5747,62.33675,1,62.33675,1,114.020691,131
7495,3,18,0,0.329432,2,-0.087735,0.329331,0,3.122858,24.00,62.33675,1,62.33675,1,55.439686,1185,62.33675,1,62.33675,1,39.509693,46


In [79]:
test

Unnamed: 0,manufacturer,model,vehicle_condition,battery_capacity,drivetrain,mileage,warranty_period,accident_history,year_of_manufacture,y,TE_MEAN_battery_capacity_drivetrain_manufacturer_mileage_model_vehicle_condition_warranty_period_year_of_manufacture,TE_MEAN_accident_history_battery_capacity_drivetrain_mileage_model_vehicle_condition_warranty_period,TE_MEAN_drivetrain,TE_MEAN_accident_history_drivetrain_manufacturer_mileage_vehicle_condition_warranty_period_year_of_manufacture,TE_MEAN_drivetrain_manufacturer_mileage_vehicle_condition_warranty_period,TE_MEAN_accident_history_battery_capacity_model_vehicle_condition_year_of_manufacture,CE_battery_capacity_drivetrain_manufacturer_mileage_model_vehicle_condition_warranty_period_year_of_manufacture,CE_accident_history_battery_capacity_drivetrain_mileage_model_vehicle_condition_warranty_period,CE_drivetrain,CE_accident_history_drivetrain_manufacturer_mileage_vehicle_condition_warranty_period_year_of_manufacture,CE_drivetrain_manufacturer_mileage_vehicle_condition_warranty_period,CE_accident_history_battery_capacity_model_vehicle_condition_year_of_manufacture
0,0,10,0,0.895298,0,-0.547659,-0.938445,0,-0.390885,,62.33675,62.33675,72.345085,62.33675,62.33675,104.032257,1,1,5747,1,1,40
1,3,12,1,1.286951,0,-0.665593,0.963219,0,-0.390885,,62.33675,62.33675,72.345085,62.33675,62.33675,75.960526,1,1,5747,1,1,79
2,3,4,1,-1.247658,2,-0.671933,0.646275,1,-0.390885,,62.33675,62.33675,54.972866,62.33675,62.33675,62.452618,1,1,1185,1,1,3
3,4,19,0,0.680870,0,-0.614687,0.646275,0,1.365987,,62.33675,62.33675,72.345085,62.33675,62.33675,47.594303,1,1,5747,1,1,27
4,1,15,1,-1.247658,1,-0.664687,1.597107,0,-0.390885,,62.33675,62.33675,28.711424,62.33675,62.33675,45.440086,1,1,1411,1,1,318
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
841,0,0,2,0.573965,0,1.322636,-0.938445,0,-0.390885,,62.33675,62.33675,72.345085,62.33675,62.33675,108.902260,1,1,5747,1,1,29
842,6,9,2,0.316675,0,0.507606,-1.572333,0,-0.390885,,62.33675,62.33675,72.345085,62.33675,62.33675,49.393066,1,1,5747,1,1,32
843,6,9,2,0.316675,0,1.453776,-1.572333,0,-0.390885,,62.33675,62.33675,72.345085,62.33675,62.33675,49.393066,1,1,5747,1,1,32
844,2,8,0,0.713735,0,-0.519181,-0.304557,0,-0.390885,,62.33675,62.33675,72.345085,62.33675,62.33675,60.591923,1,1,5747,1,1,25


In [13]:
FOLD = 10
kf = KFold(n_splits=FOLD, shuffle=True, random_state=SEED)
oof = np.zeros(len(train))
test_preds = np.zeros(len(test))
all_score = 0
for trn_idx, val_idx in kf.split(train):
    _train = train.iloc[trn_idx].copy()
    _valid = train.iloc[val_idx].copy()
    for c in lists:
        _train, _valid, test = target_encode_w_t(_train, _valid, test, c, target="y", kfold=5, smooth=20, agg='mean')
        _train = count_encoding(_train, c)
        _valid = count_encoding(_valid, c)
        test = count_encoding(test, c)
    # model = XGBRegressor(verbose=0)
    # model.fit(_train.drop(columns=['y']), _train.y, 
    #         #   eval_set=(_valid.drop(columns=['y']), _valid.y)
    #           )
    model = XGBRegressor(
      n_estimators=1000,  # 충분히 큰 값으로 설정
      learning_rate=0.05,
      objective='reg:squarederror',
      eval_metric='rmse',  # 반드시 지정!
      early_stopping_rounds=100,  # 50회 동안 개선 없으면 중지
      verbose=-1
      )
    model.fit(_train.drop(columns=['y']), _train.y, 
              eval_set=[(_valid.drop(columns=['y']), _valid.y)], verbose=0
            #   eval_set=(_valid.drop(columns=['y']), _valid.y)
              )
    oof[val_idx] = model.predict(_valid.drop(columns=['y']))
    test_preds += (model.predict(test[_train.drop(columns=['y']).columns]))
    score = root_mean_squared_error((_valid.y), (model.predict(_valid.drop(columns=['y']))))
    all_score += score
test_preds /= FOLD
print(all_score / FOLD)

3.5273237396567794


In [81]:
test_preds

array([130.79471741,  80.07255249,  65.80679398,  34.02470016,
        29.1231638 , 100.49319077,  21.84371338,  38.57113724,
        83.02510986,  23.96048603,  64.36469307,  64.48821793,
        39.68559647, 100.53721237,  91.97183533,  22.90969372,
       124.50867538,  51.3492794 ,  59.90071564,  34.3521801 ,
        37.64754333, 160.675914  ,  46.21011505,  54.00848236,
        23.15104485,  40.29012928,  80.44659042,  82.02990723,
        28.20986385, 125.09742813,  64.31525116,  40.57493477,
       114.76878204,  90.52999268,  35.76923447,  81.94037781,
        21.10951176,  38.24902725,  24.62910309, 121.81495667,
       128.17913132,  19.51195374, 120.55031357,  59.87925682,
        23.13883076, 100.41558762,  20.99717903,  74.70483398,
        62.5912117 ,  84.92684021,  35.43708878,  16.78363132,
        37.80922623, 125.16422501,  23.77247829,  27.57353382,
       155.68782806,  40.38663635, 159.89941559,  39.71319351,
        53.99379921, 126.35127106,  26.64829674, 121.59

In [30]:
def objective(trial: optuna.Trial):
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
        'num_leaves': int(trial.suggest_loguniform('num_leaves', 2, 256)),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
    }
    FOLD = 10
    kf = KFold(n_splits=FOLD, shuffle=True, random_state=SEED)
    oof = np.zeros(len(train))
    # test_preds = np.zeros(len(test))
    all_score = 0
    for trn_idx, val_idx in kf.split(train):
        _train = train.iloc[trn_idx].copy()
        _valid = train.iloc[val_idx].copy()
        for c in lists:
            _train, _valid, = target_encode(_train, _valid, c, target="y", kfold=5, smooth=20, agg='mean')
            _train = count_encoding(_train, c)
            _valid = count_encoding(_valid, c)
            # test = count_encoding(test, c)
        model = LGBMRegressor(**params)
        model.fit(_train.drop(columns=['y']), _train.y, eval_set=(_valid.drop(columns=['y']), _valid.y))
        oof[val_idx] = model.predict(_valid.drop(columns=['y']))
        # test_preds += inverse_y(model.predict(test.drop(columns=['y'])))
        score = root_mean_squared_error(inverse_y(_valid.y), inverse_y(model.predict(_valid.drop(columns=['y']))))
        all_score += score
    # test_preds /= FOLD
    return all_score / FOLD

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=1000)

[I 2025-01-27 16:41:10,367] A new study created in memory with name: no-name-6fefe52f-8698-43c0-bb04-950a8a62af47
[I 2025-01-27 16:41:13,815] Trial 0 finished with value: 1.3865179678294903 and parameters: {'lambda_l1': 4.291369070790316e-07, 'lambda_l2': 0.4543403746746668, 'num_leaves': 36.68529970552415, 'feature_fraction': 0.917276993258409, 'bagging_fraction': 0.9514872393954854, 'bagging_freq': 2, 'min_child_samples': 81}. Best is trial 0 with value: 1.3865179678294903.
[I 2025-01-27 16:41:17,041] Trial 1 finished with value: 1.5874720254565216 and parameters: {'lambda_l1': 1.2100074260608613, 'lambda_l2': 0.7495830913218663, 'num_leaves': 149.00954191431055, 'feature_fraction': 0.9961484128210615, 'bagging_fraction': 0.7837434788891737, 'bagging_freq': 7, 'min_child_samples': 98}. Best is trial 0 with value: 1.3865179678294903.
[I 2025-01-27 16:41:20,323] Trial 2 finished with value: 1.6262273997350647 and parameters: {'lambda_l1': 9.503103823563813e-08, 'lambda_l2': 0.000445766

In [31]:
study.best_params

{'lambda_l1': 0.2221889068209195,
 'lambda_l2': 4.46320292574424e-07,
 'num_leaves': 156.47891247360317,
 'feature_fraction': 0.9885912050584685,
 'bagging_fraction': 0.9999427671968968,
 'bagging_freq': 3,
 'min_child_samples': 45}

In [None]:
import pickle

pickle.dump(lists, open("lists.pkl", "wb"))

In [15]:
lists

[['battery_capacity', 'drivetrain', 'manufacturer', 'model'],
 ['accident_history',
  'battery_capacity',
  'drivetrain',
  'manufacturer',
  'mileage',
  'model',
  'vehicle_condition',
  'warranty_period',
  'year_of_manufacture'],
 ['vehicle_condition', 'year_of_manufacture'],
 ['mileage'],
 ['drivetrain', 'model', 'year_of_manufacture'],
 ['manufacturer', 'mileage', 'vehicle_condition', 'warranty_period'],
 ['model', 'vehicle_condition', 'year_of_manufacture'],
 ['drivetrain', 'manufacturer', 'model', 'vehicle_condition']]

In [161]:
pred=inverse_y(model.predict(test.drop(columns=['y'])))

In [70]:
import optuna
from optuna.trial import Trial
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import KFold

def objective(trial: Trial, X, y):
    param = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.05, 0.3),
        'depth': trial.suggest_int('depth', 3, 10),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-3, 10),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
        'random_strength': trial.suggest_float('random_strength', 0.0, 1.0),
        'od_type': 'Iter',
        'od_wait': 25,
        'loss_function': 'RMSE',
        'eval_metric': 'RMSE',
        'random_seed': SEED,
        'verbose': 0,
        # gpu
        'task_type': 'GPU',
        'devices': '0',
    }
    
    FOLD = 10
    kf = KFold(n_splits=FOLD, shuffle=True, random_state=SEED)
    oof = np.zeros(len(X))
    rmse_list = []

    for trn_idx, val_idx in kf.split(X):
        _train = train.iloc[trn_idx].copy()
        _valid = train.iloc[val_idx].copy()
        for c in lists:
            _train, _valid, test = target_encode_w_t(_train, _valid, test, c, target="y", kfold=5, smooth=20, agg='mean')
            _train = count_encoding(_train, c)
            _valid = count_encoding(_valid, c)
            test = count_encoding(test, c)
        model.fit(_train.drop(columns=['y']), _train.y, eval_set=(_valid.drop(columns=['y']), _valid.y), use_best_model=True, verbose=0)
        oof[val_idx] = model.predict(_valid.drop(columns=['y']))

        print(root_mean_squared_error(inverse_y(_valid.y), inverse_y(model.predict(_valid.drop(columns=['y'])))))
        X_train, X_val = X.iloc[trn_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[trn_idx], y.iloc[val_idx]

        model = CatBoostRegressor(**param)
        model.fit(X_train[FEATURES], y_train)
        oof[val_idx] = model.predict(X_val[FEATURES])

        rmse = root_mean_squared_error(y_val, oof[val_idx])
        rmse_list.append(rmse)

    return np.mean(rmse_list)

FOLD = 10
study = optuna.create_study(direction="minimize")
study.optimize(lambda trial: objective(trial, train.drop(columns=['y']), train['y']), n_trials=100)

[I 2025-01-24 03:05:28,824] A new study created in memory with name: no-name-19d85c79-b494-4050-b4d6-42c56035e4e5
[I 2025-01-24 03:06:21,081] Trial 0 finished with value: 1.4336209132336926 and parameters: {'iterations': 240, 'learning_rate': 0.26232162541778403, 'depth': 10, 'l2_leaf_reg': 9.583951419063254, 'bagging_temperature': 0.21712863437845042, 'random_strength': 0.9187820337363163}. Best is trial 0 with value: 1.4336209132336926.
[I 2025-01-24 03:06:33,790] Trial 1 finished with value: 1.423427341743748 and parameters: {'iterations': 141, 'learning_rate': 0.15145677901086618, 'depth': 9, 'l2_leaf_reg': 0.02095504944287425, 'bagging_temperature': 0.8294256608081401, 'random_strength': 0.14991661507838228}. Best is trial 1 with value: 1.423427341743748.
[I 2025-01-24 03:07:25,680] Trial 2 finished with value: 1.4528405185138134 and parameters: {'iterations': 645, 'learning_rate': 0.2596214135932194, 'depth': 3, 'l2_leaf_reg': 1.2677780893002668, 'bagging_temperature': 0.65310750

In [43]:
best_params=study.best_params
best_params['num_leaves'] = int(best_params['num_leaves'])


FOLD = 10
kf = KFold(n_splits=FOLD, shuffle=True, random_state=SEED)
oof = np.zeros(len(train))
test_preds = np.zeros(len(test))
all_score = 0

for trn_idx, val_idx in kf.split(train):
    _train = train.iloc[trn_idx].copy()
    _valid = train.iloc[val_idx].copy()
    for c in lists:
        _train, _valid, test = target_encode_w_t(_train, _valid, test, c, target="y", kfold=5, smooth=20, agg='mean')
        _train = count_encoding(_train, c)
        _valid = count_encoding(_valid, c)
        test = count_encoding(test, c)
    model = LGBMRegressor(**best_params, verbose=-1)
    model.fit(_train.drop(columns=['y']), _train.y, eval_set=(_valid.drop(columns=['y']), _valid.y))
    oof[val_idx] = model.predict(_valid.drop(columns=['y']))
    test_preds += inverse_y(model.predict(test.drop(columns=['y'])))
    score = root_mean_squared_error(inverse_y(_valid.y), inverse_y(model.predict(_valid.drop(columns=['y']))))
    all_score += score
test_preds /= FOLD
print(all_score / FOLD)

1.337392828192564


In [44]:
test_preds

array([130.67152854,  80.08229309,  65.04049003,  34.92735563,
        47.86918713, 100.35448792,  21.96969311,  40.17025075,
        81.61437283,  24.92789663,  63.9086853 ,  62.51293404,
        40.24113355, 100.36096778,  90.89722592,  22.95325205,
       125.84175522,  52.54032683,  60.05157605,  34.57381566,
        37.76036736, 160.27288101,  47.06715436,  55.81972208,
        23.03286845,  47.80079599,  78.73695068,  80.29385897,
        27.90781129, 125.83859151,  63.82604063,  40.87423005,
       114.7300918 ,  88.21205822,  35.62491242,  80.0743183 ,
        21.40788243,  39.9719322 ,  24.64900545, 121.2190105 ,
       126.76725913,  18.94603294, 120.43542157,  60.68201072,
        23.05243351, 100.27305417,  21.40340318,  74.04195055,
        62.98335324,  77.85701875,  35.61744324,  17.34354082,
        38.73539668, 126.52143074,  23.70568249,  27.870802  ,
       155.97994265,  41.13782987, 159.50034014,  40.20639045,
        55.76414138, 125.05146534,  40.77631746, 120.92

In [23]:
# 'iterations': 662,
#  'learning_rate': 0.25188255202982335,
#  'depth': 10,
#  'l2_leaf_reg': 6.234514299519463,
#  'bagging_temperature': 0.17940146832820428,
#  'random_strength': 0.603274586005211
model = CatBoostRegressor(
    iterations=662,
    learning_rate=0.25188255202982335,
    depth=10,
    l2_leaf_reg=6.234514299519463,
    bagging_temperature=0.17940146832820428,
    random_strength=0.603274586005211,
    loss_function='RMSE',
    eval_metric='RMSE',
    random_seed=SEED,
    verbose=0,
    # gpu
    # task_type='GPU',
    # devices='0',
)
model.fit(train[FEATURES], train['y'])

<catboost.core.CatBoostRegressor at 0x1506daf30>

In [24]:
root_mean_squared_error(inverse_y(train['y']), inverse_y(model.predict(train[FEATURES])))

0.731412711799948

In [None]:
import autogluon
from autogluon.tabular import TabularPredictor

predictor = TabularPredictor(label='y', path='autogluon').fit(train)
predictor.leaderboard(train, silent=True)

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.12.5
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 23.5.0: Wed May  1 20:12:58 PDT 2024; root:xnu-10063.121.3~5/RELEASE_ARM64_T6000
CPU Count:          8
Memory Avail:       4.70 GB / 16.00 GB (29.3%)
Disk Space Avail:   68.78 GB / 460.43 GB (14.9%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets. Defaulting to `'medium'`...
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='experimental' : New in v1.2: Pre-trained foundation model + parallel fits. The absolute best accuracy without consideration for inference speed. Does not support GPU.
	presets='best'         : Maximize accuracy. Recommended for most users. Use in competitions and benchmarks.
	presets='high'         : Strong accuracy with fast inference speed.

[1000]	valid_set's rmse: 0.0478121
[2000]	valid_set's rmse: 0.0468895
[3000]	valid_set's rmse: 0.0467364


	-0.0467	 = Validation score   (-root_mean_squared_error)
	12.31s	 = Training   runtime
	0.03s	 = Validation runtime
Fitting model: LightGBM ...
	-0.0356	 = Validation score   (-root_mean_squared_error)
	1.96s	 = Training   runtime
	0.0s	 = Validation runtime
Fitting model: RandomForestMSE ...
	-0.0417	 = Validation score   (-root_mean_squared_error)
	0.61s	 = Training   runtime
	0.04s	 = Validation runtime
Fitting model: CatBoost ...
	-0.0367	 = Validation score   (-root_mean_squared_error)
	1.67s	 = Training   runtime
	0.0s	 = Validation runtime
Fitting model: ExtraTreesMSE ...
	-0.0379	 = Validation score   (-root_mean_squared_error)
	0.35s	 = Training   runtime
	0.04s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
	-0.1098	 = Validation score   (-root_mean_squared_error)
	3.71s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: XGBoost ...


In [70]:
pred = inverse_y(predictor.predict(test))

In [29]:
inverse_y(model.predict(test[FEATURES]))

array([130.37851996,  79.85348567,  65.15441029,  34.04055656,
        48.23441033,  98.61992078,  21.69589585,  40.25441253,
        81.172478  ,  24.47056799,  64.01855012,  62.44368713,
        40.3486117 , 100.31765675,  90.57249605,  22.98856792,
       124.74155467,  52.69258071,  59.58681537,  34.74969411,
        37.81660187, 160.38244773,  46.83663052,  55.8650169 ,
        23.16441216,  48.38069957,  78.90382265,  80.11770851,
        27.84531621, 125.55584355,  63.92827509,  40.60824634,
       114.47399864,  88.56370664,  36.10235168,  79.99900088,
        21.61100714,  39.70042024,  24.72250782, 120.93789024,
       130.17407429,  17.640366  , 119.35523627,  60.60107424,
        23.10032829, 100.18088333,  21.86410898,  73.94977158,
        62.68400416,  78.0026252 ,  35.69812347,  15.72612913,
        38.58179846, 125.72553111,  23.76634716,  27.63870952,
       156.15351987,  41.20418345, 160.09433979,  40.18803309,
        55.82053358, 126.77380494,  41.10281272, 121.03

# Encoding

In [62]:
# one hot encoding
train = pd.get_dummies(train, columns=CATS)
test = pd.get_dummies(test, columns=CATS)
train_X, valid_X, train_y, valid_y = train_test_split(train.drop("y", axis=1), train["y"], test_size=0.2, random_state=SEED)

In [64]:
train_X

Unnamed: 0,battery_capacity,mileage,warranty_period,year_of_manufacture,manufacturer_0,manufacturer_1,manufacturer_2,manufacturer_3,manufacturer_4,manufacturer_5,manufacturer_6,model_0,model_1,model_2,model_3,model_4,model_5,model_6,model_7,model_8,model_9,model_10,model_11,model_12,model_13,model_14,model_15,model_16,model_17,model_18,model_19,model_20,vehicle_condition_0,vehicle_condition_1,vehicle_condition_2,drivetrain_0,drivetrain_1,drivetrain_2,accident_history_0,accident_history_1
7330,56.359001,52301,1,0,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,True,False
6329,0.000000,5734,9,0,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,True,False,True,False
1298,46.000000,93446,1,0,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,True,False
426,99.800003,612,8,0,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,True,False
1600,0.000000,1424,10,0,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5191,76.093002,69056,2,0,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,True,False
5226,0.000000,232,10,0,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,True,False
5390,60.271999,161594,2,0,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,True,False
860,68.487999,64796,2,0,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,True,False,True,False


In [63]:
model.fit(train_X, train_y)
root_mean_squared_error(inverse_y(valid_y), inverse_y(model.predict(valid_X)))

np.float64(1385.774496101624)

In [47]:
display(train)
display(test)

Unnamed: 0,battery_capacity,mileage,warranty_period,year_of_manufacture,y,manufacturer_0,manufacturer_1,manufacturer_2,manufacturer_3,manufacturer_4,manufacturer_5,manufacturer_6,model_0,model_1,model_2,model_3,model_4,model_5,model_6,model_7,model_8,model_9,model_10,model_11,model_12,model_13,model_14,model_15,model_16,model_17,model_18,model_19,model_20,vehicle_condition_0,vehicle_condition_1,vehicle_condition_2,drivetrain_0,drivetrain_1,drivetrain_2,accident_history_0,accident_history_1
0,86.077003,13642,0,2,159.66,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,True,False
1,56.000000,10199,6,0,28.01,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,True,False,True,False
2,91.199997,2361,7,0,66.27,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,True,False
3,0.000000,21683,3,0,99.16,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,True,False
4,61.018002,178205,1,0,62.02,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7492,0.000000,3773,10,0,35.95,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,True,False,False,True,False
7493,46.000000,135411,2,0,23.40,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,True,True,False
7494,0.000000,1363,2,0,120.00,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,True,False
7495,56.000000,39445,6,2,24.00,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,True,True,False


Unnamed: 0,battery_capacity,mileage,warranty_period,year_of_manufacture,y,manufacturer_0,manufacturer_1,manufacturer_2,manufacturer_3,manufacturer_4,manufacturer_5,manufacturer_6,model_0,model_1,model_2,model_3,model_4,model_5,model_6,model_7,model_8,model_9,model_10,model_11,model_12,model_13,model_14,model_15,model_16,model_17,model_18,model_19,model_20,vehicle_condition_0,vehicle_condition_1,vehicle_condition_2,drivetrain_0,drivetrain_1,drivetrain_2,accident_history_0,accident_history_1
0,76.093002,14057,2,0,,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,True,False
1,90.000000,7547,8,0,,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,True,False,False,True,False
2,0.000000,7197,7,0,,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,True,False,True
3,68.478996,10357,7,1,,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,True,False,False,True,False
4,0.000000,7597,10,0,,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
841,64.682999,117298,2,0,,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,True,False
842,55.547001,72308,0,0,,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,True,False
843,55.547001,124537,0,0,,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,True,False
844,69.646004,15629,4,0,,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,True,False


In [58]:
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import xgboost as xgb, time

model = XGBRegressor(
    n_estimators=100,
    objective="reg:squarederror",
    random_state=SEED,
    tree_method='gpu_hist',
    gpu_id=0,
    predictor="gpu_predictor",
    verbose=0
)
model = LGBMRegressor(
    n_estimators=100,
    random_state=SEED,
    device="gpu",
    gpu_device_id=0,
)

In [30]:
submission = pd.read_csv('./data/sample_submission.csv')
submission['가격(백만원)'] = test_preds
submission.to_csv('submission.csv', index=False)