In [1]:
import pandas as pd
import numpy as np
import time
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from hyperopt.pyll.base import scope
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from google.colab import drive
# Mount Google Drive at /content/drive (Colab only).
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Directory on the mounted drive that holds the competition CSV files.
path = '/content/drive/Shareddrives/dacon/Playground/'

# Read the three CSVs in one pass; each is indexed by its `id` column.
train, test, submission = (
    pd.read_csv(path + file_name, index_col=['id'])
    for file_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Keep a reference to the label column of the training frame.
target = train['target']

In [3]:
# Preview the first rows of the raw training frame.
train.head()

Unnamed: 0_level_0,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,cont0,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
1,A,B,A,A,B,D,A,E,C,I,0.923191,0.684968,0.124454,0.217886,0.281421,0.881122,0.42165,0.741413,0.895799,0.802461,0.724417,0.701915,0.877618,0.719903,6.994023
2,B,A,A,A,B,B,A,E,A,F,0.437627,0.014213,0.357438,0.846127,0.282354,0.440011,0.34623,0.278495,0.593413,0.546056,0.613252,0.741289,0.326679,0.808464,8.071256
3,A,A,A,C,B,D,A,B,C,N,0.732209,0.760122,0.454644,0.81299,0.293756,0.914155,0.369602,0.832564,0.86562,0.825251,0.264104,0.695561,0.869133,0.828352,5.760456
4,A,A,A,C,B,D,A,E,G,K,0.705142,0.771678,0.153735,0.732893,0.769785,0.934138,0.57893,0.407313,0.868099,0.794402,0.494269,0.698125,0.809799,0.614766,7.806457
6,A,B,A,A,B,B,A,E,C,F,0.486063,0.639349,0.496212,0.354186,0.279105,0.3826,0.70594,0.325193,0.440967,0.462146,0.724447,0.683073,0.343457,0.297743,6.868974


In [4]:
def preprocess(df, encoder=None,
               scaler=None, cols_to_drop=None,
               cols_to_encode=None, cols_to_scale=None):
    """
    Preprocess input data without modifying the frame that was passed in.

    Every column listed in ``cols_to_encode`` / ``cols_to_scale`` is handed to
    ``fit_transform`` on its own, so the encoder/scaler is refit per column.

    :param df: DataFrame with data (left untouched; a copy is transformed)
    :param encoder: encoder object with fit_transform method, or None to skip
    :param scaler: scaler object with fit_transform method, or None to skip
    :param cols_to_drop: columns to be removed, or None to keep all columns
    :param cols_to_encode: columns to be encoded, or None for no columns
    :param cols_to_scale: columns to be scaled, or None for no columns
    :return: new DataFrame with the requested steps applied
    """
    # Work on a copy: the column assignments below would otherwise overwrite
    # the caller's DataFrame as a hidden side effect.
    df = df.copy()

    # A missing column list means "nothing to do" rather than a TypeError.
    cols_to_encode = [] if cols_to_encode is None else cols_to_encode
    cols_to_scale = [] if cols_to_scale is None else cols_to_scale

    if encoder is not None:
        for col in cols_to_encode:
            df[col] = encoder.fit_transform(df[col])

    if scaler is not None:
        for col in cols_to_scale:
            # The scaler is fed a single-column 2-D array; flatten its result
            # back to 1-D before storing it as one column.
            scaled = scaler.fit_transform(df[col].values.reshape(-1, 1))
            df[col] = np.asarray(scaled).reshape(-1)

    if cols_to_drop is not None and len(cols_to_drop) > 0:
        df = df.drop(cols_to_drop, axis=1)

    return df

In [5]:
cat_cols = ['cat' + str(i) for i in range(10)]
cont_cols = ['cont' + str(i) for i in range(14)]

# Drop the label from the features; work on fresh frames so the column
# assignments below do not touch the frames loaded from disk.
train = preprocess(train, cols_to_drop=['target'])
test = test.copy()

# Train and test must share one mapping per column. Fitting a separate
# encoder/scaler on each frame can turn the same raw value into different
# model inputs in train and in test.
for col in cat_cols:
    # Learn the label vocabulary from both frames so a category that only
    # occurs in test cannot make transform() fail.
    col_encoder = LabelEncoder().fit(pd.concat([train[col], test[col]]))
    train[col] = col_encoder.transform(train[col])
    test[col] = col_encoder.transform(test[col])

# Standardise with the training statistics only and reuse them for test.
cont_scaler = StandardScaler().fit(train[cont_cols])
train[cont_cols] = cont_scaler.transform(train[cont_cols])
test[cont_cols] = cont_scaler.transform(test[cont_cols])

In [12]:
# Display the preprocessed training frame (target column removed).
train

Unnamed: 0_level_0,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,cont0,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
1,0,1,0,0,1,3,0,4,2,8,1.945372,0.757938,-1.597601,-0.956673,-0.867238,1.609527,-0.294623,1.396184,1.978227,1.350628,1.088938,0.744943,1.598382,0.961533
2,1,0,0,0,1,1,0,4,0,5,-0.424682,-2.093075,-0.433193,1.675598,-0.862591,-0.295002,-0.686557,-0.867560,0.607739,0.093896,0.537385,0.915496,-0.895782,1.354760
3,0,0,0,2,1,3,0,1,2,13,1.013181,1.077375,0.052623,1.536759,-0.805777,1.752148,-0.565102,1.841928,1.841445,1.462334,-1.194944,0.717420,1.559967,1.443062
4,0,0,0,2,1,3,0,4,6,10,0.881067,1.126494,-1.451257,1.201160,1.566127,1.838425,0.522703,-0.237621,1.852684,1.311128,-0.052959,0.728526,1.291356,0.494710
6,0,1,0,0,1,1,0,4,2,5,-0.188265,0.564033,0.260374,-0.385590,-0.878778,-0.542878,1.182729,-0.639198,-0.083184,-0.317379,1.089089,0.663325,-0.819827,-0.912921
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499993,0,1,0,2,1,1,0,4,4,11,-1.288194,0.874697,-1.411648,-0.014330,1.559461,-1.030856,-1.141653,-0.451394,-0.719276,-0.913531,-1.335816,-0.634418,-1.400337,1.288893
499996,0,1,0,2,1,1,0,4,4,11,-1.714865,-1.636671,0.741668,-1.057966,1.596853,-1.343305,-1.150129,0.578991,-1.052778,-1.001429,-1.085836,-0.888263,-0.985174,-1.209708
499997,0,1,0,2,1,1,0,4,2,12,-0.887267,0.472735,0.094338,-0.117804,-0.787592,-0.254110,-0.478991,0.099331,-1.465608,-0.118463,-1.335434,-0.923796,-1.241619,-0.685152
499998,0,1,1,2,1,1,0,3,4,5,0.364951,0.762555,1.083248,-1.085251,1.510603,-0.626940,-0.801389,-1.109514,-1.080635,-0.056830,-0.573322,-1.233235,-0.998925,-0.098622


In [6]:
class EnsembleModel:
    """
    Blend of one LightGBM regressor and one XGBoost regressor.
    """

    def __init__(self, params):
        """
        LGB + XGB model
        :param params: dict with the keys 'lgb' and 'xgb'; each value holds the
                       keyword arguments for LGBMRegressor / XGBRegressor
        """
        self.lgb_params = params['lgb']
        self.xgb_params = params['xgb']

        self.lgb_model = LGBMRegressor(**self.lgb_params)
        self.xgb_model = XGBRegressor(**self.xgb_params)

    def fit(self, x, y, *args, **kwargs):
        """
        Fit both models on the same data
        :param x: training features
        :param y: training target
        :param args: extra positional arguments, forwarded to both fit calls
        :param kwargs: extra keyword arguments, forwarded to both fit calls
        :return: tuple (result of lgb fit, result of xgb fit)
        """
        return (self.lgb_model.fit(x, y, *args, **kwargs),
                self.xgb_model.fit(x, y, *args, **kwargs))

    def predict(self, x, weights=(1.0, 1.0)):
        """
        Generate model predictions as a weighted average of both models
        :param x: data
        :param weights: two weights on model prediction, first one is the
                        weight on lgb model; they are normalised by their sum
        :return: array with predictions
        :raises ValueError: if weights does not hold exactly two values or the
                            two values sum to zero
        """
        if len(weights) != 2:
            raise ValueError("weights must contain exactly two values")

        lgb_weight, xgb_weight = weights
        total_weight = lgb_weight + xgb_weight
        if total_weight == 0:
            raise ValueError("weights must not sum to zero")

        # Divide by the sum of the weights, not a fixed 2, so the result is a
        # true weighted average for any weights (equal to /2 for the default).
        return (lgb_weight * self.lgb_model.predict(x) +
                xgb_weight * self.xgb_model.predict(x)) / total_weight

In [7]:
since = time.time()
columns = train.columns

# Fixed hyperparameters for the two models; no search is run in this cell.
ensemble_params = {
    "lgb" : {
        "num_leaves": 36,
        "max_depth": 21,
        'learning_rate': 0.049019854828962754,
        'min_split_gain': 0.2579555416739361,
        'min_child_samples': 500,
        # NOTE(review): LightGBM documents that bagging (subsample) only takes
        # effect when subsample_freq > 0 - confirm whether it is meant to be on.
        "subsample": 0.2595537456780356,
        "colsample_bytree": 0.6203517996970486,
        'reg_alpha': 0.33867231210286647,
        'reg_lambda': 42.071411120949854,
        'n_jobs': -1,
        'n_estimators': 5000},
    'xgb': {
        'max_depth': 13,
        'learning_rate': 0.020206705089028228,
        'gamma': 3.5746731812451156,
        'min_child_weight': 564,
        'n_estimators': 5000,
        'colsample_bytree': 0.5015940592112956,
        'subsample': 0.6839489639112909,
        'reg_lambda': 18.085502002853246,
        'reg_alpha': 0.17532087359570606,
        'objective': 'reg:squarederror',
        'tree_method': 'gpu_hist',
        'n_jobs': -1}
        }

# Select the feature columns once; the selection is identical on every fold,
# and test[columns] also puts the test columns in the training order.
X_train = train[columns]
X_test = test[columns]

preds = np.zeros(test.shape[0])
kf = KFold(n_splits=10, random_state=22, shuffle=True)
rmse = []

for n, (trn_idx, val_idx) in enumerate(kf.split(X_train, target)):

    X_tr, X_val = X_train.iloc[trn_idx], X_train.iloc[val_idx]
    y_tr, y_val = target.iloc[trn_idx], target.iloc[val_idx]

    # A fresh model per fold so nothing carries over between folds.
    model = EnsembleModel(ensemble_params)

    # The held-out fold serves both as the early-stopping set and, below, as
    # the set the fold RMSE is reported on.
    model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], early_stopping_rounds=100, verbose=False)

    # Test predictions are averaged over the folds.
    preds += model.predict(X_test) / kf.n_splits
    # Take the square root explicitly instead of passing squared=False, which
    # recent scikit-learn releases deprecate.
    rmse.append(np.sqrt(mean_squared_error(y_val, model.predict(X_val))))

    print(f"Fold {n+1}, RMSE: {rmse[n]}")


print("Mean RMSE: ", np.mean(rmse))
end_time = time.time() - since
print('Training complete in {:.0f}m {:.0f}s'.format(
        end_time // 60, end_time % 60))

Fold 1, RMSE: 0.8460301334075518
Fold 2, RMSE: 0.8390881263683205
Fold 3, RMSE: 0.8380555909146911
Fold 4, RMSE: 0.8453226257101901
Fold 5, RMSE: 0.8423290455645285
Fold 6, RMSE: 0.8458533424921761
Fold 7, RMSE: 0.8425633996040529
Fold 8, RMSE: 0.8437847024287801
Fold 9, RMSE: 0.8391813037470872
Fold 10, RMSE: 0.8446106243691848
Mean RMSE:  0.8426818894606564
Training complete in 6m 27s


In [10]:
# Store the fold-averaged test predictions as the submission's target column.
# NOTE(review): the assignment is positional - assumes sample_submission rows
# are in the same order as the rows of test; confirm.
submission['target'] = preds

In [11]:
# Write the submission (including its id index) to a local CSV, then copy that
# file into the "drive/My Drive/" directory with a shell command.
submission.to_csv("why.csv", index = True)
!cp why.csv "drive/My Drive/"