In [1]:
import pandas as pd
import numpy as np

from sklearn import model_selection
from sklearn import preprocessing

from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from xgboost import XGBRegressor

import optuna

In [2]:
# Read the competition train / test / sample-submission CSVs once; later cells
# take .copy() of these frames instead of re-reading from disk.
train_original, test_original, sample_original = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/30-days-of-ml/train.csv",
        "../input/30-days-of-ml/test.csv",
        "../input/30-days-of-ml/sample_submission.csv",
    )
)

In [3]:
# Peek at the first rows of the raw training frame (id, cat*, cont*, target).
train_original.head(n=5)

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,target
0,1,B,B,B,C,B,B,A,E,C,...,0.400361,0.160266,0.310921,0.38947,0.267559,0.237281,0.377873,0.322401,0.86985,8.113634
1,2,B,B,A,A,B,D,A,F,A,...,0.533087,0.558922,0.516294,0.594928,0.341439,0.906013,0.921701,0.261975,0.465083,8.481233
2,3,A,A,A,C,B,D,A,D,A,...,0.650609,0.375348,0.902567,0.555205,0.843531,0.748809,0.620126,0.541474,0.763846,8.364351
3,4,B,B,A,C,B,D,A,E,C,...,0.66898,0.239061,0.732948,0.679618,0.574844,0.34601,0.71461,0.54015,0.280682,8.049253
4,6,A,A,A,C,B,D,A,E,A,...,0.686964,0.420667,0.648182,0.684501,0.956692,1.000773,0.776742,0.625849,0.250823,7.97226


In [4]:
# Peek at the submission template (id, target).
sample_original.head(n=5)

Unnamed: 0,id,target
0,0,0.5
1,5,0.5
2,15,0.5
3,16,0.5
4,17,0.5


In [5]:
# Number of cross-validation folds used by every CV loop in this notebook.
n_splits = 5
# Placeholder fold id; -1 marks rows that have not been assigned to a fold yet.
train_original["kfold"] = -1

In [6]:
# Assign every training row to exactly one validation fold (seeded, shuffled).
# KFold.split yields *positional* row indices, so the fold id is written through
# .iloc; label-based .loc only gives the same result while the frame still has
# its default RangeIndex.
kf = model_selection.KFold(n_splits=n_splits, shuffle=True, random_state=0)
kfold_col_pos = train_original.columns.get_loc("kfold")
for fold, (train_indices, valid_indices) in enumerate(kf.split(X=train_original)):
    train_original.iloc[valid_indices, kfold_col_pos] = fold

In [7]:
# Level-0 model #1: XGBoost regressor trained per fold on ordinal-encoded features.
# Writes predictions on each fold's held-out rows to train_pred_1.csv and the
# fold-averaged test predictions to test_pred_1.csv.

# Fresh copies so re-running this cell never mutates the originally loaded frames.
train = train_original.copy()
test = test_original.copy()
sample = sample_original.copy()

useful_features = [c for c in train.columns if c not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if 'cat' in col]
numerical_cols = [col for col in useful_features if col.startswith("cont")]
test = test[useful_features]

# Hyper-parameters are identical for every fold, so they are defined once.
# NOTE(review): values look like tuner output (optuna is imported) — confirm provenance.
params = {
    'random_state': 1,
    'booster': 'gbtree',
    'n_estimators': 10000,
    'learning_rate': 0.03628302216953097,
    'reg_lambda': 0.0008746338866473539,
    'reg_alpha': 23.13181079976304,
    'subsample': 0.7875490025178415,
    'colsample_bytree': 0.11807135201147481,
    'max_depth': 3
}

scores = []                    # per-fold validation RMSE
final_test_predictions = []    # one test-prediction vector per fold
final_valid_predictions = {}   # row id -> prediction made while that row was held out

for fold in range(n_splits):
    xtrain = train[train.kfold != fold].reset_index(drop=True)
    xvalid = train[train.kfold == fold].reset_index(drop=True)
    xtest = test.copy()

    valid_ids = xvalid.id.values.tolist()

    ytrain = xtrain.target
    yvalid = xvalid.target

    # .copy() makes these independent frames, so the in-place column assignments
    # below do not write into a slice (avoids SettingWithCopyWarning).
    xtrain = xtrain[useful_features].copy()
    xvalid = xvalid[useful_features].copy()

    # Fit the encoder on the training fold only, then reuse it for valid/test.
    ordinal_encoder = preprocessing.OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])

    model = XGBRegressor(
        n_jobs=4,
        **params
    )
    # NOTE(review): recent XGBoost releases expect early_stopping_rounds on the
    # estimator constructor rather than fit(); kept here to match the version
    # this notebook was run with — confirm before upgrading.
    model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)

    preds_valid = model.predict(xvalid)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))

    test_preds = model.predict(xtest)
    final_test_predictions.append(test_preds)

    # sqrt(MSE) instead of squared=False: that keyword is deprecated and later
    # removed in newer scikit-learn, while this form works on every version.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    scores.append(rmse)
    print(fold, rmse)

print(np.mean(scores), np.std(scores))
final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id", "pred_1"]
final_valid_predictions.to_csv("train_pred_1.csv", index=False)

# Average the per-fold test predictions row-wise.
preds = np.mean(np.column_stack(final_test_predictions), axis=1)
output = sample.copy()
# Explicit item assignment; attribute assignment only works when the column already exists.
output["target"] = preds
output.columns = ["id", "pred_1"]
output.to_csv('test_pred_1.csv', index=False)

[0]	validation_0-rmse:7.49541
[1000]	validation_0-rmse:0.72544
[2000]	validation_0-rmse:0.72135
[3000]	validation_0-rmse:0.71969
[4000]	validation_0-rmse:0.71896
[5000]	validation_0-rmse:0.71853
[6000]	validation_0-rmse:0.71832
[6709]	validation_0-rmse:0.71827
0 0.7182530328034008
[0]	validation_0-rmse:7.49834
[1000]	validation_0-rmse:0.72264
[2000]	validation_0-rmse:0.71848
[3000]	validation_0-rmse:0.71685
[4000]	validation_0-rmse:0.71609
[5000]	validation_0-rmse:0.71573
[6000]	validation_0-rmse:0.71553
[7000]	validation_0-rmse:0.71547
[7158]	validation_0-rmse:0.71546
1 0.7154590636159889
[0]	validation_0-rmse:7.50018
[1000]	validation_0-rmse:0.72792
[2000]	validation_0-rmse:0.72347
[3000]	validation_0-rmse:0.72169
[4000]	validation_0-rmse:0.72086
[5000]	validation_0-rmse:0.72038
[6000]	validation_0-rmse:0.72015
[7000]	validation_0-rmse:0.72010
[8000]	validation_0-rmse:0.72006
[8762]	validation_0-rmse:0.72005
2 0.7200347664314769
[0]	validation_0-rmse:7.49680
[1000]	validation_0-rmse:

In [8]:
# Level-0 model #2: same per-fold XGBoost pipeline with a second hyper-parameter
# set and a fold-dependent random_state. Writes predictions on each fold's
# held-out rows to train_pred_2.csv and the fold-averaged test predictions to
# test_pred_2.csv.

# Fresh copies so re-running this cell never mutates the originally loaded frames.
train = train_original.copy()
test = test_original.copy()
sample = sample_original.copy()

useful_features = [c for c in train.columns if c not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if 'cat' in col]
numerical_cols = [col for col in useful_features if col.startswith("cont")]
test = test[useful_features]

# Hyper-parameters are identical for every fold, so they are defined once.
# NOTE(review): values look like tuner output (optuna is imported) — confirm provenance.
params = {
    'learning_rate': 0.07853392035787837,
    'reg_lambda': 1.7549293092194938e-05,
    'reg_alpha': 14.68267919457715,
    'subsample': 0.8031450486786944,
    'colsample_bytree': 0.170759104940733,
    'max_depth': 3
}

scores = []                    # per-fold validation RMSE
final_test_predictions = []    # one test-prediction vector per fold
final_valid_predictions = {}   # row id -> prediction made while that row was held out

for fold in range(n_splits):
    xtrain = train[train.kfold != fold].reset_index(drop=True)
    xvalid = train[train.kfold == fold].reset_index(drop=True)
    xtest = test.copy()

    valid_ids = xvalid.id.values.tolist()

    ytrain = xtrain.target
    yvalid = xvalid.target

    # .copy() makes these independent frames, so the in-place column assignments
    # below do not write into a slice (avoids SettingWithCopyWarning).
    xtrain = xtrain[useful_features].copy()
    xvalid = xvalid[useful_features].copy()

    # Fit the encoder on the training fold only, then reuse it for valid/test.
    ordinal_encoder = preprocessing.OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])

    model = XGBRegressor(
        random_state=fold,  # a different seed per fold
        n_jobs=4,
        n_estimators=5000,
        **params
    )
    # NOTE(review): recent XGBoost releases expect early_stopping_rounds on the
    # estimator constructor rather than fit(); kept here to match the version
    # this notebook was run with — confirm before upgrading.
    model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)

    preds_valid = model.predict(xvalid)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))

    test_preds = model.predict(xtest)
    final_test_predictions.append(test_preds)

    # sqrt(MSE) instead of squared=False: that keyword is deprecated and later
    # removed in newer scikit-learn, while this form works on every version.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    scores.append(rmse)
    print(fold, rmse)

print(np.mean(scores), np.std(scores))
final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id", "pred_2"]
final_valid_predictions.to_csv("train_pred_2.csv", index=False)

# Average the per-fold test predictions row-wise.
preds = np.mean(np.column_stack(final_test_predictions), axis=1)
output = sample.copy()
# Explicit item assignment; attribute assignment only works when the column already exists.
output["target"] = preds
output.columns = ["id", "pred_2"]
output.to_csv('test_pred_2.csv', index=False)

[0]	validation_0-rmse:7.17003
[1000]	validation_0-rmse:0.72041
[2000]	validation_0-rmse:0.71875
[3000]	validation_0-rmse:0.71859
[3121]	validation_0-rmse:0.71859
0 0.7185601952284657
[0]	validation_0-rmse:7.17290
[1000]	validation_0-rmse:0.71777
[2000]	validation_0-rmse:0.71597
[2708]	validation_0-rmse:0.71587
1 0.7158067330816605
[0]	validation_0-rmse:7.17481
[1000]	validation_0-rmse:0.72251
[2000]	validation_0-rmse:0.72051
[3000]	validation_0-rmse:0.72027
[3008]	validation_0-rmse:0.72028
2 0.7202169524733759
[0]	validation_0-rmse:7.17135
[1000]	validation_0-rmse:0.71640
[2000]	validation_0-rmse:0.71465
[3000]	validation_0-rmse:0.71442
[3156]	validation_0-rmse:0.71445
3 0.7143801710391202
[0]	validation_0-rmse:7.17550
[1000]	validation_0-rmse:0.71691
[2000]	validation_0-rmse:0.71517
[3000]	validation_0-rmse:0.71489
[3178]	validation_0-rmse:0.71492
4 0.714855266280136
0.7167638636205517 0.002253506728198582


In [9]:
# Build the blender's inputs: reload the prediction files written by the two
# XGBoost cells and left-join them onto fresh copies of train / test by id.
sample = sample_original.copy()

train_1 = pd.read_csv("train_pred_1.csv")
train_2 = pd.read_csv("train_pred_2.csv")
test_1 = pd.read_csv("test_pred_1.csv")
test_2 = pd.read_csv("test_pred_2.csv")

train = (
    train_original.copy()
    .merge(train_1, on="id", how="left")
    .merge(train_2, on="id", how="left")
)
test = (
    test_original.copy()
    .merge(test_1, on="id", how="left")
    .merge(test_2, on="id", how="left")
)

# Show the merged frame; pred_1 / pred_2 appear as the last columns.
train.head()

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont8,cont9,cont10,cont11,cont12,cont13,target,kfold,pred_1,pred_2
0,1,B,B,B,C,B,B,A,E,C,...,0.38947,0.267559,0.237281,0.377873,0.322401,0.86985,8.113634,1,8.425802,8.454543
1,2,B,B,A,A,B,D,A,F,A,...,0.594928,0.341439,0.906013,0.921701,0.261975,0.465083,8.481233,3,8.409385,8.373771
2,3,A,A,A,C,B,D,A,D,A,...,0.555205,0.843531,0.748809,0.620126,0.541474,0.763846,8.364351,1,8.219944,8.240151
3,4,B,B,A,C,B,D,A,E,C,...,0.679618,0.574844,0.34601,0.71461,0.54015,0.280682,8.049253,3,8.382401,8.381382
4,6,A,A,A,C,B,D,A,E,A,...,0.684501,0.956692,1.000773,0.776742,0.625849,0.250823,7.97226,4,8.265082,8.268334


In [10]:
# Level-1 blender: a LinearRegression fitted per fold on the two level-0
# prediction columns. Collects one test-prediction vector per fold in
# final_predictions and reports the per-fold validation RMSE.
useful_features = ["pred_1", "pred_2"]
test = test[useful_features]

scores = []             # per-fold validation RMSE
final_predictions = []  # one test-prediction vector per fold
for fold in range(n_splits):
    xtrain = train[train.kfold != fold].reset_index(drop=True)
    xvalid = train[train.kfold == fold].reset_index(drop=True)
    # predict() does not modify its input, so no per-fold copy of test is needed.
    xtest = test

    ytrain = xtrain.target
    yvalid = xvalid.target

    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    model = LinearRegression()
    model.fit(xtrain, ytrain)

    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_predictions.append(test_preds)

    # sqrt(MSE) instead of squared=False: that keyword is deprecated and later
    # removed in newer scikit-learn, while this form works on every version.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    scores.append(rmse)
    print(fold, rmse)

print(np.mean(scores), np.std(scores))

0 0.7182192192859552
1 0.7154398961804369
2 0.7199779894025019
3 0.7139535023382242
4 0.7144882110386158
0.7164157636491468 0.0023096498305119686


In [11]:
# Average the per-fold blender predictions for each test row, place them in the
# submission template's target column, and write the final submission file.
preds = np.column_stack(final_predictions).mean(axis=1)
output = sample.assign(target=preds)
output.to_csv('submission_blending.csv', index=False)