In [1]:
import pandas as pd
import numpy as np

from sklearn import model_selection
from sklearn import preprocessing

from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from xgboost import XGBRegressor

import optuna

In [2]:
# Read the competition's train, test and sample-submission tables.
train_original, test_original, sample = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/30-days-of-ml/train.csv",
        "../input/30-days-of-ml/test.csv",
        "../input/30-days-of-ml/sample_submission.csv",
    )
)

In [3]:
# Preview the first five training rows.
train_original.head(n=5)

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,target
0,1,B,B,B,C,B,B,A,E,C,...,0.400361,0.160266,0.310921,0.38947,0.267559,0.237281,0.377873,0.322401,0.86985,8.113634
1,2,B,B,A,A,B,D,A,F,A,...,0.533087,0.558922,0.516294,0.594928,0.341439,0.906013,0.921701,0.261975,0.465083,8.481233
2,3,A,A,A,C,B,D,A,D,A,...,0.650609,0.375348,0.902567,0.555205,0.843531,0.748809,0.620126,0.541474,0.763846,8.364351
3,4,B,B,A,C,B,D,A,E,C,...,0.66898,0.239061,0.732948,0.679618,0.574844,0.34601,0.71461,0.54015,0.280682,8.049253
4,6,A,A,A,C,B,D,A,E,A,...,0.686964,0.420667,0.648182,0.684501,0.956692,1.000773,0.776742,0.625849,0.250823,7.97226


In [4]:
# Preview the first five rows of the submission template.
sample.head(n=5)

Unnamed: 0,id,target
0,0,0.5
1,5,0.5
2,15,0.5
3,16,0.5
4,17,0.5


In [5]:
# Number of folds handed to KFold in the next cell.
n_splits = 5
# Placeholder fold id: every row starts at -1 until the KFold loop overwrites it.
train_original["kfold"] = -1

In [6]:
# Assign every training row to exactly one validation fold.
# KFold.split yields *positional* row indices, so they are written through
# .iloc; label-based .loc only gives the same result while the frame still
# carries its default RangeIndex.
kf = model_selection.KFold(n_splits=n_splits, shuffle=True, random_state=0)
kfold_position = train_original.columns.get_loc("kfold")
for fold, (train_indices, valid_indices) in enumerate(kf.split(X=train_original)):
    train_original.iloc[valid_indices, kfold_position] = fold

In [7]:
# Work on an independent copy so train_original keeps only the raw columns plus kfold.
train = train_original.copy(deep=True)

In [8]:
# Load the two earlier-stage prediction files from ../input/stacking and give
# their two columns stable names: id plus pred_1 / pred_2.
stacking_columns = ["pred_1", "pred_2"]

train_1 = pd.read_csv("../input/stacking/train_pred_1.csv")
train_1.columns = ["id", "pred_1"]
train_2 = pd.read_csv("../input/stacking/train_pred_2.csv")
train_2.columns = ["id", "pred_2"]

test_1 = pd.read_csv("../input/stacking/test_pred_1.csv")
test_1.columns = ["id", "pred_1"]
test_2 = pd.read_csv("../input/stacking/test_pred_2.csv")
test_2.columns = ["id", "pred_2"]

# Strip prediction columns left over from a previous execution of this cell.
# Without this, a re-run merges pred_1 / pred_2 a second time, pandas renames
# the colliding columns to pred_1_x / pred_1_y, and the later
# test_original[useful_features] lookup fails with a KeyError.
train = train.drop(columns=stacking_columns, errors="ignore")
test_original = test_original.drop(columns=stacking_columns, errors="ignore")

# Left-join on id so every original row is kept, in its original order.
for train_part, test_part in ((train_1, test_1), (train_2, test_2)):
    train = train.merge(train_part, on="id", how="left")
    test_original = test_original.merge(test_part, on="id", how="left")

In [9]:
# Check that pred_1 / pred_2 were attached to the training rows.
train.head(n=5)

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont8,cont9,cont10,cont11,cont12,cont13,target,kfold,pred_1,pred_2
0,1,B,B,B,C,B,B,A,E,C,...,0.38947,0.267559,0.237281,0.377873,0.322401,0.86985,8.113634,1,8.425802,8.454543
1,2,B,B,A,A,B,D,A,F,A,...,0.594928,0.341439,0.906013,0.921701,0.261975,0.465083,8.481233,3,8.409385,8.373771
2,3,A,A,A,C,B,D,A,D,A,...,0.555205,0.843531,0.748809,0.620126,0.541474,0.763846,8.364351,1,8.219944,8.240151
3,4,B,B,A,C,B,D,A,E,C,...,0.679618,0.574844,0.34601,0.71461,0.54015,0.280682,8.049253,3,8.382401,8.381382
4,6,A,A,A,C,B,D,A,E,A,...,0.684501,0.956692,1.000773,0.776742,0.625849,0.250823,7.97226,4,8.265082,8.268334


In [10]:
# Level-1 model 1: XGBoost fitted on the merged pred_1 / pred_2 columns.
# Writes the validation-fold predictions (level1_train_pred_1.csv) and the
# fold-averaged test predictions (level1_test_pred_1.csv).
useful_features = ["pred_1", "pred_2"]
test = test_original[useful_features]

scores = []
final_test_predictions = []
final_valid_predictions = {}

# Iterate over the same fold count that produced train.kfold rather than
# repeating the literal 5.
for fold in range(n_splits):
    in_valid_fold = train.kfold == fold
    xtrain = train[~in_valid_fold].reset_index(drop=True)
    xvalid = train[in_valid_fold].reset_index(drop=True)

    valid_ids = xvalid.id.values.tolist()

    ytrain = xtrain.target
    yvalid = xvalid.target

    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    params = {
        'random_state': 1,
        'booster': 'gbtree',
        'n_estimators': 7000,
        'learning_rate': 0.03,
        'max_depth': 2
    }

    model = XGBRegressor(
        n_jobs=4,
        **params
    )
    # NOTE(review): newer XGBoost releases expect early_stopping_rounds on the
    # constructor instead of fit(); it is left here to match the version this
    # notebook was run with -- confirm before upgrading xgboost.
    model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)

    # Predictions for rows this fold's model never trained on, keyed by id.
    preds_valid = model.predict(xvalid)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))

    final_test_predictions.append(model.predict(test))

    # sqrt(MSE) instead of squared=False: that keyword was deprecated in
    # scikit-learn 1.4 and removed in 1.6.
    rmse = float(np.sqrt(mean_squared_error(yvalid, preds_valid)))
    scores.append(rmse)
    print(fold, rmse)

print(np.mean(scores), np.std(scores))
final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id", "pred_1"]
final_valid_predictions.to_csv("level1_train_pred_1.csv", index=False)

# Average the per-fold test predictions row by row. The ids come from the same
# frame the features were taken from, so each prediction stays paired with its
# own id even if sample_submission.csv were ordered differently from test.csv.
preds = np.mean(np.column_stack(final_test_predictions), axis=1)
output = pd.DataFrame({"id": test_original["id"].values, "pred_1": preds})
output.to_csv('level1_test_pred_1.csv', index=False)

[0]	validation_0-rmse:7.54370
[535]	validation_0-rmse:0.71819
0 0.7181165717968511
[0]	validation_0-rmse:7.54665
[594]	validation_0-rmse:0.71513
1 0.7150940526821101
[0]	validation_0-rmse:7.54841
[591]	validation_0-rmse:0.71982
2 0.7197792511549748
[0]	validation_0-rmse:7.54510
[672]	validation_0-rmse:0.71387
3 0.7138507741576164
[0]	validation_0-rmse:7.54906
[596]	validation_0-rmse:0.71439
4 0.7143425726525522
0.716236644488821 0.0023095276603490133


In [11]:
# Level-1 model 2: random forest fitted on the merged pred_1 / pred_2 columns.
# Writes the validation-fold predictions (level1_train_pred_2.csv) and the
# fold-averaged test predictions (level1_test_pred_2.csv).
useful_features = ["pred_1", "pred_2"]
test = test_original[useful_features]

scores = []
final_test_predictions = []
final_valid_predictions = {}

# Iterate over the same fold count that produced train.kfold rather than
# repeating the literal 5.
for fold in range(n_splits):
    in_valid_fold = train.kfold == fold
    xtrain = train[~in_valid_fold].reset_index(drop=True)
    xvalid = train[in_valid_fold].reset_index(drop=True)

    valid_ids = xvalid.id.values.tolist()

    ytrain = xtrain.target
    yvalid = xvalid.target

    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    # Fixed seed: the forest bootstraps rows at random, so without it the
    # scores and the written prediction files change on every run.
    model = RandomForestRegressor(n_estimators=500, n_jobs=-1, max_depth=3, random_state=0)
    model.fit(xtrain, ytrain)

    # Predictions for rows this fold's model never trained on, keyed by id.
    preds_valid = model.predict(xvalid)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))

    final_test_predictions.append(model.predict(test))

    # sqrt(MSE) instead of squared=False: that keyword was deprecated in
    # scikit-learn 1.4 and removed in 1.6.
    rmse = float(np.sqrt(mean_squared_error(yvalid, preds_valid)))
    scores.append(rmse)
    print(fold, rmse)

print(np.mean(scores), np.std(scores))
final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id", "pred_2"]
final_valid_predictions.to_csv("level1_train_pred_2.csv", index=False)

# Average the per-fold test predictions row by row. The ids come from the same
# frame the features were taken from, so each prediction stays paired with its
# own id even if sample_submission.csv were ordered differently from test.csv.
preds = np.mean(np.column_stack(final_test_predictions), axis=1)
output = pd.DataFrame({"id": test_original["id"].values, "pred_2": preds})
output.to_csv('level1_test_pred_2.csv', index=False)

0 0.7184802455624721
1 0.7153751336077635
2 0.7200550238791056
3 0.7142593391063782
4 0.7145066304175585
0.7165352745146556 0.002315718192897099


In [12]:
# Level-1 model 3: scikit-learn gradient boosting fitted on the merged
# pred_1 / pred_2 columns. Writes the validation-fold predictions
# (level1_train_pred_3.csv) and the fold-averaged test predictions
# (level1_test_pred_3.csv).
useful_features = ["pred_1", "pred_2"]
test = test_original[useful_features]

scores = []
final_test_predictions = []
final_valid_predictions = {}

# Iterate over the same fold count that produced train.kfold rather than
# repeating the literal 5.
for fold in range(n_splits):
    in_valid_fold = train.kfold == fold
    xtrain = train[~in_valid_fold].reset_index(drop=True)
    xvalid = train[in_valid_fold].reset_index(drop=True)

    valid_ids = xvalid.id.values.tolist()

    ytrain = xtrain.target
    yvalid = xvalid.target

    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    # Fixed seed so repeated runs produce the same trees and the same files.
    model = GradientBoostingRegressor(n_estimators=500, max_depth=3, random_state=0)
    model.fit(xtrain, ytrain)

    # Predictions for rows this fold's model never trained on, keyed by id.
    preds_valid = model.predict(xvalid)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))

    final_test_predictions.append(model.predict(test))

    # sqrt(MSE) instead of squared=False: that keyword was deprecated in
    # scikit-learn 1.4 and removed in 1.6.
    rmse = float(np.sqrt(mean_squared_error(yvalid, preds_valid)))
    scores.append(rmse)
    print(fold, rmse)

print(np.mean(scores), np.std(scores))
final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id", "pred_3"]
final_valid_predictions.to_csv("level1_train_pred_3.csv", index=False)

# Average the per-fold test predictions row by row. The ids come from the same
# frame the features were taken from, so each prediction stays paired with its
# own id even if sample_submission.csv were ordered differently from test.csv.
preds = np.mean(np.column_stack(final_test_predictions), axis=1)
output = pd.DataFrame({"id": test_original["id"].values, "pred_3": preds})
output.to_csv('level1_test_pred_3.csv', index=False)

0 0.7187950155175356
1 0.7156415667119221
2 0.7203814560040754
3 0.7142185356543508
4 0.7147753326830947
0.7167623813141958 0.0024043727577751773


In [13]:
# Reset the working frames for the final blend: freshly read test and
# submission tables, plus a new copy of the fold-annotated training data.
sample = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")
test = pd.read_csv("../input/30-days-of-ml/test.csv")
train = train_original.copy(deep=True)

In [14]:
# Read back the three level-1 prediction files written by the cells above
# (columns: id plus pred_1 / pred_2 / pred_3) and attach them by id.
level1_columns = ["pred_1", "pred_2", "pred_3"]

train_1 = pd.read_csv("level1_train_pred_1.csv")
train_2 = pd.read_csv("level1_train_pred_2.csv")
train_3 = pd.read_csv("level1_train_pred_3.csv")

test_1 = pd.read_csv("level1_test_pred_1.csv")
test_2 = pd.read_csv("level1_test_pred_2.csv")
test_3 = pd.read_csv("level1_test_pred_3.csv")

# Strip prediction columns left over from a previous execution of this cell.
# Without this, a re-run merges them a second time and pandas renames the
# colliding columns to pred_N_x / pred_N_y, so the later
# train[useful_features] lookup fails with a KeyError.
train = train.drop(columns=level1_columns, errors="ignore")
test = test.drop(columns=level1_columns, errors="ignore")

# Left-join on id so every original row is kept, in its original order.
for train_part, test_part in ((train_1, test_1), (train_2, test_2), (train_3, test_3)):
    train = train.merge(train_part, on="id", how="left")
    test = test.merge(test_part, on="id", how="left")

In [15]:
# Check that pred_1 / pred_2 / pred_3 were attached to the training rows.
train.head(n=5)

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont9,cont10,cont11,cont12,cont13,target,kfold,pred_1,pred_2,pred_3
0,1,B,B,B,C,B,B,A,E,C,...,0.267559,0.237281,0.377873,0.322401,0.86985,8.113634,1,8.456212,8.461584,8.457769
1,2,B,B,A,A,B,D,A,F,A,...,0.341439,0.906013,0.921701,0.261975,0.465083,8.481233,3,8.42447,8.454659,8.424843
2,3,A,A,A,C,B,D,A,D,A,...,0.843531,0.748809,0.620126,0.541474,0.763846,8.364351,1,8.233343,8.245901,8.235212
3,4,B,B,A,C,B,D,A,E,C,...,0.574844,0.34601,0.71461,0.54015,0.280682,8.049253,3,8.414938,8.431653,8.41638
4,6,A,A,A,C,B,D,A,E,A,...,0.956692,1.000773,0.776742,0.625849,0.250823,7.97226,4,8.271235,8.266841,8.273062


In [16]:
# Final blender: a linear regression over the three level-1 prediction
# columns, fitted once per fold; its test predictions are collected in
# final_predictions for averaging in the next cell.
useful_features = ["pred_1", "pred_2", "pred_3"]
# Keep the feature view under its own name. Rebinding `test` to this slice
# would discard its id column, after which the merge cell above could not be
# re-run without reloading test.csv first.
xtest = test[useful_features]

scores = []
final_predictions = []
# Iterate over the same fold count that produced train.kfold rather than
# repeating the literal 5.
for fold in range(n_splits):
    in_valid_fold = train.kfold == fold
    xtrain = train[~in_valid_fold].reset_index(drop=True)
    xvalid = train[in_valid_fold].reset_index(drop=True)

    ytrain = xtrain.target
    yvalid = xvalid.target

    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    model = LinearRegression()
    model.fit(xtrain, ytrain)

    preds_valid = model.predict(xvalid)
    final_predictions.append(model.predict(xtest))

    # sqrt(MSE) instead of squared=False: that keyword was deprecated in
    # scikit-learn 1.4 and removed in 1.6.
    rmse = float(np.sqrt(mean_squared_error(yvalid, preds_valid)))
    scores.append(rmse)
    print(fold, rmse)

print(np.mean(scores), np.std(scores))

0 0.7181163352849407
1 0.7150902621012295
2 0.7197851028825617
3 0.7138743266066302
4 0.7143606317702429
0.7162453317291211 0.0023038480503614366


In [17]:
# Average the per-fold blender predictions row by row and write them into the
# target column of a copy of the submission template.
preds = np.column_stack(final_predictions).mean(axis=1)
output = sample.copy()
output["target"] = preds
output.to_csv('submission_stacking.csv', index=False)