# PartB 模型训练_融合

主要包括:
 - 基于Kfold的交叉验证 
 - xgboost            默认回归器的训练
 - GradientBoosting   默认回归器的训练
 - LightGBM           默认回归器的训练
 - catboost           默认回归器的训练
 - 以上四个模型的Stacking(基于xgboost模型)

TODO:
 - [ ] xgboost          回归器的调参
 - [ ] GradientBoosting 回归器的调参
 - [ ] LightGBM         回归器的调参
 - [ ] catboost         回归器的调参
 
结果:

- 这只是**模型融合的空框架**：在不做特征工程、不调参的情况下，**直接跑的线上MSE是0.1328**
- Stacking直接用Ridge作为第二层模型会欠拟合，线上MSE为0.1623

In [None]:
# List the data directory and the working directory, then recreate an empty
# ./temp_model folder (the per-fold models are dumped into it later on).
!ls datalab/231693
!ls
# Destructive: removes whatever a previous run left in temp_model.
!rm -rf temp_model
!mkdir temp_model
!ls

In [None]:
# Install catboost, using the Aliyun PyPI mirror as the package index
!pip3 install catboost -i https://mirrors.aliyun.com/pypi/simple


In [None]:
import pandas as pd 
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE
from sklearn.externals import joblib
from sklearn.model_selection import KFold

In [None]:
# Load the raw training table (tab-separated, GBK-encoded).
# Alternative input kept from an earlier experiment:
# data=pd.read_csv('myspace/steam_train_washed_v.csv').drop('Unnamed: 0',axis=1)
data = pd.read_csv(
    'datalab/231693/zhengqi_train.txt',
    sep="\t",
    encoding='gbk',
)

# Regression target, and the feature matrix with the target column and the
# seven excluded features removed.
Y = data["target"]
X = data.drop(
    ['target', 'V5', 'V9', 'V11', 'V14', 'V17', 'V22', 'V28'],
    axis=1,
)

# Standardisation is optional: the scaler is fitted on all training rows up
# front, but it is only applied by the cells that check `scale`.
scale = False
scaler = preprocessing.StandardScaler().fit(X)

# 15 shuffled folds; the fixed seed lets later cells re-create exactly the
# same partitions by calling kfold.split(X) again.
kfold = KFold(n_splits=15, shuffle=True, random_state=6666)

In [None]:
# model training and evaluating tool
def run_model(model, model_type):
    """Cross-validate `model` on the module-level K-fold split and save each fit.

    For every fold yielded by ``kfold.split(X)`` the model is refitted on the
    training part, scored on the held-out part, and dumped with joblib to
    ``./temp_model/<model_type><fold index>``.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. The same
            object is refitted for every fold.
        model_type: Prefix of the dumped file names (converted with ``str``).

    Returns:
        The mean of the per-fold mean squared errors.
    """
    fold_errors = []
    for fold, (train_idx, test_idx) in enumerate(kfold.split(X)):
        features_train, features_test = X.iloc[train_idx], X.iloc[test_idx]
        target_train, target_test = Y.iloc[train_idx], Y.iloc[test_idx]

        # Standardise only when the module-level switch is on.
        if scale:
            features_train = scaler.transform(features_train)
            features_test = scaler.transform(features_test)

        model.fit(features_train, target_train)
        fold_errors.append(MSE(target_test, model.predict(features_test)))

        # Persist this fold's fitted state so later cells can reload it.
        joblib.dump(
            value=model,
            filename="./temp_model/" + str(model_type) + str(fold),
        )

    return np.mean(fold_errors)

## 训练

In [None]:
# XGBoost regressor: cross-validated and saved per fold under the "XGB" prefix
from xgboost.sklearn import XGBRegressor

model = XGBRegressor(
    learning_rate=0.07,
    n_estimators=100,
    max_depth=5,
    min_child_weight=3,
    seed=0,
    subsample=0.5,
    colsample_bytree=0.9,
    gamma=0.2,
    reg_alpha=0.05,
    reg_lambda=0.1,
)

# Last expression of the cell: the mean cross-validation MSE is displayed.
run_model(model, "XGB")

In [None]:
# scikit-learn gradient boosting: cross-validated and saved per fold under
# the "GBDT" prefix
from sklearn.ensemble import GradientBoostingRegressor

# NOTE(review): `min_impurity_split` and `presort` are believed to have been
# removed from GradientBoostingRegressor in later scikit-learn releases --
# confirm against the installed version before upgrading.
model = GradientBoostingRegressor(
    alpha=0.9,
    criterion='friedman_mse',
    init=None,
    learning_rate=0.03,
    loss='huber',
    max_depth=14,
    max_features='sqrt',
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_impurity_split=None,
    min_samples_leaf=10,
    min_samples_split=40,
    min_weight_fraction_leaf=0.0,
    n_estimators=300,
    presort='auto',
    random_state=10,
    subsample=0.8,
    verbose=0,
    warm_start=False,
)

# Last expression of the cell: the mean cross-validation MSE is displayed.
run_model(model, "GBDT")

In [None]:
# LightGBM regressor: cross-validated and saved per fold under the "LGBM" prefix
from lightgbm.sklearn import LGBMRegressor

model = LGBMRegressor(
    learning_rate=0.07,
    n_estimators=100,
    max_depth=9,
    min_child_weight=1,
    seed=0,
    # NOTE(review): row subsampling in LightGBM presumably also needs a
    # non-zero `subsample_freq` to take effect -- confirm.
    subsample=0.6,
    colsample_bytree=0.5,
    # NOTE(review): `gamma` is an XGBoost parameter name; LightGBM's
    # counterpart appears to be `min_split_gain` -- confirm this has any effect.
    gamma=0.03,
    reg_alpha=0,
    reg_lambda=1,
)

# Last expression of the cell: the mean cross-validation MSE is displayed.
run_model(model, "LGBM")

In [None]:
# CatBoost regressor with library defaults (training log silenced):
# cross-validated and saved per fold under the "CAT" prefix
from catboost import CatBoostRegressor

model = CatBoostRegressor(
    logging_level='Silent',
)

# Last expression of the cell: the mean cross-validation MSE is displayed.
run_model(model, "CAT")

In [None]:
# Random forest with 100 trees: cross-validated and saved per fold under the
# "rf" prefix. No random_state is set, so results vary from run to run.
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(
    n_estimators=100,
)

# Last expression of the cell: the mean cross-validation MSE is displayed.
run_model(model, "rf")


In [None]:
# RBF-kernel support vector regressor: cross-validated and saved per fold
# under the "SVR" prefix
from sklearn.svm import SVR

# NOTE(review): `degree` and `coef0` are presumably only used by the
# polynomial/sigmoid kernels and have no effect with kernel='rbf' -- confirm.
model = SVR(
    kernel='rbf',
    degree=3,
    coef0=0.0,
    tol=0.01,
    C=1.0,
    epsilon=0.1,
    shrinking=True,
    cache_size=200,
    verbose=False,
    max_iter=-1,
)

# Last expression of the cell: the mean cross-validation MSE is displayed.
run_model(model, "SVR")

In [None]:
# stacking

from sklearn.linear_model import LinearRegression,Lasso,RidgeCV

mse_list=[[],[],[],[],[],[],[]] # cat,lgbm,xgb,gbdt,lr

i = 0
for train, test in kfold.split(X):
    # split & scale(optional) the dataset
    if scale:
        X_train, y_train  = scaler.transform(X.iloc[train]), Y.iloc[train]
        X_test,  y_test   = scaler.transform(X.iloc[test]),  Y.iloc[test]
    else:
        X_train, y_train  = X.iloc[train], Y.iloc[train]
        X_test,  y_test   = X.iloc[test],  Y.iloc[test]

    # load the models
    cat = joblib.load(filename="./temp_model/CAT"+str(i))
    lgbm = joblib.load(filename="./temp_model/GBDT"+str(i))
    xgb = joblib.load(filename="./temp_model/LGBM"+str(i))
    gbdt = joblib.load(filename="./temp_model/CAT"+str(i))
    
    rf = joblib.load(filename="./temp_model/rf"+str(i))
    svm = joblib.load(filename="./temp_model/SVR"+str(i))
     
    # input of lr model
    res = np.c_[cat.predict(X_test),
                lgbm.predict(X_test),
                xgb.predict(X_test),
                gbdt.predict(X_test),
                rf.predict(X_test),
                svm.predict(X_test),]
    
    # fit lf model
    lr = RidgeCV(cv=5)
    lr.fit(res,y_test)
    
    # record all the mse
    for j in range(6):
        mse_list[j].append(MSE(res[:,j:j+1].flatten(),y_test))
    mse_list[6].append(MSE(lr.predict(res)     ,y_test))
    
    # print & save the model   
    # print("lr mse:",mse_list[4][-1])
    joblib.dump(filename="./temp_model/LR"+str(i),value=lr)
    i+=1
    
print("============================")
print("catmse   :",np.mean(mse_list[0]))
print("lightmse :",np.mean(mse_list[1]))
print("xgmse    :",np.mean(mse_list[2]))
print("gbdtmse  :",np.mean(mse_list[3]))
print("rfmse    :",np.mean(mse_list[4]))
print("svmmse   :",np.mean(mse_list[5]))
print("lrmse    :",np.mean(mse_list[6]))

## 预测

In [None]:
data_test=pd.read_csv('datalab/231693/zhengqi_test.txt',encoding='gbk',sep="\t").drop(['V5','V9','V11','V14','V17','V22','V28'],axis=1)
# data_test=pd.read_csv('myspace/steam_test_washed_v.csv').drop('Unnamed: 0',axis=1)#.drop('target',axis=1)
if scale:
    data_test  = scaler.transform(data_test)
prediction = []
for i in range(15):
    # load the models
    cat = joblib.load(filename="./temp_model/CAT"+str(i))
    lgbm = joblib.load(filename="./temp_model/GBDT"+str(i))
    xgb = joblib.load(filename="./temp_model/LGBM"+str(i))
    gbdt = joblib.load(filename="./temp_model/CAT"+str(i))
    
    rf = joblib.load(filename="./temp_model/rf"+str(i))
    svm = joblib.load(filename="./temp_model/SVR"+str(i))
    
    res = np.c_[cat.predict(data_test),
                lgbm.predict(data_test),
                xgb.predict(data_test),
                gbdt.predict(data_test),
                rf.predict(data_test),
                svm.predict(data_test),]
    prediction.append(lr.predict(res))
    
res_pred=np.mean(np.array(prediction),axis=0)

In [None]:
# Write the fold-averaged stacking prediction to a text file.
np.savetxt('myspace/remove_feature.txt', res_pred)

# Alternative output path kept from an earlier run:
#np.savetxt('myspace/temp/wash1_train.txt', res_pred)

In [None]:
# MSE between the values stored in wash_test.txt (presumably an earlier
# prediction run -- confirm) and the current prediction; no ground-truth
# labels are involved in this comparison.
MSE(np.loadtxt('myspace/temp/wash_test.txt'),res_pred)

In [None]:
# Second-level training matrix: one column per saved text file, in the order
# ori / wash / wash1.
train_files = (
    'myspace/temp/ori_train.txt',
    'myspace/temp/wash_train.txt',
    'myspace/temp/wash1_train.txt',
)
res_train = np.c_[tuple(np.loadtxt(path) for path in train_files)]

# Training target, re-read from the raw training table.
y_train = pd.read_csv(
    'datalab/231693/zhengqi_train.txt',
    sep="\t",
    encoding='gbk',
)['target']

In [None]:
# Second-level blender: an XGBoost regressor with default settings, fitted on
# the columns of res_train. NOTE: this rebinds `lr`, the name the stacking
# cell uses for its Ridge meta-learner.
lr =XGBRegressor()
lr.fit(res_train,y_train)
# In-sample error: the blender is scored on the rows it was just fitted on.
MSE(y_train,lr.predict(res_train))

In [None]:
# Second-level test matrix: one column per saved text file, in the same
# ori / wash / wash1 order as the training matrix.
test_files = (
    'myspace/temp/ori_test.txt',
    'myspace/temp/wash_test.txt',
    'myspace/temp/wash1_test.txt',
)
res_test = np.c_[tuple(np.loadtxt(path) for path in test_files)]

# Displayed result: MSE between each input column and the blender's output.
tuple(MSE(np.loadtxt(path), lr.predict(res_test)) for path in test_files)

In [None]:
# Display the blender's output for res_test.
lr.predict(res_test)

In [None]:

# Write the blender's output for res_test to a text file (presumably the
# file that gets submitted -- confirm).
np.savetxt('myspace/submit_3in1.txt',lr.predict(res_test))