In [94]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
#显示所有列
pd.set_option('display.max_columns', None)
#显示所有行
pd.set_option('display.max_rows', None)
#设置value的显示长度为100，默认为50
pd.set_option('max_colwidth',100)

def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() 
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() 
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

data = pd.read_csv('data1.csv', sep=',')
data = reduce_mem_usage(data)

endl_name = ['x77', 'x30', 'x33', 'x61', 'x26', 'x2', 'x274', 'x6', 'x28', 'x54', 'x162', 'x320', 'x315', 'x245', 'x273', 'x191', 'x169', 'x130', 'x182', 'x317', 'x123', 'x4', 'x174', 'x16']

X = data[endl_name]
Y = data['y']

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.linear_model import SGDRegressor, Lasso, Ridge
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
import warnings
from sklearn.ensemble import RandomForestRegressor as rf
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
warnings.filterwarnings('ignore')

def single_model(clf, XX, YY, X_test, Y_test, clf_name, class_num=1):
    train = np.zeros((XX.shape[0], class_num))

    if clf_name in ['l']:
        print('MinMaxScaler...')
        for col in endl_name:
            ss = MinMaxScaler()
            ss.fit(XX[[col]].values)
            XX[col] = ss.transform(XX[[col]].values).flatten()

#     x_train, x_test, y_train, y_test = train_test_split(XX, YY, test_size=0.3, random_state=0)



    if clf_name == 'rf':
        params = {
            "max_depth": 9,
            "max_features": 0.4717,
            'min_samples_split': 15,
            'n_estimators': 36
        }
        model = clf(**params)
        model.fit(XX, YY)
#         val_pred = model.predict(x_test)
        y_train_pred = model.predict(XX)
        y_test_pred = model.predict(X_test)

    if clf_name == "sgd":
        params = {
            'loss': 'squared_loss',
            'penalty': 'l2',
            'alpha': 0.00001,
            'random_state': 2020,
        }
        model = SGDRegressor(**params)
        model.fit(XX, YY)
#         val_pred = model.predict(x_test)
        y_train_pred = model.predict(XX)
        y_test_pred = model.predict(X_test)

    if clf_name == "ridge":
        params = {
            'alpha': 1.0,
            'random_state': 2020,
        }
        model = Ridge(**params)
        model.fit(XX, YY)
#         val_pred = model.predict(x_test)
        y_train_pred = model.predict(XX)
        y_test_pred = model.predict(X_test)

    if clf_name == "lasso":
        model = Lasso(alpha=0.1)
        model.fit(XX, YY)
#         val_pred = model.predict(x_test)
        y_train_pred = model.predict(XX)
        y_test_pred = model.predict(X_test)

    if clf_name == "SVR":
        model = SVR(C=1.0, epsilon=0.2)
        model.fit(XX, YY)
#         val_pred = model.predict(x_test)
        y_train_pred = model.predict(XX)
        y_test_pred = model.predict(X_test)

    print("%s_train_mse_score:" % clf_name, mean_squared_error(y_train_pred, YY))
    print("%s_test_mse_score:" % clf_name, mean_squared_error(y_test_pred, Y_test))

    return mean_squared_error(y_train_pred, YY), mean_squared_error(y_test_pred, Y_test), y_train_pred, y_test_pred




def SVR_model(x, y, x_test, y_test):
    svr_train_mse, svr_test_mse, svr_train_pred, svr_test_pred = single_model(SVR, x, y, x_test, y_test, "SVR")
    return svr_train_mse, svr_test_mse, svr_train_pred, svr_test_pred

def rf_model(x, y, x_test, y_test):
    rf_train_mse, rf_test_mse, rf_train_pred, rf_test_pred = single_model(rf, x, y, x_test, y_test, 'rf')
    return rf_train_mse, rf_test_mse, rf_train_pred, rf_test_pred

def sgd_model(x, y, x_test, y_test):
    sgd_train_mse, sgd_test_mse, sgd_train_pred, sgd_test_pred = single_model(SGDRegressor, x, y, x_test, y_test, "sgd")
    return sgd_train_mse, sgd_test_mse, sgd_train_pred, sgd_test_pred


def ridge_model(x, y, x_test, y_test):
    ridge_train_mse, ridge_test_mse, ridge_train_pred, ridge_test_pred = single_model(Ridge, x, y, x_test, y_test, "ridge")
    return ridge_train_mse, ridge_test_mse, ridge_train_pred, ridge_test_pred

def lasso_model(x, y, x_test, y_test):
    lasso_train_mse, lasso_test_mse, lasso_train_pred, lasso_test_pred = single_model(Lasso, x, y, x_test, y_test, "lasso")
    return lasso_train_mse, lasso_test_mse, lasso_train_pred, lasso_test_pred

Memory usage of dataframe is 553928.00 MB
Memory usage after optimization is: 147028.00 MB
Decreased by 73.5%


In [99]:
xx_train, xx_test, yy_train, yy_test = train_test_split(X, Y, test_size=0.3, random_state=5)

rf_train_mse, rf_test_mse, rf_train_pred, rf_test_pred = rf_model(xx_train, yy_train, xx_test, yy_test)

# svr_train_mse, svr_test_mse, svr_train_pred, svr_test_pred = SVR_model(xx_train, yy_train, xx_test, yy_test) 

# sgd_train_mse, sgd_test_mse, sgd_train_pred, sgd_test_pred = sgd_model(xx_train, yy_train, xx_test, yy_test)

# lasso_train_mse, lasso_test_mse, lasso_train_pred, lasso_test_pred = lasso_model(xx_train, yy_train, xx_test, yy_test)

ridge_train_mse, ridge_test_mse, ridge_train_pred, ridge_test_pred  = ridge_model(xx_train, yy_train, xx_test, yy_test)

rf_train_mse_score: 0.015839839869259843
rf_test_mse_score: 0.05605191699982565
ridge_train_mse_score: 0.03497236038138119
ridge_test_mse_score: 0.05386294083524771


In [110]:
(0.15499887+0.18484481+0.22718412+0.11383413+0.10749706)/5

0.157671798

In [111]:
X

Unnamed: 0,x77,x30,x33,x61,x26,x2,x274,x6,x28,x54,x162,x320,x315,x245,x273,x191,x169,x130,x182,x317,x123,x4,x174,x16
0,11.875,35.8125,1458.0,362.75,129.875,90.625,2.228516,61.5,49.90625,56.25,273.25,0.689453,0.486328,15.734375,2.228516,13800.0,44.90625,0.400391,130.375,427.5,452.75,24.40625,236.75,424.5
1,11.421875,35.0,1142.0,360.0,129.875,90.5,2.228516,61.875,49.96875,56.90625,282.0,0.689453,0.480713,22.796875,2.228516,11376.0,49.96875,0.397949,140.0,427.5,470.75,26.40625,236.75,421.0
2,11.570312,32.8125,1026.0,362.0,129.625,90.6875,2.232422,61.71875,50.21875,56.3125,274.25,0.689453,0.485107,22.9375,2.232422,7960.0,49.75,0.400391,139.875,427.5,459.25,26.3125,236.75,424.0
3,11.734375,35.0,968.0,362.75,131.0,90.375,2.230469,61.34375,49.875,61.0,270.0,0.689453,0.477295,25.1875,2.228516,9112.0,50.03125,0.402832,127.9375,427.5,439.0,26.09375,236.75,423.75
4,11.671875,34.4375,932.5,365.75,130.75,89.625,2.228516,61.34375,49.90625,59.875,268.0,0.689453,0.469482,19.359375,2.228516,8760.0,49.90625,0.398193,128.25,427.5,401.75,26.671875,236.75,427.25
5,12.140625,34.78125,862.0,365.75,130.0,91.0,2.207031,61.34375,50.09375,63.25,280.0,0.689453,0.489014,21.328125,2.207031,9184.0,50.0,0.399414,155.625,427.5,394.75,28.09375,236.75,426.5
6,11.851562,34.3125,796.5,364.75,129.0,90.375,2.208984,61.34375,50.03125,59.96875,279.25,0.689453,0.488281,21.40625,2.207031,6280.0,50.6875,0.398926,144.25,427.5,434.75,28.40625,236.75,425.5
7,11.328125,37.65625,1080.0,360.75,130.25,90.5,2.203125,61.34375,50.1875,65.0625,291.25,0.689453,0.474854,21.21875,2.203125,15600.0,54.4375,0.396484,159.5,427.5,405.5,29.203125,236.75,421.5
8,11.28125,36.03125,1244.0,362.5,130.0,90.375,2.205078,61.34375,50.0625,61.59375,281.0,0.689453,0.479248,20.859375,2.205078,13672.0,55.09375,0.404785,155.75,427.5,433.0,29.0625,236.75,424.0
9,11.6875,35.875,1221.0,363.0,129.0,90.1875,2.203125,61.34375,52.125,63.34375,283.25,0.689453,0.49292,18.71875,2.203125,17664.0,52.75,0.397705,170.625,427.5,405.0,28.9375,236.75,423.75


In [4]:
sgd_test, sgd_mse, y_sgd = sgd_model(X, Y)

MinMaxScaler...
sgd_mse_score: 0.04378009124432175


In [6]:
lasso_test, lasso_mse, y_lasso = lasso_model(X, Y)

lasso_mse_score: 0.05031743440372587


In [8]:
ridge_test, ridge_mse, y_ridge  = ridge_model(X, Y)

ridge_mse_score: 0.037788573244418866


In [14]:
rf_w = 1/rf_mse
lgb_w = 1/lgb_mse
xgb_w = 1/xgb_mse
svr_w = 1/svr_mse
sgd_w = 1/sgd_mse
lasso_w = 1/lasso_mse
ridge_w = 1/ridge_mse
all = rf_w+lgb_w+xgb_w+svr_w+sgd_w+lasso_w+ridge_w

In [15]:
rf_w = rf_w/all
lgb_w = lgb_w/all
xgb_w = xgb_w/all
svr_w = svr_w/all
sgd_w = sgd_w/all
lasso_w = lasso_w/all
ridge_w = ridge_w/all

In [39]:
yyy = rf_w * y_rf + lgb_w*y_lgd[:,0] + xgb_w*y_xgb[:,0] + svr_w*y_svr + sgd_w*y_sgd + lasso_w*y_lasso + ridge_w*y_ridge

In [40]:
mean_squared_error(yyy, Y)

0.03233467598910843

In [41]:
mean_squared_error(y_rf, Y)

0.023845732657390453

In [42]:
mean_squared_error(y_lgd[:,0], Y)

0.03385817656222502

In [43]:
mean_squared_error(y_xgb[:,0], Y)

0.019879045

In [44]:
mean_squared_error(y_svr, Y)

0.046424465674106165

In [45]:
mean_squared_error(y_sgd, Y)

0.04759450780000475

In [46]:
mean_squared_error(y_lasso, Y)

0.05090073331077325

In [47]:
mean_squared_error(y_ridge, Y)

0.03848770803300624

In [52]:
y_xrf = ((1/rf_mse)/(1/rf_mse + 1/xgb_mse))*y_rf + ((1/xgb_mse)/(1/rf_mse + 1/xgb_mse))*y_xgb[:,0]

In [53]:
mean_squared_error(y_xrf, Y)

0.02111608925581966

In [54]:
y_xrf = ((1/ridge_mse)/(1/ridge_mse + 1/xgb_mse))*y_ridge + ((1/xgb_mse)/(1/ridge_mse + 1/xgb_mse))*y_xgb[:,0]

In [55]:
mean_squared_error(y_xrf, Y)

0.026795246300012558