In [12]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
#显示所有列
pd.set_option('display.max_columns', None)
#显示所有行
pd.set_option('display.max_rows', None)
#设置value的显示长度为100，默认为50
pd.set_option('max_colwidth',100)

def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() 
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() 
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

data = pd.read_csv('data1.csv', sep=',')
data = reduce_mem_usage(data)

endl_name = ['x77', 'x30', 'x33', 'x61', 'x26', 'x2', 'x274', 'x6', 'x28', 'x54', 'x162', 'x320', 'x315', 'x245', 'x273', 'x191', 'x169', 'x130', 'x182', 'x317', 'x123', 'x4', 'x174', 'x16']

X = data[endl_name]
Y = data['y']

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.linear_model import SGDRegressor, Lasso, Ridge
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
import warnings
from sklearn.ensemble import RandomForestRegressor as rf
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
warnings.filterwarnings('ignore')

def single_model(clf, XX, YY, xx_test, yy_test, clf_name, class_num=1):
    train = np.zeros((XX.shape[0], class_num))

    if clf_name in ['sgd']:
        print('MinMaxScaler...')
        for col in endl_name:
            ss = MinMaxScaler()
            ss.fit(XX[[col]].values)
            XX[col] = ss.transform(XX[[col]].values).flatten()

    x_train, x_test, y_train, y_test = train_test_split(XX, YY, test_size=0.1, random_state=0)

    if clf_name == "lgb":
        train_matrix = clf.Dataset(x_train, label=y_train)
        valid_matrix = clf.Dataset(x_test, label=y_test)
        test_matrix = clf.Dataset(xx_test, label=yy_test)
        data_matrix = clf.Dataset(XX, label=YY)

        params = {
            'boosting_type': 'gbdt',
            'objective': 'mse',
            'min_child_weight': 5,
            'num_leaves': 2 ** 8,
            'feature_fraction': 0.5,
            'bagging_fraction': 0.5,
            'bagging_freq': 1,
            'learning_rate': 0.001,
            'seed': 2020
        }

        model = clf.train(params, train_matrix, 50000, valid_sets=[train_matrix, valid_matrix], verbose_eval=500,
                          early_stopping_rounds=1000)
        model2 = clf.train(params, data_matrix, model.best_iteration)
#         val_pred = model.predict(x_test, num_iteration=model2.best_iteration).reshape(-1, 1)
        yyyy_train__pred = model.predict(XX, num_iteration=model2.best_iteration).reshape(-1, 1)
        yyyy_test_pred = model.predict(xx_test, num_iteration=model2.best_iteration).reshape(-1, 1)

    if clf_name == "xgb":
        train_matrix = clf.DMatrix(x_train, label=y_train, missing=np.nan)
        valid_matrix = clf.DMatrix(x_test, label=y_test, missing=np.nan)
        test_matrix = clf.DMatrix(xx_test, yy_test)
        data_matrix = clf.DMatrix(XX, label=YY)
        # test_matrix = clf.DMatrix(test_x, label=y_test, missing=np.nan)
        params = {'booster': 'gbtree',
                  'n_estimators': 1,
                  'eval_metric': 'mae',
                  'min_child_weight': 5,
                  'max_depth': 5,
                  'subsample': 0.8041,
                  'colsample_bytree': 0.9289,
                  'eta': 0.001,
                  'seed': 2020,
                  'nthread': 36,
                  'silent': True,
                  }

        watchlist = [(train_matrix, 'train'), (valid_matrix, 'eval')]

        model = clf.train(params, train_matrix, num_boost_round=50000, evals=watchlist, verbose_eval=500,
                          early_stopping_rounds=1000)
#         val_pred = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit).reshape(-1, 1)
        yyyy_train_pred = model.predict(data_matrix, ntree_limit=model.best_ntree_limit).reshape(-1, 1)
        yyyy_test_pred = model.predict(test_matrix, ntree_limit=model.best_ntree_limit).reshape(-1, 1)

    print("%s_mse_score:" % clf_name, mean_squared_error(YY, yyyy_train_pred))

    return mean_squared_error(YY, yyyy_train__pred), mean_squared_error(yy_test, yyyy_test_pred), yyyy_train__pred, yyyy_test_pred


def lgb_model(x, y, xx, yy):
    lgb_train_mse, lgb_test_mse, lgb_train_pred,  lgb_test_pred = single_model(lgb, x, y, xx, yy, "lgb")
    return lgb_train_mse, lgb_test_mse, lgb_train_pred,  lgb_test_pred



def xgb_model(x, y, xx, yy):
    xgb_train_mse, xgb_test_mse, xgb_train_pred,  xgb_test_pred = single_model(xgb, x, y, xx, yy, "xgb")
    return xgb_train_mse, xgb_test_mse, xgb_train_pred,  xgb_test_pred

Memory usage of dataframe is 553928.00 MB
Memory usage after optimization is: 147028.00 MB
Decreased by 73.5%


In [13]:
xxx_train, xxx_test, yyy_train, yyy_test = train_test_split(X, Y, test_size=0.3, random_state=0)

In [14]:
lgb_train_mse, lgb_test_mse, lgb_train_pred,  lgb_test_pred = lgb_model(xxx_train, yyy_train, xxx_test, yyy_test)

xgb_train_mse, xgb_test_mse, xgb_train_pred,  xgb_test_pred = xgb_model(xxx_train, yyy_train, xxx_test, yyy_test)

Training until validation scores don't improve for 1000 rounds
[500]	training's l2: 0.0454449	valid_1's l2: 0.0402616
[1000]	training's l2: 0.0416518	valid_1's l2: 0.0364825
[1500]	training's l2: 0.0390549	valid_1's l2: 0.0343262
[2000]	training's l2: 0.0370172	valid_1's l2: 0.0330873
[2500]	training's l2: 0.0352263	valid_1's l2: 0.0323414
[3000]	training's l2: 0.033738	valid_1's l2: 0.0318945
[3500]	training's l2: 0.0324321	valid_1's l2: 0.0317852
[4000]	training's l2: 0.031241	valid_1's l2: 0.0315489
[4500]	training's l2: 0.0301246	valid_1's l2: 0.0312829
[5000]	training's l2: 0.0291314	valid_1's l2: 0.0313613
[5500]	training's l2: 0.0282276	valid_1's l2: 0.0315205
Early stopping, best iteration is:
[4566]	training's l2: 0.0299868	valid_1's l2: 0.0312747


UnboundLocalError: local variable 'yyyy_train_pred' referenced before assignment