In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
#显示所有列
pd.set_option('display.max_columns', None)
#显示所有行
pd.set_option('display.max_rows', None)
#设置value的显示长度为100，默认为50
pd.set_option('max_colwidth',100)

def reduce_mem_usage(df):

    start_mem = df.memory_usage().sum() 
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() 
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.linear_model import SGDRegressor, Lasso, Ridge
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
import warnings
from sklearn.ensemble import RandomForestRegressor as rf
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
warnings.filterwarnings('ignore')

def single_model(clf, XX, YY, clf_name, class_num=1):
    train = np.zeros((XX.shape[0], class_num))


    if clf_name == 'rf':
        params = {
            "max_depth": 9,
            "max_features": 0.4717,
            'min_samples_split': 15,
            'n_estimators': 36
        }
        model = clf(**params)
        model.fit(XX, YY)
#         val_pred = model.predict(x_test)
        y_pred = model.predict(XX)

    if clf_name == "ridge":
        params = {
            'alpha': 1.0,
            'random_state': 2020,
        }
        model = Ridge(**params)
        model.fit(XX, YY)
#         val_pred = model.predict(x_test)
        y_pred = model.predict(XX)
        print(model.coef_)
        print(model.intercept_)


    print("%s_train_mse_score:" % clf_name, mean_squared_error(y_pred, YY))

    return mean_squared_error(y_pred, YY), mean_squared_error(y_pred, YY)**0.5, mean_absolute_error(y_pred, YY), y_pred

def rf_model(x, y):
    rf_mse, rf_rmse, rf_mae, rf_pred = single_model(rf, x, y, 'rf')
    return rf_mse, rf_rmse, rf_mae, rf_pred.reshape(-1,1)


def ridge_model(x, y):
    ridge_mse, ridge_rmse, ridge_mae, ridge_pred = single_model(Ridge, x, y, "ridge")
    return ridge_mse, ridge_rmse, ridge_mae, ridge_pred.reshape(-1,1)

In [2]:
data = pd.read_csv('data1.csv', sep=',')
data = reduce_mem_usage(data)

endl_name = ['x77', 'x30', 'x33', 'x61', 'x26', 'x2', 'x274', 'x6', 'x28', 'x54', 'x162', 'x320', 'x315', 'x245', 'x273', 'x191', 'x169', 'x130', 'x182', 'x317', 'x123', 'x4', 'x174', 'x16']

X = data[endl_name]
Y = data['y']

Memory usage of dataframe is 553928.00 MB
Memory usage after optimization is: 147028.00 MB
Decreased by 73.5%


In [3]:
rf_mse, rf_rmse, rf_mae, rf_pred = rf_model(X, Y)

rf_train_mse_score: 0.01810238991020269


In [4]:
ridge_mse, ridge_rmse, ridge_mae, ridge_pred = ridge_model(X, Y)

[-2.95892224e-03  1.23582804e-02 -1.46706264e-05  9.19944543e-03
  3.63400427e-03  1.85652806e-02  1.48860607e-02 -7.05277325e-04
 -4.76265524e-04  4.75708446e-03 -3.03500505e-03  1.34083163e-04
  2.30983382e-02  2.51450195e-03 -1.48394535e-02  3.63871085e-06
  1.26824977e-03 -3.76765635e-02 -9.02184658e-04 -5.00292321e-03
 -2.75366429e-04 -1.71185407e-03 -4.21473215e-05  5.76510919e-03]
-3.97155483543749
ridge_train_mse_score: 0.03779041719557484


In [5]:
rf_w = (1/rf_mse) / (1/rf_mse + 1/ridge_mse)
ridge_w = (1/ridge_mse) / (1/rf_mse + 1/ridge_mse)

In [6]:
rf_ridge_pred = rf_w*rf_pred + ridge_w*ridge_pred

In [7]:
mean_squared_error(rf_ridge_pred, Y), mean_squared_error(rf_ridge_pred, Y)**0.5, mean_absolute_error(rf_ridge_pred, Y)

(0.022811955263380344, 0.15103627135023012, 0.11117636899842677)