In [19]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
#显示所有列
pd.set_option('display.max_columns', None)
#显示所有行
pd.set_option('display.max_rows', None)
#设置value的显示长度为100，默认为50
pd.set_option('max_colwidth',100)

def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() 
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() 
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

data = pd.read_csv('data1.csv', sep=',')
data = reduce_mem_usage(data)

endl_name = ['x77', 'x30', 'x33', 'x61', 'x26', 'x2', 'x274', 'x6', 'x28', 'x54', 'x162', 'x320', 'x315', 'x245', 'x273', 'x191', 'x169', 'x130', 'x182', 'x317', 'x123', 'x4', 'x174', 'x16']

X = data[endl_name]
Y = data['y']

Memory usage of dataframe is 553928.00 MB
Memory usage after optimization is: 147028.00 MB
Decreased by 73.5%


In [31]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import SGDRegressor, Lasso, Ridge
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_absolute_error, mean_squared_error
from xgboost.sklearn import XGBRegressor
clf_ridge = Ridge()
scores_ridge = cross_val_score(
                clf_ridge, X, Y, cv=5, scoring=make_scorer(mean_absolute_error))
scores_ridge

array([0.17883637, 0.21435492, 0.2457301 , 0.11453548, 0.13159997])

In [32]:
xgbmodel = XGBRegressor(n_estimators=100, max_depth=2,
                     learning_rate=0.13, 
                     random_state=66, subsample=0.7,
                     colsample_bytree = 0.6)
scores_xgb = cross_val_score(
                xgbmodel, X, Y, cv=5, scoring=make_scorer(mean_absolute_error))
scores_xgb

array([0.15499887, 0.18484481, 0.22718412, 0.11383413, 0.10749706])

In [33]:
from sklearn.ensemble import RandomForestRegressor as RF
params = {
            "max_depth": 9,
            "max_features": 0.4,
            'min_samples_split': 15,
            'n_estimators': 36
        }
rf = RF(params)
scores_rf = cross_val_score(
                rf, X, Y, cv=5, scoring=make_scorer(mean_absolute_error))
scores_rf

array([nan, nan, nan, nan, nan])