In [17]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
#显示所有列
pd.set_option('display.max_columns', None)
#显示所有行
pd.set_option('display.max_rows', None)
#设置value的显示长度为100，默认为50
pd.set_option('max_colwidth',100)

def reduce_mem_usage(df):

    start_mem = df.memory_usage().sum() 
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() 
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

data = pd.read_csv('delete3_2.csv', sep=',')
data = reduce_mem_usage(data)

Memory usage of dataframe is 30368.00 MB
Memory usage after optimization is: 7688.00 MB
Decreased by 74.7%


In [18]:
Y2 = data['y2']
name_24 = ['x77', 'x30', 'x33', 'x61', 'x26', 'x2', 'x274', 'x6',
       'x28', 'x54', 'x162', 'x320', 'x315', 'x245', 'x273', 'x191', 'x169',
       'x130', 'x182', 'x317', 'x123', 'x4', 'x174', 'x16']
X = data[name_24]

In [21]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
from sklearn.linear_model import Ridge
sfs1 = SFS(Ridge(), 
           k_features=(1, 24),
           forward=True, 
           floating=False, 
           scoring='r2',
           cv=5,
           n_jobs=-1)

sfs1.fit(X, Y2, custom_feature_names=name_24)

# print('best combination (r2: %.3f): %s\n' % (sfs1.k_score_, sfs1.k_feature_idx_))
# print('all subsets:\n', sfs1.subsets_)
# plot_sfs(sfs1.get_metric_dict(), kind='std_dev')
# plt.grid()
# plt.show()

SequentialFeatureSelector(clone_estimator=True, cv=5,
                          estimator=Ridge(alpha=1.0, copy_X=True,
                                          fit_intercept=True, max_iter=None,
                                          normalize=False, random_state=None,
                                          solver='auto', tol=0.001),
                          fixed_features=None, floating=False, forward=True,
                          k_features=(1, 24), n_jobs=-1,
                          pre_dispatch='2*n_jobs', scoring='r2', verbose=0)

In [22]:
name_3 = ['x320', 'x182', 'x174']
X_3 = X[name_3]
ridge2 = Ridge()
ridge2.fit(X_3, Y2)
ridge2.coef_, ridge2.intercept_

(array([-0.00078865, -0.00025998,  0.00017675]), 1.3004495697222556)

In [23]:
Y1 = data['y1']
ridge1 = Ridge()
ridge1.fit(X_3, Y1)
ridge1.coef_, ridge1.intercept_

(array([-0.00160313,  0.00497628,  0.00148422]), 3.421561390543235)