In [79]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures
# from sklearn.linear_model import LinearRegression as LR
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
#显示所有列
pd.set_option('display.max_columns', None)
#显示所有行
pd.set_option('display.max_rows', None)
#设置value的显示长度为100，默认为50
pd.set_option('max_colwidth',100)
np.set_printoptions(threshold=np.inf)

def reduce_mem_usage(df):

    start_mem = df.memory_usage().sum() 
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() 
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

data_y = pd.read_csv("s24.csv")
data_x = pd.read_csv('data1.csv')
data_y = reduce_mem_usage(data_y)
data_x = reduce_mem_usage(data_x)

Memory usage of dataframe is 65128.00 MB
Memory usage after optimization is: 16378.00 MB
Decreased by 74.9%
Memory usage of dataframe is 553928.00 MB
Memory usage after optimization is: 147028.00 MB
Decreased by 73.5%


In [80]:
Y = data_y['y']
endl_name = ['x77', 'x30', 'x33', 'x61', 'x26', 'x2', 'x274', 'x6', 'x28', 'x54', 'x162', 'x320', 'x315', 'x245', 'x273', 'x191', 'x169', 'x130', 'x182', 'x317', 'x123', 'x4', 'x174', 'x16']

In [81]:
X = data_x[endl_name]

In [82]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge, LinearRegression, Lasso

In [90]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
sfs1 = SFS(Ridge(), 
           k_features=(1, 24),
           forward=True, 
           floating=False, 
           scoring='r2',
           cv=5,
           n_jobs=-1)

sfs1.fit(X, Y, custom_feature_names=endl_name)

# print('best combination (r2: %.3f): %s\n' % (sfs1.k_score_, sfs1.k_feature_idx_))
# print('all subsets:\n', sfs1.subsets_)
# plot_sfs(sfs1.get_metric_dict(), kind='std_dev')
# plt.grid()
# plt.show()

SequentialFeatureSelector(clone_estimator=True, cv=5,
                          estimator=Ridge(alpha=1.0, copy_X=True,
                                          fit_intercept=True, max_iter=None,
                                          normalize=False, random_state=None,
                                          solver='auto', tol=0.001),
                          fixed_features=None, floating=False, forward=True,
                          k_features=(1, 24), n_jobs=-1,
                          pre_dispatch='2*n_jobs', scoring='r2', verbose=0)

In [92]:
feature_7 = ['x30', 'x61', 'x26', 'x274', 'x273', 'x130', 'x4']

In [95]:
x_7 = X[feature_7]

In [97]:
clf = Ridge()

In [98]:
clf.fit(x_7, Y)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

In [100]:
clf.coef_

array([ 0.08508815, -0.05555696, -0.05207001,  0.10880059, -0.10850983,
       -0.48603107,  0.07619249])

In [101]:
clf.intercept_

25.832767669420505