## 导入模块

In [50]:
import pandas as pd 
import matplotlib.pyplot as plt 
import numpy as np 
# 导入模型：依次是：随机梯度下降，套索回归，线性回归，岭回归
from sklearn.linear_model import SGDRegressor, Lasso, LinearRegression, Ridge, ElasticNet
# 导入归一化处理方式:依次是Z-Score归一化，最大值最小值归一化
from sklearn.preprocessing import StandardScaler, MinMaxScaler

## 构建训练集和测试集

In [2]:
df = pd.read_csv('zhengqi_train.txt', sep='\t')
X_train = df.iloc[:, :-1]
y_train = df[['target']]
display(X_train.shape, y_train.shape)

(2888, 38)

(2888, 1)

In [3]:
X_test = pd.read_csv('zhengqi_test.txt', sep='\t')
display(X_test.shape)

(1925, 38)

## 数据进行归一化处理

一般情况下，目标值【一般为y】是不需要进行归一化处理的

### 使用最大值最小值归一化处理

In [4]:
min_max = MinMaxScaler()
X_train_min = min_max.fit_transform(X_train)
y_train_min = min_max.fit_transform(y_train)
X_test_min = min_max.fit_transform(X_test)

### 使用Z-Score进行归一化处理

In [5]:
score = StandardScaler()
X_train_ = score.fit_transform(X_train)
y_train_ = score.fit_transform(y_train)
X_test_ = score.fit_transform(X_test)

## 构建模型

### 使用未进行归一化处理的数据进行模型的构建  使用简单的线性回归

In [6]:
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression()

In [7]:
model.predict(X_test)[:15]

array([[ 0.26825457],
       [ 0.2600059 ],
       [-0.06905626],
       [ 0.09156779],
       [ 0.27141095],
       [ 0.19877399],
       [-0.13506734],
       [ 0.23545488],
       [-0.0660262 ],
       [ 0.28739887],
       [-0.57255987],
       [-0.55241158],
       [-0.33249774],
       [ 0.09408069],
       [-0.08724635]])

### 使用进行了归一化处理的数据进行模型的构建---> 线性回归

#### 使用Z-score归一化

In [8]:
model = LinearRegression()
# Z-score归一化处理的数据
model.fit(X_train_, y_train_)

LinearRegression()

In [9]:
model.predict(X_test_)[:15]

array([[ 0.52564964],
       [ 0.48973218],
       [ 0.32666665],
       [ 0.36998727],
       [ 0.5268688 ],
       [ 0.46154774],
       [ 0.22804888],
       [ 0.48236749],
       [ 0.21097507],
       [ 0.49360554],
       [-0.10965964],
       [-0.09819487],
       [-0.02267627],
       [ 0.26229629],
       [ 0.24690629]])

#### 使用最大值最小值归一化

In [10]:
model = LinearRegression()
model.fit(X_train_min, y_train_min)

LinearRegression()

In [11]:
model.predict(X_test_min)[:15]

array([[0.69801031],
       [0.6861968 ],
       [0.65270343],
       [0.67012411],
       [0.69424164],
       [0.68954379],
       [0.63941183],
       [0.69726306],
       [0.63169109],
       [0.69597451],
       [0.57275288],
       [0.57584871],
       [0.58303058],
       [0.64112287],
       [0.6590907 ]])

### 使用随机梯度下降

#### 使用Z-score的归一化处理

In [36]:
sgd =SGDRegressor(alpha=0.01, penalty='l1')
sgd.fit(X_train_, y_train_.ravel())

SGDRegressor(alpha=0.01, penalty='l1')

In [42]:
sgd.predict(X_test_)[:15]

array([ 0.3648049 ,  0.31911456,  0.18192628,  0.28915686,  0.38903366,
        0.33949308,  0.21804913,  0.27115442,  0.2183158 ,  0.38561143,
       -0.00222399, -0.0833308 ,  0.15553928,  0.37228649,  0.20751212])

#### 使用最大值最小值归一化

In [38]:
sgd.fit(X_train_min, y_train_min.ravel())

SGDRegressor(alpha=0.01, penalty='l1')

In [39]:
sgd.predict(X_test_min)[:15]

array([0.60794552, 0.60143324, 0.58124095, 0.59721664, 0.61184577,
       0.60426296, 0.58611083, 0.59486783, 0.58696681, 0.61077679,
       0.55376131, 0.54250184, 0.57669801, 0.60853041, 0.58302566])

### 使用套索回归

#### 使用Z-score

In [22]:
lasson = Lasso(alpha=0.5)
lasson.fit(X_train_, y_train_)

Lasso(alpha=0.5)

In [25]:
lasson.predict(X_test_)[:15]

array([ 0.18813954,  0.15832164,  0.01474993,  0.12222444,  0.18149541,
        0.1218764 ,  0.04058251,  0.05945151,  0.01922411,  0.14653267,
       -0.21387304, -0.2154687 ,  0.0111866 ,  0.16294088, -0.01170447])

#### 使用最大值最小值

In [26]:
lasson.fit(X_train_min, y_train_min)

Lasso(alpha=0.5)

In [28]:
lasson.predict(X_test_min)[:15]

array([0.56796002, 0.56796002, 0.56796002, 0.56796002, 0.56796002,
       0.56796002, 0.56796002, 0.56796002, 0.56796002, 0.56796002,
       0.56796002, 0.56796002, 0.56796002, 0.56796002, 0.56796002])

### 使用岭回归

#### 使用最大值最小值

In [44]:
ridge = Ridge(alpha=0.5, solver='sag')
ridge.fit(X_train_min, y_train_min)

Ridge(alpha=0.5, solver='sag')

In [45]:
ridge.predict(X_test_min)[:15]

array([[0.68437026],
       [0.67301388],
       [0.64055957],
       [0.65328529],
       [0.68528402],
       [0.67481022],
       [0.62581731],
       [0.67898933],
       [0.6219349 ],
       [0.68203385],
       [0.55922445],
       [0.55494419],
       [0.57679869],
       [0.63762383],
       [0.63584521]])

#### 使用z-score

In [46]:
ridge.fit(X_train_, y_train_)

Ridge(alpha=0.5, solver='sag')

In [48]:
ridge.predict(X_test_)[:15]

array([[ 0.52513529],
       [ 0.48925268],
       [ 0.32606849],
       [ 0.36918156],
       [ 0.52708087],
       [ 0.46127099],
       [ 0.22776708],
       [ 0.48159107],
       [ 0.21102041],
       [ 0.49355581],
       [-0.11007798],
       [-0.09947867],
       [-0.02226853],
       [ 0.26303906],
       [ 0.24571822]])

### 使用弹性网络

In [51]:
ek = ElasticNet(alpha=0.5, l1_ratio=0.7)
ek.fit(X_train_, y_train_)

ElasticNet(alpha=0.5, l1_ratio=0.7)

In [53]:
ek.predict(X_test_)[:15]

array([ 0.2554739 ,  0.20001233,  0.03941625,  0.17079058,  0.24308445,
        0.18490321,  0.0743446 ,  0.11031951,  0.06236928,  0.21422362,
       -0.20511678, -0.24184252,  0.03739644,  0.24507557,  0.04056246])

In [54]:
ek.fit(X_train_min, y_train_min)

ElasticNet(alpha=0.5, l1_ratio=0.7)

In [55]:
ek.predict(X_test_min)[:15]

array([0.56796002, 0.56796002, 0.56796002, 0.56796002, 0.56796002,
       0.56796002, 0.56796002, 0.56796002, 0.56796002, 0.56796002,
       0.56796002, 0.56796002, 0.56796002, 0.56796002, 0.56796002])

In [56]:
a = np.arange(1, 100)
a

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
       52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68,
       69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85,
       86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99])

In [105]:
np.random.shuffle(a)

In [106]:
a

array([20, 39, 48, 59, 71, 58, 28, 34, 91, 61, 63, 47, 12,  2, 83, 85, 19,
       49, 16, 75, 97, 42, 89, 32, 35, 37, 21, 87, 31, 80, 79, 64, 55, 84,
       27,  6, 26, 30, 11, 22, 15, 82, 73, 74, 86, 60, 65, 13, 93, 57,  8,
       68, 41, 45, 78,  3, 18, 29, 36, 53, 96, 25, 23, 67,  1, 95, 24, 94,
       98, 62, 44, 38,  9, 76, 46, 40,  7, 14,  5, 70, 69, 88, 77, 90, 66,
       81, 50, 56, 17, 33, 72, 10,  4, 52, 43, 54, 99, 51, 92])