In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import lightgbm as lgb
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import LeavePOut
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import learning_curve, validation_curve
from sklearn.decomposition import PCA

In [2]:
# Load the tab-separated, UTF-8 encoded training and test files.
train_data, test_data = (
    pd.read_csv(path, sep='\t', encoding='utf-8')
    for path in ('../input/zhengqi_train.txt', '../input/zhengqi_test.txt')
)

# 模型过拟合与欠拟合

In [3]:
# Every column except the label is treated as a feature.
features_columns = [col for col in train_data.columns if col != 'target']

# Fit the scaler on the training features only, then apply it to both sets.
min_max_scaler = MinMaxScaler().fit(train_data[features_columns])

# transform() returns a bare array, so rebuild labelled frames around it.
train_data_scaler = pd.DataFrame(
    min_max_scaler.transform(train_data[features_columns]),
    columns=features_columns,
)
train_data_scaler['target'] = train_data['target']

test_data_scaler = pd.DataFrame(
    min_max_scaler.transform(test_data[features_columns]),
    columns=features_columns,
)

In [4]:
# Reduce the scaled features to 16 principal components.
pca = PCA(n_components=16)

# Drop the last column of the training frame before fitting the projection;
# the test frame is passed through whole.
train_features_scaled = train_data_scaler.iloc[:, :-1]
new_train_pca_16 = pd.DataFrame(pca.fit_transform(train_features_scaled))
new_test_pca_16 = pd.DataFrame(pca.transform(test_data_scaler))

# Re-attach the label to the projected training rows.
new_train_pca_16['target'] = train_data_scaler['target']

In [5]:
# Replace any missing values with 0 before separating features from the label.
new_train_pca_16 = new_train_pca_16.fillna(0)

# The projected test frame has component columns only, so its column index
# selects exactly the feature columns of the training frame.
target = new_train_pca_16['target']
train = new_train_pca_16[new_test_pca_16.columns]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
# NOTE: this rebinds train_data / test_data to the split subsets.
(train_data, test_data,
 train_target, test_target) = train_test_split(
    train, target, test_size=0.2, random_state=0)

## 欠拟合

In [6]:
# Linear SGD model with a loose stopping tolerance and a low iteration cap.
clf = SGDRegressor(max_iter=500, tol=1e-2)
clf.fit(train_data, train_target)

train_pred = clf.predict(train_data)
test_pred = clf.predict(test_data)
score_train = mean_squared_error(train_target, train_pred)
score_test = mean_squared_error(test_target, test_pred)

for label, score in (('SGDRegresion train MSE: ', score_train),
                     ('SGDRegresion test MSE:  ', score_test)):
    print(label, score)

SGDRegresion train MSE:  0.15154387600636182
SGDRegresion test MSE:   0.1561265905589228


## 过拟合

In [7]:
# Expand the inputs to polynomial terms up to degree 5 before fitting.
poly = PolynomialFeatures(5)
train_data_poly = poly.fit_transform(train_data)
test_data_poly = poly.transform(test_data)

clf = SGDRegressor(max_iter=1000, tol=1e-3).fit(train_data_poly, train_target)

# Score the training split first, then the held-out split.
score_train, score_test = (
    mean_squared_error(y_true, clf.predict(X))
    for X, y_true in ((train_data_poly, train_target),
                      (test_data_poly, test_target))
)

print('SGDRegresion train MSE: ', score_train)
print('SGDRegresion test MSE:  ', score_test)

SGDRegresion train MSE:  0.13232984463943012
SGDRegresion test MSE:   0.14470236138763864


## 正常拟合

In [8]:
# Degree-3 polynomial expansion of the inputs.
poly = PolynomialFeatures(degree=3)
train_data_poly = poly.fit_transform(train_data)
test_data_poly = poly.transform(test_data)

clf = SGDRegressor(max_iter=1000, tol=1e-3)
clf.fit(train_data_poly, train_target)

train_pred = clf.predict(train_data_poly)
score_train = mean_squared_error(train_target, train_pred)
test_pred = clf.predict(test_data_poly)
score_test = mean_squared_error(test_target, test_pred)

print('SGDRegresion train MSE: ', score_train)
print('SGDRegresion test MSE:  ', score_test)

SGDRegresion train MSE:  0.13403470232340356
SGDRegresion test MSE:   0.1421326713954065


# 模型正则化

## L2范数正则化

In [9]:
# Degree-3 polynomial features, fitted with an L2 (ridge-style) penalty.
poly = PolynomialFeatures(3)
train_data_poly = poly.fit_transform(train_data)
test_data_poly = poly.transform(test_data)

# The penalty name must be the lowercase 'l2': scikit-learn releases with
# strict parameter validation reject 'L2', while older releases accept both.
clf = SGDRegressor(max_iter=1000, tol=1e-3, penalty='l2', alpha=0.0001)
clf.fit(train_data_poly, train_target)
score_train = mean_squared_error(train_target, clf.predict(train_data_poly))
score_test = mean_squared_error(test_target, clf.predict(test_data_poly))

print('SGDRegresion train MSE: ', score_train)
print('SGDRegresion test MSE:  ', score_test)

SGDRegresion train MSE:  0.13387548720940132
SGDRegresion test MSE:   0.14222979558536455


## L1范数正则化

In [10]:
# Degree-3 polynomial features, fitted with an L1 (lasso-style) penalty.
poly = PolynomialFeatures(3)
train_data_poly = poly.fit_transform(train_data)
test_data_poly = poly.transform(test_data)

# The penalty name must be the lowercase 'l1': scikit-learn releases with
# strict parameter validation reject 'L1', while older releases accept both.
clf = SGDRegressor(max_iter=1000, tol=1e-3, penalty='l1', alpha=0.00001)
clf.fit(train_data_poly, train_target)
score_train = mean_squared_error(train_target, clf.predict(train_data_poly))
score_test = mean_squared_error(test_target, clf.predict(test_data_poly))

print('SGDRegresion train MSE: ', score_train)
print('SGDRegresion test MSE:  ', score_test)

SGDRegresion train MSE:  0.1343939060176878
SGDRegresion test MSE:   0.1427143331205221


## ElasticNet联合L1和L2范数加权正则化

In [11]:
# Degree-3 polynomial features, fitted with a combined L1/L2 penalty
# (l1_ratio=0.9 weights the mix towards L1).
poly = PolynomialFeatures(degree=3)
train_data_poly = poly.fit_transform(train_data)
test_data_poly = poly.transform(test_data)

clf = SGDRegressor(
    max_iter=1000,
    tol=1e-3,
    penalty='elasticnet',
    l1_ratio=0.9,
    alpha=0.00001,
).fit(train_data_poly, train_target)

train_pred = clf.predict(train_data_poly)
test_pred = clf.predict(test_data_poly)
score_train = mean_squared_error(train_target, train_pred)
score_test = mean_squared_error(test_target, test_pred)

for label, score in (('SGDRegresion train MSE: ', score_train),
                     ('SGDRegresion test MSE:  ', score_test)):
    print(label, score)

SGDRegresion train MSE:  0.1342366457696468
SGDRegresion test MSE:   0.14259665923690806


# 模型交叉验证

## 简单交叉验证

In [12]:
# Simple hold-out validation: one seeded 80/20 split of the data.
(train_data, test_data,
 train_target, test_target) = train_test_split(
    train, target, test_size=0.2, random_state=0)

clf = SGDRegressor(max_iter=1000, tol=1e-3).fit(train_data, train_target)

# Score the training split first, then the held-out split.
score_train, score_test = (
    mean_squared_error(y_true, clf.predict(X))
    for X, y_true in ((train_data, train_target), (test_data, test_target))
)

print('SGDRegresion train MSE: ', score_train)
print('SGDRegresion test MSE:  ', score_test)

SGDRegresion train MSE:  0.14152056907690852
SGDRegresion test MSE:   0.14702477180431253


## K折交叉验证 KFold

In [13]:
# 5-fold cross-validation.
kf = KFold(n_splits=5)

# Pull the underlying arrays once; each fold indexes into them by position.
train_values, target_values = train.values, target.values

for k, (train_index, test_index) in enumerate(kf.split(train)):
    train_data = train_values[train_index]
    test_data = train_values[test_index]
    train_target = target_values[train_index]
    test_target = target_values[test_index]

    # A fresh model is fitted for every fold.
    clf = SGDRegressor(max_iter=1000, tol=1e-3).fit(train_data, train_target)
    train_pred = clf.predict(train_data)
    test_pred = clf.predict(test_data)
    score_train = mean_squared_error(train_target, train_pred)
    score_test = mean_squared_error(test_target, test_pred)

    print(k, ' 折', 'SGDRegresion train MSE: ', score_train)
    print(k, ' 折', 'SGDRegresion test MSE:  ', score_test)

0  折 SGDRegresion train MSE:  0.15000517276846342
0  折 SGDRegresion test MSE:   0.1056105492507973
1  折 SGDRegresion train MSE:  0.1335470095636986
1  折 SGDRegresion test MSE:   0.18222183097765565
2  折 SGDRegresion train MSE:  0.1471355400300835
2  折 SGDRegresion test MSE:   0.13334036459938758
3  折 SGDRegresion train MSE:  0.14140035043512503
3  折 SGDRegresion test MSE:   0.16377815310526728
4  折 SGDRegresion train MSE:  0.13873208371401843
4  折 SGDRegresion test MSE:   0.1657121669336226


## 留一法交叉验证 LeaveOneOut

In [14]:
# Leave-one-out cross-validation: each split holds out a single row.
loo = LeaveOneOut()

# Pull the underlying arrays once; each split indexes into them by position.
train_values, target_values = train.values, target.values

# Only the first 10 splits are evaluated; zipping with range(10) stops the
# iteration after split index 9.
for k, (train_index, test_index) in zip(range(10), loo.split(train)):
    train_data = train_values[train_index]
    test_data = train_values[test_index]
    train_target = target_values[train_index]
    test_target = target_values[test_index]

    clf = SGDRegressor(max_iter=1000, tol=1e-3).fit(train_data, train_target)
    train_pred = clf.predict(train_data)
    test_pred = clf.predict(test_data)
    score_train = mean_squared_error(train_target, train_pred)
    score_test = mean_squared_error(test_target, test_pred)

    print(k, ' 折', 'SGDRegresion train MSE: ', score_train)
    print(k, ' 折', 'SGDRegresion test MSE:  ', score_test)

0  折 SGDRegresion train MSE:  0.14145164974005736
0  折 SGDRegresion test MSE:   0.011491821056850147
1  折 SGDRegresion train MSE:  0.14156788788574828
1  折 SGDRegresion test MSE:   0.12599620878129256
2  折 SGDRegresion train MSE:  0.14174097015683113
2  折 SGDRegresion test MSE:   0.04237270398559636
3  折 SGDRegresion train MSE:  0.1415007659867209
3  折 SGDRegresion test MSE:   0.003638304946260904
4  折 SGDRegresion train MSE:  0.1410033055369563
4  折 SGDRegresion test MSE:   0.010476346832283332
5  折 SGDRegresion train MSE:  0.14158469183011962
5  折 SGDRegresion test MSE:   0.13784634365473986
6  折 SGDRegresion train MSE:  0.14156460534476248
6  折 SGDRegresion test MSE:   0.024323905307004307
7  折 SGDRegresion train MSE:  0.141634142232386
7  折 SGDRegresion test MSE:   0.0005141588917711953
8  折 SGDRegresion train MSE:  0.1415678899014246
8  折 SGDRegresion test MSE:   0.08955025551051773
9  折 SGDRegresion train MSE:  0.14159018595069045
9  折 SGDRegresion test MSE:   0.05038110804099924

## 留P法交叉验证

In [15]:
# Leave-P-out cross-validation: each split holds out 10 rows.
lpo = LeavePOut(p=10)

# Pull the underlying arrays once; each split indexes into them by position.
train_values, target_values = train.values, target.values

# Only the first 10 splits are evaluated; zipping with range(10) stops the
# iteration after split index 9.
for k, (train_index, test_index) in zip(range(10), lpo.split(train)):
    train_data = train_values[train_index]
    test_data = train_values[test_index]
    train_target = target_values[train_index]
    test_target = target_values[test_index]

    clf = SGDRegressor(max_iter=1000, tol=1e-3).fit(train_data, train_target)
    train_pred = clf.predict(train_data)
    test_pred = clf.predict(test_data)
    score_train = mean_squared_error(train_target, train_pred)
    score_test = mean_squared_error(test_target, test_pred)

    print(k, ' 10个', 'SGDRegresion train MSE: ', score_train)
    print(k, ' 10个', 'SGDRegresion test MSE:  ', score_test)

0  10个 SGDRegresion train MSE:  0.14129648169729742
0  10个 SGDRegresion test MSE:   0.04864701431404865
1  10个 SGDRegresion train MSE:  0.14200234308889043
1  10个 SGDRegresion test MSE:   0.04501420833502557
2  10个 SGDRegresion train MSE:  0.14199388311982655
2  10个 SGDRegresion test MSE:   0.04723564696417463
3  10个 SGDRegresion train MSE:  0.14194353367222431
3  10个 SGDRegresion test MSE:   0.05447202820892858
4  10个 SGDRegresion train MSE:  0.14182868706693333
4  10个 SGDRegresion test MSE:   0.06916153712527914
5  10个 SGDRegresion train MSE:  0.14197228113154764
5  10个 SGDRegresion test MSE:   0.04510425356706419
6  10个 SGDRegresion train MSE:  0.14125485098338358
6  10个 SGDRegresion test MSE:   0.04816125723008913
7  10个 SGDRegresion train MSE:  0.14186615937209565
7  10个 SGDRegresion test MSE:   0.052975980460754225
8  10个 SGDRegresion train MSE:  0.14193958908631044
8  10个 SGDRegresion test MSE:   0.0468476035244132
9  10个 SGDRegresion train MSE:  0.14204451185426142
9  10个 SGDRe

# 模型超参空间及调参

## 穷举网格搜索

In [16]:
# Re-create the seeded 80/20 hold-out split.
(train_data, test_data,
 train_target, test_target) = train_test_split(
    train, target, test_size=0.2, random_state=0)

# Exhaustive search over every combination in the grid, with 5-fold CV.
parameters = {'n_estimators': [50, 100, 200], 'max_depth': [1, 2, 3]}
randomForestRegressor = RandomForestRegressor()
clf = GridSearchCV(estimator=randomForestRegressor, param_grid=parameters, cv=5)
clf.fit(train_data, train_target)

# Evaluate the search object's predictions on the held-out split.
test_pred = clf.predict(test_data)
score_test = mean_squared_error(test_target, test_pred)
print('RandomForestRegressor GridSearchCV test MSE: ', score_test)

# Show which result fields the search recorded.
sorted(clf.cv_results_)

RandomForestRegressor GridSearchCV test MSE:  0.25527365152910464


['mean_fit_time',
 'mean_score_time',
 'mean_test_score',
 'param_max_depth',
 'param_n_estimators',
 'params',
 'rank_test_score',
 'split0_test_score',
 'split1_test_score',
 'split2_test_score',
 'split3_test_score',
 'split4_test_score',
 'std_fit_time',
 'std_score_time',
 'std_test_score']

## 随机参数优化

In [17]:
# Re-create the seeded 80/20 hold-out split.
train_data, test_data, train_target, test_target = train_test_split(train, target, test_size=0.2, random_state=0)

randomForestRegressor = RandomForestRegressor()
parameters = {'n_estimators': [50, 100, 200, 300], 'max_depth': [1, 2, 3, 4, 5]}

# Randomized search with 5-fold CV over the candidate values above.
# NOTE: neither the search nor the forest is given a random_state here.
clf = RandomizedSearchCV(randomForestRegressor, parameters, cv=5)
clf.fit(train_data, train_target)
score_test = mean_squared_error(test_target, clf.predict(test_data))
# The label names the search strategy this cell actually runs.
print('RandomForestRegressor RandomizedSearchCV test MSE: ', score_test)

# Show which result fields the search recorded.
sorted(clf.cv_results_.keys())

RandomForestRegressor GridSearchCV test MSE:  0.19592615575875494


['mean_fit_time',
 'mean_score_time',
 'mean_test_score',
 'param_max_depth',
 'param_n_estimators',
 'params',
 'rank_test_score',
 'split0_test_score',
 'split1_test_score',
 'split2_test_score',
 'split3_test_score',
 'split4_test_score',
 'std_fit_time',
 'std_score_time',
 'std_test_score']

## LGB调参

In [18]:
# Grid-search learning rate and tree count for a LightGBM regressor (5-fold CV).
# Uses whatever train/test split is currently bound to train_data / test_data.
parameters = {'learning_rate': [0.01, 0.1, 1], 'n_estimators': [20, 40]}
clf = GridSearchCV(
    estimator=lgb.LGBMRegressor(num_leaves=31),
    param_grid=parameters,
    cv=5,
)
clf.fit(train_data, train_target)

print('Best parameters: ', clf.best_params_)

test_pred = clf.predict(test_data)
score_test = mean_squared_error(test_target, test_pred)
print('LGBMRegressor GridSearchCV test MSE: ', score_test)

Best parameters:  {'learning_rate': 0.1, 'n_estimators': 40}
LGBMRegressor GridSearchCV test MSE:  0.15143248564085332


## LGB线下验证

In [19]:
# Offline K-fold validation of LightGBM on the features as read from disk.
# The files are read again here, presumably because train_data / test_data no
# longer hold the original frames at this point in the notebook.
train_data2 = pd.read_csv('../input/zhengqi_train.txt', sep='\t', encoding='utf-8')
test_data2 = pd.read_csv('../input/zhengqi_test.txt', sep='\t', encoding='utf-8')

# Use exactly the columns present in the test file as the feature set.
train_data2_f = train_data2[test_data2.columns].values
train_data2_target = train_data2['target'].values

Folds = 5
kf = KFold(n_splits=Folds, shuffle=True, random_state=2019)
MSE_dict = {'train_mse': [], 'test_mse': []}

for i, (train_index, test_index) in enumerate(kf.split(train_data2_f)):
    # A fresh model is built for every fold.
    lgb_reg = lgb.LGBMRegressor(learning_rate=0.01,
                                max_depth=-1,
                                n_estimators=5000,
                                boosting_type='gbdt',
                                random_state=2019,
                                objective='regression')
    X_train_KFold, X_test_KFold = train_data2_f[train_index], train_data2_f[test_index]
    y_train_KFold, y_test_KFold = train_data2_target[train_index], train_data2_target[test_index]

    # NOTE: `early_stopping_rounds` and `verbose` as fit() keyword arguments
    # were removed in LightGBM 4.0; on that version pass
    # callbacks=[lgb.early_stopping(300), lgb.log_evaluation(300)] instead.
    lgb_reg.fit(X=X_train_KFold, y=y_train_KFold,
                eval_set=[(X_train_KFold, y_train_KFold), (X_test_KFold, y_test_KFold)],
                eval_names=['Train', 'Test'],
                early_stopping_rounds=300,
                eval_metric='mse',
                verbose=300)

    # Predict with the best iteration found during training.
    y_train_KFold_predict = lgb_reg.predict(X_train_KFold, num_iteration=lgb_reg.best_iteration_)
    y_test_KFold_predict = lgb_reg.predict(X_test_KFold, num_iteration=lgb_reg.best_iteration_)

    print('第{}折 训练和预测 训练MSE 预测MSE'.format(i + 1))
    train_mse = mean_squared_error(y_train_KFold, y_train_KFold_predict)
    print('--------\n', '训练MSE\n', train_mse, '\n--------')
    # The held-out fold's score is labelled as prediction MSE, not training MSE.
    test_mse = mean_squared_error(y_test_KFold, y_test_KFold_predict)
    print('--------\n', '预测MSE\n', test_mse, '\n--------')

    MSE_dict['train_mse'].append(train_mse)
    MSE_dict['test_mse'].append(test_mse)

# Per-fold scores followed by their mean, for the training and held-out folds.
print('--------\n', '训练MSE\n', MSE_dict['train_mse'], '\n', np.mean(MSE_dict['train_mse']), '\n--------')
print('--------\n', '预测MSE\n', MSE_dict['test_mse'], '\n', np.mean(MSE_dict['test_mse']), '\n--------')

Training until validation scores don't improve for 300 rounds
[300]	Train's l2: 0.0629311	Test's l2: 0.125237
[600]	Train's l2: 0.0303868	Test's l2: 0.112781
[900]	Train's l2: 0.0176066	Test's l2: 0.109759
[1200]	Train's l2: 0.0110348	Test's l2: 0.108169
[1500]	Train's l2: 0.00706786	Test's l2: 0.107126
[1800]	Train's l2: 0.0046641	Test's l2: 0.10643
[2100]	Train's l2: 0.00314878	Test's l2: 0.106064
[2400]	Train's l2: 0.00213319	Test's l2: 0.105696
[2700]	Train's l2: 0.0014622	Test's l2: 0.105423
[3000]	Train's l2: 0.00102244	Test's l2: 0.105254
[3300]	Train's l2: 0.000708255	Test's l2: 0.105162
[3600]	Train's l2: 0.000491013	Test's l2: 0.105039
[3900]	Train's l2: 0.000346455	Test's l2: 0.105009
[4200]	Train's l2: 0.000242701	Test's l2: 0.104999
[4500]	Train's l2: 0.000172218	Test's l2: 0.104959
[4800]	Train's l2: 0.000121093	Test's l2: 0.104918
Did not meet early stopping. Best iteration is:
[5000]	Train's l2: 9.62172e-05	Test's l2: 0.104888
第1折 训练和预测 训练MSE 预测MSE
--------
 训练MSE
 9.62

# 学习曲线和验证曲线

## 学习曲线