In [7]:
import pandas as pd
import numpy as np

In [8]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

In [9]:
train.head()

Unnamed: 0,id,club,league,birth_date,height_cm,weight_kg,nationality,potential,pac,sho,...,st,lw,cf,cam,cm,cdm,cb,lb,gk,y
0,0,293,25,10/4/96,177,72,78,73,65,60,...,63.0,64.0,64.0,64.0,63.0,57.0,53.0,56.0,,70.0
1,1,258,24,9/21/84,178,70,51,62,56,39,...,52.0,60.0,57.0,59.0,61.0,64.0,61.0,64.0,,24.0
2,2,112,3,6/8/99,177,69,52,68,68,57,...,56.0,54.0,55.0,53.0,45.0,34.0,31.0,36.0,,17.0
3,3,604,9,7/25/88,181,81,54,81,76,74,...,77.0,76.0,77.0,77.0,79.0,78.0,77.0,78.0,,1750.0
4,4,80,37,8/4/80,179,75,96,72,40,62,...,62.0,66.0,65.0,68.0,71.0,70.0,66.0,64.0,,97.5


In [10]:
train.shape

(10441, 65)

In [11]:
test.shape

(7000, 64)

In [12]:
train_test = pd.concat([train,test],axis=0,ignore_index=True)

In [13]:
y = train_test['y'][:10441]

In [14]:
train_test.fillna(0,inplace=True)

In [15]:
#  出生日期转化为年龄，对结果提升很大
def birthdate_age(birthdate):
    year = int(birthdate.split('/')[2])
    if year <= 18:
        return 18 - year
    else:
        return 118 - year

train_test['birth_date'] = train_test['birth_date'].apply(birthdate_age)


In [16]:
fea_columns = list(train_test.columns.values)
fea_columns.remove('gk') #空值太多
fea_columns.remove('id') #去除id

In [17]:
fea_columns.remove('y') #去除标签

In [18]:
fea_columns

['acceleration',
 'aggression',
 'agility',
 'balance',
 'ball_control',
 'birth_date',
 'cam',
 'cb',
 'cdm',
 'cf',
 'club',
 'cm',
 'crossing',
 'curve',
 'def',
 'dri',
 'dribbling',
 'finishing',
 'free_kick_accuracy',
 'gk_diving',
 'gk_handling',
 'gk_kicking',
 'gk_positioning',
 'gk_reflexes',
 'heading_accuracy',
 'height_cm',
 'interceptions',
 'international_reputation',
 'jumping',
 'lb',
 'league',
 'long_passing',
 'long_shots',
 'lw',
 'marking',
 'nationality',
 'pac',
 'pas',
 'penalties',
 'phy',
 'positioning',
 'potential',
 'preferred_foot',
 'rb',
 'reactions',
 'rw',
 'sho',
 'short_passing',
 'shot_power',
 'skill_moves',
 'sliding_tackle',
 'sprint_speed',
 'st',
 'stamina',
 'standing_tackle',
 'strength',
 'vision',
 'volleys',
 'weak_foot',
 'weight_kg',
 'work_rate_att',
 'work_rate_def']

In [19]:
data = train_test[fea_columns]
cate_columns = data.select_dtypes(include='object').columns
num_columns = data.select_dtypes(exclude='object').columns

In [20]:
len(cate_columns) #2个类别特征

2

In [21]:
len(num_columns) #60个数值型特征

60

类别型特征做OneHot编码

In [22]:
fea_cate = data[cate_columns]
fea_num = data[num_columns]
fea_cate_dummies = pd.get_dummies(fea_cate)

In [23]:
fea_cate_dummies

Unnamed: 0,work_rate_att_High,work_rate_att_Low,work_rate_att_Medium,work_rate_def_High,work_rate_def_Low,work_rate_def_Medium
0,0,0,1,0,0,1
1,0,0,1,1,0,0
2,0,0,1,0,0,1
3,1,0,0,1,0,0
4,0,0,1,0,0,1
5,0,0,1,0,0,1
6,0,0,1,0,0,1
7,0,0,1,0,0,1
8,1,0,0,1,0,0
9,0,0,1,0,0,1


数值型特征做归一化处理

In [24]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
fea_num_sca = pd.DataFrame(ss.fit_transform(fea_num))

In [25]:
fea_all = pd.concat([fea_num_sca,fea_cate_dummies],axis=1)

In [26]:
fea_all.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,56,57,58,59,work_rate_att_High,work_rate_att_Low,work_rate_att_Medium,work_rate_def_High,work_rate_def_Low,work_rate_def_Medium
0,-0.063765,-0.119991,-0.45309,0.503599,0.577706,-0.98818,0.535393,0.161335,0.305826,0.545978,...,0.125268,0.371002,0.07262,-0.484043,0,0,1,0,0,1
1,-0.541835,-0.408273,-0.316003,0.146715,0.333801,1.652298,0.291409,0.548097,0.654356,0.203776,...,0.125268,-1.169559,0.07262,-0.770196,0,0,1,1,0,0
2,0.004531,-1.388434,-0.384546,0.574976,-0.275961,-1.648299,-0.001371,-0.902261,-0.839345,0.106004,...,-0.297325,-0.199576,0.07262,-0.913272,0,0,1,0,0,1
3,0.755784,0.744857,0.437979,0.003962,1.370397,0.772139,1.169751,1.321621,1.351416,1.181495,...,1.533912,1.797446,0.07262,0.803641,1,0,0,1,0,0
4,-1.088201,0.744857,0.09526,0.789107,1.00454,2.532458,0.73058,0.789823,0.953096,0.594864,...,1.533912,0.941579,-1.446051,-0.054815,0,0,1,0,0,1


In [27]:
import xgboost as xgb
from sklearn.model_selection import train_test_split 

In [30]:
X_train, X_val, y_train, y_val = train_test_split(fea_all[:10441], y, test_size=0.2, random_state = 12)

In [31]:
from sklearn.metrics import mean_absolute_error

In [136]:
model = xgb.XGBRegressor()
model.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [137]:
y_val_pred = model.predict(X_val)

In [138]:
print('MSE： ', mean_absolute_error(y_val, y_val_pred))

MSE：  38.758691794567554


In [139]:
y_result = model.predict(fea_all[10441:])

In [28]:
sub = pd.DataFrame()
sub['id'] = test['id']
sub['y'] = y_result

In [29]:
sub.head()

Unnamed: 0,id,y
0,10441,43.304466
1,10442,83.801353
2,10443,114.583504
3,10444,126.176773
4,10445,15.017225


In [30]:
#sub.to_csv('../output/sub_xgb_baseline.csv', index=False)

### 调参
- 1.调最佳迭代次数

In [31]:
cv_params = {'n_estimators': [100, 500, 800, 1000, 1200, 1500, 2000]}
other_params = {'learning_rate': 0.1, 'n_estimators': 500, 'max_depth': 5, 'min_child_weight': 1, 'seed': 0,
                    'subsample': 0.8, 'colsample_bytree': 0.8, 'gamma': 0, 'reg_alpha': 0, 'reg_lambda': 1}   

In [32]:
from sklearn.model_selection import GridSearchCV

In [34]:
model = xgb.XGBRegressor(**other_params)
optimized_GBM = GridSearchCV(estimator=model, param_grid=cv_params, scoring='r2', cv=5, verbose=1,n_jobs=-4)
optimized_GBM.fit(X_train, y_train)

Fitting 5 folds for each of 7 candidates, totalling 35 fits


[Parallel(n_jobs=-4)]: Done  35 out of  35 | elapsed:  2.3min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=500,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=0, silent=True,
       subsample=0.8),
       fit_params=None, iid=True, n_jobs=-4,
       param_grid={'n_estimators': [100, 500, 800, 1000, 1200, 1500, 2000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='r2', verbose=1)

In [35]:
evalute_result = optimized_GBM.grid_scores_
print('每轮迭代运行结果:{0}'.format(evalute_result))
print('参数的最佳取值：{0}'.format(optimized_GBM.best_params_))
print('最佳模型得分:{0}'.format(optimized_GBM.best_score_))

每轮迭代运行结果:[mean: 0.93703, std: 0.00581, params: {'n_estimators': 100}, mean: 0.93943, std: 0.00578, params: {'n_estimators': 500}, mean: 0.93951, std: 0.00580, params: {'n_estimators': 800}, mean: 0.93952, std: 0.00580, params: {'n_estimators': 1000}, mean: 0.93952, std: 0.00578, params: {'n_estimators': 1200}, mean: 0.93953, std: 0.00577, params: {'n_estimators': 1500}, mean: 0.93953, std: 0.00577, params: {'n_estimators': 2000}]
参数的最佳取值：{'n_estimators': 2000}
最佳模型得分:0.9395313913587561




In [30]:
cv_params = {'n_estimators': [1000,1500,1700,1800,1900,2000,2100,2200,2300]}
other_params = {'learning_rate': 0.1, 'n_estimators': 2000, 'max_depth': 5, 'min_child_weight': 1, 'seed': 0,
                    'subsample': 0.8, 'colsample_bytree': 0.8, 'gamma': 0, 'reg_alpha': 0, 'reg_lambda': 1} 
model = xgb.XGBRegressor(**other_params)
optimized_GBM = GridSearchCV(estimator=model, param_grid=cv_params, scoring='r2', cv=5, verbose=1,n_jobs=-4)
optimized_GBM.fit(X_train, y_train)

evalute_result = optimized_GBM.grid_scores_
print('参数的最佳取值：{0}'.format(optimized_GBM.best_params_))
print('最佳模型得分:{0}'.format(optimized_GBM.best_score_))

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-4)]: Done  45 out of  45 | elapsed:  5.4min finished


参数的最佳取值：{'n_estimators': 1500}
最佳模型得分:0.9649090666946577




- 2.调min_child_weight以及max_depth

In [41]:
cv_params = {'max_depth': [3, 4, 5, 6, 7, 8, 9, 10], 'min_child_weight': [1, 2, 3, 4, 5, 6]}
other_params = {'learning_rate': 0.1, 'n_estimators': 1700, 'max_depth': 5, 'min_child_weight': 1, 'seed': 0,
                    'subsample': 0.8, 'colsample_bytree': 0.8, 'gamma': 0, 'reg_alpha': 0, 'reg_lambda': 1}
model = xgb.XGBRegressor(**other_params)
optimized_GBM = GridSearchCV(estimator=model, param_grid=cv_params, scoring='r2', cv=5, verbose=1,n_jobs=-4)
optimized_GBM.fit(X_train, y_train)

evalute_result = optimized_GBM.grid_scores_
print('参数的最佳取值：{0}'.format(optimized_GBM.best_params_))
print('最佳模型得分:{0}'.format(optimized_GBM.best_score_))

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-4)]: Done  40 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-4)]: Done 190 tasks      | elapsed: 24.8min
[Parallel(n_jobs=-4)]: Done 240 out of 240 | elapsed: 34.8min finished


参数的最佳取值：{'max_depth': 5, 'min_child_weight': 6}
最佳模型得分:0.9442953026266931




In [43]:
y_val_pred = optimized_GBM.predict(X_val)
print('MSE： ', mean_absolute_error(y_val, y_val_pred))

MSE：  36.713027542489975


- 3.调gamma

In [44]:
cv_params = {'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]}
other_params = {'learning_rate': 0.1, 'n_estimators': 1700, 'max_depth': 5, 'min_child_weight': 6, 'seed': 0,
                    'subsample': 0.8, 'colsample_bytree': 0.8, 'gamma': 0, 'reg_alpha': 0, 'reg_lambda': 1}
model = xgb.XGBRegressor(**other_params)
optimized_GBM = GridSearchCV(estimator=model, param_grid=cv_params, scoring='r2', cv=5, verbose=1,n_jobs=-4)
optimized_GBM.fit(X_train, y_train)

evalute_result = optimized_GBM.grid_scores_
print('参数的最佳取值：{0}'.format(optimized_GBM.best_params_))
print('最佳模型得分:{0}'.format(optimized_GBM.best_score_))


Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-4)]: Done  30 out of  30 | elapsed:  3.4min finished


参数的最佳取值：{'gamma': 0.2}
最佳模型得分:0.9443097573252244




NameError: name 'optized_GBM' is not defined

In [45]:
y_val_pred = optimized_GBM.predict(X_val)
print('MSE： ', mean_absolute_error(y_val, y_val_pred))

MSE：  36.70101845795141


- 4.调subsample colsample_bytree

In [46]:
cv_params = {'subsample': [0.6, 0.7, 0.8, 0.9], 'colsample_bytree': [0.6, 0.7, 0.8, 0.9]}
other_params = {'learning_rate': 0.1, 'n_estimators': 1700, 'max_depth': 5, 'min_child_weight': 6, 'seed': 0,
                    'subsample': 0.8, 'colsample_bytree': 0.8, 'gamma': 0.2, 'reg_alpha': 0, 'reg_lambda': 1}
model = xgb.XGBRegressor(**other_params)
optimized_GBM = GridSearchCV(estimator=model, param_grid=cv_params, scoring='r2', cv=5, verbose=1,n_jobs=-4)
optimized_GBM.fit(X_train, y_train)

evalute_result = optimized_GBM.grid_scores_
print('参数的最佳取值：{0}'.format(optimized_GBM.best_params_))
print('最佳模型得分:{0}'.format(optimized_GBM.best_score_))

y_val_pred = optimized_GBM.predict(X_val)
print('MSE： ', mean_absolute_error(y_val, y_val_pred))

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-4)]: Done  40 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-4)]: Done  80 out of  80 | elapsed:  8.6min finished


参数的最佳取值：{'colsample_bytree': 0.8, 'subsample': 0.8}
最佳模型得分:0.9443097573252244
MSE：  36.70101845795141




- 5.调reg_alpha  reg_lambda

In [None]:
cv_params = {'reg_alpha': [0.05, 0.1, 1, 2, 3], 'reg_lambda': [0.05, 0.1, 1, 2, 3]}
other_params = {'learning_rate': 0.1, 'n_estimators': 1700, 'max_depth': 4, 'min_child_weight': 5, 'seed': 0,
                    'subsample': 0.7, 'colsample_bytree': 0.7, 'gamma': 0.1, 'reg_alpha': 0, 'reg_lambda': 1}

model = xgb.XGBRegressor(**other_params)
optimized_GBM = GridSearchCV(estimator=model, param_grid=cv_params, scoring='r2', cv=5, verbose=1,n_jobs=-4)
optimized_GBM.fit(X_train, y_train)

evalute_result = optimized_GBM.grid_scores_
print('参数的最佳取值：{0}'.format(optimized_GBM.best_params_))
print('最佳模型得分:{0}'.format(optimized_GBM.best_score_))

y_val_pred = optized_GBM.predict(X_val)
print('MSE： ', mean_absolute_error(y_val, y_val_pred))

- 6.调learning_rate

In [31]:
cv_params = {'learning_rate': [0.01, 0.05, 0.07, 0.1, 0.2] }
other_params = {'learning_rate': 0.1, 'n_estimators': 1500, 'max_depth': 5, 'min_child_weight': 6, 'seed': 0,
                    'subsample': 0.8, 'colsample_bytree': 0.7, 'gamma': 0.8, 'reg_alpha': 1, 'reg_lambda': 1}

model = xgb.XGBRegressor(**other_params)
optimized_GBM = GridSearchCV(estimator=model, param_grid=cv_params, scoring='r2', cv=5, verbose=1,n_jobs=-4)
optimized_GBM.fit(X_train, y_train)

evalute_result = optimized_GBM.grid_scores_
print('参数的最佳取值：{0}'.format(optimized_GBM.best_params_))
print('最佳模型得分:{0}'.format(optimized_GBM.best_score_))



Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-4)]: Done  25 out of  25 | elapsed:  2.2min finished


参数的最佳取值：{'learning_rate': 0.01}
最佳模型得分:0.9679119228356377




NameError: name 'optized_GBM' is not defined

### 最佳参数

In [33]:
other_params = {'learning_rate': 0.01, 'n_estimators': 1500, 'max_depth': 5, 'min_child_weight': 6, 'seed': 0,
                    'subsample': 0.8, 'colsample_bytree': 0.8, 'gamma': 0.8, 'reg_alpha': 1, 'reg_lambda': 1}
model = xgb.XGBRegressor(**other_params)

In [34]:
model.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0.8, learning_rate=0.01,
       max_delta_step=0, max_depth=5, min_child_weight=6, missing=None,
       n_estimators=1500, n_jobs=1, nthread=None, objective='reg:linear',
       random_state=0, reg_alpha=1, reg_lambda=1, scale_pos_weight=1,
       seed=0, silent=True, subsample=0.8)

In [48]:
y_val_pred = model.predict(X_val)
print('MSE： ', mean_absolute_error(y_val, y_val_pred))

MSE：  28.474467726316448


In [147]:
param_dist = {
        'n_estimators':range(80,200,4),
        'max_depth':range(2,15,1),
        'learning_rate':np.linspace(0.01,2,20),
        'subsample':np.linspace(0.7,0.9,20),
        'colsample_bytree':np.linspace(0.5,0.98,10),
        'min_child_weight':range(1,9,1)
        }

In [None]:
grid = GridSearchCV(estimator=model,param_grid=param_dist, cv=3, n_jobs=-1, scoring='r2')
grid.fit(X_train, y_train)

In [None]:
best_estimator = grid.best_estimator_
print(best_estimator)
#输出最优训练器的精度
print(grid.best_score_)

In [None]:
y_val_pred = grid.predict(X_val)
print('MSE： ', mean_absolute_error(y_val, y_val_pred))

In [None]:
from sklearn.grid_search import RandomizedSearchCV
grid_r = RandomizedSearchCV(estimator=model,param_grid=param_dist, cv=3, n_jobs=-1, scoring='r2')
grid_r.fit(X_train, y_train)

best_estimator = grid_r.best_estimator_
print(best_estimator)
#输出最优训练器的精度
print(grid_r.best_score_)

In [None]:
y_val_pred = grid_r.predict(X_val)
print('MSE： ', mean_absolute_error(y_val, y_val_pred))

### 模型融合

In [49]:
from sklearn.linear_model import BayesianRidge, LinearRegression, ElasticNet, Ridge
from sklearn.svm import SVR
from sklearn.ensemble.gradient_boosting import GradientBoostingRegressor

In [40]:
model_br = BayesianRidge()
model_lr = LinearRegression()
model_en = ElasticNet()
model_svr = SVR()
model_gbr = GradientBoostingRegressor()

In [41]:
model_br.fit(X_train, y_train)
y_val_pred = model_br.predict(X_val)
print('MSE： ', mean_absolute_error(y_val, y_val_pred))

MSE：  153.8463547724676


In [42]:
model_lr.fit(X_train, y_train)
y_val_pred = model_lr.predict(X_val)
print('MSE： ', mean_absolute_error(y_val, y_val_pred))

MSE：  154.1376045433524


In [43]:
model_en.fit(X_train, y_train)
y_val_pred = model_en.predict(X_val)
print('MSE： ', mean_absolute_error(y_val, y_val_pred))

MSE：  151.11282100445635


In [44]:
model_svr.fit(X_train, y_train)
y_val_pred = model_svr.predict(X_val)
print('MSE： ', mean_absolute_error(y_val, y_val_pred))

MSE：  142.20890560155829


In [45]:
model_gbr.fit(X_train, y_train)
y_val_pred = model_gbr.predict(X_val)
print('MSE： ', mean_absolute_error(y_val, y_val_pred))

MSE：  37.6534574810567


In [46]:
import lightgbm as lgb
model_lgb = lgb.LGBMRegressor()
model_lgb.fit(X_train, y_train)
y_val_pred = model_lgb.predict(X_val)
print('MSE： ', mean_absolute_error(y_val, y_val_pred))

MSE：  30.225785366748426


In [57]:
from mlxtend.regressor import StackingRegressor
stack = StackingRegressor(regressors=[model,model_gbr,model_lgb], meta_regressor= model)
stack.fit(X_train, y_train)


StackingRegressor(meta_regressor=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0.8, learning_rate=0.01,
       max_delta_step=0, max_depth=5, min_child_weight=6, missing=None,
       n_estimators=1500, n_jobs=1, nthread=None, objective='reg:linear',
       random_state=0, reg_alpha=1, reg_lambda=1, scale_pos_weight=1,
       seed=0, silent=True, subsample=0.8),
         refit=True,
         regressors=[XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0.8, learning_rate=0.01,
       max_delta_step=0, max_depth=5, min_child_weight=6, missing=None,
       n_estimators=1500, n_jobs=1, nthread=None, objective='reg:linear',
       random_...0.0, reg_lambda=0.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=1)],
         store_train_meta_features=False, verbose=0)

In [58]:
y_val_pred = stack.predict(X_val)
print('MSE： ', mean_absolute_error(y_val, y_val_pred))

MSE：  27.88984798734527


In [36]:
y_result = stack.predict(fea_all[10441:])
sub = pd.DataFrame()
sub['id'] = test['id']
sub['y'] = y_result

In [37]:
sub.head()

Unnamed: 0,id,y
0,10441,46.923302
1,10442,60.060925
2,10443,153.981903
3,10444,120.96286
4,10445,10.767762


In [36]:
sub.to_csv('../data/sub_xgb_bs2.csv', index=False)