In [1]:
import numpy as np
import pandas as pd
import os
import time
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
path_folder=os.path.normcase(r'C:\Users\dell\Downloads\Titanic-maching-learning-from-disaster')
train_data=pd.read_csv(os.path.join(path_folder,'train.csv'),sep=',')
test_data_origin=pd.read_csv(os.path.join(path_folder,'test.csv'),sep=',')

In [3]:
def deal_ticket_fare(df):
    '''
    计算每张票有几人共享，以及每人花费的船票钱
    
    paramters:
        df--dataframe，待处理的数据表
        
    return:
        df_count--处理后，添加新列的数据表
    '''
    num_of_tickets=df[['Ticket']].groupby(df['Ticket']).count()
    num_of_tickets.columns=['num_of_tickets']
    df_count=df.merge(num_of_tickets,left_on='Ticket',right_index=True,how='left')
    df_count['fare_per_ticket']=df_count['Fare']/df_count['num_of_tickets']
    return df_count

In [4]:
# 构造 数据预处理 流水线
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator,TransformerMixin

class DataFrameSelector(BaseEstimator,TransformerMixin):
    def __init__(self,attrs_name_list):
        self.attrs_name_list=attrs_name_list
    def fit(self,X,y=None):
        return self
    def transform(self,X):
        return X[self.attrs_name_list].values

def data_preparation(cat_attributes,interval_num_attributes,ratio_num_attributes):
    '''
    构造处理DataFrame 的 类
    
    Arguments:
        cat_attributes ---         标称数据列集合
        interval_num_attributes--- 标度数值列集合/不需进行标准化的列集合
        ratio_num_attributes   --- 比例数值列集合/需进行 标准化 列集合
       
    Return 
        full_pipeline---类class
    '''
    cat_attrs=cat_attributes    
    interval_num_attrs=interval_num_attributes
    ratio_num_attrs=ratio_num_attributes
    transformer_list=[]
    
    if cat_attrs:
        cat_pipeline=Pipeline([('cat_dfs',DataFrameSelector(cat_attrs)),('impute',SimpleImputer(strategy='most_frequent')),\
                               ('onehotencoder',OneHotEncoder())])
        transformer_list.append(('cat_pipeline',cat_pipeline))
        
    if interval_num_attrs:
        interval_num_pipeline=Pipeline([('dfs',DataFrameSelector(interval_num_attrs)),('impute',SimpleImputer(strategy='median')) ])
        transformer_list.append(('interval_num_pipeline',interval_num_pipeline))
        
    if ratio_num_attrs:    
        ratio_num_pipeline=Pipeline([('dfs',DataFrameSelector(ratio_num_attrs)),('impute',SimpleImputer(strategy='median')),\
                                     ('std_scaler',StandardScaler())])
        transformer_list.append(('ratio_num_pipeline',ratio_num_pipeline))
    full_pipeline=FeatureUnion(transformer_list=transformer_list)
    return full_pipeline

In [5]:
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,n_jobs=None, train_sizes=np.linspace(0.1, 1.0, 10)):
    from sklearn.model_selection import learning_curve
    
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)#train_scores size (n_ticks,n_cv_folds)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 's-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    return plt

In [6]:
# 计算 预测准确率
def compute_acc(y,y_pred):
    y_pred_class=np.where(y_pred>=0.5,1,0)
    pred_accuracy=(y==y_pred_class).sum()/len(y)    
    return pred_accuracy

In [7]:
# scale 数值数据 数据预处理
train_data_count=deal_ticket_fare(train_data)
from sklearn.model_selection import train_test_split
#train_data_index,dev_data_index=train_test_split(train_data_count.index.tolist(),train_size=0.7,test_size=0.3,random_state=42)
#train_df_data=train_data_count.iloc[train_data_index,:]
#dev_df_data=train_data_count.iloc[dev_data_index,:]
train_df_data,dev_df_data=train_test_split(train_data_count,train_size=0.7,test_size=0.3,random_state=42)
train_data_index=train_df_data.index.tolist()
dev_data_index=dev_df_data.index.tolist()
cat_attrs=['Sex','Embarked']    
interval_num_attrs=['Pclass','SibSp','Parch','num_of_tickets']
ratio_num_attrs=['Age','Fare','fare_per_ticket']
full_pipeline_std=data_preparation(cat_attrs,interval_num_attrs,ratio_num_attrs)
train_data_X=full_pipeline_std.fit_transform(train_df_data).toarray()
train_data_y=train_df_data['Survived'].values.ravel()
train_data_X[0]

array([ 0.        ,  1.        ,  0.        ,  0.        ,  1.        ,
        1.        ,  0.        ,  2.        ,  1.        , -1.91971935,
        0.98099823,  3.53619915])

In [None]:
cols=['Sex_T','Embarked_T','Age','Fare','fare_per_ticket','num_of_tickets','Pclass','SibSp','Parch']

In [34]:
train_data_count.head(1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,num_of_tickets,fare_per_ticket,Sex_T,Embarked_T
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1,7.25,0,0


In [33]:
#train_data_count=deal_ticket_fare(train_data)
train_data_count.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 16 columns):
PassengerId        891 non-null int64
Survived           891 non-null int64
Pclass             891 non-null int64
Name               891 non-null object
Sex                891 non-null object
Age                891 non-null float64
SibSp              891 non-null int64
Parch              891 non-null int64
Ticket             891 non-null object
Fare               891 non-null float64
Cabin              204 non-null object
Embarked           891 non-null object
num_of_tickets     891 non-null int64
fare_per_ticket    891 non-null float64
Sex_T              891 non-null int64
Embarked_T         891 non-null int32
dtypes: float64(3), int32(1), int64(7), object(5)
memory usage: 108.0+ KB


In [35]:
#  categorical type data
trans_sex={'male':0,'female':1}
trans_embarked={'S':0,'C':1,'Q':2}
train_data_count['Sex_T']=train_data_count['Sex'].map(trans_sex)
#from sklearn.impute import MissingIndicator
#indicator=MissingIndicator(missing_values=np.nan)
#train_missing_indicator=indicator.fit_transform(train_data_count)
from sklearn.impute import SimpleImputer
imp1=SimpleImputer(missing_values=np.nan,strategy='most_frequent')
train_data_count['Embarked']=imp1.fit_transform(train_data_count[['Embarked']])
imp2=SimpleImputer(missing_values=np.nan,strategy='mean')
train_data_count['Age']=imp2.fit_transform(train_data_count[['Age']])
train_data_count['Embarked_T']=train_data_count['Embarked'].map(trans_embarked).astype(np.int)
cols=['Sex_T','Embarked_T','Age','Fare','fare_per_ticket','num_of_tickets','Pclass','SibSp','Parch']
X_train_data=train_data_count.reindex(columns=cols).values
y_train_data=train_data_count['Survived'].ravel()

In [8]:
test_data_count=deal_ticket_fare(test_data_origin)
test_data_X=full_pipeline_std.transform(test_data_count).toarray()

In [9]:
dev_data_X=full_pipeline_std.transform(dev_df_data).toarray()
dev_data_y=dev_df_data['Survived'].values.ravel()
dev_data_X[0]

array([ 0.        ,  1.        ,  1.        ,  0.        ,  0.        ,
        3.        ,  1.        ,  1.        ,  2.        , -0.0772525 ,
       -0.32547755, -0.49449441])

In [10]:
import lightgbm as lgb
# 准备数据
train_data=lgb.Dataset(train_data_X,train_data_y)
valid_data=lgb.Dataset(dev_data_X,dev_data_y,reference=train_data)

In [113]:
# setting parameters  dict type
param={'num_leaves':14,'num_iterations':40,'objective':'binary'}
param['metric']='binary_error'

In [122]:
# setting parameters  dict type
param={'num_leaves':15,'num_iterations':154,'objective':'binary'}
param['metric']='binary_error'

In [123]:
# training 
num_boost_round=10
bst=lgb.train(param,train_data,num_boost_round,valid_sets=[valid_data],verbose_eval=50)



[50]	valid_0's binary_error: 0.186567
[100]	valid_0's binary_error: 0.197761
[150]	valid_0's binary_error: 0.19403


In [124]:
bst.num_trees()

154

In [129]:
train_pred_y=bst.predict(data_X_train)
compute_acc(data_y_train,train_pred_y)

0.8585858585858586

In [131]:
dev_y_pred=bst.predict(dev_data_X)
compute_acc(dev_data_y,dev_y_pred)

0.8097014925373134

In [57]:
test_y_pred=bst.predict(test_data_X)

In [58]:
test_data_origin['Survived']=np.where(test_y_pred>=0.5,1,0)

In [59]:
test_data_origin[['PassengerId','Survived']].to_csv('c:/users/dell/desktop/gender_submission.csv',index=False)

In [52]:
bst.params

{'metric': 'binary_error', 'num_leaves': 15, 'objective': 'binary'}

In [64]:
# scikit api 参数调优
data_X_train=full_pipeline_std.fit_transform(train_data_count).toarray()
data_y_train=train_data_count['Survived'].values.ravel()
data_X_train[0]

array([ 0.        ,  1.        ,  0.        ,  0.        ,  1.        ,
        3.        ,  1.        ,  0.        ,  1.        , -0.56573646,
       -0.50244517, -0.49697568])

In [83]:
data_train=lgb.Dataset(data_X_train,data_y_train,silent=True)
params={'boosting_type':'gbdt',
        'objective':'binary',
        'learning_rate':0.1,
        'num_leaves':31,
        'max_depth':5,        
       }
cv_results=lgb.cv(params,data_train,num_boost_round=1000,nfold=5,stratified=True,shuffle=True,metrics='binary_error',early_stopping_rounds=50,\
                  verbose_eval=10,show_stdv=True,seed=0)
print ('\nbest n_estimators:',len(cv_results['binary_error-mean']))
print ('best cv-score:',cv_results['binary_error-mean'][-1])

[10]	cv_agg's binary_error: 0.189645 + 0.0259372
[20]	cv_agg's binary_error: 0.180701 + 0.0274235
[30]	cv_agg's binary_error: 0.176193 + 0.021359
[40]	cv_agg's binary_error: 0.178428 + 0.0252489
[50]	cv_agg's binary_error: 0.173952 + 0.0149303
[60]	cv_agg's binary_error: 0.166087 + 0.0170612
[70]	cv_agg's binary_error: 0.164976 + 0.0189233
[80]	cv_agg's binary_error: 0.158241 + 0.0147091
[90]	cv_agg's binary_error: 0.166106 + 0.0130923
[100]	cv_agg's binary_error: 0.162716 + 0.0124947
[110]	cv_agg's binary_error: 0.161586 + 0.0144549
[120]	cv_agg's binary_error: 0.162716 + 0.011969
[130]	cv_agg's binary_error: 0.158228 + 0.0136617
[140]	cv_agg's binary_error: 0.160475 + 0.0112666
[150]	cv_agg's binary_error: 0.161593 + 0.0126621
[160]	cv_agg's binary_error: 0.162716 + 0.0143578
[170]	cv_agg's binary_error: 0.160482 + 0.0124245
[180]	cv_agg's binary_error: 0.163859 + 0.0124669
[190]	cv_agg's binary_error: 0.161612 + 0.0113643
[200]	cv_agg's binary_error: 0.163859 + 0.0124593

best n_est

In [82]:
cv_results.keys()

dict_keys(['binary_error-mean', 'binary_error-stdv'])

In [85]:
#cv_results.items()
# 第一次 search 粗略
from sklearn.model_selection import GridSearchCV
model_lgb=lgb.LGBMClassifier(boosting_type='gbdt',num_leaves=31,max_depth=5,learning_rate=0.1,n_estimators=154,objective='binary',\
                             n_jobs=-1,silent=True,importance_type='split')
param_grid1={'num_leaves':[15,28,31,45,64],'max_depth':[4,5,6]}
gs1=GridSearchCV(model_lgb,param_grid=param_grid1,scoring='accuracy',iid='warn',n_jobs=-1,cv=5)

In [86]:
gs1.fit(data_X_train,data_y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=5,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=154, n_jobs=-1, num_leaves=31, objective='binary',
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'num_leaves': [15, 28, 31, 45, 64], 'max_depth': [4, 5, 6]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [88]:
gs1.best_score_,gs1.best_estimator_

(0.8338945005611672,
 LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
         importance_type='split', learning_rate=0.1, max_depth=5,
         min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
         n_estimators=154, n_jobs=-1, num_leaves=15, objective='binary',
         random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
         subsample=1.0, subsample_for_bin=200000, subsample_freq=0))

In [130]:
# K折交叉验证
from sklearn.model_selection import cross_val_score
scores=cross_val_score(gs1,data_X_train,data_y_train,cv=5,n_jobs=-1,scoring='accuracy')
scores

array([0.80446927, 0.79888268, 0.84831461, 0.82022472, 0.85310734])

In [92]:
scores.mean()

0.8249997251565455

In [94]:
# 第二次 search 精细
param_grid2={'num_leaves':[11,13,15,17,19],'max_depth':[4,5]}
gs2=GridSearchCV(model_lgb,param_grid=param_grid2,scoring='accuracy',iid='warn',cv=5)
gs2.fit(data_X_train,data_y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=5,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=154, n_jobs=-1, num_leaves=31, objective='binary',
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'num_leaves': [11, 13, 15, 17, 19], 'max_depth': [4, 5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [95]:
gs2.best_score_,gs2.best_estimator_

(0.8338945005611672,
 LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
         importance_type='split', learning_rate=0.1, max_depth=5,
         min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
         n_estimators=154, n_jobs=-1, num_leaves=15, objective='binary',
         random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
         subsample=1.0, subsample_for_bin=200000, subsample_freq=0))

In [97]:
scores=cross_val_score(gs2,data_X_train,data_y_train,cv=5,scoring='accuracy')
scores,scores.mean()

(array([0.79329609, 0.80446927, 0.85393258, 0.82022472, 0.85310734]),
 0.8250060022264092)

In [103]:
test_y_lgb_pred=gs2.predict(test_data_X)

In [104]:
test_data_origin['Survived']=np.where(test_y_lgb_pred.ravel()>0.5,1,0)
test_data_origin[['PassengerId','Survived']].to_csv('c:/users/dell/desktop/gender_submission.csv',index=False)

In [108]:
param_grid3={'num_leaves':[14,15,16],"n_estimators":[40,151]}
gs3=GridSearchCV(model_lgb,param_grid=param_grid3,scoring='accuracy',iid='warn',cv=5)
gs3.fit(data_X_train,data_y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=5,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=154, n_jobs=-1, num_leaves=31, objective='binary',
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'num_leaves': [14, 15, 16], 'n_estimators': [40, 151]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [109]:
gs3.best_score_,gs3.best_estimator_

(0.8327721661054994,
 LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
         importance_type='split', learning_rate=0.1, max_depth=5,
         min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
         n_estimators=151, n_jobs=-1, num_leaves=15, objective='binary',
         random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
         subsample=1.0, subsample_for_bin=200000, subsample_freq=0))

尽管模型已经过拟合，但仍然没有达到我要的精度要求，在这种情况下，是什么问题呢？考虑数据的特征工程