# Kaggle 之泰坦尼克号罹难者预测

In [1]:
import pandas as pd

In [2]:
train = pd.read_csv('./titanic/train.csv')
test = pd.read_csv('./titanic/test.csv')

In [3]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [5]:
# 人工选择有效的特征
select_features = ['Pclass','Sex','Age','Embarked','SibSp','Parch','Fare']

In [6]:
X_train = train[select_features]
X_test = test[select_features]

Survived 和 Age 这两个特征的数据有缺失。

In [7]:
y_train = train['Survived']

In [8]:
X_train['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [9]:
X_test['Embarked'].value_counts()

S    270
C    102
Q     46
Name: Embarked, dtype: int64

类别型的特征，使用众数来填充缺失值。

In [10]:
X_train['Embarked'].fillna('S',inplace=True)
X_test['Embarked'].fillna('S',inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


数值型的特征，使用平均值或者中位数来填充缺失值。

In [11]:
X_train['Age'].fillna(X_train['Age'].mean(),inplace=True)
X_test['Age'].fillna(X_test['Age'].mean(),inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [12]:
X_test['Fare'].fillna(X_test['Fare'].mean(),inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [13]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
Pclass      418 non-null int64
Sex         418 non-null object
Age         418 non-null float64
Embarked    418 non-null object
SibSp       418 non-null int64
Parch       418 non-null int64
Fare        418 non-null float64
dtypes: float64(2), int64(3), object(2)
memory usage: 22.9+ KB


+ 特征向量化

In [14]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,Embarked,SibSp,Parch,Fare
0,3,male,22.0,S,1,0,7.25
1,1,female,38.0,C,1,0,71.2833
2,3,female,26.0,S,0,0,7.925
3,1,female,35.0,S,1,0,53.1
4,3,male,35.0,S,0,0,8.05


In [15]:
from sklearn.feature_extraction import DictVectorizer

dict_vec = DictVectorizer(sparse=False)
X_train = dict_vec.fit_transform(X_train.to_dict(orient='record'))

In [16]:
dict_vec.feature_names_

['Age',
 'Embarked=C',
 'Embarked=Q',
 'Embarked=S',
 'Fare',
 'Parch',
 'Pclass',
 'Sex=female',
 'Sex=male',
 'SibSp']

In [17]:
X_train.shape

(891, 10)

In [18]:
X_test = dict_vec.transform(X_test.to_dict(orient='record'))

In [19]:
X_test

array([[ 34.5       ,   0.        ,   1.        , ...,   0.        ,
          1.        ,   0.        ],
       [ 47.        ,   0.        ,   0.        , ...,   1.        ,
          0.        ,   1.        ],
       [ 62.        ,   0.        ,   1.        , ...,   0.        ,
          1.        ,   0.        ],
       ..., 
       [ 38.5       ,   0.        ,   0.        , ...,   0.        ,
          1.        ,   0.        ],
       [ 30.27259036,   0.        ,   0.        , ...,   0.        ,
          1.        ,   0.        ],
       [ 30.27259036,   1.        ,   0.        , ...,   0.        ,
          1.        ,   1.        ]])

In [20]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()

In [21]:
from xgboost import XGBClassifier

xgbc = XGBClassifier()

In [22]:
from sklearn.model_selection import cross_val_score

cross_val_score(rfc,X_train,y_train,cv=5).mean()

0.80817061714235872

In [23]:
cross_val_score(xgbc,X_train,y_train,cv=5).mean()

0.81824559798311003

使用默认配置的 RandomForestClassifier 对测试数据进行预测操作。

In [24]:
rfc.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [25]:
rfc_y_predict = rfc.predict(X_test)

In [26]:
rfc_submission = pd.DataFrame({'PassengerId':test['PassengerId'],'Survived':rfc_y_predict})

In [27]:
rfc_submission.to_csv('./titanic/rfc_submission.csv',index=False)

使用默认配置的 XGBClassifier 对测试数据进行预测操作。

In [28]:
xgbc.fit(X_train,y_train)
xgbc_y_predict = rfc.predict(X_test)
xgbc_submission = pd.DataFrame({'PassengerId':test['PassengerId'],'Survived':xgbc_y_predict})
xgbc_submission.to_csv('./titanic/xgbc_submission.csv',index=False)

使用网格搜索的方式训练一个模型

In [29]:
from sklearn.grid_search import GridSearchCV

params = {'max_depth':list(range(2,7)),'n_estimators':list(range(100,1100,200)),'learning_rate':[0.05,0.1,0.25,0.5,1.0]}



In [30]:
xgbc_best = XGBClassifier()

gs = GridSearchCV(xgbc_best,params,n_jobs=-1,cv=5,verbose=-1)
gs.fit(X_train,y_train)

[Parallel(n_jobs=-1)]: Done 191 tasks      | elapsed:    8.0s
[Parallel(n_jobs=-1)]: Done 416 tasks      | elapsed:   18.0s
[Parallel(n_jobs=-1)]: Done 625 out of 625 | elapsed:   28.6s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'max_depth': [2, 3, 4, 5, 6], 'n_estimators': [100, 300, 500, 700, 900], 'learning_rate': [0.05, 0.1, 0.25, 0.5, 1.0]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=-1)

In [31]:
gs.best_score_

0.835016835016835

In [32]:
gs.best_params_

{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}

In [33]:
xgbc_best_y_predict = gs.predict(X_test)
xgbc_best_submission = pd.DataFrame({'PassengerId':test['PassengerId'],'Survived':xgbc_best_y_predict})
xgbc_best_submission.to_csv('./titanic/xgbc_best_submission.csv',index=False)