# 泰坦尼克号幸存者分类预测问题

In [8]:
import numpy as np
import pandas as pd

data_train = pd.read_csv("../input/train.csv")
data_test = pd.read_csv("../input/test.csv")

In [9]:
data_train.shape, data_test.shape

((891, 12), (418, 11))

In [10]:
y = data_train.Survived.ravel()
ID = data_test.PassengerId

In [11]:
data_train.drop(['PassengerId', 'Survived'], axis=1, inplace=True)
data_test.drop(['PassengerId'], axis=1, inplace=True)

## 数据合并

In [12]:
combination = pd.concat([data_train, data_test], axis=0, sort=False)
combination.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [13]:
combination.shape

(1309, 10)

## 类型转换

In [15]:
combination['Pclass'] = combination['Pclass'].astype(str)

## 填充缺失值

In [16]:
combination.isnull().sum()

Pclass         0
Name           0
Sex            0
Age          263
SibSp          0
Parch          0
Ticket         0
Fare           1
Cabin       1014
Embarked       2
dtype: int64

### 填充 Fare

In [17]:
combination.Fare.fillna(combination.Fare.mean(), inplace=True)

### 填充 Age

In [18]:
# 把已有的数值型特征取出来丢进 Random Forest Regressor中
df_age = combination[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
# 乘客分成已知年龄和未知年龄两部分
known_age = df_age[df_age.Age.notnull()]
unknown_age = df_age[df_age.Age.isnull()]
# y 即目标年龄
y_for_age = known_age['Age']

# X 即特征属性值
X_train_for_age = known_age.drop(['Age'], axis=1)
X_test_for_age = unknown_age.drop(['Age'], axis=1)

In [19]:
from sklearn.ensemble import RandomForestRegressor

rfr = RandomForestRegressor(random_state=0, n_estimators=2000, n_jobs=-1)
rfr.fit(X_train_for_age, y_for_age)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=2000, n_jobs=-1,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [21]:
# 用得到的模型进行未知年龄结果预测
y_pred_age = rfr.predict(X_test_for_age)
# 用得到的预测结果填补原缺失数据
combination.loc[combination.Age.isnull(), 'Age'] = y_pred_age

### 填充 Cabin

In [22]:
combination.loc[combination.Cabin.notnull(), 'Cabin'] = 'yes'
combination.loc[combination.Cabin.isnull(), 'Cabin'] = 'no'

### 填充 Embarked

In [24]:
combination.Embarked.isnull().sum()

2

In [25]:
combination.Embarked.fillna(combination.Embarked.mode()[0], inplace=True)

In [26]:
combination.Embarked.isnull().sum()

0

## 特征提取

### 离散型特征提取

从 Name 字段提取 Title。

+ findall 返回的是 list。

In [27]:
import re

combination['Title'] = combination['Name'].map(
    lambda x: re.compile(", (.*?)\.").findall(x)[0])

In [28]:
dict.fromkeys(['Capt', 'Col', 'Major', 'Dr', 'Rev'], 'Officer')

{'Capt': 'Officer',
 'Col': 'Officer',
 'Dr': 'Officer',
 'Major': 'Officer',
 'Rev': 'Officer'}

In [29]:
title_Dict = {}
title_Dict.update(dict.fromkeys(['Capt', 'Col', 'Major', 'Dr', 'Rev'], 'Officer'))
title_Dict.update(dict.fromkeys(['Don', 'Sir', 'the Countess', 'Dona', 'Lady'], 'Royalty'))
title_Dict.update(dict.fromkeys(['Mme', 'Ms', 'Mrs'], 'Mrs'))
title_Dict.update(dict.fromkeys(['Mlle', 'Miss'], 'Miss'))
title_Dict.update(dict.fromkeys(['Mr'], 'Mr'))
title_Dict.update(dict.fromkeys(['Master', 'Jonkheer'], 'Master'))

In [30]:
title_Dict

{'Capt': 'Officer',
 'Col': 'Officer',
 'Don': 'Royalty',
 'Dona': 'Royalty',
 'Dr': 'Officer',
 'Jonkheer': 'Master',
 'Lady': 'Royalty',
 'Major': 'Officer',
 'Master': 'Master',
 'Miss': 'Miss',
 'Mlle': 'Miss',
 'Mme': 'Mrs',
 'Mr': 'Mr',
 'Mrs': 'Mrs',
 'Ms': 'Mrs',
 'Rev': 'Officer',
 'Sir': 'Royalty',
 'the Countess': 'Royalty'}

In [31]:
combination['Title'] = combination['Title'].map(title_Dict)
combination['Title'].head()

0      Mr
1     Mrs
2    Miss
3     Mrs
4      Mr
Name: Title, dtype: object

### 连续型变量分箱处理（变成离散型变量）

这里要处理两个变量：Age 和 Fare。

+ Age 等距离分箱
+ Fare 等频率分箱

In [32]:
combination.Age.isnull().sum()

0

In [33]:
bins = [0, 12, 18, 65, 100]
combination['Age_bin'] = pd.cut(combination['Age'], bins)

In [34]:
# qcut 等频率分箱
combination['Fare_bin'], bins = pd.qcut(combination['Fare'], 5, retbins=True)

### 提取 Family_Size 变量

In [35]:
combination['Family_Size'] = combination['Parch'] + combination['SibSp'] + 1

## 离散型变量独热编码处理

In [36]:
category_variable = [
    'Title', 'Cabin', 'Sex', 'Pclass', 'Embarked', 'Age_bin', 'Fare_bin'
]

In [37]:
category_variable_dummies = pd.get_dummies(
    combination[category_variable], drop_first=True)

## 删掉没有用的特征

In [38]:
combination.drop(category_variable, inplace=True, axis=1)

In [39]:
combination.drop(
    ['Age', 'Name', 'Ticket', 'SibSp', 'Parch'], axis=1, inplace=True)

In [40]:
combination.shape

(1309, 2)

In [41]:
combination = pd.concat(
    [combination, category_variable_dummies], sort=False, axis=1)

## 分离训练集与测试集

In [42]:
train_size = data_train.shape[0]

In [43]:
X = combination.iloc[:train_size, :]
data_for_pred = combination.iloc[train_size:, :]

这一步其实应该称为分离验证集。

In [49]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=666)

In [51]:
from sklearn.linear_model import LogisticRegression


lg = LogisticRegression() # C=1.0, penalty='l1', tol=1e-6
lg.fit(X_train, y_train)
lg.score(X_test, y_test)



0.8156424581005587

In [54]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

dtc = DecisionTreeClassifier()

bagging_dtc = BaggingClassifier(
    dtc,
    n_estimators=500,
    max_samples=0.8,
    max_features=1.0,
    bootstrap=True,
    bootstrap_features=False,
    n_jobs=-1)
bagging_dtc.fit(X_train, y_train)

y_pred = bagging_dtc.predict(X_test)  # 返回的是验证集的预测标签
bagging_dtc.score(X_test, y_test)

0.7821229050279329

In [55]:
# 计算正确率
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_pred)

0.7821229050279329

## 使用 xgboost

In [58]:
from xgboost import XGBClassifier

xgbc = XGBClassifier(learning_rate=0.5)
xgbc.fit(X_train.values, y_train)
y_pred_xgbc = xgbc.predict(X_test.values)

In [59]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_pred_xgbc)

0.8044692737430168

使用全部的数据集训练数据

In [60]:
X.columns

Index(['Fare', 'Family_Size', 'Title_Miss', 'Title_Mr', 'Title_Mrs',
       'Title_Officer', 'Title_Royalty', 'Cabin_yes', 'Sex_male', 'Pclass_2',
       'Pclass_3', 'Embarked_Q', 'Embarked_S', 'Age_bin_(12, 18]',
       'Age_bin_(18, 65]', 'Age_bin_(65, 100]', 'Fare_bin_(7.854, 10.5]',
       'Fare_bin_(10.5, 21.679]', 'Fare_bin_(21.679, 41.579]',
       'Fare_bin_(41.579, 512.329]'],
      dtype='object')

In [61]:
from xgboost import XGBClassifier

xgbc_best = XGBClassifier()
xgbc_best.fit(X.values, y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [62]:
from sklearn.ensemble import BaggingRegressor
from sklearn import linear_model

# 用正则取出我们要的属性值,其实就是过滤掉了PassengerID列  但是这里好像把age 和fare  embark也过滤掉了  什么鬼 #对的，因为age 和fare用 Age_scale和fare_scale特征代替了
train_df = df.filter(
    regex=
    'Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Sex_.*|Pclass_.*|Title_.*|Family_Size|Embarked_.*'
)
train_np = train_df.as_matrix(
)  #r如果这里不转成矩阵那么就不能用[:,0]这种表述，而必须用 train_df['具体列名或者列名组成的列表，旧金山犯罪那个比赛就是用的列名组成的列表']

# y即Survival结果
y = train_np[:, 0]  # 整个数据集

# X即特征属性值
X = train_np[:, 1:]
print('训练集规模：', X.shape)

clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6)
bagging_clf = BaggingRegressor(
    clf,
    n_estimators=20,
    max_samples=0.8,
    max_features=1.0,
    bootstrap=True,
    bootstrap_features=False,
    n_jobs=-1)
bagging_clf.fit(X, y)  # 训练完毕

NameError: name 'df' is not defined

## 处理测试集 （对测试集做与前面一样的处理）

In [63]:
data_test = pd.read_csv('./input/test.csv')
data_test.loc[(data_test.Fare.isnull()), 'Fare'] = 0
# 接着我们对test_data做和train_data中一致的特征变换
# 首先用同样的RandomForestRegressor模型填上丢失的年龄
tmp_df = data_test[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
null_age = tmp_df[data_test.Age.isnull()].as_matrix()
# 根据特征属性X预测年龄并补上
xp = null_age[:, 1:]
predictedAges = rfr.predict(xp)
data_test.loc[(data_test.Age.isnull()), 'Age'] = predictedAges

#处理embarked
#补缺失
data_test.Embarked[
    data_test.Embarked.isnull()] = data_test.Embarked.dropna().mode().values
dummies_Embarked = pd.get_dummies(data_test['Embarked'], prefix='Embarked')

# 提取名字中信息
data_test['Title'] = data_test['Name'].map(
    lambda x: re.compile(", (.*?)\.").findall(x)[0])
title_Dict = {}
title_Dict.update(
    dict.fromkeys(['Capt', 'Col', 'Major', 'Dr', 'Rev'], 'Officer'))
title_Dict.update(
    dict.fromkeys(['Don', 'Sir', 'the Countess', 'Dona', 'Lady'], 'Royalty'))
title_Dict.update(dict.fromkeys(['Mme', 'Ms', 'Mrs'], 'Mrs'))
title_Dict.update(dict.fromkeys(['Mlle', 'Miss'], 'Miss'))
title_Dict.update(dict.fromkeys(['Mr'], 'Mr'))
title_Dict.update(dict.fromkeys(['Master', 'Jonkheer'], 'Master'))
data_test['Title'] = data_test['Title'].map(title_Dict)
dummies_title = pd.get_dummies(data_test['Title'], prefix="Title")
data_test = pd.concat([data_test, dummies_title], axis=1)

#二值化
data_test = set_Cabin_type(data_test)
dummies_Cabin = pd.get_dummies(data_test['Cabin'], prefix='Cabin')
#dummies_Embarked = pd.get_dummies(data_test['Embarked'], prefix= 'Embarked')
dummies_Sex = pd.get_dummies(data_test['Sex'], prefix='Sex')
dummies_Pclass = pd.get_dummies(data_test['Pclass'], prefix='Pclass')

df_test = pd.concat(
    [data_test, dummies_Cabin, dummies_Sex, dummies_Pclass, dummies_Embarked],
    axis=1)
df_test.drop(
    ['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'Title'],
    axis=1,
    inplace=True)

#  年龄分段处理
bins_test = [0, 12, 18, 65, 100]
df_test['Age_group'] = pd.cut(df_test['Age'], bins_test)
dummies_Age = pd.get_dummies(df_test['Age_group'], prefix='Age')
df_test = pd.concat([df_test, dummies_Age], axis=1)
df_test.drop(['Age', 'Age_group'], axis=1, inplace=True)

# fare特征分段 并二值化
df_test['Fare_bin'] = pd.qcut(df_test['Fare'], 5)
#print (df[['Fare_bin', 'Survived']].groupby(['Fare_bin'], as_index=False).mean().sort_values(by='Fare_bin', ascending=True))
dummies_Fare_bin = pd.get_dummies(df_test['Fare_bin'], prefix='Fare_bin')
df_test = pd.concat([df_test, dummies_Fare_bin], axis=1)
df_test.drop(['Fare', 'Fare_bin'], axis=1, inplace=True)
#df.head()

# 对 SibSp Parch合并为family_size
df_test['Family_Size'] = df_test['Parch'] + df_test['SibSp'] + 1
df_test.drop(['SibSp', 'Parch'], axis=1, inplace=True)
df_test.head()

# 过滤规则
test = df_test.filter(
    regex=
    'Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Sex_.*|Pclass_.*|Title_.*|Family_Size|Embarked_.*'
)  # 过滤掉PassengerId 、 age 和 fare embark
print('测试集规模： ', test.shape)
test.head()

FileNotFoundError: File b'./input/test.csv' does not exist

In [64]:
# 7 生成预测结果
predictions = xgbc_best.predict(test.values)  # 获得预测的标签
result = pd.DataFrame({
    'PassengerId': data_test['PassengerId'].as_matrix(),
    'Survived': predictions.astype(np.int32)
})
result.to_csv("xgbc.csv", index=False)
print("End")

NameError: name 'test' is not defined