In [13]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [14]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

train_target = train.Survived
train.drop('Survived',1,inplace=True)


# 第一步 观察、分析数据

In [15]:
# 统计非空行，发现缺失值
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
PassengerId    891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 76.6+ KB


训练集有12个列，891行

数值：
PassangerId、Survived、Pclass、Age、SibSp、Parch、Fare 

字符串：
Name 、Sex 、Ticket 、Cabin 、Embarked 



存在缺失值的列：
Age 714 个有效值，缺失177,经查阅资料，尝试用中值年龄替换空值，对于异常值，这样比平均值鲁棒性更强

Cabin 204 个值，缺失687

Embarked 889个值，缺失2


In [16]:
train['Age'].fillna(train['Age'].median(), inplace=True)
train.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,2.308642,29.361582,0.523008,0.381594,32.204208
std,257.353842,0.836071,13.019697,1.102743,0.806057,49.693429
min,1.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,2.0,22.0,0.0,0.0,7.9104
50%,446.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,3.0,35.0,1.0,0.0,31.0
max,891.0,3.0,80.0,8.0,6.0,512.3292


# 第二步 特征工程

In [17]:
total = train.append(test)

In [22]:
def get_titles():
    # 从名字中提取唯一有用的信息：头衔
    total['Title'] = total['Name'].map(lambda name:name.split(',')[1].split('.')[0].strip())
    
    # 将多个头衔映射到较少的身份
    Title_Dictionary = {
                        "Capt":       "Officer",
                        "Col":        "Officer",
                        "Major":      "Officer",
                        "Jonkheer":   "Nobles",
                        "Don":        "Nobles",
                        "Sir" :       "Nobles",
                        "Dr":         "Officer",
                        "Rev":        "Officer",
                        "the Countess":"Nobles",
                        "Dona":       "Nobles",
                        "Mme":        "Mrs",
                        "Mlle":       "Miss",
                        "Ms":         "Mrs",
                        "Mr" :        "Mr",
                        "Mrs" :       "Mrs",
                        "Miss" :      "Miss",
                        "Master" :    "Master",
                        "Lady" :      "Nobles"
                        }
    
    total['Title'] = total.Title.map(Title_Dictionary)

In [26]:
get_titles()
total

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,Mr
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,Miss
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,Mrs
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,Mr
5,6,3,"Moran, Mr. James",male,28.0,0,0,330877,8.4583,,Q,Mr
6,7,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,Mr
7,8,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S,Master
8,9,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S,Mrs
9,10,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C,Mrs


用平均值或中位年龄代替年龄的缺失值，可能不是最佳解决方案，因为年龄可能因团体和乘客类别而异。

尝试按性别，标题和乘客类分组。

In [28]:
grouped_train = total.head(891).groupby(['Sex','Pclass','Title'])
grouped_median_train = grouped_train.median()
grouped_test = total.iloc[891:].groupby(['Sex','Pclass','Title'])
grouped_median_test = grouped_test.median()

In [29]:
grouped_median_train

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,PassengerId,Age,SibSp,Parch,Fare
Sex,Pclass,Title,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
female,1,Miss,369.0,29.5,0.0,0.0,88.25
female,1,Mrs,499.0,38.0,1.0,0.0,79.2
female,1,Nobles,658.5,40.5,0.5,0.0,63.05
female,1,Officer,797.0,49.0,0.0,0.0,25.9292
female,2,Miss,437.5,24.0,0.0,0.0,13.0
female,2,Mrs,439.5,31.5,1.0,0.0,26.0
female,3,Miss,372.0,22.0,0.0,0.0,8.75625
female,3,Mrs,405.5,29.0,1.0,1.0,15.975
male,1,Master,446.0,4.0,1.0,2.0,120.0
male,1,Mr,463.0,36.0,0.0,0.0,42.4


In [32]:
def process_age():
    
    global total
    
    # a function that fills the missing values of the Age variable
    
    def fillAges(row):
        if row['Sex']=='female' and row['Pclass'] == 1:
            if row['Title'] == 'Miss':
                return 30
            elif row['Title'] == 'Mrs':
                return 45
            elif row['Title'] == 'Officer':
                return 49
            elif row['Title'] == 'Royalty':
                return 39
        elif row['Sex']=='female' and row['Pclass'] == 2:
            if row['Title'] == 'Miss':
                return 20
            elif row['Title'] == 'Mrs':
                return 30
        elif row['Sex']=='female' and row['Pclass'] == 3:
            if row['Title'] == 'Miss':
                return 18
            elif row['Title'] == 'Mrs':
                return 31
        elif row['Sex']=='male' and row['Pclass'] == 1:
            if row['Title'] == 'Master':
                return 6
            elif row['Title'] == 'Mr':
                return 41.5
            elif row['Title'] == 'Officer':
                return 52
            elif row['Title'] == 'Royalty':
                return 40
        elif row['Sex']=='male' and row['Pclass'] == 2:
            if row['Title'] == 'Master':
                return 2
            elif row['Title'] == 'Mr':
                return 30
            elif row['Title'] == 'Officer':
                return 41.5
        elif row['Sex']=='male' and row['Pclass'] == 3:
            if row['Title'] == 'Master':
                return 6
            elif row['Title'] == 'Mr':
                return 26
    
    total.Age = total.apply(lambda r : fillAges(r) if np.isnan(r['Age']) else r['Age'], axis=1)
    

In [34]:
process_age()

In [37]:
def process_names():
    
    global total
    # 删去name
    total.drop('Name',axis=1,inplace=True)
    
    # 将分类变量title转换为“哑变量矩阵”或“指标矩阵”。
    titles_dummies = pd.get_dummies(total['Title'],prefix='Title')
    total = pd.concat([total,titles_dummies],axis=1)
    
    # 删去title
    total.drop('Title',axis=1,inplace=True)

In [None]:
process_names()

In [39]:
total

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Nobles,Title_Officer
0,1,3,male,22.0,1,0,A/5 21171,7.2500,,S,0,0,1,0,0,0
1,2,1,female,38.0,1,0,PC 17599,71.2833,C85,C,0,0,0,1,0,0
2,3,3,female,26.0,0,0,STON/O2. 3101282,7.9250,,S,0,1,0,0,0,0
3,4,1,female,35.0,1,0,113803,53.1000,C123,S,0,0,0,1,0,0
4,5,3,male,35.0,0,0,373450,8.0500,,S,0,0,1,0,0,0
5,6,3,male,28.0,0,0,330877,8.4583,,Q,0,0,1,0,0,0
6,7,1,male,54.0,0,0,17463,51.8625,E46,S,0,0,1,0,0,0
7,8,3,male,2.0,3,1,349909,21.0750,,S,1,0,0,0,0,0
8,9,3,female,27.0,0,2,347742,11.1333,,S,0,0,0,1,0,0
9,10,2,female,14.0,1,0,237736,30.0708,,C,0,0,0,1,0,0


处理登船港口编号

In [44]:
#因为只有一个缺失值，船票价使用平均值填充
total.Fare.fillna(total.Fare.mean(),inplace=True)

In [45]:
# 登船港口编号只有两个缺失，使用最频繁的S填入
total.Embarked.fillna('S',inplace=True)    
# 转化为哑变量
embarked_dummies = pd.get_dummies(total['Embarked'],prefix='Embarked')
total = pd.concat([total,embarked_dummies],axis=1)
total.drop('Embarked',axis=1,inplace=True)

处理客舱位置

In [47]:
# 用U代替NA
total.Cabin.fillna('U',inplace=True)
    
# 用首字母映射舱位
total['Cabin'] = total['Cabin'].map(lambda c : c[0])
    
# 转化为哑变量
cabin_dummies = pd.get_dummies(total['Cabin'],prefix='Cabin')
    
total = pd.concat([total,cabin_dummies],axis=1)
    
total.drop('Cabin',axis=1,inplace=True)

处理乘客性别：映射到1与0

In [49]:
total['Sex'] = total['Sex'].map({'male':1,'female':0})

处理客舱等级

In [50]:
pclass_dummies = pd.get_dummies(total['Pclass'],prefix="Pclass")
# adding dummy variables
total = pd.concat([total,pclass_dummies],axis=1)
    
# removing "Pclass"
    
total.drop('Pclass',axis=1,inplace=True)

In [51]:
total

Unnamed: 0,PassengerId,Sex,Age,SibSp,Parch,Ticket,Fare,Title_Master,Title_Miss,Title_Mr,...,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Cabin_U,Pclass_1,Pclass_2,Pclass_3
0,1,1,22.0,1,0,A/5 21171,7.2500,0,0,1,...,0,0,0,0,0,0,1,0,0,1
1,2,0,38.0,1,0,PC 17599,71.2833,0,0,0,...,1,0,0,0,0,0,0,1,0,0
2,3,0,26.0,0,0,STON/O2. 3101282,7.9250,0,1,0,...,0,0,0,0,0,0,1,0,0,1
3,4,0,35.0,1,0,113803,53.1000,0,0,0,...,1,0,0,0,0,0,0,1,0,0
4,5,1,35.0,0,0,373450,8.0500,0,0,1,...,0,0,0,0,0,0,1,0,0,1
5,6,1,28.0,0,0,330877,8.4583,0,0,1,...,0,0,0,0,0,0,1,0,0,1
6,7,1,54.0,0,0,17463,51.8625,0,0,1,...,0,0,1,0,0,0,0,1,0,0
7,8,1,2.0,3,1,349909,21.0750,1,0,0,...,0,0,0,0,0,0,1,0,0,1
8,9,0,27.0,0,2,347742,11.1333,0,0,0,...,0,0,0,0,0,0,1,0,0,1
9,10,0,14.0,1,0,237736,30.0708,0,0,0,...,0,0,0,0,0,0,1,0,1,0


处理船票号：

通过提取票号前缀来预处理船票。 当提取前缀失败时返回XXX。

然后使用虚拟编码对前缀进行编码。

# a function that extracts each prefix of the ticket, returns 'XXX' if no prefix (i.e the ticket is a digit)
def cleanTicket(ticket):
    ticket = ticket.replace('.','')
    ticket = ticket.replace('/','')
    ticket = ticket.split()
    ticket = map(lambda t : t.strip() , ticket)
    ticket = list( filter(lambda t : not t.isdigit(), ticket) )
    if len(ticket) > 0:
        return ticket[0]
    else: 
        return 'UNKNOWN'

In [62]:
total['Ticket'] = total['Ticket'].map(cleanTicket)
tickets_dummies = pd.get_dummies(total['Ticket'],prefix='Ticket')
total = pd.concat([total, tickets_dummies],axis=1)
total.drop('Ticket',inplace=True,axis=1)

KeyError: 'Ticket'

In [None]:
total

处理家庭等信息

SibSp 兄弟姐妹和配偶在船数量

ParCh 父母孩子在船数量

In [63]:
# introducing a new feature : the size of families (including the passenger)
total['FamilySize'] = total['Parch'] + total['SibSp'] + 1
    
# introducing other features based on the family size
total['Singleton'] = total['FamilySize'].map(lambda s : 1 if s == 1 else 0)
total['SmallFamily'] = total['FamilySize'].map(lambda s : 1 if 2<=s<=4 else 0)
total['LargeFamily'] = total['FamilySize'].map(lambda s : 1 if 5<=s else 0)
    

此过程产生了4个新特征：

FamilySize：包括乘客（他/她）在内的亲戚总数。

Sigleton： 一个布尔变量，描述size = 1家族

SmallFamily： 一个布尔变量，描述2 <= size <= 4家族

LargeFamily： 一个布尔变量，描述 5 < size 家族

In [64]:
total

Unnamed: 0,PassengerId,Sex,Age,SibSp,Parch,Fare,Title_Master,Title_Miss,Title_Mr,Title_Mrs,...,Ticket_STONO2,Ticket_STONOQ,Ticket_SWPP,Ticket_WC,Ticket_WEP,Ticket_XXX,FamilySize,Singleton,SmallFamily,LargeFamily
0,1,1,22.0,1,0,7.2500,0,0,1,0,...,0,0,0,0,0,0,2,0,1,0
1,2,0,38.0,1,0,71.2833,0,0,0,1,...,0,0,0,0,0,0,2,0,1,0
2,3,0,26.0,0,0,7.9250,0,1,0,0,...,1,0,0,0,0,0,1,1,0,0
3,4,0,35.0,1,0,53.1000,0,0,0,1,...,0,0,0,0,0,1,2,0,1,0
4,5,1,35.0,0,0,8.0500,0,0,1,0,...,0,0,0,0,0,1,1,1,0,0
5,6,1,28.0,0,0,8.4583,0,0,1,0,...,0,0,0,0,0,1,1,1,0,0
6,7,1,54.0,0,0,51.8625,0,0,1,0,...,0,0,0,0,0,1,1,1,0,0
7,8,1,2.0,3,1,21.0750,1,0,0,0,...,0,0,0,0,0,1,5,0,0,1
8,9,0,27.0,0,2,11.1333,0,0,0,1,...,0,0,0,0,0,1,3,0,1,0
9,10,0,14.0,1,0,30.0708,0,0,0,1,...,0,0,0,0,0,1,2,0,1,0


In [65]:
total.shape

(1309, 68)

经过特征工程后，获得共68个特征

这68个特征的范围处于不同区间，尝试进行标准化
（若使用其他标准化形式呢？）

In [66]:
features = list(total.columns)
features.remove('PassengerId')
total[features] = total[features].apply(lambda x: x/x.max(), axis=0)

In [67]:
total

Unnamed: 0,PassengerId,Sex,Age,SibSp,Parch,Fare,Title_Master,Title_Miss,Title_Mr,Title_Mrs,...,Ticket_STONO2,Ticket_STONOQ,Ticket_SWPP,Ticket_WC,Ticket_WEP,Ticket_XXX,FamilySize,Singleton,SmallFamily,LargeFamily
0,1,1.0,0.27500,0.125,0.000000,0.014151,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.181818,0.0,1.0,0.0
1,2,0.0,0.47500,0.125,0.000000,0.139136,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.181818,0.0,1.0,0.0
2,3,0.0,0.32500,0.000,0.000000,0.015469,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.090909,1.0,0.0,0.0
3,4,0.0,0.43750,0.125,0.000000,0.103644,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.181818,0.0,1.0,0.0
4,5,1.0,0.43750,0.000,0.000000,0.015713,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.090909,1.0,0.0,0.0
5,6,1.0,0.35000,0.000,0.000000,0.016510,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.090909,1.0,0.0,0.0
6,7,1.0,0.67500,0.000,0.000000,0.101229,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.090909,1.0,0.0,0.0
7,8,1.0,0.02500,0.375,0.111111,0.041136,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.454545,0.0,0.0,1.0
8,9,0.0,0.33750,0.000,0.222222,0.021731,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.272727,0.0,1.0,0.0
9,10,0.0,0.17500,0.125,0.000000,0.058694,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.181818,0.0,1.0,0.0


In [68]:
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.cross_validation import StratifiedKFold
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble.gradient_boosting import GradientBoostingClassifier
from sklearn.cross_validation import cross_val_score



为了评估模型，使用精度指标的5倍交叉验证。
为此，我们将定义一个小的评分函数。

In [69]:
def compute_score(clf, X, y,scoring='accuracy'):
    xval = cross_val_score(clf, X, y, cv = 5,scoring=scoring)
    return np.mean(xval)

In [74]:
def recover_train_test_target():
    global total
    
    train0 = pd.read_csv('train.csv')
    
    targets = train0.Survived
    train = total.iloc[0:890,:]
    test = total.iloc[891:,:]
    
    return train,test,targets

In [75]:
train,test,targets = recover_train_test_target()

In [76]:
train

Unnamed: 0,PassengerId,Sex,Age,SibSp,Parch,Fare,Title_Master,Title_Miss,Title_Mr,Title_Mrs,...,Ticket_STONO2,Ticket_STONOQ,Ticket_SWPP,Ticket_WC,Ticket_WEP,Ticket_XXX,FamilySize,Singleton,SmallFamily,LargeFamily
0,1,1.0,0.2750,0.125,0.000000,0.014151,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.181818,0.0,1.0,0.0
1,2,0.0,0.4750,0.125,0.000000,0.139136,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.181818,0.0,1.0,0.0
2,3,0.0,0.3250,0.000,0.000000,0.015469,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.090909,1.0,0.0,0.0
3,4,0.0,0.4375,0.125,0.000000,0.103644,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.181818,0.0,1.0,0.0
4,5,1.0,0.4375,0.000,0.000000,0.015713,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.090909,1.0,0.0,0.0
5,6,1.0,0.3500,0.000,0.000000,0.016510,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.090909,1.0,0.0,0.0
6,7,1.0,0.6750,0.000,0.000000,0.101229,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.090909,1.0,0.0,0.0
7,8,1.0,0.0250,0.375,0.111111,0.041136,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.454545,0.0,0.0,1.0
8,9,0.0,0.3375,0.000,0.222222,0.021731,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.272727,0.0,1.0,0.0
9,10,0.0,0.1750,0.125,0.000000,0.058694,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.181818,0.0,1.0,0.0


68个特征是相当大的。

当特征工程完成时，我们通过选择必不可少的特征来降低维度。以便减少数据之间的冗余、加快训练过程、减少过度拟合