In [143]:
# Load the Titanic train/test CSV files from the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Keep the label in its own Series and remove it from the feature frame.
# `axis` is passed by keyword: DataFrame.drop no longer accepts it
# positionally in pandas 2.0+.
train_target = train.Survived
train = train.drop('Survived', axis=1)

# 第一步 观察、分析数据

In [144]:
# Count the non-null entries of every column to reveal missing values
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
PassengerId    891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 76.6+ KB


训练集原有12个列（其中 Survived 已在上面单独取出，因此 info() 显示11列），891行

数值：
PassengerId、Survived、Pclass、Age、SibSp、Parch、Fare 

字符串：
Name 、Sex 、Ticket 、Cabin 、Embarked 



存在缺失值的列：
Age 714 个有效值，缺失177,经查阅资料，尝试用中值年龄替换空值，对于异常值，这样比平均值鲁棒性更强

Cabin 204 个值，缺失687

Embarked 889个值，缺失2


In [145]:
# Replace missing ages in the training frame with the median age.
# The filled column is assigned back instead of calling
# fillna(inplace=True) on the `train['Age']` selection: that chained in-place
# form is deprecated and leaves `train` unchanged under pandas copy-on-write.
train['Age'] = train['Age'].fillna(train['Age'].median())
train.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,2.308642,29.361582,0.523008,0.381594,32.204208
std,257.353842,0.836071,13.019697,1.102743,0.806057,49.693429
min,1.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,2.0,22.0,0.0,0.0,7.9104
50%,446.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,3.0,35.0,1.0,0.0,31.0
max,891.0,3.0,80.0,8.0,6.0,512.3292


# 第二步 特征工程

In [146]:
# Stack the train and test rows so feature engineering is applied to both.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
# like append, it keeps each frame's original row labels.
total = pd.concat([train, test])

In [147]:
def get_titles():
    """Add a grouped ``Title`` column to the global ``total`` frame.

    The raw title is the text between the first comma and the following
    period of ``Name``. Raw titles are then collapsed into a smaller set of
    groups; a raw title that is not listed below maps to NaN.
    """
    def extract_title(full_name):
        # "Surname, Title. Given names" -> "Title"
        after_surname = full_name.split(',')[1]
        return after_surname.split('.')[0].strip()

    # Group label -> raw titles that belong to it.
    titles_by_group = {
        "Officer": ("Capt", "Col", "Major", "Dr", "Rev"),
        "Nobles": ("Jonkheer", "Don", "Sir", "the Countess", "Dona", "Lady"),
        "Mrs": ("Mme", "Ms", "Mrs"),
        "Miss": ("Mlle", "Miss"),
        "Mr": ("Mr",),
        "Master": ("Master",),
    }
    group_of_title = {
        raw_title: group
        for group, raw_titles in titles_by_group.items()
        for raw_title in raw_titles
    }

    total['Title'] = total['Name'].map(extract_title).map(group_of_title)

In [149]:
# Populate total['Title'] with the grouped title of each passenger
get_titles()

用平均值或中位年龄代替年龄的缺失值，可能不是最佳解决方案，因为年龄可能因团体和乘客类别而异。

尝试按性别（Sex）、头衔（Title）和客舱等级（Pclass）分组，用各组的年龄中位数来填充缺失值。

In [150]:
# Per-(Sex, Pclass, Title) medians, computed separately for the training rows
# and the test rows of `total`. The split point is the length of `train`
# rather than a hard-coded row count.
# numeric_only=True restricts the median to numeric columns; newer pandas
# raises on the string columns (Name, Ticket, Cabin, Embarked) otherwise.
grouped_train = total.iloc[:len(train)].groupby(['Sex','Pclass','Title'])
grouped_median_train = grouped_train.median(numeric_only=True)
grouped_test = total.iloc[len(train):].groupby(['Sex','Pclass','Title'])
grouped_median_test = grouped_test.median(numeric_only=True)

In [None]:
# Display the per-group medians computed on the training rows
grouped_median_train

In [151]:
def process_age():
    """Fill the missing ``Age`` values of the global ``total`` frame in place.

    Every row whose Age is missing receives a fixed age looked up by its
    (Sex, Pclass, Title) combination. Rows that already have an age are left
    untouched. A combination that is not in the table falls back to the
    median of the known ages instead of staying NaN.

    Returns:
        None. ``total['Age']`` is updated in place.
    """
    # Fixed age per (Sex, Pclass, Title).
    # The noble group is keyed as 'Nobles', the label get_titles() produces;
    # the previous 'Royalty' key could never match any row.
    # NOTE(review): values presumably come from the grouped training medians
    # shown earlier in the notebook — confirm before changing them.
    age_by_group = {
        ('female', 1, 'Miss'): 30,
        ('female', 1, 'Mrs'): 45,
        ('female', 1, 'Officer'): 49,
        ('female', 1, 'Nobles'): 39,
        ('female', 2, 'Miss'): 20,
        ('female', 2, 'Mrs'): 30,
        ('female', 3, 'Miss'): 18,
        ('female', 3, 'Mrs'): 31,
        ('male', 1, 'Master'): 6,
        ('male', 1, 'Mr'): 41.5,
        ('male', 1, 'Officer'): 52,
        ('male', 1, 'Nobles'): 40,
        ('male', 2, 'Master'): 2,
        ('male', 2, 'Mr'): 30,
        ('male', 2, 'Officer'): 41.5,
        ('male', 3, 'Master'): 6,
        ('male', 3, 'Mr'): 26,
    }

    # Positional boolean mask: `total` may carry repeated row labels, so the
    # rows are addressed by position rather than by label.
    missing = total['Age'].isnull().values
    if not missing.any():
        return

    fallback_age = total['Age'].median()
    group_keys = zip(
        total.loc[missing, 'Sex'],
        total.loc[missing, 'Pclass'],
        total.loc[missing, 'Title'],
    )
    total.loc[missing, 'Age'] = [
        age_by_group.get(key, fallback_age) for key in group_keys
    ]
    

In [152]:
# Fill the missing ages in `total`
process_age()

In [154]:
def process_names():
    """Replace ``Name`` and ``Title`` in ``total`` with ``Title_*`` indicators.

    ``Name`` is dropped from the current ``total`` frame in place; the global
    ``total`` is then rebound to a new frame that carries one indicator
    column per distinct title and no ``Title`` column.
    """
    global total

    # Indicator ("dummy") matrix for the categorical title.
    title_indicators = pd.get_dummies(total['Title'], prefix='Title')

    # The raw name carries no further information once the title is extracted.
    total.drop('Name', axis=1, inplace=True)

    total = pd.concat([total, title_indicators], axis=1).drop('Title', axis=1)

In [155]:
# Replace 'Name' and 'Title' in `total` with Title_* indicator columns
process_names()

处理登船港口编号

In [156]:
# Only one fare is missing, so it is filled with the mean fare.
# The filled column is assigned back: fillna(inplace=True) on the
# `total.Fare` selection is deprecated and does not update `total` under
# pandas copy-on-write.
total['Fare'] = total['Fare'].fillna(total['Fare'].mean())

In [157]:
# Only two embarkation ports are missing; fill them with the most frequent
# value, 'S'. Assigned back rather than filled in place on the attribute
# selection, which does not update `total` under pandas copy-on-write.
total['Embarked'] = total['Embarked'].fillna('S')
# One-hot encode the port and drop the original column.
embarked_dummies = pd.get_dummies(total['Embarked'],prefix='Embarked')
total = pd.concat([total,embarked_dummies],axis=1)
total = total.drop('Embarked',axis=1)

处理客舱位置

In [158]:
# Mark missing cabins with 'U'. Assigned back rather than filled in place on
# the attribute selection, which does not update `total` under pandas
# copy-on-write.
total['Cabin'] = total['Cabin'].fillna('U')

# Reduce each cabin to its first letter.
total['Cabin'] = total['Cabin'].map(lambda c : c[0])

# One-hot encode the cabin letter and drop the original column.
cabin_dummies = pd.get_dummies(total['Cabin'],prefix='Cabin')

total = pd.concat([total,cabin_dummies],axis=1)

total = total.drop('Cabin',axis=1)

处理乘客性别：映射到1与0

In [159]:
# Encode sex numerically: male -> 1, female -> 0 (any other value becomes NaN,
# so this cell must not be run twice on the same frame).
total['Sex'] = total['Sex'].map(dict(male=1, female=0))

处理客舱等级

In [160]:
# One-hot encode the passenger class, append the indicator columns and
# remove the original "Pclass" column.
pclass_dummies = pd.get_dummies(total['Pclass'], prefix="Pclass")
total = pd.concat([total, pclass_dummies], axis=1).drop('Pclass', axis=1)

处理船票号：

通过提取票号前缀来预处理船票。 当票号没有前缀（即只包含数字）时返回 UNKNOWN。

然后使用虚拟编码对前缀进行编码。

In [161]:
def cleanTicket(ticket):
    """Return the prefix of a ticket string, or 'UNKNOWN' if it has none.

    Dots and slashes are removed, the ticket is split on whitespace, and the
    first token that is not purely digits is returned. A ticket made only of
    digit tokens (or an empty ticket) yields 'UNKNOWN'.
    """
    without_punctuation = ticket.replace('.', '').replace('/', '')
    for token in without_punctuation.split():
        if not token.isdigit():
            return token
    return 'UNKNOWN'

In [162]:
# Reduce each ticket to its prefix, one-hot encode the prefixes and drop the
# original 'Ticket' column.
total['Ticket'] = total['Ticket'].map(cleanTicket)
tickets_dummies = pd.get_dummies(total['Ticket'], prefix='Ticket')
total = pd.concat([total, tickets_dummies], axis=1).drop('Ticket', axis=1)

处理家庭等信息

SibSp 兄弟姐妹和配偶在船数量

ParCh 父母孩子在船数量

In [163]:
# Family size: parents/children plus siblings/spouses aboard, plus the
# passenger themself.
total['FamilySize'] = total['Parch'] + total['SibSp'] + 1

# 0/1 flags derived from the family size:
#   Singleton   -> size == 1
#   SmallFamily -> 2 <= size <= 4
#   LargeFamily -> size >= 5
total['Singleton'] = (total['FamilySize'] == 1).astype(int)
total['SmallFamily'] = total['FamilySize'].between(2, 4).astype(int)
total['LargeFamily'] = (total['FamilySize'] >= 5).astype(int)
    

此过程产生了4个新特征：

FamilySize：包括乘客（他/她）在内的亲戚总数。

Singleton： 一个布尔变量，描述size = 1家族

SmallFamily： 一个布尔变量，描述2 <= size <= 4家族

LargeFamily： 一个布尔变量，描述 5 <= size 家族

In [164]:
# (rows, columns) of the engineered feature table
total.shape

(1309, 68)

经过特征工程后，获得共68个特征

这68个特征的范围处于不同区间，尝试进行标准化
（若使用其他标准化形式呢？）

In [165]:
# Scale every column except the identifier by that column's maximum value.
features = total.columns.tolist()
features.remove('PassengerId')
total[features] = total[features].div(total[features].max(), axis='columns')

In [166]:
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.cross_validation import StratifiedKFold
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble.gradient_boosting import GradientBoostingClassifier
from sklearn.cross_validation import cross_val_score

为了评估模型，使用以准确率（accuracy）为指标的5折交叉验证。
为此，我们将定义一个小的评分函数。

In [167]:
def compute_score(clf, X, y, scoring='accuracy'):
    """Return the mean 5-fold cross-validation score of ``clf`` on ``(X, y)``.

    Args:
        clf: estimator passed through to ``cross_val_score``.
        X: feature data.
        y: target values.
        scoring: name of the scoring metric; defaults to ``'accuracy'``.

    Returns:
        The arithmetic mean of the per-fold scores.
    """
    fold_scores = cross_val_score(clf, X, y, cv=5, scoring=scoring)
    mean_score = np.mean(fold_scores)
    return mean_score

In [168]:
def recover_train_test_target():
    """Split the engineered ``total`` frame back into train and test parts.

    The labels are re-read from ``train.csv``. The first ``len(targets)`` rows
    of ``total`` are returned as the training features and the remaining rows
    as the test features, so the split follows the size of the label column
    instead of a hard-coded row count.

    Returns:
        tuple: ``(train, test, targets)`` — training rows, test rows, and the
        ``Survived`` column of ``train.csv``.
    """
    targets = pd.read_csv('train.csv').Survived

    # The training rows were stacked first when `total` was built.
    n_train = len(targets)
    train = total.iloc[:n_train, :]
    test = total.iloc[n_train:, :]

    # NOTE(review): every column of `total` is kept, including 'PassengerId';
    # presumably it is not meant to be a model feature — confirm.
    return train, test, targets

In [169]:
# Rebuild the train/test feature frames and the training labels
# (this rebinds the notebook-level names `train` and `test`)
train,test,targets = recover_train_test_target()

68个特征是相当大的。

当特征工程完成时，我们通过选择必不可少的特征来降低维度。以便减少数据之间的冗余、加快训练过程、减少过度拟合

可以使用基于树的估计器来计算特征重要性，这又可以用于丢弃不相关的特征。

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
# Fit an extra-trees ensemble whose feature importances are used below.
# random_state is fixed so the importances, and the features selected from
# them, are the same on every run.
clf = ExtraTreesClassifier(n_estimators=200, random_state=0)
clf = clf.fit(train, targets)

In [None]:
# Table pairing each training column with its importance in the fitted model.
features = pd.DataFrame({
    'feature': train.columns,
    'importance': clf.feature_importances_,
})

In [None]:
# Order the features from least to most important, index them by name and
# draw a horizontal bar chart.
features = features.sort_values(by=['importance'], ascending=True).set_index('feature')
features.plot(kind='barh', figsize=(10, 10))

In [None]:
# Wrap the already-fitted extra-trees model as a feature selector and reduce
# the training features to the columns it keeps.
# NOTE(review): `model` is rebound to the random forest in a later cell, and
# train_new does not appear to be used for training — confirm intent.
model = SelectFromModel(clf, prefit=True)
train_new = model.transform(train)
train_new.shape

In [None]:
# Apply the same feature selection to the test features
test_new = model.transform(test)
test_new.shape

# 调参巫师!

我们将使用随机森林模型，随机森林相当方便。 然而，他们有一些参数进行调整，以获得预测任务的最佳模型。
要了解有关随机森林的更多信息，可以参考以下链接：https://www.analyticsvidhya.com/blog/2015/06/tuning-random-forest-model/

In [None]:
# Set run_gs to True to run the grid search again; otherwise the previously
# selected hyper-parameters below are used directly.
run_gs = False
if run_gs:
    parameter_grid = {
                 'max_depth' : [4, 6, 8],
                 'n_estimators': [50, 10],
                 'max_features': ['sqrt', 'auto', 'log2'],
                 # An integer min_samples_split must be at least 2; a value
                 # of 1 is rejected by scikit-learn releases that validate it.
                 'min_samples_split': [2, 3, 10],
                 'min_samples_leaf': [1, 3, 10],
                 'bootstrap': [True, False],
                 }
    # Fixed seed so repeated searches compare the same forests.
    forest = RandomForestClassifier(random_state=0)
    cross_validation = StratifiedKFold(targets, n_folds=5)
    grid_search = GridSearchCV(forest,
                               scoring='accuracy',
                               param_grid=parameter_grid,
                               cv=cross_validation)
    grid_search.fit(train, targets)
    model = grid_search
    parameters = grid_search.best_params_
    print('Best score: {}'.format(grid_search.best_score_))
    print('Best parameters: {}'.format(grid_search.best_params_))
else: 
    parameters = {'bootstrap': False, 'min_samples_leaf': 3, 'n_estimators': 50, 
                  'min_samples_split': 10, 'max_features': 'sqrt', 'max_depth': 6}
    
    # Fixed seed so the fitted forest and its predictions are reproducible.
    model = RandomForestClassifier(random_state=0, **parameters)
    model.fit(train, targets)

In [None]:
# Mean 5-fold cross-validated accuracy of the chosen model on the training data
compute_score(model, train, targets, scoring='accuracy')

In [None]:
# Predict survival for the test rows and write the submission file.
# PassengerId is taken from the raw test.csv; predictions are cast to int.
aux = pd.read_csv('test.csv')
output = model.predict(test).astype(int)
df_output = pd.DataFrame({
    'PassengerId': aux['PassengerId'],
    'Survived': output,
})

df_output[['PassengerId','Survived']].to_csv('output.csv',index=False)

