In [1]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Read the training and test splits from CSV files in the current working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [3]:
# Print column dtypes and non-null counts for both splits (train first, then test).
for split in (train, test):
    split.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float

In [4]:
# Hold both frames in one list so a cleaning step can be applied to each in a single loop.
# The list stores references: it must be rebuilt whenever `train`/`test` are rebound.
combine  = [train, test]

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric training columns.
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [6]:
# Summary of the object-dtype training columns: count, number of unique values,
# most frequent value and its frequency.
train.describe(include=['O'])

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Drew, Mrs. James Vivian (Lulu Thorne Christian)",male,347082,C23 C25 C27,S
freq,1,577,7,4,644


In [7]:
# Reduce each Cabin value to its first alphabetic character; missing cabins become 'U'.
for dataset in combine:
    dataset['Cabin'] = (
        dataset['Cabin']
        .fillna('U')
        .str.extract('([A-Za-z])', expand=False)
    )

In [11]:
# Mean survival per cabin letter, highest first.
(
    train[['Cabin', 'Survived']]
    .groupby(['Cabin'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

Unnamed: 0,Cabin,Survived
3,D,0.757576
4,E,0.75
1,B,0.744681
5,F,0.615385
2,C,0.59322
6,G,0.5
0,A,0.466667
8,U,0.299854
7,T,0.0


In [12]:
for dataset in combine:
    dataset['Cabin'] = dataset['Cabin'].map( {'A': 1, 'B': 0, 'C': 0, 'D': 0, 'E':0, 
                                            'F':0, 'G':1, 'T':1, 'U':1} ).astype(int)

In [13]:
# Mean survival for each value of the binary cabin flag, highest first.
(
    train[['Cabin', 'Survived']]
    .groupby(['Cabin'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

Unnamed: 0,Cabin,Survived
0,0,0.690217
1,1,0.304102


In [14]:
# Drop the Ticket column from both frames, then rebuild `combine` so that it
# references the new frame objects returned by drop().
train, test = (frame.drop(['Ticket'], axis=1) for frame in (train, test))
combine = [train, test]


# Survival rate distribution as a function of Pclass, highest first.
(
    train[['Pclass', 'Survived']]
    .groupby(['Pclass'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

Unnamed: 0,Pclass,Survived
0,1,0.62963
1,2,0.472826
2,3,0.242363


In [15]:
# Obtain Title from Name (Mr, Mrs, Miss, ...): the run of letters that follows a
# space and is immediately followed by a '.'.
# Rare titles are folded into a handful of groups so every category has support.
title_groups = {
    'Lady': 'Royalty', 'Countess': 'Royalty', 'Dona': 'Royalty',
    'Jonkheer': 'Royalty', 'Don': 'Royalty', 'Sir': 'Royalty',
    'Mme': 'Mrs',
    'Mlle': 'Miss', 'Ms': 'Miss',
    'Capt': 'Officer', 'Col': 'Officer', 'Major': 'Officer', 'Rev': 'Officer',
}

for dataset in combine:
    # Raw string: '\.' must reach the regex engine as an escaped dot. In a plain
    # string literal it is an invalid Python escape sequence (SyntaxWarning on
    # recent interpreters).
    dataset['Title'] = dataset['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    dataset['Title'] = dataset['Title'].replace(title_groups)
    # Locate and replace: 'Dr' becomes 'Mr' or 'Mrs' depending on the passenger's sex.
    is_doctor = dataset['Title'] == 'Dr'
    dataset.loc[is_doctor & (dataset['Sex'] == 'male'), 'Title'] = 'Mr'
    dataset.loc[is_doctor & (dataset['Sex'] == 'female'), 'Title'] = 'Mrs'

# Survival rate for each title group, highest first.
train[['Title', 'Survived']].groupby(['Title'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,Title,Survived
3,Mrs,0.795276
1,Miss,0.702703
5,Royalty,0.6
0,Master,0.575
4,Officer,0.181818
2,Mr,0.1587


In [16]:
# Drop Name from both frames; PassengerId is dropped from train only
# (test keeps it, it is needed later for the submission files).
train, test = (
    train.drop(['Name', 'PassengerId'], axis=1),
    test.drop(['Name'], axis=1),
)
combine = [train, test]

In [17]:
# Passengers younger than 16 get the extra 'Sex' category 'Child'
# (rows with a missing Age are left unchanged).
for dataset in combine:
    dataset.loc[dataset['Age'].lt(16), 'Sex'] = 'Child'

In [18]:
# Age distribution for each combination of Pclass (rows) and Sex (columns);
# 'Sex' includes the 'Child' category at this point.
# NOTE(review): newer seaborn releases renamed FacetGrid's `size` argument to
# `height` — confirm against the installed seaborn version.
grid = sns.FacetGrid(train, row='Pclass', col='Sex', size=2.2, aspect=1.6)
# One 20-bin histogram of Age per facet.
grid.map(plt.hist, 'Age', bins=20)
grid.add_legend()

<seaborn.axisgrid.FacetGrid at 0x10a9ab6a0>

In [19]:
# Impute missing ages with the mean age of the same split.
# The filled column is assigned back instead of calling fillna(inplace=True) on
# the `frame['Age']` selection: the in-place form operates on an intermediate
# Series and is not guaranteed to update the frame (it is a no-op under pandas
# Copy-on-Write). Column assignment mutates the existing frame objects, so the
# references held in `combine` stay valid.
train['Age'] = train['Age'].fillna(train['Age'].mean())
test['Age'] = test['Age'].fillna(test['Age'].mean())

In [20]:
# Print dtypes and non-null counts of the training frame to see which columns still have gaps.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Sex         891 non-null object
Age         891 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
Cabin       891 non-null int64
Embarked    889 non-null object
Title       891 non-null object
dtypes: float64(2), int64(5), object(3)
memory usage: 69.7+ KB


In [21]:
# Print dtypes and non-null counts of the test frame to see which columns still have gaps.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 10 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Sex            418 non-null object
Age            418 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Fare           417 non-null float64
Cabin          418 non-null int64
Embarked       418 non-null object
Title          418 non-null object
dtypes: float64(2), int64(5), object(3)
memory usage: 32.7+ KB


In [22]:
# Fill missing Embarked values with 'S' and the missing test Fare with the
# test-set mean fare.
# The result is assigned back rather than using fillna(inplace=True) on a column
# selection, which is not guaranteed to modify the parent frame (no-op under
# pandas Copy-on-Write).
train['Embarked'] = train['Embarked'].fillna('S')
test['Embarked'] = test['Embarked'].fillna('S')
test['Fare'] = test['Fare'].fillna(test['Fare'].mean())

In [23]:
# Cut Age into 5 equal-width bands and show mean survival per band, youngest band first.
train['AgeBand'] = pd.cut(train['Age'], bins=5)
(
    train[['AgeBand', 'Survived']]
    .groupby(['AgeBand'], as_index=False)
    .mean()
    .sort_values(by='AgeBand', ascending=True)
)

Unnamed: 0,AgeBand,Survived
0,"(0.34, 16.336]",0.55
1,"(16.336, 32.252]",0.344168
2,"(32.252, 48.168]",0.404255
3,"(48.168, 64.084]",0.434783
4,"(64.084, 80]",0.090909


In [24]:
# Recode Age in place as an ordinal band index:
#   <=16 -> 0, (16, 32] -> 1, (32, 48] -> 2, (48, 64] -> 3, >64 -> 4
for dataset in combine:
    age = dataset['Age']
    # All masks are built from the raw ages before any value is overwritten;
    # the bands are disjoint, so the order of assignment does not matter.
    band_masks = [
        age <= 16,
        (age > 16) & (age <= 32),
        (age > 32) & (age <= 48),
        (age > 48) & (age <= 64),
        age > 64,
    ]
    for band_code, in_band in enumerate(band_masks):
        dataset.loc[in_band, 'Age'] = band_code

# AgeBand was only a helper for choosing the cut points; remove it and refresh `combine`.
train = train.drop(['AgeBand'], axis=1)
combine = [train, test]
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Title
0,0,3,male,1.0,1,0,7.25,1,S,Mr
1,1,1,female,2.0,1,0,71.2833,0,C,Mrs
2,1,3,female,1.0,0,0,7.925,1,S,Miss
3,1,1,female,2.0,1,0,53.1,0,S,Mrs
4,0,3,male,2.0,0,0,8.05,1,S,Mr


In [25]:
# FamilySize = SibSp + Parch + 1 (the passenger themself).
for dataset in combine:
    dataset['FamilySize'] = dataset['SibSp'].add(dataset['Parch']).add(1)

In [26]:
# Mean survival by family size (not the last expression of the cell, so it is not displayed).
(
    train[['FamilySize', 'Survived']]
    .groupby(['FamilySize'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

# Derive the three-level feature IsAlone from FamilySize:
#   1 -> travelling alone, 2 -> family of more than 4, 0 -> everything in between.
for dataset in combine:
    family_size = dataset['FamilySize']
    dataset['IsAlone'] = 0
    dataset.loc[family_size == 1, 'IsAlone'] = 1
    dataset.loc[family_size > 4, 'IsAlone'] = 2

# Mean survival per IsAlone level (also not displayed).
train[['IsAlone', 'Survived']].groupby(['IsAlone'], as_index=False).mean()


# Drop Parch, SibSp and FamilySize in favour of IsAlone, then refresh `combine`.
train, test = (
    frame.drop(['Parch', 'SibSp', 'FamilySize'], axis=1)
    for frame in (train, test)
)
combine = [train, test]
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Cabin,Embarked,Title,IsAlone
0,0,3,male,1.0,7.25,1,S,Mr,0
1,1,1,female,2.0,71.2833,0,C,Mrs,0
2,1,3,female,1.0,7.925,1,S,Miss,1
3,1,1,female,2.0,53.1,0,S,Mrs,0
4,0,3,male,2.0,8.05,1,S,Mr,1


In [27]:
combine = [train, test]
# Create FareBand: split the training fares into 4 quantile bins and compute mean
# survival per bin (not the last expression of the cell, so it is not displayed).
train['FareBand'] = pd.qcut(train['Fare'], q=4)
(
    train[['FareBand', 'Survived']]
    .groupby(['FareBand'], as_index=False)
    .mean()
    .sort_values(by='FareBand', ascending=True)
)

# Convert Fare to an ordinal code based on the FareBand cut points:
#   <=7.91 -> 0, (7.91, 14.454] -> 1, (14.454, 31] -> 2, >31 -> 3
for dataset in combine:
    fare = dataset['Fare']
    # Masks are taken from the raw fares before any value is overwritten;
    # the bins are disjoint, so the order of assignment does not matter.
    quartile_masks = [
        fare <= 7.91,
        (fare > 7.91) & (fare <= 14.454),
        (fare > 14.454) & (fare <= 31),
        fare > 31,
    ]
    for fare_code, in_quartile in enumerate(quartile_masks):
        dataset.loc[in_quartile, 'Fare'] = fare_code
    dataset['Fare'] = dataset['Fare'].astype(int)

# FareBand was only a helper for choosing the cut points; remove it and refresh `combine`.
train = train.drop(['FareBand'], axis=1)
combine = [train, test]
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Cabin,Embarked,Title,IsAlone
0,0,3,male,1.0,0,1,S,Mr,0
1,1,1,female,2.0,3,0,C,Mrs,0
2,1,3,female,1.0,1,1,S,Miss,1
3,1,1,female,2.0,3,0,S,Mrs,0
4,0,3,male,2.0,1,1,S,Mr,1


In [30]:
# Separate the target from the features; PassengerId is an identifier, not a feature,
# so it is removed from the test matrix.
X_train, y_train = train.drop(['Survived'], axis=1), train['Survived']
X_test = test.drop(['PassengerId'], axis=1)
X_train.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Cabin,Embarked,Title,IsAlone
0,3,male,1.0,0,1,S,Mr,0
1,1,female,2.0,3,0,C,Mrs,0
2,3,female,1.0,1,1,S,Miss,1
3,1,female,2.0,3,0,S,Mrs,0
4,3,male,2.0,1,1,S,Mr,1


In [31]:
# Use DictVectorizer to turn each row into a numeric feature vector: string
# columns are expanded into one indicator column per value ('Sex=male', ...),
# numeric columns are kept as a single column.
from sklearn.feature_extraction import DictVectorizer
dict_vec = DictVectorizer(sparse=False)
# Learn the feature layout from the training rows only ...
X_train = dict_vec.fit_transform(X_train.to_dict(orient='records'))
# ... and reuse it for the test rows. transform() (not fit_transform) keeps the
# column set and order identical to X_train; re-fitting on the test data would
# silently shift or drop columns whenever a category is missing from one split.
X_test = dict_vec.transform(X_test.to_dict(orient='records'))
dict_vec.feature_names_

['Age',
 'Cabin',
 'Embarked=C',
 'Embarked=Q',
 'Embarked=S',
 'Fare',
 'IsAlone',
 'Pclass',
 'Sex=Child',
 'Sex=female',
 'Sex=male',
 'Title=Master',
 'Title=Miss',
 'Title=Mr',
 'Title=Mrs',
 'Title=Officer',
 'Title=Royalty']

In [32]:
# Load the random forest and XGBoost classifiers and create one instance of each
# with default hyper-parameters.
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

rfc = RandomForestClassifier()
xgbc = XGBClassifier()

In [33]:
# Evaluate both classifiers with 5-fold cross-validation and print the mean score.
# cross_val_score lives in sklearn.model_selection; the old sklearn.cross_validation
# module was deprecated and later removed.
from sklearn.model_selection import cross_val_score
print(cross_val_score(rfc, X_train, y_train, cv=5).mean())
print(cross_val_score(xgbc, X_train, y_train, cv=5).mean())



0.819406926835
0.825006356863


In [141]:
# Fit the random forest on the full training matrix, predict the test set and
# write a PassengerId/Survived submission file.
rfc_y_pred = rfc.fit(X_train, y_train).predict(X_test)
rfc_sub = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': rfc_y_pred,
})
# NOTE(review): absolute, machine-specific output path.
rfc_sub.to_csv('/Users/Mao/Kaggle/titanic/rfc_sub.csv', index=False)

In [142]:
# Fit XGBoost on the full training matrix, predict the test set and
# write a PassengerId/Survived submission file.
xgbc_y_pred = xgbc.fit(X_train, y_train).predict(X_test)
xgbc_sub = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': xgbc_y_pred,
})
# NOTE(review): absolute, machine-specific output path.
xgbc_sub.to_csv('/Users/Mao/Kaggle/titanic/xgbc_sub.csv', index=False)

In [41]:
# Coarse grid search over gradient-boosting hyper-parameters (3 x 3 x 3 = 27
# candidates, 5-fold CV each).
from sklearn.ensemble import GradientBoostingClassifier
# GridSearchCV lives in sklearn.model_selection; the old sklearn.grid_search
# module was deprecated and later removed.
from sklearn.model_selection import GridSearchCV
params={'max_depth':[3,4,5],'n_estimators':[10, 100, 300],'learning_rate':[0.01,0.1,1.0]}

gbc_best=GradientBoostingClassifier()
gs=GridSearchCV(gbc_best, params, n_jobs=-1, cv=5, verbose=1)
# The search has to be fitted here: the following cell reads gs.best_score_ and
# gs.best_params_, which only exist after fit().
gs.fit(X_train, y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=-1)]: Done 102 tasks      | elapsed:    9.7s
[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed:   13.5s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'max_depth': [3, 4, 5], 'n_estimators': [10, 100, 300], 'learning_rate': [0.01, 0.1, 1.0]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=1)

In [42]:
# Report the best score found by the grid search and the parameter combination
# that produced it (both attributes require a fitted `gs`).
print (gs.best_score_)
print (gs.best_params_)

0.8338945005611672
{'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100}


In [45]:
# Finer grid search over gradient-boosting hyper-parameters
# (3 x 3 x 3 = 27 candidates, 5-fold CV each).
params = {
    'max_depth': [2, 3, 4],
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.02, 0.03],
}

gbc_best = GradientBoostingClassifier()
gs = GridSearchCV(
    gbc_best,
    params,
    n_jobs=-1,   # use every available core
    cv=5,
    verbose=1,
)
gs.fit(X_train, y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=-1)]: Done 128 out of 135 | elapsed:    5.6s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed:    6.1s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'max_depth': [2, 3, 4], 'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.02, 0.03]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=1)

In [46]:
# Report the best score found by the grid search and the parameter combination
# that produced it (both attributes require a fitted `gs`).
print (gs.best_score_)
print (gs.best_params_)

0.8338945005611672
{'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100}


In [48]:
# Predict the test set through the fitted grid search and write a
# PassengerId/Survived submission file.
gbc_best_y_pred = gs.predict(X_test)
gbc_best_sub = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': gbc_best_y_pred,
})
# NOTE(review): absolute, machine-specific output path.
gbc_best_sub.to_csv('/Users/Mao/Kaggle/titanic/gbc_best_sub.csv', index=False)

In [None]:
# Use a grid search to look for a higher-scoring XGBoost configuration
# (2 x 2 x 2 = 8 candidates, 5-fold CV each).
# GridSearchCV lives in sklearn.model_selection; the old sklearn.grid_search
# module was deprecated and later removed.
from sklearn.model_selection import GridSearchCV
params={'max_depth':[4,5],'n_estimators':[100, 300],'learning_rate':[0.05,0.1]}

xgbc_best=XGBClassifier()
gs=GridSearchCV(xgbc_best, params, n_jobs=-1, cv=5, verbose=1)
gs.fit(X_train, y_train)

In [None]:
# Report the best score found by the grid search and the parameter combination
# that produced it (both attributes require a fitted `gs`).
print (gs.best_score_)
print (gs.best_params_)

In [None]:
# Predict the test set through the fitted grid search and write a
# PassengerId/Survived submission file.
xgbc_best_y_pred=gs.predict(X_test)
# The ids come from `test`; the name `test1` is never defined in this notebook
# and raised a NameError.
xgbc_best_sub=pd.DataFrame({'PassengerId':test['PassengerId'], 'Survived':xgbc_best_y_pred})
# NOTE(review): absolute, machine-specific output path.
xgbc_best_sub.to_csv('/Users/Mao/Kaggle/titanic/xgbc_best_sub.csv', index=False)