In [1]:
import pandas as pd
import numpy as np
import re
import sklearn
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import KFold, RepeatedKFold;



In [2]:
# Load the train and test datasets from CSV files under ./data
train = pd.read_csv("./data/train.csv")
test = pd.read_csv("./data/test.csv")

# Keep the test-set passenger IDs in their own variable: the 'PassengerId'
# column is removed from `test` by the feature-selection step further down.
PassengerId = test['PassengerId']

# Preview the first three rows of the training frame
train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [3]:
# Both frames receive the same engineered columns, so they are processed as a pair.
full_data = [train, test]

# Seed used only for the random Age imputation below, so that a fresh
# "Restart & Run All" produces the same imputed ages every time.
AGE_IMPUTE_SEED = 0

for dataset in full_data:
    # Length of the raw name string
    dataset['Name_length'] = dataset['Name'].apply(len)
    # 1 when a Cabin value is present, 0 when it is missing
    dataset['Has_Cabin'] = dataset['Cabin'].notna().astype(int)

#Feature engineering
for dataset in full_data:
    # FamilySize combines SibSp and Parch, plus one
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
    # IsAlone is 1 exactly when FamilySize is 1
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1
    # Missing embarkation ports are filled with 'S'
    dataset['Embarked'] = dataset['Embarked'].fillna('S')
    # Missing fares (in either frame) are filled with the training-set median
    dataset['Fare'] = dataset['Fare'].fillna(train['Fare'].median())

# Quartile bins of Fare on the training frame (dropped again during feature selection)
train['CategoricalFare'] = pd.qcut(train['Fare'], 4)

# Missing ages are replaced by random integers drawn from [mean - std, mean + std)
# of the frame's own Age column.
age_rng = np.random.RandomState(AGE_IMPUTE_SEED)
for dataset in full_data:
    age_avg = dataset['Age'].mean()
    age_std = dataset['Age'].std()
    age_missing = dataset['Age'].isnull()
    age_null_count = age_missing.sum()
    age_null_random_list = age_rng.randint(age_avg - age_std, age_avg + age_std, size=age_null_count)
    # Single .loc write instead of chained indexing (dataset['Age'][mask] = ...),
    # which may assign into a temporary copy and leave the frame unchanged.
    dataset.loc[age_missing, 'Age'] = age_null_random_list
    dataset['Age'] = dataset['Age'].astype(int)

# Five equal-width Age bins on the training frame (dropped again during feature selection)
train['CategoricalAge'] = pd.cut(train['Age'], 5)
# Define the function to extract titles from passengers' names
def get_title(name):
    """Return the title embedded in a passenger name.

    The title is the first run of ASCII letters that is preceded by a space
    and immediately followed by a period, e.g. ``"Mr"`` in
    ``"Braund, Mr. Owen Harris"``.

    Parameters
    ----------
    name : str
        Full name string to search.

    Returns
    -------
    str
        The matched title without the trailing period, or ``""`` when the
        name contains no such pattern.
    """
    # Raw string: in a normal literal "\." is an invalid escape sequence,
    # which Python reports with a warning at compile time.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1)
    return ""
# Build the Title feature from passenger names and turn every remaining
# categorical / continuous column into small integer codes.

# Uncommon titles collapse into "Rare"; alternate spellings map onto the common form
title_aliases = {rare: 'Rare' for rare in ['Lady','Countess','Capt','Col','Don',
                                           'Dr','Major','Rev','Sir','Jonkheer','Dona']}
title_aliases.update({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

title_mapping = {"Mr":1 ,"Miss":2, "Mrs":3 ,"Master": 4 ,"Rare":5}
sex_codes = {'female':0, 'male':1}
port_codes = {'S':0, 'C':1 ,'Q':2}

# Right-closed bin edges: a value v gets code k when edges[k] < v <= edges[k + 1]
fare_edges = [-np.inf, 7.91, 14.454, 31, np.inf]
age_edges = [-np.inf, 16, 32, 48, 64, np.inf]

for dataset in full_data:
    # Title: extract, normalise, encode; titles outside the mapping become 0
    titles = dataset['Name'].apply(get_title).replace(title_aliases)
    dataset['Title'] = titles.map(title_mapping).fillna(0)

    dataset['Sex'] = dataset['Sex'].map(sex_codes).astype(int)
    dataset['Embarked'] = dataset['Embarked'].map(port_codes).astype(int)

    # Fare -> codes 0..3, Age -> codes 0..4
    dataset['Fare'] = pd.cut(dataset['Fare'], bins=fare_edges, labels=False).astype(int)
    dataset['Age'] = pd.cut(dataset['Age'], bins=age_edges, labels=False).astype(int)

# Feature selection
drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp']
# The two Categorical* helper columns exist only on the training frame
train = train.drop(columns=drop_elements + ['CategoricalAge', 'CategoricalFare'])
test = test.drop(columns=drop_elements)
    

In [4]:
# Inspect the first five rows of the training frame after feature engineering
train.head(5)

Unnamed: 0,Survived,Pclass,Sex,Age,Parch,Fare,Embarked,Name_length,Has_Cabin,FamilySize,IsAlone,Title
0,0,3,1,1,0,0,0,23,0,2,0,1
1,1,1,0,2,0,3,1,51,1,2,0,3
2,1,3,0,1,0,1,0,22,0,1,1,2
3,1,1,0,2,0,3,0,44,1,2,0,3
4,0,3,1,2,0,1,0,24,0,1,1,1


In [5]:
class SklearnHelper(object):
    """Thin wrapper that gives an estimator a small, uniform interface.

    Parameters
    ----------
    clf : callable
        Estimator class (or factory); it is called as ``clf(**params)``.
    seed : int, optional
        Value passed to the estimator as the ``random_state`` keyword.
    params : dict, optional
        Keyword arguments for ``clf``. The dict is copied, never modified.
        ``None`` means "no extra keyword arguments".
    """

    def __init__(self, clf, seed=0, params=None):
        # Work on a copy so the caller's dict is not mutated, and so that the
        # default params=None does not fail on item assignment.
        params = dict(params) if params is not None else {}
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        """Fit the wrapped estimator; returns nothing."""
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        """Return the wrapped estimator's predictions for ``x``."""
        return self.clf.predict(x)

    def fit(self, x, y):
        """Fit the wrapped estimator and return whatever its ``fit`` returns."""
        return self.clf.fit(x, y)

    def feature_importances(self, x, y):
        """Fit on ``(x, y)`` and print the fitted ``feature_importances_`` attribute."""
        print(self.clf.fit(x, y).feature_importances_)


In [6]:
# Row counts of the train and test frames
ntrain = train.shape[0]
ntest = test.shape[0]
# Seed for the fold splitter below (also passed to the model wrapper later on)
SEED = 226
# Number of cross-validation folds
NFOLDS = 5
kf = KFold(n_splits=NFOLDS, shuffle=True ,random_state=SEED) # K-fold cross-validation splitter

#kf = RepeatedKFold(n_splits=NFOLDS, n_repeats =2 ,random_state=SEED) # repeated K-fold cross-validation (disabled)
def get_oof(clf, x_train, y_train, x_test):
    """Produce out-of-fold predictions for ``clf`` using the module-level ``kf`` splitter.

    For every fold the model is trained on the fold's training rows and its
    predictions for the held-out rows are stored, so each training row ends
    up with a prediction from a model that did not see it. The per-fold
    accuracy on the held-out rows is printed.

    Parameters
    ----------
    clf : object
        Model wrapper exposing ``train(x, y)`` and ``predict(x)``.
    x_train : numpy.ndarray
        Training features, indexable by the integer index arrays from ``kf.split``.
    y_train : numpy.ndarray
        Training labels aligned with ``x_train``.
    x_test : numpy.ndarray
        Test features.

    Returns
    -------
    tuple of numpy.ndarray
        ``(oof_train, oof_test)`` reshaped to column vectors of shape
        ``(n_train_rows, 1)`` and ``(n_test_rows, 1)``.
    """
    # Size the buffers from the arguments rather than from notebook globals,
    # so the function stays correct whatever arrays are passed in.
    oof_train = np.zeros((x_train.shape[0],))
    oof_test = np.zeros((x_test.shape[0],))

    # train_index: rows used for fitting in this fold; test_index: held-out rows
    for i, (train_index, test_index) in enumerate(kf.split(x_train)):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]
        y_te = y_train[test_index]
        clf.train(x_tr, y_tr)

        # Predict the held-out rows once and reuse the result for both the
        # out-of-fold buffer and the accuracy report.
        fold_pred = clf.predict(x_te)
        oof_train[test_index] = fold_pred

        print("RF K-fold: %d accuracy score is %f " % (i,accuracy_score(y_te.reshape(-1),fold_pred.reshape(-1))))

    # NOTE(review): the test predictions come from the model as fitted on the
    # LAST fold only; nothing is averaged across folds. Confirm this is intended.
    oof_test[:] = clf.predict(x_test)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [12]:
# Keyword arguments for the MLPClassifier that is built from this dict.
# NOTE(review): the parameter descriptions below follow the scikit-learn
# MLPClassifier documentation — confirm against the installed version.
nn_params = {
    'hidden_layer_sizes': 10,         # hidden layer size(s); presumably a single layer of 10 units
     #'warm_start': True, 
     'activation': 'relu',
    'solver': 'adam',
    'alpha': 0.0001,  # regularization term
    'batch_size' : 28,    # minibatch size used by the optimizer
    'verbose': 1,
    'learning_rate_init':0.003,
    'max_iter':200,              # maximum number of iterations
    'early_stopping':False,
    'beta_1':0.9,             # used when solver='adam'
    'beta_2':0.999,
    'epsilon':1e-8
}

In [13]:
# Create the single model used in this notebook: an MLPClassifier wrapped in
# SklearnHelper, seeded with SEED and configured by nn_params
NN = SklearnHelper(clf=MLPClassifier, seed=SEED, params=nn_params)

In [14]:
# Create NumPy arrays of the target (Survived), train and test data to feed into the model.
# .values is used for the target instead of Series.ravel(): a Series is
# already one-dimensional and ravel() is deprecated on Series in recent pandas.
y_train = train['Survived'].values
# From here on `train` holds features only
train = train.drop(['Survived'], axis=1)
x_train = train.values # array of the training features
x_test = test.values # array of the test features

In [15]:
# Run the K-fold routine for the MLP wrapper. get_oof returns two column
# vectors: predictions for the training rows (each made while that row was
# held out) and predictions for the test rows.
nn_oof_train0, nn_oof_test0 = get_oof(NN,x_train, y_train, x_test)
print("Training is complete")

Iteration 1, loss = 0.91618456
Iteration 2, loss = 0.80141046
Iteration 3, loss = 0.76992917
Iteration 4, loss = 0.74829168
Iteration 5, loss = 0.72742811
Iteration 6, loss = 0.70755718
Iteration 7, loss = 0.68934297
Iteration 8, loss = 0.67127545
Iteration 9, loss = 0.65533819
Iteration 10, loss = 0.64183796
Iteration 11, loss = 0.63123984
Iteration 12, loss = 0.61955149
Iteration 13, loss = 0.61088637
Iteration 14, loss = 0.60358504
Iteration 15, loss = 0.59489157
Iteration 16, loss = 0.58784471
Iteration 17, loss = 0.58101972
Iteration 18, loss = 0.57479125
Iteration 19, loss = 0.56948513
Iteration 20, loss = 0.56321918
Iteration 21, loss = 0.55704330
Iteration 22, loss = 0.55195045
Iteration 23, loss = 0.54773001
Iteration 24, loss = 0.54353337
Iteration 25, loss = 0.53985127
Iteration 26, loss = 0.53378339
Iteration 27, loss = 0.53056033
Iteration 28, loss = 0.52643931
Iteration 29, loss = 0.52310039
Iteration 30, loss = 0.52009980
Iteration 31, loss = 0.51645969
Iteration 32, los

Iteration 131, loss = 0.43852155
Iteration 132, loss = 0.43853443
Iteration 133, loss = 0.44098704
Iteration 134, loss = 0.43872602
Training loss did not improve more than tol=0.000100 for two consecutive epochs. Stopping.
RF K-fold: 1 accuracy score is 0.820225 
Iteration 1, loss = 0.92100679
Iteration 2, loss = 0.80010646
Iteration 3, loss = 0.76658937
Iteration 4, loss = 0.74625826
Iteration 5, loss = 0.72631712
Iteration 6, loss = 0.70681588
Iteration 7, loss = 0.68986649
Iteration 8, loss = 0.67348302
Iteration 9, loss = 0.65845883
Iteration 10, loss = 0.64537120
Iteration 11, loss = 0.63442864
Iteration 12, loss = 0.62390336
Iteration 13, loss = 0.61588062
Iteration 14, loss = 0.61021485
Iteration 15, loss = 0.60308820
Iteration 16, loss = 0.59380047
Iteration 17, loss = 0.58764130
Iteration 18, loss = 0.58264537
Iteration 19, loss = 0.57606121
Iteration 20, loss = 0.57100423
Iteration 21, loss = 0.56592812
Iteration 22, loss = 0.56085036
Iteration 23, loss = 0.55683323
Iteration

Iteration 117, loss = 0.44024085
Iteration 118, loss = 0.43886826
Iteration 119, loss = 0.44168026
Iteration 120, loss = 0.44060943
Iteration 121, loss = 0.43779033
Iteration 122, loss = 0.43799605
Iteration 123, loss = 0.43732623
Iteration 124, loss = 0.43746177
Iteration 125, loss = 0.43670172
Iteration 126, loss = 0.43644253
Iteration 127, loss = 0.43650152
Iteration 128, loss = 0.43648184
Iteration 129, loss = 0.43634905
Training loss did not improve more than tol=0.000100 for two consecutive epochs. Stopping.
RF K-fold: 3 accuracy score is 0.820225 
Iteration 1, loss = 0.92051348
Iteration 2, loss = 0.79753057
Iteration 3, loss = 0.76746464
Iteration 4, loss = 0.74950421
Iteration 5, loss = 0.72947996
Iteration 6, loss = 0.70943067
Iteration 7, loss = 0.69049381
Iteration 8, loss = 0.67386197
Iteration 9, loss = 0.65745330
Iteration 10, loss = 0.64396128
Iteration 11, loss = 0.63197588
Iteration 12, loss = 0.62114819
Iteration 13, loss = 0.61304679
Iteration 14, loss = 0.60448256


In [17]:
#print(rf_oof_train.reshape(-1))
# Accuracy of NN, in whatever state its last fit left it, on the full training set.
# NOTE(review): this includes rows the model was fitted on, so it is a
# training-set score rather than a held-out estimate.
print(accuracy_score(y_train,NN.predict(x_train)))

0.800224466891


In [12]:
# Submit the predictions of a single base learner directly (disabled)
#print(rf_oof_test.round())
#StackingSubmission = pd.DataFrame({ 'PassengerId': PassengerId,
#                           'Survived': rf_oof_test.round().reshape(-1).astype(np.int8)})
#StackingSubmission.to_csv("RFSubmission.csv", index=False)