In [1]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
#データクレンジング
train_df = pd.read_csv('../input/train.csv')
test_df = pd.read_csv('../input/test.csv')
combine = [train_df, test_df]

train_df=train_df.drop(['Ticket','Cabin'],axis=1)
test_df=test_df.drop(['Ticket','Cabin'],axis=1)
combine=[train_df,test_df]

for dataset in combine:
    #正規表現のパターンで分割するstr.extract,expand=Trueだとpandas.DataFrame [A-Za-z]１桁以上の半角英（a～z、A～Z）にマッチ
    dataset['Title']=dataset.Name.str.extract('([A-Za-z]+)\.',expand=False)

for dataset in combine:
    #要素の値を置換するには、replace()
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col',\
    'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')
    
title_mapping={'Mr':1,'Miss':2,"Mrs":3,'Master':4,'Rare':5}
for dataset in combine:
    #map("A":"B")でAをBに入れ替える。
    dataset['Title']=dataset['Title'].map(title_mapping)
    #fillna欠損値を他の値に置換
    dataset['Title']=dataset['Title'].fillna(0)

train_df=train_df.drop(['Name','PassengerId'],axis=1)
test_df=test_df.drop(['Name','PassengerId'],axis=1)
combine=[train_df,test_df]

for dataset in combine:
    dataset['Sex']=dataset['Sex'].map({'female':1,'male':0}).astype(int)

guess_ages=np.zeros((2,3))
for dataset in combine:
    for i in range(0,2):
        for j in range(0,3):
            #欠損値を除外dropna
            guess_df=dataset[(dataset['Sex']==i)& (dataset['Pclass']==j+1)]['Age'].dropna()
            #median中央値
            age_guess=guess_df.median()
            guess_ages[i,j]=int(age_guess/0.5+0.5)*0.5
        
    for i in range(0, 2):
        for j in range(0, 3):
            dataset.loc[ (dataset.Age.isnull()) & (dataset.Sex == i) & (dataset.Pclass == j+1),\
                    'Age'] = guess_ages[i,j]

    dataset['Age'] = dataset['Age'].astype(int)

train_df['AgeBand']=pd.cut(train_df['Age'],5)
test_df['AgeBand']=pd.cut(test_df['Age'],5)

for dataset in combine:
    dataset.loc[dataset['Age']<=16,'Age']=0
    dataset.loc[(dataset['Age']>16)&(dataset['Age']<=32),'Age']=1
    dataset.loc[(dataset['Age']>32)&(dataset['Age']<=48),'Age']=2
    dataset.loc[(dataset['Age']>48)&(dataset['Age']<=64),'Age']=3
    dataset.loc[dataset['Age']>64,'Age']=4
    
train_df=train_df.drop(['AgeBand'],axis=1)
test_df=test_df.drop(['AgeBand'],axis=1)
combine=[train_df,test_df]

for dataset in combine:
    dataset['FamilySize']=dataset['SibSp']+dataset['Parch']+1

for dataset in combine:
    dataset['IsAlone']=0
    dataset.loc[dataset['FamilySize']==1,'IsAlone']=1

train_df=train_df.drop(['Parch','SibSp','FamilySize'],axis=1)
test_df=test_df.drop(['Parch','SibSp','FamilySize'],axis=1)
combine=[train_df,test_df]

for dataset in combine:
    dataset['Age*Class']=dataset.Age*dataset.Pclass
    
freq_port=train_df.Embarked.dropna().mode()[0]
for dataset in combine:
    #欠損値を他の値に置換（穴埋め）するにはfillna()
    dataset['Embarked']=dataset['Embarked'].fillna(freq_port)
    
for dataset in combine:
    dataset['Embarked']=dataset['Embarked'].map({'S':0,'C':1,'Q':2}).astype(int)

test_df['Fare'].fillna(test_df['Fare'].dropna().median(),inplace=True)
train_df['FareBand'] = pd.qcut(train_df['Fare'], 4)

for dataset in combine:
    dataset.loc[dataset['Fare']<=7.91,'Fare']=0
    dataset.loc[(dataset['Fare']>7.91)&(dataset['Fare']<=14.454),'Fare']=1
    dataset.loc[(dataset['Fare']>14.454)&(dataset['Fare']<=31.0),'Fare']=2
    dataset.loc[dataset['Fare']>31.0,'Fare']=3
    dataset['Fare']=dataset['Fare'].astype(int)
    
train_df=train_df.drop(['FareBand'],axis=1)
combine=[train_df,test_df]

In [3]:
x_train=train_df.drop("Survived",axis=1)
y_train=train_df["Survived"]
x_test=test_df.copy()


x_train.shape, y_train.shape, x_test.shape

((891, 8), (891,), (418, 8))

In [4]:
logreg=LogisticRegression()
logreg.fit(x_train,y_train)
y_pred=logreg.predict(x_test)
acc_logreg=round(logreg.score(x_train,y_train)*100,2)
acc_logreg



81.26

In [5]:
coeff_df=pd.DataFrame(train_df.columns.delete(0))
coeff_df.columns=['Feature']
#model.coef_  #回帰変数の表示
coeff_df['Correlation']=pd.Series(logreg.coef_[0])
coeff_df.sort_values(by='Correlation',ascending=False)


Unnamed: 0,Feature,Correlation
1,Sex,2.200978
5,Title,0.414362
4,Embarked,0.281026
6,IsAlone,0.262084
3,Fare,-0.022062
7,Age*Class,-0.085959
2,Age,-0.371647
0,Pclass,-1.083515


In [31]:
svc=SVC()
svc.fit(x_train,y_train)
y_pred=svc.predict(x_test)
acc_svc=round(svc.score(x_train,y_train)*100,2)
acc_svc



83.5

In [32]:
knn=KNeighborsClassifier(n_neighbors=3)
knn.fit(x_train,y_train)
y_pred=knn.predict(x_test)
acc_knn=round(knn.score(x_train,y_train)*100,2)
acc_knn

84.06

In [33]:
gaussian=GaussianNB()
gaussian.fit(x_train,y_train)
y_pred=gaussian.predict(x_test)
acc_gaussian=round(gaussian.score(x_train,y_train)*100,2)
acc_gaussian

76.88

In [34]:
perceptron=Perceptron()
perceptron.fit(x_train,y_train)
y_pred=perceptron.predict(x_test)
acc_perceptron=round(perceptron.score(x_train,y_train)*100,2)
acc_perceptron



78.79

In [35]:
linear_svc=LinearSVC()
linear_svc.fit(x_train,y_train)
y_pred=linear_svc.predict(x_test)
acc_linear_svc=round(linear_svc.score(x_train,y_train)*100,2)
acc_linear_svc



79.46

In [36]:
sgd = SGDClassifier()
sgd.fit(x_train, y_train)
y_pred = sgd.predict(x_test)
acc_sgd = round(sgd.score(x_train, y_train) * 100, 2)
acc_sgd



77.44

In [37]:
decision_tree=DecisionTreeClassifier()
decision_tree.fit(x_train, y_train)
y_decision_tree = decision_tree.predict(x_test)
acc_decision_tree = round(decision_tree.score(x_train, y_train) * 100, 2)
acc_decision_tree

86.64

In [6]:
random_forest=RandomForestClassifier(n_estimators=100)
random_forest.fit(x_train,y_train)
y_pred=random_forest.predict(x_test)
acc_random_forest=round(random_forest.score(x_train,y_train)*100,2)
acc_random_forest

86.64

In [46]:
models = pd.DataFrame({
    'Model': ['Support Vector Machines', 'KNN', 'Logistic Regression', 
              'Random Forest', 'Naive Bayes', 'Perceptron', 
              'Stochastic Gradient Decent', 'Linear SVC', 
              'Decision Tree'],
    'Score':[acc_svc, acc_knn, acc_logreg, 
              acc_random_forest, acc_gaussian, acc_perceptron, 
              acc_sgd, acc_linear_svc, acc_decision_tree]})
models.sort_values(by='Score',ascending=False)

Unnamed: 0,Model,Score
3,Random Forest,86.64
8,Decision Tree,86.64
1,KNN,84.06
0,Support Vector Machines,83.5
2,Logistic Regression,81.26
7,Linear SVC,79.46
5,Perceptron,78.79
6,Stochastic Gradient Decent,77.44
4,Naive Bayes,76.88


In [13]:
grid_n_estimator = [10, 50, 100, 300]
grid_max_depth = [2, 4, 6, 8, 10, None]
grid_criterion = ['gini', 'entropy']
grid_seed = [0]
param_grid={            #RandomForestClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier
            'n_estimators': grid_n_estimator, #default=10
            'criterion': grid_criterion, #default=”gini”
            'max_depth': grid_max_depth, #default=None
            'oob_score': [True], #default=False -- 12/31/17 set to reduce runtime -- The best parameter for RandomForestClassifier is {'criterion': 'entropy', 'max_depth': 6, 'n_estimators': 100, 'oob_score': True, 'random_state': 0} with a runtime of 146.35 seconds.
            'random_state': grid_seed
             }
from sklearn import model_selection
tune_model=model_selection.GridSearchCV(RandomForestClassifier(),param_grid=param_grid,scoring='roc_auc',verbose=0)
tune_model.fit(x_train,y_train)
y_pred=tune_model.predict(x_test)

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])

In [14]:
test_df=pd.read_csv('../input/test.csv')
submission=pd.DataFrame({
    'PassengerId':test_df['PassengerId'],
    'Survived':y_pred
})

submission.to_csv('./submission.csv',index=False)

In [15]:
submission=pd.read_csv('./submission.csv')
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
