# タイタニックコンペティションへのSubmission

## このNotebookのねらい
- KaggleにおけるSubmissionを達成する
- 簡単なモデル作成と評価を行う



## このNotebookで重要じゃないこと
- データの観察

## 参考
- [Titanic: Machine Learning from Disaster | Kaggle](https://www.kaggle.com/c/titanic)


# Kaggleについて
- [Devsumi 2018summer](https://www.slideshare.net/HaradaKei/devsumi-2018summer)
- [Kaggleに登録したら次にやること ～ これだけやれば十分闘える！Titanicの先へ行く入門 10 Kernel ～ - Qiita](https://qiita.com/upura/items/3c10ff6fed4e7c3d70f0)

In [1]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Silence every warning for the rest of the session to keep cell output short.
# NOTE(review): this is a blanket filter, so it also hides library warnings
# that may point at real problems (e.g. deprecations) -- consider narrowing it.
warnings.filterwarnings('ignore')


%matplotlib inline

# データの準備

## 読み込み

In [2]:
# Load the training split of the Titanic data.
train = pd.read_csv('input/titanic/train.csv')

# Keep only the target and three features to keep the example simple.
# .copy() makes this an independent frame, so the column assignments done
# during preprocessing do not write into a slice of the frame read above.
train = train[['Survived', 'Pclass', 'Embarked', 'Fare']].copy()

train.head()

Unnamed: 0,Survived,Pclass,Embarked,Fare
0,0,3,S,7.25
1,1,1,C,71.2833
2,1,3,S,7.925
3,1,1,S,53.1
4,0,3,S,8.05


## 前処理 

### Embarked on Train data

In [3]:
# Port code -> integer id. Kept as a module-level name because the test data
# is encoded with the same table.
embarked_map = {'S': 0, 'C': 1, 'Q': 2}

# For now, fill missing ports with 'S' (the most frequent value according to
# the original note), then encode.
# Caution: the column is overwritten in place, so running this cell a second
# time maps the already-encoded integers to NaN.
train['Embarked'] = train['Embarked'].fillna('S').map(embarked_map)

train.isnull().sum()

Survived    0
Pclass      0
Embarked    0
Fare        0
dtype: int64

In [4]:
# Show the first rows to confirm Embarked is now numeric.
train.head()

Unnamed: 0,Survived,Pclass,Embarked,Fare
0,0,3,0,7.25
1,1,1,1,71.2833
2,1,3,0,7.925
3,1,1,0,53.1
4,0,3,0,8.05


## データ分割(ホールドアウト)

In [5]:
# Separate the target column from the feature columns.
# drop() already returns a new DataFrame, so no extra wrapping is needed.
y = train['Survived']
X = train.drop(columns='Survived')

In [6]:
from sklearn.model_selection import train_test_split

# Hold-out split: 80% of the rows for training, the rest for validation.
# random_state is fixed so the split is the same on every run.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, random_state=0)

In [7]:
# Sanity check: (rows, columns) of the training and validation features.
X_train.shape, X_valid.shape

((712, 3), (179, 3))

In [8]:
# Sanity check: the target lengths should match the feature row counts.
y_train.shape, y_valid.shape

((712,), (179,))

# 学習

## LogisticRegression

In [9]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with the library's default hyperparameters.
logreg = LogisticRegression()

# Fit on the training split only; the validation split is kept for scoring.
logreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

### 評価

In [10]:
# score() returns mean accuracy for a classifier; report both splits.
print(f'Train Accuracy: {logreg.score(X_train, y_train):.5f}')
print(f'Valid Accuracy: {logreg.score(X_valid, y_valid):.5f}')

Train Accuracy: 0.67275
Valid Accuracy: 0.71508


## SVC

In [11]:
from sklearn.svm import SVC

# Support vector classifier with the library's default hyperparameters.
svm = SVC()

# Fit on the same training split as the logistic regression for a fair comparison.
svm.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [12]:
# Same accuracy report as for the logistic regression, now for the SVC.
print(f'Train Accuracy: {svm.score(X_train, y_train):.5f}')
print(f'Valid Accuracy: {svm.score(X_valid, y_valid):.5f}')

Train Accuracy: 0.74579
Valid Accuracy: 0.74860


# モデル選択
- Valid AccuracyはSVC(0.74860)の方がLogisticRegression(0.71508)より高かったので、SVCを採択する

In [13]:
# Refit on the full training data (a common choice on Kaggle before submitting).
final_model = SVC()

final_model.fit(X, y)

# NOTE: X contains the rows of both X_train and X_valid, so the model has
# already seen the "validation" rows. The numbers below are therefore not
# hold-out estimates and must not be compared with the earlier Valid scores.
print(f'Train Accuracy: {final_model.score(X_train, y_train):.5f}')
print(f'Valid Accuracy: {final_model.score(X_valid, y_valid):.5f}')

Train Accuracy: 0.74860
Valid Accuracy: 0.77095


# submission用のデータを作成

In [14]:
# Load the test split used for the submission.
test = pd.read_csv('input/titanic/test.csv')

# Note: the test data has no Survived column, so only the features are selected.
# .copy() makes this an independent frame, so the column assignments done
# during preprocessing do not write into a slice of the frame read above.
test = test[['Pclass', 'Embarked', 'Fare']].copy()

test.head()

Unnamed: 0,Pclass,Embarked,Fare
0,3,Q,7.8292
1,3,S,7.0
2,2,Q,9.6875
3,3,S,8.6625
4,3,S,12.2875


## 前処理

In [15]:
# The test data must go through the same preprocessing as the training data!
# final_model.predict(test) # ValueError: could not convert string to float: 'S'

In [16]:
# Mirror the training-side Embarked handling: fill missing ports with 'S'
# first, then encode with the shared embarked_map. Without the fill, a missing
# port would stay NaN after map() and end up in the features given to the model.
# Caution: the column is overwritten in place, so running this cell a second
# time maps the already-encoded integers to NaN.
test['Embarked'] = test['Embarked'].fillna('S').map(embarked_map)

test.head()

Unnamed: 0,Pclass,Embarked,Fare
0,3,2,7.8292
1,3,0,7.0
2,2,2,9.6875
3,3,0,8.6625
4,3,0,12.2875


In [17]:
# According to the original note, Fare is missing only in the test data, and
# in just one row. There are many imputation strategies; since this is a
# practice run, the median is considered good enough.
fare_median = test['Fare'].median()
test['Fare'] = test['Fare'].fillna(fare_median)

In [18]:
# Predict a class label for every row of the preprocessed test features.
test_pred = final_model.predict(test)

test_pred

array([0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,

In [19]:
# Use the sample submission file as a template for the output format.
submission = pd.read_csv('input/titanic/gender_submission.csv')

submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [20]:
# Overwrite the template's Survived column with the model predictions.
# The assignment is positional: assumes gender_submission.csv lists passengers
# in the same order as test.csv -- TODO confirm.
submission['Survived'] = test_pred

submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [21]:
from pathlib import Path

# to_csv does not create missing directories, so make sure the output
# folder exists before writing.
Path('submission').mkdir(parents=True, exist_ok=True)

# index=False keeps the row index out of the file.
submission.to_csv('submission/my_first_submission.csv', index=False)

# result
- Train: 0.67275 (LogisticRegressionの値。採択したSVCは 0.74579)
- Valid: 0.71508 (LogisticRegressionの値。採択したSVCは 0.74860)
- Test : 0.65071

# 演習
- 今回の結果から何が言えるか考えてみよう

おわり