# Ⅱ. タイタニック号沈没における生存可否

## 1. モジュール読込

In [None]:
import numpy as np
import pandas as pd
from pandas import Series,DataFrame

import math

#プロット用
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

# matplotlibの表示設定
%matplotlib inline

# Machine learning
from sklearn.linear_model import LogisticRegression
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was removed in scikit-learn 0.20.
# Fall back to the legacy location only on very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# 性能評価用
from sklearn import metrics

import statsmodels.api as sm

## 2. データ整形

### データの一部を表示

In [None]:
# Read the training and test CSV files with the pure-Python parser engine.
csv_options = {'engine': 'python'}
train_df = pd.read_csv(
    'C:/workspace/python/logistic/titanic/train.csv', **csv_options
)
test_df = pd.read_csv(
    'C:/workspace/python/logistic/titanic/test.csv', **csv_options
)

# Preview the first three training rows.
train_df.head(3)

In [None]:
# Columns that are not used by the model; removed from both data sets.
unused_columns = [
    'Name', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked',
]
dropped_train_df, dropped_test_df = (
    frame.drop(unused_columns, axis=1) for frame in (train_df, test_df)
)

# Preview the reduced training set.
dropped_train_df.head(3)

In [None]:
# Summary statistics of the numeric columns: count, mean, standard
# deviation, min/max and quartiles.
train_summary = dropped_train_df.describe()
train_summary

In [None]:
def fill_age_with_sex_mean(df):
    """Fill missing ``Age`` values in *df* with the rounded mean age of the same sex.

    The per-sex means are computed from the non-missing ages of *df* itself.
    The column is replaced by plain assignment instead of
    ``df.Age.fillna(..., inplace=True)``: that chained in-place call works on
    a temporary Series, emits a FutureWarning in recent pandas 2.x releases
    and leaves *df* unchanged under copy-on-write (the pandas 3.0 default).

    Parameters
    ----------
    df : DataFrame
        Must contain the columns ``Sex`` and ``Age``. Modified in place.

    Returns
    -------
    Series
        Unrounded mean age, indexed by the distinct values of ``Sex``.
    """
    mean_age_by_sex = df.groupby('Sex')['Age'].mean()
    # Map every row's Sex to its rounded group mean; fillna only picks the
    # entries at rows whose Age is missing.
    df['Age'] = df['Age'].fillna(df['Sex'].map(mean_age_by_sex.round()))
    return mean_age_by_sex


# Impute missing ages with the mean age per sex.
# Each data set is filled from its own per-sex means.
age_train_mean = fill_age_with_sex_mean(dropped_train_df)
age_test_mean = fill_age_with_sex_mean(dropped_test_df)

dropped_train_df.head(3)

In [None]:
### クロス集計

In [None]:
# Cross-tabulation: sex (rows) against survival (columns).
sex_ct = pd.crosstab(
    index=dropped_train_df['Sex'],
    columns=dropped_train_df['Survived'],
)
sex_ct

In [None]:
# Cross-tabulation: passenger class (rows) against survival (columns).
pclass_ct = pd.crosstab(
    index=dropped_train_df['Pclass'],
    columns=dropped_train_df['Survived'],
)
pclass_ct

### ダミー変数に変換

In [None]:
# Dummy variable: encode Sex as an integer 'Female' column
# (male -> 0, female -> 1) on both the training and the test set.
sex_to_flag = {'male': 0, 'female': 1}
for frame in (dropped_train_df, dropped_test_df):
    frame['Female'] = frame['Sex'].map(sex_to_flag).astype(int)

dropped_train_df.head(3)

In [None]:
# Dummy variables: one indicator column per passenger class, named
# with the 'Class' prefix.
pclass_train_df, pclass_test_df = (
    pd.get_dummies(frame['Pclass'], prefix='Class')
    for frame in (dropped_train_df, dropped_test_df)
)

pclass_train_df.head(3)

In [None]:
# Multicollinearity: drop one of the class indicators (Class_3) so the
# remaining dummy columns are not perfectly collinear.
pclass_train_df, pclass_test_df = (
    dummies.drop(['Class_3'], axis=1)
    for dummies in (pclass_train_df, pclass_test_df)
)

# Append the remaining Class_1 / Class_2 columns to each data set.
dropped_train_df = dropped_train_df.join(pclass_train_df)
dropped_test_df = dropped_test_df.join(pclass_test_df)

# Feature matrix: everything except the identifier, the target and the
# raw columns that were replaced by dummy variables.
non_feature_columns = ['PassengerId', 'Survived', 'Pclass', 'Sex']
X = dropped_train_df.drop(non_feature_columns, axis=1)

X.head(3)

In [None]:
# Target variable: the Survived column of the training set.
Y = dropped_train_df['Survived']

Y.head(3)

## 3. モデル作成

In [None]:
# Create a logistic regression model with default settings and train it
# on the feature matrix X and target Y (fit returns the estimator itself).
log_model = LogisticRegression().fit(X, Y)

# Display the fitted estimator.
log_model

## 4. 評価

In [None]:
# Show the model's accuracy score on X and Y.
model_accuracy = log_model.score(X, Y)
model_accuracy

## 5. 結果

In [None]:
# Pair each feature name with its fitted coefficient: build a two-row
# frame (names, coefficients) and transpose it to one row per feature.
feature_names = X.columns
coefficients = log_model.coef_[0]
coeff_df = DataFrame([feature_names, coefficients]).transpose()
coeff_df

## 6. おまけ

In [None]:
# Preview the first three rows of the prepared test set.
test_preview = dropped_test_df.head(3)
test_preview

In [None]:
# Build the test feature matrix by removing the identifier and the raw
# columns replaced by dummy variables.
# NOTE: this rebinds X, which previously held the training features.
test_non_feature_columns = ['PassengerId', 'Pclass', 'Sex']
X = dropped_test_df.drop(test_non_feature_columns, axis=1)

# Predict survival for the test passengers.
result = log_model.predict(X)

result

In [None]:
# Combine the test passenger ids with the predicted labels.
result_columns = {
    'PassengerId': test_df['PassengerId'],
    'Survived': np.array(result),
}
result_df = pd.DataFrame(result_columns)
result_df.head()