# Ⅱ. タイタニック号沈没における生存可否

## 1. モジュール読込

In [23]:
import numpy as np
import pandas as pd
from pandas import Series,DataFrame

import math

#プロット用
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

# matplotlibの表示設定
%matplotlib inline

# Machine learning
from sklearn.linear_model import LogisticRegression
try:
    # train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
    # the old sklearn.cross_validation module was removed in 0.20.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Fall back for older scikit-learn releases that predate model_selection.
    from sklearn.cross_validation import train_test_split

# 性能評価用
from sklearn import metrics

import statsmodels.api as sm

## 2. データ整形

### データの一部を表示

In [24]:
# Directory that holds train.csv and test.csv. Change this single constant to
# point the notebook at a different location.
DATA_DIR = 'C:/workspace/python'

# Load the training and test sets.
train_df = pd.read_csv(DATA_DIR + '/train.csv', engine='python')
test_df = pd.read_csv(DATA_DIR + '/test.csv', engine='python')

# Preview the first three rows of the training data.
train_df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [25]:
# Columns that are not used by the model; removed from both data sets.
unused_columns = ['Name', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']

dropped_train_df = train_df.drop(unused_columns, axis=1)
dropped_test_df = test_df.drop(unused_columns, axis=1)

# Preview the reduced training frame.
dropped_train_df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age
0,1,0,3,male,22.0
1,2,1,1,female,38.0
2,3,1,3,female,26.0


In [26]:
# Summary statistics of the numeric columns
# (count, mean, standard deviation, min, quartiles, max).
train_summary = dropped_train_df.describe()
train_summary



Unnamed: 0,PassengerId,Survived,Pclass,Age
count,891.0,891.0,891.0,714.0
mean,446.0,0.383838,2.308642,29.699118
std,257.353842,0.486592,0.836071,14.526497
min,1.0,0.0,1.0,0.42
25%,223.5,0.0,2.0,
50%,446.0,0.0,3.0,
75%,668.5,1.0,3.0,
max,891.0,1.0,3.0,80.0


In [27]:
# Fill missing ages with the rounded mean age of passengers of the same sex.

def fill_age_by_sex(df, age_mean_by_sex):
    """Return df's Age column with missing values filled per sex.

    Parameters
    ----------
    df : DataFrame
        Frame containing 'Sex' and 'Age' columns.
    age_mean_by_sex : Series
        Mean age indexed by sex label, e.g. ``df.groupby('Sex').Age.mean()``.

    Returns
    -------
    Series
        ``df['Age']`` where each missing value is replaced by the rounded mean
        age for that row's sex. Rows whose sex is neither 'male' nor 'female'
        (or whose sex has no usable mean) keep their missing value.
    """
    rounded_mean = {
        sex: round(age_mean_by_sex[sex])
        for sex in ('male', 'female')
        if sex in age_mean_by_sex.index and pd.notnull(age_mean_by_sex[sex])
    }
    # Series.map yields NaN for labels absent from the dict, so fillna leaves
    # those rows untouched.
    return df['Age'].fillna(df['Sex'].map(rounded_mean))


# Assign the filled column back explicitly instead of calling
# fillna(inplace=True) through attribute access, which does not reliably
# update the parent frame on recent pandas versions.
age_train_mean = dropped_train_df.groupby('Sex').Age.mean()
dropped_train_df['Age'] = fill_age_by_sex(dropped_train_df, age_train_mean)

# The test set is imputed with its own per-sex means.
age_test_mean = dropped_test_df.groupby('Sex').Age.mean()
dropped_test_df['Age'] = fill_age_by_sex(dropped_test_df, age_test_mean)

dropped_train_df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age
0,1,0,3,male,22.0
1,2,1,1,female,38.0
2,3,1,3,female,26.0


In [28]:
### クロス集計

In [29]:
# Cross-tabulation of sex (rows) against survival (columns).
sex_ct = pd.crosstab(index=dropped_train_df.Sex, columns=dropped_train_df.Survived)
sex_ct

Survived,0,1
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1
female,81,233
male,468,109


In [30]:
# Cross-tabulation of passenger class (rows) against survival (columns).
pclass_ct = pd.crosstab(index=dropped_train_df.Pclass, columns=dropped_train_df.Survived)
pclass_ct

Survived,0,1
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1
1,80,136
2,97,87
3,372,119


### ダミー変数に変換

In [31]:
# Dummy variable: add a 'Female' column to both data sets that encodes Sex
# as an integer (male -> 0, female -> 1).
female_flag = {'male': 0, 'female': 1}
dropped_train_df['Female'] = dropped_train_df.Sex.map(female_flag).astype(int)
dropped_test_df['Female'] = dropped_test_df.Sex.map(female_flag).astype(int)

dropped_train_df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Female
0,1,0,3,male,22.0,0
1,2,1,1,female,38.0,1
2,3,1,3,female,26.0,1


In [32]:
# Dummy variables: one indicator column per passenger class, named Class_<n>.
pclass_train_df = pd.get_dummies(dropped_train_df.Pclass, prefix='Class')
pclass_test_df = pd.get_dummies(dropped_test_df.Pclass, prefix='Class')

pclass_train_df.head(3)

Unnamed: 0,Class_1,Class_2,Class_3
0,0.0,0.0,1.0
1,1.0,0.0,0.0
2,0.0,0.0,1.0


In [33]:
# Avoid multicollinearity: drop one of the class indicators (Class_3).
# NOTE: this cell reassigns its own inputs, so running it a second time fails
# because Class_3 has already been removed.
pclass_train_df = pclass_train_df.drop('Class_3', axis=1)
pclass_test_df = pclass_test_df.drop('Class_3', axis=1)

# Append the remaining Class_1 / Class_2 columns to each data set.
dropped_train_df = dropped_train_df.join(pclass_train_df)
dropped_test_df = dropped_test_df.join(pclass_test_df)

# Feature matrix: remove the identifier, the target, and the raw columns that
# are now represented by dummy variables.
train_non_feature_columns = ['PassengerId', 'Survived', 'Pclass', 'Sex']
X = dropped_train_df.drop(train_non_feature_columns, axis=1)

X.head(3)

Unnamed: 0,Age,Female,Class_1,Class_2
0,22.0,0,0.0,0.0
1,38.0,1,1.0,0.0
2,26.0,1,0.0,0.0


In [34]:
# Target variable: the Survived column of the training data.
Y = dropped_train_df['Survived']

Y.head(3)

0    0
1    1
2    1
Name: Survived, dtype: int64

## 3. モデル作成

In [35]:
# Create a logistic regression estimator with default settings.
log_model = LogisticRegression()

# Fit it on the training features X and target Y.
log_model.fit(X=X, y=Y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

## 4. 評価

In [36]:
# Accuracy of the model on X and Y, the same data it was fitted on
# (training accuracy, not a held-out estimate).
log_model.score(X=X, y=Y)

0.80134680134680136

## 5. 結果

In [37]:
# Pair each feature name with its fitted coefficient, one row per feature.
feature_names = X.columns
coefficients = log_model.coef_[0]
coeff_df = DataFrame([feature_names, coefficients]).transpose()
coeff_df

Unnamed: 0,0,1
0,Age,-0.0335092
1,Female,2.46126
2,Class_1,2.16662
3,Class_2,1.08844


## 6. おまけ

In [38]:
# Preview the prepared test data (first three rows).
dropped_test_df.head(n=3)

Unnamed: 0,PassengerId,Pclass,Sex,Age,Female,Class_1,Class_2
0,892,3,male,34.5,0,0.0,0.0
1,893,3,female,47.0,1,0.0,0.0
2,894,2,male,62.0,0,0.0,1.0


In [39]:
# Test feature matrix: remove the identifier and the raw columns that are
# represented by dummy variables.
# NOTE: this rebinds X, which held the training features until now.
test_non_feature_columns = ['PassengerId', 'Pclass', 'Sex']
X = dropped_test_df.drop(test_non_feature_columns, axis=1)

# Predict survival for the test data.
result = log_model.predict(X)

result

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1,
       0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0,

In [40]:
# Pair each test PassengerId with its predicted Survived label.
prediction_columns = {
    'PassengerId': test_df.PassengerId,
    'Survived': np.array(result),
}
result_df = pd.DataFrame(prediction_columns)
result_df.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
