In [None]:
#先導入資料處理會用到的模組
import numpy as np
import numpy.random as random
import scipy as sp
from pandas import Series, DataFrame
import pandas as pd

# 可視化模組
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
%matplotlib inline

# 機器學習模組
import sklearn

In [None]:
#讀檔
dataset = pd.read_csv("titanic/train.csv")
testdata = pd.read_csv("titanic/test.csv")
dataset.head(50)

In [None]:
#觀察資料幾列幾行
dataset.shape, testdata.shape

In [None]:
#觀察整個資料集的資訊
dataset.info()
testdata.info()

In [None]:
#設定seaborn為預設繪圖library
sns.set()

In [None]:
def bar_chart(feature):
    survived = dataset[dataset['Survived'] == 1][feature].value_counts()
    dead = dataset[dataset['Survived'] == 0][feature].value_counts()
    df = pd.DataFrame([survived, dead])
    df.index = ['Survived','Dead']
    df.plot(kind='bar',stacked=True, figsize=(10,5))

In [None]:
#分別秀出生還/死亡的男女長條圖
bar_chart('Sex')

In [None]:
#分別秀出生還/死亡的艙等長條圖
bar_chart('Pclass')

In [None]:
# 計算各個行(欄位)裡有多少個“NaN”
dataset.isna().sum()

In [None]:
testdata.isna().sum()

# Name

In [None]:
#觀察Name欄位
dataset['Name']

In [None]:
#取出Mr., Mrs., Miss
dataset['Title'] = dataset['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
testdata['Title'] = testdata['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
#https://reurl.cc/qeZQE
#https://reurl.cc/Neb8n

In [None]:
#統計各稱謂的人數
dataset['Title'].value_counts()

In [None]:
testdata['Title'].value_counts()

In [None]:
#titlemapping
title_mapping = {"Mr": 0, "Miss": 1, "Mrs": 2, "Master": 3, "Dr": 3, "Rev": 3, "Col": 3, "Major": 3, "Mlle": 3,"Capt": 3, "Jonkheer": 3, "Ms": 3, "Countess": 3, "Sir": 3, "Don": 3, "Mme": 3, "Lady": 3, "Dona": 3}
dataset['Title'] = dataset['Title'].map(title_mapping)
testdata['Title'] = testdata['Title'].map(title_mapping)

In [None]:
#現在的dataset
dataset.head()

In [None]:
#現在的dataframe
testdata.head()

In [None]:
#秀出Title/生還死亡的分佈
bar_chart('Title')

In [None]:
# 刪除Name欄位
dataset.drop('Name', axis=1, inplace=True)
testdata.drop('Name', axis=1, inplace=True)

In [None]:
#目前的dataframe
dataset.head()

# Sex

In [None]:
#mapping sex
sex_mapping = {"male": 0, "female": 1}
dataset['Sex'] = dataset['Sex'].map(sex_mapping)
testdata['Sex'] = testdata['Sex'].map(sex_mapping)

In [None]:
dataset.head()

In [None]:
bar_chart('Sex')

In [None]:
#目前的dataframe
dataset.head()

# Age

In [None]:
# fill missing age with median age for each title (Mr, Mrs, Miss, Others)
dataset["Age"].fillna(dataset.groupby("Title")["Age"].transform("median"), inplace=True)
testdata["Age"].fillna(testdata.groupby("Title")["Age"].transform("median"), inplace=True)
dataset["Age"]

In [None]:
#年齡/生還死亡分布圖
facet = sns.FacetGrid(dataset, hue="Survived",aspect=4)
facet.map(sns.kdeplot,'Age',shade= True)
facet.set(xlim=(0, dataset['Age'].max()))
facet.add_legend()
 
plt.show()

In [None]:
#依年齡區間做mapping function
dataset.loc[dataset['Age'] <= 16, 'Age'] = 0
dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 26), 'Age'] = 1
dataset.loc[(dataset['Age'] > 26) & (dataset['Age'] <= 36), 'Age'] = 2
dataset.loc[(dataset['Age'] > 36) & (dataset['Age'] <= 62), 'Age'] = 3
dataset.loc[dataset['Age'] > 62, 'Age'] = 4

testdata.loc[testdata['Age'] <= 16, 'Age'] = 0
testdata.loc[(testdata['Age'] > 16) & (testdata['Age'] <= 26), 'Age'] = 1
testdata.loc[(testdata['Age'] > 26) & (testdata['Age'] <= 36), 'Age'] = 2
testdata.loc[(testdata['Age'] > 36) & (testdata['Age'] <= 62), 'Age'] = 3
testdata.loc[testdata['Age'] > 62, 'Age'] = 4

In [None]:
#目前的dataframe
dataset.head()

In [None]:
#秀出年齡/生還死亡的分佈
bar_chart("Age")

# Embarked (登船地點)

In [None]:
#分析各票種登船地點人數
Pclass1 = dataset[dataset['Pclass'] == 1]['Embarked'].value_counts()
Pclass2 = dataset[dataset['Pclass'] == 2]['Embarked'].value_counts()
Pclass3 = dataset[dataset['Pclass'] == 3]['Embarked'].value_counts()
df = pd.DataFrame([Pclass1, Pclass2, Pclass3])
df.index = ['1st class','2nd class', '3rd class']
df.plot(kind='bar',stacked=True, figsize=(10,5))

In [None]:
#填補登船地點缺失資料
dataset['Embarked'] = dataset['Embarked'].fillna('S')
testdata['Embarked'] = testdata['Embarked'].fillna('S')
dataset.head(100)

In [None]:
# 登船地點的Mapping Function
embarked_mapping = {"S": 0, "C": 1, "Q": 2}
dataset['Embarked'] = dataset['Embarked'].map(embarked_mapping)
testdata['Embarked'] = testdata['Embarked'].map(embarked_mapping)
dataset.head(100)

# Fare(票價)

In [None]:
#填補票價缺失的資料 
dataset['Fare'].fillna(dataset.groupby("Pclass")["Fare"].transform("median"), inplace=True)
testdata['Fare'].fillna(testdata.groupby("Pclass")["Fare"].transform("median"), inplace=True)
dataset.head(50)

In [None]:
#票價/生還死亡分布圖
facet = sns.FacetGrid(dataset, hue="Survived",aspect=4)
facet.map(sns.kdeplot,'Fare',shade= True)
facet.set(xlim=(0, dataset['Fare'].max()))
facet.add_legend()
 
plt.show()

In [None]:
#依票價區間做mapping function
dataset.loc[dataset['Fare'] <= 17, 'Fare'] = 0
dataset.loc[(dataset['Fare'] > 17) & (dataset['Fare'] <= 30), 'Fare'] = 1
dataset.loc[(dataset['Fare'] > 30) & (dataset['Fare'] <= 100), 'Fare'] = 2
dataset.loc[dataset['Fare'] > 100, 'Fare'] = 3

testdata.loc[testdata['Fare'] <= 17, 'Fare'] = 0
testdata.loc[(testdata['Fare'] > 17) & (testdata['Fare'] <= 30), 'Fare'] = 1
testdata.loc[(testdata['Fare'] > 30) & (testdata['Fare'] <= 100), 'Fare'] = 2
testdata.loc[testdata['Fare'] > 100, 'Fare'] = 3

In [None]:
#目前的dataframe
dataset.head()

In [None]:
#觀察船艙編號資料
dataset['Cabin'].value_counts()

In [None]:
#觀察船艙編號資料 =>取出第一個字母
dataset["Cabin"] = dataset["Cabin"].str[:1]
testdata["Cabin"] = testdata["Cabin"].str[:1]
dataset["Cabin"]

In [None]:
#分析各票種船艙種類人數
Pclass1 = dataset[dataset['Pclass'] == 1]['Cabin'].value_counts()
Pclass2 = dataset[dataset['Pclass'] == 2]['Cabin'].value_counts()
Pclass3 = dataset[dataset['Pclass'] == 3]['Cabin'].value_counts()
df = pd.DataFrame([Pclass1, Pclass2, Pclass3])
df.index = ['1st class','2nd class', '3rd class']
df.plot(kind='bar',stacked=True, figsize=(10,5))

In [None]:
#船艙Mapping
cabin_mapping = {"A": 0, "B": 0.4, "C": 0.8, "D": 1.2, "E": 1.6, "F": 2, "G": 2.4, "T": 2.8}
dataset['Cabin'] = dataset['Cabin'].map(cabin_mapping)
testdata['Cabin'] = testdata['Cabin'].map(cabin_mapping)

In [None]:
#填補船艙種類缺失資料
dataset['Cabin'].fillna(dataset.groupby("Pclass")["Cabin"].transform("median"), inplace=True)
testdata['Cabin'].fillna(testdata.groupby("Pclass")["Cabin"].transform("median"), inplace=True)

In [None]:
#目前的dataframe
dataset.head()

# Sibsp & Parch

In [None]:
#合併Sibsp & Parch
dataset["FamilySize"] = dataset["SibSp"] + dataset["Parch"] + 1
testdata["FamilySize"] = testdata["SibSp"] + testdata["Parch"] + 1

In [None]:
#家族人口/生還死亡分布圖
facet = sns.FacetGrid(dataset, hue="Survived",aspect=4)
facet.map(sns.kdeplot,'FamilySize',shade= True)
facet.set(xlim=(0, dataset['FamilySize'].max()))
facet.add_legend()
plt.xlim(0)

In [None]:
#家族人口的Mapping Function
family_mapping = {1: 0, 2: 0.4, 3: 0.8, 4: 1.2, 5: 1.6, 6: 2, 7: 2.4, 8: 2.8, 8: 3.2, 10: 3.6, 11: 4}
dataset['FamilySize'] = dataset['FamilySize'].map(family_mapping)
testdata['FamilySize'] = testdata['FamilySize'].map(family_mapping)

In [None]:
#目前的dataframe
dataset.head()

# 資料清洗

In [None]:
#刪除Sibsp, Parch, Ticket
features_drop = ['Ticket', 'SibSp', 'Parch', 'PassengerId']
dataset = dataset.drop(features_drop, axis=1)
testdata = testdata.drop(features_drop, axis=1)

In [None]:
#設定預測目標變數與解釋變數
dataset_data = dataset.drop('Survived', axis=1)
dataset_target = dataset[['Survived']]
dataset_data.shape, dataset_target.shape

In [None]:
#目前的所有解釋變數
dataset_data.head()

In [None]:
#確認欄位的資料型態均為數值型別
dataset_data.info()

In [None]:
testdata.head()

In [None]:
testdata.info()

# 建置模型-DNN

In [None]:
#導入keras與sklearn
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.optimizers import Adam
from sklearn import preprocessing

In [None]:
#建立模型函式
def build_model():
    model = Sequential()
    model.add(Dense(input_dim=8, units=40))
    model.add(Activation('relu'))
    model.add(Dense(units=100))
    model.add(Activation('relu'))
    model.add(Dense(units=10))
    model.add(Activation('relu'))
    model.add(Dense(units=1))
    model.add(Activation('sigmoid'))
    model.summary()
    return model

In [None]:
#建置繪圖函式
def show_train_history(train_history,train,validation,label):
    plt.plot(train_history.history[train])
    plt.plot(train_history.history[validation])
    plt.title('Train History')
    plt.ylabel(label)
    plt.xlabel('Epoch')
    plt.legend(['train','validation'],loc='upper left')
    plt.show()

In [None]:
#feature標準化
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
scaledFeatures = minmax_scale.fit_transform(dataset_data)

In [None]:
#訓練模型
model = build_model()

In [None]:
#開始訓練模型
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
train_history = model.fit(scaledFeatures, dataset_target, validation_split=0.2, batch_size=32, epochs=300)

#顯示訓練結果
score = model.evaluate(scaledFeatures, dataset_target)
print ('\nTrain Acc:', score[1])

In [None]:
#顯示訓練結果
show_train_history(train_history, 'acc', 'val_acc', 'accuracy')
show_train_history(train_history, 'loss', 'val_loss', 'loss')

In [68]:
probability = model.predict(testdata).astype(int)
probability
sub=pd.read_csv('titanic/gender_submission.csv',sep=',')
sub['Survived']=probability
sub.to_csv('submission.csv',index=False)

In [67]:
testdata_target = pd.read_csv("titanic/gender_submission.csv")
testdata_target = pd.DataFrame(testdata_target, columns=['Survived'])
testdata, testdata_target

(     Pclass  Sex  Age  Fare  Cabin  Embarked  Title  FamilySize
 0         3    0  2.0   0.0    2.0         2      0         0.0
 1         3    1  3.0   0.0    2.0         0      2         0.4
 2         2    0  3.0   0.0    2.0         2      0         0.0
 3         3    0  2.0   0.0    2.0         0      0         0.0
 4         3    1  1.0   0.0    2.0         0      2         0.8
 ..      ...  ...  ...   ...    ...       ...    ...         ...
 413       3    0  2.0   0.0    2.0         0      0         0.0
 414       1    1  3.0   3.0    0.8         1      3         0.0
 415       3    0  3.0   0.0    2.0         0      0         0.0
 416       3    0  2.0   0.0    2.0         0      0         0.0
 417       3    0  0.0   1.0    2.0         1      3         0.8
 
 [418 rows x 8 columns],
      Survived
 0           0
 1           1
 2           0
 3           0
 4           1
 ..        ...
 413         0
 414         1
 415         0
 416         0
 417         0
 
 [418 rows 

In [66]:
score = model.evaluate(testdata, testdata_target)
print('\Test Acc:', score[1])

\Test Acc: 0.7344497442245483


In [None]:
model.save('dnnfortitanic.h5')

# SVM

In [70]:
from sklearn import svm 

In [75]:
model = svm.SVC().fit(scaledFeatures, dataset_target)

  return f(*args, **kwargs)


In [81]:
probability = model.predict(testdata)
probability
sub=pd.read_csv('titanic/gender_submission.csv',sep=',')
sub['Survived']=probability
sub.to_csv('svm_submission.csv',index=False)

# RandomForest

In [77]:
from sklearn.ensemble import RandomForestClassifier

In [78]:
model = RandomForestClassifier(n_estimators=1000)
model.fit(scaledFeatures, dataset_target)

  model.fit(scaledFeatures, dataset_target)


RandomForestClassifier(n_estimators=1000)

In [80]:
probability = model.predict(testdata)
probability
sub=pd.read_csv('titanic/gender_submission.csv',sep=',')
sub['Survived']=probability
sub.to_csv('rf_submission.csv',index=False)