# Naive Bayes
ref: https://blog.sicara.com/naive-bayes-classifier-sklearn-python-example-tips-42d100429e44  
dataset: https://www.kaggle.com/c/titanic/data


In [64]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, confusion_matrix

# 1.將資料集讀入
df=pd.read_csv('./titanic/train.csv')

df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [65]:
# 2.將sex, embarked欄位做編碼
#df['Sex'] = df['Sex'].replace({'male': 1, 'female': 0}) 
#df['Embarked'] = df['Embarked'].replace({'C': 0, 'S': 2, 'Q':1})
df['Sex']=pd.Categorical(df['Sex'],categories=['male', 'female']).codes   #分的不正確,可能有空值之類的,可以加入categories=['male', 'female']
df['Embarked']=pd.Categorical(df['Embarked']).codes   #會根據資料中的 唯一值 自動排序，將它們分配為類別。.code,會為每個類別分配一個整數值（從 0 開始）

df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,0
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,2
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,2


In [66]:
#3.移除含有缺失值之row
print(df.isna().sum())
df = df.dropna(axis=0)

df.isna().sum()


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64


PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [78]:
# 4.使用"Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"欄位來定義特徵向量X
#x=df.loc[:,["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]]
x=np.asarray(df[["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]])

# 5.使用"Survived"來定義y
#y=df['Survived']
y=np.asarray(df['Survived']).flatten()

print(x.shape)
print(y.shape)

(185, 7)
(185,)


In [79]:
#6.將資料集 8:2分成訓練資料及測試資料(假設random_state=1)
x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.2,random_state=1)

In [80]:
#7.將訓練資料做標準化
scaler=preprocessing.StandardScaler().fit(x_train)
x_train=scaler.transform(x_train)


In [81]:
#8.建立機器學習模型並訓練
model=GaussianNB().fit(x_train,y_train)




In [82]:
#9.將測試資料做標準化並用訓練好的模型做預測
x_test=scaler.transform(x_test)
y_pred=model.predict(x_test)

In [83]:
#10.驗證模型結果
num_correct_samples=accuracy_score(y_test,y_pred,normalize=False)
accuracy=accuracy_score(y_test,y_pred)
con_matrix=confusion_matrix(y_test,y_pred)

print('number of correct sample: {}'.format(num_correct_samples))
print('accuracy: {}'.format(accuracy))
print('confusion matrix: {}'.format(con_matrix))

number of correct sample: 55.0
accuracy: 0.3716216216216216
confusion matrix: [[39  8]
 [85 16]]


In [1]:
!pip freeze > requirements.txt