In [12]:
import os

import matplotlib
import numpy as np
import pandas as pd

# Load the training data.
# The CSV location can be overridden with the TITANIC_TRAIN_CSV environment
# variable so the notebook also runs on machines without the original external
# volume; when the variable is unset the original path is used unchanged.
url = os.environ.get("TITANIC_TRAIN_CSV", "/Volumes/Transcend/train.csv")
Data = pd.read_csv(url)

# Column dtypes and overall (rows, columns) shape, separated by marker lines.
print(Data.dtypes)
print("---")
print(Data.shape)
print("---")

Data.head()  # the first rows already show NaN in the Cabin column

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object
---
(891, 12)
---


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [11]:
# Descriptive statistics for the numeric columns.
numeric_summary = Data.describe()
numeric_summary  # Age has NaN: its count is lower than the other columns

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [9]:
# Check whether any variable contains missing values (NaN count per column).
missing_per_column = Data.isnull().sum()
print(missing_per_column)

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [14]:
# Impute missing ages.
# Missing Age values are replaced by the median of the observed ages.
age_median = np.nanmedian(Data["Age"])  # nanmedian skips the NaN entries

# Kept as a plain NumPy array: the feature-matrix cell uses `new_Age` directly.
new_Age = Data["Age"].fillna(age_median).values
Data["Age"] = new_Age

# Re-display the summary; the Age count should now match the other columns.
Data.describe()


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.361582,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,13.019697,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,22.0,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,35.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [17]:
# Impute missing embarkation ports.
# Missing Embarked values are filled with the most frequent port. The fill
# value is taken from the counts themselves instead of being copied by hand
# from the printed output, so it stays correct if the data changes.

embarked_counts = Data.Embarked.value_counts()  # NaN is excluded from the counts
print(embarked_counts)  # "S" is the most frequent port in this data set

most_common_port = embarked_counts.idxmax()
new_Embarked = np.where(Data.Embarked.isnull(), most_common_port, Data.Embarked)

S    644
C    168
Q     77
Name: Embarked, dtype: int64


In [19]:
# Label encoding: turn the string categories into integer codes.
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

# The same encoder object is fitted twice, so after this cell it only holds
# the classes learned from the Embarked values.
encoded_Sex = label_encoder.fit(Data["Sex"]).transform(Data["Sex"])
encoded_Embarked = label_encoder.fit(new_Embarked).transform(new_Embarked)

In [21]:
# Assemble the feature matrix column by column with explicit names.
# Stacking the inputs as rows and transposing labels every plain array
# "Unnamed N" and forces a single dtype; a column mapping keeps meaningful
# labels and each column's own dtype.
feature_columns = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
titanic_X = pd.DataFrame(
    {
        "Pclass": Data.Pclass,
        "Sex": encoded_Sex,            # integer codes from the label encoder
        "Age": new_Age,                # ages with missing values imputed
        "SibSp": Data.SibSp,
        "Parch": Data.Parch,
        "Fare": Data.Fare,
        "Embarked": encoded_Embarked,  # integer codes from the label encoder
    },
    columns=feature_columns,  # fix the column order explicitly
)

# Target: the Survived column.
titanic_y = Data.Survived
print(titanic_X.shape)
print(titanic_y.shape)

(891, 7)
(891,)


In [24]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; train_test_split now lives in sklearn.model_selection. The fallback
# keeps the cell working on very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn older than 0.18
    from sklearn.cross_validation import train_test_split

# Split into training and test data (70% / 30%).
# A fixed random_state makes the split, and therefore the reported accuracy,
# reproducible across runs; 87 matches the seed used for the classifier.
train_X, test_X, train_y, test_y = train_test_split(
    titanic_X, titanic_y, test_size=0.3, random_state=87
)

In [25]:
from sklearn import metrics, tree

# Build the model: a decision tree with a fixed seed, fitted on the training split.
# (`fit` returns the estimator itself, so construction and fitting can be chained.)
decison_clf = tree.DecisionTreeClassifier(random_state=87).fit(train_X, train_y)

# Predict on the held-out split.
test_y_predicted = decison_clf.predict(test_X)

# Accuracy: fraction of held-out rows whose prediction matches the label.
accuracy = metrics.accuracy_score(y_true=test_y, y_pred=test_y_predicted)
print(accuracy)

0.768656716418


In [28]:
url = "https://storage.googleapis.com/py_ds_basic/kaggle_titanic_test.csv"
to_submit = pd.read_csv(url)

# Note: one observation in the submission data has a missing Fare.

# Apply the same preparation as for the training data. Everything that is
# "learned" (category codes, fill values) comes from the training set, so the
# model sees submission rows encoded and imputed exactly like the rows it was
# fitted on, instead of re-fitting the preprocessing on the submission data.

# Category codes: fit fresh encoders on the training values, then only
# transform the submission values. transform() raises on a category that was
# never seen in training rather than silently assigning it a new code.
encoded_Sex_to_submit = LabelEncoder().fit(Data.Sex).transform(to_submit.Sex)

# Fill a missing port with the most frequent training port before encoding,
# mirroring the training-side imputation (a no-op when nothing is missing).
train_top_port = Data.Embarked.value_counts().idxmax()
embarked_to_submit = np.where(
    to_submit.Embarked.isnull(), train_top_port, to_submit.Embarked
)
encoded_Embarked_to_submit = LabelEncoder().fit(new_Embarked).transform(
    embarked_to_submit
)

# Fill values: medians of the training columns (nanmedian ignores NaN).
age_median = np.nanmedian(Data.Age)
imputed_Age = np.where(to_submit.Age.isnull(), age_median, to_submit.Age)
fare_median = np.nanmedian(Data.Fare)
imputed_Fare = np.where(to_submit.Fare.isnull(), fare_median, to_submit.Fare)

# Same column order and the same column labels as the training matrix; newer
# scikit-learn versions reject prediction input whose feature names differ
# from the ones seen during fit.
to_submit_X = pd.DataFrame(
    np.column_stack([
        to_submit.Pclass,
        encoded_Sex_to_submit,
        imputed_Age,
        to_submit.SibSp,
        to_submit.Parch,
        imputed_Fare,
        encoded_Embarked_to_submit,
    ]),
    columns=titanic_X.columns,
)

# Predict
to_submit_y = decison_clf.predict(to_submit_X)
print(to_submit_y[0:5])

# Prepare the file to upload
to_submit_dict = {
    "PassengerId": to_submit["PassengerId"],
    "Survived": to_submit_y
}
to_submit_df = pd.DataFrame(to_submit_dict, columns=["PassengerId", "Survived"])

# Write out as csv
to_submit_df.to_csv("to_submit.csv", index = False)

[0 0 1 0 1]


![submission](https://github.com/pcpo8992/python_hw_01/blob/master/result.png?raw=true)