# 機械学習 第1回　レポート課題



## はじめに

本レポートでは, 以下のデータセットと手法を用いて機械学習を行う.

- データセット:
    https://www.kaggle.com/c/titanic/data ここからDLしてきたtrain.csvをデータセットtest.csvを教師データとする.
- 特徴:
    元のデータの特徴は以下のようになっている.
    <table>
    <tbody>
    <tr><th><b>Variable</b></th><th><b>Definition</b></th><th><b>Key</b></th></tr>
    <tr>
    <td>survival</td>
    <td>Survival</td>
    <td>0 = No, 1 = Yes</td>
    </tr>
    <tr>
    <td>pclass</td>
    <td>Ticket class</td>
    <td>1 = 1st, 2 = 2nd, 3 = 3rd</td>
    </tr>
    <tr>
    <td>sex</td>
    <td>Sex</td>
    <td></td>
    </tr>
    <tr>
    <td>Age</td>
    <td>Age in years</td>
    <td></td>
    </tr>
    <tr>
    <td>sibsp</td>
    <td># of siblings / spouses aboard the Titanic</td>
    <td></td>
    </tr>
    <tr>
    <td>parch</td>
    <td># of parents / children aboard the Titanic</td>
    <td></td>
    </tr>
    <tr>
    <td>ticket</td>
    <td>Ticket number</td>
    <td></td>
    </tr>
    <tr>
    <td>fare</td>
    <td>Passenger fare</td>
    <td></td>
    </tr>
    <tr>
    <td>cabin</td>
    <td>Cabin number</td>
    <td></td>
    </tr>
    <tr>
    <td>embarked</td>
    <td>Port of Embarkation</td>
    <td>C = Cherbourg, Q = Queenstown, S = Southampton</td>
    </tr>
    <tr>
    <td>PassengerId</td>
    <td>passenger id</td>
    </tr>
    <tr>
    <td>Name</td>
    <td>Passenger name</td>
    </tr>
    </tbody>
    </table>
    
    
    

In [1]:
# このセルではデータの前処理を行う
import pandas as pd 
import numpy as np
"""
それぞれのデータのsexの値を(male=0, female=1)のように置き換える.
"""
train= pd.read_csv("train.csv").replace("male",0).replace("female",1)
test= pd.read_csv("test.csv").replace("male",0).replace("female",1)

"""
今回使わないデータを削除.
"""
del(train["Ticket"])
del(train["Cabin"])
del(train["Embarked"])
del(test["Ticket"])
del(test["Cabin"])
del(test["Embarked"])

"""
家族の人数(FamillySize)と独身(isAlone)であるかの特徴を追加.
"""
train["FamilySize"] = train["SibSp"] + train["Parch"] + 1
for t in [train]:
    t['IsAlone'] = 0
    t.loc[train['FamilySize'] == 1, 'IsAlone'] = 1
    
test["FamilySize"] = test["SibSp"] + test["Parch"] + 1
for t in [test]:
    t['IsAlone'] = 0
    t.loc[test['FamilySize'] == 1, 'IsAlone'] = 1

"""
欠損値をそれぞれの方法で埋める.
"""
# 名前の敬称を使って, 新たに特徴を加え名前の特徴を消す.
for t in [train]: 
        t['HonorificTitle'] = train.Name.str.extract(' ([A-Za-z]+).', expand=False) 
for t in [train]: 
        t['HonorificTitle'] = train['HonorificTitle'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
        t['HonorificTitle'] = train['HonorificTitle'].replace('Mlle', 'Miss')
        t['HonorificTitle'] = train['HonorificTitle'].replace('Ms', 'Miss')
        t['HonorificTitle'] = train['HonorificTitle'].replace('Mme', 'Mrs')
        del train['Name']
Salutation_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5} 
for t in [train]: 
        t['HonorificTitle'] = train['HonorificTitle'].map(Salutation_mapping) 
        t['HonorificTitle'] = train['HonorificTitle'].fillna(0)

# 敬称からその敬称の年齢の平均を求めて, それを利用して欠損値を埋める.
for i in range(0,6):
    print(train['HonorificTitle'])
#     print(train['HonorificTitle'],end="")
#     print(": ", end="")
#     print(train[train['HonorificTitle'] == i]['Age'].dropna().mean())
    train['Age'].fillna(train[train['HonorificTitle'] == i]['Age'].dropna().mean(), inplace=True)
        
for t in [test]: 
        t['HonorificTitle'] = test.Name.str.extract(' ([A-Za-z]+).', expand=False) 
for t in [test]: 
        t['HonorificTitle'] = test['HonorificTitle'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
        t['HonorificTitle'] = test['HonorificTitle'].replace('Mlle', 'Miss')
        t['HonorificTitle'] = test['HonorificTitle'].replace('Ms', 'Miss')
        t['HonorificTitle'] = test['HonorificTitle'].replace('Mme', 'Mrs')
        del test['Name']
Salutation_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5} 
for t in [test]: 
        t['HonorificTitle'] = test['HonorificTitle'].map(Salutation_mapping) 
        t['HonorificTitle'] = test['HonorificTitle'].fillna(0)

for i in range(0,6):
    test['Age'].fillna(test[test['HonorificTitle'] == i]['Age'].mean(), inplace=True)
        
test["Fare"].fillna(train.Fare.mean(), inplace=True) # 1つだけ欠損しているため.

# 情報の確認用.
# print(train.head(10))
# train.info()
# test.info()

train_data = train.values
X_train = train_data[:, 2:] # Pclass以降の変数
Y_train  = train_data[:, 1]  # 正解データ

test_data = test.values
X_test = test_data[:, 1:] # Pclass以降の変数

0      1.0
1      3.0
2      2.0
3      3.0
4      1.0
      ... 
886    5.0
887    2.0
888    2.0
889    1.0
890    1.0
Name: HonorificTitle, Length: 891, dtype: float64
0      1.0
1      3.0
2      2.0
3      3.0
4      1.0
      ... 
886    5.0
887    2.0
888    2.0
889    1.0
890    1.0
Name: HonorificTitle, Length: 891, dtype: float64
0      1.0
1      3.0
2      2.0
3      3.0
4      1.0
      ... 
886    5.0
887    2.0
888    2.0
889    1.0
890    1.0
Name: HonorificTitle, Length: 891, dtype: float64
0      1.0
1      3.0
2      2.0
3      3.0
4      1.0
      ... 
886    5.0
887    2.0
888    2.0
889    1.0
890    1.0
Name: HonorificTitle, Length: 891, dtype: float64
0      1.0
1      3.0
2      2.0
3      3.0
4      1.0
      ... 
886    5.0
887    2.0
888    2.0
889    1.0
890    1.0
Name: HonorificTitle, Length: 891, dtype: float64
0      1.0
1      3.0
2      2.0
3      3.0
4      1.0
      ... 
886    5.0
887    2.0
888    2.0
889    1.0
890    1.0
Name: HonorificTitle, Le

In [2]:
from sklearn.linear_model import LinearRegression # 線形回帰

lr = LinearRegression()
lr.fit(X_train,Y_train)
Y_pred = lr.predict(X_test)

In [3]:
import csv
with open("predict_result_data.csv", "w") as f:
    writer = csv.writer(f, lineterminator='\n')
    writer.writerow(["PassengerId", "Survived"])
    for pid, survived in zip(test_data[:,0].astype(int), Y_pred.astype(int)):
        writer.writerow([pid, survived])