まずはデータの確認・前処理をしていく

In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
import seaborn as sns

In [2]:
# Load the Titanic training set and print its column dtypes and non-null counts.
train_csv_path = "./input/titanic/train.csv"
train = pd.read_csv(train_csv_path)
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [3]:
# Preview the first 10 rows to see what each column looks like.
train.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


年齢に欠損があるため、穴埋めする（欠損数：１７７）

機械学習により、年齢を予測することとする
正確な年齢の予測は困難なため、年齢をラベル[young, adult, elder]（0, 1, 2）に変換して分類問題とする

In [4]:
# Show the passengers whose Age is missing; these are the rows to be filled in later.
train.loc[train["Age"].isna()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0000,,S
19,20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.2250,,C
26,27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.2250,,C
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q
...,...,...,...,...,...,...,...,...,...,...,...,...
859,860,0,3,"Razi, Mr. Raihed",male,,0,0,2629,7.2292,,C
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.5500,,S
868,869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5000,,S
878,879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,,S


年齢予測に関係がありそうな列を特徴量として学習に使用する

Pclass,SibSp,Parch,Fare

In [5]:
# Keep only the rows whose Age is present -> they are used to train and evaluate the model.
# Rows with a missing Age are left for the fitted model to predict.
# Text columns are excluded by selecting the numeric columns explicitly.
pred_age = train.loc[
    train["Age"].notna(),
    ["Pclass", "SibSp", "Parch", "Fare", "Age"],
].reset_index(drop=True)
y_pred_age = pred_age["Age"]
X_pred_age = pred_age.drop(columns=["Age"])

In [6]:
# Helper for the new "age_bracket" column.
# young: 0, adult: 1, elder: 2
def age_classification(age):
    """Map an age in years to an integer age-bracket label.

    Brackets (upper bounds are inclusive):
        0 -- young: 0 <= age <= 18
        1 -- adult: 18 < age <= 50
        2 -- elder: 50 < age <= 100

    Args:
        age: Age in years (int or float).

    Returns:
        int: 0, 1 or 2 according to the table above.

    Raises:
        ValueError: If ``age`` is NaN, negative or greater than 100. Such a
            value matches no bracket, so it is rejected explicitly instead of
            failing later on an unassigned local variable.
    """
    young = 18.0
    adult = 50.0
    elder = 100.0
    if 0 <= age <= young:
        return 0
    if young < age <= adult:
        return 1
    if adult < age <= elder:
        return 2
    # NaN fails every comparison above, so it ends up here as well.
    raise ValueError(
        "age must be between 0 and {:g}, got {!r}".format(elder, age)
    )


# Label every known age with its bracket. Series.map keeps y_pred_age's index,
# so the new column lines up row-for-row with pred_age in the concat below
# without relying on the index being exactly 0..n-1.
age_bracket1 = y_pred_age.map(age_classification).rename("age_bracket")
pred_agebracket = pd.concat([pred_age, age_bracket1], axis=1)

In [7]:
# The bracket is the target; the features are every column except the raw age and its bracket.
y_pred_agebracket = pred_agebracket["age_bracket"]
X_pred_agebracket = pred_agebracket.drop(columns=["Age", "age_bracket"])
X_pred_agebracket.head()

Unnamed: 0,Pclass,SibSp,Parch,Fare
0,3,1,0,7.25
1,1,1,0,71.2833
2,3,0,0,7.925
3,1,1,0,53.1
4,3,0,0,8.05


Fareは他の特徴量に比べて値のスケールが極端に大きいため、全特徴量を標準化する

In [8]:
from sklearn.preprocessing import StandardScaler

# Learn each feature column's scaling statistics, then apply them to the same frame.
# NOTE(review): the statistics come from every row, so cross-validation run
# directly on this array is not scored on fully unseen data.
sc = StandardScaler().fit(X_pred_agebracket)
std_X_pred_agebracket = sc.transform(X_pred_agebracket)
std_X_pred_agebracket

array([[ 0.91123237,  0.52457013, -0.50589515, -0.51897787],
       [-1.47636364,  0.52457013, -0.50589515,  0.69189675],
       [ 0.91123237, -0.55170307, -0.50589515, -0.50621356],
       ...,
       [-1.47636364, -0.55170307, -0.50589515, -0.08877362],
       [-1.47636364, -0.55170307, -0.50589515, -0.08877362],
       [ 0.91123237, -0.55170307, -0.50589515, -0.50952283]])

In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC, LinearSVC

# Candidate classifiers for the age-bracket task, keyed by the label printed with each score.
# random_state pins LinearSVC's internal random number generator so reruns print the same score.
models = {
    "線形SVC": LinearSVC(random_state=0),
    "SVC": SVC(),
    "K近傍法": KNeighborsClassifier(),
}
scoring = "accuracy"

for model_name, model in models.items():
    # Scale inside the pipeline so the scaler is re-fitted on the training part
    # of every CV split; fitting it once on all rows beforehand would leak the
    # validation folds' statistics into preprocessing.
    scaled_model = make_pipeline(StandardScaler(), model)
    scores = cross_val_score(
        scaled_model,
        X_pred_agebracket,
        y_pred_agebracket,
        scoring=scoring,
        cv=4,
        n_jobs=-1,
    )
    print(model_name)
    print("正解率平均スコア:{:.3f}".format(np.mean(scores)))



線形SVC
正解率平均スコア:0.742
SVC
正解率平均スコア:0.758
K近傍法
正解率平均スコア:0.738


SVCの正解率平均スコアが最も高かったため、SVCのハイパーパラメータをチューニングする

In [11]:
from sklearn.model_selection import GridSearchCV

# Candidate values shared by C and gamma: 50 points (np.logspace's default count)
# spaced evenly on a log scale from 1e-4 to 1e4.
log_grid = np.logspace(-4, 4, num=50, base=10)
param = {"C": log_grid, "gamma": log_grid}

# Search every (C, gamma) pair with 4-fold cross-validation, scored by accuracy.
model = SVC()
gscv = GridSearchCV(estimator=model, param_grid=param, scoring="accuracy", cv=4, n_jobs=-1)
gscv.fit(std_X_pred_agebracket, y_pred_agebracket)
gscv.best_score_

0.7618950473918775

In [13]:
# Show the C / gamma combination selected by the grid search fitted above.
gscv.best_params_

{'C': 0.3906939937054613, 'gamma': 0.2682695795279725}