In [1]:
### ライブラリのインポート
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score
)

import lightgbm as lgb
import optuna

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
### 入出力データセットの名称
## 入力データ
gs_file = 'gender_submission.csv'
test_file = 'test.csv'
train_file = 'train.csv'

In [3]:
### データのインポート
df_gs = pd.read_csv('original_data/' + gs_file)
df_test = pd.read_csv('original_data/' + test_file)
df_train = pd.read_csv('original_data/' + train_file)

In [4]:
### 学習データをxとyに分割
train_y = df_train['Survived']
train_x = df_train.drop(columns=['Survived'])

In [6]:
### 学習データとテストデータの結合
all_x = pd.concat([train_x, df_test])

In [7]:
### PassengerID
## 不要なので削除する
all_x = all_x.drop(columns=['PassengerId'])

In [10]:
all_x.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1309 entries, 0 to 417
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    1309 non-null   int64  
 1   Name      1309 non-null   object 
 2   Sex       1309 non-null   object 
 3   Age       1046 non-null   float64
 4   SibSp     1309 non-null   int64  
 5   Parch     1309 non-null   int64  
 6   Ticket    1309 non-null   object 
 7   Fare      1308 non-null   float64
 8   Cabin     295 non-null    object 
 9   Embarked  1307 non-null   object 
dtypes: float64(2), int64(3), object(5)
memory usage: 112.5+ KB


In [8]:
### Survived
pass

In [11]:
### Pclass
pass

In [None]:
### Name
### Titleを抽出し、6つにマージする

## 0. Titleの一覧を取得する
# データの確認のために行う
titles = set()
for name in all_x['Name']:
    titles.add(name.split(',')[1].split('.')[0].strip())

## 1. NameとTitleの対応表を作る
Title_Dictionary = {
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Jonkheer": "Royalty",
    "Don": "Royalty",
    "Sir" : "Royalty",
    "Dr": "Officer",
    "Rev": "Officer",
    "the Countess":"Royalty",
    "Mme": "Mrs",
    "Mlle": "Miss",
    "Ms": "Mrs",
    "Mr" : "Mr",
    "Mrs" : "Mrs",
    "Miss" : "Miss",
    "Master" : "Master",
    "Lady" : "Royalty"
}

## 2. Name→Titleの変換関数を定義する
def get_titles():
    # 名前からTitleを取得する
    all_x['Title'] = all_x['Name'].map(lambda name:name.split(',')[1].split('.')[0].strip())
    # 辞書を使って変換する
    all_x['Title'] = all_x.Title.map(Title_Dictionary)
    return all_x

## 3. 実際に変換する
all_x = get_titles()


In [15]:
### Sex
pass

In [None]:
### Age

## 1. 3カラムをキーにグループ化し、各グループの年齢の中央値を求める
grouped_train = all_x.iloc[:891].groupby(['Sex','Pclass','Title'])
grouped_median_train = grouped_train.Age.median()
grouped_median_train = grouped_median_train.reset_index()[['Sex', 'Pclass', 'Title', 'Age']]
# 最大でも高々36通りなので、全部可視化してみる
# print(grouped_median_train)

## 2. 各行の3カラムの組み合わせに対応する年齢の中央値を返す関数を定義する
def fill_age(row):
    condition = (
        (grouped_median_train['Sex'] == row['Sex']) & 
        (grouped_median_train['Title'] == row['Title']) & 
        (grouped_median_train['Pclass'] == row['Pclass'])
    ) 
    return grouped_median_train[condition]['Age'].values[0]

## 3. 2を使って欠損値補完する
def process_age():
    global all_x
    # a function that fills the missing values of the Age variable
    all_x['Age'] = all_x.apply(lambda row: fill_age(row) if np.isnan(row['Age']) else row['Age'], axis=1)
    return all_x

all_x

       Sex  Pclass    Title   Age
0   female       1     Miss  30.0
1   female       1      Mrs  40.0
2   female       1  Officer  49.0
3   female       1  Royalty  40.5
4   female       2     Miss  24.0
5   female       2      Mrs  31.5
6   female       3     Miss  18.0
7   female       3      Mrs  31.0
8     male       1   Master   4.0
9     male       1       Mr  40.0
10    male       1  Officer  51.0
11    male       1  Royalty  40.0
12    male       2   Master   1.0
13    male       2       Mr  31.0
14    male       2  Officer  46.5
15    male       3   Master   4.0
16    male       3       Mr  26.0


Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,Mr
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,Miss
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,Mrs
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,Mr
...,...,...,...,...,...,...,...,...,...,...,...
413,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S,Mr
414,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C,
415,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S,Mr
416,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S,Mr
