In [1]:
# Titanic: survival prediction using LightGBM

import os
import pandas as pd
import numpy as np

# Relative directory that holds the Titanic CSV files.
TITANIC_PATH = os.path.join("datasets", "titanic")


def load_titanic_data(filename, titanic_path=TITANIC_PATH):
    """Read one Titanic CSV file into a pandas DataFrame.

    Args:
        filename: Name of the CSV file inside ``titanic_path``.
        titanic_path: Directory containing the file; defaults to
            ``TITANIC_PATH``.

    Returns:
        The DataFrame produced by ``pd.read_csv`` for the joined path.
    """
    return pd.read_csv(os.path.join(titanic_path, filename))

In [2]:
# Load the training and test CSVs from the default dataset directory.
train_data, test_data = (
    load_titanic_data(csv_name) for csv_name in ("train.csv", "test.csv")
)

In [3]:
# Preview the first rows of the training set.
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Summary statistics (count, mean, std, quartiles) of the training set's numeric columns.
train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [5]:
# Column dtypes and non-null counts of the training set.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [6]:
# Number of missing values in each column of the training set.
train_data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [7]:
# Number of missing values in each column of the test set.
test_data.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [8]:
# One-hot encode the categorical columns.
categorical_cols = ['Sex', 'Embarked']

train_data = pd.get_dummies(train_data, columns=categorical_cols)
test_data = pd.get_dummies(test_data, columns=categorical_cols)

# The two frames are encoded independently, so a category that occurs in only
# one of them would leave train and test with different dummy columns and the
# model would later receive mismatched features. Align the test columns to the
# training feature columns (everything except the target): dummies missing
# from the test frame are added as all-zero columns, and dummies never seen in
# training are dropped.
feature_cols = [col for col in train_data.columns if col != 'Survived']
test_data = test_data.reindex(columns=feature_cols, fill_value=0)

train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,0,1,0,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,1,0,1,0,0
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,,1,0,0,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,1,0,0,0,1
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,0,1,0,0,1


In [9]:
# Remove PassengerId, Name, Ticket and Cabin from both frames so they are
# not passed to the model as features.
unused_cols = ['PassengerId', 'Name', 'Ticket', 'Cabin']
train_data = train_data.drop(columns=unused_cols)
test_data = test_data.drop(columns=unused_cols)

In [10]:
# Separate the target from the features: pop() removes 'Survived' from
# train_data and returns it as the label Series.
train_label = train_data.pop('Survived')
test_X = test_data.copy()

In [11]:
from sklearn.model_selection import train_test_split

# Hold-out validation: shuffle the training data and keep 20% of it as a
# validation set (fixed random_state so the split is reproducible).
holdout_parts = train_test_split(
    train_data,
    train_label,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)
train_X, valid_X, train_y, valid_y = holdout_parts

In [12]:
import lightgbm as lgb
from sklearn.metrics import log_loss

# Build the LightGBM datasets for the training and validation splits.
lgb_train = lgb.Dataset(train_X, train_y)
lgb_eval = lgb.Dataset(valid_X, valid_y)

# Hyperparameters
params = {'objective': 'binary'}
num_round = 100   # maximum number of boosting iterations
stop_rounds = 20  # stop when the validation score has not improved for this many rounds
log_period = 10   # print the evaluation metrics every `log_period` iterations

# The `verbose_eval` / `early_stopping_rounds` keyword arguments of lgb.train
# were deprecated in LightGBM 3.3 and removed in 4.0, where passing them
# raises a TypeError. Use the callback API when this LightGBM version
# provides it and fall back to the legacy keywords on older releases.
train_kwargs = {
    'num_boost_round': num_round,
    'valid_sets': [lgb_train, lgb_eval],
}
if hasattr(lgb, 'log_evaluation'):
    train_kwargs['callbacks'] = [
        lgb.early_stopping(stopping_rounds=stop_rounds),
        lgb.log_evaluation(period=log_period),
    ]
else:
    train_kwargs['verbose_eval'] = log_period
    train_kwargs['early_stopping_rounds'] = stop_rounds

gbm = lgb.train(params, lgb_train, **train_kwargs)

# Log loss on the validation split, predicted with the best iteration found
# by early stopping (stated explicitly, as in the test-set prediction).
# NOTE(review): this split also drove early stopping, so the score is
# likely optimistic.
valid_pred = gbm.predict(valid_X, num_iteration=gbm.best_iteration)
score = log_loss(valid_y, valid_pred)
print(score)

Training until validation scores don't improve for 20 rounds
[10]	training's binary_logloss: 0.420847	valid_1's binary_logloss: 0.469703
[20]	training's binary_logloss: 0.346287	valid_1's binary_logloss: 0.435809
[30]	training's binary_logloss: 0.305037	valid_1's binary_logloss: 0.432699
[40]	training's binary_logloss: 0.271464	valid_1's binary_logloss: 0.430737
[50]	training's binary_logloss: 0.244926	valid_1's binary_logloss: 0.433077
Early stopping, best iteration is:
[36]	training's binary_logloss: 0.284258	valid_1's binary_logloss: 0.426887
0.4268873513425494


In [13]:
from sklearn.metrics import accuracy_score

# Accuracy on the validation split: predicted probabilities above 0.5 are
# counted as class 1, everything else as class 0.
valid_pred_labels = (valid_pred > 0.5).astype(int)
accuracy = accuracy_score(valid_y, valid_pred_labels)
print(accuracy)

0.8268156424581006


In [14]:
# Predict on the test set using the best iteration found during training.
# (No labels are available here, so this is prediction, not evaluation.)
best_iter = gbm.best_iteration
gbm_predictions = gbm.predict(test_X, num_iteration=best_iter)

In [15]:
# Write the predictions to the datasets directory.
# test.csv is reloaded because PassengerId was dropped from test_data earlier.
survived_labels = (gbm_predictions > 0.5).astype(int)
result_out = load_titanic_data("test.csv").assign(Survived=survived_labels)

submission_path = os.path.join(TITANIC_PATH, "submission.csv")
submission_cols = ["PassengerId", "Survived"]
result_out.loc[:, submission_cols].to_csv(submission_path, index=False)

     PassengerId  Pclass                                               Name  \
0            892       3                                   Kelly, Mr. James   
1            893       3                   Wilkes, Mrs. James (Ellen Needs)   
2            894       2                          Myles, Mr. Thomas Francis   
3            895       3                                   Wirz, Mr. Albert   
4            896       3       Hirvonen, Mrs. Alexander (Helga E Lindqvist)   
5            897       3                         Svensson, Mr. Johan Cervin   
6            898       3                               Connolly, Miss. Kate   
7            899       2                       Caldwell, Mr. Albert Francis   
8            900       3          Abrahim, Mrs. Joseph (Sophie Halaut Easu)   
9            901       3                            Davies, Mr. John Samuel   
10           902       3                                   Ilieff, Mr. Ylio   
11           903       1                         Jon