In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
import warnings
warnings.simplefilter('ignore', FutureWarning)

In [3]:
# データ保存ディレクトリへ移動
ls = os.getcwd().split('\\')
ls[-1] = 'data'
lsr=""
for i in ls:
    lsr += i+'\\'

# ディレクトリ移動
os.chdir(lsr)

In [4]:
train = pd.read_csv('./datasets_nb001/nb001_train.csv', index_col=0)
test = pd.read_csv('./datasets_nb001/nb001_test.csv', index_col=0)

In [5]:
print(train.shape)
print(test.shape)

(891, 20)
(418, 13)


In [6]:
# 提出データ用に受け皿を作成
PassengerID_df = test.PassengerId

# 前処理 (Pandas の get_dummies() で処理)

In [7]:
# Sex と Embarked の One-Hot Encoding
train = pd.get_dummies(train, columns=['Sex','Embarked'])
test = pd.get_dummies(test, columns=['Sex','Embarked'])

# 不要な列の削除
train.drop(['PassengerId', 'Name', 'Ticket', 'Cabin', 'CategoricalAge', 'SibSp_0_1_2over',
            'Parch_0_1_2_3over', 'FamilySize', 'IsAlone', 'CategoricalFare',
            'Title', 'Title_num'],
            axis=1, inplace=True)

test.drop(['PassengerId', 'Name', 'Ticket', 'Cabin', 'Title', 'Title_num'],
            axis=1, inplace=True)

In [8]:
train.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,3,22.0,1,0,7.25,False,True,False,False,True
1,1,1,38.0,1,0,71.2833,True,False,True,False,False
2,1,3,26.0,0,0,7.925,True,False,False,False,True
3,1,1,35.0,1,0,53.1,True,False,False,False,True
4,0,3,35.0,0,0,8.05,False,True,False,False,True


In [9]:
test.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,3,34.5,0,0,7.8292,False,True,False,True,False
1,3,47.0,1,0,7.0,True,False,False,False,True
2,2,62.0,0,0,9.6875,False,True,False,True,False
3,3,27.0,0,0,8.6625,False,True,False,False,True
4,3,22.0,1,1,12.2875,True,False,False,False,True


# ホールドアウト法(データセットの分割)での学習・推論

In [10]:
X_train = train.drop(['Survived'], axis=1)
y_train = train['Survived']

In [11]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# X_train と y_train を train と valid に分割
train_x, valid_x, train_y, valid_y = train_test_split(X_train, y_train,
                                                      test_size=0.33, random_state=0)

In [12]:
lgb_train = lgb.Dataset(train_x, train_y)
lgb_eval = lgb.Dataset(valid_x, valid_y)

In [13]:
# パラメータを定義
lgbm_params = {'objective': 'binary'}

In [14]:
# 学習
evals_result = {}
gbm = lgb.train(params=lgbm_params,
                train_set=lgb_train,
                valid_sets=[lgb_train,lgb_eval],
                callbacks=[lgb.early_stopping(stopping_rounds=20,
                                              verbose=True)],
                );

[LightGBM] [Info] Number of positive: 231, number of negative: 365
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000739 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 189
[LightGBM] [Info] Number of data points in the train set: 596, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.387584 -> initscore=-0.457480
[LightGBM] [Info] Start training from score -0.457480
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[29]	training's binary_logloss: 0.301373	valid_1's binary_logloss: 0.400278


In [15]:
# valid_x について推論
oof = (gbm.predict(valid_x) > 0.5).astype(int)
print('score', round(accuracy_score(valid_y,oof)*100,2))

score 82.37


In [16]:
# test データに対しても予測を行う
test_pred = (gbm.predict(test)>0.5).astype(int)

In [17]:
# 提出用データを作成
sample_submission = pd.DataFrame()
sample_submission['Survived'] = test_pred
sample_submission.to_csv(r'./submit/nb002_test_pred_lightgbd_holdout_method.csv', encoding='utf-8')

In [18]:
sample_submission['PassengerID'] = PassengerID_df

In [19]:
sample_submission

Unnamed: 0,Survived,PassengerID
0,0,892
1,0,893
2,0,894
3,0,895
4,1,896
...,...,...
413,0,1305
414,1,1306
415,0,1307
416,0,1308
