In [1]:
import numpy as np
import pandas as pd

In [2]:
train = pd.read_csv("../input/titanic/train.csv")
test = pd.read_csv("../input/titanic/test.csv")
gender_submission = pd.read_csv("../input/titanic/gender_submission.csv")

In [3]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [5]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [6]:
gender_submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [7]:
data = pd.concat([train, test], sort=False)

In [8]:
data.isnull().sum()

PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
dtype: int64

In [9]:
data["Sex"].replace(["male", "female"], [0, 1], inplace=True)

In [10]:
data["Embarked"].replace(["S", "C", "Q"], [0, 1, 2], inplace=True)
data["Embarked"].fillna(data.Embarked.mean(), inplace=True)

In [11]:
data["Fare"].fillna(np.mean(data["Fare"]), inplace=True)

In [12]:
age_avg = data["Age"].mean()
age_std = data["Age"].std()
data["Age"].fillna(np.random.randint(age_avg - age_std, age_avg + age_std), inplace=True)

In [13]:
delete_columns = ["Name", "PassengerId",  "Ticket", "Cabin"]
data.drop(delete_columns, axis=1, inplace=True)

In [14]:
train = data[:len(train)]
test = data[:len(test)]

In [15]:
Y_train = train["Survived"]
X_train = train.drop("Survived", axis=1)
X_test = test.drop("Survived", axis=1)

In [16]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [17]:
train_x, valid_x, train_y, valid_y = train_test_split(X_train, Y_train, test_size=0.33, random_state=0)

In [18]:
gbm = lgb.LGBMClassifier(objective='binary')

In [19]:
gbm.fit(train_x, train_y, eval_set = [(valid_x, valid_y)], early_stopping_rounds=20, verbose=10)

Training until validation scores don't improve for 20 rounds
[10]	valid_0's binary_logloss: 0.459141
[20]	valid_0's binary_logloss: 0.413783
[30]	valid_0's binary_logloss: 0.401802
[40]	valid_0's binary_logloss: 0.407321
Early stopping, best iteration is:
[28]	valid_0's binary_logloss: 0.400753


LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective='binary',
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [20]:
oof = gbm.predict(valid_x, num_iteration=gbm.best_iteration_)
print('score', round(accuracy_score(valid_y, oof)*100,2), '%')

score 82.03 %


In [21]:
test_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)  # testの予測
sub = pd.read_csv("../input/titanic/gender_submission.csv")
sub["Survived"] = list(map(int, test_pred))
sub.to_csv("../output/submission.csv", index=False)

In [22]:
from sklearn.model_selection import KFold

In [23]:
kf = KFold(n_splits=3)  # 3分割交差検証のためにインスタンス化

# スコアとモデルを格納するリスト
score_list = []
models = []

for fold_, (train_index, valid_index) in enumerate(kf.split(X_train, Y_train)):
    train_x = X_train.iloc[train_index]
    valid_x = X_train.iloc[valid_index]
    train_y = Y_train[train_index]
    valid_y = Y_train[valid_index]
    
    print(f'fold{fold_ + 1} start')

    gbm = lgb.LGBMClassifier(objective='binary')
    gbm.fit(train_x, train_y, eval_set = [(valid_x, valid_y)],
                early_stopping_rounds=20,
                verbose= -1) # 学習の状況を表示しない
    
    oof = gbm.predict(valid_x, num_iteration=gbm.best_iteration_)
    score_list.append(round(accuracy_score(valid_y, oof)*100,2))
    models.append(gbm)  # 学習が終わったモデルをリストに入れておく
    print(f'fold{fold_ + 1} end\n' )
print(score_list, '平均score', np.mean(score_list), "%")

fold1 start
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[25]	valid_0's binary_logloss: 0.471555
fold1 end

fold2 start
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[23]	valid_0's binary_logloss: 0.435255
fold2 end

fold3 start
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[33]	valid_0's binary_logloss: 0.391991
fold3 end

[77.44, 85.19, 83.16] 平均score 81.92999999999999 %


In [25]:
test_pred = np.zeros((len(test), 3))  # 行:len(test), 列:3のall zeroの配列を用意

for fold_, gbm in enumerate(models):  # 学習ずみのmodelをgbmに入れる
    pred_ = gbm.predict(X_test, num_iteration=gbm.best_iteration_)  # testの予測
    test_pred[:, fold_] = pred_  # １回目は0列目、2回目は1列目、2回目は3列目に格納

In [26]:
pred = (np.mean(test_pred, axis=1) > 0.5).astype(int)  # 平均をとって、0と１に変換
sub['Survived'] = pred
sub.to_csv('../output/3-fold_cross-validation.csv', index = False)