# 라이브러리(필요한 도구) 불러오기

In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score


## 데이터 로딩

In [2]:
# Load the raw train and test tables; both names are kept unmodified so
# later cells can preprocess copies of them.
train_raw, test_raw = map(
    pd.read_csv,
    ('titanic/train.csv', 'titanic/test.csv'),
)

In [34]:
# Display the raw Name column to inspect its format.
train_raw['Name']

0                                Braund, Mr. Owen Harris
1      Cumings, Mrs. John Bradley (Florence Briggs Th...
2                                 Heikkinen, Miss. Laina
3           Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                               Allen, Mr. William Henry
                             ...                        
886                                Montvila, Rev. Juozas
887                         Graham, Miss. Margaret Edith
888             Johnston, Miss. Catherine Helen "Carrie"
889                                Behr, Mr. Karl Howell
890                                  Dooley, Mr. Patrick
Name: Name, Length: 891, dtype: object

In [65]:
from functools import partial
from sklearn.preprocessing import LabelEncoder

# Columns that are label-encoded during preprocessing and passed to
# LightGBM as categorical features.
cat_features = [
    'Sex',
    'Embarked',
    'Ticket_0',
    'Cabin_0',
]
# Maps column name -> encoder fitted in the train phase, reused in the test phase.
le_dict = {}


def preprocessing(df, phase = 'train'):
    """Engineer features on a raw Titanic frame and label-encode the categoricals.

    The frame is modified in place and also returned:

    * ``Ticket`` is split into a text prefix (``Ticket_0``, with '.' and '/'
      removed) and an integer number (``Ticket_1``).
    * Missing ``Age`` is filled with 28 and missing ``Embarked`` with 'S'.
    * ``Cabin`` is reduced to the deck letter (``Cabin_0``) and room number
      (``Cabin_1``) of its first space-separated entry; missing cabins
      become ('nan', 0).
    * ``Name``, ``Ticket`` and ``Cabin`` are dropped.
    * Every column in the module-level ``cat_features`` is label-encoded.

    Args:
        df: Raw DataFrame with at least the Ticket, Age, Embarked, Cabin,
            Name and ``cat_features`` source columns.
        phase: 'train' fits a new LabelEncoder per categorical column and
            stores it in the module-level ``le_dict``; any other value
            reuses the encoders stored there (raises KeyError if the train
            phase has not been run first).

    Returns:
        The same ``df`` object, transformed.
    """
    def split_ticket(raw):
        """Return (cleaned prefix, number token) for one ticket string."""
        tokens = raw.split(' ')
        # The last token is the ticket number; everything before it is the
        # prefix. Joining all leading tokens also covers tickets with more
        # than three tokens, which previously raised UnboundLocalError.
        prefix = ''.join(tokens[:-1])
        return prefix.replace('.', '').replace('/', ''), tokens[-1]

    def split_cabin(raw):
        """Return (deck letter, room number) for the first listed cabin."""
        first = raw.split(' ')[0]
        if first == 'nan':
            return 'nan', 0
        # A bare deck letter (e.g. 'T') has no room number.
        number = int(first[1:]) if len(first) > 1 else 0
        return first[0], number

    # --- Ticket ---
    ticket_parts = [split_ticket(raw) for raw in df['Ticket']]
    df['Ticket_0'] = [prefix for prefix, _ in ticket_parts]
    df['Ticket_1'] = [number for _, number in ticket_parts]
    # 'LINE' tickets have no number; map them to 0 so the column casts to int.
    df.loc[df['Ticket_1'] == 'LINE', 'Ticket_1'] = '0'
    df['Ticket_1'] = df['Ticket_1'].astype(int)

    # --- Fill missing values with fixed representative values ---
    df['Age'] = df['Age'].fillna(28)
    df['Embarked'] = df['Embarked'].fillna('S')

    # --- Cabin ---
    df['Cabin'] = df['Cabin'].fillna('nan')
    cabin_parts = [split_cabin(raw) for raw in df['Cabin']]
    df['Cabin_0'] = [deck for deck, _ in cabin_parts]
    df['Cabin_1'] = [number for _, number in cabin_parts]

    # --- Drop the raw text columns that have been replaced by features ---
    df.drop(columns = ['Name', 'Ticket', 'Cabin'], inplace = True)

    # --- Label encoding ---
    for col in cat_features:
        if phase == 'train':
            encoder = LabelEncoder()
            encoder.fit(np.unique(df[col]))
            le_dict[col] = encoder
            df[col] = encoder.transform(df[col])
        else:
            encoder = le_dict[col]
            # LabelEncoder codes are positions in classes_, so a dict gives
            # the same codes with O(1) lookups and tolerates unseen values.
            code_of = {label: code for code, label in enumerate(encoder.classes_)}
            # Values unseen in training fall back to the '' class when the
            # encoder has one (as before); otherwise to -1, which LightGBM
            # treats as missing for categorical features, instead of
            # raising ValueError inside transform().
            unknown_code = code_of.get('', -1)
            df[col] = [code_of.get(value, unknown_code) for value in df[col]]
    return df

# Preprocess copies so the raw frames stay untouched. The train pass must run
# first: it fits the encoders that the test pass reuses.
train = preprocessing(train_raw.copy())
test = preprocessing(test_raw.copy(), phase = 'test')
# Name the column by keyword: passing `axis` positionally to DataFrame.drop
# was removed in pandas 2.0 and raises TypeError there.
train_x = train.drop(columns='Survived')
train_y = train['Survived']

In [63]:
# Separate features from the target. The column is named by keyword because
# the positional `axis` argument of DataFrame.drop was removed in pandas 2.0.
train_x = train.drop(columns='Survived')
train_y = train['Survived']

## 모델 정의 및 학습

In [40]:
import warnings


def warn(*args, **kwargs):
    """Accept any arguments and do nothing; stands in for warnings.warn."""
    return None


# Replace the stdlib entry point so every later warnings.warn(...) call in
# this process is silently discarded.
warnings.warn = warn

In [80]:
import pandas as pd
import numpy as np
from sklearn.metrics import *
from sklearn.model_selection import KFold, StratifiedKFold
import lightgbm as lgb


def f_pr_auc(probas_pred, y_true):
    """
    Custom LightGBM evaluation metric: area under the precision-recall curve.

    [Input]
    probas_pred: predicted scores for the evaluated rows
    y_true: object exposing get_label(), which returns the true labels

    [Output]
    (metric name, PR-AUC value, True) -- the final True marks the metric
    as one to maximize
    """
    precision, recall, _thresholds = precision_recall_curve(
        y_true.get_label(), probas_pred
    )
    return "pr_auc", auc(recall, precision), True


def lgb_train_model(train_x, train_y, params, n_fold, fold_rs=0, categorical_feature = cat_features):
    """
    Stratified k-fold cross validation with LightGBM on the given data.

    [Input]
    train_x: features, a pandas DataFrame or a 2-D numpy array
    train_y: labels, a pandas Series or a numpy array aligned row-by-row
             with train_x
    params: LightGBM parameter dict
    n_fold: number of folds
    fold_rs: random state used to shuffle the folds
    categorical_feature: categorical columns forwarded to LightGBM

    [Output]
    models: list with the model fitted on each fold
    valid_probs: out-of-fold predictions, same shape as train_y

    If train_x is neither a DataFrame nor an ndarray, a message is printed
    and (-1, -1) is returned.
    """

    valid_probs = np.zeros((train_y.shape))

    k_fold = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=fold_rs)

    # The container type does not change between folds, so check it once.
    # isinstance also accepts DataFrame/ndarray subclasses.
    is_frame = isinstance(train_x, pd.DataFrame)
    if not is_frame and not isinstance(train_x, np.ndarray):
        print('Unknown data type for X')
        return -1, -1

    # The fold indices are positional. Indexing a Series with them via []
    # is label-based and picks wrong rows (or raises KeyError) whenever the
    # index is not the default 0..n-1, so work on a plain array instead.
    labels = np.asarray(train_y)

    models = []
    for train_idx, val_idx in k_fold.split(train_x, train_y):

        if is_frame:
            X = train_x.iloc[train_idx, :]
            valid_x = train_x.iloc[val_idx, :]
        else:
            X = train_x[train_idx, :]
            valid_x = train_x[val_idx, :]

        y = labels[train_idx]
        valid_y = labels[val_idx]

        d_train = lgb.Dataset(X, y, categorical_feature=categorical_feature)
        d_val = lgb.Dataset(valid_x, valid_y, categorical_feature=categorical_feature)

        # NOTE(review): early_stopping_rounds / verbose_eval are keyword
        # arguments of older LightGBM releases; newer releases expect
        # callbacks instead -- confirm against the installed version.
        model = lgb.train(
            params,
            train_set=d_train,
            num_boost_round=10000,
            valid_sets=[d_train, d_val],
            feval=f_pr_auc,
            early_stopping_rounds=100,
            verbose_eval=False,
            categorical_feature=categorical_feature
        )

        # Store this fold's predictions at the held-out row positions.
        valid_probs[val_idx] = model.predict(valid_x)

        models.append(model)

    return models, valid_probs

# LightGBM settings shared by every fold.
lgb_param = {
    'force_col_wise': True,
    'objective': 'binary',
    'tree_learner': 'feature',
    'boosting': 'gbdt',
    'metrics': 'auc',
    'random_state': 0,
    'verbose': -1,
    'max_depth': -1,
    'n_jobs': -1,
}

# Fit one model per fold and score the out-of-fold predictions with ROC AUC.
models_lgb, valid_probs_lgb = lgb_train_model(
    train_x,
    train_y,
    lgb_param,
    n_fold=5,
    fold_rs=0,
)
auc_score_lgb = roc_auc_score(train_y, valid_probs_lgb)
print(f'Lightgbm validation score:: {auc_score_lgb:.5f}')


Lightgbm validation score:: 0.84834


In [81]:
# Display the (rows, columns) shape of the training feature frame.
train_x.shape

(891, 12)

In [76]:
# Display the per-feature importance values of the first fold's model.
models_lgb[0].feature_importance()

array([28,  7,  4, 26,  0,  0, 19,  3,  0, 27,  0,  5], dtype=int32)

In [82]:
# Predict the test set with every fold model, then average the predictions
# element-wise across models.
test_preds = [fold_model.predict(test) for fold_model in models_lgb]
final_preds = np.mean(test_preds, axis=0)

In [78]:
# Display the averaged test-set predictions.
final_preds

array([0.09114544, 0.28844851, 0.08969981, 0.1880847 , 0.56621828,
       0.25054957, 0.50627685, 0.17109527, 0.74133275, 0.09315019,
       0.10441103, 0.20948341, 0.8654472 , 0.08661375, 0.84936332,
       0.82952272, 0.09622957, 0.11426404, 0.52289585, 0.44983975,
       0.22077084, 0.55736234, 0.86623053, 0.23967439, 0.83448396,
       0.08821607, 0.86686102, 0.113844  , 0.54837236, 0.23476812,
       0.09713922, 0.12069573, 0.39642379, 0.17807581, 0.5014564 ,
       0.11022753, 0.22002111, 0.2559675 , 0.10663573, 0.50296021,
       0.09555086, 0.54047553, 0.133309  , 0.83765296, 0.86522869,
       0.15770171, 0.52202624, 0.14999498, 0.85888676, 0.50765382,
       0.63060377, 0.19067357, 0.85150871, 0.80179654, 0.13901569,
       0.12066743, 0.08613427, 0.11869588, 0.13743474, 0.8522826 ,
       0.09535074, 0.11544504, 0.10703479, 0.56021101, 0.70193797,
       0.84790484, 0.53424533, 0.17045146, 0.64722725, 0.79079038,
       0.52136143, 0.09596044, 0.45062428, 0.60106923, 0.86151

## 제출 파일 생성

In [83]:
from pathlib import Path

# Start from the provided submission template and fill in the predictions.
submission = pd.read_csv("titanic/submission.csv")
# NOTE(review): final_preds holds the averaged raw model outputs, not 0/1
# labels -- confirm that this is the format the competition expects.
submission['Survived'] = final_preds

# Create the output directory first: to_csv raises if it does not exist,
# which would lose the result after the whole training run.
submission_path = Path('submission/v_2.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index = False)