In [None]:
# Mount Google Drive at /content/drive (this cell only works inside Google Colab)
from google.colab import drive
drive.mount('/content/drive')

## ライブラリのインポート

In [None]:
import os
import sys
import pickle
import csv
import logging
import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold

# 評価関数 ------------------------
from sklearn.metrics import f1_score
## f1_score cannot score raw probabilities, so they are binarised with a threshold first
def f1_score_prob(y_true, y_pred, threshold=0.5):
    """Return the F1 score of probability predictions after thresholding.

    y_true    : true labels, passed to f1_score unchanged
    y_pred    : predicted probabilities (array-like supporting `>`)
    threshold : values strictly greater than this become label 1, the rest 0
    """
    y_label = np.where(y_pred > threshold, 1, 0)
    return f1_score(y_true, y_label)

# モデル --------------------------
import lightgbm as lgb
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Change the working directory ----------------------------------------------------
# The relative paths used later in this notebook (./data, ./feature, ./model,
# ./submission) are resolved against this directory.
BASE_PATH = '/content/drive/MyDrive/コンペティション/【練習問題】債務不履行リスクの低減'
os.chdir(BASE_PATH)

## config.py
- ディレクトリを作成
    - `./feature/`：特徴量を保存
    - `./model/`：モデル・計算ログを保存
    - `./submission/`：提出ファイルを保存
- 前処理
- 前処理済みのデータを`.pkl`ファイルに保存
    - `.pkl`のほうが`.csv`より読み込みが速い

In [None]:
class Config:
    """Data paths plus the one-off project setup (directories and preprocessing).

    `train_path` / `test_path` point at the preprocessed pickles that the
    feature and runner classes read; they are derived from the CSV paths by
    swapping the extension.
    """
    # Paths of the raw train/test CSV files ---------------
    train_path_csv = './data/train.csv'
    test_path_csv = './data/test.csv'
    # --------------------------------------
    # Preprocessed copies live next to the CSV files with a .pkl extension
    train_path = train_path_csv.replace('.csv', '.pkl')
    test_path = test_path_csv.replace('.csv', '.pkl')

    def __init__(self):
        pass

    def make_dir(self):
        """Create the output directories, preprocess the CSVs and pickle the result.

        Writes `train_path` and `test_path`. Returns None.
        """
        # Create the output directories
        for directory in ('./feature', './model', './submission'):
            os.makedirs(directory, exist_ok=True)
        # Read the raw CSV files
        train = pd.read_csv(self.train_path_csv)
        test = pd.read_csv(self.test_path_csv)
        # Preprocess
        train, test = self.preprocess(train, test)
        # Save as pickle; `with` guarantees the handles are flushed and closed
        with open(self.train_path, 'wb') as f:
            pickle.dump(train, f)
        with open(self.test_path, 'wb') as f:
            pickle.dump(test, f)

    def preprocess(self, train, test):
        """Encode the raw columns and return the (train, test) frames.

        train and test are concatenated so that label/one-hot encodings are
        shared, then split again: rows whose mapped `loan_status` is non-null
        go to train, rows where it is null go to test.
        """
        # Write the preprocessing here -----------
        df_all = pd.concat([train, test])

        # Raw strings keep `\d` a regex escape instead of an invalid string escape
        df_all['term'] = df_all['term'].str.extract(r'(\d+)', expand=False).astype('int')
        # Ordinal-encode grade by its sorted unique values
        grade_unq = sorted(df_all['grade'].unique())
        df_all['grade'] = df_all['grade'].map({k: v for v, k in enumerate(grade_unq)})
        # Values without any digits become 0
        df_all['employment_length'] = (
            df_all['employment_length'].str.extract(r'(\d+)', expand=False).fillna(0).astype('int')
        )
        dummy_purpose = pd.get_dummies(df_all['purpose'], prefix='purpose')
        dummy_apptype = pd.get_dummies(df_all['application_type'], prefix='application_type')
        df_all['loan_status'] = df_all['loan_status'].map({'FullyPaid': 0, 'ChargedOff': 1})
        df_all = pd.concat(
            [df_all.drop(['purpose', 'application_type'], axis=1), dummy_purpose, dummy_apptype],
            axis=1,
        )

        has_target = ~df_all['loan_status'].isnull()
        train = df_all[has_target]
        test = df_all[~has_target]
        # ------------------------------
        return train, test

## base.py
特徴量・モデルクラスの基底クラス。これを継承してクラスを定義してください。

In [None]:
# Base class for feature definitions
class Feature:
    """Base class for one generated feature; subclasses implement create_features().

    Put a description of the feature in the subclass docstring: create_memo()
    writes `self.__doc__` to the feature memo file.
    """
    # Optional strings added around every generated column name (see run())
    prefix = ''
    suffix = ''

    def __init__(self):
        # The class name doubles as the feature name and the file-name stem
        self.name = self.__class__.__name__
        self.train = pd.read_pickle(Config.train_path)
        self.test = pd.read_pickle(Config.test_path)

    def run(self):
        """Build the feature and apply prefix/suffix to the column names.

        Returns self so that calls can be chained (`feat.run().save()`).
        """
        self.create_features()
        prefix = self.prefix + '_' if self.prefix else ''
        suffix = '_' + self.suffix if self.suffix else ''
        self.train.columns = prefix + self.train.columns + suffix
        self.test.columns = prefix + self.test.columns + suffix
        return self

    def create_features(self):
        """Replace self.train / self.test with the feature columns (subclass hook)."""
        raise NotImplementedError

    def save(self):
        """Pickle the train/test feature frames under ./feature/."""
        self.train.to_pickle(f'./feature/{self.name}_train.pkl')
        self.test.to_pickle(f'./feature/{self.name}_test.pkl')

    def create_memo(self):
        """Append `name, docstring` to ./feature/_feature_memo.csv unless already listed."""
        file_path = './feature/_feature_memo.csv'
        # 'a+' creates the file when missing and always writes at the end;
        # newline='' lets the csv module control line endings itself.
        with open(file_path, 'a+', encoding='utf-8', newline='') as f:
            f.seek(0)
            # Parse with csv.reader so quoted commas/newlines in a description
            # cannot be mistaken for a feature name.
            already_listed = any(row and row[0] == self.name for row in csv.reader(f))
            if already_listed:
                return
            csv.writer(f).writerow([self.name, self.__doc__])

# Base class for model definitions
class ModelBase:
    """Common interface for models: train / predict plus pickle persistence."""

    def __init__(self, run_name, params=None):
        """
        run_name : str, used as the file-name stem under ./model/
        params   : hyper-parameters handed to the concrete model (may be None)
        """
        self.run_name = run_name
        self.params = params
        self.model = None
        self.y_pred = None

    def train(self, tr_x, tr_y):
        """Fit the model on (tr_x, tr_y); implemented by subclasses."""
        raise NotImplementedError

    def predict(self, x):
        """Return predictions for x; implemented by subclasses."""
        raise NotImplementedError

    def save_model(self):
        """Pickle self.model to ./model/{run_name}.pkl."""
        with open(f'./model/{self.run_name}.pkl', 'wb') as f:
            pickle.dump(self.model, f)

    def load_model(self):
        """Load self.model from ./model/{run_name}.pkl."""
        # NOTE: unpickling can execute arbitrary code; only load files written by save_model().
        with open(f'./model/{self.run_name}.pkl', 'rb') as f:
            self.model = pickle.load(f)


## runner.py
- 学習・推論を実行するクラス

In [None]:
class Runner:
    """Runs training, cross-validation and prediction for one model class.

    Progress and scores are appended to ./model/{run_name}_calc.log; while a
    fold is training, anything printed (e.g. by the model library) is sent to
    that log file as well.
    """

    def __init__(self, run_name, model_cls, features, target, params, metric, n_fold=4, seed=71):
        """
        run_name  : str, name of the run (used in model/log file names)
        model_cls : class, model class to instantiate per fold
        features  : list, feature names (ex. ['feat1', 'feat2', ...])
        target    : str, target column
        params    : dict, model hyper-parameters
        metric    : func, evaluation metric called as metric(y_true, y_pred)
        n_fold    : int, number of folds
        seed      : int, random_state of the fold split, so every fold index
                    comes from the same partition
        """
        self.run_name = run_name
        self.model_cls = model_cls
        self.features = features
        self.target = target
        self.params = params
        self.metric = metric
        self.n_fold = n_fold
        self.seed = seed
        self.STDOUT = sys.stdout
        # Filled by run_train_cv(): out-of-fold predictions in row order, and per-fold scores
        self.va_preds = None
        self.scores = None

        self.logger(f'features: {self.features}')
        self.logger(f'params: {self.params}')

    # Train a single fold
    def train_fold(self, i_fold):
        """Train one fold, or the full data when i_fold == 'all'.

        Returns (model, va_idx, va_pred, score); the last three are None for 'all'.
        """
        train_x = self.load_x_train()
        train_y = self.load_y_train()
        model = self.build_model(i_fold)

        if i_fold == 'all':
            # No validation split: fit on everything
            model.train(train_x, train_y)
            return model, None, None, None

        # Split off the validation data for this fold
        tr_idx, va_idx = self.load_index_fold(i_fold)
        tr_x, tr_y = train_x.iloc[tr_idx], train_y.iloc[tr_idx]
        va_x, va_y = train_x.iloc[va_idx], train_y.iloc[va_idx]

        model.train(tr_x, tr_y)

        # Predict and score on the validation data
        va_pred = model.predict(va_x)
        score = self.metric(va_y, va_pred)

        return model, va_idx, va_pred, score

    # Training with cross-validation
    def run_train_cv(self):
        """Train and save one model per fold and log the per-fold and mean scores.

        The out-of-fold predictions (ordered by row position) and the fold
        scores are kept in self.va_preds / self.scores.
        """
        va_idxes = []
        va_preds = []
        scores = []

        for i_fold in range(self.n_fold):
            # Training
            print(f'{self.run_name} - Fold {i_fold + 1}')
            with open(self._log_path(), 'a') as f:
                sys.stdout = f
                try:
                    print(f'\n---------- Fold {i_fold + 1} ({datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}) ----------\n')
                    model, va_idx, va_pred, score = self.train_fold(i_fold)
                finally:
                    # Always restore stdout, otherwise a failed fold would leave
                    # sys.stdout pointing at a closed file
                    sys.stdout = self.STDOUT
            # Save the model
            model.save_model()
            # Keep the results
            va_idxes.append(va_idx)
            va_preds.append(va_pred)
            scores.append(score)
            # Log the score with the same 1-based fold label as the header above
            print(f'Fold {i_fold + 1} Score: {score}')
            self.logger(f'Fold {i_fold + 1} Score: {score}')

        # Merge the fold results back into the original row order
        va_idxes = np.concatenate(va_idxes)
        self.va_preds = np.concatenate(va_preds, axis=0)[np.argsort(va_idxes)]
        self.scores = scores

        # Show the CV score (mean over folds)
        print(f'{self.run_name} - training cv - score {np.mean(scores)}')
        self.logger(f'{self.run_name} - training cv - score {np.mean(scores)}')

    # Predict with the ensemble of the fold models
    def run_predict_cv(self):
        """Return the test predictions averaged over the saved fold models."""
        test_x = self.load_x_test()
        preds = []

        for i_fold in range(self.n_fold):
            # Load the trained model of this fold
            model = self.build_model(i_fold)
            model.load_model()
            preds.append(model.predict(test_x))

        # Average over folds
        return np.mean(preds, axis=0)

    # Train on the whole training data
    def run_train_all(self):
        """Train one model on all training rows and save it."""
        model, _, _, _ = self.train_fold('all')
        model.save_model()

    # Predict with the model trained by run_train_all
    def run_predict_all(self):
        """Return the test predictions of the model saved by run_train_all()."""
        test_x = self.load_x_test()

        model = self.build_model('all')
        model.load_model()
        return model.predict(test_x)

    # Create (instantiate) a model
    def build_model(self, i_fold):
        """Instantiate model_cls with the run name '{run_name}-{i_fold}'."""
        run_fold_name = f'{self.run_name}-{i_fold}'
        return self.model_cls(run_fold_name, self.params)

    # Return the (train, validation) indices of the given fold
    def load_index_fold(self, i_fold):
        """Return (tr_idx, va_idx) of fold i_fold from a stratified, seeded split."""
        train_y = self.load_y_train()
        dummy_x = np.zeros(len(train_y))
        # This method is called once per fold, so the shuffle must be seeded:
        # without random_state each call would draw a different partition.
        skf = StratifiedKFold(n_splits=self.n_fold, shuffle=True, random_state=self.seed)
        return list(skf.split(dummy_x, train_y))[i_fold]

    # Load the training features
    def load_x_train(self):
        """Return the training feature frame for self.features."""
        return self._load_x(Config.train_path, 'train')

    # Load the training target
    def load_y_train(self):
        """Return the target column of the preprocessed training data."""
        return pd.read_pickle(Config.train_path)[self.target]

    # Load the test features
    def load_x_test(self):
        """Return the test feature frame for self.features."""
        return self._load_x(Config.test_path, 'test')

    def _load_x(self, base_path, split):
        """Assemble features from the base pickle plus ./feature/{name}_{split}.pkl files.

        Columns found in the base pickle come first, followed by the generated
        features, each group in the order given by self.features.
        """
        base = pd.read_pickle(base_path)
        feat_origin = [feat for feat in self.features if feat in base.columns]
        feat_generated = [feat for feat in self.features if feat not in base.columns]
        x = base[feat_origin]
        if feat_generated:
            generated = [pd.read_pickle(f'./feature/{feat}_{split}.pkl') for feat in feat_generated]
            x = pd.concat([x] + generated, axis=1)
        return x

    def _log_path(self):
        """Path of this run's log file."""
        return f'./model/{self.run_name}_calc.log'

    def logger(self, text):
        """Append one line of text to the run's log file."""
        with open(self._log_path(), 'a') as f:
            print(text, file=f)

## feature.py
`base.py`のFeatureクラスを継承して、`create_features()`メソッドで特徴量を作成する。

In [None]:
# Feature definitions ------------------------------
class loan_amt_PER_term(Feature):
    """返済期間1年あたりの借入総額"""
    # English: loan amount per unit of repayment term (`loan_amnt` / `term`).
    # The docstring above is left untouched because create_memo() writes
    # self.__doc__ to the feature memo file.

    def create_features(self):
        """Replace self.train / self.test with the single ratio column."""
        combined = pd.concat([self.train, self.test])
        combined[self.name] = combined['loan_amnt'] / combined['term']
        # Test rows are the ones without a target value
        is_test = combined['loan_status'].isnull()
        self.train = combined[~is_test][[self.name]]
        self.test = combined[is_test][[self.name]]

## model.py
`base.py`のModelBaseクラスを継承して、`train()`,`predict()`を定義する（必要に応じて`save_model()`もオーバーライドする）。

In [None]:
# Define the models to use ------------------------
class ModelLGBM(ModelBase):
    """LightGBM booster trained with class weights and early stopping."""

    def __init__(self, run_name, params=None):
        super().__init__(run_name, params)

    def train(self, tr_x, tr_y):
        """Fit a booster on 80% of (tr_x, tr_y); the other 20% drives early stopping."""
        from sklearn.model_selection import train_test_split
        # Seed the inner split so repeated runs use the same early-stopping set
        tr_x, va_x, tr_y, va_y = train_test_split(tr_x, tr_y, train_size=0.8, random_state=0)
        # Share of positives; unlike value_counts()[1] this does not raise a
        # KeyError when the split contains no positive row
        pos_rate = (tr_y == 1).mean()
        # Weight each class by the inverse of its frequency
        weight = np.where(tr_y == 1, 1 / pos_rate, 1 / (1 - pos_rate))
        lgb_train = lgb.Dataset(tr_x, tr_y, weight=weight)
        lgb_eval = lgb.Dataset(va_x, va_y, reference=lgb_train)
        self.model = lgb.train(self.params,
                               lgb_train,
                               valid_sets=lgb_eval,
                               callbacks=[lgb.early_stopping(stopping_rounds=10)]
                               )

    def predict(self, x):
        """Return the booster's predictions for x (also stored in self.y_pred)."""
        self.y_pred = self.model.predict(x)
        return self.y_pred

    def save_model(self):
        """Pickle the booster and save its feature-importance plot under ./model/."""
        with open(f'./model/{self.run_name}.pkl', 'wb') as f:
            pickle.dump(self.model, f)
        # Save the feature-importance plot as well, then close the figure so
        # one open figure does not accumulate per fold
        ax = lgb.plot_importance(self.model)
        ax.figure.savefig(f'./model/{self.run_name}_feature_importance.png')
        plt.close(ax.figure)

class ModelLR(ModelBase):
    """Logistic regression."""

    def __init__(self, run_name, params=None):
        super().__init__(run_name, params)
        # params defaults to None, which cannot be unpacked with ** directly
        self.model = LogisticRegression(**(self.params or {}))

    def train(self, tr_x, tr_y, va_x=None, va_y=None):
        """Fit on (tr_x, tr_y); va_x / va_y are accepted but not used."""
        self.model.fit(tr_x, tr_y)

    def predict(self, x):
        """Return the second column of predict_proba for x (also stored in self.y_pred)."""
        self.y_pred = self.model.predict_proba(x)[:, 1]
        return self.y_pred

class ModelRFC(ModelBase):
    """Random forest classifier."""

    def __init__(self, run_name, params=None):
        super().__init__(run_name, params)
        # params defaults to None, which cannot be unpacked with ** directly
        self.model = RandomForestClassifier(**(self.params or {}))

    def train(self, tr_x, tr_y, va_x=None, va_y=None):
        """Fit on (tr_x, tr_y); va_x / va_y are accepted but not used."""
        self.model.fit(tr_x, tr_y)

    def predict(self, x):
        """Return the second column of predict_proba for x (also stored in self.y_pred)."""
        self.y_pred = self.model.predict_proba(x)[:, 1]
        return self.y_pred

## RUN.py
実行フェーズ

In [None]:
# Run only the first time: creates the output directories and writes the preprocessed .pkl files
Config().make_dir()

In [None]:
# Create and save the features ----------------------
feat_cls = [loan_amt_PER_term]

for feat_cl in feat_cls:
    feat_name = feat_cl.__name__
    # Features whose train pickle already exists are skipped
    if os.path.isfile(f'./feature/{feat_name}_train.pkl'):
        print(f'[{feat_name}] is already exist.')
    else:
        feat = feat_cl()
        feat.run().save()
        feat.create_memo()

In [None]:
# Features ----------------------------------
# Names found in the preprocessed pickle are read from it; any other name is
# loaded from ./feature/{name}_train.pkl / ./feature/{name}_test.pkl by Runner.
features = ['loan_amnt', 'term', 'interest_rate', 'grade',
            'employment_length', 'credit_score', 'purpose_car',
            'purpose_credit_card', 'purpose_debt_consolidation',
            'purpose_home_improvement', 'purpose_house', 'purpose_major_purchase',
            'purpose_medical', 'purpose_moving', 'purpose_other',
            'purpose_renewable_energy', 'purpose_small_business',
            'purpose_vacation', 'purpose_wedding', 'application_type_Individual',
            'application_type_Joint App', 'loan_amt_PER_term']

# Target --------------------------------
target = 'loan_status'

# Run name ----------------------------------
run_name = 'run001'

# Parameters ------------------------------
# NOTE(review): LightGBM documents this metric as 'binary_logloss' — confirm
# that 'binary_log_loss' is accepted by the installed version.
params = {
    'objective': 'binary',
    'metric': 'binary_log_loss',
    'num_iterations': 10000
}

# Training ------------------------------------
# Change the model class and the metric function as needed.
runner = Runner(run_name, ModelLGBM, features, target, params, f1_score_prob)
runner.run_train_cv()
preds = runner.run_predict_cv()

# Create the submission file -----------------------------------------------------
# Adapt this to the required submission format.
# Probabilities above `threshold` are submitted as 1, the rest as 0; the file
# is written without index and without a header row.
threshold = 0.5
ids = pd.read_pickle(Config.test_path)['id']
sub = pd.DataFrame({'id': ids, 'prob': np.where(preds>threshold, 1, 0)})
sub.to_csv(f'./submission/submission_{run_name}.csv', index=False, header=False)