# 前回行った各手法

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
pd.set_option("display.max_rows",500)
pd.set_option("display.max_columns",500)

train = pd.read_csv("/Users/nobu/Documents/データセット/home-credit-default-risk/application_train.csv")
test = pd.read_csv("/Users/nobu/Documents/データセット/home-credit-default-risk/application_test.csv")

ROC_Score_lst = []#結果を補完するリスト

In [2]:
#　objectデータだけ抜き出す
train.select_dtypes(include=object).head(100)

#ラベルだけ抜き出す
object_label = train.select_dtypes(include=object).columns

In [3]:
#抜き出したカテゴリ型の特徴量をエンコーディングし元のtrainデータに置き換える
import category_encoders as ce
encode_data = ce.OrdinalEncoder(cols=list(object_label),handle_unknown='impute')

#問１、２用の検証データを作成
train_df = encode_data.fit_transform(train)

In [4]:
#train_dfの中でTARGETとの相関係数が強い（負も含む）上位10項目のインデックスを抽出
target_corr_index = train_df.corr()["TARGET"].abs().nlargest(10).index


#train_dfの抽出したインデックスで相関係数を算出
target_corr_top10 = train_df[target_corr_index].corr()

In [5]:
#欠損値を中央値埋め
train_df = train_df.fillna(train_df.median())

# 特徴量と目的変数に分け、それぞれをndarrayに変換
X = train_df.loc[:,["EXT_SOURCE_3","EXT_SOURCE_2","EXT_SOURCE_1"]]
y = train_df["TARGET"].values

In [6]:
#訓練用75%と検証用25%に分割
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=1,stratify=y)

#特徴量を標準化
from sklearn.preprocessing import StandardScaler

std = StandardScaler()
std.fit(X_train)

std_X_train = std.transform(X_train)
std_X_test = std.transform(X_test)

#学習
from sklearn.linear_model import LogisticRegression
LR =  LogisticRegression()
LR.fit(std_X_train, y_train)

#推定
LR_pred = LR.predict_proba(std_X_test)#predict_probaは各データがそれぞれのクラスに所属する確率を求める

#ROCを求めるのに適合させるためにknn１_predの確率の部分を１次元に変換
LR_rate = LR_pred[:,1:].flatten()
LR_rate

#ROCを求める
from sklearn.metrics import roc_auc_score
print(roc_auc_score(y_test, LR_rate))

ROC_Score_lst.append(roc_auc_score(y_test, LR_rate))

0.7210138261748492


# 【問題1】クロスバリデーション

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
logreg = LogisticRegression()


In [8]:
# k 分割交差検証（主に回帰に使われる）
from sklearn.model_selection import KFold
kfold = KFold(n_splits=5)
print('Cross-validation scores: \n{}'.format(cross_val_score(logreg, X, y, cv=kfold)))

Cross-validation scores: 
[0.91979253 0.91709213 0.91881565 0.91966115 0.92047413]


In [9]:
# 層化 k 分割交差検証（主にクラス分類に使われる）
from sklearn.model_selection import StratifiedKFold
stratifiedkfold = StratifiedKFold(n_splits=5)
print('Cross-validation scores: \n{}'.format(cross_val_score(logreg, X,y, cv=stratifiedkfold)))

Cross-validation scores: 
[0.91919093 0.91905954 0.91917336 0.91918962 0.91917336]


# 【問題2】グリッドサーチ
https://qiita.com/tomov3/items/039d4271ed30490edf7b<br>
https://qiita.com/FujiedaTaro/items/5784eda386146f1fd6e7<br>
https://qiita.com/saiaron/items/bb96c0f898cd0cbcd788

#### GridSearchCVクラスには引数としてモデル、探索範囲、さらにクロスバリデーションを何分割で行うかを与えます。
#### クロスバリデーションの機能も含まれているため、これを使用する場合はKFoldクラスを利用する必要はありません。



## ロジスティック回帰のパラメーター詳細
https://qiita.com/s_yaginuma/items/460eb7bbd78e9c47df9b

### param_grid=  {"C": [0.01, 0.1, 0.5, 1,10,100]}

In [10]:
#param_grid=  {"C": [0.01, 0.1, 0.5, 1,10,100]}

# GridSearchCVのインスタンスを作成&学習&スコア記録
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# パラメータを dict 型で指定
param_grid=  {"C": [0.01, 0.1, 0.5, 1,10,100]}

# validation set は GridSearchCV が自動で作成してくれるため，training set と test set の分割のみを実行すればよい
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5,scoring='roc_auc')

# fit 関数を呼ぶことで交差検証とグリッドサーチがどちらも実行される
grid_search.fit(X_train, y_train)

#最良のスコアとパラメータは自動的に best_score_，best_estimator_ 変数にそれぞれ格納されます．

print('Test set score: {}'.format(grid_search.score(X_test, y_test)))
print('Best parameters: {}'.format(grid_search.best_params_))
print('Best cross-validation: {}'.format(grid_search.best_score_))

Test set score: 0.716527903373628
Best parameters: {'C': 100}
Best cross-validation: 0.7181746492226664


#### ➡︎ ここで重要なのは，パラメータの選択（grid_search.fit(X_train, y_train) の部分）に test set を使用していないという点<br>GridSearchCV により，汎化精度が最も高くなるようなパラメータの発見が可能となります．

In [11]:
#param_grid=  {"C": [0.01, 0.1, 0.5, 1,10,100],"solver" : ["newton-cg", "lbfgs", "liblinear", "sag", "saga"],"penalty" : ["l1","l2"]}


# GridSearchCVのインスタンスを作成&学習&スコア記録
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# パラメータを dict 型で指定
param_grid=  {"C": [0.01, 0.1, 0.5, 1,10,100],"solver" : ["newton-cg", "lbfgs", "liblinear", "sag", "saga"],"penalty" : ["l1","l2"]}

# validation set は GridSearchCV が自動で作成してくれるため，training set と test set の分割のみを実行すればよい
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=10,scoring='roc_auc')#評価指標roc_aucを指定

# fit 関数を呼ぶことで交差検証とグリッドサーチがどちらも実行される
grid_search.fit(X_train, y_train)

#最良のスコアとパラメータは自動的に best_score_，best_estimator_ 変数にそれぞれ格納されます．

print('Test set score: {}'.format(grid_search.score(X_test, y_test)))
print('Best parameters: {}'.format(grid_search.best_params_))
print('Best cross-validation: {}'.format(grid_search.best_score_))

ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver sag supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver sag supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver sag supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver sag supports only 'l2' or 'none' penalties, got l1 penalty.

ValueErr

Test set score: 0.7165278778612715
Best parameters: {'C': 100, 'penalty': 'l1', 'solver': 'saga'}
Best cross-validation: 0.7181679138544601


# 【問題3】Kaggle Notebooksからの調査

### ①[sklearn.preprocessing.PolynomialFeatures ]
sklearn.preprocessing.PolynomialFeatures（degree = 2、interaction_only = False、include_bias = True、order = 'C' ）
指定された次数以下の次数を持つ特徴のすべての多項式の組み合わせで構成される新しい特徴行列を生成します。


https://www.kaggle.com/willkoehrsen/start-here-a-gentle-introduction で
EXT_SOURCE変数とDAYS_BIRTH変数を使用して多項式の特徴を作成するのに用いられている。
結果として作成したいくつかの特徴量でTARGETとの相関が高くなっている


### ②ドメイン知識をもとに作成された特徴量
https://www.kaggle.com/jsaguiar/lightgbm-with-simple-features
#### CREDIT_INCOME_PERCENT：クライアントの収入に対するクレジット額の割合<br>train['AMT_CREDIT'] / train['AMT_INCOME_TOTAL']
#### ANNUITY_INCOME_PERCENT：クライアントの収入に対するローン年金の割合<br>train['AMT_ANNUITY'] / train['AMT_INCOME_TOTAL']
#### CREDIT_TERM：支払いの月数（年金は毎月の支払い期日であるため）<br>train['AMT_ANNUITY'] / train['AMT_CREDIT']
#### DAYS_EMPLOYED_PERCENT：クライアントの年齢に対する就業日の割合<br>train['DAYS_EMPLOYED'] / train['DAYS_BIRTH']

### ③LightGBMを使っている人が多かった

# 【問題4】高い汎化性能のモデル作成
## オリジナル仮説：EXT_SOURCEを総合的に着目し検証


EXT_SOURCEが信用スコアということでアメリカの信用スコアについて調べた

➡︎アメリカの信用情報機関は三つ。(エキファックス（Equifax）エクスペリアン（Experian）トランスユニオン（TransUnion）)
<br>集まった金融機関の利用情報と事故情報を元に、「FICOスコア」(300〜850点で個人の信用力を格付けしていく手法)でクレジットスコアが算出される

➡︎日本も信用情報機関はCIC、JICC、JBAの三つ。これらはそれぞれ管轄する情報が異なる。
<br>例：CICは消費者金融や信販系クレジットカード会社、JICCはネット専業銀行や地方銀行、JBAはメガバンクや、都市銀行や第一・第二地方銀行など

➡︎返済力を調べるならこれらを総合的に見るべきでは？


➡︎アメリカの評価指標FICOが３００〜８５０点でスコアを算出するなか、今回のデータは全て小数点以下。どう解釈するか？

In [12]:
train_df.loc[:,["TARGET","DAYS_BIRTH","EXT_SOURCE_1","EXT_SOURCE_2","EXT_SOURCE_3"]].corr()

Unnamed: 0,TARGET,DAYS_BIRTH,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3
TARGET,1.0,0.078239,-0.098887,-0.160295,-0.155892
DAYS_BIRTH,0.078239,1.0,-0.362185,-0.091947,-0.178527
EXT_SOURCE_1,-0.098887,-0.362185,1.0,0.134993,0.1091
EXT_SOURCE_2,-0.160295,-0.091947,0.134993,1.0,0.094147
EXT_SOURCE_3,-0.155892,-0.178527,0.1091,0.094147,1.0


TARGETと負の相関があることからEXT_SOURCEの数値が低いほど返済力があると考えられる。

In [13]:
#EXT_SOURCEを総合的に見るためにそれぞれを合計する
train_df["total_EXT_SOURCE"] = train_df["EXT_SOURCE_1"] + train_df["EXT_SOURCE_2"] + train_df["EXT_SOURCE_3"]

## 案１：TARGETとの相関が高かったEXT_SOURCEを合計したものと、DAYS_BIRTHを特徴量に設定しROC得点を算出

In [14]:
X = train_df.loc[:,["DAYS_BIRTH","total_EXT_SOURCE"]].values
y = train_df["TARGET"].values


In [15]:
logreg = LogisticRegression()
stratifiedkfold = StratifiedKFold(n_splits=5)
print('Cross-validation Average scores: \n{}'.format(cross_val_score(logreg, X,y, cv=stratifiedkfold,scoring='roc_auc').mean()))


ROC_Score_lst.append(0.5830347081056406)

Cross-validation Average scores: 
0.5830347081056406


## 案２：LightGBMを使う(案１と同条件)

In [16]:
import lightgbm as lgb
lgb = lgb.LGBMClassifier()

stratifiedkfold = StratifiedKFold(n_splits=5)
print('Cross-validation Average scores: \n{}'.format(cross_val_score(lgb, X,y, cv=stratifiedkfold,scoring='roc_auc').mean()))

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


Cross-validation Average scores: 
0.7175745251967905


In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=0)

import lightgbm as lgb
lgb_train = lgb.Dataset(X_train, y_train)
lgb_test = lgb.Dataset(X_test, y_test)

params = {
    'task' : 'train',
    'boosting_type' : 'gbdt',
    'objective' : 'regression',
    'metric' : {'l2', 'auc'},
    'num_leaves' : 31,
    'learning_rate' : 0.05,
    'feature_fraction' : 0.9,
    'bagging_fraction' : 0.8,
    'bagging_freq': 5,
    'verbose' : 0
}

gbm = lgb.train(params, lgb_train, valid_sets=lgb_test, early_stopping_rounds=100, verbose_eval=100)
print(gbm)
ROC_Score_lst.append(0.0696189)

## 案3：案１で作成した特徴量に加え、以下で作成されていた特徴量を用いてLightGBM
https://www.kaggle.com/jsaguiar/lightgbm-with-simple-features

In [18]:
#クライアントの収入に対するクレジット額の割合
train_df['CREDIT_INCOME_PERCENT'] = train_df['AMT_CREDIT'] / train_df['AMT_INCOME_TOTAL']
#クライアントの収入に対するローン支払額の割合
train_df['ANNUITY_INCOME_PERCENT'] = train_df['AMT_ANNUITY'] / train_df['AMT_INCOME_TOTAL']
#支払いの月数（年金は毎月の支払い期日であるため）
train_df['CREDIT_TERM'] = train_df['AMT_ANNUITY'] / train_df['AMT_CREDIT']
#クライアントの年齢に対する就業日の割合
train_df['DAYS_EMPLOYED_PERCENT'] = train_df['DAYS_EMPLOYED'] / train_df['DAYS_BIRTH']

In [19]:
X = train_df.loc[:,["total_EXT_SOURCE","CREDIT_INCOME_PERCENT","ANNUITY_INCOME_PERCENT","CREDIT_TERM","DAYS_EMPLOYED_PERCENT"]]
y = train_df["TARGET"]

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

import lightgbm as lgb
lgb_train = lgb.Dataset(X_train, y_train)
lgb_test = lgb.Dataset(X_test, y_test)

params = {
    'task' : 'train',
    'boosting_type' : 'gbdt',
    'objective' : 'regression',
    'metric' : {'l2', 'auc'},
    'num_leaves' : 31,
    'learning_rate' : 0.05,
    'feature_fraction' : 0.9,
    'bagging_fraction' : 0.8,
    'bagging_freq': 5,
    'verbose' : 0
}

gbm = lgb.train(params, lgb_train, valid_sets=lgb_test, early_stopping_rounds=100, verbose_eval=100)
print(gbm)
ROC_Score_lst.append(0.0679306)


Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.743599	valid_0's l2: 0.0679309
Did not meet early stopping. Best iteration is:
[97]	valid_0's auc: 0.743649	valid_0's l2: 0.0679306
<lightgbm.basic.Booster object at 0x1a237a8400>


### 案４：案３から案１で作成したtotal_EXT_SOURCEを抜いて検証

In [21]:
X = train_df.loc[:,["CREDIT_INCOME_PERCENT","ANNUITY_INCOME_PERCENT","CREDIT_TERM","DAYS_EMPLOYED_PERCENT"]]
y = train_df["TARGET"]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

import lightgbm as lgb
lgb_train = lgb.Dataset(X_train, y_train)
lgb_test = lgb.Dataset(X_test, y_test)

params = {
    'task' : 'train',
    'boosting_type' : 'gbdt',
    'objective' : 'regression',
    'metric' : {'l2', 'auc'},
    'num_leaves' : 31,
    'learning_rate' : 0.05,
    'feature_fraction' : 0.9,
    'bagging_fraction' : 0.8,
    'bagging_freq': 5,
    'verbose' : 0
}

gbm = lgb.train(params, lgb_train, valid_sets=lgb_test, early_stopping_rounds=100, verbose_eval=100)
print(gbm)
ROC_Score_lst.append(0.0704786)

Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.676755	valid_0's l2: 0.0704824
Did not meet early stopping. Best iteration is:
[97]	valid_0's auc: 0.676898	valid_0's l2: 0.0704786
<lightgbm.basic.Booster object at 0x1a22152a90>


In [22]:
result = pd.DataFrame(ROC_Score_lst,index=["1_holdout","2_total_EXT_SOURCE","3_LightGBM",
                                           "4_new_feature","5_Del_total_EXT_SOURCE"],columns=["ROC_Score"])

result

Unnamed: 0,ROC_Score
1_holdout,0.721014
2_total_EXT_SOURCE,0.583035
3_LightGBM,0.069619
4_new_feature,0.067931
5_Del_total_EXT_SOURCE,0.070479


➡︎　①の前回提出したロジスティック回帰＋ホールドアウト法が最も高得点であった。
<br>⑤にて④からtotal_EXT_SOURCEを抜いて検証したが、⑤の方が得点が高かったため、EXT_SOURCEを総合的に捉えることは有効ではないと考えられる。

# 【問題5】最終的なモデルの選定
### ①は前回提出しているため、⑤で提出する

In [23]:
#案３再現

#クライアントの収入に対するクレジット額の割合
train_df['CREDIT_INCOME_PERCENT'] = train_df['AMT_CREDIT'] / train_df['AMT_INCOME_TOTAL']
#クライアントの収入に対するローン支払額の割合
train_df['ANNUITY_INCOME_PERCENT'] = train_df['AMT_ANNUITY'] / train_df['AMT_INCOME_TOTAL']
#支払いの月数（年金は毎月の支払い期日であるため）
train_df['CREDIT_TERM'] = train_df['AMT_ANNUITY'] / train_df['AMT_CREDIT']
#クライアントの年齢に対する就業日の割合
train_df['DAYS_EMPLOYED_PERCENT'] = train_df['DAYS_EMPLOYED'] / train_df['DAYS_BIRTH']
#total_EXT_SOURCE作成
train_df["total_EXT_SOURCE"] = train_df["EXT_SOURCE_1"] + train_df["EXT_SOURCE_2"] + train_df["EXT_SOURCE_3"]

X = train_df.loc[:,["CREDIT_INCOME_PERCENT","ANNUITY_INCOME_PERCENT","CREDIT_TERM","DAYS_EMPLOYED_PERCENT"]]
y = train_df["TARGET"]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

import lightgbm as lgb
lgb_train = lgb.Dataset(X_train, y_train)
lgb_test = lgb.Dataset(X_test, y_test)

params = {
    'task' : 'train',
    'boosting_type' : 'gbdt',
    'objective' : 'regression',
    'metric' : {'l2', 'auc'},
    'num_leaves' : 31,
    'learning_rate' : 0.05,
    'feature_fraction' : 0.9,
    'bagging_fraction' : 0.8,
    'bagging_freq': 5,
    'verbose' : 0
}

gbm = lgb.train(params, lgb_train, valid_sets=lgb_test, early_stopping_rounds=100, verbose_eval=100)


Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.676755	valid_0's l2: 0.0704824
Did not meet early stopping. Best iteration is:
[97]	valid_0's auc: 0.676898	valid_0's l2: 0.0704786


In [24]:
#テストデータを訓練データ同様に前処理

#欠損値を中央値埋め
test.fillna(test.median(),inplace=True)

#　objectデータだけ抜き出す
test.select_dtypes(include=object)

#ラベルだけ抜き出す
object_label = test.select_dtypes(include=object).columns

#抜き出したカテゴリ型の特徴量をエンコーディングし元のtrainデータに置き換える
import category_encoders as ce
encode_datae = ce.OrdinalEncoder(cols=list(object_label),handle_unknown='impute')
test_df = encode_data.fit_transform(test)

#total_EXT_SOURCEを作成
test_df["total_EXT_SOURCE"] = test_df["EXT_SOURCE_1"] + test_df["EXT_SOURCE_2"] + test_df["EXT_SOURCE_3"]


#クライアントの収入に対するクレジット額の割合
test_df['CREDIT_INCOME_PERCENT'] = test['AMT_CREDIT'] / test['AMT_INCOME_TOTAL']
#クライアントの収入に対するローン支払額の割合
test_df['ANNUITY_INCOME_PERCENT'] = test['AMT_ANNUITY'] / test['AMT_INCOME_TOTAL']
#支払いの月数（年金は毎月の支払い期日であるため）
test_df['CREDIT_TERM'] = test['AMT_ANNUITY'] / test['AMT_CREDIT']
#クライアントの年齢に対する就業日の割合
test_df['DAYS_EMPLOYED_PERCENT'] = test['DAYS_EMPLOYED'] / test['DAYS_BIRTH']


In [25]:
#推定
gbm_pred = gbm.predict(test_df)

In [26]:
#提出用データ作成
submission_data =pd.DataFrame({'SK_ID_CURR': test_df['SK_ID_CURR'], 'TARGET': gbm_pred})
submission_data.to_csv('bbbbbbbb', index=False)

SCORE➡︎0.50562

・・・過学習していたのか、、、もう少し試行錯誤してみます、、、