# 1. 재무/비재무 점수 생성

In [1]:
import pandas as pd

# Load the base table (keys + target) and the financial-ratio table; both are decoded as cp949.
df_for_merge = pd.read_csv(
    './냄훈.csv',
    encoding='cp949',
)
df_finance_ratio = pd.read_csv(
    './custom_data/all_finance_data.csv',
    encoding='cp949',
)

In [2]:
# Extract the financial-ratio columns: the leading 32 columns of the ratio table.
# NOTE(review): the cut-off 32 is positional; presumably 2 key columns + 30 ratio
# columns — confirm against the CSV layout.
use_finance_columns = df_finance_ratio.columns[:32]

In [3]:
# The scorecard is built on the financial ratios, so keep only those columns.
finance_ratio = df_finance_ratio.loc[:, use_finance_columns]

In [4]:
# Join the ratios onto the key/target columns so every row carries its target value.
df = df_for_merge[['사업자등록번호', '결산년월', '휴폐업구분']].merge(
    finance_ratio,
    on=['사업자등록번호', '결산년월'],
    how='left',
)

In [5]:
# Split into target and features for the scorecard:
# the first three columns are the two keys and the target, the rest are features.
y = df.loc[:, '휴폐업구분']
X = df.iloc[:, 3:]

## 1-1. 재무 스코어카드 생성

In [6]:
from optbinning import Scorecard, BinningProcess
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Hold out 20% of the rows as a test split, with a fixed seed.
df_application_train, df_application_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Column names of all feature data (the target is held separately in y).
list_features = X.columns.to_numpy()

# Names of the categorical (object / category dtype) columns only.
list_categorical = (
    df_application_train
    .select_dtypes(include=['object', 'category'])
    .columns
    .to_numpy()
)

# Selection criteria on IV: bounds 0.005 and 0.5, strategy "highest".
selection_criteria = dict(iv=dict(min=0.005, max=0.5, strategy="highest"))

# Instantiate the BinningProcess.
binning_process = BinningProcess(
    variable_names=list_features,
    categorical_variables=list_categorical,
    selection_criteria=selection_criteria,
)

In [8]:
# Estimator passed to the scorecard below.
logreg = LogisticRegression(
    C=3,
    max_iter=1000,
    random_state=161,
)

In [9]:
# Scaling configuration: "min_max" method with bounds 0 and 1000.
scaling_method_data = dict(min=0, max=1000)
scaling_method = "min_max"

# Define the scorecard object from the binning process and the estimator.
scorecard = Scorecard(
    estimator=logreg,
    binning_process=binning_process,
    scaling_method_params=scaling_method_data,
    scaling_method=scaling_method,
    reverse_scorecard=True,
    intercept_based=False,
)

# Fit on the training split; as the last expression, the fitted object is the cell output.
scorecard.fit(df_application_train, y_train)

Scorecard(binning_process=BinningProcess(categorical_variables=array([], dtype=object),
                                         selection_criteria={'iv': {'max': 0.5,
                                                                    'min': 0.005,
                                                                    'strategy': 'highest'}},
                                         variable_names=array(['총자본증가율', '영업이익증가율', '당기순이익증가율', '자기자본증가율', '매출액증가율', '매출총이익률',
       '매출액영업이익률', '매출액경상이익률', '매출액순이익률', '총자산영업이익률', '자기자본영업이익률',
       '자기자본순이익률', '금융비용부담률', '수지비율', '사내유보대자기자본비율', '총자산순이익률', '총자본회전율',
       '자기자본회전율', '타인자본회전율', '유동자산회전율', '재고자산회전율', '당좌자산회전율', '순운전자본회전율',
       '운전자본회전율', '유보율', '자기자본비율', '유동비율', '당좌비율', '재고자산대순운전자본비율',
       '매출채권대매입채무비율'], dtype=object)),
          estimator=LogisticRegression(C=3, max_iter=1000, random_state=161),
          reverse_scorecard=True, scaling_method='min_max',
          scaling_method_params={'max': 1000, 'min': 0})

In [43]:
# Hard-coded copy of the financial-ratio feature names.
# NOTE(review): this cell's execution count (43) is out of sequence and the list
# is not referenced elsewhere in the visible notebook.
finance_column_list = [
    '총자본증가율', '영업이익증가율', '당기순이익증가율', '자기자본증가율', '매출액증가율',
    '매출총이익률', '매출액영업이익률', '매출액경상이익률', '매출액순이익률', '총자산영업이익률',
    '자기자본영업이익률', '자기자본순이익률', '금융비용부담률', '수지비율', '사내유보대자기자본비율',
    '총자산순이익률', '총자본회전율', '자기자본회전율', '타인자본회전율', '유동자산회전율',
    '재고자산회전율', '당좌자산회전율', '순운전자본회전율', '운전자본회전율', '유보율',
    '자기자본비율', '유동비율', '당좌비율', '재고자산대순운전자본비율', '매출채권대매입채무비율',
]
# Number of names in the list.
len(finance_column_list)

30

In [10]:
# Build the detailed scorecard table, then round it to three decimals.
scorecard_summary = scorecard.table(style="detailed")
scorecard_summary = scorecard_summary.round(3)

### 1-1-1. 스코어카드 내보내기

In [11]:
# Write the scorecard table to CSV (cp949, without the index column).
scorecard_summary.to_csv(
    './scorecard_finance_ratio.csv',
    encoding='cp949',
    index=False,
)

In [12]:
# Display the financial scorecard table as the cell output.
scorecard_summary

Unnamed: 0,Variable,Bin id,Bin,Count,Count (%),Non-event,Event,Event rate,WoE,IV,JS,Coefficient,Points
0,총자본증가율,0,"(-inf, -0.14)",4941,0.085,4331,610,0.123,-0.947,0.117,0.014,-0.273,31.375
1,총자본증가율,1,"[-0.14, 0.00)",18813,0.322,17764,1049,0.056,-0.078,0.002,0.000,-0.273,18.461
2,총자본증가율,2,"[0.00, 0.02)",3270,0.056,3129,141,0.043,0.193,0.002,0.000,-0.273,14.444
3,총자본증가율,3,"[0.02, 0.06)",6869,0.118,6628,241,0.035,0.407,0.016,0.002,-0.273,11.257
4,총자본증가율,4,"[0.06, 0.11)",5222,0.089,5086,136,0.026,0.715,0.034,0.004,-0.273,6.691
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6,매출채권대매입채무비율,6,"[2.38, 4.50)",7502,0.129,7164,338,0.045,0.147,0.003,0.000,-0.247,15.332
7,매출채권대매입채무비율,7,"[4.50, 8.50)",3998,0.069,3804,194,0.049,0.069,0.000,0.000,-0.247,16.379
8,매출채권대매입채무비율,8,"[8.50, inf)",4289,0.073,3948,341,0.080,-0.458,0.019,0.002,-0.247,23.471
9,매출채권대매입채무비율,9,Special,0,0.000,0,0,0.000,0.000,0.000,0.000,-0.247,17.309


### 1-1-2. 재무점수 계산하기

In [13]:
import numpy as np

def change_bin(value):
    """Convert a scorecard ``Bin`` label into numeric ``[lower, upper]`` bounds.

    Parameters
    ----------
    value : str or any
        A bin label such as ``'(-inf, -0.14)'`` or ``'[0.00, 0.02)'``, or one
        of the placeholder labels ``'Special'`` / ``'Missing'``. Non-string
        input (NaN, or a list produced by an earlier call) is returned as is,
        so re-applying the function to an already-converted column is safe.

    Returns
    -------
    list of float or float
        ``[lower, upper]`` for an interval label, ``np.nan`` for the
        placeholder labels. The bracket style of the label (open/closed) is
        not kept; callers decide how to treat the bounds.

    Raises
    ------
    ValueError
        If a string label contains a part that ``float`` cannot parse.
    """
    # Already converted or already missing: nothing to parse.
    if not isinstance(value, str):
        return value
    if value in ['Special', 'Missing']:
        return np.nan
    # Strip brackets and spaces; float() parses 'inf' and '-inf' directly.
    cleaned = value.translate(str.maketrans('', '', '()[] '))
    return [float(part) for part in cleaned.split(',')]

In [14]:
# Copy of the summary table whose 'Bin' labels are replaced by what change_bin returns.
chage_bin_scorecard = scorecard_summary.assign(
    Bin=scorecard_summary['Bin'].apply(change_bin)
)

In [15]:
# Remove only the rows whose 'Bin' is NaN (the labels change_bin maps to NaN).
# Restricting to subset=['Bin'] prevents a NaN in any other column from
# silently removing a valid bin, which would leave values without a score.
chage_bin_scorecard = chage_bin_scorecard.dropna(subset=['Bin'])

In [16]:
# Display the converted scorecard table ('Bin' now holds numeric bounds).
chage_bin_scorecard

Unnamed: 0,Variable,Bin id,Bin,Count,Count (%),Non-event,Event,Event rate,WoE,IV,JS,Coefficient,Points
0,총자본증가율,0,"[-inf, -0.14]",4941,0.085,4331,610,0.123,-0.947,0.117,0.014,-0.273,31.375
1,총자본증가율,1,"[-0.14, 0.0]",18813,0.322,17764,1049,0.056,-0.078,0.002,0.000,-0.273,18.461
2,총자본증가율,2,"[0.0, 0.02]",3270,0.056,3129,141,0.043,0.193,0.002,0.000,-0.273,14.444
3,총자본증가율,3,"[0.02, 0.06]",6869,0.118,6628,241,0.035,0.407,0.016,0.002,-0.273,11.257
4,총자본증가율,4,"[0.06, 0.11]",5222,0.089,5086,136,0.026,0.715,0.034,0.004,-0.273,6.691
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,매출채권대매입채무비율,4,"[1.62, 1.94]",2925,0.050,2828,97,0.033,0.466,0.009,0.001,-0.247,11.040
5,매출채권대매입채무비율,5,"[1.94, 2.38]",3102,0.053,2988,114,0.037,0.359,0.006,0.001,-0.247,12.473
6,매출채권대매입채무비율,6,"[2.38, 4.5]",7502,0.129,7164,338,0.045,0.147,0.003,0.000,-0.247,15.332
7,매출채권대매입채무비율,7,"[4.5, 8.5]",3998,0.069,3804,194,0.049,0.069,0.000,0.000,-0.247,16.379


In [17]:
# No score was computed for '유보율', so remove it before scoring.
# Rebinding X (instead of an in-place drop on a slice of df) and passing
# errors='ignore' keeps this cell safe to re-run once the column is gone.
X = X.drop(columns='유보율', errors='ignore')

In [18]:
X_add_score = X.copy()

for column in X.columns:
    # Bounds and points for this variable, looked up once per column rather
    # than once per row. A separate name is used so the fitted `scorecard`
    # object is not overwritten.
    variable_rows = chage_bin_scorecard.loc[chage_bin_scorecard['Variable'] == column]
    bins_and_points = list(zip(variable_rows['Bin'], variable_rows['Points']))

    result = []
    for value in X[column]:
        # Default to NaN so exactly one entry is produced per row even when the
        # value matches no bin (e.g. NaN); this keeps `result` aligned with X.
        matched_score = float('nan')
        for (lower, upper), score in bins_and_points:
            if lower <= value < upper:
                matched_score = score
                break
        result.append(matched_score)

    X_add_score[column + '점수'] = result

In [19]:
# Select the per-variable score columns by name ('<feature>점수', one per
# column of X) instead of a positional offset that silently breaks when the
# number of feature columns changes.
finance_ratio_score = X_add_score[[column + '점수' for column in X.columns]]

In [20]:
# Total financial score per row: sum across the per-variable score columns.
df['재무비율점수'] = finance_ratio_score.sum(axis='columns')

### 1-1-3. 재무점수 데이터 내보내기

In [21]:
# Export keys, total financial score and target to CSV (cp949, no index column).
df.loc[:, ['사업자등록번호', '결산년월', '재무비율점수', '휴폐업구분']].to_csv(
    './finance_ratio_score.csv',
    encoding='cp949',
    index=False,
)

In [22]:
# List the base-table columns from position 41 onward; the next section uses the
# same offset (41) to pick its feature columns from the merged table.
df_for_merge.columns[41:]

Index(['폐업률', '혼자사는사람', '취업자증감', '고용률', '경제활동참가률', '실업률', '광공업생산지수', '재정자립도',
       '특허', '가구별소득', '노인비율', '외국인비율', '인구수', '도시면적', '전기사용량'],
      dtype='object')

## 1-2. 비재무 스코어카드 생성

In [23]:
# Load '산업재해_전기요금.csv', using its first column as the index.
# NOTE(review): no encoding is passed here, unlike the cp949 reads at the top of
# the notebook — confirm the file's encoding.
df_electric = pd.read_csv('./산업재해_전기요금.csv', index_col=0)

In [24]:
# Left-join the electricity table onto the base table by the two key columns.
df_statistice_electric = df_for_merge.merge(
    df_electric,
    on=['사업자등록번호', '결산년월'],
    how='left',
)

In [25]:
# Feature subset: every column of the merged table from position 41 onward.
df_split_statistics_eletric = df_statistice_electric.loc[
    :, df_statistice_electric.columns[41:]
]

In [26]:
# Display the feature subset as the cell output.
df_split_statistics_eletric

Unnamed: 0,폐업률,혼자사는사람,취업자증감,고용률,경제활동참가률,실업률,광공업생산지수,재정자립도,특허,가구별소득,노인비율,외국인비율,인구수,도시면적,전기사용량,전기요금_minmax,산업재해
0,11.346405,33.4,6.0,60.0,62.7,4.4,97.8,76.5,52270.0,6595.0,15.2,28.97,9639541.0,605512630.0,47167206.0,0.871501,0
1,4.848229,34.9,-34.8,59.3,62.1,4.6,84.7,76.1,53124.0,6575.0,16.1,25.09,9586195.0,605680193.0,45787926.0,0.854501,0
2,0.825521,36.8,3.6,59.2,62.3,4.8,89.3,75.6,54042.0,6826.0,16.8,23.83,9472127.0,605680264.0,47295807.0,0.854335,0
3,11.346405,33.4,6.0,60.0,62.7,4.4,97.8,76.5,52270.0,6595.0,15.2,28.97,9639541.0,605512630.0,47167206.0,0.739186,0
4,4.848229,34.9,-34.8,59.3,62.1,4.6,84.7,76.1,53124.0,6575.0,16.1,25.09,9586195.0,605680193.0,45787926.0,0.753391,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72951,2.882704,34.2,-19.7,62.4,64.6,3.5,115.9,34.4,7217.0,5334.0,19.1,30.43,2176636.0,909043093.0,50422732.0,0.542914,0
72952,0.202634,35.8,16.7,63.0,64.5,2.3,117.0,32.3,7274.0,5522.0,19.8,29.53,2175960.0,912069919.0,48801626.0,0.554117,0
72953,11.346405,33.4,6.0,60.0,62.7,4.4,97.8,76.5,52270.0,6595.0,15.2,28.97,9639541.0,605512630.0,47167206.0,0.871501,0
72954,4.848229,34.9,-34.8,59.3,62.1,4.6,84.7,76.1,53124.0,6575.0,16.1,25.09,9586195.0,605680193.0,45787926.0,0.854501,0


In [27]:
# Target column for this section (rebinds y from the financial section).
y = df_statistice_electric.loc[:, '휴폐업구분']

In [28]:
from optbinning import Scorecard, BinningProcess
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Hold out 20% of the rows as a test split, with a fixed seed.
df_application_train, df_application_test, y_train, y_test = train_test_split(
    df_split_statistics_eletric,
    y,
    test_size=0.2,
    random_state=42,
)

In [29]:
# Column names of all feature data (the target is held separately in y).
list_features = df_split_statistics_eletric.columns.to_numpy()

# Names of the categorical (object / category dtype) columns only.
list_categorical = (
    df_application_train
    .select_dtypes(include=['object', 'category'])
    .columns
    .to_numpy()
)

# Selection criteria on IV: bounds 0.005 and 0.5, strategy "highest".
selection_criteria = dict(iv=dict(min=0.005, max=0.5, strategy="highest"))

# Instantiate the BinningProcess.
binning_process = BinningProcess(
    variable_names=list_features,
    categorical_variables=list_categorical,
    selection_criteria=selection_criteria,
)

In [30]:
# Estimator passed to the scorecard below.
logreg = LogisticRegression(
    C=3,
    max_iter=1000,
    random_state=161,
)

In [31]:
# Scaling configuration: "min_max" method with bounds 0 and 1000.
scaling_method_data = dict(min=0, max=1000)
scaling_method = "min_max"

# Instantiate the Scorecard from the binning process and the estimator.
scorecard = Scorecard(
    estimator=logreg,
    binning_process=binning_process,
    scaling_method_params=scaling_method_data,
    scaling_method=scaling_method,
    reverse_scorecard=True,
    intercept_based=False,
)

# Fit on the training split; as the last expression, the fitted object is the cell output.
scorecard.fit(df_application_train, y_train)

Scorecard(binning_process=BinningProcess(categorical_variables=array([], dtype=object),
                                         selection_criteria={'iv': {'max': 0.5,
                                                                    'min': 0.005,
                                                                    'strategy': 'highest'}},
                                         variable_names=array(['폐업률', '혼자사는사람', '취업자증감', '고용률', '경제활동참가률', '실업률', '광공업생산지수',
       '재정자립도', '특허', '가구별소득', '노인비율', '외국인비율', '인구수', '도시면적', '전기사용량',
       '전기요금_minmax', '산업재해'], dtype=object)),
          estimator=LogisticRegression(C=3, max_iter=1000, random_state=161),
          reverse_scorecard=True, scaling_method='min_max',
          scaling_method_params={'max': 1000, 'min': 0})

In [32]:
# Build the detailed scorecard table, then round it to three decimals.
scorecard_summary = scorecard.table(style="detailed")
scorecard_summary = scorecard_summary.round(3)

In [33]:
# Distinct variable names that appear in the scorecard table.
scorecard_summary['Variable'].unique()

array(['혼자사는사람', '취업자증감', '고용률', '경제활동참가률', '실업률', '광공업생산지수', '재정자립도',
       '특허', '가구별소득', '노인비율', '외국인비율', '인구수', '도시면적', '전기사용량',
       '전기요금_minmax'], dtype=object)

In [34]:
# Display the non-financial scorecard table as the cell output.
scorecard_summary

Unnamed: 0,Variable,Bin id,Bin,Count,Count (%),Non-event,Event,Event rate,WoE,IV,JS,Coefficient,Points
0,혼자사는사람,0,"(-inf, 27.10)",6747,0.116,6061,686,0.102,-0.728,0.085,0.010,-0.582,48.612
1,혼자사는사람,1,"[27.10, 33.75)",32636,0.559,30744,1892,0.058,-0.119,0.008,0.001,-0.582,37.468
2,혼자사는사람,2,"[33.75, 35.15)",9801,0.168,9441,360,0.037,0.360,0.019,0.002,-0.582,28.714
3,혼자사는사람,3,"[35.15, 36.55)",3291,0.056,3258,33,0.010,1.685,0.081,0.009,-0.582,4.469
4,혼자사는사람,4,"[36.55, inf)",5889,0.101,5836,53,0.009,1.795,0.158,0.017,-0.582,2.473
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,전기요금_minmax,4,"[0.61, 0.73)",13868,0.238,13229,639,0.046,0.123,0.003,0.000,-0.408,33.712
5,전기요금_minmax,5,"[0.73, 0.89)",18414,0.316,17274,1140,0.062,-0.189,0.012,0.002,-0.408,37.715
6,전기요금_minmax,6,"[0.89, inf)",4765,0.082,4394,371,0.078,-0.435,0.019,0.002,-0.408,40.875
7,전기요금_minmax,7,Special,0,0.000,0,0,0.000,0.000,0.000,0.000,-0.408,35.294


In [35]:
# Copy of the summary table whose 'Bin' labels are replaced by what change_bin returns.
chage_bin_scorecard = scorecard_summary.assign(
    Bin=scorecard_summary['Bin'].apply(change_bin)
)

### 1-2-1. 비재무점수 계산하기

In [36]:
# Remove only the rows whose 'Bin' is NaN (the labels change_bin maps to NaN).
# Restricting to subset=['Bin'] prevents a NaN in any other column from
# silently removing a valid bin, which would leave values without a score.
chage_bin_scorecard = chage_bin_scorecard.dropna(subset=['Bin'])

In [37]:
# Variables to score: the names listed by the scorecard table's 'Variable' column above.
score_columns = [
    '혼자사는사람', '취업자증감', '고용률', '경제활동참가률', '실업률',
    '광공업생산지수', '재정자립도', '특허', '가구별소득', '노인비율',
    '외국인비율', '인구수', '도시면적', '전기사용량', '전기요금_minmax',
]

In [38]:
X_add_score = df_statistice_electric.copy()

for column in score_columns:
    # Bounds and points for this variable, looked up once per column rather
    # than once per row. A separate name is used so the fitted `scorecard`
    # object is not overwritten.
    variable_rows = chage_bin_scorecard.loc[chage_bin_scorecard['Variable'] == column]
    bins_and_points = list(zip(variable_rows['Bin'], variable_rows['Points']))

    result = []
    for value in df_statistice_electric[column]:
        # Default to NaN so exactly one entry is produced per row even when the
        # value matches no bin (e.g. NaN left by the left merge); this keeps
        # `result` aligned with the frame.
        matched_score = float('nan')
        for (lower, upper), score in bins_and_points:
            if lower <= value < upper:
                matched_score = score
                break
        result.append(matched_score)

    X_add_score[column + '점수'] = result

In [39]:
# List the columns from position 58 onward: the '<variable>점수' columns added by the scoring loop.
X_add_score.columns[58:]

Index(['혼자사는사람점수', '취업자증감점수', '고용률점수', '경제활동참가률점수', '실업률점수', '광공업생산지수점수',
       '재정자립도점수', '특허점수', '가구별소득점수', '노인비율점수', '외국인비율점수', '인구수점수', '도시면적점수',
       '전기사용량점수', '전기요금_minmax점수'],
      dtype='object')

In [40]:
# Select the score columns by name ('<variable>점수' for each scored variable)
# instead of a positional offset that silently breaks when the merged table
# gains or loses a column.
df_statistice_electric_score = X_add_score[[column + '점수' for column in score_columns]]

In [41]:
# Total non-financial score per row: sum across the per-variable score columns.
df_statistice_electric['비재무점수'] = df_statistice_electric_score.sum(axis='columns')

### 1-2-2. 비재무점수 데이터 내보내기

In [42]:
# Export keys, total non-financial score and target to CSV (cp949, no index column).
df_statistice_electric.loc[:, ['사업자등록번호', '결산년월', '비재무점수', '휴폐업구분']].to_csv(
    './statistics_score.csv',
    encoding='cp949',
    index=False,
)