## 20-1학기 시스템 경영 종합 설계
# 5조 인공 신경망을 이용한 병원 폐업 예측 모델링  

# - 목차 -
- 연구 목적
- 데이터 로드
- 데이터 전처리
- 모델링 및 학습
- 예측 및 평가

# 연구 목적

대출 상품의 심사 과정에서, 병원의 상환 기간 내 폐업 가능성을 예측해 대출 승인/반려 여부를 결정한다. 또한, 리스크에 따라 우대이율을 책정하기 위해 병원의 폐업 여부를 예측한다.



In [1]:
import pandas as pd
import numpy as np

# Load the train and test splits. reset_index() turns the default row index
# into an explicit 'index' column (visible as column 0 in the info() output).
train_data = pd.read_csv('./data/train.csv').reset_index()
test_data = pd.read_csv('./data/test.csv').reset_index()

# Print column names, non-null counts and dtypes to spot missing values.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 59 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   index               301 non-null    int64  
 1   inst_id             301 non-null    int64  
 2   OC                  301 non-null    object 
 3   sido                301 non-null    object 
 4   sgg                 301 non-null    int64  
 5   openDate            301 non-null    int64  
 6   bedCount            296 non-null    float64
 7   instkind            300 non-null    object 
 8   revenue1            293 non-null    float64
 9   salescost1          293 non-null    float64
 10  sga1                293 non-null    float64
 11  salary1             293 non-null    float64
 12  noi1                293 non-null    float64
 13  noe1                293 non-null    float64
 14  interest1           293 non-null    float64
 15  ctax1               293 non-null    float64
 16  profit1 

# 데이터 전처리
## 결측치
* 학습 데이터 중 8개의 인스턴스에서 대부분의 값이 NaN이므로 8개 행을 삭제하였다. 
* 여러 값에서 결측치(NaN, 0)가 발생하여 평균으로 대체하였다.
* employee 정보는 임금(salary)을 인당 평균 임금으로 나누어 추정하였다. 
* ownerChange는 NaN일 때 unknown으로 변환한 후 same(0)과 change(1)의 중간값(0.5)을 부여하였다.


In [3]:
# Drop the rows whose revenue1 is missing.
missing_idx_revenue1 = np.where(np.isnan(train_data['revenue1']))[0]
# The second reset_index() adds a 'level_0' column next to the existing 'index'.
X_train_dropna = train_data.drop(missing_idx_revenue1).reset_index()

# ownerChange: label missing entries as 'unknown', then encode numerically.
# np.where() returns a tuple of arrays; it is kept under its original name.
missing_idx_ownerChange = np.where(pd.isnull(X_train_dropna['ownerChange']))
# fillna() writes to the frame directly. The previous chained assignment
# (df[col].loc[idx] = ...) does not update the frame under Copy-on-Write.
X_train_dropna['ownerChange'] = X_train_dropna['ownerChange'].fillna('unknown')
ownerChange_map = {'same':0, 'change':1, 'unknown': 0.5}
X_train_dropna['ownerChange'] = X_train_dropna['ownerChange'].apply(ownerChange_map.get)

In [4]:
# Employee estimation.
# Row positions where employee1 / employee2 are missing.
missing_idx_employee1 = np.where(np.isnan(X_train_dropna['employee1']))[0]
missing_idx_employee2 = np.where(np.isnan(X_train_dropna['employee2']))[0]
missing_idx_employee_union = sorted(set(missing_idx_employee1) | set(missing_idx_employee2))
missing_idx_employee_intersection = sorted(set(missing_idx_employee1) & set(missing_idx_employee2))

# Step 1: when only one of the two counts is missing, copy the other one.
# Sets make the membership tests O(1); .loc[row, col] writes to the frame
# directly instead of through a chained assignment.
_missing_in_both = set(missing_idx_employee_intersection)
_missing_in_first = set(missing_idx_employee1)
for idx in missing_idx_employee_union:
    if idx in _missing_in_both:
        continue  # handled in step 3 below
    if idx in _missing_in_first:
        X_train_dropna.loc[idx, 'employee1'] = X_train_dropna.loc[idx, 'employee2']
    else:
        X_train_dropna.loc[idx, 'employee2'] = X_train_dropna.loc[idx, 'employee1']

# Step 2: average salary per employee, computed only from rows that have
# at least one employee count, averaged over the two periods.
X_train_salary = X_train_dropna.drop(missing_idx_employee_intersection)
avg_salary = (X_train_salary['salary1'].sum() / X_train_salary['employee1'].sum()
              + X_train_salary['salary2'].sum() / X_train_salary['employee2'].sum()) / 2

# Step 3: when both counts are missing, estimate them as salary / avg_salary.
for idx in missing_idx_employee_intersection:
    for salary_col, employee_col in (('salary1', 'employee1'), ('salary2', 'employee2')):
        estimate = X_train_dropna.loc[idx, salary_col] / avg_salary
        # int() raises on NaN/inf, so such rows are left as NaN.
        if np.isfinite(estimate):
            X_train_dropna.loc[idx, employee_col] = int(estimate)

In [5]:
# Treat zeros as missing, then impute the remaining gaps with column means.
# NOTE(review): replace(0, ...) is applied to the whole frame, so
# ownerChange == 0 ('same') also becomes NaN and is mean-filled here.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
X_train_dropna = X_train_dropna.replace(0, np.nan)
# numeric_only=True: the frame still holds object columns, on which
# DataFrame.mean() raises in pandas >= 2.0.
X_train_dropna_imputed = X_train_dropna.fillna(X_train_dropna.mean(numeric_only=True))

## Labeling and Data Scaling

In [6]:
# Target labels, read from the frame before the feature columns are pruned.
y_train = X_train_dropna['OC']

# Remove the columns that are not used as features.
unused_columns = [
    'OC', 'sido', 'inst_id', 'level_0', 'index', 'sgg',
    'receivableL1', 'receivableL2',
]
X_train_dropna_imputed = X_train_dropna_imputed.drop(columns=unused_columns)

# One-hot encode the remaining categorical columns.
X_train = pd.get_dummies(X_train_dropna_imputed)

In [7]:
# Standardise the first 50 columns one by one and remember each column's
# mean / std so the same transform can be applied to other data later.
feature_mean, feature_std = [], []
for name in X_train.columns[:50]:
    col_mean = X_train[name].mean()
    col_std = X_train[name].std()
    X_train[name] = (X_train[name] - col_mean) / col_std
    feature_mean.append(col_mean)
    feature_std.append(col_std)

In [8]:
# Intended encoding for ownerChange: same -> 0, change -> 1, unknown -> 0.5.
# Every entry equal to the value stored at row label 0 is set to 0.
# NOTE(review): this presumably relies on row 0 being a 'same' hospital;
# verify against the data.
first_owner_change = X_train['ownerChange'][0]
X_train['ownerChange'] = X_train['ownerChange'].replace(first_owner_change, 0)

In [9]:
# Encode the target labels as integers.
# NOTE(review): the ' close' key carries a leading space, presumably to match
# the raw label text in train.csv; confirm before normalising it.
open_map = {
    'open': 1,
    ' close': 0,
}
y_train = y_train.apply(lambda label: open_map.get(label))

## 테스트 데이터 전처리

In [10]:
# ownerChange: label missing entries as 'unknown', then encode numerically
# with the same mapping that was used for the training data.
# np.where() returns a tuple of arrays; it is kept under its original name.
missing_idx_ownerChange = np.where(pd.isnull(test_data['ownerChange']))
# fillna() writes to the frame directly. The previous chained assignment
# (df[col].loc[idx] = ...) does not update the frame under Copy-on-Write.
test_data['ownerChange'] = test_data['ownerChange'].fillna('unknown')
test_data['ownerChange'] = test_data['ownerChange'].apply(ownerChange_map.get)

In [11]:
# Employee estimation for the test data.
# Row positions where employee1 / employee2 are missing.
missing_idx_employee1 = np.where(pd.isnull(test_data['employee1']))[0]
missing_idx_employee2 = np.where(pd.isnull(test_data['employee2']))[0]
missing_idx_employee_union = sorted(set(missing_idx_employee1) | set(missing_idx_employee2))
missing_idx_employee_intersection = sorted(set(missing_idx_employee1) & set(missing_idx_employee2))

# Step 1: when only one of the two counts is missing, copy the other one.
# Sets make the membership tests O(1); .loc[row, col] writes to the frame
# directly instead of through a chained assignment.
_missing_in_both = set(missing_idx_employee_intersection)
_missing_in_first = set(missing_idx_employee1)
for idx in missing_idx_employee_union:
    if idx in _missing_in_both:
        continue  # handled in step 2 below
    if idx in _missing_in_first:
        test_data.loc[idx, 'employee1'] = test_data.loc[idx, 'employee2']
    else:
        test_data.loc[idx, 'employee2'] = test_data.loc[idx, 'employee1']

# Step 2: when both counts are missing, estimate them as salary / avg_salary,
# reusing the avg_salary computed from the training data.
for idx in missing_idx_employee_intersection:
    for salary_col, employee_col in (('salary1', 'employee1'), ('salary2', 'employee2')):
        estimate = test_data.loc[idx, salary_col] / avg_salary
        # int() raises on NaN/inf, so such rows are left as NaN.
        if np.isfinite(estimate):
            test_data.loc[idx, employee_col] = int(estimate)

In [12]:
# Treat zeros as missing, then impute the gaps with the TRAINING column means.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
test_data = test_data.replace(0, np.nan)
# numeric_only=True: the training frame still holds object columns, on which
# DataFrame.mean() raises in pandas >= 2.0.
test_data_imputed = test_data.fillna(X_train_dropna.mean(numeric_only=True))

# Strip "," from the employee counts and convert them to float.
for employee_col in ('employee1', 'employee2'):
    test_data_imputed[employee_col] = (
        test_data_imputed[employee_col].astype('str').str.replace(",", "").astype('float')
    )

In [13]:
# Remove the columns that are not used as features. There is no 'level_0'
# in this list: test_data went through reset_index() only once.
unused_test_columns = [
    'OC', 'sido', 'inst_id', 'index', 'sgg',
    'receivableL1', 'receivableL2',
]
test_data_imputed = test_data_imputed.drop(columns=unused_test_columns)

In [14]:
# One-hot encode the categorical columns of the test data.
X_test = pd.get_dummies(test_data_imputed)
# Align with the training feature matrix: same columns in the same order.
# Dummy columns absent from the test data (e.g. instkind_dental_clinic) are
# added as 0, and any column not seen during training is dropped. Appending
# the missing column by hand would put it last and shift the feature order
# relative to X_train.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [15]:
# Standardise the first 50 test columns (by position) with the mean / std
# recorded for the training column at the same position.
for position in range(50):
    target = X_test.columns[position]
    X_test[target] = (X_test[target] - feature_mean[position]) / feature_std[position]

# 모델링 및 학습 (MLP)

In [16]:
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_validate


# Seed passed to the classifier as random_state.
RANDOM = 7777

# Multi-layer perceptron with three hidden layers of 30 units each.
clf = MLPClassifier(
    hidden_layer_sizes=(30, 30, 30),
    max_iter=5000,
    early_stopping=True,
    random_state=RANDOM,
)

# 5-fold cross-validation, also collecting the score on each training fold.
cv = cross_validate(clf, X_train, y_train, cv=5, return_train_score=True)

mean_train_score = np.mean(cv['train_score'])
mean_validation_score = np.mean(cv['test_score'])
print('train accuracy:', mean_train_score)
print('cross validation accuracy mean:', mean_validation_score)

train accuracy: 0.9436988543371522
cross validation accuracy mean: 0.9453535943892462
