# DACON 병원 개/폐업 분류 예측 경진대회
[DACON 병원 개/폐업 분류 예측 경진대회](https://dacon.io/competitions/official/9565/overview/description)

## Base Modeling

### 사용 라이브러리

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import koreanize_matplotlib

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score, roc_auc_score, mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestClassifier

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

import warnings
warnings.filterwarnings("ignore")

  from pandas import MultiIndex, Int64Index


In [2]:
def eval_CM(y_test, y_pred=None, show_cm=0):
    confusion = confusion_matrix(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    if show_cm:
        print(f"정확도: {acc:.4f}\n정밀도: {precision:.4f}\n재현율: {recall:.4f}\nF1: {f1:.4f}")
    else:
        print(confusion)
        print(f"정확도: {acc:.4f}\n정밀도: {precision:.4f}\n재현율: {recall:.4f}\nF1: {f1:.4f}")

def reg_score(y_true, y_pred):
    MSE = mean_squared_error(y_true, y_pred)
    RMSE = np.sqrt(mean_squared_error(y_true,y_pred))
    MAE = np.mean( np.abs((y_true - y_pred) / y_true) )
    NMAE = mean_absolute_error(y_true, y_pred)/ np.mean( np.abs(y_true) )
    MAPE = np.mean( np.abs((y_true - y_pred) / y_true) ) *100
    R2 = r2_score(y_true, y_pred)
    
    print(f"MSE: {np.round(MSE, 3)}\nRMSE: {np.round(RMSE, 3)}\nMAE: {np.round(MAE, 3)}\nNMAE: {np.round(NMAE, 3)}\nMAPE: {np.round(MAPE, 3)}\nR2: {np.round(R2, 3)}")

### Data Load

In [3]:
train = pd.read_csv("data/pre_train.csv")
test = pd.read_csv("data/pre_test.csv")

train.shape, test.shape

((301, 59), (127, 59))

### 범주형 변수 -> 수치형 변수

`OC, sido, bedCount, instkind, ownerChange`를 변경해줘야함

In [4]:
obj2num = ["OC", "sido", "bedCount", "instkind", "ownerChange"]

In [5]:
# train
temp_arr = []
for col in obj2num:
    temp_arr.append(pd.get_dummies(train[col], drop_first=True))

temp = pd.concat(temp_arr, axis=1)
df_train = pd.concat([temp, train.drop(columns=obj2num, axis=1)], axis=1) 

In [6]:
# test
temp_arr = []
for col in obj2num:
    temp_arr.append(pd.get_dummies(test[col], drop_first=True))

temp = pd.concat(temp_arr, axis=1)
df_test = pd.concat([temp, test.drop(columns=obj2num, axis=1)], axis=1) 

#### Train - Data Split

In [7]:
label = "open"
feature_names = df_train.columns.tolist()
feature_names.remove(label)

In [21]:
X_train, X_valid, y_train, y_valid = train_test_split(df_train[feature_names], df_train[label], test_size=0.15, stratify=df_train[label])

print(f"X_train: {X_train.shape}\ny_train: {y_train.shape}\nX_valid: {X_valid.shape}\ny_valid: {y_valid.shape}")

X_train: (255, 75)
y_train: (255,)
X_valid: (46, 75)
y_valid: (46,)


#### Random Forest

In [22]:
clf_rf = RandomForestClassifier()

clf_rf.fit(X_train, y_train)

pred_rf = clf_rf.predict(X_valid)

In [23]:
eval_CM(y_valid, pred_rf, 1)

정확도: 0.9565
정밀도: 0.9565
재현율: 1.0000
F1: 0.9778


#### XGBoost

In [24]:
clf_xgb = XGBClassifier()

clf_xgb.fit(X_train, y_train)

pred_xgb = clf_xgb.predict(X_valid)



In [25]:
eval_CM(y_valid, pred_xgb, 1)

정확도: 0.9348
정밀도: 0.9556
재현율: 0.9773
F1: 0.9663


#### LGBM

In [26]:
clf_lgbm = LGBMClassifier()

clf_lgbm.fit(X_train, y_train)

pred_lgbm = clf_lgbm.predict(X_valid)

In [27]:
eval_CM(y_valid, pred_lgbm, 1)

정확도: 0.9348
정밀도: 0.9556
재현율: 0.9773
F1: 0.9663


### TODO: Oversampling

`test`해서 제출해볼려고했는데 `sido`에서 일반화가 잘 안되어있어서 없는 컬럼으로 나옴