In [4]:
import json
import os
import sys
import numpy as np
import pandas as pd

In [71]:
# Load the status-history table from CSV and preview its first seven rows.
good_history = pd.read_csv('data/closed_company_history.csv')
good_history.head(7)

Unnamed: 0,BIZ_NO,END_DATE,STRT_DATE,CLSBZ_GB,STAT_OCR_DATE
0,1268145633,99991231,20220518,3,20220518
1,1388101749,99991231,20210319,3,20210319
2,1068179240,99991231,20200731,3,20191213
3,2248114507,99991231,20210531,3,20210531
4,1168132785,99991231,20210630,3,20210630
5,6178132798,99991231,20220715,3,20220715
6,5108108828,99991231,20210305,3,20210305


In [73]:
# Load the company summary table, using the CSV's first column as the row
# index, and preview its first seven rows.
good_summary = pd.read_csv('data/closed_company_summary.csv', index_col=0)
good_summary.head(7)

Unnamed: 0,BIZ_NO,BZ_TYP,CMP_SCL,PBCO_GB,PSN_CORP_GB,HDOF_BR_GB,FR_IVST_CORP_YN,VENT_YN,LIST_CD,IND_CD_ORDR,...,NATN_NM,EMP_CNT,IS_LP,IS_LC,IS_LTD,IS_SP,IS_ETC,HAS_HOMEPAGE,CEO_CNT,label
0,1268145633,M,2,2,1,1,N,N,,10.0,...,,40.0,False,False,True,False,False,True,1.0,1
1,1388101749,M,2,2,1,1,N,N,,10.0,...,,13.0,False,False,True,False,False,True,1.0,1
2,1068179240,M,2,2,1,1,N,N,,10.0,...,,1.0,False,False,True,False,False,False,1.0,1
3,2248114507,M,2,2,1,1,N,N,,10.0,...,,6.0,False,False,True,False,False,False,2.0,1
4,1168132785,M,2,2,1,1,N,N,,10.0,...,,40.0,False,False,True,False,False,True,1.0,1
5,6178132798,M,2,2,1,1,N,N,,10.0,...,,73.0,False,False,True,False,False,True,1.0,1
6,5108108828,M,2,2,1,1,N,N,,10.0,...,,10.0,False,False,True,False,False,False,1.0,1


In [74]:
# ESTB_DATE contains missing values, and casting NaN to int32 yields undefined
# integers, so missing dates are dropped before the conversion. The resulting
# array therefore only covers the rows that have an establishment date.
estb_dates = good_summary.ESTB_DATE.dropna().to_numpy().astype(np.int32)

In [75]:
# Sorted array of the distinct business numbers present in the history table.
arr_biz_no = np.unique(good_history['BIZ_NO'].to_numpy())
arr_biz_no

array([1018135422, 1018154206, 1018163684, ..., 8998600312, 8998700283,
       8998701263])

#### STATUS CODE
0: 정상 (normal)  
1: 부도 (default / insolvency)  
2: 휴업 (business suspended)  
3: 폐업 (business closed)  
4: 피합병 (absorbed in a merger)  

In [76]:
# Lookup from a raw CLSBZ_GB value to a status code.
# Values 1-4 map to themselves; values 5 and 6 both map to status 0.
gb_to_status = {gb: gb for gb in (1, 2, 3, 4)}
gb_to_status.update({5: 0, 6: 0})

In [77]:
def history_of_biz(biz_no, df_history, df_summary, dict_status):
    """Build the ordered list of status changes for one business.

    The result always starts with ``[ESTB_DATE, 0]`` (status 0 at the
    establishment date). History rows are then visited in ascending
    ``STRT_DATE`` order and a ``[start_date, status]`` entry is appended each
    time a row maps to a status that differs from the last recorded one.

    Parameters
    ----------
    biz_no : int
        Business number to look up in both frames.
    df_history : pandas.DataFrame
        Must provide the columns ``BIZ_NO``, ``STRT_DATE``, ``END_DATE``,
        ``CLSBZ_GB`` and ``STAT_OCR_DATE``.
    df_summary : pandas.DataFrame
        Must provide ``BIZ_NO`` and ``ESTB_DATE`` and contain exactly one row
        for ``biz_no``.
    dict_status : dict
        Maps ``CLSBZ_GB`` values to status codes.

    Returns
    -------
    list of [int, int]
        ``[date, status]`` pairs, the first being ``[ESTB_DATE, 0]``.

    Raises
    ------
    ValueError
        If ``df_summary`` does not hold exactly one row for ``biz_no`` or its
        ``ESTB_DATE`` is missing.
    KeyError
        If a visited row's ``CLSBZ_GB`` is not a key of ``dict_status``.
    """
    summary_rows = df_summary[df_summary['BIZ_NO'] == biz_no]
    if len(summary_rows) != 1:
        raise ValueError(
            f'expected exactly one summary row for BIZ_NO {biz_no}, '
            f'found {len(summary_rows)}'
        )
    estb_date = summary_rows['ESTB_DATE'].iloc[0]
    if pd.isna(estb_date):
        raise ValueError(f'ESTB_DATE is missing for BIZ_NO {biz_no}')

    # A stable sort keeps rows that share a STRT_DATE in their original order,
    # so the outcome does not depend on the sorting algorithm's tie handling.
    # NOTE(review): rows are ordered by STRT_DATE, but the recorded date is
    # min(STRT_DATE, STAT_OCR_DATE); confirm the output is meant to follow
    # STRT_DATE order even when STAT_OCR_DATE is earlier.
    biz_history = df_history[df_history['BIZ_NO'] == biz_no].sort_values(
        by=['STRT_DATE'], kind='mergesort'
    )

    histories = [[int(estb_date), 0]]
    last_status = 0
    rows = zip(
        biz_history['STRT_DATE'],
        biz_history['STAT_OCR_DATE'],
        biz_history['END_DATE'],
        biz_history['CLSBZ_GB'],
    )
    for strt_date, ocr_date, end_date, gb in rows:
        # The effective start is the earlier of the two recorded dates.
        real_start_date = int(min(int(strt_date), ocr_date))
        # Skip rows whose interval is empty or inverted.
        if real_start_date >= end_date:
            continue
        status = dict_status[gb]
        # Only record genuine transitions.
        if status == last_status:
            continue
        histories.append([real_start_date, status])
        last_status = status
    return histories

In [80]:
# Sort the business numbers from the history table into two problem buckets:
#   non_exist_biz_no - no matching row in good_summary
#   nan_estb         - a matching row exists but its ESTB_DATE is NaN
nan_estb, non_exist_biz_no = [], []
for biz_no in arr_biz_no:
    curr_df_summary = good_summary.loc[good_summary['BIZ_NO'] == biz_no]
    if curr_df_summary.shape[0] == 0:
        non_exist_biz_no.append(biz_no)
    elif np.isnan(curr_df_summary['ESTB_DATE'].item()):
        nan_estb.append(biz_no)

In [81]:
# Leftover inspection: `curr_df_summary` is the loop variable leaked from the
# classification loop, i.e. the summary rows matched on its final iteration.
curr_df_summary

Unnamed: 0,BIZ_NO,BZ_TYP,CMP_SCL,PBCO_GB,PSN_CORP_GB,HDOF_BR_GB,FR_IVST_CORP_YN,VENT_YN,LIST_CD,IND_CD_ORDR,...,NATN_NM,EMP_CNT,IS_LP,IS_LC,IS_LTD,IS_SP,IS_ETC,HAS_HOMEPAGE,CEO_CNT,label
6336,8998701263,M,2,2,1,1,N,N,A307160,10.0,...,,2.0,False,False,True,False,False,False,1.0,1


In [83]:
# Build the status history of every business that has a usable summary row.
# Businesses with a missing ESTB_DATE or without any summary row are skipped;
# both lists are merged into one set so each membership test is a hash lookup
# rather than a linear scan of a list.
excluded_biz_no = set(nan_estb) | set(non_exist_biz_no)
total_history = []
for biz_no in arr_biz_no:
    if biz_no in excluded_biz_no:
        continue
    histories = history_of_biz(biz_no, good_history, good_summary, gb_to_status)
    total_history.append([biz_no, histories])
df_history = pd.DataFrame(total_history, columns=['BIZ_NO', 'HISTORY'])

In [84]:
# Persist the per-business histories as CSV (the row index is written too).
# NOTE(review): the target path has no file extension, unlike the other CSV
# paths in this notebook - confirm this is intended before renaming, since
# other code may read this exact path.
df_history.to_csv('data/closed_company_history_track')

In [85]:
# Show the column labels of the assembled history frame.
df_history.columns

Index(['BIZ_NO', 'HISTORY'], dtype='object')

In [89]:
# Keep only summary rows that have an establishment date, then remove any row
# whose BIZ_NO is listed in non_exist_biz_no, and save the result.
# NOTE(review): non_exist_biz_no is filled with numbers that had no row in
# good_summary, so this mask looks like it is always all-False - confirm intent.
summary_with_estb = good_summary[good_summary['ESTB_DATE'].notna()].reset_index(drop=True)
mask = summary_with_estb['BIZ_NO'].isin(non_exist_biz_no)
good_summary1 = summary_with_estb.loc[~mask].reset_index(drop=True)
good_summary1.to_csv('data/closed_company_summary_valid.csv')

In [93]:
# Show the column labels of the filtered summary frame.
good_summary1.columns

Index(['BIZ_NO', 'BZ_TYP', 'CMP_SCL', 'PBCO_GB', 'PSN_CORP_GB', 'HDOF_BR_GB',
       'FR_IVST_CORP_YN', 'VENT_YN', 'LIST_CD', 'IND_CD_ORDR', 'IND_CD1',
       'IND_CD2', 'IND_CD3', 'PB_ORG_TYP', 'MDSCO_PRTC_YN', 'HDOF_CMP_CD',
       'ESTB_DATE', 'ESTB_GB', 'LIST_DATE', 'MN_BIZ_CONT', 'NATN_NM',
       'EMP_CNT', 'IS_LP', 'IS_LC', 'IS_LTD', 'IS_SP', 'IS_ETC',
       'HAS_HOMEPAGE', 'CEO_CNT', 'label'],
      dtype='object')

In [94]:
# Count of history business numbers that had no row in the summary table.
len(non_exist_biz_no)

7

In [95]:
# Count of history business numbers whose summary ESTB_DATE is NaN.
len(nan_estb)

565