In [3]:
import json
import os
import sys
import numpy as np
import pandas as pd
from datetime import datetime, date, timedelta

In [29]:
# Dataset selector: it is interpolated into every data/{target}_* file name
# that this notebook reads or writes.
target = 'opened'  # valid values: 'opened', 'closed'

In [30]:
# Load the per-company status history for the selected dataset.
# No index_col is passed, so index columns saved by earlier exports come back
# as ordinary "Unnamed: ..." columns and the frame gets a fresh default index.
# history_of_biz walks the rows of one company in descending index order, so
# the row order of this file matters.
good_history = pd.read_csv(f'data/{target}_company_history.csv')
good_history.head(7)

Unnamed: 0.1,Unnamed: 0,BIZ_NO,END_DATE,STRT_DATE,CLSBZ_GB,STAT_OCR_DATE
0,0,5088115592,99991231,20150209,5,
1,1,5088115592,20150208,20200731,2,
2,2,1138127100,99991231,20121105,5,
3,3,1138127100,20121104,20200731,2,
4,4,1138127100,20121004,20200731,5,
5,5,1138127100,20100726,20200731,2,
6,6,1348608497,99991231,20200731,5,


In [31]:
# Load the per-company summary table; the first CSV column becomes the index.
good_summary = pd.read_csv(f'data/{target}_company_summary.csv', index_col=0)
good_summary.head(7)

Unnamed: 0,BIZ_NO,BZ_TYP,CMP_SCL,PBCO_GB,PSN_CORP_GB,HDOF_BR_GB,FR_IVST_CORP_YN,VENT_YN,LIST_CD,IND_CD_ORDR,...,NATN_NM,EMP_CNT,IS_LP,IS_LC,IS_LTD,IS_SP,IS_ETC,HAS_HOMEPAGE,CEO_CNT,label
0,2218116923,M,2,2,1,1,N,N,,10.0,...,,12.0,False,False,True,False,False,False,2.0,0
1,1138111739,M,2,2,1,1,N,N,,10.0,...,,43.0,False,False,True,False,False,True,2.0,0
2,1108146490,M,2,2,1,1,N,N,,10.0,...,,3.0,False,False,True,False,False,True,1.0,0
3,3128134737,M,2,2,1,1,N,N,,10.0,...,,24.0,False,False,True,False,False,True,1.0,0
4,2078132193,M,2,2,1,1,N,N,,10.0,...,,35.0,False,False,True,False,False,True,2.0,0
5,4028122626,M,2,2,1,1,N,N,,10.0,...,,7.0,False,False,True,False,False,True,1.0,0
6,2228111902,M,2,2,1,1,N,N,,10.0,...,,10.0,False,False,True,False,False,False,1.0,0


In [32]:
# ESTB_DATE column of the summary table as an int32 array.
# NOTE(review): estb_dates is not referenced by any other cell visible here,
# and later cells test ESTB_DATE for NaN; casting NaN to int32 does not give a
# meaningful value -- confirm whether this line is still needed.
estb_dates = good_summary.ESTB_DATE.to_numpy().astype(np.int32)
# Distinct business numbers found in the history table (np.unique sorts them).
arr_biz_no = np.unique(good_history.BIZ_NO)
arr_biz_no

array([1018161162, 1018163758, 1018180233, ..., 8938100809, 8998100409,
       8998700089])

#### STATUS CODE
0: 정상  
1: 부도  
2: 휴업  
3: 폐업  
4: 피합병  

In [33]:
# CLSBZ_GB code -> status code used in the generated histories
# (0: normal, 1: bankruptcy, 2: business suspended, 3: business closed,
#  4: merged into another company).
# GB codes 1-4 carry over unchanged; GB codes 5 and 6 both become status 0.
gb_to_status = {code: code for code in (1, 2, 3, 4)}
gb_to_status.update({5: 0, 6: 0})

In [34]:
def date_to_int(base_date):
    """Encode a date as a YYYYMMDD integer, e.g. date(2020, 7, 31) -> 20200731.

    Integer multipliers are used on purpose: with 1e4 / 1e2 the result would be
    a float (20200731.0), which then leaks into any value it is combined with.

    Args:
        base_date: object exposing integer ``year``, ``month`` and ``day``
            attributes (``datetime.date`` or ``datetime.datetime``).

    Returns:
        int: the date in YYYYMMDD form.
    """
    return base_date.year * 10000 + base_date.month * 100 + base_date.day


def int_to_date(num):
    """Decode a YYYYMMDD number into a ``datetime.date``.

    The input is coerced to ``int`` first, so integral floats (20200731.0) and
    numpy integers are accepted as well as plain ints.

    Args:
        num: date encoded as YYYYMMDD.

    Returns:
        datetime.date: the decoded calendar date.

    Raises:
        ValueError: if ``num`` is NaN or does not encode a valid calendar date.
    """
    year, month_and_day = divmod(int(num), 10000)
    month, day = divmod(month_and_day, 100)
    return date(year=year, month=month, day=day)


In [35]:
def history_of_biz(biz_no, df_history, df_summary, dict_status):
    """Build the status timeline of one company as ``[start_date, status]`` pairs.

    The timeline starts with ``[ESTB_DATE, 0]`` taken from the summary table.
    The company's history rows are then visited in descending index order, and
    a new entry is appended each time a row yields a status different from the
    last entry.

    Args:
        biz_no: business number to look up in both tables.
        df_history: frame with BIZ_NO, STRT_DATE, END_DATE, CLSBZ_GB and
            STAT_OCR_DATE columns (dates encoded as YYYYMMDD numbers).
        df_summary: frame with BIZ_NO and ESTB_DATE columns; it must contain
            exactly one row for ``biz_no`` with a non-NaN ESTB_DATE.
        dict_status: mapping from CLSBZ_GB code to status code.

    Returns:
        list[list[int]]: ``[start_date, status]`` pairs, in visiting order.

    Raises:
        ValueError: if the summary does not hold exactly one row for
            ``biz_no``, or a required date is NaN.
        KeyError: if a CLSBZ_GB value is missing from ``dict_status``.
    """
    biz_rows = df_history[df_history['BIZ_NO'] == biz_no].sort_index(ascending=False)
    summary_rows = df_summary[df_summary['BIZ_NO'] == biz_no]

    histories = [[int(summary_rows.ESTB_DATE.item()), 0]]
    prev_status = 0
    prev_end_date = None  # END_DATE of the last appended row, if any

    for _, row in biz_rows.iterrows():
        # Effective start is the earlier of STRT_DATE and STAT_OCR_DATE;
        # a missing STAT_OCR_DATE (NaN) is ignored explicitly.
        start_date = int(row['STRT_DATE'])
        occurred_date = row['STAT_OCR_DATE']
        if not pd.isna(occurred_date):
            start_date = min(start_date, int(occurred_date))
        end_date = int(row['END_DATE'])

        # Pull the start back to the day after the previous period ended.
        # 99991231 is skipped: it decodes to date.max, so adding a day would
        # overflow (presumably it marks an open-ended period -- TODO confirm).
        if prev_end_date is not None and prev_end_date != 99991231:
            day_after_prev = int_to_date(prev_end_date) + timedelta(days=1)
            # int() keeps the timeline free of floats whatever date_to_int returns.
            start_date = min(start_date, int(date_to_int(day_after_prev)))

        if start_date >= end_date:
            # Empty or inverted period: nothing to record.
            continue
        status = dict_status[row['CLSBZ_GB']]
        if status == prev_status:
            # Same status as the last entry: not a transition.
            continue

        histories.append([start_date, status])
        prev_status = status
        prev_end_date = end_date
    return histories

In [36]:
# Collect the business numbers that cannot be given a timeline:
#   non_exist_biz_no -- present in the history table but with no summary row
#   nan_estb         -- summary row found, but its ESTB_DATE is NaN
nan_estb, non_exist_biz_no = [], []
for biz_no in arr_biz_no:
    curr_df_summary = good_summary[good_summary['BIZ_NO'] == biz_no]
    if not len(curr_df_summary):
        non_exist_biz_no.append(biz_no)
    elif np.isnan(curr_df_summary.ESTB_DATE.item()):
        nan_estb.append(biz_no)

In [37]:
# Show the summary rows matched for the last biz_no of the scan loop above.
# curr_df_summary is a leftover loop variable, so this cell only works after
# that loop has run.
curr_df_summary

Unnamed: 0,BIZ_NO,BZ_TYP,CMP_SCL,PBCO_GB,PSN_CORP_GB,HDOF_BR_GB,FR_IVST_CORP_YN,VENT_YN,LIST_CD,IND_CD_ORDR,...,NATN_NM,EMP_CNT,IS_LP,IS_LC,IS_LTD,IS_SP,IS_ETC,HAS_HOMEPAGE,CEO_CNT,label
127172,8998700089,M,2,2,1,1,N,N,,10.0,...,,10.0,False,False,True,False,False,False,1.0,0


In [38]:
# Build one status timeline per business number, skipping the ones flagged as
# having no summary row or no establishment date.
# A single set makes each skip test O(1); scanning the two lists for every
# business number made this loop quadratic.
skip_biz_no = set(nan_estb) | set(non_exist_biz_no)
total_history = []
for biz_no in arr_biz_no:
    if biz_no in skip_biz_no:
        continue
    histories = history_of_biz(biz_no, good_history, good_summary, gb_to_status)
    total_history.append([biz_no, histories])
df_history = pd.DataFrame(total_history, columns=['BIZ_NO', 'HISTORY'])

In [39]:
# Save the timelines. The HISTORY column holds Python lists, which to_csv
# writes out in their string form; the row index is written too, because
# index=False is not passed.
df_history.to_csv(f'data/{target}_company_history_track.csv')

In [40]:
# Quick check of the column names of the frame that was just saved.
df_history.columns

Index(['BIZ_NO', 'HISTORY'], dtype='object')

In [41]:
# Keep only the summary rows that have an establishment date, then save them.
good_summary1 = good_summary.dropna(subset = ['ESTB_DATE']).reset_index(drop=True)
# NOTE(review): non_exist_biz_no is filled with business numbers that have no
# row in good_summary, so this mask looks like it can never be True and the
# filter below would be a no-op -- confirm which filter was intended.
mask = good_summary1['BIZ_NO'].isin(non_exist_biz_no)
good_summary1 = good_summary1[~mask].reset_index(drop=True)
# The row index is written as the first CSV column (index=False is not passed).
good_summary1.to_csv(f'data/{target}_company_summary_valid.csv')