In [14]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import random
from ast import literal_eval

In [5]:
# Load the status history of the "opened" company set; the first CSV column
# becomes the DataFrame index.
df_opened = pd.read_csv('data/opened_company_history_track.csv', index_col=0)
# Preview: HISTORY is a stringified list of [date, status_code] pairs
# (dates look like YYYYMMDD integers, occasionally floats -- see 20080428.0).
df_opened.head(10)

Unnamed: 0,BIZ_NO,HISTORY
0,1018161162,"[[20000725, 0]]"
1,1018163758,"[[20000930, 0], [20011111, 3], [20080428.0, 0]..."
2,1018180233,"[[20020314, 0], [20180401, 3], [20180628, 0]]"
3,1018188967,"[[20021111, 0]]"
4,1018189006,"[[19390101, 0]]"
5,1018537786,"[[20110401, 0], [20200910, 2], [20201007, 0]]"
6,1018605622,"[[20040504, 0]]"
7,1018615019,"[[20050517, 0], [20111231, 3], [20170207, 0]]"
8,1018618976,"[[20051107, 0]]"
9,1018619368,"[[20051027, 0]]"


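#### STATUS CODE
0: 정상 (normal / operating)  
1: 부도 (bankruptcy / default — `bankrupcy_*` features)  
2: 휴업 (temporarily suspended — `rest_*` features)  
3: 폐업 (closed — `closed_*` features)  
4: 피합병 (merged into another company)  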

In [15]:
# Number of [date, status] events recorded in each company's HISTORY string.
df_opened['num_status'] = [
    len(literal_eval(raw_history)) for raw_history in df_opened.HISTORY
]

In [16]:
df_opened.head()

Unnamed: 0,BIZ_NO,HISTORY,num_status
0,1018161162,"[[20000725, 0]]",1
1,1018163758,"[[20000930, 0], [20011111, 3], [20080428.0, 0]...",5
2,1018180233,"[[20020314, 0], [20180401, 3], [20180628, 0]]",3
3,1018188967,"[[20021111, 0]]",1
4,1018189006,"[[19390101, 0]]",1


In [23]:
# Discard companies whose history holds exactly one event (no status change).
# A boolean mask is used instead of `drop(np.where(...)[0])`: `np.where` yields
# positional offsets while `DataFrame.drop` expects index *labels*, so the old
# form was only correct when the index happened to be 0..n-1.
df_opened1 = df_opened.loc[df_opened.num_status != 1]
df_opened1 = df_opened1.drop(columns=['num_status']).reset_index(drop=True)
df_opened1.head(10)


Unnamed: 0,BIZ_NO,HISTORY
0,1018163758,"[[20000930, 0], [20011111, 3], [20080428.0, 0]..."
1,1018180233,"[[20020314, 0], [20180401, 3], [20180628, 0]]"
2,1018537786,"[[20110401, 0], [20200910, 2], [20201007, 0]]"
3,1018615019,"[[20050517, 0], [20111231, 3], [20170207, 0]]"
4,1018649275,"[[20090826, 0], [20101011, 3], [20131028.0, 0]]"
5,1018652746,"[[20100126, 0], [20170331, 3], [20170908, 0]]"
6,1018675619,"[[20121001, 0], [20170930, 3], [20171205, 0]]"
7,1018679027,"[[20130215, 0], [20170401, 3], [20170623, 0]]"
8,1018691467,"[[20140714, 0], [20191216, 3], [20200113, 0]]"
9,1018800086,"[[20150522, 0], [20171130, 3], [20180406, 0]]"


In [47]:
# Empty frame with the feature schema: one counter per (status, look-back
# window) pair, framed by the company id / snapshot year and the label.
history_all_opened = pd.DataFrame(columns=[
    'BIZ_NO', 'year',
    *[
        f'{status}_{window}'
        for status in ('bankrupcy', 'rest', 'closed')
        for window in ('1month', '3month', '6month', '1year', '2year', '3year', 'total')
    ],
    'label',
])

In [48]:
def _shift_int_date_back(int_date, months):
    """Move a YYYYMMDD-coded date back by ``months`` calendar months.

    The day-of-month digits are carried over unchanged (not clamped to the
    length of the target month); the result is only used for ordering
    comparisons against other YYYYMMDD-coded dates.
    """
    year, month_day = divmod(int_date, 10000)
    month, day = divmod(month_day, 100)
    # Work on a zero-based month count so that borrowing from the year is
    # handled by divmod instead of producing a month <= 0.
    shifted_year, shifted_month0 = divmod(year * 12 + (month - 1) - months, 12)
    return shifted_year * 10000 + (shifted_month0 + 1) * 100 + day


def input_value_in_dict(dict_row, state, prev_int_date, int_date, base_int_date, base_int_end_date):
    """Add one finished non-normal interval to the feature counters of a row.

    Args:
        dict_row: Feature dict holding ``<status>_<window>`` counters; it is
            updated in place and also returned.
        state: Status code of the interval (1 -> 'bankrupcy', 2 -> 'rest',
            3 -> 'closed'). Any other code leaves ``dict_row`` untouched.
        prev_int_date: YYYYMMDD-coded date on which the interval started.
        int_date: YYYYMMDD-coded date on which the interval ended.
        base_int_date: YYYYMMDD-coded snapshot date the look-back windows are
            measured from.
        base_int_end_date: Intervals starting after this date are ignored.

    Returns:
        The same ``dict_row`` object.
    """
    str_state = {1: 'bankrupcy', 2: 'rest', 3: 'closed'}.get(state)
    if str_state is None:
        return dict_row
    if base_int_end_date < prev_int_date:
        return dict_row

    dict_row[f'{str_state}_total'] += 1

    # Plain integer subtraction (base - 100, - 300, - 600) yields non-calendar
    # values as soon as the base month is not larger than the offset
    # (e.g. 20220330 - 600 == 20219730), so the look-back dates are derived
    # with month-aware arithmetic. For base dates such as 1230 / 0930 the
    # values are the same as with the plain subtraction.
    lookbacks = (
        ('1month', 1), ('3month', 3), ('6month', 6),
        ('1year', 12), ('2year', 24), ('3year', 36),
    )
    for window, months in lookbacks:
        reference_date = _shift_int_date_back(base_int_date, months)
        # Count the interval when the reference date lies strictly inside it.
        if prev_int_date < reference_date < int_date:
            dict_row[f'{str_state}_{window}'] += 1
    return dict_row


In [53]:
def get_state_at_date(histories, int_date):
    """Return the first non-normal status recorded in the year of ``int_date``.

    Despite the name, this does not look up the status in force on
    ``int_date``: it scans the events in order and returns the status code of
    the first event whose date falls in the same calendar year as ``int_date``
    and whose status is not 0.

    Args:
        histories: Sequence of ``[date, status_code]`` pairs with
            YYYYMMDD-coded dates.
        int_date: YYYYMMDD-coded date; only its year part is used.

    Returns:
        The matching status code, or 0 when there is no such event (including
        when ``histories`` is empty, which previously raised
        ``UnboundLocalError``).
    """
    target_year = int_date // 10000
    for event_int_date, event_state in histories:
        if event_state != 0 and event_int_date // 10000 == target_year:
            return event_state
    return 0
    

In [54]:
# Build one feature row per (company, snapshot year) for the "opened" set.
years = [2018, 2019, 2020, 2021, 2022]
all_dict_list = []
# Template row: every counter starts at 0 and is copied for each company/year.
dict_row = {
    'BIZ_NO': 0, 'year': 0, 'bankrupcy_1month': 0, 'bankrupcy_3month': 0, 'bankrupcy_6month': 0, 'bankrupcy_1year': 0, 'bankrupcy_2year': 0, 'bankrupcy_3year': 0, 'bankrupcy_total': 0,
    'rest_1month': 0, 'rest_3month': 0, 'rest_6month': 0, 'rest_1year': 0, 'rest_2year': 0, 'rest_3year': 0, 'rest_total': 0,
    'closed_1month': 0, 'closed_3month': 0, 'closed_6month': 0, 'closed_1year': 0, 'closed_2year': 0, 'closed_3year': 0, 'closed_total': 0, 'label': 0
}
# Status codes that make a row a positive example (1: bankruptcy, 3: closed).
target_states = [1, 3]
for _, row in df_opened1.iterrows():
    histories = literal_eval(row.HISTORY)
    biz_no = row.BIZ_NO
    for year in years:
        curr_dict_row = dict(dict_row)
        curr_dict_row['BIZ_NO'] = biz_no
        curr_dict_row['year'] = year
        if year == 2022:
            # 2022 uses 20220930 as the snapshot date -- presumably the data
            # cut-off; confirm against the data source.
            base_int_date = 20220930
            base_int_end_date = base_int_date
        else:
            base_int_date = year * 10000 + 1230
            base_int_end_date = base_int_date + 1
        prev_state = 0
        # Reset per company/year. Previously this name was read before being
        # assigned for the current history, so it either raised NameError on a
        # fresh kernel or silently reused the value left by the previous
        # company/year.
        prev_int_date = None
        for event_idx, event in enumerate(histories):
            int_date, curr_state = event
            if event_idx == 0:
                # The first event only seeds the "previous" state; it must be
                # handled before the cut-off test below, which needs
                # prev_int_date.
                prev_state = curr_state
                prev_int_date = int_date
                continue
            if int_date > base_int_end_date and prev_int_date > base_int_end_date:
                break
            # Only intervals that returned to normal (status 0) are counted.
            if prev_state != 0 and curr_state == 0:
                curr_dict_row = input_value_in_dict(curr_dict_row, prev_state, prev_int_date, int_date, base_int_date, base_int_end_date)
            prev_state = curr_state
            prev_int_date = int_date
        state = get_state_at_date(histories, base_int_date)
        if state in target_states:
            curr_dict_row['label'] = 1
        all_dict_list.append(curr_dict_row)


In [55]:
len(all_dict_list)

7755

In [56]:
all_dict_list[2]

{'BIZ_NO': 1018163758,
 'year': 2020,
 'bankrupcy_1month': 0,
 'bankrupcy_3month': 0,
 'bankrupcy_6month': 0,
 'bankrupcy_1year': 0,
 'bankrupcy_2year': 0,
 'bankrupcy_3year': 0,
 'bankrupcy_total': 0,
 'rest_1month': 0,
 'rest_3month': 0,
 'rest_6month': 0,
 'rest_1year': 0,
 'rest_2year': 0,
 'rest_3year': 0,
 'rest_total': 0,
 'closed_1month': 0,
 'closed_3month': 0,
 'closed_6month': 0,
 'closed_1year': 0,
 'closed_2year': 0,
 'closed_3year': 0,
 'closed_total': 2,
 'label': 0}

In [57]:
# Load the status history of the "closed" company set; same layout as the
# opened file (BIZ_NO plus a stringified HISTORY list), first CSV column as index.
df_closed = pd.read_csv('data/closed_company_history_track.csv', index_col=0)
df_closed.head(10)

Unnamed: 0,BIZ_NO,HISTORY
0,1018135422,"[[19890901, 0], [20200930, 3]]"
1,1018154206,"[[20000301, 0], [20201231, 3]]"
2,1018163684,"[[20001017, 0], [20190917, 3]]"
3,1018178760,"[[20020126, 0], [20191010, 3]]"
4,1018194173,"[[20030430, 0], [20200831, 3]]"
5,1018194815,"[[20030530, 0], [20201224, 3]]"
6,1018513125,"[[20001024, 0], [20201231, 3]]"
7,1018527779,"[[19970906, 0], [20200218, 3]]"
8,1018536341,"[[20100325, 0], [20200929, 3]]"
9,1018540239,"[[20120301, 0], [20191231, 3]]"


In [58]:
# Count the events per company, then discard companies with exactly one event.
df_closed['num_status'] = df_closed.HISTORY.map(lambda x: len(literal_eval(x)))
# Boolean mask instead of `drop(np.where(...)[0])`: `np.where` returns
# positional offsets whereas `DataFrame.drop` expects index labels, so the old
# form was only correct for an index that is exactly 0..n-1.
df_closed1 = df_closed.loc[df_closed.num_status != 1]
df_closed1 = df_closed1.drop(columns=['num_status']).reset_index(drop=True)
df_closed1.head(10)

Unnamed: 0,BIZ_NO,HISTORY
0,1018135422,"[[19890901, 0], [20200930, 3]]"
1,1018154206,"[[20000301, 0], [20201231, 3]]"
2,1018163684,"[[20001017, 0], [20190917, 3]]"
3,1018178760,"[[20020126, 0], [20191010, 3]]"
4,1018194173,"[[20030430, 0], [20200831, 3]]"
5,1018194815,"[[20030530, 0], [20201224, 3]]"
6,1018513125,"[[20001024, 0], [20201231, 3]]"
7,1018527779,"[[19970906, 0], [20200218, 3]]"
8,1018536341,"[[20100325, 0], [20200929, 3]]"
9,1018540239,"[[20120301, 0], [20191231, 3]]"


In [61]:
# Empty frame with the feature schema: one counter per (status, look-back
# window) pair, framed by the company id / snapshot year and the label.
history_all_closed = pd.DataFrame(columns=[
    'BIZ_NO', 'year',
    *[
        f'{status}_{window}'
        for status in ('bankrupcy', 'rest', 'closed')
        for window in ('1month', '3month', '6month', '1year', '2year', '3year', 'total')
    ],
    'label',
])

In [59]:
# Build one feature row per (company, snapshot year) for the "closed" set.
years = [2018, 2019, 2020, 2021, 2022]
all_dict_list1 = []
# Template row: every counter starts at 0 and is copied for each company/year.
dict_row = {
    'BIZ_NO': 0, 'year': 0, 'bankrupcy_1month': 0, 'bankrupcy_3month': 0, 'bankrupcy_6month': 0, 'bankrupcy_1year': 0, 'bankrupcy_2year': 0, 'bankrupcy_3year': 0, 'bankrupcy_total': 0,
    'rest_1month': 0, 'rest_3month': 0, 'rest_6month': 0, 'rest_1year': 0, 'rest_2year': 0, 'rest_3year': 0, 'rest_total': 0,
    'closed_1month': 0, 'closed_3month': 0, 'closed_6month': 0, 'closed_1year': 0, 'closed_2year': 0, 'closed_3year': 0, 'closed_total': 0, 'label': 0
}
# Status codes that make a row a positive example (1: bankruptcy, 3: closed).
target_states = [1, 3]
for _, row in df_closed1.iterrows():
    histories = literal_eval(row.HISTORY)
    biz_no = row.BIZ_NO
    for year in years:
        curr_dict_row = dict(dict_row)
        curr_dict_row['BIZ_NO'] = biz_no
        curr_dict_row['year'] = year
        if year == 2022:
            # 2022 uses 20220930 as the snapshot date -- presumably the data
            # cut-off; confirm against the data source.
            base_int_date = 20220930
            base_int_end_date = base_int_date
        else:
            base_int_date = year * 10000 + 1230
            base_int_end_date = base_int_date + 1
        prev_state = 0
        # Reset per company/year. Previously this name was read before being
        # assigned for the current history, so it either raised NameError on a
        # fresh kernel or silently reused the value left by the previous
        # company/year.
        prev_int_date = None
        for event_idx, event in enumerate(histories):
            int_date, curr_state = event
            if event_idx == 0:
                # The first event only seeds the "previous" state; it must be
                # handled before the cut-off test below, which needs
                # prev_int_date.
                prev_state = curr_state
                prev_int_date = int_date
                continue
            if int_date > base_int_end_date and prev_int_date > base_int_end_date:
                break
            # Only intervals that returned to normal (status 0) are counted.
            if prev_state != 0 and curr_state == 0:
                curr_dict_row = input_value_in_dict(curr_dict_row, prev_state, prev_int_date, int_date, base_int_date, base_int_end_date)
            prev_state = curr_state
            prev_int_date = int_date
        state = get_state_at_date(histories, base_int_date)
        if state in target_states:
            curr_dict_row['label'] = 1
        all_dict_list1.append(curr_dict_row)

In [60]:
len(all_dict_list1)

30870

In [65]:
# Stack the opened-company rows followed by the closed-company rows into one table.
df_all_history = pd.DataFrame([*all_dict_list, *all_dict_list1])
df_all_history.head(10)

Unnamed: 0,BIZ_NO,year,bankrupcy_1month,bankrupcy_3month,bankrupcy_6month,bankrupcy_1year,bankrupcy_2year,bankrupcy_3year,bankrupcy_total,rest_1month,...,rest_3year,rest_total,closed_1month,closed_3month,closed_6month,closed_1year,closed_2year,closed_3year,closed_total,label
0,1018163758,2018,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,0
1,1018163758,2019,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,0
2,1018163758,2020,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,0
3,1018163758,2021,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,0
4,1018163758,2022,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,0
5,1018180233,2018,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
6,1018180233,2019,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
7,1018180233,2020,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
8,1018180233,2021,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
9,1018180233,2022,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [66]:
df_all_history.label.sum()

6377

In [67]:
# Persist the combined feature table. With the default `index=True` the row
# index is written as the first, unnamed CSV column.
df_all_history.to_csv('data/all_histories.csv')