# 1. 데이터 불러오기 및 데이터 확인

In [6]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

df = pd.read_csv('./custom_data/base_data.csv', encoding='cp949')

# 2. 결측치 및 이상치 제거
- 02_EDA&Base_Line.ipynb와 같은 절차로 결측치와 이상치를 제거한다.

In [7]:
finance_columns = df.columns[2:46].tolist()
non_finance_columns = df.columns[46:].tolist()

# 결측치를 0으로 변환
df[finance_columns] = df[finance_columns].fillna(0)

for column in finance_columns:
    df[column] = df[column].apply(lambda x: 0 if x == 1000000000000 else x)
    df[column] = df[column].apply(lambda x: 0 if x == 888888888889 else x)

In [8]:
print(f"정제 후 결측치의 개수: {df[finance_columns].isna().sum().sum()}개")
total = 0
for column in finance_columns:
    total += len(df.loc[(df[column] == 1000000000000) | (df[column] == 888888888889)])
print(f"정제 후 이상치의 개수: {total}개")

정제 후 결측치의 개수: 0개
정제 후 이상치의 개수: 0개


In [9]:
# 자본금 이상치 0으로 변환
df['자본금'] = df['자본금'].apply(lambda x: 0 if x < 0 else x)
count_minus_capital_stock = df.loc[df['자본금'] < 0].shape[0]
print(f"정제 후 자본금이 음수인 기업의 개수: {count_minus_capital_stock}개")

정제 후 자본금이 음수인 기업의 개수: 0개


In [11]:
df[df['자산총계'] == 0]

Unnamed: 0,사업자등록번호,결산년월,유동자산,매출채권,비유동자산,유형자산,자산총계,유동부채,비유동부채,부 채 총 계,...,상장일자,주요사업내용,국가명,홈페이지URL,대표자명,직원수,종료일자,시작일자,휴폐업구분,상태발생일자
280,1018164892,20191231,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,유가증권의 매매,몰타,www.db.com,안성은,35.0,,,,
281,1018164892,20201231,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,유가증권의 매매,몰타,www.db.com,안성은,35.0,,,,
282,1018164892,20211231,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,유가증권의 매매,몰타,www.db.com,안성은,35.0,,,,
336,1018179639,20191231,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,증권투자신탁법에 따른 증권투자신탁에 대한 펀드운용서비스의 제공,영국,www.dws-korea.com,변현수,38.0,,,,
337,1018179639,20201231,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,증권투자신탁법에 따른 증권투자신탁에 대한 펀드운용서비스의 제공,영국,www.dws-korea.com,변현수,38.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108564,8768801189,20191231,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,대부업,,,김재석,10.0,,,,
108565,8768801189,20201231,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,대부업,,,김재석,10.0,,,,
108986,8948100743,20191231,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,대부업,,,김성대,44.0,,,,
108987,8948100743,20201231,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,대부업,,,김성대,44.0,,,,


In [5]:
# 자산의 이상치 제거
count_zero_total_assets = df.loc[df['자산총계'] == 0].shape[0]
df_finance_missing_value = df.loc[df['자산총계'] != 0]

print(f"자산의 이상치 제거 결과 {count_zero_total_assets}개의 데이터가 제거되었습니다.")

자산의 이상치 제거 결과 990개의 데이터가 제거되었습니다.


# 3. 계산에 필요한 변수 만들기

## 3.1. 매출총이익
- 매출총이익 = 매출액 - 매출원가

In [6]:
finance_columns.append('매출총이익')

df['매출총이익'] = df['매출액'] - df['매출원가']
df['매출총이익']

0         6629483.0
1         6541583.0
2         5933317.0
3         6158586.0
4          602346.0
            ...    
109137    1792712.0
109138          0.0
109139          0.0
109140          0.0
109141    1099319.0
Name: 매출총이익, Length: 109142, dtype: float64

## 3.2. 유보액
- 유보액 = (유보액/총자산(%) / 100) * 총자산

In [7]:
finance_columns.append('유보액')

temp = df['유보액/총자산(%)'] / 100
df['유보액'] = temp * df['자산총계']
df['유보액'] = df['유보액'].replace([np.inf, -np.inf], 0)
df['유보액']

0         1.707641e+06
1         1.561780e+06
2         2.079225e+06
3         3.018128e+06
4        -6.696973e+06
              ...     
109137    7.764357e+05
109138   -3.315551e+06
109139   -5.876556e+06
109140   -7.430339e+06
109141   -2.065972e+07
Name: 유보액, Length: 109142, dtype: float64

## 3.3. 운전자본
- 운전자본 = 재고자산 + 매출채권

In [8]:
finance_columns.append('운전자본')

df['운전자본'] = df['재고자산'] + df['매출채권']
df['운전자본']

0          127320.0
1           94085.0
2           78612.0
3          273522.0
4           17011.0
            ...    
109137    6598838.0
109138          0.0
109139          0.0
109140          0.0
109141    1201346.0
Name: 운전자본, Length: 109142, dtype: float64

## 3.4. 순운전자본
- 순운전자본 = 유동자산-유동부채

In [9]:
finance_columns.append('순운전자본')

df['순운전자본'] = df['유동자산'] - df['유동부채']
df['순운전자본']

0           2244859.0
1          -8385041.0
2           1409116.0
3          -8319242.0
4          -2262788.0
             ...     
109137     -1824989.0
109138   -160727386.0
109139     -3247151.0
109140      -119076.0
109141   -279821149.0
Name: 순운전자본, Length: 109142, dtype: float64

## 3.5. 당좌자산
- 당좌자산 = 유동자산 - 재고자산

In [10]:
finance_columns.append('당좌자산')

df['당좌자산'] = df['유동자산'] - df['재고자산']
df['당좌자산']

0         2667731.0
1         2168837.0
2         2272260.0
3         1175030.0
4          422146.0
            ...    
109137    5512483.0
109138    3608435.0
109139     475283.0
109140    1474607.0
109141    2540577.0
Name: 당좌자산, Length: 109142, dtype: float64

## 3.6. 매입채무
- 매입채무 = 매출액 / 매입채무회전율(회)

In [11]:
finance_columns.append('매입채무')

df['매입채무'] = df['매출액'] / df['매입채무회전율(회)']
df['매입채무'] = df['매입채무'].replace([np.inf, -np.inf], 0).fillna(0)
df['매입채무']

0         0.000000e+00
1         0.000000e+00
2         0.000000e+00
3         0.000000e+00
4         0.000000e+00
              ...     
109137    2.569800e+06
109138    0.000000e+00
109139    0.000000e+00
109140    0.000000e+00
109141    0.000000e+00
Name: 매입채무, Length: 109142, dtype: float64

## 3.7. 데이터 저장

### 3.7.1. 2022년 데이터 삭제

In [12]:
# 2018~2021년 데이터만 분석에 사용하기 위해서 2022년 데이터 삭제

df['결산년월'] = df['결산년월'].astype('str')
df = df[~df['결산년월'].str.contains('2022')]
df.shape

(108754, 83)

### 3.7.2. 윈저라이징(winsorizing)

In [13]:
print(finance_columns)

['유동자산', '매출채권', '비유동자산', '유형자산', '자산총계', '유동부채', '비유동부채', '부  채  총  계', '자본금', '이익잉여금(결손금）', '자본총계', '매출액', '판매비와관리비', '영업이익（손실）', '법인세비용차감전순손익', '법인세비용', '당기순이익(손실)', '기업순이익률(%)', '유보액/총자산(%)', '유보액/납입자본(%)', '매출액총이익률(%)', '매출액영업이익률(%)', '매출액순이익률(%)', '수지비율(%)', '경상수지비율', '영업비율(%)', '금융비용대매출액비율(%', '금융비용대부채비율(%)', '금융비용대총비용비율(%', '부채비율(%)', '차입금의존도(%)', '자기자본비율(%)', '순운전자본비율(%)', '유동부채비율(%)', '비유동부채비율(%)', '부채총계대 매출액(%)', '총자본회전율(회)', '재고자산회전율(회)', '매출채권회전율(회)', '매입채무회전율(회)', '미수금', '매출원가', '무형자산', '재고자산', '매출총이익', '유보액', '운전자본', '순운전자본', '당좌자산', '매입채무']


In [14]:
for i in finance_columns:
    upper_value = df[i].quantile(0.99)
    under_value = df[i].quantile(0.01)

    df[i] = df[i].apply(lambda x: upper_value if x >= upper_value else x)
    df[i] = df[i].apply(lambda x: under_value if x <= under_value else x)

In [15]:
df.to_csv('./custom_data/basic_finance_data.csv', index=False, encoding='cp949')

# 4. 기본재무비율
- 많이 사용된 재무비율 X개를 변수로 추출함
- 성장성, 수익성, 생산성, 안정성, 활동성으로 구분
- X개의 변수 중 예측 모델에 영향을 주는 변수 Y개를 선정하고자 함
- information gain과 gain ratio를 사용하여 상위 Z개를 사용함.

In [16]:
# 데이터 불러오기
import pandas as pd
import numpy as np

df = pd.read_csv('./custom_data/basic_finance_data.csv', encoding='cp949')
df.head(5)

Unnamed: 0,사업자등록번호,결산년월,유동자산,매출채권,비유동자산,유형자산,자산총계,유동부채,비유동부채,부 채 총 계,...,종료일자,시작일자,휴폐업구분,상태발생일자,매출총이익,유보액,운전자본,순운전자본,당좌자산,매입채무
0,1018100340,20181231,2667731.0,127320.0,32346444.0,1179096.0,35014174.0,422872.0,16176067.0,16598939.0,...,,,,,6629483.0,1707641.0,127320.0,2244859.0,2667731.0,0.0
1,1018100340,20191231,2168837.0,94085.0,34151633.0,1209369.0,36320470.0,10553878.0,5240445.0,15794323.0,...,,,,,6541583.0,1561780.0,94085.0,-8385041.0,2168837.0,0.0
2,1018100340,20201231,2272260.0,78612.0,38329680.0,1091153.0,40601940.0,863144.0,15496117.0,16359261.0,...,,,,,5933317.0,2079225.0,78612.0,1409116.0,2272260.0,0.0
3,1018100340,20211231,1175030.0,273522.0,37135811.0,993025.0,38310842.0,9494272.0,4901383.0,14395654.0,...,,,,,6158586.0,3018128.0,273522.0,-8319242.0,1175030.0,0.0
4,1018100772,20181231,422146.0,17011.0,20370096.0,20352846.0,20792242.0,2684934.0,11404637.0,14089571.0,...,,,,,602346.0,-6696973.0,17011.0,-2262788.0,422146.0,0.0


In [17]:
df_copy1 = df.copy()
df_copy1 = df_copy1.iloc[:, :2]
df_copy1.head()

Unnamed: 0,사업자등록번호,결산년월
0,1018100340,20181231
1,1018100340,20191231
2,1018100340,20201231
3,1018100340,20211231
4,1018100772,20181231


## 4.1. 성장성
- 총자본증가율 : O
- 영업이익증가율 : O
- 당기순이익증가율 : O
- 자기자본증가율 : O
- 매출액증가율 : O
- 종업원수증가율 : ?

### 4.1.1. 총자본증가율
- 총자산 증가율은 한 기업의 규모가 얼마나 성장하였는지 보여주는 지표
- 총자산 증가율 = (당기말 총자산 - 전기말 총자산) / 전기말 총자산

In [18]:
df_copy1['총자본증가율'] = df.groupby(['사업자등록번호']).자산총계.pct_change()
df_copy1['총자본증가율'] = df_copy1['총자본증가율'].replace([np.inf, -np.inf], 0).fillna(0)
df_copy1['총자본증가율'].head()

0    0.000000
1    0.037308
2    0.117880
3   -0.056428
4    0.000000
Name: 총자본증가율, dtype: float64

### 4.1.2. 영업이익증가율
- 영업이익증가율은 사업의 수익성이 얼마나 성장하고 있는가 알아보는데 쓰일 수 있는 지표이다.
- 영업이익증가율 = (당기말 영업이익 - 전기말 영업이익) / 전기말 영업이익

In [19]:
df = df.rename(columns={'영업이익（손실）':'영업이익'})
df_copy1['영업이익증가율'] = df.groupby(['사업자등록번호']).영업이익.pct_change()
df_copy1['영업이익증가율'] = df_copy1['영업이익증가율'].replace([np.inf, -np.inf], 0).fillna(0)
df_copy1['영업이익증가율'].head()

0    0.000000
1    1.013936
2   -0.919169
3    4.678983
4    0.000000
Name: 영업이익증가율, dtype: float64

### 4.1.3. 당기순이익증가율
- 당기순이익증가율은 당기순이익이 얼마나 증가하고 있는가를 보여주는 지표이다. 기업의 성장성을 확인할 수 있음.
- 당기순이익증가율 = (당기말 당기순이익 - 전기말 당기순이익) / 전기말 당기순이익

In [20]:
df = df.rename(columns={'당기순이익(손실)':'당기순이익'})
df_copy1['당기순이익증가율'] = df.groupby(['사업자등록번호']).당기순이익.pct_change()
df_copy1['당기순이익증가율'] = df_copy1['당기순이익증가율'].replace([np.inf, -np.inf], 0).fillna(0)
df_copy1['당기순이익증가율'].head()

0    0.000000
1    0.089367
2    0.246335
3    0.122320
4    0.000000
Name: 당기순이익증가율, dtype: float64

### 4.1.4. 자기자본증가율
- 자기자본이 당해연도에 얼마나 증가하였는가를 표시하는 지표다. 자기자본증가율을 총자산증가율과 관련하여 비교분석하며, 자기자본이 증가하는 요인으로는 유상증자, 내부유보, 자산재평가 등을 꼽을수 있다.
- 자기자본증가율 = (당기말 자기자본 - 전기말 자기자본) / 전기말 자기자본

In [21]:
df_copy1['자기자본증가율'] = df.groupby(['사업자등록번호']).자본총계.pct_change()
df_copy1['자기자본증가율'] = df_copy1['자기자본증가율'].replace([np.inf, -np.inf], 0).fillna(0)
df_copy1['자기자본증가율'].head()

0    0.000000
1    0.114629
2    0.181063
3   -0.013509
4    0.000000
Name: 자기자본증가율, dtype: float64

### 4.1.5. 매출액증가율
- 매출액증가율은 매출액이 당해연도에 얼마나 증가하였는가를 표시하는 비율이다. 매출액은 정상적인 영업활동에서 계속적으로 발생하는 영업수익이므로 매출액증가율은 기업의 성장률을 판단하는 대표적인 비율이다.
- 매출액증가율 = (당기말 매출액 - 전기말 매출액) / 전기말 매출액

In [22]:
df_copy1['매출액증가율'] = df.groupby(['사업자등록번호']).매출액.pct_change()
df_copy1['매출액증가율'] = df_copy1['매출액증가율'].replace([np.inf, -np.inf], 0).fillna(0)
df_copy1['매출액증가율'].head()

0    0.000000
1   -0.013259
2   -0.092985
3    0.037967
4    0.000000
Name: 매출액증가율, dtype: float64

### 4.1.6. 총업원수증가율

## 4.2. 수익성
- 매출총이익률 : O (매출총이익 / 매출액)
- 매출액영업이익률 : O (영업이익 / 매출액)
- 매출액경상이익률 : O (법인세비용차감전순이익 / 매출액)
- 매출액순이익률 : O (당기순이익 / 매출액)
- 총자산영업이익률 : O (영업이익 / 총자산)
- 자기자본영업이익률 : O (영업이익 / 자기자본)
- 자기자본순이익률 : O (당기순이익 / 자기자본)
- 금융비용부담률 : O (금융비용 / 매출액)
- 수지비율 : O
- 사내유보 대 자기자본비율 : O (유보액 / 자기자본)
- 총자산순이익률 : O (당기순이익 / 총자산)

### 4.2.1. 매출총이익률
- 매출액에서 매출원가를 뺀 것을 총이익이라고 한다.
이 총이익이 매출액의 몇 퍼센트에 해당하는가를 나타내는 것이 매출액총이익률이다.
제조업이라면 관리부문 등을 제외한 생산단계의 수익성을 보는데 쓰인다.
즉, 공장으로부터 제품을 출하하는 단계에서의 이익률이다.
- 매출총이익률 = 매출총이익 / 매출액

In [23]:
df_copy1['매출총이익률'] = df['매출총이익'] / df['매출액']
df_copy1['매출총이익률'] = df_copy1['매출총이익률'].replace([np.inf, -np.inf], 0).fillna(0)
df_copy1['매출총이익률'].head()

0    1.000000
1    1.000000
2    1.000000
3    1.000000
4    0.534425
Name: 매출총이익률, dtype: float64

### 4.2.2. 매출액영업이익률
- 기업의 영업효율성을 측정하는 지표
- 공식 : 영업이익 / 매출액

In [24]:
df_copy1['매출액영업이익률'] = df['영업이익'] / df['매출액']
df_copy1['매출액영업이익률'] = df_copy1['매출액영업이익률'].replace([np.inf, -np.inf], 0).fillna(0)
df_copy1['매출액영업이익률'].head()

0    0.063145
1    0.128879
2    0.011485
3    0.062839
4    0.138134
Name: 매출액영업이익률, dtype: float64

### 4.2.3. 매출액경상이익률
- 경상활동의 효율성을 나타내며, 기업의 주된 영업활동뿐만 아니라 재무활동에서 발생한 경영성과까지 측정하는 지표
- 공식 : 법인세비용차감전순손익 / 매출액

In [25]:
df_copy1['매출액경상이익률'] = df['법인세비용차감전순손익'] / df['매출액']
df_copy1['매출액경상이익률'] = df_copy1['매출액경상이익률'].replace([np.inf, -np.inf], 0).fillna(0)
df_copy1['매출액경상이익률'].head()

0    0.474373
1    0.526796
2    0.704525
3    0.768272
4    0.017886
Name: 매출액경상이익률, dtype: float64

### 4.2.4. 매출액순이익률
- 기업의 재무상태와 경영성적을 간편하게 진단할 수 있는 재무비율 분석의 하나. 
- 매출액순이익률 = 당기순이익 / 매출액

In [26]:
df_copy1['매출액순이익률'] = df['당기순이익'] / df['매출액']
df_copy1['매출액순이익률'] = df_copy1['매출액순이익률'].replace([np.inf, -np.inf], 0).fillna(0)
df_copy1['매출액순이익률'].head()

0    0.378488
1    0.417853
2    0.574174
3    0.620836
4    0.017886
Name: 매출액순이익률, dtype: float64

### 4.2.5. 총자산영업이익률
- 총자산영업이익률 = 영업이익 / 총자산

In [27]:
df_copy1['총자산영업이익률'] = df['영업이익'] / df['자산총계']
df_copy1['총자산영업이익률'] = df_copy1['총자산영업이익률'].replace([np.inf, -np.inf], 0).fillna(0)
df_copy1['총자산영업이익률'].head()

0    0.011956
1    0.023212
2    0.001678
3    0.010102
4    0.007488
Name: 총자산영업이익률, dtype: float64

### 4.2.6. 자기자본영업이익률
- 자기자본영업이익률 = 영업이익 / 자기자본

In [28]:
df_copy1['자기자본영업이익률'] = df['영업이익'] / df['자본총계']
df_copy1['자기자본영업이익률'] = df_copy1['자기자본영업이익률'].replace([np.inf, -np.inf], 0).fillna(0)
df_copy1['자기자본영업이익률'].head()

0    0.022732
1    0.041073
2    0.002811
3    0.016182
4    0.023228
Name: 자기자본영업이익률, dtype: float64

### 4.2.7. 자기자본순이익률
- 투자된 자기자본의 효율적 이용도를 측정한다. 기업이 자본을 이용해서 얼마만큼의 이익을 나타냈는지 나타내는 지표.
- ROE가 높다는 것은 자기자본에 비해 그만큼 당기순이익을 많이 내 효율적인 영업활동을 했다는 뜻이 된다.
- 자기자본순이익률 = 당기순이익 / 자기자본

In [29]:
df_copy1['자기자본순이익률'] = df['당기순이익'] / df['자본총계']
df_copy1['자기자본순이익률'] = df_copy1['자기자본순이익률'].replace([np.inf, -np.inf], 0).fillna(0)
df_copy1['자기자본순이익률'].head()

0    0.136256
1    0.133168
2    0.140527
3    0.159876
4    0.003008
Name: 자기자본순이익률, dtype: float64

### 4.2.8. 금융비용부담률
- 자금차입에 따른 기업의 부담이 어느 정도인가를 나타내는 지표로서 이 비율이 낮을수록 재무구조가 건실한 것으로 본다. 
- 예를 들어 금융비용부담률이 10%인 기업이 있다면 이 회사는 1천원어치를 팔아 1백원을 대출금 등 차입금에 대한 대가로 지불하고 있다는 얘기다.
- 금융비용부담률 = 금융비용 / 매출액
- 재무데이터에 있는 정보 -> '금융비용대매출액비율(%) / 100' 으로 재계산함

In [30]:
df_copy1['금융비용부담률'] = df['금융비용대부채비율(%)'] / 100
df_copy1['금융비용부담률'] = df_copy1['금융비용부담률'].replace([np.inf, -np.inf], 0).fillna(0)
df_copy1['금융비용부담률'].head()

0    0.00000
1    0.00000
2    0.00000
3    0.00000
4    0.01682
Name: 금융비용부담률, dtype: float64

### 4.2.9. 수지비율
- 총수익에 대한 총비용의 비율로서, 총괄적인 비용관리가 수익과 대비하여 얼마나 효과적으로 관리되고 있는지 파악하는데 활용하는 지표
- 수지비율 = 총수익 / 총비용
- 재무데이터에 있는 정보 -> '수지비율(%) / 100' 으로 재계산함

In [31]:
df_copy1['수지비율'] = df['수지비율(%)'] / 100
df_copy1['수지비율'] = df_copy1['수지비율'].replace([np.inf, -np.inf], 0).fillna(0)
df_copy1['수지비율'].head()

0    0.66562
1    0.62600
2    0.58602
3    0.55228
4    0.98361
Name: 수지비율, dtype: float64

### 4.2.10. 사내유보 대 자기자본비율
- 사내유보 대 자기자본비율 = 유보액 / 자기자본

In [32]:
df_copy1['사내유보대자기자본비율'] = df['유보액'] / df['자본총계']
df_copy1['사내유보대자기자본비율'] = df_copy1['사내유보대자기자본비율'].replace([np.inf, -np.inf], 0).fillna(0)
df_copy1['사내유보대자기자본비율'].head()

0    0.092730
1    0.076087
2    0.085767
3    0.126201
4   -0.999150
Name: 사내유보대자기자본비율, dtype: float64

### 4.2.11. 총자산순이익률
- 기업이 보유중인 자산으로 얼마나 효율적으로 당기순이익을 창출하는지 알려주는 지표
- 산업에 따라 크게 달라지기 때문에 경쟁사와 비교를통해 높고 낮음을 판단해야함.
- 기업의 총 자산에서 얼마나 수익을 창출하는지 효율성을 계산하기 위한 가장 간단한 방법임
- 총자산순이익률 = 당기순이익 / 총자산

In [33]:
df_copy1['총자산순이익률'] = df['당기순이익'] / df['자산총계']
df_copy1['총자산순이익률'] = df_copy1['총자산순이익률'].replace([np.inf, -np.inf], 0).fillna(0)
df_copy1['총자산순이익률'].head()

0    0.071662
1    0.075258
2    0.083906
3    0.099801
4    0.000970
Name: 총자산순이익률, dtype: float64

## 4.3. 활동성
- 총자본회전율 : O (매출액 / 총자산)
- 자기자본회전율 : O (매출액 / 자본)
- 타인자본회전율 : O (매출액 / 부채)
- 유동자산회전율 : O (매출액 / 유동자산)
- 재고자산회전율 : O (매출액 / 재고자산)
- 당좌자산회전율 : O (매출액 / 당좌자산)
- 순운전자본회전율 : O (매출액 / 순운전자본) * 순운전자본 : 유동자산-유동부채
- 운전자본회전율 : O (매출액 / 운전자본) * 운전자본 : 재고자산 + 매출채권

### 4.3.1. 총자본회전율
- 기업소유의 총자본이 얼마나 효율적으로 운영되었는가를 측정하는 지표로 당해 매출액을 기록하기 위해 총자본을 얼마나 사용했는지를 나타낸다. 
- 총자산회전율이 높으면 유동자산·고정자산등이 효율적으로 이용되고 있다는 것을 뜻하며, 반대로 낮으면 과잉투자와 같은 비효율적인 투자를 하고 있다는 것을 의미한다.
- 총자본회전율 = 매출액 / 총자산

In [34]:
df_copy1['총자본회전율'] = df['매출액'] / df['자산총계']
df_copy1['총자본회전율'] = df_copy1['총자본회전율'].replace([np.inf, -np.inf], 0).fillna(0)
df_copy1['총자본회전율'].head()

0    0.189337
1    0.180107
2    0.146134
3    0.160753
4    0.054207
Name: 총자본회전율, dtype: float64

### 4.3.2. 자기자본회전율
- 이 비율에 의해 자기자본의 활용도를 측정·검토하여 그 활동성의 양호 여부를 판단하게 된다.
- 이 비율이 높을수록 자기자본의 이용도, 즉 자기자본의 활동성이 양호하다는 것을 나타낸다.
- 자기자본회전율이 높다는 것은 수익성 증대의 가능성이 높다는 것을 의미하기도 한다.
- 이 비율이 현저하게 높을 경우에는 외상매출의 과대현상 또는 자기자본의 과소현상이라는 재무 위험도 내포하고 있다.
- 자기자본회전율 = 매출액 / 자본

In [35]:
df_copy1['자기자본회전율'] = df['매출액'] / df['자본총계']
df_copy1['자기자본회전율'] = df_copy1['자기자본회전율'].replace([np.inf, -np.inf], 0).fillna(0)
df_copy1['자기자본회전율'].head()

0    0.360000
1    0.318695
2    0.244747
3    0.257518
4    0.168156
Name: 자기자본회전율, dtype: float64

### 4.3.3. 타인자본회전율
- 타인자본의 이용도(이용률)를 판단하는 비율이다.
- 타인자본의 능률을 나타내는 것이므로 높을수록 타인자본의 충분한 활용으로 양호한 상태를 나타내며 일정한 표준비율은 없다.
- 타인자본회전율 = 매출액 / 부채

In [36]:
df_copy1['타인자본회전율'] = df['매출액'] / df['부  채  총  계']
df_copy1['타인자본회전율'] = df_copy1['타인자본회전율'].replace([np.inf, -np.inf], 0).fillna(0)
df_copy1['타인자본회전율'].head()

0    0.399392
1    0.414173
2    0.362689
3    0.427809
4    0.079995
Name: 타인자본회전율, dtype: float64

### 4.3.4. 유동자산회전율
- 유동자산회전율 = 매출액 / 유동자산

In [37]:
df_copy1['유동자산회전율'] = df['매출액'] / df['유동자산']
df_copy1['유동자산회전율'] = df_copy1['유동자산회전율'].replace([np.inf, -np.inf], 0).fillna(0)
df_copy1['유동자산회전율'].head()

0    2.485064
1    3.016171
2    2.611196
3    5.241216
4    2.669910
Name: 유동자산회전율, dtype: float64

### 4.3.5. 재고자산회전율
- 일반적으로 이 비율이 높을수록 ①자본수익률이 높아지고 ②매입채무가 감소되며 ③상품의 재고손실을 막을 수 있고 ④보험료, 보관료를 절약할 수 있어 기업측에 유리하게 된다.
- 재고자산회전율 = 매출액 / 재고자산

In [38]:
df_copy1['재고자산회전율'] = df['매출액'] / df['재고자산']
df_copy1['재고자산회전율'] = df_copy1['재고자산회전율'].replace([np.inf, -np.inf], 0).fillna(0)
df_copy1['재고자산회전율'].head()

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: 재고자산회전율, dtype: float64

### 4.3.6. 당좌자산회전율
- 당좌자산회전율 = 매출액 / 당좌자산

In [39]:
df_copy1['당좌자산회전율'] = df['매출액'] / df['당좌자산']
df_copy1['당좌자산회전율'] = df_copy1['당좌자산회전율'].replace([np.inf, -np.inf], 0).fillna(0)
df_copy1['당좌자산회전율'].head()

0    2.485064
1    3.016171
2    2.611196
3    5.241216
4    2.669910
Name: 당좌자산회전율, dtype: float64

### 4.3.7. 순운전자본회전율
- 이 비율이 높을수록 기업의 현금화가 활발히 이루어지고 있다는 뜻이다.
- 순운전자본회전율 = 매출액 / 순운전자본

In [40]:
df_copy1['순운전자본회전율'] = df['매출액'] / df['순운전자본']
df_copy1['순운전자본회전율'] = df_copy1['순운전자본회전율'].replace([np.inf, -np.inf], 0).fillna(0)
df_copy1['순운전자본회전율'].head()

0    2.953185
1   -0.780149
2    4.210666
3   -0.740282
4   -0.498099
Name: 순운전자본회전율, dtype: float64

### 4.3.8. 운전자본회전율
- 운전자본회전율이 높다는 것은 매출을 발생시키기 위해 회사의 유동자산과 유동부채를 효율적으로 사용하고 있다는 것임. 운영을 위한 추가 자금의 필요성이 적다는 것을 의미함.
- 비율이 매우 높으면(일반적으로 80% 이상) 기업이 매출성장을 지원할 운전자본이 충분하지 않다는 것을 나타낼 수 있다 -> 파산가능성 상승
- 운전자본회전율 = 매출액 / 운전자본
- 운전자본 = 재고자산 + 매출채권

In [41]:
df_copy1['운전자본회전율'] = df['매출액'] / df['운전자본']
df_copy1['운전자본회전율'] = df_copy1['운전자본회전율'].replace([np.inf, -np.inf], 0).fillna(0)
df_copy1['운전자본회전율'].head()

0    52.069455
1    69.528437
2    75.475971
3    22.515871
4    66.256657
Name: 운전자본회전율, dtype: float64

## 4.4. 생산성
- 종업원 1인당 부가가치 : X
- 노동장비율 : ? (유형자산 / 종업원수)
- 기계장비율 : X
- 자본집약도 : ? (총자산 / 종업원수)
- 총자본투자효율 : X
- 설비투자효율 : X
- 1주당 매출액 : X
- 주당순이익 : X
- 주당현금흐름 : X
- 주당순자산가치 : X
- 유보율 : O (유보액 / 자본금)

### 4.4.1. 노동장비율
- 노동사용량에 대한 자본사용량의 비율로 나타내지만, 보다 구체적으로 말하면 상용근로자 1인당 자본설비액(유형고정자산)이다.
- 철강업이나 화학공업 등 중화학공업은 장비율이 높아 노동절약적 산업이며, 섬유공업 등 경공업은 장비율이 낮아 자본절약적 산업이다.
- 노동장비율 = 유형자산 / 종업원수

In [42]:
# df_copy1['노동장비율'] = df['유형자산'] / df['종업원수']
# df_copy1['노동장비율'] = df_copy1['노동장비율'].replace([np.inf, -np.inf], 0).fillna(0)
# df_copy1['노동장비율'].head()

### 4.4.2. 자본집약도
- 종업원 한 사람이 어느 정도의 자본액을 보유하고 있는가를 나타내는 지표로서 노동장비율의 보조지표로 이용된다.
- 자본집약도 = 총자산 / 종업원수

In [43]:
# df_copy1['자본집약도'] = df['자산총계'] / df['종업원수']
# df_copy1['자본집약도'] = df_copy1['자본집약도'].replace([np.inf, -np.inf], 0).fillna(0)
# df_copy1['자본집약도'].head()

### 4.4.3. 유보율
- 이는 기업이 동원할 수 있는 자금량을 측정하는 지표로, 사내유보의 정도를 나타낸다.
- 유보율이 높을수록 불황에 대한 적응력이 높고 무상증자 가능성도 높다.
- 부채비율과 함께 기업의 안전성을 측정하는 데 자주 활용되는데, 부채비율이 낮을수록 유보비율이 높을수록 기업의 안전성이 높다고 할 수 있다.
- 유보율 = 유보액 / 자본금

In [44]:
df_copy1['유보율'] = df['유보액'] / df['자본금']
df_copy1['유보율'] = df_copy1['유보율'].replace([np.inf, -np.inf], 0).fillna(0)
df_copy1['유보율'].head()

0      2.368106
1      2.165830
2      2.883408
3      4.185450
4   -128.787947
Name: 유보율, dtype: float64

## 4.5. 안정성
- 자기자본비율 : O (자기자본 / 자본총계)
- 유동비율 : O (유동자산 / 유동부채)
- 당좌비율 : O (당좌자산 / 유동부채)
- 현금비율 : X
- 재고자산 대 순운전자본비율 : O (재고자산 / 순운전자본)
- 매출채권 대 매입채무비율 : O (매출채권 / 매입채무)
- 이자보상비율(이자비용) : X
- Cash Flow 대 부채비율 : X
- Cash Flow 대 차입금비율 : X
- Cash Flow 대 총자본비율 : X
- Cash Flow 대 매출액비율 : X
- 기계투자효율 : X
- 부가가치율 : X
- 노동소득분배율 : X

### 4.5.1. 자기자본비율
- 기업 재무구조의 건전성을 나타내는 가장 대표적 지표이다.
- 자기자본은 직접적인 금융비용을 부담하지 않고 기업이 장기적으로 운용할 수 있는 안정된 자본이므로 이 비율이 높을수록 기업의 재무구조 건전하다고 할 수 있으며 일반적인 표준비율은 50% 이상으로 보고 있다.
- 자기자본비율 = 자기자본 / 자산총계

In [45]:
df_copy1['자기자본비율'] = df['자본총계'] / df['자산총계']
df_copy1['자기자본비율'] = df_copy1['자기자본비율'].replace([np.inf, -np.inf], 0).fillna(0)
df_copy1['자기자본비율'].head()

0    0.525937
1    0.565140
2    0.597082
3    0.624241
4    0.322364
Name: 자기자본비율, dtype: float64

### 4.5.2. 유동비율
- 회사의 지불능력을 판단하기 위해서 사용하는 분석지표로 유동부채의 몇 배의 유동자산을 가지고 있는가를 나타내며 이 비율이 높을수록 지불능력이 커진다.
- 200%가 이상적이며, 2대 1의 원칙이라고도 한다.
- 유동비율 = 유동자산 / 유동부채

In [46]:
df_copy1['유동비율'] = df['유동자산'] / df['유동부채']
df_copy1['유동비율'] = df_copy1['유동비율'].replace([np.inf, -np.inf], 0).fillna(0)
df_copy1['유동비율'].head()

0    6.308602
1    0.205501
2    2.632539
3    0.123762
4    0.157228
Name: 유동비율, dtype: float64

### 4.5.3. 당좌비율
- 이것은 기업의 지불능력의 대소를 보기 위한 것이며 그 비율이 100% 이상, 즉 당좌자산이 유동부채와 동액 또는 그 이상이 되는 것이 이상적이다.
- 당좌자산 / 유동부채

In [47]:
df_copy1['당좌비율'] = df['당좌자산'] / df['유동부채']
df_copy1['당좌비율'] = df_copy1['당좌비율'].replace([np.inf, -np.inf], 0).fillna(0)
df_copy1['당좌비율'].head()

0    6.308602
1    0.205501
2    2.632539
3    0.123762
4    0.157228
Name: 당좌비율, dtype: float64

### 4.5.4. 재고자산 대 순운전자본비율
- 재고자산 대 순운전자본비율 = 재고자산 / 순운전자본

In [48]:
df_copy1['재고자산대순운전자본비율'] = df['재고자산'] / df['순운전자본']
df_copy1['재고자산대순운전자본비율'] = df_copy1['재고자산대순운전자본비율'].replace([np.inf, -np.inf], 0).fillna(0)
df_copy1['재고자산대순운전자본비율'].head()

0    0.0
1   -0.0
2    0.0
3   -0.0
4   -0.0
Name: 재고자산대순운전자본비율, dtype: float64

### 4.5.5. 매출채권 대 매입채무비율
- 매출채권 대 매입채무비율 = 매출채권 / 매입채무

In [49]:
df_copy1['매출채권대매입채무비율'] = df['매출채권'] / df['매입채무']
df_copy1['매출채권대매입채무비율'] = df_copy1['매출채권대매입채무비율'].replace([np.inf, -np.inf], 0).fillna(0)
df_copy1['매출채권대매입채무비율'].head()

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: 매출채권대매입채무비율, dtype: float64

## 4.6. 데이터 저장

In [50]:
df_copy1

Unnamed: 0,사업자등록번호,결산년월,총자본증가율,영업이익증가율,당기순이익증가율,자기자본증가율,매출액증가율,매출총이익률,매출액영업이익률,매출액경상이익률,...,재고자산회전율,당좌자산회전율,순운전자본회전율,운전자본회전율,유보율,자기자본비율,유동비율,당좌비율,재고자산대순운전자본비율,매출채권대매입채무비율
0,1018100340,20181231,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.063145,0.474373,...,0.000000,2.485064,2.953185,52.069455,2.368106,0.525937,6.308602,6.308602,0.000000,0.000000
1,1018100340,20191231,0.037308,1.013936,0.089367,0.114629,-0.013259,1.000000,0.128879,0.526796,...,0.000000,3.016171,-0.780149,69.528437,2.165830,0.565140,0.205501,0.205501,-0.000000,0.000000
2,1018100340,20201231,0.117880,-0.919169,0.246335,0.181063,-0.092985,1.000000,0.011485,0.704525,...,0.000000,2.611196,4.210666,75.475971,2.883408,0.597082,2.632539,2.632539,0.000000,0.000000
3,1018100340,20211231,-0.056428,4.678983,0.122320,-0.013509,0.037967,1.000000,0.062839,0.768272,...,0.000000,5.241216,-0.740282,22.515871,4.185450,0.624241,0.123762,0.123762,-0.000000,0.000000
4,1018100772,20181231,0.000000,0.000000,0.000000,0.000000,0.000000,0.534425,0.138134,0.017886,...,0.000000,2.669910,-0.498099,66.256657,-128.787947,0.322364,0.157228,0.157228,-0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108749,8998800427,20211231,0.249737,-2.215573,-5.213883,-0.610733,0.334008,0.071243,-0.061465,-0.089610,...,6.836957,4.564818,-13.788293,3.813320,1.509107,0.068198,0.834363,0.500317,-2.016729,1.135625
108750,8998800785,20181231,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,-0.000000,0.000000,-0.096361,0.130335,0.034244,0.034244,-0.000000,0.000000
108751,8998800785,20191231,0.000000,-0.019334,-0.012369,-0.074572,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,-0.000000,0.000000,-0.170793,0.120615,0.127681,0.127681,-0.000000,0.000000
108752,8998800785,20201231,0.000000,-0.393148,-0.392873,-0.048923,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,-0.000000,0.000000,-0.215951,0.114715,0.925283,0.925283,-0.000000,0.000000


In [51]:
df_copy1.to_csv('./custom_data/basic_finance_ratio_data.csv', index=False, encoding='cp949')

# 5. 재무상태변동성
- 재무정보는 기업 내부 및 외부환경을 시계열로 반영하는 특성을 가지고 있기 때문에 시간 흐름에 따른 변화가 고려되어야 한다 (Wang et al., 2003). 
- 시계열 특성, 즉 경영성과와 재무상태의 변화를 반영하기 위해 당해시점과 이전 시점의 재무변동성을 반영한 재무상태변동성을 기존 재무변수에서 유도하여 생성하고자 한다(Abbasi et al., 2012).
- (당해 시점 재무비율-이전 시점 재무비율) 또는 (당해 시점 재무비율/이전 시점 재무비율)

In [52]:
# 데이터 불러오기
import pandas as pd
import numpy as np

df2 = pd.read_csv('./custom_data/basic_finance_ratio_data.csv', encoding='cp949')
df2.head(5)

Unnamed: 0,사업자등록번호,결산년월,총자본증가율,영업이익증가율,당기순이익증가율,자기자본증가율,매출액증가율,매출총이익률,매출액영업이익률,매출액경상이익률,...,재고자산회전율,당좌자산회전율,순운전자본회전율,운전자본회전율,유보율,자기자본비율,유동비율,당좌비율,재고자산대순운전자본비율,매출채권대매입채무비율
0,1018100340,20181231,0.0,0.0,0.0,0.0,0.0,1.0,0.063145,0.474373,...,0.0,2.485064,2.953185,52.069455,2.368106,0.525937,6.308602,6.308602,0.0,0.0
1,1018100340,20191231,0.037308,1.013936,0.089367,0.114629,-0.013259,1.0,0.128879,0.526796,...,0.0,3.016171,-0.780149,69.528437,2.16583,0.56514,0.205501,0.205501,-0.0,0.0
2,1018100340,20201231,0.11788,-0.919169,0.246335,0.181063,-0.092985,1.0,0.011485,0.704525,...,0.0,2.611196,4.210666,75.475971,2.883408,0.597082,2.632539,2.632539,0.0,0.0
3,1018100340,20211231,-0.056428,4.678983,0.12232,-0.013509,0.037967,1.0,0.062839,0.768272,...,0.0,5.241216,-0.740282,22.515871,4.18545,0.624241,0.123762,0.123762,-0.0,0.0
4,1018100772,20181231,0.0,0.0,0.0,0.0,0.0,0.534425,0.138134,0.017886,...,0.0,2.66991,-0.498099,66.256657,-128.787947,0.322364,0.157228,0.157228,-0.0,0.0


In [53]:
df_copy2 = df2.copy()
df_copy2 = df_copy2.iloc[:, :2]
df_copy2.head()

Unnamed: 0,사업자등록번호,결산년월
0,1018100340,20181231
1,1018100340,20191231
2,1018100340,20201231
3,1018100340,20211231
4,1018100772,20181231


In [54]:
df2.columns

Index(['사업자등록번호', '결산년월', '총자본증가율', '영업이익증가율', '당기순이익증가율', '자기자본증가율', '매출액증가율',
       '매출총이익률', '매출액영업이익률', '매출액경상이익률', '매출액순이익률', '총자산영업이익률', '자기자본영업이익률',
       '자기자본순이익률', '금융비용부담률', '수지비율', '사내유보대자기자본비율', '총자산순이익률', '총자본회전율',
       '자기자본회전율', '타인자본회전율', '유동자산회전율', '재고자산회전율', '당좌자산회전율', '순운전자본회전율',
       '운전자본회전율', '유보율', '자기자본비율', '유동비율', '당좌비율', '재고자산대순운전자본비율',
       '매출채권대매입채무비율'],
      dtype='object')

## 5.1. 차이( - )로 산출

In [55]:
for i in df2.columns[2:]:
    df_copy2[f'{i}(차이)'] = df2.groupby(['사업자등록번호'])[i].diff()
    df_copy2[f'{i}(차이)'] = df_copy2[f'{i}(차이)'].replace([np.inf, -np.inf], 0).fillna(0)

df_copy2.head()

Unnamed: 0,사업자등록번호,결산년월,총자본증가율(차이),영업이익증가율(차이),당기순이익증가율(차이),자기자본증가율(차이),매출액증가율(차이),매출총이익률(차이),매출액영업이익률(차이),매출액경상이익률(차이),...,재고자산회전율(차이),당좌자산회전율(차이),순운전자본회전율(차이),운전자본회전율(차이),유보율(차이),자기자본비율(차이),유동비율(차이),당좌비율(차이),재고자산대순운전자본비율(차이),매출채권대매입채무비율(차이)
0,1018100340,20181231,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1018100340,20191231,0.037308,1.013936,0.089367,0.114629,-0.013259,0.0,0.065734,0.052423,...,0.0,0.531107,-3.733334,17.458982,-0.202276,0.039203,-6.1031,-6.1031,-0.0,0.0
2,1018100340,20201231,0.080573,-1.933106,0.156967,0.066435,-0.079726,0.0,-0.117394,0.177729,...,0.0,-0.404975,4.990815,5.947534,0.717578,0.031942,2.427037,2.427037,0.0,0.0
3,1018100340,20211231,-0.174309,5.598153,-0.124014,-0.194572,0.130951,0.0,0.051354,0.063747,...,0.0,2.63002,-4.950948,-52.9601,1.302042,0.027159,-2.508777,-2.508777,-0.0,0.0
4,1018100772,20181231,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 5.2. 비율( / )로 산출

In [56]:
for i in df2.columns[2:]:
    df_copy2[f'{i}(비율)'] = df2.groupby(['사업자등록번호'])[i].pct_change()
    df_copy2[f'{i}(비율)'] = df_copy2[f'{i}(비율)'].replace([np.inf, -np.inf], 0).fillna(0)

df_copy2.head()

Unnamed: 0,사업자등록번호,결산년월,총자본증가율(차이),영업이익증가율(차이),당기순이익증가율(차이),자기자본증가율(차이),매출액증가율(차이),매출총이익률(차이),매출액영업이익률(차이),매출액경상이익률(차이),...,재고자산회전율(비율),당좌자산회전율(비율),순운전자본회전율(비율),운전자본회전율(비율),유보율(비율),자기자본비율(비율),유동비율(비율),당좌비율(비율),재고자산대순운전자본비율(비율),매출채권대매입채무비율(비율)
0,1018100340,20181231,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1018100340,20191231,0.037308,1.013936,0.089367,0.114629,-0.013259,0.0,0.065734,0.052423,...,0.0,0.213719,-1.264172,0.335302,-0.085417,0.07454,-0.967425,-0.967425,0.0,0.0
2,1018100340,20201231,0.080573,-1.933106,0.156967,0.066435,-0.079726,0.0,-0.117394,0.177729,...,0.0,-0.134268,-6.397258,0.085541,0.331318,0.05652,11.810318,11.810318,0.0,0.0
3,1018100340,20211231,-0.174309,5.598153,-0.124014,-0.194572,0.130951,0.0,0.051354,0.063747,...,0.0,1.007209,-1.175811,-0.701682,0.451564,0.045486,-0.952988,-0.952988,0.0,0.0
4,1018100772,20181231,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 5.3. 데이터 저장

In [57]:
df_copy2.to_csv('./custom_data/financial_position_volatility.csv', index=False, encoding='cp949')

# 6. 산업수준상태변동성
- 경쟁에서 뒤쳐진 부실기업들의 경영성과와 재무상태는 우량기업에 비해 상대적으로 저조할 뿐만 아니라, 산업의 평균과도 비교할 경우 차이가 있을 것임
- 데이터셋에서 산업코드를 기준으로 산업을 18개로 대분류하여 각각의 산업에서 총자산순이익률을 구함.
- 폐업한 기업과 이상치를 가진 기업을 제외한 상위 10개 기업의 총자산순이익률 평균과 해당 산업 전체 총자산순이익률 평균을 구하여 차이( - )와 비율 ( / )을 통해 산업수준상태변동성 변수를 생성함.

## 6.1. 데이터 불러오기

In [59]:
import pandas as pd
import numpy as np

temp1 = pd.read_csv('./custom_data/basic_finance_ratio_data.csv', encoding='cp949')
temp2 = pd.read_csv('./custom_data/feature_engineering_idurstry_code.csv', encoding='cp949')

print(temp1.shape)
print(temp2.shape)

(108754, 32)
(109142, 79)


In [60]:
# 사업자등록번호와 업종대분류 결합하기
temp2_split = temp2[['사업자등록번호', '업종대분류']]
temp2_split.drop_duplicates(inplace=True)
temp2_split

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp2_split.drop_duplicates(inplace=True)


Unnamed: 0,사업자등록번호,업종대분류
0,1018100340,부동산업
4,1018100772,숙박 및 음식점업
8,1018101126,부동산업
12,1018101242,부동산업
16,1018103819,부동산업
...,...,...
109125,8998700283,건설업
109127,8998701263,금융 및 보험업
109130,8998800150,"전문, 과학 및 기술 서비스업"
109134,8998800427,제조업


In [61]:
temp2_keys = temp2_split['사업자등록번호'].to_list()
temp2_values = temp2_split['업종대분류'].to_list()
class_dict = dict(zip(temp2_keys, temp2_values))
class_dict

{1018100340: '부동산업',
 1018100772: '숙박 및 음식점업',
 1018101126: '부동산업',
 1018101242: '부동산업',
 1018103819: '부동산업',
 1018104477: '예술, 스포츠 및 여가관련 서비스업',
 1018104991: '전문, 과학 및 기술 서비스업',
 1018106586: '도매 및 소매업',
 1018115921: '제조업',
 1018116478: '도매 및 소매업',
 1018117254: '부동산업',
 1018118781: '도매 및 소매업',
 1018119209: '숙박 및 음식점업',
 1018119252: '도매 및 소매업',
 1018119311: '도매 및 소매업',
 1018119742: '도매 및 소매업',
 1018121379: '도매 및 소매업',
 1018121627: '도매 및 소매업',
 1018121646: '부동산업',
 1018122626: '예술, 스포츠 및 여가관련 서비스업',
 1018122999: '제조업',
 1018124696: '도매 및 소매업',
 1018124906: '도매 및 소매업',
 1018124925: '도매 및 소매업',
 1018126734: '정보통신업',
 1018126767: '교육 서비스업',
 1018132686: '부동산업',
 1018133287: '부동산업',
 1018135117: '부동산업',
 1018135174: '도매 및 소매업',
 1018135422: '전문, 과학 및 기술 서비스업',
 1018135742: '도매 및 소매업',
 1018136659: '도매 및 소매업',
 1018138303: '도매 및 소매업',
 1018138657: '제조업',
 1018139184: '제조업',
 1018139656: '제조업',
 1018140003: '부동산업',
 1018143015: '제조업',
 1018144519: '도매 및 소매업',
 1018144673: '도매 및 소매업',
 10181451

In [62]:
temp3 = temp1[['사업자등록번호', '결산년월']]
temp3['업종대분류'] = temp3['사업자등록번호'].apply(lambda x: class_dict[x])
temp3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp3['업종대분류'] = temp3['사업자등록번호'].apply(lambda x: class_dict[x])


Unnamed: 0,사업자등록번호,결산년월,업종대분류
0,1018100340,20181231,부동산업
1,1018100340,20191231,부동산업
2,1018100340,20201231,부동산업
3,1018100340,20211231,부동산업
4,1018100772,20181231,숙박 및 음식점업
...,...,...,...
108749,8998800427,20211231,제조업
108750,8998800785,20181231,부동산업
108751,8998800785,20191231,부동산업
108752,8998800785,20201231,부동산업


## 6.2. 산업별 top10 기업 평균 총자산이익률

In [63]:
here_add = temp3[['사업자등록번호', '결산년월', '업종대분류']]
temp4 = pd.read_csv('./custom_data/mean_top10_per_year.csv', encoding='cp949')

display(here_add)
display(temp4)

Unnamed: 0,사업자등록번호,결산년월,업종대분류
0,1018100340,20181231,부동산업
1,1018100340,20191231,부동산업
2,1018100340,20201231,부동산업
3,1018100340,20211231,부동산업
4,1018100772,20181231,숙박 및 음식점업
...,...,...,...
108749,8998800427,20211231,제조업
108750,8998800785,20181231,부동산업
108751,8998800785,20191231,부동산업
108752,8998800785,20201231,부동산업


Unnamed: 0,년도,업종대분류,평균총자산이익률
0,2018,부동산업,0.885174
1,2018,숙박 및 음식점업,0.415017
2,2018,"예술, 스포츠 및 여가관련 서비스업",0.402173
3,2018,"전문, 과학 및 기술 서비스업",0.473636
4,2018,도매 및 소매업,0.998841
...,...,...,...
67,2021,"협회 및 단체, 수리 및 기타 개인 서비스업",0.235406
68,2021,"수도, 하수 및 폐기물 처리, 원료 재생업",0.768268
69,2021,보건업 및 사회복지 서비스업,0.032564
70,2021,"농업, 임업 및 어업",0.000000


In [64]:
a = dict(zip(temp4['업종대분류'][0:18], temp4['평균총자산이익률'][0:18]))
b = dict(zip(temp4['업종대분류'][18:36], temp4['평균총자산이익률'][18:36]))
c = dict(zip(temp4['업종대분류'][36:54], temp4['평균총자산이익률'][36:54]))
d = dict(zip(temp4['업종대분류'][54:72], temp4['평균총자산이익률'][54:72]))

top10_dict = dict(zip(['2018', '2019', '2020', '2021'], [a, b, c, d]))
top10_dict['2018']

{'부동산업': 0.8851737760404074,
 '숙박 및 음식점업': 0.415017484337433,
 '예술, 스포츠 및 여가관련 서비스업': 0.4021733675641959,
 '전문, 과학 및 기술 서비스업': 0.4736363260254195,
 '도매 및 소매업': 0.9988408377256484,
 '제조업': 0.9355202785839424,
 '정보통신업': 0.8037523298753987,
 '교육 서비스업': 0.3099333821532271,
 '운수 및 창고업': 0.444448340320904,
 '금융 및 보험업': 0.3522601440948336,
 '건설업': 0.604943852741984,
 '사업시설 관리, 사업 지원 및 임대 서비스업': 0.4280661581553598,
 '전기, 가스, 증기 및 공기 조절 공급업': 0.1391009841043759,
 '협회 및 단체, 수리 및 기타 개인 서비스업': 0.2295968837955324,
 '수도, 하수 및 폐기물 처리, 원료 재생업': 0.2795074844306607,
 '보건업 및 사회복지 서비스업': 0.0376239720656183,
 '농업, 임업 및 어업': 0.0,
 '공공 행정, 국방 및 사회보장 행정': 0.0}

In [65]:
here_add['결산년월'] = here_add['결산년월'].astype('str')

def check(x):
    return top10_dict[x.결산년월[:4]][x.업종대분류]

here_add['평균총자산이익률(top10)'] = here_add.apply(check, axis=1)

In [66]:
here_add.head(8)

Unnamed: 0,사업자등록번호,결산년월,업종대분류,평균총자산이익률(top10)
0,1018100340,20181231,부동산업,0.885174
1,1018100340,20191231,부동산업,0.758599
2,1018100340,20201231,부동산업,1.38944
3,1018100340,20211231,부동산업,7.878755
4,1018100772,20181231,숙박 및 음식점업,0.415017
5,1018100772,20191231,숙박 및 음식점업,0.438342
6,1018100772,20201231,숙박 및 음식점업,0.393467
7,1018100772,20211231,숙박 및 음식점업,0.458539


In [67]:
here_add.shape

(108754, 4)

## 6.3. 산업별 전체기업 평균 총자산순이익률

In [68]:
temp3_split = temp3.copy()
temp3_split['총자산순이익률'] = temp1['총자산순이익률']
temp3_split

Unnamed: 0,사업자등록번호,결산년월,업종대분류,총자산순이익률
0,1018100340,20181231,부동산업,0.071662
1,1018100340,20191231,부동산업,0.075258
2,1018100340,20201231,부동산업,0.083906
3,1018100340,20211231,부동산업,0.099801
4,1018100772,20181231,숙박 및 음식점업,0.000970
...,...,...,...,...
108749,8998800427,20211231,제조업,-0.119129
108750,8998800785,20181231,부동산업,-0.009841
108751,8998800785,20191231,부동산업,-0.009719
108752,8998800785,20201231,부동산업,-0.005901


In [69]:
# 년도, 업종대분류별 평균 총자산순이익률 구하기
temp3_split['결산년월'] = temp3_split['결산년월'].astype('str')
temp3_split['년도'] = temp3_split['결산년월'].apply(lambda x: x[:4])
display(temp3_split.head())

all_avg = temp3_split.groupby(['년도', '업종대분류'], as_index=False)['총자산순이익률'].mean()
display(all_avg)

Unnamed: 0,사업자등록번호,결산년월,업종대분류,총자산순이익률,년도
0,1018100340,20181231,부동산업,0.071662,2018
1,1018100340,20191231,부동산업,0.075258,2019
2,1018100340,20201231,부동산업,0.083906,2020
3,1018100340,20211231,부동산업,0.099801,2021
4,1018100772,20181231,숙박 및 음식점업,0.00097,2018


Unnamed: 0,년도,업종대분류,총자산순이익률
0,2018,건설업,0.029095
1,2018,"공공 행정, 국방 및 사회보장 행정",0.038710
2,2018,교육 서비스업,-0.016946
3,2018,금융 및 보험업,0.008393
4,2018,"농업, 임업 및 어업",0.032372
...,...,...,...
67,2021,"전기, 가스, 증기 및 공기 조절 공급업",0.007805
68,2021,"전문, 과학 및 기술 서비스업",0.011890
69,2021,정보통신업,-0.021374
70,2021,제조업,0.019827


In [70]:
a = dict(zip(all_avg['업종대분류'][0:18], all_avg['총자산순이익률'][0:18]))
b = dict(zip(all_avg['업종대분류'][18:36], all_avg['총자산순이익률'][18:36]))
c = dict(zip(all_avg['업종대분류'][36:54], all_avg['총자산순이익률'][36:54]))
d = dict(zip(all_avg['업종대분류'][54:72], all_avg['총자산순이익률'][54:72]))

all_dict = dict(zip(['2018', '2019', '2020', '2021'], [a, b, c, d]))
all_dict['2018']

{'건설업': 0.029094614463102006,
 '공공 행정, 국방 및 사회보장 행정': 0.03871024395421167,
 '교육 서비스업': -0.016946266300815527,
 '금융 및 보험업': 0.008393433188050333,
 '농업, 임업 및 어업': 0.0323723917019534,
 '도매 및 소매업': 0.03241928945116725,
 '보건업 및 사회복지 서비스업': 0.012824401958328195,
 '부동산업': 0.0010488796790737178,
 '사업시설 관리, 사업 지원 및 임대 서비스업': 0.0016968635755774265,
 '수도, 하수 및 폐기물 처리, 원료 재생업': 0.04535656456453273,
 '숙박 및 음식점업': -0.0448953896319436,
 '예술, 스포츠 및 여가관련 서비스업': -0.007024945905419508,
 '운수 및 창고업': -0.0033701143915265334,
 '전기, 가스, 증기 및 공기 조절 공급업': 0.0027060325291122737,
 '전문, 과학 및 기술 서비스업': -0.040782458195778304,
 '정보통신업': -0.17633472867572986,
 '제조업': 0.012005375620417575,
 '협회 및 단체, 수리 및 기타 개인 서비스업': 0.0029175976378275326}

In [71]:
here_add['결산년월'] = here_add['결산년월'].astype('str')

def check(x):
    return all_dict[x.결산년월[:4]][x.업종대분류]

here_add['평균총자산이익률(all)'] = here_add.apply(check, axis=1)
here_add

Unnamed: 0,사업자등록번호,결산년월,업종대분류,평균총자산이익률(top10),평균총자산이익률(all)
0,1018100340,20181231,부동산업,0.885174,0.001049
1,1018100340,20191231,부동산업,0.758599,-0.017979
2,1018100340,20201231,부동산업,1.389440,-0.001585
3,1018100340,20211231,부동산업,7.878755,0.026621
4,1018100772,20181231,숙박 및 음식점업,0.415017,-0.044895
...,...,...,...,...,...
108749,8998800427,20211231,제조업,1.078610,0.019827
108750,8998800785,20181231,부동산업,0.885174,0.001049
108751,8998800785,20191231,부동산업,0.758599,-0.017979
108752,8998800785,20201231,부동산업,1.389440,-0.001585


## 6.4. 데이터 저장(산업별 평균총자산이익률)

In [72]:
here_add.to_csv('./custom_data/mean_total_assets_per_indurstry.csv', index=False, encoding='cp949')

## 6.4. 파생변수생성

In [73]:
base_ratio = pd.read_csv('./custom_data/basic_finance_ratio_data.csv', encoding='cp949')
ROA_mean = pd.read_csv('./custom_data/mean_total_assets_per_indurstry.csv', encoding='cp949')
display(base_ratio.head())
display(ROA_mean.head())


Unnamed: 0,사업자등록번호,결산년월,총자본증가율,영업이익증가율,당기순이익증가율,자기자본증가율,매출액증가율,매출총이익률,매출액영업이익률,매출액경상이익률,...,재고자산회전율,당좌자산회전율,순운전자본회전율,운전자본회전율,유보율,자기자본비율,유동비율,당좌비율,재고자산대순운전자본비율,매출채권대매입채무비율
0,1018100340,20181231,0.0,0.0,0.0,0.0,0.0,1.0,0.063145,0.474373,...,0.0,2.485064,2.953185,52.069455,2.368106,0.525937,6.308602,6.308602,0.0,0.0
1,1018100340,20191231,0.037308,1.013936,0.089367,0.114629,-0.013259,1.0,0.128879,0.526796,...,0.0,3.016171,-0.780149,69.528437,2.16583,0.56514,0.205501,0.205501,-0.0,0.0
2,1018100340,20201231,0.11788,-0.919169,0.246335,0.181063,-0.092985,1.0,0.011485,0.704525,...,0.0,2.611196,4.210666,75.475971,2.883408,0.597082,2.632539,2.632539,0.0,0.0
3,1018100340,20211231,-0.056428,4.678983,0.12232,-0.013509,0.037967,1.0,0.062839,0.768272,...,0.0,5.241216,-0.740282,22.515871,4.18545,0.624241,0.123762,0.123762,-0.0,0.0
4,1018100772,20181231,0.0,0.0,0.0,0.0,0.0,0.534425,0.138134,0.017886,...,0.0,2.66991,-0.498099,66.256657,-128.787947,0.322364,0.157228,0.157228,-0.0,0.0


Unnamed: 0,사업자등록번호,결산년월,업종대분류,평균총자산이익률(top10),평균총자산이익률(all)
0,1018100340,20181231,부동산업,0.885174,0.001049
1,1018100340,20191231,부동산업,0.758599,-0.017979
2,1018100340,20201231,부동산업,1.38944,-0.001585
3,1018100340,20211231,부동산업,7.878755,0.026621
4,1018100772,20181231,숙박 및 음식점업,0.415017,-0.044895


In [74]:
ROA_mean['평균총자산이익률(top10)']

0         0.885174
1         0.758599
2         1.389440
3         7.878755
4         0.415017
            ...   
108749    1.078610
108750    0.885174
108751    0.758599
108752    1.389440
108753    7.878755
Name: 평균총자산이익률(top10), Length: 108754, dtype: float64

In [75]:
here_add2 = ROA_mean[['사업자등록번호', '결산년월']]
here_add2

Unnamed: 0,사업자등록번호,결산년월
0,1018100340,20181231
1,1018100340,20191231
2,1018100340,20201231
3,1018100340,20211231
4,1018100772,20181231
...,...,...
108749,8998800427,20211231
108750,8998800785,20181231
108751,8998800785,20191231
108752,8998800785,20201231


In [76]:
for i in base_ratio.columns[2:]:
    here_add2[f'{i}(top10_차이)'] = base_ratio[i] - ROA_mean['평균총자산이익률(top10)']
    here_add2[f'{i}(top10_차이)'] = here_add2[f'{i}(top10_차이)'].replace([np.inf, -np.inf], 0).fillna(0)

    here_add2[f'{i}(top10_비율)'] = base_ratio[i] / ROA_mean['평균총자산이익률(top10)']
    here_add2[f'{i}(top10_비율)'] = here_add2[f'{i}(top10_비율)'].replace([np.inf, -np.inf], 0).fillna(0)

    here_add2[f'{i}(all_차이)'] = base_ratio[i] - ROA_mean['평균총자산이익률(top10)']
    here_add2[f'{i}(all_차이)'] = here_add2[f'{i}(all_차이)'].replace([np.inf, -np.inf], 0).fillna(0)

    here_add2[f'{i}(all_비율)'] = base_ratio[i] / ROA_mean['평균총자산이익률(top10)']
    here_add2[f'{i}(all_비율)'] = here_add2[f'{i}(all_비율)'].replace([np.inf, -np.inf], 0).fillna(0)

here_add2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  here_add2[f'{i}(top10_차이)'] = base_ratio[i] - ROA_mean['평균총자산이익률(top10)']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  here_add2[f'{i}(top10_차이)'] = here_add2[f'{i}(top10_차이)'].replace([np.inf, -np.inf], 0).fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  here_add2[f'{i}(top10_비율)'] = bas

Unnamed: 0,사업자등록번호,결산년월,총자본증가율(top10_차이),총자본증가율(top10_비율),총자본증가율(all_차이),총자본증가율(all_비율),영업이익증가율(top10_차이),영업이익증가율(top10_비율),영업이익증가율(all_차이),영업이익증가율(all_비율),...,당좌비율(all_차이),당좌비율(all_비율),재고자산대순운전자본비율(top10_차이),재고자산대순운전자본비율(top10_비율),재고자산대순운전자본비율(all_차이),재고자산대순운전자본비율(all_비율),매출채권대매입채무비율(top10_차이),매출채권대매입채무비율(top10_비율),매출채권대매입채무비율(all_차이),매출채권대매입채무비율(all_비율)
0,1018100340,20181231,-0.885174,0.000000,-0.885174,0.000000,-0.885174,0.000000,-0.885174,0.000000,...,5.423428,7.126964,-0.885174,0.000000,-0.885174,0.000000,-0.885174,0.00000,-0.885174,0.00000
1,1018100340,20191231,-0.721291,0.049180,-0.721291,0.049180,0.255337,1.336591,0.255337,1.336591,...,-0.553097,0.270896,-0.758599,-0.000000,-0.758599,-0.000000,-0.758599,0.00000,-0.758599,0.00000
2,1018100340,20201231,-1.271559,0.084840,-1.271559,0.084840,-2.308609,-0.661540,-2.308609,-0.661540,...,1.243099,1.894676,-1.389440,0.000000,-1.389440,0.000000,-1.389440,0.00000,-1.389440,0.00000
3,1018100340,20211231,-7.935183,-0.007162,-7.935183,-0.007162,-3.199771,0.593873,-3.199771,0.593873,...,-7.754993,0.015708,-7.878755,-0.000000,-7.878755,-0.000000,-7.878755,0.00000,-7.878755,0.00000
4,1018100772,20181231,-0.415017,0.000000,-0.415017,0.000000,-0.415017,0.000000,-0.415017,0.000000,...,-0.257790,0.378846,-0.415017,-0.000000,-0.415017,-0.000000,-0.415017,0.00000,-0.415017,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108749,8998800427,20211231,-0.828872,0.231536,-0.828872,0.231536,-3.294183,-2.054101,-3.294183,-2.054101,...,-0.578293,0.463854,-3.095339,-1.869749,-3.095339,-1.869749,0.057015,1.05286,0.057015,1.05286
108750,8998800785,20181231,-0.885174,0.000000,-0.885174,0.000000,-0.885174,0.000000,-0.885174,0.000000,...,-0.850930,0.038686,-0.885174,-0.000000,-0.885174,-0.000000,-0.885174,0.00000,-0.885174,0.00000
108751,8998800785,20191231,-0.758599,0.000000,-0.758599,0.000000,-0.777933,-0.025486,-0.777933,-0.025486,...,-0.630918,0.168311,-0.758599,-0.000000,-0.758599,-0.000000,-0.758599,0.00000,-0.758599,0.00000
108752,8998800785,20201231,-1.389440,0.000000,-1.389440,0.000000,-1.782588,-0.282955,-1.782588,-0.282955,...,-0.464157,0.665939,-1.389440,-0.000000,-1.389440,-0.000000,-1.389440,0.00000,-1.389440,0.00000


## 6.5. 데이터 저장

In [77]:
here_add2.to_csv('./custom_data/variability_industrial_level.csv', index=False, encoding='cp949')

# 7. 마무리
- 변화율의 경우 기준이 되는 2018년의 값이 모두 0이므로 2018년은 분석에 사용하지 않음 -> 삭제
- 윈저라이징(winsorizing) : 이상치를 다른 데이터로 대체하는 방법
- 상위 99%를 초과하는 데이터는 99% 값으로 대체하고, 하위 1% 미만의 데이터는 1% 데이터로 대체함

## 7.1. 데이터 합치기

In [78]:
a = pd.read_csv('./custom_data/basic_finance_ratio_data.csv', encoding='cp949')
b = pd.read_csv('./custom_data/financial_position_volatility.csv', encoding='cp949')
c = pd.read_csv('./custom_data/variability_industrial_level.csv', encoding='cp949')
final_data = a.copy()

a = a.iloc[:, 2:]
b = b.iloc[:, 2:]
c = c.iloc[:, 2:]

In [79]:
display(a.head())
display(b.head())
display(c.head())

Unnamed: 0,총자본증가율,영업이익증가율,당기순이익증가율,자기자본증가율,매출액증가율,매출총이익률,매출액영업이익률,매출액경상이익률,매출액순이익률,총자산영업이익률,...,재고자산회전율,당좌자산회전율,순운전자본회전율,운전자본회전율,유보율,자기자본비율,유동비율,당좌비율,재고자산대순운전자본비율,매출채권대매입채무비율
0,0.0,0.0,0.0,0.0,0.0,1.0,0.063145,0.474373,0.378488,0.011956,...,0.0,2.485064,2.953185,52.069455,2.368106,0.525937,6.308602,6.308602,0.0,0.0
1,0.037308,1.013936,0.089367,0.114629,-0.013259,1.0,0.128879,0.526796,0.417853,0.023212,...,0.0,3.016171,-0.780149,69.528437,2.16583,0.56514,0.205501,0.205501,-0.0,0.0
2,0.11788,-0.919169,0.246335,0.181063,-0.092985,1.0,0.011485,0.704525,0.574174,0.001678,...,0.0,2.611196,4.210666,75.475971,2.883408,0.597082,2.632539,2.632539,0.0,0.0
3,-0.056428,4.678983,0.12232,-0.013509,0.037967,1.0,0.062839,0.768272,0.620836,0.010102,...,0.0,5.241216,-0.740282,22.515871,4.18545,0.624241,0.123762,0.123762,-0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.534425,0.138134,0.017886,0.017886,0.007488,...,0.0,2.66991,-0.498099,66.256657,-128.787947,0.322364,0.157228,0.157228,-0.0,0.0


Unnamed: 0,총자본증가율(차이),영업이익증가율(차이),당기순이익증가율(차이),자기자본증가율(차이),매출액증가율(차이),매출총이익률(차이),매출액영업이익률(차이),매출액경상이익률(차이),매출액순이익률(차이),총자산영업이익률(차이),...,재고자산회전율(비율),당좌자산회전율(비율),순운전자본회전율(비율),운전자본회전율(비율),유보율(비율),자기자본비율(비율),유동비율(비율),당좌비율(비율),재고자산대순운전자본비율(비율),매출채권대매입채무비율(비율)
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.037308,1.013936,0.089367,0.114629,-0.013259,0.0,0.065734,0.052423,0.039365,0.011256,...,0.0,0.213719,-1.264172,0.335302,-0.085417,0.07454,-0.967425,-0.967425,0.0,0.0
2,0.080573,-1.933106,0.156967,0.066435,-0.079726,0.0,-0.117394,0.177729,0.156321,-0.021534,...,0.0,-0.134268,-6.397258,0.085541,0.331318,0.05652,11.810318,11.810318,0.0,0.0
3,-0.174309,5.598153,-0.124014,-0.194572,0.130951,0.0,0.051354,0.063747,0.046662,0.008423,...,0.0,1.007209,-1.175811,-0.701682,0.451564,0.045486,-0.952988,-0.952988,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,총자본증가율(top10_차이),총자본증가율(top10_비율),총자본증가율(all_차이),총자본증가율(all_비율),영업이익증가율(top10_차이),영업이익증가율(top10_비율),영업이익증가율(all_차이),영업이익증가율(all_비율),당기순이익증가율(top10_차이),당기순이익증가율(top10_비율),...,당좌비율(all_차이),당좌비율(all_비율),재고자산대순운전자본비율(top10_차이),재고자산대순운전자본비율(top10_비율),재고자산대순운전자본비율(all_차이),재고자산대순운전자본비율(all_비율),매출채권대매입채무비율(top10_차이),매출채권대매입채무비율(top10_비율),매출채권대매입채무비율(all_차이),매출채권대매입채무비율(all_비율)
0,-0.885174,0.0,-0.885174,0.0,-0.885174,0.0,-0.885174,0.0,-0.885174,0.0,...,5.423428,7.126964,-0.885174,0.0,-0.885174,0.0,-0.885174,0.0,-0.885174,0.0
1,-0.721291,0.04918,-0.721291,0.04918,0.255337,1.336591,0.255337,1.336591,-0.669232,0.117806,...,-0.553097,0.270896,-0.758599,-0.0,-0.758599,-0.0,-0.758599,0.0,-0.758599,0.0
2,-1.271559,0.08484,-1.271559,0.08484,-2.308609,-0.66154,-2.308609,-0.66154,-1.143105,0.177291,...,1.243099,1.894676,-1.38944,0.0,-1.38944,0.0,-1.38944,0.0,-1.38944,0.0
3,-7.935183,-0.007162,-7.935183,-0.007162,-3.199771,0.593873,-3.199771,0.593873,-7.756434,0.015525,...,-7.754993,0.015708,-7.878755,-0.0,-7.878755,-0.0,-7.878755,0.0,-7.878755,0.0
4,-0.415017,0.0,-0.415017,0.0,-0.415017,0.0,-0.415017,0.0,-0.415017,0.0,...,-0.25779,0.378846,-0.415017,-0.0,-0.415017,-0.0,-0.415017,0.0,-0.415017,0.0


In [80]:
final_data = final_data[['사업자등록번호', '결산년월']]
final_data

Unnamed: 0,사업자등록번호,결산년월
0,1018100340,20181231
1,1018100340,20191231
2,1018100340,20201231
3,1018100340,20211231
4,1018100772,20181231
...,...,...
108749,8998800427,20211231
108750,8998800785,20181231
108751,8998800785,20191231
108752,8998800785,20201231


In [81]:
final_data = pd.concat([final_data, a], axis=1)
final_data = pd.concat([final_data, b], axis=1)
final_data = pd.concat([final_data, c], axis=1)
final_data

Unnamed: 0,사업자등록번호,결산년월,총자본증가율,영업이익증가율,당기순이익증가율,자기자본증가율,매출액증가율,매출총이익률,매출액영업이익률,매출액경상이익률,...,당좌비율(all_차이),당좌비율(all_비율),재고자산대순운전자본비율(top10_차이),재고자산대순운전자본비율(top10_비율),재고자산대순운전자본비율(all_차이),재고자산대순운전자본비율(all_비율),매출채권대매입채무비율(top10_차이),매출채권대매입채무비율(top10_비율),매출채권대매입채무비율(all_차이),매출채권대매입채무비율(all_비율)
0,1018100340,20181231,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.063145,0.474373,...,5.423428,7.126964,-0.885174,0.000000,-0.885174,0.000000,-0.885174,0.00000,-0.885174,0.00000
1,1018100340,20191231,0.037308,1.013936,0.089367,0.114629,-0.013259,1.000000,0.128879,0.526796,...,-0.553097,0.270896,-0.758599,-0.000000,-0.758599,-0.000000,-0.758599,0.00000,-0.758599,0.00000
2,1018100340,20201231,0.117880,-0.919169,0.246335,0.181063,-0.092985,1.000000,0.011485,0.704525,...,1.243099,1.894676,-1.389440,0.000000,-1.389440,0.000000,-1.389440,0.00000,-1.389440,0.00000
3,1018100340,20211231,-0.056428,4.678983,0.122320,-0.013509,0.037967,1.000000,0.062839,0.768272,...,-7.754993,0.015708,-7.878755,-0.000000,-7.878755,-0.000000,-7.878755,0.00000,-7.878755,0.00000
4,1018100772,20181231,0.000000,0.000000,0.000000,0.000000,0.000000,0.534425,0.138134,0.017886,...,-0.257790,0.378846,-0.415017,-0.000000,-0.415017,-0.000000,-0.415017,0.00000,-0.415017,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108749,8998800427,20211231,0.249737,-2.215573,-5.213883,-0.610733,0.334008,0.071243,-0.061465,-0.089610,...,-0.578293,0.463854,-3.095339,-1.869749,-3.095339,-1.869749,0.057015,1.05286,0.057015,1.05286
108750,8998800785,20181231,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,-0.850930,0.038686,-0.885174,-0.000000,-0.885174,-0.000000,-0.885174,0.00000,-0.885174,0.00000
108751,8998800785,20191231,0.000000,-0.019334,-0.012369,-0.074572,0.000000,0.000000,0.000000,0.000000,...,-0.630918,0.168311,-0.758599,-0.000000,-0.758599,-0.000000,-0.758599,0.00000,-0.758599,0.00000
108752,8998800785,20201231,0.000000,-0.393148,-0.392873,-0.048923,0.000000,0.000000,0.000000,0.000000,...,-0.464157,0.665939,-1.389440,-0.000000,-1.389440,-0.000000,-1.389440,0.00000,-1.389440,0.00000


## 7.2. 2018년 제거

In [82]:
final_data['결산년월'] = final_data['결산년월'].astype('str')
final_data = final_data[~final_data['결산년월'].str.contains('2018')]

In [83]:
final_data.reset_index(drop=True, inplace=True)

In [84]:
final_data

Unnamed: 0,사업자등록번호,결산년월,총자본증가율,영업이익증가율,당기순이익증가율,자기자본증가율,매출액증가율,매출총이익률,매출액영업이익률,매출액경상이익률,...,당좌비율(all_차이),당좌비율(all_비율),재고자산대순운전자본비율(top10_차이),재고자산대순운전자본비율(top10_비율),재고자산대순운전자본비율(all_차이),재고자산대순운전자본비율(all_비율),매출채권대매입채무비율(top10_차이),매출채권대매입채무비율(top10_비율),매출채권대매입채무비율(all_차이),매출채권대매입채무비율(all_비율)
0,1018100340,20191231,0.037308,1.013936,0.089367,0.114629,-0.013259,1.000000,0.128879,0.526796,...,-0.553097,0.270896,-0.758599,-0.000000,-0.758599,-0.000000,-0.758599,0.000000,-0.758599,0.000000
1,1018100340,20201231,0.117880,-0.919169,0.246335,0.181063,-0.092985,1.000000,0.011485,0.704525,...,1.243099,1.894676,-1.389440,0.000000,-1.389440,0.000000,-1.389440,0.000000,-1.389440,0.000000
2,1018100340,20211231,-0.056428,4.678983,0.122320,-0.013509,0.037967,1.000000,0.062839,0.768272,...,-7.754993,0.015708,-7.878755,-0.000000,-7.878755,-0.000000,-7.878755,0.000000,-7.878755,0.000000
3,1018100772,20191231,0.028081,0.541345,-0.427849,0.001721,0.020793,0.549552,0.208576,0.010025,...,-0.302263,0.310441,-0.438342,-0.000000,-0.438342,-0.000000,-0.438342,0.000000,-0.438342,0.000000
4,1018100772,20201231,0.008062,-3.774965,-63.241980,-0.106922,-0.424458,-0.045240,-1.005644,-1.084150,...,-0.214421,0.455047,-0.393467,-0.000000,-0.393467,-0.000000,-0.393467,0.000000,-0.393467,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78057,8998800427,20201231,0.149454,0.278016,-0.076174,0.192417,0.188646,0.150174,0.067453,0.032470,...,-0.258801,0.729231,1.287514,2.347057,1.287514,2.347057,0.335617,1.351139,0.335617,1.351139
78058,8998800427,20211231,0.249737,-2.215573,-5.213883,-0.610733,0.334008,0.071243,-0.061465,-0.089610,...,-0.578293,0.463854,-3.095339,-1.869749,-3.095339,-1.869749,0.057015,1.052860,0.057015,1.052860
78059,8998800785,20191231,0.000000,-0.019334,-0.012369,-0.074572,0.000000,0.000000,0.000000,0.000000,...,-0.630918,0.168311,-0.758599,-0.000000,-0.758599,-0.000000,-0.758599,0.000000,-0.758599,0.000000
78060,8998800785,20201231,0.000000,-0.393148,-0.392873,-0.048923,0.000000,0.000000,0.000000,0.000000,...,-0.464157,0.665939,-1.389440,-0.000000,-1.389440,-0.000000,-1.389440,0.000000,-1.389440,0.000000


## 7.3. 데이터 저장

In [85]:
final_data.to_csv('./custom_data/all_finance_data.csv', index=False, encoding='cp949')