In [None]:
# !pip install OpenDartReader

In [3]:
import os
import time

import OpenDartReader
import pandas as pd

from financial_data_making import financial_dict
# Load the ESG / finance table; the two code columns are read as strings
# instead of being parsed as numbers.
df = pd.read_csv('esg_finance_data.csv', dtype={'corp_code': 'str', 'stock_code': 'str'})

# Years are assigned purely by row position: the frame is treated as equal-sized
# consecutive blocks, one per reporting year (847 rows each in the original run).
# The block size is derived from the frame instead of being hard-coded.
# NOTE(review): this relies on the CSV rows being ordered year-block by year-block — confirm.
REPORT_YEARS = (2020, 2021, 2022)
rows_per_year, leftover_rows = divmod(len(df), len(REPORT_YEARS))
if leftover_rows:
    raise ValueError(
        f'{len(df)} rows cannot be split evenly across {len(REPORT_YEARS)} reporting years'
    )
df['year'] = [year for year in REPORT_YEARS for _ in range(rows_per_year)]
df['year'] = df['year'].astype('int')

print(df.dtypes)
print(df.shape)
print(len(df.corp_code.unique()))  # 847 in the original run
df.head(3)

corp_name     object
esg           object
fin_state     object
corp_code     object
stock_code    object
year           int32
dtype: object
(2541, 6)
847


Unnamed: 0,corp_name,esg,fin_state,corp_code,stock_code,year
0,기아,A+,60490443000000,106641,270,2020
1,S-Oil,A+,15690510000000,138279,10950,2020
2,풀무원,A+,1693573176270,155355,17810,2020


In [None]:
# Prefer a key supplied through the DART_API_KEY environment variable. The literal
# is kept only as a fallback so that existing runs behave exactly as before.
# NOTE(review): this key is committed in source — rotate it and drop the fallback.
api_key = os.environ.get('DART_API_KEY', '1ee244d6168bde153cb2d45463d71139f567d1f4')
dart = OpenDartReader(api_key)

# Revenue
# Income statement
def sales_docu(code, year):
    """Gather revenue, dividend, ownership, board and head-count data for one company-year.

    Every figure is fetched through the module-level ``dart`` client.

    Args:
        code: Company identifier passed straight to ``dart.finstate`` / ``dart.report``.
        year: Reporting year passed straight to the same calls.

    Returns:
        dict with the keys
        ``sales`` / ``pre_sales``: arrays of the current- and prior-term amounts of the
            consolidated '매출액' rows (arrays, not scalars — flagged as open
            problems 1 and 2 in the original notes),
        ``cash_div`` / ``pre_cash_civ``: first current- and prior-term value of the
            '(연결)현금배당성향(%)' row,
        ``stakeholder``: the full matching rows (2-D array) where the share kind is
            '보통주' and the name is '계' (flagged as open problem 3),
        ``num_outexecutives`` / ``num_executives``: number of rows whose position is
            '사외이사' and the total number of executive rows,
        ``num_employee``: the ``rgllbr_co`` column as a list.

    Raises:
        IndexError: if the dividend table has no '(연결)현금배당성향(%)' row.
        Any error raised by the ``dart`` calls or by a missing column propagates.
    """
    statements = dart.finstate(code, year)
    consolidated_revenue = (statements['fs_nm'] == '연결재무제표') & (statements['account_nm'] == '매출액')
    revenue_rows = statements[consolidated_revenue]
    current_sales = revenue_rows['thstrm_amount'].values  # revenue, current term (problem 1)
    previous_sales = revenue_rows['frmtrm_amount'].values  # revenue, term t-1 (problem 2)

    # (Consolidated) cash dividend payout ratio (%): dividends / net income.
    # Original note: the ratio is 0% when net income is negative.
    dividends = dart.report(code, '배당', year)
    payout_rows = dividends[dividends['se'] == '(연결)현금배당성향(%)']
    current_payout = payout_rows['thstrm'].values[0]
    previous_payout = payout_rows['frmtrm'].values[0]

    # Original note: intended as common-share total + preferred-share total (%);
    # only the common-share total row is selected here (problem 3).
    shareholders = dart.report(code, '최대주주', year)
    common_total = (shareholders['stock_knd'] == '보통주') & (shareholders['nm'] == '계')
    ratio_stakeholder = shareholders[common_total].values

    # Outside directors versus all executives.
    executives = dart.report(code, '임원', year)
    outside_count = len(executives[executives['ofcps'] == '사외이사'])
    executive_count = len(executives)

    # Original note: total number of employees.
    headcount = dart.report(code, '직원', year)['rgllbr_co'].tolist()

    return {
        'sales': current_sales,
        'pre_sales': previous_sales,
        'cash_div': current_payout,
        'pre_cash_civ': previous_payout,
        'stakeholder': ratio_stakeholder,
        'num_outexecutives': outside_count,
        'num_executives': executive_count,
        'num_employee': headcount,
    }

In [None]:
# Data collection (1)
# Call sales_docu for every (stock_code, year) row of df and remember which
# row positions succeeded and which failed.

data = []
good_index = []
bad_index = []
failure_reasons = {}  # row position -> repr of the exception, for later diagnosis

for index, (code, year) in enumerate(zip(df['stock_code'], df['year'])):
    try:
        data.append(sales_docu(code, year))
    except Exception as err:
        # Best-effort collection: one company failing must not stop the run.
        # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt working.
        bad_index.append(index)
        failure_reasons[index] = repr(err)
    else:
        good_index.append(index)
print(len(good_index), len(bad_index))

# Write the first-column value of each succeeded / failed row, one per line.
# The `with` block closes the file, so no explicit close() is needed.
with open('s_docu.txt', 'w', encoding='utf-8') as f:
    f.writelines(f'{x}\n' for x in df.iloc[good_index, 0].values)

with open('f_docu.txt', 'w', encoding='utf-8') as f:
    f.writelines(f'{x}\n' for x in df.iloc[bad_index, 0].values)

In [4]:
# Prefer a key supplied through the DART_API_KEY environment variable. The literal
# is kept only as a fallback so that existing runs behave exactly as before.
# NOTE(review): this key is committed in source — rotate it and drop the fallback.
api_key = os.environ.get('DART_API_KEY', '1ee244d6168bde153cb2d45463d71139f567d1f4')
dart = OpenDartReader(api_key)

def report_docu(code, year):
    """Extract key balance-sheet and income-statement rows from one annual report.

    Looks up the company's filings through the module-level ``dart`` client, picks
    the first filing whose name contains both '사업보고서' and ``year``, opens its
    '연결재무제표' sub-document and parses the HTML tables with ``pd.read_html``.
    Table [1] is treated as the statement of financial position and table [3] as
    the income statement.

    Args:
        code: Company identifier passed to ``dart.list``.
        year: Report year; matched as a substring of the filing name.

    Returns:
        DataFrame with the rows for total assets ('자산총계'), total liabilities
        ('부채총계'), intangible assets ('무형자산'), operating profit (labels
        containing '영업이익') and pre-tax profit (labels containing
        '법인세비용차감전'), concatenated in that order with a fresh index.

    Raises:
        ValueError: if fewer than four tables were parsed, or if table [1] or
            table [3] is empty. (Previously the empty case printed a message and
            then failed with UnboundLocalError at the return statement.)
        IndexError: if no filing or no '연결재무제표' sub-document matches.
    """
    # Search every filing from 2019 through 2024.
    lst = dart.list(code, start='2019', end='2024')

    # Pick the annual report for the requested year.
    is_annual_report = lst['report_nm'].str.contains('사업보고서')
    is_requested_year = lst['report_nm'].str.contains(str(year))
    # NOTE(review): column position 5 is presumably the receipt number (rcept_no) — confirm.
    report_num = lst[is_annual_report & is_requested_year].iloc[0, 5]

    url = dart.sub_docs(report_num)  # sub-documents of that report
    # NOTE(review): column position 1 is presumably the sub-document URL — confirm.
    html = url[url['title'].str.contains('연결재무제표')].iloc[0, 1]

    docu = pd.read_html(html)
    if len(docu) < 4:
        raise ValueError(
            f'expected at least 4 tables for {code} ({year}), found {len(docu)}'
        )
    if docu[1].empty or docu[3].empty:  # [1] financial position, [3] income statement
        print('Some document cannot be extracted')
        raise ValueError(f'empty financial statement table for {code} ({year})')

    fin_state = docu[1].rename(columns={'Unnamed: 0': 'tag'})  # statement of financial position

    asset = fin_state[fin_state['tag'] == '자산총계']  # total assets
    debt = fin_state[fin_state['tag'] == '부채총계']  # total liabilities
    intangible = fin_state[fin_state['tag'] == '무형자산']  # intangible assets

    income = docu[3].rename(columns={'Unnamed: 0': 'tag'})  # income statement

    # na=False: a blank (NaN) label cannot match, and without it a NaN in the
    # mask makes the boolean indexing below raise.
    profit = income[income['tag'].str.contains('영업이익', na=False)]  # operating profit (loss)
    profit_tax = income[income['tag'].str.contains('법인세비용차감전', na=False)]  # pre-tax profit (loss)

    return pd.concat([asset, debt, intangible, profit, profit_tax], axis=0).reset_index(drop=True)

In [None]:
# Data collection (2-1)
# Fix the report year at 2022 to collect the 2022, 2021 and 2020 figures.

good_index_re = []
bad_index_re = []

# Uses the unique stock codes (847 companies).
# About 0.5 s per file, so roughly 425 s (about 7 minutes) expected.
# The ./dataset directory must already exist; otherwise to_csv fails and the
# company is counted as bad.

for index, code in enumerate(df['stock_code'].unique()):
    try:
        # One CSV per company under ./dataset, named after the stock code.
        report_docu(code, 2022).to_csv(f'./dataset/{code}_{str(2022)}.csv', encoding='utf8')
    except Exception:
        # Best-effort: keep going, but let Ctrl-C / kernel interrupt through.
        print(f'{index}, bad...')
        bad_index_re.append(index)
    else:
        # Recorded only after the file was written, and in the *good* list
        # (previously successes were appended to bad_index_re before the work ran).
        print(f'{index}, Good!')
        good_index_re.append(index)

print(len(good_index_re), len(bad_index_re))

In [None]:
# Data collection (2-2)
# The 2020 annual report also makes the 2019 figures available.
# The ./dataset directory must already exist; otherwise to_csv fails and the
# company is counted as bad.

good_index_re = []
bad_index_re = []

for index, code in enumerate(df['stock_code'].unique()):
    try:
        # One CSV per company under ./dataset, named after the stock code.
        report_docu(code, 2020).to_csv(f'./dataset/{code}_{str(2020)}.csv', encoding='utf8')
    except Exception:
        # Best-effort: keep going, but let Ctrl-C / kernel interrupt through.
        print(f'{index}, bad...')
        bad_index_re.append(index)
    else:
        # Recorded only after the file was written, and in the *good* list
        # (previously successes were appended to bad_index_re before the work ran).
        print(f'{index}, Good!')
        good_index_re.append(index)

print(len(good_index_re), len(bad_index_re))

In [None]:
# succees_report_docu = df.iloc[good_index_re, 0]
# fail_report_docu = df.iloc[bad_index_re, 0]

# # data downloading
# with open ('s_report.txt', 'w') as f:
#     for x in df.iloc[good_index_re, 0].values:
#         f.write(f'{x}\n')
#     f.close()
    
# with open ('f_report.txt', 'w') as f:
#     for x in df.iloc[bad_index_re, 0].values:
#         f.write(f'{x}\n')
#     f.close()
    
# import pickle
# with open('data_report.pickle', 'wb') as f:
#     # for  data in data_re
#     pickle.dump(data_re, f)