In [None]:
import OpenDartReader
import pandas as pd 
import os
import glob

# Load the ESG / finance panel. Identifier columns are read as strings so that
# leading zeros in corp_code / stock_code are preserved.
df = pd.read_csv('esg_finance_data.csv', dtype={'corp_code': 'str', 'stock_code': 'str'})

# The year column is assigned positionally: the file is assumed to hold one
# equally sized block of rows per year, in this order (the original cell
# hard-coded 847 rows per block) -- TODO confirm against the data source.
years = (2020, 2021, 2022)
rows_per_year, leftover = divmod(len(df), len(years))
if leftover:
    raise ValueError(
        f'Expected the row count to be a multiple of {len(years)}, got {len(df)} rows'
    )
df['year'] = [year for year in years for _ in range(rows_per_year)]
df['year'] = df['year'].astype('int')
print(df.dtypes)
print(df.shape, len(df.corp_code.unique()))  # 847 unique companies in the recorded run
df.head(3)

In [None]:
# Consolidated financial statement data

# The OpenDART API key is read from the environment rather than being written
# into the notebook, so the credential is not leaked when the notebook is
# shared or committed. A key that was previously committed should be revoked.
api_key = os.environ.get('DART_API_KEY')
if not api_key:
    raise RuntimeError('Set the DART_API_KEY environment variable to your OpenDART API key.')
dart = OpenDartReader(api_key)

def report_docu(code, year):
    """Fetch consolidated balance-sheet and income-statement rows for one company.

    Uses the module-level ``dart`` client to list the company's filings between
    2019 and 2024, picks the first annual report ('사업보고서') whose name
    contains ``year``, opens the sub-document whose title contains
    '연결재무제표' (consolidated financial statements) and parses its HTML tables.

    Parameters
    ----------
    code : str
        Company identifier passed to ``dart.list`` (callers in this notebook
        pass stock codes).
    year : int or str
        Year that must appear in the report name.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``fin_state``: rows of table 1 (balance sheet) whose ``tag`` contains
        '자산' or '부채'.
        ``income``: rows of table 3 (income statement) whose ``tag`` contains
        '이익'.

    Raises
    ------
    ValueError
        If table 1 or table 3 is missing or empty. The original code only
        printed a message and then failed on unbound local variables.
    IndexError
        If no matching report or sub-document exists.
    """
    filings = dart.list(code, start='2019', end='2024')  # every filing from 2019 to 2024

    # Pick the annual report for the requested year.
    report_names = filings['report_nm']
    is_target = report_names.str.contains('사업보고서') & report_names.str.contains(str(year))
    # Column 5 is presumably the receipt number (rcept_no) -- TODO confirm.
    report_num = filings[is_target].iloc[0, 5]

    sub_docs = dart.sub_docs(report_num)  # sub-documents of that report
    # Column 1 is presumably the sub-document URL -- TODO confirm.
    html = sub_docs[sub_docs['title'].str.contains('연결재무제표')].iloc[0, 1]

    docu = pd.read_html(html)
    # Table layout assumed by this function: [1] balance sheet, [3] income statement.
    if len(docu) < 4 or docu[1].empty or docu[3].empty:
        raise ValueError('Some document cannot be extracted')

    fin_state = docu[1].rename(columns={'Unnamed: 0': 'tag'})
    # na=False: rows with a missing tag are dropped instead of breaking the boolean mask.
    keep_fin = (fin_state['tag'].str.contains('자산', na=False)
                | fin_state['tag'].str.contains('부채', na=False))
    fin_state = fin_state[keep_fin]

    income = docu[3].rename(columns={'Unnamed: 0': 'tag'})
    income = income[income['tag'].str.contains('이익', na=False)]

    # Rows of interest downstream (per the original notes): '자산총계', '부채총계',
    # '무형자산' in fin_state; '영업이익', '법인세비용차감전' in income.
    return fin_state, income

In [None]:
# The OpenDART API key is read from the environment rather than being written
# into the notebook, so the credential is not leaked when the notebook is
# shared or committed. A key that was previously committed should be revoked.
api_key = os.environ.get('DART_API_KEY')
if not api_key:
    raise RuntimeError('Set the DART_API_KEY environment variable to your OpenDART API key.')
dart = OpenDartReader(api_key)

def missed_report_docu_fix(code, year):
    """Locate the consolidated statements by their title tables.

    Uses the module-level ``dart`` client to find the annual report
    ('사업보고서') whose name contains ``year`` and parses the tables of its
    '연결재무제표' sub-document. A table whose first cell is exactly
    '연결 재무상태표' or '연결 손익계산서' is treated as a title, and the table that
    follows it is taken as the statement itself.

    Parameters
    ----------
    code : str
        Company identifier passed to ``dart.list`` (callers in this notebook
        pass stock codes).
    year : int or str
        Year that must appear in the report name.

    Returns
    -------
    tuple
        ``(fin_state, income)``. Each element is a filtered DataFrame, or
        ``None`` when the corresponding title table was not found.

    Raises
    ------
    IndexError
        If no matching report or sub-document exists.
    """
    filings = dart.list(code, start='2019', end='2024')  # every filing from 2019 to 2024

    # Pick the annual report for the requested year.
    report_names = filings['report_nm']
    is_target = report_names.str.contains('사업보고서') & report_names.str.contains(str(year))
    # Column 5 is presumably the receipt number (rcept_no) -- TODO confirm.
    report_num = filings[is_target].iloc[0, 5]

    sub_docs = dart.sub_docs(report_num)  # sub-documents of that report
    # Column 1 is presumably the sub-document URL -- TODO confirm.
    html = sub_docs[sub_docs['title'].str.contains('연결재무제표')].iloc[0, 1]

    docu = pd.read_html(html)
    fin_state = None
    income = None
    last_index = len(docu) - 1
    for index, content in enumerate(docu):
        # An empty table has no first cell, and the last table has no successor;
        # neither can be a usable title table, so skip them instead of raising.
        if content.empty or index == last_index:
            continue

        title = content.iloc[0, 0]
        if title == '연결 재무상태표':
            # Balance sheet: keep asset ('자산') and liability ('부채') rows.
            fin_state = docu[index + 1].rename(columns={'Unnamed: 0': 'tag'})
            # na=False: rows with a missing tag are dropped instead of breaking the mask.
            keep_fin = (fin_state['tag'].str.contains('자산', na=False)
                        | fin_state['tag'].str.contains('부채', na=False))
            fin_state = fin_state[keep_fin]
        elif title == '연결 손익계산서':
            # Income statement: keep profit ('이익') rows.
            income = docu[index + 1].rename(columns={'Unnamed: 0': 'tag'})
            income = income[income['tag'].str.contains('이익', na=False)]

    return fin_state, income

### 2022 재무상태표 & 손익계산서

In [None]:
good_index, bad_index = [], []

# Create the output folders up front; otherwise every to_csv call fails and the
# failure is only visible as a "bad" entry.
os.makedirs('./2022/fin_2022', exist_ok=True)
os.makedirs('./2022/inc_2022', exist_ok=True)

for index, code in enumerate(df['stock_code'].unique()):
    try:
        fin, inc = report_docu(code, 2022)
        fin.to_csv(f'./2022/fin_2022/{code}_{str(2022)}.csv', encoding='utf8')
        inc.to_csv(f'./2022/inc_2022/{code}_{str(2022)}.csv', encoding='utf8')
    except Exception as exc:
        # "except Exception" (not a bare except) so Ctrl-C still stops the run;
        # the reason is printed so failures can be diagnosed later.
        print(f'{index}, bad... ({code}: {type(exc).__name__}: {exc})')
        bad_index.append(index)
    else:
        print(f'{index}, Good!')
        good_index.append(index)

print(len(good_index), len(bad_index))

In [None]:
# Check for missing data  # dataset_2022

file_path1 = './2022/fin_2022'

# Files are named '<stock_code>_<year>.csv', so the stock code is the part of
# the base name before the first underscore.
codes = []
for f in glob.glob(os.path.join(file_path1, '*.csv')):
    code = os.path.basename(f).split('_')[0]
    codes.append(code)

saved_codes = set(codes)  # set lookup instead of scanning the list per company
missed_code = [code for code in list(df['stock_code'].unique()) if code not in saved_codes]

print(len(codes), len(missed_code))  # 185 companies were missing in the recorded run

df1 = pd.DataFrame(missed_code, columns=['stock_code'])
df2 = pd.merge(df1, df[['stock_code', 'corp_name']], on='stock_code', how='left')
df2 = df2.drop_duplicates(['stock_code'])

good_index, bad_index = [], []

# Create the output folders up front so a missing directory is not silently
# counted as a scraping failure.
os.makedirs('./2022/fin_2022', exist_ok=True)
os.makedirs('./2022/inc_2022', exist_ok=True)

for index, code in enumerate(df2['stock_code'].unique()):  # companies missed by the first pass
    try:
        fin, inc = missed_report_docu_fix(code, 2022)
        # Check both statements before writing anything: otherwise the balance
        # sheet is saved, the income statement write fails, and the company
        # later looks "saved" even though half of its data is missing.
        if fin is None or inc is None:
            raise ValueError('balance sheet or income statement not found')
        fin.to_csv(f'./2022/fin_2022/{code}_{str(2022)}.csv', encoding='utf8')
        inc.to_csv(f'./2022/inc_2022/{code}_{str(2022)}.csv', encoding='utf8')
    except Exception as exc:
        # "except Exception" (not a bare except) so Ctrl-C still stops the run.
        print(f'{index}, bad... ({code}: {type(exc).__name__}: {exc})')
        bad_index.append(index)
    else:
        good_index.append(index)

print(len(good_index), len(bad_index))

In [54]:
# check

# Collect the stock codes that have a saved balance-sheet file. Files are
# named '<6-digit stock code>_<year>.csv'. os.path.basename is used instead of
# splitting on '\\', which only worked with Windows path separators.
entire_codes = []
for f in glob.glob(os.path.join('./2022/fin_2022', '*.csv')):
    code = os.path.basename(f)[:6]
    entire_codes.append(code)

saved_codes = set(entire_codes)  # set lookup instead of scanning the list per company
code_missing = [code for code in list(df['stock_code'].unique()) if code not in saved_codes]

print('저장된 회사 수: ', len(entire_codes))  # recorded run: 690 of 847 saved
print('누락 회사 수: ', len(code_missing))  # recorded run: 157 missing

# Save the still-missing companies (code + name) for manual follow-up.
code_missing.sort()
miss_df_2022 = pd.merge(pd.DataFrame(code_missing, columns=['stock_code']), df[['stock_code', 'corp_name']].drop_duplicates(),
                        on='stock_code', how='left')
miss_df_2022.to_csv('./df_2022_miss.csv', encoding='utf8')

저장된 회사 수:  690
누락 회사 수:  157


### 2020 재무상태표 & 손익계산서

In [None]:
good_index, bad_index = [], []

# Create the output folders up front; otherwise every to_csv call fails and the
# failure is only visible as a "bad" entry.
os.makedirs('./2020/fin_2020', exist_ok=True)
os.makedirs('./2020/inc_2020', exist_ok=True)

for index, code in enumerate(df['stock_code'].unique()):
    try:
        fin, inc = report_docu(code, 2020)
        fin.to_csv(f'./2020/fin_2020/{code}_{str(2020)}.csv', encoding='utf8')
        inc.to_csv(f'./2020/inc_2020/{code}_{str(2020)}.csv', encoding='utf8')
    except Exception as exc:
        # "except Exception" (not a bare except) so Ctrl-C still stops the run;
        # the reason is printed so failures can be diagnosed later.
        print(f'{index}, bad... ({code}: {type(exc).__name__}: {exc})')
        bad_index.append(index)
    else:
        print(f'{index}, Good!')
        good_index.append(index)

print(len(good_index), len(bad_index))

In [None]:
# Check for missing data  # dataset_2020
file_path2 = './2020/fin_2020'

# Files are named '<stock_code>_<year>.csv', so the stock code is the part of
# the base name before the first underscore.
codes = []
for f in glob.glob(os.path.join(file_path2, '*.csv')):
    code = os.path.basename(f).split('_')[0]
    codes.append(code)

saved_codes = set(codes)  # set lookup instead of scanning the list per company
missed_code = [code for code in list(df['stock_code'].unique()) if code not in saved_codes]

print(len(codes), len(missed_code))  # saved files vs. companies still missing after the first pass

df1 = pd.DataFrame(missed_code, columns=['stock_code'])
df2 = pd.merge(df1, df[['stock_code', 'corp_name']], on='stock_code', how='left')
df2 = df2.drop_duplicates(['stock_code'])

good_index, bad_index = [], []

# Create the output folders up front so a missing directory is not silently
# counted as a scraping failure.
os.makedirs('./2020/fin_2020', exist_ok=True)
os.makedirs('./2020/inc_2020', exist_ok=True)

for index, code in enumerate(df2['stock_code'].unique()):  # companies missed by the first pass
    try:
        fin, inc = missed_report_docu_fix(code, 2020)
        # Check both statements before writing anything: otherwise the balance
        # sheet is saved, the income statement write fails, and the company
        # later looks "saved" even though half of its data is missing.
        if fin is None or inc is None:
            raise ValueError('balance sheet or income statement not found')
        fin.to_csv(f'./2020/fin_2020/{code}_{str(2020)}.csv', encoding='utf8')
        inc.to_csv(f'./2020/inc_2020/{code}_{str(2020)}.csv', encoding='utf8')
    except Exception as exc:
        # "except Exception" (not a bare except) so Ctrl-C still stops the run.
        print(f'{index}, bad... ({code}: {type(exc).__name__}: {exc})')
        bad_index.append(index)
    else:
        print(f'{index}, Good!')
        good_index.append(index)

print(len(good_index), len(bad_index))

In [55]:
# check
# Collect the stock codes that have a saved balance-sheet file. Files are
# named '<6-digit stock code>_<year>.csv'. os.path.basename is used instead of
# splitting on '\\', which only worked with Windows path separators.
entire_codes = []
for f in glob.glob(os.path.join('./2020/fin_2020', '*.csv')):
    code = os.path.basename(f)[:6]
    entire_codes.append(code)

saved_codes = set(entire_codes)  # set lookup instead of scanning the list per company
code_missing = [code for code in list(df['stock_code'].unique()) if code not in saved_codes]

print('저장된 회사 수: ', len(entire_codes))  # recorded run: 670 of 847 saved
print('누락 회사 수: ', len(code_missing))  # recorded run: 177 of 847 missing

# Save the still-missing companies (code + name) for manual follow-up.
code_missing.sort()
miss_df_2020 = pd.merge(pd.DataFrame(code_missing, columns=['stock_code']), df[['stock_code', 'corp_name']].drop_duplicates(),
                        on='stock_code', how='left')
miss_df_2020.to_csv('./df_2020_miss.csv', encoding='utf8')

저장된 회사 수:  670
누락 회사 수:  177
