In [38]:
import pandas as pd
import numpy as np
import os
import requests
import json
from tqdm import tqdm

# 1. Collect the latest 10-k file dates for all the companies

In [39]:
cik = []
sic = []
sic_description = []
latest_filedate = []
directory = '../data/submissions'
for filename in tqdm(os.listdir(directory)):
    if re.match(r'^CIK\d{10}.json$', filename):
        file_path = os.path.join(directory, filename)
        with open(file_path, 'r') as file:
            submission = json.load(file)
            for i in range(len(submission['filings']['recent']['form'])):
                if submission['filings']['recent']['form'][i] == '10-K':
                    cik.append(submission['cik'])
                    sic.append(submission['sic'])
                    sic_description.append(submission['sicDescription'])
                    latest_filedate.append(submission['filings']['recent']['filingDate'][i])
                    break 

100%|█████████████████████████████████| 874902/874902 [05:10<00:00, 2817.81it/s]


In [40]:
latest10k = pd.DataFrame({
    'cik': cik,
    'sic': sic,
    'sic_description': sic_description,
    'latest_filedate': latest_filedate
})
latest10k

Unnamed: 0,cik,sic,sic_description,latest_filedate
0,743870,6500,Real Estate,2017-03-17
1,1324721,6189,Asset-Backed Securities,2006-03-30
2,1277021,3911,"Jewelry, Precious Metal",2010-04-01
3,815097,4400,Water Transportation,2024-01-26
4,1466739,7361,Services-Employment Agencies,2015-09-15
...,...,...,...,...
38112,1873441,6770,Blank Checks,2023-04-17
38113,1004990,5912,Retail-Drug Stores and Proprietary Stores,2002-08-22
38114,80984,3531,Construction Machinery & Equip,2002-04-01
38115,1276998,6189,Asset-Backed Securities,2005-03-28


## 1.1. Filtering onbankruptcy Data

In [41]:
latest10k['latest_filedate'] = pd.to_datetime(latest10k['latest_filedate'])
latest10k["cik"] = latest10k["cik"].astype(str).str.zfill(10)

# select companies with 10-k files in 2023 or later
nonbank_list = latest10k[latest10k['latest_filedate'].dt.year >= 2023]["cik"]

In [43]:
# Extract all the 10-k files for those companies
directory = "../data/submissions"
cik_list = []
sic_list = []
sic_description_list = []
filedate_list = []
accession_num_list = []
primary_doc_list = []

for cik in tqdm(nonbank_list):
    filename = "CIK" + cik + ".json"
    file_path = os.path.join(directory, filename)
    with open(file_path, 'r') as file:
        submission = json.load(file)
        for i in range(len(submission['filings']['recent']['form'])):
            if submission['filings']['recent']['form'][i] == '10-K':
                cik_list.append(submission['cik'])
                sic_list.append(submission['sic'])
                sic_description_list.append(submission['sicDescription'])
                filedate_list.append(submission['filings']['recent']['filingDate'][i])
                accession_num_list.append(submission['filings']['recent']['accessionNumber'][i])
                primary_doc_list.append(submission['filings']['recent']['primaryDocument'][i])

100%|█████████████████████████████████████| 7287/7287 [00:06<00:00, 1107.43it/s]


In [44]:
nonbank = pd.DataFrame({
    'cik': cik_list,
    'sic': sic_list,
    'sic_description': sic_description_list,
    'filedate': filedate_list,
    'accession_num': accession_num_list,
    'primary_doc': primary_doc_list
}, dtype = str)
nonbank['filelink'] = 'https://www.sec.gov/Archives/edgar/data/' +\
    nonbank['cik'] + '/' + nonbank['accession_num'].str.replace('-', '') + '/' + nonbank['primary_doc']

In [45]:
nonbank.to_csv('../data/nonbankrupt.csv')

## Bankruptcy data

In [47]:
# Load BRD data
brd_data = pd.read_csv("../data/florida_ucla_brd.csv", encoding='unicode_escape')
# select items with non-empty cik and 10k file links.
filtered_data = brd_data.dropna(subset=['CikBefore', 'Date10kBefore'])
bankrupt_data = filtered_data[['CikBefore', 'Date10kBefore']]
bankrupt_data['CikBefore'] = bankrupt_data['CikBefore'].astype('int')
brd_data['Date10kBefore'] = pd.to_datetime(brd_data['Date10kBefore'])
brd_data_2009 = brd_data[brd_data['Date10kBefore'].dt.year >= 2009]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bankrupt_data['CikBefore'] = bankrupt_data['CikBefore'].astype('int')


In [48]:
latest10k_2009 = latest10k[latest10k["latest_filedate"].dt.year >= 2009]
latest10k_2009['cik'] = latest10k_2009['cik'].astype('int64')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  latest10k_2009['cik'] = latest10k_2009['cik'].astype('int64')


In [53]:
result = pd.merge(brd_data_2009, latest10k_2009, left_on = 'CikBefore', right_on = 'cik', how = 'inner')

In [54]:
accession_num_list = []
primary_doc_list = []
directory = '../data/submissions'
for i in range(result.shape[0]):
    filename = "CIK" + result['cik'].astype(str).str.zfill(10)[i] + ".json"
    file_path = os.path.join(directory, filename)
    with open(file_path, 'r') as file:
        submission = json.load(file)
        for j in range(len(submission['filings']['recent']['form'])):
            if submission['filings']['recent']['form'][j] == '10-K' and submission['filings']['recent']['filingDate'][j] == result['latest_filedate'].astype(str)[i]:
                accession_num_list.append(submission['filings']['recent']['accessionNumber'][j])
                primary_doc_list.append(submission['filings']['recent']['primaryDocument'][j])
                break

bankruptcy_data = pd.DataFrame({
    'cik': result['cik'],
    'sic': result['sic'],
    'sic_description': result['sic_description'],
    'filedate': result['latest_filedate'],
    'accession_num': accession_num_list,
    'primary_doc': primary_doc_list
}, dtype = str)

bankruptcy_data['filelink'] = 'https://www.sec.gov/Archives/edgar/data/' + bankruptcy_data['cik'] + '/' + bankruptcy_data['accession_num'].str.replace('-', '') + '/' + bankruptcy_data['primary_doc']
bankruptcy_data = bankruptcy_data.reset_index(drop = True)
bankruptcy_data

Unnamed: 0,cik,sic,sic_description,filedate,accession_num,primary_doc,filelink
0,1503518,8011,Services-Offices & Clinics of Doctors of Med...,2016-08-23,0001047469-16-015101,a2229524z10-k.htm,https://www.sec.gov/Archives/edgar/data/150351...
1,18172,5051,Wholesale-Metals Service Centers & of fices,2021-03-04,0001558370-21-002392,tmb-20201231x10k.htm,https://www.sec.gov/Archives/edgar/data/18172/...
2,1167178,3690,"Miscellaneous Electrical Machinery, Equipment ...",2012-03-12,0001047469-12-002475,a2207978z10-k.htm,https://www.sec.gov/Archives/edgar/data/116717...
3,1606180,8093,"Services-Specialty Outpatient Facilities, NEC",2019-04-15,0001564590-19-011552,aac-10k_20181231.htm,https://www.sec.gov/Archives/edgar/data/160618...
4,2034,5122,"Wholesale-Drugs, Proprietaries & Druggists' Su...",2018-09-28,0001144204-18-051414,tv501271_10k.htm,https://www.sec.gov/Archives/edgar/data/2034/0...
...,...,...,...,...,...,...,...
315,106455,1221,Bituminous Coal & Lignite Surface Mining,2018-04-02,0000106455-18-000028,wlb201710-k.htm,https://www.sec.gov/Archives/edgar/data/106455...
316,1255474,1311,Crude Petroleum & Natural Gas,2022-02-23,0001255474-22-000008,wll-20211231x10k.htm,https://www.sec.gov/Archives/edgar/data/125547...
317,1095996,1531,Operative Builders,2019-02-28,0001628280-19-002280,wlh-12312018x10k.htm,https://www.sec.gov/Archives/edgar/data/109599...
318,1282266,4813,Telephone Communications (No Radiotelephone),2020-05-19,0001282266-20-000022,a201910k.htm,https://www.sec.gov/Archives/edgar/data/128226...


In [55]:
bankruptcy_data.to_csv('../data/bankruptc.csv')