In [34]:
import zipfile

import pandas as pd
import numpy as np
import os
import json
from tqdm import tqdm

# Load bankruptcy and non-bankruptcy data 

In [None]:
bank = pd.read_csv("../data/bankrupt.csv", index_col=0)
bank["bank_status"] = 1
nonbank = pd.read_csv("../data/nonbankrupt.csv", index_col=0)
nonbank["bank_status"] = 0
all_company = pd.concat([bank, nonbank], ignore_index=True)
all_company["cik"] = all_company["cik"].astype(str).str.zfill(10)

# Collect financial terms

In [None]:
# find financial number from the companfacts folder 
# Path to your large ZIP file
zip_file_path = '../data/companyfacts.zip'
# Open the ZIP file in read mode
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_contents = set(zip_ref.namelist())
    for value in tqdm(all_company["cik"]):
        # Extract a specific file
        specific_file = 'CIK' + value + '.json'
        if specific_file in zip_contents:
            zip_ref.extract(specific_file, '../data/all_company_data')

def collect(company_json_file, finantial_terms, cik, company_facts):
    tmp = pd.DataFrame(columns=["filed_date"] + finantial_terms)
    for term in finantial_terms:
        if term in company_json_file['facts']['us-gaap']:
            units = company_json_file['facts']['us-gaap'][term].get("units", {}).get("USD", [])
            for unit in units:
                if unit["form"] == "10-K":
                    filed_date = unit.get("filed")
                    if filed_date in tmp["filed_date"].values:
                        tmp.loc[tmp["filed_date"] == filed_date, term] = unit.get("val")
                    else:
                        tmp = pd.concat([tmp, pd.DataFrame({"filed_date": [filed_date], term: [unit.get("val")]})])
    tmp["cik"] = cik
    company_facts = pd.concat([company_facts, tmp])
    return company_facts

def collect_company_facts():
    finantial_terms = ['Assets', 'AssetsCurrent', 'Liabilities', 'LiabilitiesCurrent', 'StockholdersEquity', 'InterestExpense',\
        'SalesRevenueNet', 'AccountsReceivableNet', 'CostOfGoodsAndServicesSold', 'ProfitLoss', 'EarningsPerShareBasic']
    folder_path = '../data/all_company_data'
    company_facts = pd.DataFrame(columns=["cik", "filed_date"] + finantial_terms)
    
    for file_name in tqdm(os.listdir(folder_path)):
        json_file_path = os.path.join(folder_path, file_name)
        with open(json_file_path, 'r') as json_file:
            company_json_file = json.load(json_file)
            if 'us-gaap' not in company_json_file['facts']:
                continue
            cik = company_json_file.get("cik", "Unknown")
            company_facts = collect(company_json_file, finantial_terms, cik, company_facts)
    
    return company_facts

company_facts = collect_company_facts()
company_facts["cik"] = company_facts["cik"].astype(str).str.zfill(10)

# Merge financial terms and compan

In [None]:
merged_data = pd.merge(company_facts, all_company, left_on=['cik', 'filed_date'], right_on=['cik', 'filedate'], how='inner')

merged_data.to_csv("../data/company_facts_and_filelinks.csv")

In [36]:
merged_data = pd.read_csv("../data/company_facts_and_filelinks.csv", index_col=0)
merged_data

Unnamed: 0,cik,filed_date,Assets,AssetsCurrent,Liabilities,LiabilitiesCurrent,StockholdersEquity,InterestExpense,SalesRevenueNet,AccountsReceivableNet,CostOfGoodsAndServicesSold,ProfitLoss,EarningsPerShareBasic,sic,sic_description,filedate,accession_num,primary_doc,filelink,bank_status
0,815097,2010-01-29,3.683500e+10,1.518000e+09,,4.967000e+09,2.203500e+10,380000000.0,,,,,,4400.0,Water Transportation,2010-01-29,0001193125-10-016470,d10k.htm,https://www.sec.gov/Archives/edgar/data/815097...,0
1,815097,2011-01-31,3.749000e+10,1.244000e+09,,5.755000e+09,2.303100e+10,378000000.0,,,,1.978000e+09,,4400.0,Water Transportation,2011-01-31,0001193125-11-018320,d10k.htm,https://www.sec.gov/Archives/edgar/data/815097...,0
2,815097,2012-01-30,3.863700e+10,1.312000e+09,,6.105000e+09,2.383200e+10,365000000.0,,,,,,4400.0,Water Transportation,2012-01-30,0001193125-12-028375,d254914d10k.htm,https://www.sec.gov/Archives/edgar/data/815097...,0
3,815097,2013-01-29,3.916100e+10,1.821000e+09,,7.340000e+09,2.392900e+10,336000000.0,,,,,,4400.0,Water Transportation,2013-01-29,0001193125-13-027239,d387954d10k.htm,https://www.sec.gov/Archives/edgar/data/815097...,0
4,815097,2014-01-29,4.010400e+10,1.937000e+09,,6.720000e+09,2.455600e+10,319000000.0,,,,,,4400.0,Water Transportation,2014-01-29,0001193125-14-025514,d645418d10k.htm,https://www.sec.gov/Archives/edgar/data/815097...,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48378,1381668,2021-11-24,1.405745e+10,,1.232517e+10,,1.732280e+09,157721000.0,,,,,,6035.0,"Savings Institution, Federally Chartered",2021-11-24,0001381668-21-000109,tfsl-20210930.htm,https://www.sec.gov/Archives/edgar/data/138166...,0
48379,1381668,2022-11-22,1.578988e+10,,1.394554e+10,,1.844339e+09,141937000.0,,,,,,6035.0,"Savings Institution, Federally Chartered",2022-11-22,0001381668-22-000128,tfsl-20220930.htm,https://www.sec.gov/Archives/edgar/data/138166...,0
48380,1381668,2023-11-21,1.691798e+10,,1.499062e+10,,1.927361e+09,328352000.0,,,,,,6035.0,"Savings Institution, Federally Chartered",2023-11-21,0001381668-23-000092,tfsl-20230930.htm,https://www.sec.gov/Archives/edgar/data/138166...,0
48381,1873441,2022-03-31,3.070976e+08,1.096477e+06,2.295583e+07,7.131280e+05,-2.185826e+07,,,,,,,6770.0,Blank Checks,2022-03-31,0001193125-22-091446,d317121d10k.htm,https://www.sec.gov/Archives/edgar/data/187344...,0
