In [2]:
# import modules
import os
import json
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Create an empty dataframe to store the filings
dataframes = []

In [4]:
# Create dataframes for each filing
directory = 'datasets/EXTRACTED_FILINGS'

for filename in tqdm(os.listdir(directory)):
    if filename.endswith('.json'):
        with open(os.path.join(directory, filename), 'r') as file:
            data = json.load(file)
            cik = data.get('cik', None)
            company = data.get('company', None)
            period_of_report = data.get('period_of_report', None)
            if period_of_report:
                year = period_of_report.split('-')[0]
            else:
                year = None

            item_1A = data.get('item_1A', None)
            item_7 = data.get('item_7', None)
            item_7A = data.get('item_7A', None)

            # Create a DataFrame for the current file's data
            df = pd.DataFrame({
                'cik': [cik],
                'company': [company],
                'year': [year],
                'item_1A': [item_1A],
                'item_7': [item_7],
                'item_7A': [item_7A]
            })
            
            # Append the DataFrame to the list
            dataframes.append(df)

# Concatenate all the DataFrames in the list
filings = pd.concat(dataframes, ignore_index=True)

# Save the DataFrame as a json file
filings.to_json('datasets/EXTRACTED_FILINGS.json', orient='records')


100%|██████████| 7660/7660 [01:16<00:00, 100.67it/s]


In [5]:
filings.head()

Unnamed: 0,cik,company,year,item_1A,item_7,item_7A
0,1000228,SCHEIN HENRY INC,2005,ITEM 1A. Risk Factors\nThe healthcare products...,ITEM 7. Management’s Discussion and Analysis o...,ITEM 7A. Quantitative and Qualitative Disclosu...
1,1000228,HENRY SCHEIN INC,2006,ITEM 1A. Risk Factors\nThe healthcare products...,ITEM 7. Management’s Discussion and Analysis o...,ITEM 7A. Quantitative and Qualitative Disclosu...
2,1000228,HENRY SCHEIN INC,2007,ITEM 1A. Risk Factors\nThe healthcare products...,ITEM 7. Management’s Discussion and Analysis o...,ITEM 7A. Quantitative and Qualitative Disclosu...
3,1000228,HENRY SCHEIN INC,2008,ITEM 1A. Risk Factors\nDeclining economic cond...,ITEM 7. Management’s Discussion and Analysis o...,ITEM 7A. Quantitative and Qualitative Disclosu...
4,1000228,HENRY SCHEIN INC,2009,ITEM 1A. Risk Factors\nDeclining economic cond...,ITEM 7. Management’s Discussion and Analysis o...,ITEM 7A. Quantitative and Qualitative Disclosu...


In [6]:
filings['item_1A'][0]

'ITEM 1A. Risk Factors\nThe healthcare products distribution industry is highly competitive, and we may not be able to compete successfully.\nWe compete with numerous companies, including several major manufacturers and distributors. Some of our competitors have greater financial and other resources than we do, which could allow them to compete more successfully. Most of our products are available from several sources and our customers tend to have relationships with several distributors. Competitors could obtain exclusive rights to market particular products, which we would then be unable to market. Manufacturers could also increase their efforts to sell directly to end-users and bypass distributors like us. Industry consolidation among healthcare products distributors, the unavailability of products, whether due to our inability to gain access to products or interruptions in supply from manufacturers, or the emergence of new competitors could also increase competition. In the future,

In [9]:
import re
# Remomve headers from item 1A, 7 and 7A
items = ['item_1A','item_7','item_7A']
for item in items:
    filings[item] = [re.sub('(?m)^(ITEM 1A.|ITEM 7.|ITEM 7A.|\s*-+).*\n', '', sent, flags=re.IGNORECASE) for sent in filings[item]]

filings['item_1A'][0]


'The healthcare products distribution industry is highly competitive, and we may not be able to compete successfully.\nWe compete with numerous companies, including several major manufacturers and distributors. Some of our competitors have greater financial and other resources than we do, which could allow them to compete more successfully. Most of our products are available from several sources and our customers tend to have relationships with several distributors. Competitors could obtain exclusive rights to market particular products, which we would then be unable to market. Manufacturers could also increase their efforts to sell directly to end-users and bypass distributors like us. Industry consolidation among healthcare products distributors, the unavailability of products, whether due to our inability to gain access to products or interruptions in supply from manufacturers, or the emergence of new competitors could also increase competition. In the future, we may be unable to co

# Calculate and extract EBIT file

In [53]:
# Define the directory where the metric CSV files are stored
metric_data_directory = 'datasets/metric_data'

# Define the filename for the EBIT CSV file
ebit_output_file = 'datasets/metric_data/EBIT.csv'

# List of metrics required for EBIT calculation
metrics_for_ebit = [
    'RevenueFromContractWithCustomerExcludingAssessedTax',
    'CostOfGoodsAndServicesSold',
    'SellingGeneralAndAdministrativeExpense'
]

# Initialize an empty DataFrame for EBIT
ebit_df = None

# Loop through the metrics for EBIT calculation
for metric in metrics_for_ebit:
    # Load the metric-specific CSV file
    metric_file = os.path.join(metric_data_directory, f'{metric}.csv')
    df = pd.read_csv(metric_file)
    
    # Pivot the data to have metrics as columns, indexed by 'year' and 'cik'
    df_pivot = df.pivot(index=['year', 'cik'], columns='metric', values='val')
    
    # If this is the first metric, set the EBIT DataFrame to the pivot table
    if ebit_df is None:
        ebit_df = df_pivot.copy()
    else:
        # Subtract the metric-specific DataFrame from the existing EBIT DataFrame
        ebit_df = ebit_df.sub(df_pivot, fill_value=0)
    
# Calculate EBIT as the difference between specific columns
ebit_df['EBIT'] = ebit_df['Revenue'] - ebit_df['Cost of Goods Sold'] - ebit_df['Selling, General and Administrative Expenses']

# Reset the index
ebit_df.reset_index(inplace=True)

# Save the EBIT DataFrame to a CSV file
ebit_df.to_csv(ebit_output_file, index=False)

# Print a completion message
print("EBIT calculation and export completed.")


EBIT calculation and export completed.


In [54]:
ebit_df.head()

metric,year,cik,Cost of Goods Sold,Revenue,"Selling, General and Administrative Expenses",EBIT
0,2006,38074,,,-1046336000.0,
1,2006,824416,,,-302199.0,
2,2006,1103982,,,-7032000000.0,
3,2007,1800,,,-7407998000.0,
4,2007,2969,-6698900000.0,,-999800000.0,


In [27]:
def plot_num_filings(metric):
    metric['year'].value_counts().sort_index().plot(kind='bar', figsize=(10, 6))
    plt.title(f'Number of {metric["metric"].iloc[0]} Filings per Year')
    plt.xlabel('Year')
    plt.ylabel('Number of Filings')