In [54]:
import os
import pandas as pd

In [65]:
def integrate_data(text_dataset, metrics_folder, output_folder):
    """
    Integrates the text dataset with the metric data.

    Only rows whose ``Label`` is ``'FLS'`` are kept from the text dataset.
    Their columns and metric names are normalised, and the result is
    inner-joined with every ``.csv`` file found in ``metrics_folder`` on
    ``(cik, year, metric)``. Each non-empty join is written to
    ``output_folder`` under the same file name as the metric file; empty
    joins are reported and skipped.

    Parameters
    ----------
    text_dataset : str
        Path to the text dataset (CSV). Must contain the columns
        ``index``, ``Label``, ``Company``, ``Metric``, ``CIK``, ``Year``,
        ``Sentence`` and ``Item``.
    metrics_folder : str
        Path to the folder containing the metric data. Each CSV must
        contain the columns ``cik``, ``year``, ``metric`` and ``val``.
        A trailing path separator is optional.
    output_folder : str
        Path to the folder where the integrated data will be saved.
        Created if it does not exist. A trailing path separator is optional.

    Returns
    -------
    None
        Results are written to disk and progress is printed.
    """
    column_names = {'Metric': 'metric', "CIK": "cik", "Year": "year",
                    'Sentence': 'text', 'Item': 'item'}
    # Map the metric labels used in the text dataset onto the names used
    # in the metric files so the join keys line up.
    metric_names = {"Net Income": "Net Income (Loss)",
                    "EPS": 'Diluted Earnings per share',
                    "Cash Flow (Investing)": "Net Cash from Investing Activities",
                    "Cash Flow (Financing)": "Net Cash from Financing Activities",
                    "Cash Flow (Operating)": "Net Cash from Operating Activities"}

    # Read in the text data and keep the FLS-labelled rows only.
    df = pd.read_csv(text_dataset)

    # Build the preprocessed frame without mutating a slice of ``df``.
    # Every step returns a new object, so there is no SettingWithCopyWarning
    # to silence and no global pandas option has to be changed.
    fls = (
        df[df['Label'] == 'FLS']
        .rename(columns=column_names)
        .drop(columns=['index', 'Label', 'Company'])
        .assign(metric=lambda frame: frame['metric'].replace(metric_names))
    )

    # Names of all the metric files in the folder
    metric_files = [file for file in os.listdir(metrics_folder) if file.endswith('.csv')]

    # Loop through each metric file
    for metric_file in metric_files:
        # os.path.join works whether or not the folder ends with a separator
        metric = pd.read_csv(os.path.join(metrics_folder, metric_file))

        # Merge datasets based on CIK number, year and metric
        merged_data = fls.merge(metric, on=['cik', 'year', 'metric'], how='inner')
        merged_data = merged_data[['text', 'item', 'cik', 'year', 'val']]

        # Save data
        if merged_data.empty:
            print('No data for: ' + metric_file)
        else:
            # Create the destination lazily so nothing is created when
            # there is nothing to save.
            os.makedirs(output_folder, exist_ok=True)
            merged_data.to_csv(os.path.join(output_folder, metric_file), index=False)
            print('Saved: ' + metric_file)


In [66]:
# Build the per-metric datasets for each model's sentence file; both runs
# read the same metric folder and differ only in input file and destination.
for text_csv, destination in (
    ('../datasets/distilbert_dataset.csv', '../datasets/distilbert_data/'),
    ('../datasets/finbert_dataset.csv', '../datasets/finbert_data/'),
):
    integrate_data(text_csv, '../datasets/metric_data/', destination)

No data for: CostOfGoodsAndServicesSold.csv
Saved: EarningsPerShareDiluted.csv
Saved: EBIT.csv
No data for: NetCashProvidedByUsedInContinuingOperations.csv
Saved: NetCashProvidedByUsedInFinancingActivities.csv
Saved: NetCashProvidedByUsedInInvestingActivities.csv
Saved: NetIncomeLoss.csv
Saved: RevenueFromContractWithCustomerExcludingAssessedTax.csv
No data for: SellingGeneralAndAdministrativeExpense.csv
No data for: CostOfGoodsAndServicesSold.csv
Saved: EarningsPerShareDiluted.csv
Saved: EBIT.csv
No data for: NetCashProvidedByUsedInContinuingOperations.csv
Saved: NetCashProvidedByUsedInFinancingActivities.csv
Saved: NetCashProvidedByUsedInInvestingActivities.csv
Saved: NetIncomeLoss.csv
Saved: RevenueFromContractWithCustomerExcludingAssessedTax.csv
No data for: SellingGeneralAndAdministrativeExpense.csv
