# Import the necessary Libraries

In [None]:
import pandas as pd
import numpy as np
import random
import requests
import time

## To start our analysis and to comply with compute restrictions I will first analyse the data for a total of 5 companies out of 500

I will first start by studying 5 companies in the same industry so to have a more industry specific insights. Given the companies in that industry I will know the patterns that emerge in that specific industry and from that point on, when I have completed that industry I can generalize it to other companies in the same industry at first and than into other industries

### Advantages:
1. **Domain-Specific patterns**
2. Comparable analysis
3. Controlled complexity
4. Faster iteration
5. Clear baseline

### Information Techonology Industry
1. **Microsoft** 
2. **ServiceNow**
3. **AMD**
4. **Salesforce**
5. **Palantir**

# Balance Sheet for Finance Industry

In [None]:
import pandas as pd
import numpy as np

def reconstruct_balance_sheet_financial_services(df):
    """
    Reconstructs balance sheet for Financial Services companies from XBRL-tagged data.

    Args:
        df: DataFrame containing XBRL-tagged financial data

    Returns:
        DataFrame with reconstructed balance sheet for each period
    """
    # Create a new DataFrame to store the reconstructed balance sheet
    balance_sheet = pd.DataFrame(index=df.index)

    # Copy identifying columns
    id_columns = ['filed', 'company_name', 'end', 'unit', 'form', 'frame', 'cik']
    for col in id_columns:
        if col in df.columns:
            balance_sheet[col] = df[col]

    # Get all available columns for matching (normalize to remove us-gaap: prefix for easier matching)
    available_columns = set([col.replace('us-gaap:', '') for col in df.columns])

    # Function to normalize and check for column existence
    def has_column(col_name):
        return col_name.replace('us-gaap:', '') in available_columns

    # Get actual us-gaap prefixed column name from available columns if it exists
    def get_us_gaap_column(col_name):
         normalized_name = col_name.replace('us-gaap:', '')
         for col in df.columns:
             if col.replace('us-gaap:', '') == normalized_name:
                 return col
         return None # Should not happen if has_column returned True


    # Define mappings for balance sheet accounts based on US GAAP XBRL for Financial Services

    #assets = {} and other accounts are dictionaries with the structure you can find in the sector file

    # Helper function to find the best matching column (updated to handle us-gaap: prefix)
    def find_best_match(account_info, available_cols_normalized):
        # Check primary tag
        primary_normalized = account_info['primary'].replace('us-gaap:', '')
        if primary_normalized in available_cols_normalized:
            return get_us_gaap_column(primary_normalized)

        # Check alternatives
        for alt in account_info['alternatives']:
            alt_normalized = alt.replace('us-gaap:', '')
            if alt_normalized in available_cols_normalized:
                 return get_us_gaap_column(alt_normalized)

        # Handle partial matches (less reliable but can be useful)
        for col_normalized in available_cols_normalized:
            if primary_normalized in col_normalized:
                 return get_us_gaap_column(col_normalized)

        for alt in account_info['alternatives']:
            alt_normalized = alt.replace('us-gaap:', '')
            for col_normalized in available_cols_normalized:
                if alt_normalized in col_normalized:
                    return get_us_gaap_column(col_normalized)

        return None


    # Process each section
    def process_section(section_name, section_mapping):
        available_cols_normalized = set([col.replace('us-gaap:', '') for col in df.columns]) # Recalculate for safety
        for account_name, account_info in section_mapping.items():
            # Try to find the best match
            best_match_col = find_best_match(account_info, available_cols_normalized)

            if best_match_col:
                # Direct or alternative match found
                balance_sheet[f"{section_name} - {account_name}"] = df[best_match_col]
            elif account_info['children']:
                # Try to sum children if parent not found
                available_children_cols = []
                for child in account_info['children']:
                     child_normalized = child.replace('us-gaap:', '')
                     if child_normalized in available_cols_normalized:
                          available_children_cols.append(get_us_gaap_column(child_normalized))
                     else:
                         # Look for variations in available columns (partial match for children)
                         for col_normalized in available_cols_normalized:
                             if child_normalized in col_normalized:
                                 available_children_cols.append(get_us_gaap_column(col_normalized))


                if available_children_cols:
                    # Ensure children columns are unique in case of partial match overlaps
                    available_children_cols = list(set(available_children_cols))
                    balance_sheet[f"{section_name} - {account_name}"] = df[available_children_cols].sum(axis=1)
                else:
                    # No matches found
                    balance_sheet[f"{section_name} - {account_name}"] = pd.Series(np.nan, index=df.index)
            else:
                # No matches found
                balance_sheet[f"{section_name} - {account_name}"] = pd.Series(np.nan, index=df.index)

    # Process each section
    process_section("Assets", assets)
    process_section("Liabilities", liabilities)
    process_section("Equity", equity)

    # Add validation checks (will be handled by add_missing_balance_sheet_totals and its validation)

    return balance_sheet

# --- Helper functions (from your original code, assuming they work correctly) ---

# Function to display a human-readable balance sheet
def display_balance_sheet(balance_sheet, in_billions=True):
    """
    Display the balance sheet in a clean, readable format.

    Args:
        balance_sheet: DataFrame with the reconstructed balance sheet
        in_billions: If True, display values in billions; otherwise in millions

    Returns:
        DataFrame with the formatted balance sheet
    """
    # Make a copy to avoid modifying the original
    formatted_bs = balance_sheet.copy()

    # Identify numeric columns
    numeric_cols = [col for col in formatted_bs.columns if any(col.startswith(prefix) for prefix in ['Assets', 'Liabilities', 'Equity', 'Validation'])]

    # Convert to billions or millions
    divisor = 1_000_000_000 if in_billions else 1_000_000
    suffix = 'B' if in_billions else 'M'

    for col in numeric_cols:
        # Check if the column is numeric before dividing
        if pd.api.types.is_numeric_dtype(formatted_bs[col]):
             formatted_bs[col] = formatted_bs[col] / divisor
             # Optional: Format to a specific number of decimal places if needed
             # formatted_bs[col] = formatted_bs[col].round(2)


    # Format the date column if it exists
    if 'end' in formatted_bs.columns:
        try:
            formatted_bs['end'] = pd.to_datetime(formatted_bs['end']).dt.strftime('%Y-%m-%d')
        except:
            pass # Keep original format if conversion fails

    # Create sections for display (ensure columns exist before trying to select)
    asset_cols = [col for col in formatted_bs.columns if col.startswith('Assets')]
    liability_cols = [col for col in formatted_bs.columns if col.startswith('Liabilities')]
    equity_cols = [col for col in formatted_bs.columns if col.startswith('Equity')]
    validation_cols = [col for col in formatted_bs.columns if col.startswith('Validation')]

    # Columns to display, maintaining original order where possible but prioritizing key info
    display_cols_order = (
        ['filed', 'company_name', 'end', 'unit', 'form', 'frame', 'cik'] +
        sorted([col for col in asset_cols if 'Total' not in col]) +
        sorted([col for col in asset_cols if 'Total' in col]) +
        sorted([col for col in liability_cols if 'Total' not in col]) +
        sorted([col for col in liability_cols if 'Total' in col]) +
        sorted([col for col in equity_cols if 'Total' not in col and 'Liabilities and Equity' not in col]) +
        sorted([col for col in equity_cols if 'Total Stockholders Equity' in col]) +
        sorted([col for col in equity_cols if 'Liabilities and Equity' in col]) +
        validation_cols
    )

    # Filter display_cols_order to only include columns actually present in formatted_bs
    display_cols_final = [col for col in display_cols_order if col in formatted_bs.columns]


    return formatted_bs[display_cols_final]


# Function to add missing Balance Sheet totals based on accounting relationships
def add_missing_balance_sheet_totals(balance_sheet):
    """
    Adds missing total columns according to accounting relationships:
    - If Total Liabilities is missing, compute it as (Total Liabilities and Equity - Total Stockholders Equity)
    - If Total Stockholders Equity is missing, compute it as (Total Liabilities and Equity - Total Liabilities)
    - If Total Liabilities and Equity is missing, set it equal to Total Assets
    - Also adds validation columns for these relationships.

    Args:
        balance_sheet: DataFrame with the reconstructed balance sheet

    Returns:
        DataFrame with missing totals computed where possible and validation columns
    """
    # Make a copy to avoid modifying the original
    result = balance_sheet.copy()

    # Define key total column names
    total_assets_col = 'Assets - Total Assets'
    total_liabilities_col = 'Liabilities - Total Liabilities'
    total_equity_col = 'Equity - Total Stockholders Equity'
    total_liab_equity_col = 'Equity - Total Liabilities and Equity'

    # Ensure total columns exist before attempting calculations row-wise
    for col in [total_assets_col, total_liabilities_col, total_equity_col, total_liab_equity_col]:
        if col not in result.columns:
            result[col] = pd.NA # Use pandas NA for potentially mixed types if needed, or just np.nan

    # Process row by row to handle NaN values in specific cells
    for idx, row in result.iterrows():
        # Case 1: Compute missing Total Liabilities
        if (pd.notna(row[total_liab_equity_col]) and
            pd.notna(row[total_equity_col]) and
            pd.isna(row[total_liabilities_col])):

            result.at[idx, total_liabilities_col] = (
                row[total_liab_equity_col] -
                row[total_equity_col]
            )
            #print(f"Row {idx}: Computed missing Total Liabilities") # Optional: print for debugging

        # Case 2: Compute missing Total Stockholders Equity
        if (pd.notna(row[total_liab_equity_col]) and
            pd.notna(row[total_liabilities_col]) and
            pd.isna(row[total_equity_col])):

            result.at[idx, total_equity_col] = (
                row[total_liab_equity_col] -
                row[total_liabilities_col]
            )
            #print(f"Row {idx}: Computed missing Total Stockholders Equity") # Optional: print for debugging

        # Case 3: Compute missing Total Liabilities and Equity
        if pd.isna(row[total_liab_equity_col]):
            if pd.notna(row[total_assets_col]):
                # Set Total Liabilities and Equity = Total Assets (accounting equality)
                result.at[idx, total_liab_equity_col] = row[total_assets_col]
                #print(f"Row {idx}: Set Total Liabilities and Equity equal to Total Assets") # Optional: print for debugging
            elif (pd.notna(row[total_liabilities_col]) and
                  pd.notna(row[total_equity_col])):
                # Compute Total Liabilities and Equity as sum of components
                result.at[idx, total_liab_equity_col] = (
                    row[total_liabilities_col] +
                    row[total_equity_col]
                )
                #print(f"Row {idx}: Computed Total Liabilities and Equity from its components") # Optional: print for debugging

    # Add validation columns
    result['Validation - A = L+E Difference'] = (
        result[total_assets_col] -
        result[total_liab_equity_col]
    )

    # This check validates if the sum of Liabilities and Equity components equals Total Liabilities and Equity
    # Only perform if both Total Liabilities and Total Stockholders Equity columns are present
    if total_liabilities_col in result.columns and total_equity_col in result.columns:
         result['Validation - L+E Components Sum Difference'] = (
             result[total_liabilities_col] +
             result[total_equity_col] -
             result[total_liab_equity_col]
         )
    else:
         result['Validation - L+E Components Sum Difference'] = pd.NA # Cannot compute if components are missing


    return result


# Function to remove columns that only contain NaN or NA values
def remove_nan_only_columns(df):
    """
    Removes columns that contain only NaN or pandas NA values.

    Args:
        df: DataFrame to clean

    Returns:
        DataFrame with NaN-only columns removed
    """
    # Use .isnull().all() which handles both np.nan and pd.NA
    nan_cols = df.columns[df.isnull().all()].tolist()

    # Drop those columns
    return df.drop(columns=nan_cols)

# --- Main Analysis Function ---

def analyze_balance_sheet_financial_services(df):
    """
    Analyze company balance sheet from XBRL data, specifically for Financial Services.

    Args:
        df: DataFrame containing XBRL-tagged financial data

    Returns:
        The reconstructed balance sheet
    """
    print("Reconstructing balance sheet for Financial Services from XBRL data...")
    balance_sheet = reconstruct_balance_sheet_financial_services(df)

    # Add missing totals based on accounting relationships and add validation columns
    balance_sheet = add_missing_balance_sheet_totals(balance_sheet)

    # Remove columns with only NaN or NA values after attempting to fill totals
    balance_sheet = remove_nan_only_columns(balance_sheet)

    print("\nBalance Sheet Summary:")
    print(f"- Total columns: {len(balance_sheet.columns)}")
    print(f"- Data for {len(balance_sheet)} reporting periods")


    # Display the balance sheet
    print("\nFormatted Balance Sheet (in billions USD):")
    formatted_bs = display_balance_sheet(balance_sheet, in_billions=True)

    return balance_sheet, formatted_bs

# Example Usage (assuming consolidated_df is your input DataFrame):
balance_sheet_fs, formatted_bs_fs = analyze_balance_sheet_financial_services(consolidated_df)

In [36]:
import pandas as pd
import numpy as np

def reconstruct_balance_sheet_financial_services(df):
    """
    Reconstructs balance sheet for Financial Services companies from XBRL-tagged data.

    Args:
        df: DataFrame containing XBRL-tagged financial data

    Returns:
        DataFrame with reconstructed balance sheet for each period
    """
    # Create a new DataFrame to store the reconstructed balance sheet
    balance_sheet = pd.DataFrame(index=df.index)

    # Copy identifying columns
    id_columns = ['filed', 'company_name', 'end', 'unit', 'form', 'frame', 'cik']
    for col in id_columns:
        if col in df.columns:
            balance_sheet[col] = df[col]

    # Get all available columns for matching (normalize to remove us-gaap: prefix for easier matching)
    available_columns = set([col.replace('us-gaap:', '') for col in df.columns])

    # Function to normalize and check for column existence
    def has_column(col_name):
        return col_name.replace('us-gaap:', '') in available_columns

    # Get actual us-gaap prefixed column name from available columns if it exists
    def get_us_gaap_column(col_name):
         normalized_name = col_name.replace('us-gaap:', '')
         for col in df.columns:
             if col.replace('us-gaap:', '') == normalized_name:
                 return col
         return None # Should not happen if has_column returned True


    # Define mappings for balance sheet accounts based on US GAAP XBRL for Financial Services

    # Assets section
    assets = {
        # Highly Liquid Assets
        'Cash and Cash Equivalents': {
            'primary': 'CashAndCashEquivalentsAtCarryingValue',
            'alternatives': ['CashAndCashEquivalents', 'CashAndDueFromBanks'],
            'children': []
        },
        'Restricted Cash and Cash Equivalents': {
            'primary': 'RestrictedCashAndCashEquivalents',
            'alternatives': [],
            'children': []
        },
        'Federal Funds Sold and Securities Purchased Under Agreements to Resell': {
             'primary': 'FederalFundsSoldAndSecuritiesPurchasedUnderAgreementsToResell',
             'alternatives': ['SecuritiesPurchasedUnderAgreementsToResell'],
             'children': []
        },
        'Interest Bearing Deposits in Banks': {
            'primary': 'InterestBearingDepositsInBanks',
            'alternatives': [],
            'children': []
        },

        # Investment Securities
        'Investment Securities (Total)': { # Aggregate or a placeholder for detailed breakdown
             'primary': 'InvestmentSecurities', # General tag, may not be used by all
             'alternatives': ['InvestmentSecuritiesIncludingRestricted'],
             'children': []
        },
        'Available for Sale Securities (Debt)': {
             'primary': 'AvailableForSaleSecuritiesDebtSecurities',
             'alternatives': [],
             'children': []
        },
        'Available for Sale Securities (Equity)': {
             'primary': 'AvailableForSaleSecuritiesEquitySecurities',
             'alternatives': [],
             'children': []
        },
        'Held to Maturity Securities': {
            'primary': 'HeldToMaturitySecurities',
            'alternatives': [],
            'children': []
        },
         'Trading Account Assets': {
            'primary': 'TradingAccountAssets',
            'alternatives': [],
            'children': []
        },
        'Equity Securities Without Readily Determinable Fair Value': {
             'primary': 'EquitySecuritiesWithoutReadilyDeterminableFairValueAmount',
             'alternatives': [],
             'children': []
        },

        # Loans and Receivables (often a primary asset)
        'Loans and Leases Receivable, Net': {
            'primary': 'LoansAndLeasesReceivableNet',
            'alternatives': ['LoansReceivableNet'],
            'children': []
        },
        'Allowance for Credit Losses (Loans and Leases)': {
             'primary': 'AllowanceForLoanAndLeaseLosses',
             'alternatives': ['AllowanceForCreditLossesOnLoansAndNotesReceivable'],
             'children': []
        },
        'Loans Held for Sale': {
             'primary': 'LoansReceivableHeldForSale',
             'alternatives': ['LoansReceivableHeldForSaleNetNotPartOfDisposalGroup'],
             'children': []
        },
        'Accounts Receivable': {
             'primary': 'AccountsReceivable',
             'alternatives': ['AccountsReceivableNetCurrent'], # Could be current or noncurrent
             'children': []
        },
         'Interest and Fees Receivable': {
             'primary': 'InterestAndFeesReceivable',
             'alternatives': ['AccruedInvestmentIncomeReceivable'],
             'children': []
        },
        'Financing Receivables (Other)': { # Catches other specific financing receivables
             'primary': 'FinancingReceivableExcludingAccruedInterestBeforeAllowanceForCreditLoss',
             'alternatives': [],
             'children': [] # Could list specific types if needed
        },


        # Derivative Assets
        'Derivative Assets': {
            'primary': 'DerivativeAssets',
            'alternatives': ['DerivativeFairValueOfDerivativeAsset'],
            'children': ['CreditRiskDerivativeAssetsAtFairValue']
        },

        # Property, Intangibles, and Other Assets
        'Property, Plant and Equipment, Net': {
            'primary': 'PropertyPlantAndEquipmentNet',
            'alternatives': ['PropertyAndEquipmentNet'],
            'children': []
        },
         'Operating Lease Right-of-Use Assets': {
            'primary': 'OperatingLeaseRightOfUseAsset',
            'alternatives': [],
            'children': []
        },
        'Goodwill': {
            'primary': 'Goodwill',
            'alternatives': [],
            'children': []
        },
        'Intangible Assets, Net (Excluding Goodwill)': {
            'primary': 'IntangibleAssetsNetExcludingGoodwill',
            'alternatives': ['IntangibleAssetsNet'],
            'children': []
        },
         'Servicing Assets': {
            'primary': 'ServicingAssetAtFairValueAmount',
            'alternatives': ['ServicingAssets'],
            'children': []
        },
        'Deferred Policy Acquisition Costs': {
            'primary': 'DeferredPolicyAcquisitionCosts',
            'alternatives': [],
            'children': []
        },
        'Deferred Tax Assets, Net': {
            'primary': 'DeferredTaxAssetsNet',
            'alternatives': ['DeferredTaxAssetsGross'], # Net is usually presented
            'children': ['DeferredTaxAssetsValuationAllowance'] # Contra asset
        },
        'Other Assets': { # Catch-all for other assets
            'primary': 'OtherAssets',
            'alternatives': ['OtherAssetsCurrent', 'OtherAssetsNoncurrent', 'PrepaidExpensesAndOtherAssetsCurrent'],
            'children': []
        },

        # Totals
        'Total Assets': {
            'primary': 'Assets',
            'alternatives': [],
            'children': []
        }
    }

    # Liabilities section
    liabilities = {
        # Deposits (a primary liability for banks)
        'Deposits': {
            'primary': 'Deposits',
            'alternatives': ['DepositsDomestic', 'DepositsForeign'],
            'children': ['InterestBearingDeposits', 'NoninterestBearingDeposits']
        },

        # Borrowings
        'Short-term Borrowings': {
            'primary': 'ShortTermBorrowings',
            'alternatives': ['OtherShortTermBorrowings'],
            'children': []
        },
        'Federal Funds Purchased and Securities Sold Under Agreements to Repurchase': {
            'primary': 'FederalFundsPurchasedAndSecuritiesSoldUnderAgreementsToRepurchase',
            'alternatives': ['SecuritiesSoldUnderAgreementsToRepurchase'],
            'children': []
        },
         'Trading Account Liabilities': {
            'primary': 'TradingAccountLiabilities',
            'alternatives': [],
            'children': []
        },
        'Long-term Debt': {
            'primary': 'LongTermDebtNoncurrent', # Focus on noncurrent portion for balance sheet
            'alternatives': ['LongTermDebt', 'DebtNoncurrent'],
            'children': []
        },
        'Current Portion of Long-term Debt': {
             'primary': 'LongTermDebtCurrent',
             'alternatives': ['DebtCurrent'],
             'children': []
        },
        'Long-term Debt and Capital Lease Obligations': {
            'primary': 'LongTermDebtAndCapitalLeaseObligationsIncludingCurrentMaturities',
            'alternatives': ['DebtAndCapitalLeaseObligations'],
            'children': [] # Often a total of current and noncurrent debt/leases
        },
         'Operating Lease Liabilities': {
            'primary': 'OperatingLeaseLiability',
            'alternatives': ['OperatingLeaseLiabilityNoncurrent', 'OperatingLeaseLiabilityCurrent'],
            'children': [] # Can be presented as current and noncurrent
        },

        # Other Liabilities
        'Accounts Payable': {
            'primary': 'AccountsPayable',
            'alternatives': ['AccountsPayableCurrent'], # Could be current or noncurrent
            'children': []
        },
        'Accrued Liabilities': {
            'primary': 'AccruedLiabilities',
            'alternatives': ['AccruedExpenses', 'AccruedLiabilitiesCurrent', 'AccruedLiabilitiesNoncurrent'],
            'children': ['AccountsPayableAndAccruedLiabilitiesCurrentAndNoncurrent', 'AccountsPayableAndOtherAccruedLiabilities'] # Often grouped with AP
        },
         'Income Taxes Payable': {
            'primary': 'IncomeTaxesPayableCurrent', # Often current, but can be noncurrent
            'alternatives': ['IncomeTaxPayableCurrent', 'IncomeTaxesPayable'],
            'children': []
        },
         'Deferred Tax Liabilities': {
            'primary': 'DeferredTaxLiabilities',
            'alternatives': ['DeferredIncomeTaxLiabilities'],
            'children': [] # Can be broken down by source, see asset side for examples
        },
        'Derivative Liabilities': {
            'primary': 'DerivativeLiabilities',
            'alternatives': ['DerivativeFairValueOfDerivativeLiability'],
            'children': ['CreditRiskDerivativeLiabilitiesAtFairValue']
        },
        'Other Liabilities': { # Catch-all for other liabilities
            'primary': 'OtherLiabilities',
            'alternatives': ['OtherLiabilitiesCurrent', 'OtherLiabilitiesNoncurrent'],
            'children': []
        },

        # Insurance Specific Liabilities
         'Future Policy Benefits': {
            'primary': 'FuturePolicyBenefits',
            'alternatives': ['LiabilityForFuturePolicyBenefits'],
            'children': []
        },
         'Unearned Premiums': {
            'primary': 'UnearnedPremiums',
            'alternatives': [],
            'children': []
        },
        'Off Balance Sheet Credit Loss Liability': {
             'primary': 'OffBalanceSheetCreditLossLiability',
             'alternatives': [],
             'children': []
        },
         'Unrecognized Tax Benefits': {
            'primary': 'UnrecognizedTaxBenefits',
            'alternatives': [],
            'children': []
        },


        # Totals
        'Total Liabilities': {
            'primary': 'Liabilities',
            'alternatives': [],
            'children': []
        }
    }

    # Equity section
    equity = {
        'Preferred Stock': {
            'primary': 'PreferredStockValue',
            'alternatives': ['PreferredStockIncludingAdditionalPaidInCapitalNetOfDiscount'],
            'children': []
        },
        'Common Stock': {
            'primary': 'CommonStockValue',
            'alternatives': [],
            'children': []
        },
        'Additional Paid-in Capital': {
            'primary': 'AdditionalPaidInCapital',
            'alternatives': ['AdditionalPaidInCapitalCommonStock'],
            'children': []
        },
         'Common Stock and Additional Paid-in Capital': { # Often grouped
            'primary': 'CommonStockAndAdditionalPaidInCapital',
            'alternatives': [],
            'children': ['CommonStockValue', 'AdditionalPaidInCapital']
         },
        'Retained Earnings (Accumulated Deficit)': {
            'primary': 'RetainedEarningsAccumulatedDeficit',
            'alternatives': ['RetainedEarnings'],
            'children': []
        },
        'Treasury Stock': {
            'primary': 'TreasuryStockValue',
            'alternatives': ['TreasuryStockCommonValue'],
            'children': []
        },
        'Accumulated Other Comprehensive Income (Loss)': {
            'primary': 'AccumulatedOtherComprehensiveIncomeLossNetOfTax', # Often net of tax
            'alternatives': ['AccumulatedOtherComprehensiveIncomeLoss'],
            'children': ['OtherComprehensiveIncomeAvailableforsaleSecuritiesAdjustmentNetOfTaxPortionAttributableToParent',
                         'OtherComprehensiveIncomeDefinedBenefitPlansAdjustmentNetOfTaxPortionAttributableToParent',
                         'OtherComprehensiveIncomeForeignCurrencyTranslationAdjustmentNetOfTaxPortionAttributableToParent',
                         'OtherComprehensiveIncomeLossCashFlowHedgeGainLossAfterReclassificationAndTaxParent'] # Examples of components
        },
        'Total Stockholders Equity': {
            'primary': 'StockholdersEquity',
            'alternatives': ['TotalEquity'],
            'children': []
        },
        'Total Liabilities and Equity': {
            'primary': 'LiabilitiesAndStockholdersEquity',
            'alternatives': ['LiabilitiesAndEquity'],
            'children': []
        }
    }

    # Helper function to find the best matching column (updated to handle us-gaap: prefix)
    def find_best_match(account_info, available_cols_normalized):
        # Check primary tag
        primary_normalized = account_info['primary'].replace('us-gaap:', '')
        if primary_normalized in available_cols_normalized:
            return get_us_gaap_column(primary_normalized)

        # Check alternatives
        for alt in account_info['alternatives']:
            alt_normalized = alt.replace('us-gaap:', '')
            if alt_normalized in available_cols_normalized:
                 return get_us_gaap_column(alt_normalized)

        # Handle partial matches (less reliable but can be useful)
        for col_normalized in available_cols_normalized:
            if primary_normalized in col_normalized:
                 return get_us_gaap_column(col_normalized)

        for alt in account_info['alternatives']:
            alt_normalized = alt.replace('us-gaap:', '')
            for col_normalized in available_cols_normalized:
                if alt_normalized in col_normalized:
                    return get_us_gaap_column(col_normalized)

        return None


    # Process each section
    def process_section(section_name, section_mapping):
        available_cols_normalized = set([col.replace('us-gaap:', '') for col in df.columns]) # Recalculate for safety
        for account_name, account_info in section_mapping.items():
            # Try to find the best match
            best_match_col = find_best_match(account_info, available_cols_normalized)

            if best_match_col:
                # Direct or alternative match found
                balance_sheet[f"{section_name} - {account_name}"] = df[best_match_col]
            elif account_info['children']:
                # Try to sum children if parent not found
                available_children_cols = []
                for child in account_info['children']:
                     child_normalized = child.replace('us-gaap:', '')
                     if child_normalized in available_cols_normalized:
                          available_children_cols.append(get_us_gaap_column(child_normalized))
                     else:
                         # Look for variations in available columns (partial match for children)
                         for col_normalized in available_cols_normalized:
                             if child_normalized in col_normalized:
                                 available_children_cols.append(get_us_gaap_column(col_normalized))


                if available_children_cols:
                    # Ensure children columns are unique in case of partial match overlaps
                    available_children_cols = list(set(available_children_cols))
                    balance_sheet[f"{section_name} - {account_name}"] = df[available_children_cols].sum(axis=1)
                else:
                    # No matches found
                    balance_sheet[f"{section_name} - {account_name}"] = pd.Series(np.nan, index=df.index)
            else:
                # No matches found
                balance_sheet[f"{section_name} - {account_name}"] = pd.Series(np.nan, index=df.index)

    # Process each section
    process_section("Assets", assets)
    process_section("Liabilities", liabilities)
    process_section("Equity", equity)

    # Add validation checks (will be handled by add_missing_balance_sheet_totals and its validation)

    return balance_sheet

# --- Helper functions (from your original code, assuming they work correctly) ---

# Function to display a human-readable balance sheet
def display_balance_sheet(balance_sheet, in_billions=True):
    """
    Display the balance sheet in a clean, readable format.

    Args:
        balance_sheet: DataFrame with the reconstructed balance sheet
        in_billions: If True, display values in billions; otherwise in millions

    Returns:
        DataFrame with the formatted balance sheet
    """
    # Make a copy to avoid modifying the original
    formatted_bs = balance_sheet.copy()

    # Identify numeric columns
    numeric_cols = [col for col in formatted_bs.columns if any(col.startswith(prefix) for prefix in ['Assets', 'Liabilities', 'Equity', 'Validation'])]

    # Convert to billions or millions
    divisor = 1_000_000_000 if in_billions else 1_000_000
    suffix = 'B' if in_billions else 'M'

    for col in numeric_cols:
        # Check if the column is numeric before dividing
        if pd.api.types.is_numeric_dtype(formatted_bs[col]):
             formatted_bs[col] = formatted_bs[col] / divisor
             # Optional: Format to a specific number of decimal places if needed
             # formatted_bs[col] = formatted_bs[col].round(2)


    # Format the date column if it exists
    if 'end' in formatted_bs.columns:
        try:
            formatted_bs['end'] = pd.to_datetime(formatted_bs['end']).dt.strftime('%Y-%m-%d')
        except:
            pass # Keep original format if conversion fails

    # Create sections for display (ensure columns exist before trying to select)
    asset_cols = [col for col in formatted_bs.columns if col.startswith('Assets')]
    liability_cols = [col for col in formatted_bs.columns if col.startswith('Liabilities')]
    equity_cols = [col for col in formatted_bs.columns if col.startswith('Equity')]
    validation_cols = [col for col in formatted_bs.columns if col.startswith('Validation')]

    # Columns to display, maintaining original order where possible but prioritizing key info
    display_cols_order = (
        ['filed', 'company_name', 'end', 'unit', 'form', 'frame', 'cik'] +
        sorted([col for col in asset_cols if 'Total' not in col]) +
        sorted([col for col in asset_cols if 'Total' in col]) +
        sorted([col for col in liability_cols if 'Total' not in col]) +
        sorted([col for col in liability_cols if 'Total' in col]) +
        sorted([col for col in equity_cols if 'Total' not in col and 'Liabilities and Equity' not in col]) +
        sorted([col for col in equity_cols if 'Total Stockholders Equity' in col]) +
        sorted([col for col in equity_cols if 'Liabilities and Equity' in col]) +
        validation_cols
    )

    # Filter display_cols_order to only include columns actually present in formatted_bs
    display_cols_final = [col for col in display_cols_order if col in formatted_bs.columns]


    return formatted_bs[display_cols_final]


# Function to add missing Balance Sheet totals based on accounting relationships
def add_missing_balance_sheet_totals(balance_sheet):
    """
    Adds missing total columns according to accounting relationships:
    - If Total Liabilities is missing, compute it as (Total Liabilities and Equity - Total Stockholders Equity)
    - If Total Stockholders Equity is missing, compute it as (Total Liabilities and Equity - Total Liabilities)
    - If Total Liabilities and Equity is missing, set it equal to Total Assets
    - Also adds validation columns for these relationships.

    Args:
        balance_sheet: DataFrame with the reconstructed balance sheet

    Returns:
        DataFrame with missing totals computed where possible and validation columns
    """
    # Make a copy to avoid modifying the original
    result = balance_sheet.copy()

    # Define key total column names
    total_assets_col = 'Assets - Total Assets'
    total_liabilities_col = 'Liabilities - Total Liabilities'
    total_equity_col = 'Equity - Total Stockholders Equity'
    total_liab_equity_col = 'Equity - Total Liabilities and Equity'

    # Ensure total columns exist before attempting calculations row-wise
    for col in [total_assets_col, total_liabilities_col, total_equity_col, total_liab_equity_col]:
        if col not in result.columns:
            result[col] = pd.NA # Use pandas NA for potentially mixed types if needed, or just np.nan

    # Process row by row to handle NaN values in specific cells
    for idx, row in result.iterrows():
        # Case 1: Compute missing Total Liabilities
        if (pd.notna(row[total_liab_equity_col]) and
            pd.notna(row[total_equity_col]) and
            pd.isna(row[total_liabilities_col])):

            result.at[idx, total_liabilities_col] = (
                row[total_liab_equity_col] -
                row[total_equity_col]
            )
            #print(f"Row {idx}: Computed missing Total Liabilities") # Optional: print for debugging

        # Case 2: Compute missing Total Stockholders Equity
        if (pd.notna(row[total_liab_equity_col]) and
            pd.notna(row[total_liabilities_col]) and
            pd.isna(row[total_equity_col])):

            result.at[idx, total_equity_col] = (
                row[total_liab_equity_col] -
                row[total_liabilities_col]
            )
            #print(f"Row {idx}: Computed missing Total Stockholders Equity") # Optional: print for debugging

        # Case 3: Compute missing Total Liabilities and Equity
        if pd.isna(row[total_liab_equity_col]):
            if pd.notna(row[total_assets_col]):
                # Set Total Liabilities and Equity = Total Assets (accounting equality)
                result.at[idx, total_liab_equity_col] = row[total_assets_col]
                #print(f"Row {idx}: Set Total Liabilities and Equity equal to Total Assets") # Optional: print for debugging
            elif (pd.notna(row[total_liabilities_col]) and
                  pd.notna(row[total_equity_col])):
                # Compute Total Liabilities and Equity as sum of components
                result.at[idx, total_liab_equity_col] = (
                    row[total_liabilities_col] +
                    row[total_equity_col]
                )
                #print(f"Row {idx}: Computed Total Liabilities and Equity from its components") # Optional: print for debugging

    # Add validation columns
    result['Validation - A = L+E Difference'] = (
        result[total_assets_col] -
        result[total_liab_equity_col]
    )

    # This check validates if the sum of Liabilities and Equity components equals Total Liabilities and Equity
    # Only perform if both Total Liabilities and Total Stockholders Equity columns are present
    if total_liabilities_col in result.columns and total_equity_col in result.columns:
         result['Validation - L+E Components Sum Difference'] = (
             result[total_liabilities_col] +
             result[total_equity_col] -
             result[total_liab_equity_col]
         )
    else:
         result['Validation - L+E Components Sum Difference'] = pd.NA # Cannot compute if components are missing


    return result


# Function to remove columns that only contain NaN or NA values
def remove_nan_only_columns(df):
    """
    Removes columns that contain only NaN or pandas NA values.

    Args:
        df: DataFrame to clean

    Returns:
        DataFrame with NaN-only columns removed
    """
    # Use .isnull().all() which handles both np.nan and pd.NA
    nan_cols = df.columns[df.isnull().all()].tolist()

    # Drop those columns
    return df.drop(columns=nan_cols)

# --- Main Analysis Function ---

def analyze_balance_sheet_financial_services(df):
    """
    Analyze company balance sheet from XBRL data, specifically for Financial Services.

    Args:
        df: DataFrame containing XBRL-tagged financial data

    Returns:
        The reconstructed balance sheet
    """
    print("Reconstructing balance sheet for Financial Services from XBRL data...")
    balance_sheet = reconstruct_balance_sheet_financial_services(df)

    # Add missing totals based on accounting relationships and add validation columns
    balance_sheet = add_missing_balance_sheet_totals(balance_sheet)

    # Remove columns with only NaN or NA values after attempting to fill totals
    balance_sheet = remove_nan_only_columns(balance_sheet)

    print("\nBalance Sheet Summary:")
    print(f"- Total columns: {len(balance_sheet.columns)}")
    print(f"- Data for {len(balance_sheet)} reporting periods")


    # Display the balance sheet
    print("\nFormatted Balance Sheet (in billions USD):")
    formatted_bs = display_balance_sheet(balance_sheet, in_billions=True)

    return balance_sheet, formatted_bs

# Example Usage (assuming consolidated_df is your input DataFrame):
balance_sheet_fs, formatted_bs_fs = analyze_balance_sheet_financial_services(consolidated_df)

Reconstructing balance sheet for Financial Services from XBRL data...

Balance Sheet Summary:
- Total columns: 37
- Data for 25 reporting periods

Formatted Balance Sheet (in billions USD):


In [37]:
balance_sheet_fs.head(20)

Unnamed: 0,filed,company_name,end,unit,form,frame,cik,Assets - Cash and Cash Equivalents,Assets - Available for Sale Securities (Debt),Assets - Accounts Receivable,...,Liabilities - Unrecognized Tax Benefits,Liabilities - Total Liabilities,Equity - Additional Paid-in Capital,Equity - Common Stock and Additional Paid-in Capital,Equity - Retained Earnings (Accumulated Deficit),Equity - Accumulated Other Comprehensive Income (Loss),Equity - Total Stockholders Equity,Equity - Total Liabilities and Equity,Validation - A = L+E Difference,Validation - L+E Components Sum Difference
48,2019-01-08,"NIKE, Inc.",2018-11-30,USD,10-Q,CY2018Q4,320187,3423000000.0,,4346000000.0,...,773000000.0,13948000000.0,6707000000.0,6799000000.0,1810000000.0,209000000.0,8729000000.0,22677000000.0,0.0,0.0
49,2019-04-04,"NIKE, Inc.",2019-02-28,USD,10-Q,CY2019Q1,320187,3695000000.0,,4549000000.0,...,798000000.0,13960000000.0,6910000000.0,7003000000.0,1851000000.0,197000000.0,8961000000.0,22921000000.0,0.0,0.0
50,2019-07-23,"NIKE, Inc.",2019-05-31,USD,10-K,CY2018,320187,4466000000.0,197000000.0,4272000000.0,...,808000000.0,235000000.0,7163000000.0,7488000000.0,1643000000.0,231000000.0,9040000000.0,23717000000.0,0.0,-14442000000.0
51,2019-10-04,"NIKE, Inc.",2019-08-31,USD,10-Q,CY2019Q3,320187,3446000000.0,198000000.0,4656000000.0,...,897000000.0,17049000000.0,7296000000.0,7376000000.0,1721000000.0,180000000.0,9200000000.0,26249000000.0,0.0,0.0
52,2020-01-07,"NIKE, Inc.",2019-11-30,USD,10-Q,CY2019Q4,320187,3070000000.0,432000000.0,4792000000.0,...,804000000.0,17251000000.0,7719000000.0,7829000000.0,1577000000.0,52000000.0,9351000000.0,26602000000.0,0.0,0.0
54,2020-04-07,"NIKE, Inc.",2020-02-29,USD,10-Q,CY2020Q1,320187,2863000000.0,319000000.0,4473000000.0,...,802000000.0,1300000000.0,7971000000.0,8084000000.0,1128000000.0,-57000000.0,9045000000.0,26220000000.0,0.0,-15875000000.0
55,2020-07-24,"NIKE, Inc.",2020-05-31,USD,10-K,CY2019,320187,8348000000.0,439000000.0,2749000000.0,...,771000000.0,165000000.0,8299000000.0,8728000000.0,-191000000.0,-56000000.0,8055000000.0,31342000000.0,0.0,-23122000000.0
56,2020-10-08,"NIKE, Inc.",2020-08-31,USD,10-Q,CY2020Q3,320187,8148000000.0,1332000000.0,3813000000.0,...,860000000.0,24034000000.0,8695000000.0,8831000000.0,927000000.0,-401000000.0,9224000000.0,33258000000.0,0.0,0.0
57,2021-01-05,"NIKE, Inc.",2020-11-30,USD,10-Q,CY2020Q4,320187,8635000000.0,3177000000.0,3713000000.0,...,872000000.0,24196000000.0,9336000000.0,9525000000.0,1730000000.0,-429000000.0,10640000000.0,34836000000.0,0.0,0.0
58,2021-04-02,"NIKE, Inc.",2021-02-28,USD,10-Q,CY2021Q1,320187,8516000000.0,4012000000.0,3669000000.0,...,877000000.0,24254000000.0,9645000000.0,9787000000.0,2729000000.0,-446000000.0,11931000000.0,36185000000.0,0.0,0.0


# Balance Sheet for 10-K IT

In [8]:
import pandas as pd
import numpy as np

def reconstruct_balance_sheet(df):
    """
    Reconstructs balance sheet from XBRL-tagged data.
    
    Args:
        df: DataFrame containing XBRL-tagged financial data
        
    Returns:
        DataFrame with reconstructed balance sheet for each period
    """
    # Create a new DataFrame to store the reconstructed balance sheet
    balance_sheet = pd.DataFrame(index=df.index)
    
    # Copy identifying columns
    id_columns = ['filed', 'company_name', 'end', 'unit', 'form', 'frame', 'cik']
    for col in id_columns:
        if col in df.columns:
            balance_sheet[col] = df[col]
    
    # Get all available columns for matching
    available_columns = set(df.columns)
    
    # Define mappings for balance sheet accounts based on XBRL structure
    
    # Assets section
    assets = {
        # Current Assets
        'Cash and Cash Equivalents': {
            'primary': 'CashAndCashEquivalents',
            'alternatives': ['CashAndCashEquivalentsAtCarryingValue'],
            'children': []
        },
        'Short-term Investments': {
            'primary': 'ShortTermInvestments',
            'alternatives': ['MarketableSecuritiesCurrent'],
            'children': ['AvailableForSaleSecuritiesCurrent', 'InvestmentsAvailableForSaleCurrent']
        },
        'Accounts Receivable, Net (Current)': {
            'primary': 'AccountsReceivableNetCurrent',
            'alternatives': [],
            'children': []
        },
        'Accounts Receivable, Net (Non-current)': {
            'primary': 'AccountsReceivableNetNoncurrent',
            'alternatives': [],
            'children': []
        },
        'Inventory': {
            'primary': 'InventoryNet',
            'alternatives': ['Inventory'],
            'children': ['InventoryRawMaterialsNetOfReserves','InventoryWorkInProcessNetOfReserves','InventoryFinishedGoodsNetOfReserves']
        },
        'Other Current Assets': {
            'primary': 'OtherAssetsCurrent',
            'alternatives': ['OtherCurrentAssets'],
            'children': ['PrepaidExpensesCurrent']
        },
        'Total Current Assets': {
            'primary': 'AssetsCurrent',
            'alternatives': ['CurrentAssets'],
            'children': []
        },
        
        # Non-current Assets
        'Property, Plant and Equipment, Net': {
            'primary': 'PropertyPlantAndEquipmentNet',
            'alternatives': ['PropertyAndEquipmentNet'],
            'children': []
        },
        'Operating Lease Right-of-Use Assets': {
            'primary': 'OperatingLeaseRightOfUseAssets',
            'alternatives': [],
            'children': []
        },
        'Equity Investments': {
            'primary': 'EquityInvestments',
            'alternatives': [],
            'children': []
        },
        'Goodwill': {
            'primary': 'Goodwill',
            'alternatives': [],
            'children': []
        },
        'Intangible Assets, Net': {
            'primary': 'IntangibleAssetsNetExcludingGoodwill',
            'alternatives': ['IntangibleAssetsNet'],
            'children': []
        },
        'Other Long-term Assets': {
            'primary': 'OtherAssetsNoncurrent',
            'alternatives': ['OtherNoncurrentAssets', 'OtherLongTermAssets'],
            'children': ['DeferredIncomeTaxesNoncurrent']
        },
        'Total Assets': {
            'primary': 'Assets',
            'alternatives': [],
            'children': []
        }
    }
    
    # Liabilities section
    liabilities = {
        # Current Liabilities
        'Accounts Payable': {
            'primary': 'AccountsPayableCurrent',
            'alternatives': ['AccountsPayable'],
            'children': []
        },
        'Current Portion of Long-term Debt': {
            'primary': 'LongTermDebtCurrent',
            'alternatives': ['DebtCurrent'],
            'children': []
        },
        # Add to Current Liabilities section (after Accrued Compensation)
        'Accrued Expenses': {
            'primary': 'AccruedLiabilities',
            'alternatives': ['AccruedExpenses', 'AccruedLiabilitiesCurrent'],
            'children': []
        },

        # Add under Non-current Liabilities (before Total Liabilities)
        'Deferred Liabilities': {
            'primary': 'DeferredRevenue',
            'alternatives': ['ContractLiabilities', 'DeferredLiabilities'],
            'children': ['DeferredRevenueCurrent', 'DeferredRevenueNoncurrent']
        },
        'Accrued Compensation': {
            'primary': 'AccruedCompensation',
            'alternatives': [],
            'children': []
        },
        'Income Taxes Payable': {
            'primary': 'IncomeTaxesPayableCurrent',
            'alternatives': ['IncomeTaxPayableCurrent'],
            'children': []
        },
        'Unearned Revenue (Current)': {
            'primary': 'DeferredRevenueCurrent',
            'alternatives': ['UnearnedRevenueCurrent', 'ContractLiabilitiesCurrent'],
            'children': []
        },
        'Other Current Liabilities': {
            'primary': 'OtherLiabilitiesCurrent',
            'alternatives': ['OtherCurrentLiabilities', 'OtherAccruedLiabilitiesCurrent'],
            'children': []
        },
        'Total Current Liabilities': {
            'primary': 'LiabilitiesCurrent',
            'alternatives': ['CurrentLiabilities'],
            'children': []
        },
        
        # Non-current Liabilities
        'Long-term Debt': {
            'primary': 'LongTermDebtNoncurrent',
            'alternatives': ['DebtNoncurrent', 'LongTermDebt'],
            'children': []
        },
        'Long-term Income Taxes': {
            'primary': 'IncomeTaxesPayableNoncurrent',
            'alternatives': ['IncomeTaxPayableNoncurrent', 'LongTermIncomeTaxPayable'],
            'children': []
        },
        'Long-term Unearned Revenue': {
            'primary': 'DeferredRevenueNoncurrent',
            'alternatives': ['UnearnedRevenueNoncurrent', 'ContractLiabilitiesNoncurrent'],
            'children': []
        },
        'Operating Lease Liabilities (Non-current)': {
            'primary': 'OperatingLeaseLiabilityNoncurrent',
            'alternatives': [],
            'children': []
        },
        'Other Long-term Liabilities': {
            'primary': 'OtherLiabilitiesNoncurrent',
            'alternatives': ['OtherNoncurrentLiabilities', 'OtherLongTermLiabilities'],
            'children': []
        },
        'Total Liabilities': {
            'primary': 'Liabilities',
            'alternatives': [],
            'children': []
        }
    }
    
    # Equity section
    equity = {
        'Common Stock and Paid-in Capital': {
            'primary': 'CommonStockAndAdditionalPaidInCapital',
            'alternatives': [],
            'children': ['CommonStockValue', 'AdditionalPaidInCapital']
        },
        'Retained Earnings': {
            'primary': 'RetainedEarningsAccumulatedDeficit',
            'alternatives': ['RetainedEarnings'],
            'children': []
        },
        'Accumulated Other Comprehensive Income': {
            'primary': 'AccumulatedOtherComprehensiveIncomeLoss',
            'alternatives': [],
            'children': []
        },
        'Total Stockholders Equity': {
            'primary': 'StockholdersEquity',
            'alternatives': ['TotalEquity'],
            'children': []
        },
        'Total Liabilities and Equity': {
            'primary': 'LiabilitiesAndStockholdersEquity',
            'alternatives': ['LiabilitiesAndEquity'],
            'children': []
        }
    }
    
    # Helper function to find the best matching column
    def find_best_match(account_info, available_cols):
        # Check primary tag
        if account_info['primary'] in available_cols:
            return account_info['primary']
        
        # Check alternatives
        for alt in account_info['alternatives']:
            if alt in available_cols:
                return alt
                
        # Handle partial matches for more flexible matching
        for col in available_cols:
            if account_info['primary'] in col:
                return col
                
        for alt in account_info['alternatives']:
            for col in available_cols:
                if alt in col:
                    return col
        
        return None
    
    # Process each section
    def process_section(section_name, section_mapping):
        for account_name, account_info in section_mapping.items():
            # Try to find the best match
            best_match = find_best_match(account_info, available_columns)
            
            if best_match:
                # Direct match found
                balance_sheet[f"{section_name} - {account_name}"] = df[best_match]
            elif account_info['children']:
                # Try to sum children if parent not found
                available_children = []
                for child in account_info['children']:
                    if child in available_columns:
                        available_children.append(child)
                    else:
                        # Look for variations
                        for col in available_columns:
                            if child in col:
                                available_children.append(col)
                
                if available_children:
                    balance_sheet[f"{section_name} - {account_name}"] = df[available_children].sum(axis=1)
                else:
                    # No matches found
                    balance_sheet[f"{section_name} - {account_name}"] = pd.Series(np.nan, index=df.index)
            else:
                # No matches found
                balance_sheet[f"{section_name} - {account_name}"] = pd.Series(np.nan, index=df.index)
    
    # Process each section
    process_section("Assets", assets)
    process_section("Liabilities", liabilities)
    process_section("Equity", equity)
    
    # Add validation checks - this will be updated in the add_total_liabilities_and_equity function
    
    # Check if current assets and liabilities match their components
    if 'Assets - Total Current Assets' in balance_sheet.columns:
        current_asset_cols = [col for col in balance_sheet.columns if col.startswith('Assets - ') and 'Current' in col and 'Total' not in col]
        if current_asset_cols:
            balance_sheet['Validation - Current Assets Difference'] = balance_sheet['Assets - Total Current Assets'] - balance_sheet[current_asset_cols].sum(axis=1)
    
    if 'Liabilities - Total Current Liabilities' in balance_sheet.columns:
        current_liab_cols = [col for col in balance_sheet.columns if col.startswith('Liabilities - ') and 'Current' in col and 'Total' not in col]
        if current_liab_cols:
            balance_sheet['Validation - Current Liabilities Difference'] = balance_sheet['Liabilities - Total Current Liabilities'] - balance_sheet[current_liab_cols].sum(axis=1)
    
    return balance_sheet

# Function to display a human-readable balance sheet
def display_balance_sheet(balance_sheet, in_billions=True):
    """
    Display the balance sheet in a clean, readable format.
    
    Args:
        balance_sheet: DataFrame with the reconstructed balance sheet
        in_billions: If True, display values in billions; otherwise in millions
    
    Returns:
        DataFrame with the formatted balance sheet
    """
    # Make a copy to avoid modifying the original
    formatted_bs = balance_sheet.copy()
    
    # Identify numeric columns
    numeric_cols = [col for col in formatted_bs.columns if any(col.startswith(prefix) for prefix in ['Assets', 'Liabilities', 'Equity', 'Validation'])]
    
    # Convert to billions or millions
    divisor = 1_000_000_000 if in_billions else 1_000_000
    suffix = 'B' if in_billions else 'M'
    
    for col in numeric_cols:
        formatted_bs[col] = formatted_bs[col] / divisor
        
    # Format the date column if it exists
    if 'end' in formatted_bs.columns:
        try:
            formatted_bs['end'] = pd.to_datetime(formatted_bs['end']).dt.strftime('%Y-%m-%d')
        except:
            pass
    
    # Create sections for display
    asset_cols = [col for col in formatted_bs.columns if col.startswith('Assets')]
    liability_cols = [col for col in formatted_bs.columns if col.startswith('Liabilities')]
    equity_cols = [col for col in formatted_bs.columns if col.startswith('Equity')]
    validation_cols = [col for col in formatted_bs.columns if col.startswith('Validation')]
    
    # Columns to display
    display_cols = ['end'] if 'end' in formatted_bs.columns else []
    display_cols.extend(asset_cols + liability_cols + equity_cols + validation_cols)
    
    return formatted_bs[display_cols]

# Function to add Total Liabilities and Equity if it doesn't exist
def add_total_liabilities_and_equity(balance_sheet):
    """
    Adds Total Liabilities and Equity column if it doesn't exist already.
    Ensures it equals Total Assets for validation.
    
    Args:
        balance_sheet: DataFrame with the reconstructed balance sheet
    
    Returns:
        DataFrame with Total Liabilities and Equity added
    """
    if 'Equity - Total Liabilities and Equity' not in balance_sheet.columns:
        if 'Liabilities - Total Liabilities' in balance_sheet.columns and 'Equity - Total Stockholders\' Equity' in balance_sheet.columns:
            balance_sheet['Equity - Total Liabilities and Equity'] = (
                balance_sheet['Liabilities - Total Liabilities'] + 
                balance_sheet['Equity - Total Stockholders\' Equity']
            )
    
    # Add validation check to ensure Total Assets = Total Liabilities and Equity
    if 'Assets - Total Assets' in balance_sheet.columns and 'Equity - Total Liabilities and Equity' in balance_sheet.columns:
        balance_sheet['Validation - A = L+E Difference'] = (
            balance_sheet['Assets - Total Assets'] - 
            balance_sheet['Equity - Total Liabilities and Equity']
        )
    
    return balance_sheet

# Function to remove columns that only contain NaN values
def remove_nan_only_columns(df):
    """
    Removes columns that contain only NaN values.
    
    Args:
        df: DataFrame to clean
    
    Returns:
        DataFrame with NaN-only columns removed
    """
    # Find columns where all values are NaN
    nan_cols = df.columns[df.isna().all()].tolist()
    
    # Drop those columns
    return df.drop(columns=nan_cols)
def add_missing_balance_sheet_totals(balance_sheet):
    """
    Adds missing total columns according to accounting relationships:
    - If all accounts exist, do nothing
    - If Total Liabilities is missing, compute it as (Total Liabilities and Equity - Total Stockholders Equity)
    - If Total Stockholders Equity is missing, compute it as (Total Liabilities and Equity - Total Liabilities)
    - If Total Liabilities and Equity is missing, set it equal to Total Assets
    
    Args:
        balance_sheet: DataFrame with the reconstructed balance sheet
    
    Returns:
        DataFrame with missing totals computed where possible
    """
    # Make a copy to avoid modifying the original
    result = balance_sheet.copy()
    
    # Check for the existence of key columns
    has_total_assets = 'Assets - Total Assets' in result.columns
    has_total_liabilities = 'Liabilities - Total Liabilities' in result.columns
    has_total_equity = 'Equity - Total Stockholders Equity' in result.columns
    has_total_liab_equity = 'Equity - Total Liabilities and Equity' in result.columns
    
    # Process at column level first (if entire column is missing)
    # Case 1: Compute missing Total Liabilities
    if not has_total_liabilities and has_total_liab_equity and has_total_equity:
        result['Liabilities - Total Liabilities'] = (
            result['Equity - Total Liabilities and Equity'] - 
            result['Equity - Total Stockholders Equity']
        )
        print("Computed missing Total Liabilities column")
        has_total_liabilities = True
    
    # Case 2: Compute missing Total Stockholders Equity
    if not has_total_equity and has_total_liab_equity and has_total_liabilities:
        result['Equity - Total Stockholders Equity'] = (
            result['Equity - Total Liabilities and Equity'] - 
            result['Liabilities - Total Liabilities']
        )
        print("Computed missing Total Stockholders Equity column")
        has_total_equity = True
    
    # Case 3: Compute missing Total Liabilities and Equity
    if not has_total_liab_equity:
        if has_total_assets:
            # Set Total Liabilities and Equity = Total Assets (accounting equality)
            result['Equity - Total Liabilities and Equity'] = result['Assets - Total Assets']
            print("Set Total Liabilities and Equity equal to Total Assets")
            has_total_liab_equity = True
        elif has_total_liabilities and has_total_equity:
            # Compute Total Liabilities and Equity as sum of components
            result['Equity - Total Liabilities and Equity'] = (
                result['Liabilities - Total Liabilities'] + 
                result['Equity - Total Stockholders Equity']
            )
            print("Computed Total Liabilities and Equity from its components")
            has_total_liab_equity = True
    
    # Now process row by row to handle NaN values in specific cells
    if has_total_liabilities and has_total_equity and has_total_liab_equity:
        for idx, row in result.iterrows():
            # Case 1: Compute missing Total Liabilities in specific rows
            if (pd.notna(row['Equity - Total Liabilities and Equity']) and 
                pd.notna(row['Equity - Total Stockholders Equity']) and
                pd.isna(row['Liabilities - Total Liabilities'])):
                
                result.at[idx, 'Liabilities - Total Liabilities'] = (
                    row['Equity - Total Liabilities and Equity'] - 
                    row['Equity - Total Stockholders Equity']
                )
                print(f"Row {idx}: Computed missing Total Liabilities")
            
            # Case 2: Compute missing Total Stockholders Equity in specific rows
            if (pd.notna(row['Equity - Total Liabilities and Equity']) and 
                pd.notna(row['Liabilities - Total Liabilities']) and
                pd.isna(row['Equity - Total Stockholders Equity'])):
                
                result.at[idx, 'Equity - Total Stockholders Equity'] = (
                    row['Equity - Total Liabilities and Equity'] - 
                    row['Liabilities - Total Liabilities']
                )
                print(f"Row {idx}: Computed missing Total Stockholders Equity")
            
            # Case 3: Compute missing Total Liabilities and Equity in specific rows
            if pd.isna(row['Equity - Total Liabilities and Equity']):
                if pd.notna(row['Assets - Total Assets']):
                    # Set Total Liabilities and Equity = Total Assets (accounting equality)
                    result.at[idx, 'Equity - Total Liabilities and Equity'] = row['Assets - Total Assets']
                    print(f"Row {idx}: Set Total Liabilities and Equity equal to Total Assets")
                elif (pd.notna(row['Liabilities - Total Liabilities']) and 
                      pd.notna(row['Equity - Total Stockholders Equity'])):
                    # Compute Total Liabilities and Equity as sum of components
                    result.at[idx, 'Equity - Total Liabilities and Equity'] = (
                        row['Liabilities - Total Liabilities'] + 
                        row['Equity - Total Stockholders Equity']
                    )
                    print(f"Row {idx}: Computed Total Liabilities and Equity from its components")
    
    # Add validation columns
    if has_total_assets and has_total_liab_equity:
        result['Validation - A = L+E Difference'] = (
            result['Assets - Total Assets'] - 
            result['Equity - Total Liabilities and Equity']
        )
    
    if has_total_liabilities and has_total_equity and has_total_liab_equity:
        result['Validation - Components = Total L+E Difference'] = (
            result['Liabilities - Total Liabilities'] + 
            result['Equity - Total Stockholders Equity'] - 
            result['Equity - Total Liabilities and Equity']
        )
    
    return result

def analyze_balance_sheet(df):
    """
    Analyze company balance sheet from XBRL data.
    
    Args:
        df: DataFrame containing XBRL-tagged financial data
    
    Returns:
        The reconstructed balance sheet
    """
    print("Reconstructing balance sheet from XBRL data...")
    balance_sheet = reconstruct_balance_sheet(df)
    
    # Add missing totals based on accounting relationships
    balance_sheet = add_missing_balance_sheet_totals(balance_sheet)
    
    # Remove columns with only NaN values
    balance_sheet = remove_nan_only_columns(balance_sheet)
    
    print("\nBalance Sheet Summary:")
    print(f"- Total columns: {len(balance_sheet.columns)}")
    print(f"- Data for {len(balance_sheet)} reporting periods")
    
    # Display the balance sheet
    print("\nFormatted Balance Sheet (in billions USD):")
    formatted_bs = display_balance_sheet(balance_sheet, in_billions=True)
    
    return balance_sheet, formatted_bs

# To use this code with your DataFrame:
# balance_sheet, formatted_bs = analyze_balance_sheet(consolidated_df)

# To use this code with your DataFrame:
balance_sheet, formatted_bs = analyze_balance_sheet(consolidated_df)

Reconstructing balance sheet from XBRL data...

Balance Sheet Summary:
- Total columns: 25
- Data for 26 reporting periods

Formatted Balance Sheet (in billions USD):


In [9]:
yearly = balance_sheet[balance_sheet['frame'].str.match(r'^CY\d{4}$')]
yearly.T

Unnamed: 0,51,56,61,65,70,74,78
filed,2019-02-26,2020-02-25,2021-02-23,2022-02-22,2023-02-21,2024-02-16,2025-02-14
company_name,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co
end,2018-12-31,2019-12-31,2020-12-31,2021-12-31,2022-12-31,2023-12-31,2024-12-31
unit,USD,USD,USD,USD,USD,USD,USD
form,10-K,10-K,10-K,10-K,10-K,10-K,10-K
frame,CY2018,CY2019,CY2020,CY2021,CY2022,CY2023,CY2024
cik,19617,19617,19617,19617,19617,19617,19617
Assets - Cash and Cash Equivalents,40800000000.0,46500000000.0,24400000000.0,19700000000.0,26800000000.0,19600000000.0,27500000000.0
"Assets - Property, Plant and Equipment, Net",14934000000.0,25813000000.0,27109000000.0,27070000000.0,27734000000.0,,
Assets - Goodwill,47471000000.0,47823000000.0,49248000000.0,50315000000.0,51662000000.0,52634000000.0,52565000000.0


# Balance Sheet after removing the columns that were not usefull for our workflow

In [10]:
# Keep rows with exactly Q1, Q2, Q3, Q4 or just the year (like CY2019)
balance_sheet = balance_sheet[~balance_sheet['frame'].str.match(r'CY\d{4}Q[1-4].+')]
balance_sheet = balance_sheet.sort_values(by='end')
balance_sheet.T

Unnamed: 0,50,51,52,54,55,56,57,58,59,61,...,69,70,71,72,73,74,75,76,77,78
filed,2019-11-04,2019-02-26,2019-05-02,2019-08-06,2019-11-04,2020-02-25,2020-05-07,2020-08-03,2020-11-02,2021-02-23,...,2022-11-03,2023-02-21,2023-05-03,2023-08-03,2023-11-01,2024-02-16,2024-05-01,2025-02-14,2024-10-30,2025-02-14
company_name,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,...,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co
end,2018-09-30,2018-12-31,2019-03-31,2019-06-30,2019-09-30,2019-12-31,2020-03-31,2020-06-30,2020-09-30,2020-12-31,...,2022-09-30,2022-12-31,2023-03-31,2023-06-30,2023-09-30,2023-12-31,2024-03-31,2024-06-30,2024-09-30,2024-12-31
unit,USD,USD,USD,USD,USD,USD,USD,USD,USD,USD,...,USD,USD,USD,USD,USD,USD,USD,USD,USD,USD
form,10-Q,10-K,10-Q,10-Q,10-Q,10-K,10-Q,10-Q,10-Q,10-K,...,10-Q,10-K,10-Q,10-Q,10-Q,10-K,10-Q,10-K,10-Q,10-K
frame,CY2018Q3,CY2018,CY2019Q1,CY2019Q2,CY2019Q3,CY2019,CY2020Q1,CY2020Q2,CY2020Q3,CY2020,...,CY2022Q3,CY2022,CY2023Q1,CY2023Q2,CY2023Q3,CY2023,CY2024Q1,CY2024Q2,CY2024Q3,CY2024
cik,19617,19617,19617,19617,19617,19617,19617,19617,19617,19617,...,19617,19617,19617,19617,19617,19617,19617,19617,19617,19617
Assets - Cash and Cash Equivalents,42200000000.0,40800000000.0,42800000000.0,45500000000.0,47100000000.0,46500000000.0,23500000000.0,25200000000.0,24700000000.0,24400000000.0,...,27100000000.0,26800000000.0,21600000000.0,25600000000.0,23200000000.0,19600000000.0,20400000000.0,23400000000.0,25100000000.0,27500000000.0
"Assets - Property, Plant and Equipment, Net",14180000000.0,14934000000.0,24160000000.0,24665000000.0,25117000000.0,25813000000.0,25882000000.0,26301000000.0,26672000000.0,27109000000.0,...,27199000000.0,27734000000.0,28266000000.0,29493000000.0,29677000000.0,,,,,
Assets - Goodwill,47483000000.0,47471000000.0,47474000000.0,47477000000.0,47818000000.0,47823000000.0,47800000000.0,47811000000.0,47819000000.0,49248000000.0,...,51461000000.0,51662000000.0,52144000000.0,52380000000.0,52492000000.0,52634000000.0,52636000000.0,52620000000.0,52711000000.0,52565000000.0


In [11]:
balance_sheet.columns

Index(['filed', 'company_name', 'end', 'unit', 'form', 'frame', 'cik',
       'Assets - Cash and Cash Equivalents',
       'Assets - Property, Plant and Equipment, Net', 'Assets - Goodwill',
       'Assets - Intangible Assets, Net', 'Assets - Total Assets',
       'Liabilities - Accounts Payable', 'Liabilities - Accrued Expenses',
       'Liabilities - Total Current Liabilities',
       'Liabilities - Long-term Debt', 'Liabilities - Total Liabilities',
       'Equity - Common Stock and Paid-in Capital',
       'Equity - Retained Earnings',
       'Equity - Accumulated Other Comprehensive Income',
       'Equity - Total Stockholders Equity',
       'Equity - Total Liabilities and Equity',
       'Validation - Current Liabilities Difference',
       'Validation - A = L+E Difference',
       'Validation - Components = Total L+E Difference'],
      dtype='object')

### Income statement

In [12]:
import pandas as pd
import numpy as np

def reconstruct_income_statement(df):
    """
    Reconstructs income statement from XBRL-tagged data, specifically tailored for IT companies like Microsoft.
    
    Args:
        df: DataFrame containing XBRL-tagged financial data
        
    Returns:
        DataFrame with reconstructed income statement for each period
    """
    # Create a new DataFrame to store the reconstructed income statement
    income_statement = pd.DataFrame(index=df.index)
    
    # Copy identifying columns
    id_columns = ['filed', 'company_name', 'end', 'unit', 'form', 'frame', 'cik']
    for col in id_columns:
        if col in df.columns:
            income_statement[col] = df[col]
    
    # Get all available columns for matching
    available_columns = set(df.columns)
    
    # Define mappings for income statement accounts based on XBRL structure
    
    # Revenue section
    revenue = {
        'Total Revenue': {
            'primary': 'RevenueFromContractWithCustomerExcludingAssessedTax',
            'alternatives': ['SalesRevenueNet', 'Revenue', 'Revenues'],
            'children': ['ProductRevenue', 'ServiceRevenue', 'CloudComputingArrangementRevenue', 'SubscriptionServiceRevenue']
        },
        'Product Revenue': {
            'primary': 'ProductRevenue',
            'alternatives': ['SalesOfProductsNet'],
            'children': []
        },
        'Service Revenue': {
            'primary': 'ServiceRevenue',
            'alternatives': ['SalesOfServicesNet'],
            'children': []
        },
        'Cloud Revenue': {
            'primary': 'CloudComputingArrangementRevenue',
            'alternatives': ['HostingServiceRevenue', 'SubscriptionServiceRevenue'],
            'children': []
        }
    }
    
    # Cost of Revenue section
    cost_of_revenue = {
        'Total Cost of Revenue': {
            'primary': 'CostOfRevenue',
            'alternatives': ['CostOfGoodsSold', 'CostOfSales', 'CostOfGoodsAndServicesSold'],
            'children': ['CostOfProductRevenue', 'CostOfServiceRevenue']
        },
        'Cost of Product Revenue': {
            'primary': 'CostOfProductRevenue',
            'alternatives': ['CostOfGoodsSoldProduct'],
            'children': []
        },
        'Cost of Service Revenue': {
            'primary': 'CostOfServiceRevenue',
            'alternatives': ['CostOfServicesSold'],
            'children': []
        }
    }
    
    # Gross Profit
    gross_profit = {
        'Gross Profit': {
            'primary': 'GrossProfit',
            'alternatives': [],
            'children': []
        }
    }
    
    # Operating Expenses section
    operating_expenses = {
        'Research and Development': {
            'primary': 'ResearchAndDevelopmentExpense',
            'alternatives': ['ResearchDevelopmentExpense'],
            'children': []
        },
        'Sales and Marketing': {
            'primary': 'SalesAndMarketingExpense',
            'alternatives': ['MarketingExpense', 'SellingExpense', 'SellingAndMarketingExpense'],
            'children': []
        },
        'General and Administrative': {
            'primary': 'GeneralAndAdministrativeExpense',
            'alternatives': ['AdminExpense'],
            'children': []
        },
        'Selling, General and Administrative': {
            'primary': 'SellingGeneralAndAdministrativeExpenses',
            'alternatives': ['GeneralAndAdministrativeExpense'],
            'children': ['SalesAndMarketingExpense', 'GeneralAndAdministrativeExpense']
        },
        'Amortization of Intangible Assets': {
            'primary': 'AmortizationOfIntangibleAssets',
            'alternatives': ['AmortizationOfIntangibleAssetsExcludingGoodwill'],
            'children': []
        },
        'Restructuring Charges': {
            'primary': 'RestructuringCharges',
            'alternatives': ['RestructuringCosts'],
            'children': []
        },
        'Impairment of Goodwill': {
            'primary': 'ImpairmentOfGoodwill',
            'alternatives': [],
            'children': []
        },
        'Impairment of Intangible Assets': {
            'primary': 'ImpairmentOfIntangibleAssets',
            'alternatives': [],
            'children': []
        },
        'Total Operating Expenses': {
            'primary': 'OperatingExpenses',
            'alternatives': ['TotalOperatingExpenses'],
            'children': []
        }
    }
    
    # Operating Income
    operating_income = {
        'Operating Income': {
            'primary': 'IncomeLossFromOperations',
            'alternatives': ['OperatingIncomeLoss', 'IncomeFromOperations'],
            'children': []
        }
    }
    
    # Other Income/Expense section
    other_income_expense = {
        'Interest Income': {
            'primary': 'InterestIncome',
            'alternatives': ['InvestmentIncomeInterest'],
            'children': []
        },
        'Interest Expense': {
            'primary': 'InterestExpense',
            'alternatives': ['InterestCost'],
            'children': []
        },
        'Gain/Loss on Investments': {
            'primary': 'GainLossOnInvestments',
            'alternatives': ['InvestmentIncomeNetGainLossFromSaleOfInvestment'],
            'children': []
        },
        'Foreign Currency Transaction Gain/Loss': {
            'primary': 'ForeignCurrencyTransactionGainLoss',
            'alternatives': [],
            'children': []
        },
        'Other Income/Expense, Net': {
            'primary': 'OtherIncomeExpenseNet',
            'alternatives': ['NonoperatingIncomeExpense'],
            'children': []
        }
    }
    
    # Income before taxes
    income_before_tax = {
        'Income Before Income Taxes': {
            'primary': 'IncomeLossBeforeIncomeTax',
            'alternatives': ['IncomeBeforeTax', 'EarningsBeforeIncomeTax'],
            'children': []
        }
    }
    
    # Income tax provision
    income_tax = {
        'Income Tax Expense/Benefit': {
            'primary': 'IncomeTaxExpenseBenefit',
            'alternatives': ['IncomeTaxesPaidCurrent', 'IncomeTaxesPaid'],
            'children': []
        }
    }
    
    # Net Income
    net_income = {
        'Net Income': {
            'primary': 'ProfitLoss',
            'alternatives': ['NetIncomeLoss'],
            'children': []
        },
        'Net Income Attributable to Noncontrolling Interest': {
            'primary': 'NetIncomeLossAttributableToNoncontrollingInterest',
            'alternatives': [],
            'children': []
        },
        'Net Income Attributable to Company': {
            'primary': 'NetIncomeLossAttributableToParent',
            'alternatives': ['NetIncomeLossAvailableToCommonStockholdersBasic'],
            'children': []
        }
    }
    
    # Earnings Per Share
    eps = {
        'Basic EPS': {
            'primary': 'EarningsPerShareBasic',
            'alternatives': [],
            'children': []
        },
        'Diluted EPS': {
            'primary': 'EarningsPerShareDiluted',
            'alternatives': [],
            'children': []
        }
    }
    
    # Helper function to find the best matching column
    def find_best_match(account_info, available_cols):
        # Check primary tag
        if account_info['primary'] in available_cols:
            return account_info['primary']
        
        # Check alternatives
        for alt in account_info['alternatives']:
            if alt in available_cols:
                return alt
                
        # Handle partial matches for more flexible matching
        primary_matches = []
        for col in available_cols:
            if account_info['primary'] in col:
                primary_matches.append(col)
        
        # If we found exactly one partial match for primary, return it
        if len(primary_matches) == 1:
            return primary_matches[0]
                
        # If we found multiple partial matches, try to find the best one
        if primary_matches:
            # For certain important accounts, use a more specific matching strategy
            if account_info['primary'] in ['ProfitLoss', 'RevenueFromContractWithCustomer', 'CostOfRevenue']:
                for match in primary_matches:
                    if account_info['primary'] in match and not any(x in match.lower() for x in ['segment', 'division', 'product']):
                        return match
            
            return primary_matches[0]  # Default to first one
        
        # Check for partial matches in alternatives
        for alt in account_info['alternatives']:
            for col in available_cols:
                if alt in col:
                    return col
        
        return None
    
    # Process each section
    def process_section(section_name, section_mapping):
        for account_name, account_info in section_mapping.items():
            # Try to find the best match
            best_match = find_best_match(account_info, available_columns)
            
            if best_match:
                # Direct match found
                income_statement[f"{section_name} - {account_name}"] = df[best_match]
            elif account_info['children']:
                # Try to sum children if parent not found
                available_children = []
                for child in account_info['children']:
                    if child in available_columns:
                        available_children.append(child)
                    else:
                        # Look for variations
                        for col in available_columns:
                            if child in col:
                                available_children.append(col)
                
                if available_children:
                    income_statement[f"{section_name} - {account_name}"] = df[available_children].sum(axis=1)
                else:
                    # No matches found
                    income_statement[f"{section_name} - {account_name}"] = pd.Series(np.nan, index=df.index)
            else:
                # No matches found
                income_statement[f"{section_name} - {account_name}"] = pd.Series(np.nan, index=df.index)
    
    # Process each section
    process_section("Revenue", revenue)
    process_section("Cost", cost_of_revenue)
    process_section("Gross", gross_profit)
    process_section("Operating Expenses", operating_expenses)
    process_section("Operating", operating_income)
    process_section("Other", other_income_expense)
    process_section("Income Before Tax", income_before_tax)
    process_section("Tax", income_tax)
    process_section("Net Income", net_income)
    process_section("EPS", eps)
    
    # Calculate missing values and add validation checks
    
    # If Gross Profit is missing, calculate it
    if ('Gross - Gross Profit' not in income_statement.columns or income_statement['Gross - Gross Profit'].isna().all()) and \
       'Revenue - Total Revenue' in income_statement.columns and 'Cost - Total Cost of Revenue' in income_statement.columns:
        income_statement['Gross - Gross Profit (calculated)'] = income_statement['Revenue - Total Revenue'] - income_statement['Cost - Total Cost of Revenue']
    
    # If Operating Income is missing, calculate it
    if ('Operating - Operating Income' not in income_statement.columns or income_statement['Operating - Operating Income'].isna().all()) and \
       'Gross - Gross Profit' in income_statement.columns and 'Operating Expenses - Total Operating Expenses' in income_statement.columns:
        income_statement['Operating - Operating Income (calculated)'] = income_statement['Gross - Gross Profit'] - income_statement['Operating Expenses - Total Operating Expenses']
    elif ('Operating - Operating Income' not in income_statement.columns or income_statement['Operating - Operating Income'].isna().all()) and \
         'Gross - Gross Profit (calculated)' in income_statement.columns and 'Operating Expenses - Total Operating Expenses' in income_statement.columns:
        income_statement['Operating - Operating Income (calculated)'] = income_statement['Gross - Gross Profit (calculated)'] - income_statement['Operating Expenses - Total Operating Expenses']
    
    # If Income Before Tax is missing, calculate it
    if ('Income Before Tax - Income Before Income Taxes' not in income_statement.columns or income_statement['Income Before Tax - Income Before Income Taxes'].isna().all()):
        # Try to calculate from Operating Income and Other Income/Expense
        if 'Operating - Operating Income' in income_statement.columns and 'Other - Other Income/Expense, Net' in income_statement.columns:
            income_statement['Income Before Tax - Income Before Income Taxes (calculated)'] = income_statement['Operating - Operating Income'] + income_statement['Other - Other Income/Expense, Net']
        elif 'Operating - Operating Income (calculated)' in income_statement.columns and 'Other - Other Income/Expense, Net' in income_statement.columns:
            income_statement['Income Before Tax - Income Before Income Taxes (calculated)'] = income_statement['Operating - Operating Income (calculated)'] + income_statement['Other - Other Income/Expense, Net']
    
    # If Net Income is missing, calculate it
    if ('Net Income - Net Income' not in income_statement.columns or income_statement['Net Income - Net Income'].isna().all()):
        # Try to calculate from Income Before Tax and Income Tax Expense
        if 'Income Before Tax - Income Before Income Taxes' in income_statement.columns and 'Tax - Income Tax Expense/Benefit' in income_statement.columns:
            income_statement['Net Income - Net Income (calculated)'] = income_statement['Income Before Tax - Income Before Income Taxes'] - income_statement['Tax - Income Tax Expense/Benefit']
        elif 'Income Before Tax - Income Before Income Taxes (calculated)' in income_statement.columns and 'Tax - Income Tax Expense/Benefit' in income_statement.columns:
            income_statement['Net Income - Net Income (calculated)'] = income_statement['Income Before Tax - Income Before Income Taxes (calculated)'] - income_statement['Tax - Income Tax Expense/Benefit']
    
    # Add validation checks for key relationships
    # Validation: Revenue - COGS = Gross Profit
    if 'Revenue - Total Revenue' in income_statement.columns and 'Cost - Total Cost of Revenue' in income_statement.columns and 'Gross - Gross Profit' in income_statement.columns:
        income_statement['Validation - Gross Profit Difference'] = income_statement['Gross - Gross Profit'] - (income_statement['Revenue - Total Revenue'] - income_statement['Cost - Total Cost of Revenue'])
    
    # Validation: Gross Profit - Operating Expenses = Operating Income
    if 'Gross - Gross Profit' in income_statement.columns and 'Operating Expenses - Total Operating Expenses' in income_statement.columns and 'Operating - Operating Income' in income_statement.columns:
        income_statement['Validation - Operating Income Difference'] = income_statement['Operating - Operating Income'] - (income_statement['Gross - Gross Profit'] - income_statement['Operating Expenses - Total Operating Expenses'])
    
    # Validation: Income Before Tax - Income Tax = Net Income
    if 'Income Before Tax - Income Before Income Taxes' in income_statement.columns and 'Tax - Income Tax Expense/Benefit' in income_statement.columns and 'Net Income - Net Income' in income_statement.columns:
        income_statement['Validation - Net Income Difference'] = income_statement['Net Income - Net Income'] - (income_statement['Income Before Tax - Income Before Income Taxes'] - income_statement['Tax - Income Tax Expense/Benefit'])
    
    return income_statement

# Function to display a human-readable income statement
def display_income_statement(income_statement, in_millions=True):
    """
    Display the income statement in a clean, readable format.
    
    Args:
        income_statement: DataFrame with the reconstructed income statement
        in_millions: If True, display values in millions; otherwise in thousands
    
    Returns:
        DataFrame with the formatted income statement
    """
    # Make a copy to avoid modifying the original
    formatted_is = income_statement.copy()
    
    # Identify numeric columns
    numeric_cols = [col for col in formatted_is.columns if any(col.startswith(prefix) for prefix in ['Revenue', 'Cost', 'Gross', 'Operating', 'Other', 'Income Before Tax', 'Tax', 'Net Income', 'EPS', 'Validation'])]
    
    # Convert to millions or thousands
    divisor = 1_000_000 if in_millions else 1_000
    suffix = 'M' if in_millions else 'K'
    
    for col in numeric_cols:
        if 'EPS' not in col:  # Don't scale EPS values
            formatted_is[col] = formatted_is[col] / divisor
        
    # Format the date column if it exists
    if 'end' in formatted_is.columns:
        try:
            formatted_is['end'] = pd.to_datetime(formatted_is['end']).dt.strftime('%Y-%m-%d')
        except:
            pass
    
    # Create sections for display in proper income statement order
    display_sections = [
        'Revenue',
        'Cost',
        'Gross',
        'Operating Expenses',
        'Operating',
        'Other',
        'Income Before Tax',
        'Tax',
        'Net Income',
        'EPS'
    ]
    
    # Columns to display
    display_cols = ['end'] if 'end' in formatted_is.columns else []
    for section in display_sections:
        section_cols = [col for col in formatted_is.columns if col.startswith(f"{section} - ")]
        display_cols.extend(section_cols)
    
    # Add validation columns at the end if they exist
    validation_cols = [col for col in formatted_is.columns if col.startswith('Validation - ')]
    display_cols.extend(validation_cols)
    
    return formatted_is[display_cols]

# Function to remove columns that only contain NaN values
def remove_nan_only_columns(df):
    """
    Removes columns that contain only NaN values.
    
    Args:
        df: DataFrame to clean
    
    Returns:
        DataFrame with NaN-only columns removed
    """
    # Find columns where all values are NaN
    nan_cols = df.columns[df.isna().all()].tolist()
    
    # Drop those columns
    return df.drop(columns=nan_cols)

# Example usage
def analyze_microsoft_income_statement(df):
    """
    Analyze Microsoft's income statement from XBRL data.
    
    Args:
        df: DataFrame containing Microsoft's XBRL-tagged financial data
    
    Returns:
        The reconstructed income statement
    """
    print("Reconstructing Microsoft's income statement from XBRL data...")
    income_statement = reconstruct_income_statement(df)
    
    # Remove columns with only NaN values
    income_statement = remove_nan_only_columns(income_statement)
    
    print("\nIncome Statement Summary:")
    print(f"- Total columns: {len(income_statement.columns)}")
    print(f"- Data for {len(income_statement)} reporting periods")
    
    # Display the income statement
    print("\nFormatted Income Statement (in millions USD):")
    formatted_is = display_income_statement(income_statement, in_millions=True)
    
    return income_statement, formatted_is

# To use this code with your DataFrame:
income_statement, formatted_is = analyze_microsoft_income_statement(consolidated_df)

Reconstructing Microsoft's income statement from XBRL data...

Income Statement Summary:
- Total columns: 17
- Data for 26 reporting periods

Formatted Income Statement (in millions USD):


In [13]:
# Let's sort the data by date
consolidated_df = consolidated_df.sort_values(by='end')

consolidated_df.to_csv('./test.csv')

In [14]:
# Keep rows with exactly Q1, Q2, Q3, Q4 or just the year (like CY2019)
income_statement = income_statement[~income_statement['frame'].str.match(r'CY\d{4}Q[1-4].+')]
income_statement = income_statement.sort_values(by='end')
income_statement.T

Unnamed: 0,50,51,52,54,55,56,57,58,59,61,...,69,70,71,72,73,74,75,76,77,78
filed,2019-11-04,2019-02-26,2019-05-02,2019-08-06,2019-11-04,2020-02-25,2020-05-07,2020-08-03,2020-11-02,2021-02-23,...,2022-11-03,2023-02-21,2023-05-03,2023-08-03,2023-11-01,2024-02-16,2024-05-01,2025-02-14,2024-10-30,2025-02-14
company_name,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,...,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co
end,2018-09-30,2018-12-31,2019-03-31,2019-06-30,2019-09-30,2019-12-31,2020-03-31,2020-06-30,2020-09-30,2020-12-31,...,2022-09-30,2022-12-31,2023-03-31,2023-06-30,2023-09-30,2023-12-31,2024-03-31,2024-06-30,2024-09-30,2024-12-31
unit,USD,USD,USD,USD,USD,USD,USD,USD,USD,USD,...,USD,USD,USD,USD,USD,USD,USD,USD,USD,USD
form,10-Q,10-K,10-Q,10-Q,10-Q,10-K,10-Q,10-Q,10-Q,10-K,...,10-Q,10-K,10-Q,10-Q,10-Q,10-K,10-Q,10-K,10-Q,10-K
frame,CY2018Q3,CY2018,CY2019Q1,CY2019Q2,CY2019Q3,CY2019,CY2020Q1,CY2020Q2,CY2020Q3,CY2020,...,CY2022Q3,CY2022,CY2023Q1,CY2023Q2,CY2023Q3,CY2023,CY2024Q1,CY2024Q2,CY2024Q3,CY2024
cik,19617,19617,19617,19617,19617,19617,19617,19617,19617,19617,...,19617,19617,19617,19617,19617,19617,19617,19617,19617,19617
Revenue - Total Revenue,,108783000000.0,,,,115720000000.0,,,,119951000000.0,...,,128695000000.0,,,,158104000000.0,,,,177556000000.0
Other - Interest Income,1796000000.0,7248000000.0,2068000000.0,2215000000.0,2450000000.0,9291000000.0,2533000000.0,2461000000.0,2110000000.0,9027000000.0,...,2932000000.0,11347000000.0,4215000000.0,4537000000.0,4873000000.0,18726000000.0,5194000000.0,5426000000.0,6147000000.0,23144000000.0
Other - Interest Expense,5531000000.0,21041000000.0,6936000000.0,7205000000.0,6893000000.0,26795000000.0,4722000000.0,2259000000.0,1687000000.0,9960000000.0,...,8093000000.0,26097000000.0,16293000000.0,19865000000.0,21830000000.0,81321000000.0,24356000000.0,,,


## CashFlow Statement

In [15]:
import pandas as pd
import numpy as np

def reconstruct_cash_flow_statement(df):
    """
    Reconstructs cash flow statement from XBRL-tagged data, specifically tailored for IT companies like Microsoft.
    
    Args:
        df: DataFrame containing XBRL-tagged financial data
        
    Returns:
        DataFrame with reconstructed cash flow statement for each period
    """
    # Create a new DataFrame to store the reconstructed cash flow statement
    cash_flow = pd.DataFrame(index=df.index)
    
    # Copy identifying columns
    id_columns = ['filed', 'company_name', 'end', 'unit', 'form', 'frame', 'cik']
    for col in id_columns:
        if col in df.columns:
            cash_flow[col] = df[col]
    
    # Get all available columns for matching
    available_columns = set(df.columns)
    
    # Define mappings for cash flow statement accounts based on XBRL structure
    
    # Operating Activities section
    operating_activities = {
        'Net Income': {
            'primary': 'ProfitLoss',
            'alternatives': ['NetIncomeLoss'],
            'children': []
        },
        'Depreciation and Amortization': {
            'primary': 'DepreciationAndAmortization',
            'alternatives': [],
            'children': ['Depreciation', 'AmortizationOfIntangibleAssets', 'AmortizationOfCapitalizedSoftwareCosts']
        },
        'Stock-based Compensation': {
            'primary': 'StockBasedCompensationExpense',
            'alternatives': ['ShareBasedCompensation'],
            'children': []
        },
        'Deferred Income Taxes': {
            'primary': 'DeferredIncomeTaxExpenseBenefit',
            'alternatives': ['IncomeTaxExpenseBenefitDeferredTax'],
            'children': []
        },
        'Impairment of Goodwill': {
            'primary': 'ImpairmentOfGoodwill',
            'alternatives': [],
            'children': []
        },
        'Impairment of Intangible Assets': {
            'primary': 'ImpairmentOfIntangibleAssets',
            'alternatives': [],
            'children': []
        },
        'Net Gains/Losses on Investments': {
            'primary': 'GainLossOnInvestments',
            'alternatives': ['InvestmentIncomeNetGainLossFromSaleOfInvestment'],
            'children': []
        },
        'Net Gains/Losses on Asset Sales': {
            'primary': 'GainLossOnSaleOfPropertyPlantAndEquipment',
            'alternatives': [],
            'children': []
        },
        'Changes in Operating Assets and Liabilities': {
            'primary': 'ChangeInOperatingAssetsAndLiabilities',
            'alternatives': [],
            'children': []
        },
        'Change in Accounts Receivable': {
            'primary': 'ChangeInAccountsReceivable',
            'alternatives': [],
            'children': []
        },
        'Change in Inventory': {
            'primary': 'ChangeInInventory',
            'alternatives': [],
            'children': []
        },
        'Change in Prepaid Expenses and Other Assets': {
            'primary': 'ChangeInPrepaidExpensesAndOtherAssets',
            'alternatives': [],
            'children': ['ChangeInPrepaidExpenses', 'ChangeInOtherAssets']
        },
        'Change in Accounts Payable': {
            'primary': 'ChangeInAccountsPayable',
            'alternatives': [],
            'children': []
        },
        'Change in Accrued Liabilities': {
            'primary': 'ChangeInAccruedLiabilities',
            'alternatives': ['ChangeInAccruedExpenses'],
            'children': ['ChangeInAccruedCompensation']
        },
        'Change in Deferred Revenue': {
            'primary': 'ChangeInDeferredRevenue',
            'alternatives': ['ChangeInContractLiabilities', 'ChangeInUnearnedRevenue'],
            'children': []
        },
        'Change in Income Taxes': {
            'primary': 'ChangeInIncomeTaxesPayableReceivable',
            'alternatives': ['ChangeInIncomeTaxPayable'],
            'children': []
        },
        'Other Operating Activities': {
            'primary': 'OtherOperatingActivitiesCashFlowAdjustments',
            'alternatives': [],
            'children': []
        },
        'Net Cash from Operating Activities': {
            'primary': 'NetCashProvidedByUsedInOperatingActivities',
            'alternatives': ['CashFlowsFromUsedInOperatingActivities'],
            'children': []
        }
    }
    
    # Investing Activities section
    investing_activities = {
        'Capital Expenditures': {
            'primary': 'PaymentsToAcquirePropertyPlantAndEquipment',
            'alternatives': ['CapitalExpenditures'],
            'children': []
        },
        'Proceeds from Asset Sales': {
            'primary': 'ProceedsFromSaleOfPropertyPlantAndEquipment',
            'alternatives': [],
            'children': []
        },
        'Purchases of Intangible Assets': {
            'primary': 'PaymentsToAcquireIntangibleAssetsExcludingGoodwill',
            'alternatives': [],
            'children': []
        },
        'Acquisitions, Net of Cash Acquired': {
            'primary': 'PaymentsToAcquireBusinessesNetOfCashAcquired',
            'alternatives': ['BusinessAcquisitionsNetCashAcquired'],
            'children': []
        },
        'Purchases of Investments': {
            'primary': 'PaymentsToAcquireInvestments',
            'alternatives': ['PurchaseOfInvestments'],
            'children': []
        },
        'Proceeds from Sale of Investments': {
            'primary': 'ProceedsFromSaleOfInvestments',
            'alternatives': [],
            'children': []
        },
        'Proceeds from Maturities of Investments': {
            'primary': 'ProceedsFromMaturitiesOfInvestments',
            'alternatives': ['ProceedsFromMaturityOfInvestments'],
            'children': []
        },
        'Other Investing Activities': {
            'primary': 'OtherInvestingActivities',
            'alternatives': [],
            'children': []
        },
        'Net Cash from Investing Activities': {
            'primary': 'NetCashProvidedByUsedInInvestingActivities',
            'alternatives': ['CashFlowsFromUsedInInvestingActivities'],
            'children': []
        }
    }
    
    # Financing Activities section
    financing_activities = {
        'Proceeds from Long-term Debt': {
            'primary': 'ProceedsFromIssuanceOfLongTermDebt',
            'alternatives': ['ProceedsFromDebtIssuance'],
            'children': []
        },
        'Repayment of Long-term Debt': {
            'primary': 'PaymentsOfPrincipalOnLongTermDebt',
            'alternatives': ['RepaymentsOfLongTermDebt', 'DebtRepayment'],
            'children': []
        },
        'Proceeds from Common Stock Issuance': {
            'primary': 'ProceedsFromIssuanceOfCommonStock',
            'alternatives': ['ProceedsFromStockPlans'],
            'children': []
        },
        'Common Stock Repurchases': {
            'primary': 'PaymentsForRepurchaseOfCommonStock',
            'alternatives': ['PaymentsToRepurchaseCommonStock', 'StockRepurchaseProgram'],
            'children': []
        },
        'Dividend Payments': {
            'primary': 'PaymentsOfDividends',
            'alternatives': ['DividendsPaid'],
            'children': []
        },
        'Lease Payments': {
            'primary': 'PaymentsOfFinanceLeaseLiabilities',
            'alternatives': ['PaymentsOfCapitalLeaseObligations'],
            'children': []
        },
        'Other Financing Activities': {
            'primary': 'OtherFinancingActivities',
            'alternatives': [],
            'children': []
        },
        'Net Cash from Financing Activities': {
            'primary': 'NetCashProvidedByUsedInFinancingActivities',
            'alternatives': ['CashFlowsFromUsedInFinancingActivities'],
            'children': []
        }
    }
    
    # Effect of Exchange Rate Changes and Net Change in Cash
    cash_reconciliation = {
        'Effect of Exchange Rate Changes': {
            'primary': 'EffectOfExchangeRateOnCashAndCashEquivalents',
            'alternatives': ['EffectOfExchangeRateChangesOnCash'],
            'children': []
        },
        'Net Change in Cash and Cash Equivalents': {
            'primary': 'IncreaseDecreaseInCashAndCashEquivalents',
            'alternatives': ['ChangeInCashAndCashEquivalents', 'CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsPeriodIncreaseDecrease'],
            'children': []
        },
        'Cash and Cash Equivalents, Beginning of Period': {
            'primary': 'CashAndCashEquivalentsAtCarryingValue',  # This will be filtered by context to beginning period
            'alternatives': ['CashAndCashEquivalentsPeriodStart', 'CashPeriodStart'],
            'children': []
        },
        'Cash and Cash Equivalents, End of Period': {
            'primary': 'CashAndCashEquivalentsAtCarryingValue',  # This will be filtered by context to ending period
            'alternatives': ['CashAndCashEquivalentsPeriodEnd', 'CashPeriodEnd'],
            'children': []
        }
    }
    
    # Supplemental information
    supplemental_info = {
        'Interest Paid': {
            'primary': 'InterestPaid',
            'alternatives': [],
            'children': []
        },
        'Income Taxes Paid': {
            'primary': 'IncomeTaxesPaid',
            'alternatives': [],
            'children': []
        }
    }
    
    # Helper function to find the best matching column
    def find_best_match(account_info, available_cols):
        # Check primary tag
        if account_info['primary'] in available_cols:
            return account_info['primary']
        
        # Check alternatives
        for alt in account_info['alternatives']:
            if alt in available_cols:
                return alt
                
        # Handle partial matches for more flexible matching
        primary_matches = []
        for col in available_cols:
            if account_info['primary'] in col:
                primary_matches.append(col)
        
        # If we found exactly one partial match for primary, return it
        if len(primary_matches) == 1:
            return primary_matches[0]
                
        # If we found multiple partial matches, try to find the best one
        if primary_matches:
            for match in primary_matches:
                # For cash flow statement, look for "CashFlow" or "NetCash" in the match
                if 'CashFlow' in match or 'NetCash' in match:
                    return match
            
            return primary_matches[0]  # Default to first one
        
        # Check for partial matches in alternatives
        for alt in account_info['alternatives']:
            for col in available_cols:
                if alt in col:
                    return col
        
        return None
    
    # Process each section
    def process_section(section_name, section_mapping):
        for account_name, account_info in section_mapping.items():
            # Special handling for Cash and Cash Equivalents at beginning/end of period
            if account_name == 'Cash and Cash Equivalents, Beginning of Period':
                # Look for beginning period cash
                for col in available_columns:
                    if 'CashAndCashEquivalents' in col and 'Begin' in col:
                        cash_flow[f"{section_name} - {account_name}"] = df[col]
                        break
                else:
                    # Try to find regular cash column with beginning period context
                    best_match = find_best_match(account_info, available_columns)
                    if best_match:
                        cash_flow[f"{section_name} - {account_name}"] = df[best_match]
                    else:
                        cash_flow[f"{section_name} - {account_name}"] = pd.Series(np.nan, index=df.index)
                continue
                
            if account_name == 'Cash and Cash Equivalents, End of Period':
                # Look for ending period cash
                for col in available_columns:
                    if 'CashAndCashEquivalents' in col and 'End' in col:
                        cash_flow[f"{section_name} - {account_name}"] = df[col]
                        break
                else:
                    # Try to find regular cash column with ending period context
                    best_match = find_best_match(account_info, available_columns)
                    if best_match:
                        cash_flow[f"{section_name} - {account_name}"] = df[best_match]
                    else:
                        cash_flow[f"{section_name} - {account_name}"] = pd.Series(np.nan, index=df.index)
                continue
            
            # Standard processing for other accounts
            best_match = find_best_match(account_info, available_columns)
            
            if best_match:
                # Direct match found
                cash_flow[f"{section_name} - {account_name}"] = df[best_match]
            elif account_info['children']:
                # Try to sum children if parent not found
                available_children = []
                for child in account_info['children']:
                    if child in available_columns:
                        available_children.append(child)
                    else:
                        # Look for variations
                        for col in available_columns:
                            if child in col:
                                available_children.append(col)
                
                if available_children:
                    cash_flow[f"{section_name} - {account_name}"] = df[available_children].sum(axis=1)
                else:
                    # No matches found
                    cash_flow[f"{section_name} - {account_name}"] = pd.Series(np.nan, index=df.index)
            else:
                # No matches found
                cash_flow[f"{section_name} - {account_name}"] = pd.Series(np.nan, index=df.index)
    
    # Process each section
    process_section("Operating", operating_activities)
    process_section("Investing", investing_activities)
    process_section("Financing", financing_activities)
    process_section("Cash", cash_reconciliation)
    process_section("Supplemental", supplemental_info)
    
    # Calculate missing values and add validation checks
    
    # If Net Cash from Operating Activities is missing, try to calculate it
    if ('Operating - Net Cash from Operating Activities' not in cash_flow.columns or 
        cash_flow['Operating - Net Cash from Operating Activities'].isna().all()):
        
        operating_cols = [col for col in cash_flow.columns 
                          if col.startswith('Operating - ') and 'Net Cash' not in col]
        
        if operating_cols and 'Operating - Net Income' in operating_cols:
            # Start with net income
            cash_flow['Operating - Net Cash from Operating Activities (calculated)'] = cash_flow['Operating - Net Income']
            
            # Add all other operating activities except net income
            for col in operating_cols:
                if col != 'Operating - Net Income':
                    # Skip NaN columns
                    if not cash_flow[col].isna().all():
                        cash_flow['Operating - Net Cash from Operating Activities (calculated)'] += cash_flow[col]
    
    # If Net Cash from Investing Activities is missing, try to calculate it
    if ('Investing - Net Cash from Investing Activities' not in cash_flow.columns or 
        cash_flow['Investing - Net Cash from Investing Activities'].isna().all()):
        
        investing_cols = [col for col in cash_flow.columns 
                          if col.startswith('Investing - ') and 'Net Cash' not in col]
        
        if investing_cols:
            cash_flow['Investing - Net Cash from Investing Activities (calculated)'] = 0
            
            # Sum all investing activities
            for col in investing_cols:
                # Skip NaN columns
                if not cash_flow[col].isna().all():
                    cash_flow['Investing - Net Cash from Investing Activities (calculated)'] += cash_flow[col]
    
    # If Net Cash from Financing Activities is missing, try to calculate it
    if ('Financing - Net Cash from Financing Activities' not in cash_flow.columns or 
        cash_flow['Financing - Net Cash from Financing Activities'].isna().all()):
        
        financing_cols = [col for col in cash_flow.columns 
                          if col.startswith('Financing - ') and 'Net Cash' not in col]
        
        if financing_cols:
            cash_flow['Financing - Net Cash from Financing Activities (calculated)'] = 0
            
            # Sum all financing activities
            for col in financing_cols:
                # Skip NaN columns
                if not cash_flow[col].isna().all():
                    cash_flow['Financing - Net Cash from Financing Activities (calculated)'] += cash_flow[col]
    
    # If Net Change in Cash is missing, calculate it from operating, investing, and financing
    if ('Cash - Net Change in Cash and Cash Equivalents' not in cash_flow.columns or 
        cash_flow['Cash - Net Change in Cash and Cash Equivalents'].isna().all()):
        
        # Choose the best available values for each section
        op_cash = ('Operating - Net Cash from Operating Activities' 
                   if 'Operating - Net Cash from Operating Activities' in cash_flow.columns 
                   else 'Operating - Net Cash from Operating Activities (calculated)')
        
        inv_cash = ('Investing - Net Cash from Investing Activities' 
                    if 'Investing - Net Cash from Investing Activities' in cash_flow.columns 
                    else 'Investing - Net Cash from Investing Activities (calculated)')
        
        fin_cash = ('Financing - Net Cash from Financing Activities' 
                    if 'Financing - Net Cash from Financing Activities' in cash_flow.columns 
                    else 'Financing - Net Cash from Financing Activities (calculated)')
        
        fx_effect = 'Cash - Effect of Exchange Rate Changes'
        
        # Calculate net change in cash if we have at least operating, investing, and financing
        cash_components = []
        for component in [op_cash, inv_cash, fin_cash]:
            if component in cash_flow.columns and not cash_flow[component].isna().all():
                cash_components.append(component)
                
        # Add forex effect if available
        if fx_effect in cash_flow.columns and not cash_flow[fx_effect].isna().all():
            cash_components.append(fx_effect)
            
        if len(cash_components) >= 3:  # At least have operating, investing, and financing
            cash_flow['Cash - Net Change in Cash and Cash Equivalents (calculated)'] = cash_flow[cash_components].sum(axis=1)
    
    # If Beginning Cash + Net Change doesn't equal Ending Cash, add validation check
    if ('Cash - Cash and Cash Equivalents, Beginning of Period' in cash_flow.columns and 
        'Cash - Cash and Cash Equivalents, End of Period' in cash_flow.columns):
        
        # Choose best available net change value
        net_change_col = ('Cash - Net Change in Cash and Cash Equivalents' 
                           if 'Cash - Net Change in Cash and Cash Equivalents' in cash_flow.columns 
                           else 'Cash - Net Change in Cash and Cash Equivalents (calculated)')
        
        if net_change_col in cash_flow.columns:
            cash_flow['Validation - Cash Reconciliation Difference'] = (
                cash_flow['Cash - Cash and Cash Equivalents, End of Period'] - 
                (cash_flow['Cash - Cash and Cash Equivalents, Beginning of Period'] + cash_flow[net_change_col])
            )
    
    # Add validation for operating + investing + financing = net change
    op_cash_col = next((col for col in cash_flow.columns 
                        if col.startswith('Operating - Net Cash') and not cash_flow[col].isna().all()), None)
    
    inv_cash_col = next((col for col in cash_flow.columns 
                         if col.startswith('Investing - Net Cash') and not cash_flow[col].isna().all()), None)
    
    fin_cash_col = next((col for col in cash_flow.columns 
                         if col.startswith('Financing - Net Cash') and not cash_flow[col].isna().all()), None)
    
    net_change_col = next((col for col in cash_flow.columns 
                           if col.startswith('Cash - Net Change') and not cash_flow[col].isna().all()), None)
    
    fx_effect_col = next((col for col in cash_flow.columns 
                          if 'Effect of Exchange Rate' in col and not cash_flow[col].isna().all()), None)
    
    if op_cash_col and inv_cash_col and fin_cash_col and net_change_col:
        components_sum = cash_flow[op_cash_col] + cash_flow[inv_cash_col] + cash_flow[fin_cash_col]
        
        # Add forex effect if available
        if fx_effect_col:
            components_sum += cash_flow[fx_effect_col]
            
        cash_flow['Validation - Net Cash Components Difference'] = cash_flow[net_change_col] - components_sum
    
    return cash_flow

# Function to display a human-readable cash flow statement
def display_cash_flow_statement(cash_flow, in_millions=True):
    """
    Display the cash flow statement in a clean, readable format.
    
    Args:
        cash_flow: DataFrame with the reconstructed cash flow statement
        in_millions: If True, display values in millions; otherwise in thousands
    
    Returns:
        DataFrame with the formatted cash flow statement
    """
    # Make a copy to avoid modifying the original
    formatted_cf = cash_flow.copy()
    
    # Identify numeric columns
    numeric_cols = [col for col in formatted_cf.columns 
                    if any(col.startswith(prefix) for prefix in ['Operating', 'Investing', 'Financing', 'Cash', 'Supplemental', 'Validation'])]
    
    # Convert to millions or thousands
    divisor = 1_000_000 if in_millions else 1_000
    suffix = 'M' if in_millions else 'K'
    
    for col in numeric_cols:
        formatted_cf[col] = formatted_cf[col] / divisor
        
    # Format the date column if it exists
    if 'end' in formatted_cf.columns:
        try:
            formatted_cf['end'] = pd.to_datetime(formatted_cf['end']).dt.strftime('%Y-%m-%d')
        except:
            pass
    
    # Create sections for display in proper cash flow statement order
    display_sections = [
        'Operating',
        'Investing',
        'Financing',
        'Cash',
        'Supplemental'
    ]
    
    # Columns to display
    display_cols = ['end'] if 'end' in formatted_cf.columns else []
    for section in display_sections:
        section_cols = [col for col in formatted_cf.columns if col.startswith(f"{section} - ")]
        display_cols.extend(section_cols)
    
    # Add validation columns at the end if they exist
    validation_cols = [col for col in formatted_cf.columns if col.startswith('Validation - ')]
    display_cols.extend(validation_cols)
    
    return formatted_cf[display_cols]

# Function to remove columns that only contain NaN values
def remove_nan_only_columns(df):
    """
    Removes columns that contain only NaN values.
    
    Args:
        df: DataFrame to clean
    
    Returns:
        DataFrame with NaN-only columns removed
    """
    # Find columns where all values are NaN
    nan_cols = df.columns[df.isna().all()].tolist()
    
    # Drop those columns
    return df.drop(columns=nan_cols)

# Example usage
def analyze_microsoft_cash_flow(df):
    """
    Analyze Microsoft's cash flow statement from XBRL data.
    
    Args:
        df: DataFrame containing Microsoft's XBRL-tagged financial data
    
    Returns:
        The reconstructed cash flow statement
    """
    print("Reconstructing Microsoft's cash flow statement from XBRL data...")
    cash_flow = reconstruct_cash_flow_statement(df)
    
    # Remove columns with only NaN values
    cash_flow = remove_nan_only_columns(cash_flow)
    
    print("\nCash Flow Statement Summary:")
    print(f"- Total columns: {len(cash_flow.columns)}")
    print(f"- Data for {len(cash_flow)} reporting periods")
    
    # Display the cash flow statement
    print("\nFormatted Cash Flow Statement (in millions USD):")
    formatted_cf = display_cash_flow_statement(cash_flow, in_millions=True)
    
    return cash_flow, formatted_cf

# To use this code with your DataFrame:
cash_flow, formatted_cf = analyze_microsoft_cash_flow(consolidated_df)

Reconstructing Microsoft's cash flow statement from XBRL data...

Cash Flow Statement Summary:
- Total columns: 25
- Data for 26 reporting periods

Formatted Cash Flow Statement (in millions USD):


In [16]:
# Keep rows with exactly Q1, Q2, Q3, Q4 or just the year (like CY2019)
cash_flow = cash_flow[~cash_flow['frame'].str.match(r'CY\d{4}Q[1-4].+')]
cash_flow = cash_flow.sort_values(by='end')
cash_flow.T

Unnamed: 0,50,51,52,54,55,56,57,58,59,61,...,69,70,71,72,73,74,75,76,77,78
filed,2019-11-04,2019-02-26,2019-05-02,2019-08-06,2019-11-04,2020-02-25,2020-05-07,2020-08-03,2020-11-02,2021-02-23,...,2022-11-03,2023-02-21,2023-05-03,2023-08-03,2023-11-01,2024-02-16,2024-05-01,2025-02-14,2024-10-30,2025-02-14
company_name,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,...,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co
end,2018-09-30,2018-12-31,2019-03-31,2019-06-30,2019-09-30,2019-12-31,2020-03-31,2020-06-30,2020-09-30,2020-12-31,...,2022-09-30,2022-12-31,2023-03-31,2023-06-30,2023-09-30,2023-12-31,2024-03-31,2024-06-30,2024-09-30,2024-12-31
unit,USD,USD,USD,USD,USD,USD,USD,USD,USD,USD,...,USD,USD,USD,USD,USD,USD,USD,USD,USD,USD
form,10-Q,10-K,10-Q,10-Q,10-Q,10-K,10-Q,10-Q,10-Q,10-K,...,10-Q,10-K,10-Q,10-Q,10-Q,10-K,10-Q,10-K,10-Q,10-K
frame,CY2018Q3,CY2018,CY2019Q1,CY2019Q2,CY2019Q3,CY2019,CY2020Q1,CY2020Q2,CY2020Q3,CY2020,...,CY2022Q3,CY2022,CY2023Q1,CY2023Q2,CY2023Q3,CY2023,CY2024Q1,CY2024Q2,CY2024Q3,CY2024
cik,19617,19617,19617,19617,19617,19617,19617,19617,19617,19617,...,19617,19617,19617,19617,19617,19617,19617,19617,19617,19617
Operating - Net Income,8380000000.0,32474000000.0,9179000000.0,9652000000.0,9080000000.0,36431000000.0,2865000000.0,4687000000.0,9443000000.0,29131000000.0,...,9737000000.0,37676000000.0,12622000000.0,14472000000.0,13151000000.0,49552000000.0,13419000000.0,18149000000.0,12898000000.0,58471000000.0
Operating - Depreciation and Amortization,901000000.0,16616000000.0,2038000000.0,0.0,0.0,12525000000.0,2197000000.0,0.0,1035000000.0,19259000000.0,...,620000000.0,13808000000.0,2068000000.0,457000000.0,468000000.0,12578000000.0,2439000000.0,438000000.0,394000000.0,12132000000.0
Operating - Stock-based Compensation,522000000.0,2322000000.0,653000000.0,570000000.0,559000000.0,2256000000.0,644000000.0,802000000.0,521000000.0,2451000000.0,...,684000000.0,2794000000.0,870000000.0,834000000.0,782000000.0,3117000000.0,938000000.0,944000000.0,849000000.0,3504000000.0


In [17]:
balance_sheet.head(10)

Unnamed: 0,filed,company_name,end,unit,form,frame,cik,Assets - Cash and Cash Equivalents,"Assets - Property, Plant and Equipment, Net",Assets - Goodwill,...,Liabilities - Long-term Debt,Liabilities - Total Liabilities,Equity - Common Stock and Paid-in Capital,Equity - Retained Earnings,Equity - Accumulated Other Comprehensive Income,Equity - Total Stockholders Equity,Equity - Total Liabilities and Equity,Validation - Current Liabilities Difference,Validation - A = L+E Difference,Validation - Components = Total L+E Difference
50,2019-11-04,JPMorgan Chase & Co,2018-09-30,USD,10-Q,CY2018Q3,19617,42200000000.0,14180000000.0,47483000000.0,...,,2356227000000.0,121202000000.0,195180000000.0,-2425000000.0,258956000000.0,2615183000000.0,209707000000.0,0.0,0.0
51,2019-02-26,JPMorgan Chase & Co,2018-12-31,USD,10-K,CY2018,19617,40800000000.0,14934000000.0,47471000000.0,...,,2366017000000.0,119335000000.0,199202000000.0,-1507000000.0,256515000000.0,2622532000000.0,196710000000.0,0.0,0.0
52,2019-05-02,JPMorgan Chase & Co,2019-03-31,USD,10-Q,CY2019Q1,19617,42800000000.0,24160000000.0,47474000000.0,...,,2477351000000.0,119268000000.0,205437000000.0,-558000000.0,259837000000.0,2737188000000.0,216173000000.0,0.0,0.0
54,2019-08-06,JPMorgan Chase & Co,2019-06-30,USD,10-Q,CY2019Q2,19617,45500000000.0,24665000000.0,47477000000.0,...,,2464164000000.0,119457000000.0,212093000000.0,1114000000.0,263215000000.0,2727379000000.0,216137000000.0,0.0,0.0
55,2019-11-04,JPMorgan Chase & Co,2019-09-30,USD,10-Q,CY2019Q3,19617,47100000000.0,25117000000.0,47818000000.0,...,,2500313000000.0,120980000000.0,217888000000.0,1800000000.0,264348000000.0,2764661000000.0,225063000000.0,0.0,0.0
56,2020-02-25,JPMorgan Chase & Co,2019-12-31,USD,10-K,CY2019,19617,46500000000.0,25813000000.0,47823000000.0,...,,2426049000000.0,119620000000.0,223211000000.0,1569000000.0,261330000000.0,2687379000000.0,210407000000.0,-902000000.0,0.0
57,2020-05-07,JPMorgan Chase & Co,2020-03-31,USD,10-Q,CY2020Q1,19617,23500000000.0,25882000000.0,47800000000.0,...,,2878169000000.0,122025000000.0,220226000000.0,7418000000.0,261262000000.0,3139431000000.0,253874000000.0,-901000000.0,0.0
58,2020-08-03,JPMorgan Chase & Co,2020-06-30,USD,10-Q,CY2020Q2,19617,25200000000.0,26301000000.0,47811000000.0,...,,2948649000000.0,122293000000.0,221732000000.0,8789000000.0,264466000000.0,3213115000000.0,230916000000.0,-472000000.0,0.0
59,2020-11-02,JPMorgan Chase & Co,2020-09-30,USD,10-Q,CY2020Q3,19617,24700000000.0,26672000000.0,47819000000.0,...,,2974963000000.0,122457000000.0,228014000000.0,8940000000.0,271113000000.0,3246076000000.0,234256000000.0,-1015000000.0,0.0
61,2021-02-23,JPMorgan Chase & Co,2020-12-31,USD,10-K,CY2020,19617,24400000000.0,27109000000.0,49248000000.0,...,,3105403000000.0,122562000000.0,236990000000.0,7986000000.0,279354000000.0,3384757000000.0,231285000000.0,0.0,0.0


In [18]:
balance_sheet.to_csv('./MSFT_bs.csv')
income_statement.to_csv('./MSFT_is.csv')

In [19]:
income_statement.head(10)

Unnamed: 0,filed,company_name,end,unit,form,frame,cik,Revenue - Total Revenue,Other - Interest Income,Other - Interest Expense,Income Before Tax - Income Before Income Taxes,Tax - Income Tax Expense/Benefit,Net Income - Net Income,Net Income - Net Income Attributable to Company,EPS - Basic EPS,EPS - Diluted EPS,Validation - Net Income Difference
50,2019-11-04,JPMorgan Chase & Co,2018-09-30,USD,10-Q,CY2018Q3,19617,,1796000000.0,5531000000.0,,2309000000.0,8380000000.0,7948000000.0,2.35,2.34,
51,2019-02-26,JPMorgan Chase & Co,2018-12-31,USD,10-K,CY2018,19617,108783000000.0,7248000000.0,21041000000.0,,8290000000.0,32474000000.0,30709000000.0,9.04,9.0,
52,2019-05-02,JPMorgan Chase & Co,2019-03-31,USD,10-Q,CY2019Q1,19617,,2068000000.0,6936000000.0,,2054000000.0,9179000000.0,8753000000.0,2.65,2.65,
54,2019-08-06,JPMorgan Chase & Co,2019-06-30,USD,10-Q,CY2019Q2,19617,,2215000000.0,7205000000.0,,1690000000.0,9652000000.0,9192000000.0,2.83,2.82,
55,2019-11-04,JPMorgan Chase & Co,2019-09-30,USD,10-Q,CY2019Q3,19617,,2450000000.0,6893000000.0,,2325000000.0,9080000000.0,8606000000.0,2.69,2.68,
56,2020-02-25,JPMorgan Chase & Co,2019-12-31,USD,10-K,CY2019,19617,115720000000.0,9291000000.0,26795000000.0,1816000000.0,8435000000.0,36431000000.0,34642000000.0,10.75,10.72,43050000000.0
57,2020-05-07,JPMorgan Chase & Co,2020-03-31,USD,10-Q,CY2020Q1,19617,,2533000000.0,4722000000.0,,345000000.0,2865000000.0,2431000000.0,0.79,0.78,
58,2020-08-03,JPMorgan Chase & Co,2020-06-30,USD,10-Q,CY2020Q2,19617,,2461000000.0,2259000000.0,,973000000.0,4687000000.0,4265000000.0,1.39,1.38,
59,2020-11-02,JPMorgan Chase & Co,2020-09-30,USD,10-Q,CY2020Q3,19617,,2110000000.0,1687000000.0,,2326000000.0,9443000000.0,9015000000.0,2.93,2.92,
61,2021-02-23,JPMorgan Chase & Co,2020-12-31,USD,10-K,CY2020,19617,119951000000.0,9027000000.0,9960000000.0,1586000000.0,6684000000.0,29131000000.0,27410000000.0,8.89,8.88,34229000000.0


In [20]:
cash_flow.T

Unnamed: 0,50,51,52,54,55,56,57,58,59,61,...,69,70,71,72,73,74,75,76,77,78
filed,2019-11-04,2019-02-26,2019-05-02,2019-08-06,2019-11-04,2020-02-25,2020-05-07,2020-08-03,2020-11-02,2021-02-23,...,2022-11-03,2023-02-21,2023-05-03,2023-08-03,2023-11-01,2024-02-16,2024-05-01,2025-02-14,2024-10-30,2025-02-14
company_name,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,...,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co,JPMorgan Chase & Co
end,2018-09-30,2018-12-31,2019-03-31,2019-06-30,2019-09-30,2019-12-31,2020-03-31,2020-06-30,2020-09-30,2020-12-31,...,2022-09-30,2022-12-31,2023-03-31,2023-06-30,2023-09-30,2023-12-31,2024-03-31,2024-06-30,2024-09-30,2024-12-31
unit,USD,USD,USD,USD,USD,USD,USD,USD,USD,USD,...,USD,USD,USD,USD,USD,USD,USD,USD,USD,USD
form,10-Q,10-K,10-Q,10-Q,10-Q,10-K,10-Q,10-Q,10-Q,10-K,...,10-Q,10-K,10-Q,10-Q,10-Q,10-K,10-Q,10-K,10-Q,10-K
frame,CY2018Q3,CY2018,CY2019Q1,CY2019Q2,CY2019Q3,CY2019,CY2020Q1,CY2020Q2,CY2020Q3,CY2020,...,CY2022Q3,CY2022,CY2023Q1,CY2023Q2,CY2023Q3,CY2023,CY2024Q1,CY2024Q2,CY2024Q3,CY2024
cik,19617,19617,19617,19617,19617,19617,19617,19617,19617,19617,...,19617,19617,19617,19617,19617,19617,19617,19617,19617,19617
Operating - Net Income,8380000000.0,32474000000.0,9179000000.0,9652000000.0,9080000000.0,36431000000.0,2865000000.0,4687000000.0,9443000000.0,29131000000.0,...,9737000000.0,37676000000.0,12622000000.0,14472000000.0,13151000000.0,49552000000.0,13419000000.0,18149000000.0,12898000000.0,58471000000.0
Operating - Depreciation and Amortization,901000000.0,16616000000.0,2038000000.0,0.0,0.0,12525000000.0,2197000000.0,0.0,1035000000.0,19259000000.0,...,620000000.0,13808000000.0,2068000000.0,457000000.0,468000000.0,12578000000.0,2439000000.0,438000000.0,394000000.0,12132000000.0
Operating - Stock-based Compensation,522000000.0,2322000000.0,653000000.0,570000000.0,559000000.0,2256000000.0,644000000.0,802000000.0,521000000.0,2451000000.0,...,684000000.0,2794000000.0,870000000.0,834000000.0,782000000.0,3117000000.0,938000000.0,944000000.0,849000000.0,3504000000.0


## I want to merge all of the accounts. This way when I am computing the different ratios all of the different accounts and cross balance sheet income statement accoutns are in the same dataframe.

In [21]:
# Pre-Processing: Clean columns names
def clean_col_names(df):
    """Removes leading/trailing whitespace from DataFrame column names."""
    df.columns = df.columns.str.strip()
    print(f"Cleaned columns: {df.columns.tolist()}") # Optional: verify cleaning
    return df

balance_sheet = clean_col_names(balance_sheet)
income_statement = clean_col_names(income_statement)
cash_flow = clean_col_names(cash_flow)


Cleaned columns: ['filed', 'company_name', 'end', 'unit', 'form', 'frame', 'cik', 'Assets - Cash and Cash Equivalents', 'Assets - Property, Plant and Equipment, Net', 'Assets - Goodwill', 'Assets - Intangible Assets, Net', 'Assets - Total Assets', 'Liabilities - Accounts Payable', 'Liabilities - Accrued Expenses', 'Liabilities - Total Current Liabilities', 'Liabilities - Long-term Debt', 'Liabilities - Total Liabilities', 'Equity - Common Stock and Paid-in Capital', 'Equity - Retained Earnings', 'Equity - Accumulated Other Comprehensive Income', 'Equity - Total Stockholders Equity', 'Equity - Total Liabilities and Equity', 'Validation - Current Liabilities Difference', 'Validation - A = L+E Difference', 'Validation - Components = Total L+E Difference']
Cleaned columns: ['filed', 'company_name', 'end', 'unit', 'form', 'frame', 'cik', 'Revenue - Total Revenue', 'Other - Interest Income', 'Other - Interest Expense', 'Income Before Tax - Income Before Income Taxes', 'Tax - Income Tax Expen

In [22]:
# Define the merge key 
# The merge key is the frame column because this column has the account data for Calendar Year or Calendar Period for different quarters
merge_key = 'frame'

# Step 1: Merge balance_sheet and income_statement
# Use 'outer' merge to keep all unique 'frame' values from both DataFrames.
# Add suffixes to distinguish overlapping columns (other than the merge key).
merged_df_1 = pd.merge(balance_sheet, income_statement,
                       on=merge_key,
                       how='outer',
                       suffixes=('_bs', '_is'))

# Doing the final merge to have the cashflow statement together with the balance sheet and income statement
merged_df_final = pd.merge(merged_df_1, cash_flow,
                           on=merge_key,
                           how='outer',
                           suffixes=('_merged', '_cf')) # Suffixes apply to cols overlapping between merged_df_1 & cash_flow






In [23]:
merged_df_final = clean_col_names(merged_df_final)

Cleaned columns: ['filed_bs', 'company_name_bs', 'end_bs', 'unit_bs', 'form_bs', 'frame', 'cik_bs', 'Assets - Cash and Cash Equivalents', 'Assets - Property, Plant and Equipment, Net', 'Assets - Goodwill', 'Assets - Intangible Assets, Net', 'Assets - Total Assets', 'Liabilities - Accounts Payable', 'Liabilities - Accrued Expenses', 'Liabilities - Total Current Liabilities', 'Liabilities - Long-term Debt', 'Liabilities - Total Liabilities', 'Equity - Common Stock and Paid-in Capital', 'Equity - Retained Earnings', 'Equity - Accumulated Other Comprehensive Income', 'Equity - Total Stockholders Equity', 'Equity - Total Liabilities and Equity', 'Validation - Current Liabilities Difference', 'Validation - A = L+E Difference', 'Validation - Components = Total L+E Difference', 'filed_is', 'company_name_is', 'end_is', 'unit_is', 'form_is', 'cik_is', 'Revenue - Total Revenue', 'Other - Interest Income', 'Other - Interest Expense', 'Income Before Tax - Income Before Income Taxes', 'Tax - Income 

In [24]:
import pandas as pd
import numpy as np

def calculate_cross_statement_ratios(df):
    """
    Calculate financial ratios that use both balance sheet and income statement accounts.
    For a single company with data already sorted chronologically.
    
    Parameters:
    df (DataFrame): DataFrame containing both balance sheet and income statement accounts
    
    Returns:
    DataFrame: Original DataFrame with added ratio columns
    """
    # Create a copy of the DataFrame to avoid modifying the original
    result_df = df.copy()
    
    # Define a function to safely calculate ratios, handling division by zero and NaN
    def safe_divide(numerator, denominator):
        # Perform the division
        result = numerator / denominator
        # Replace infinite values (from division by zero) with NaN
        result.replace([np.inf, -np.inf], np.nan, inplace=True)
        return result
    
    # Define a function to check if all required columns exist
    def columns_exist(column_list):
        return all(column in df.columns for column in column_list)
    
    # Track which calculations were performed and which were skipped
    calculated_items = []
    skipped_items = []
    
    # Define previous period values to calculate
    previous_period_values = [
        {
            'name': 'Prev_Assets_Total_Assets',
            'source': 'Assets - Total Assets',
            'description': 'Previous Period Total Assets'
        },
        {
            'name': 'Prev_Equity_Total_Stockholders_Equity',
            'source': 'Equity - Total Stockholders Equity',
            'description': 'Previous Period Total Stockholders Equity'
        },
        {
            'name': 'Prev_Assets_Inventory',
            'source': 'Assets - Inventory',
            'description': 'Previous Period Inventory'
        },
        {
            'name': 'Prev_Assets_Accounts_Receivable_Current',
            'source': 'Assets - Accounts Receivable, Net (Current)',
            'description': 'Previous Period Accounts Receivable'
        },
        {
            'name': 'Prev_Liabilities_Accounts_Payable',
            'source': 'Liabilities - Accounts Payable',
            'description': 'Previous Period Accounts Payable'
        },
        {
            'name': 'Prev_Assets_Total_Current_Assets',
            'source': 'Assets - Total Current Assets',
            'description': 'Previous Period Total Current Assets'
        },
        {
            'name': 'Prev_Liabilities_Total_Current_Liabilities',
            'source': 'Liabilities - Total Current Liabilities',
            'description': 'Previous Period Total Current Liabilities'
        }
    ]
    
    # Calculate previous period values if columns exist
    for item in previous_period_values:
        if columns_exist([item['source']]):
            result_df[item['name']] = result_df[item['source']].shift(1)
            calculated_items.append(item['name'])
        else:
            skipped_items.append(f"{item['name']} ({item['description']})")
    
    # Define average values to calculate
    average_values = [
        {
            'name': 'Avg_Total_Assets',
            'current': 'Assets - Total Assets',
            'previous': 'Prev_Assets_Total_Assets',
            'description': 'Average Total Assets'
        },
        {
            'name': 'Avg_Stockholders_Equity',
            'current': 'Equity - Total Stockholders Equity',
            'previous': 'Prev_Equity_Total_Stockholders_Equity',
            'description': 'Average Stockholders Equity'
        },
        {
            'name': 'Avg_Inventory',
            'current': 'Assets - Inventory',
            'previous': 'Prev_Assets_Inventory',
            'description': 'Average Inventory'
        },
        {
            'name': 'Avg_Accounts_Receivable',
            'current': 'Assets - Accounts Receivable, Net (Current)',
            'previous': 'Prev_Assets_Accounts_Receivable_Current',
            'description': 'Average Accounts Receivable'
        },
        {
            'name': 'Avg_Accounts_Payable',
            'current': 'Liabilities - Accounts Payable',
            'previous': 'Prev_Liabilities_Accounts_Payable',
            'description': 'Average Accounts Payable'
        }
    ]
    
    # Calculate average values if columns exist
    for item in average_values:
        if columns_exist([item['current']]) and item['previous'] in result_df.columns:
            result_df[item['name']] = (result_df[item['current']] + result_df[item['previous']]) / 2
            calculated_items.append(item['name'])
        else:
            skipped_items.append(f"{item['name']} ({item['description']})")
    
    # Working Capital calculations
    if columns_exist(['Assets - Total Current Assets', 'Liabilities - Total Current Liabilities']):
        result_df['Working_Capital'] = result_df['Assets - Total Current Assets'] - result_df['Liabilities - Total Current Liabilities']
        calculated_items.append('Working_Capital')
        
        if all(col in result_df.columns for col in ['Prev_Assets_Total_Current_Assets', 'Prev_Liabilities_Total_Current_Liabilities']):
            result_df['Prev_Working_Capital'] = result_df['Prev_Assets_Total_Current_Assets'] - result_df['Prev_Liabilities_Total_Current_Liabilities']
            result_df['Avg_Working_Capital'] = (result_df['Working_Capital'] + result_df['Prev_Working_Capital']) / 2
            calculated_items.extend(['Prev_Working_Capital', 'Avg_Working_Capital'])
        else:
            skipped_items.append('Prev_Working_Capital (Previous Period Working Capital)')
            skipped_items.append('Avg_Working_Capital (Average Working Capital)')
    else:
        skipped_items.append('Working_Capital (Current Assets - Current Liabilities)')
        skipped_items.append('Prev_Working_Capital (Previous Period Working Capital)')
        skipped_items.append('Avg_Working_Capital (Average Working Capital)')
    
    # Total Debt calculation
    if columns_exist(['Liabilities - Current Portion of Long-term Debt', 'Liabilities - Long-term Debt']):
        result_df['Total_Debt'] = result_df['Liabilities - Current Portion of Long-term Debt'] + result_df['Liabilities - Long-term Debt']
        calculated_items.append('Total_Debt')
    else:
        skipped_items.append('Total_Debt (Current Portion of Long-term Debt + Long-term Debt)')
    
    # Define ratios to calculate
    ratios = [
        {
            'name': 'Return_on_Assets',
            'numerator': 'Net Income - Net Income',
            'denominator': 'Avg_Total_Assets',
            'description': 'Net Income / Average Total Assets'
        },
        {
            'name': 'Return_on_Equity',
            'numerator': 'Net Income - Net Income',
            'denominator': 'Avg_Stockholders_Equity',
            'description': 'Net Income / Average Stockholders Equity'
        },
        {
            'name': 'Asset_Turnover_Ratio',
            'numerator': 'Revenue - Total Revenue',
            'denominator': 'Avg_Total_Assets',
            'description': 'Revenue / Average Total Assets'
        },
        {
            'name': 'Inventory_Turnover_Ratio',
            'numerator': 'Cost - Total Cost of Revenue',
            'denominator': 'Avg_Inventory',
            'description': 'Cost of Revenue / Average Inventory'
        },
        {
            'name': 'Accounts_Receivable_Turnover_Ratio',
            'numerator': 'Revenue - Total Revenue',
            'denominator': 'Avg_Accounts_Receivable',
            'description': 'Revenue / Average Accounts Receivable'
        },
        {
            'name': 'Accounts_Payable_Turnover_Ratio',
            'numerator': 'Cost - Total Cost of Revenue',
            'denominator': 'Avg_Accounts_Payable',
            'description': 'Cost of Revenue / Average Accounts Payable'
        },
        {
            'name': 'Working_Capital_Turnover_Ratio',
            'numerator': 'Revenue - Total Revenue',
            'denominator': 'Avg_Working_Capital',
            'description': 'Revenue / Average Working Capital'
        },
        {
            'name': 'Debt_to_EBITDA_Ratio',
            'numerator': 'Total_Debt',
            'denominator': 'Operating - Operating Income',
            'description': 'Total Debt / Operating Income'
        }
    ]
    
    # Calculate each ratio if the required columns exist
    for ratio in ratios:
        # Check if numerator exists in original DataFrame or was calculated
        numerator_exists = ratio['numerator'] in df.columns or ratio['numerator'] in result_df.columns
        # Check if denominator exists in original DataFrame or was calculated
        denominator_exists = ratio['denominator'] in df.columns or ratio['denominator'] in result_df.columns
        
        if numerator_exists and denominator_exists:
            result_df[ratio['name']] = safe_divide(
                result_df[ratio['numerator']],
                result_df[ratio['denominator']]
            )
            calculated_items.append(ratio['name'])
        else:
            skipped_items.append(f"{ratio['name']} ({ratio['description']})")
    
    # Print summary
    if calculated_items:
        print(f"Successfully calculated: {', '.join(calculated_items)}")
    if skipped_items:
        print(f"Skipped due to missing columns: {', '.join(skipped_items)}")
    
    return result_df

# Example usage:
# Assuming 'financial_data' is a DataFrame containing both balance sheet and income statement accounts
financial_data_with_ratios = calculate_cross_statement_ratios(merged_df_final)
financial_data_with_ratios.head()

Successfully calculated: Prev_Assets_Total_Assets, Prev_Equity_Total_Stockholders_Equity, Prev_Liabilities_Accounts_Payable, Prev_Liabilities_Total_Current_Liabilities, Avg_Total_Assets, Avg_Stockholders_Equity, Avg_Accounts_Payable, Return_on_Assets, Return_on_Equity, Asset_Turnover_Ratio
Skipped due to missing columns: Prev_Assets_Inventory (Previous Period Inventory), Prev_Assets_Accounts_Receivable_Current (Previous Period Accounts Receivable), Prev_Assets_Total_Current_Assets (Previous Period Total Current Assets), Avg_Inventory (Average Inventory), Avg_Accounts_Receivable (Average Accounts Receivable), Working_Capital (Current Assets - Current Liabilities), Prev_Working_Capital (Previous Period Working Capital), Avg_Working_Capital (Average Working Capital), Total_Debt (Current Portion of Long-term Debt + Long-term Debt), Inventory_Turnover_Ratio (Cost of Revenue / Average Inventory), Accounts_Receivable_Turnover_Ratio (Revenue / Average Accounts Receivable), Accounts_Payable_Tur

Unnamed: 0,filed_bs,company_name_bs,end_bs,unit_bs,form_bs,frame,cik_bs,Assets - Cash and Cash Equivalents,"Assets - Property, Plant and Equipment, Net",Assets - Goodwill,...,Prev_Assets_Total_Assets,Prev_Equity_Total_Stockholders_Equity,Prev_Liabilities_Accounts_Payable,Prev_Liabilities_Total_Current_Liabilities,Avg_Total_Assets,Avg_Stockholders_Equity,Avg_Accounts_Payable,Return_on_Assets,Return_on_Equity,Asset_Turnover_Ratio
0,2019-02-26,JPMorgan Chase & Co,2018-12-31,USD,10-K,CY2018,19617,40800000000.0,14934000000.0,47471000000.0,...,,,,,,,,,,
1,2019-11-04,JPMorgan Chase & Co,2018-09-30,USD,10-Q,CY2018Q3,19617,42200000000.0,14180000000.0,47483000000.0,...,2622532000000.0,256515000000.0,196710000000.0,196710000000.0,2618858000000.0,257735500000.0,203208500000.0,0.0032,0.032514,
2,2020-02-25,JPMorgan Chase & Co,2019-12-31,USD,10-K,CY2019,19617,46500000000.0,25813000000.0,47823000000.0,...,2615183000000.0,258956000000.0,209707000000.0,209707000000.0,2650830000000.0,260143000000.0,210057000000.0,0.013743,0.140042,0.043654
3,2019-05-02,JPMorgan Chase & Co,2019-03-31,USD,10-Q,CY2019Q1,19617,42800000000.0,24160000000.0,47474000000.0,...,2686477000000.0,261330000000.0,210407000000.0,210407000000.0,2711832000000.0,260583500000.0,213290000000.0,0.003385,0.035225,
4,2019-08-06,JPMorgan Chase & Co,2019-06-30,USD,10-Q,CY2019Q2,19617,45500000000.0,24665000000.0,47477000000.0,...,2737188000000.0,259837000000.0,216173000000.0,216173000000.0,2732284000000.0,261526000000.0,216155000000.0,0.003533,0.036906,


In [25]:
# Example usage:
financial_data_with_ratios = calculate_cross_statement_ratios(merged_df_final)


Successfully calculated: Prev_Assets_Total_Assets, Prev_Equity_Total_Stockholders_Equity, Prev_Liabilities_Accounts_Payable, Prev_Liabilities_Total_Current_Liabilities, Avg_Total_Assets, Avg_Stockholders_Equity, Avg_Accounts_Payable, Return_on_Assets, Return_on_Equity, Asset_Turnover_Ratio
Skipped due to missing columns: Prev_Assets_Inventory (Previous Period Inventory), Prev_Assets_Accounts_Receivable_Current (Previous Period Accounts Receivable), Prev_Assets_Total_Current_Assets (Previous Period Total Current Assets), Avg_Inventory (Average Inventory), Avg_Accounts_Receivable (Average Accounts Receivable), Working_Capital (Current Assets - Current Liabilities), Prev_Working_Capital (Previous Period Working Capital), Avg_Working_Capital (Average Working Capital), Total_Debt (Current Portion of Long-term Debt + Long-term Debt), Inventory_Turnover_Ratio (Cost of Revenue / Average Inventory), Accounts_Receivable_Turnover_Ratio (Revenue / Average Accounts Receivable), Accounts_Payable_Tur

In [26]:
financial_data_with_ratios.to_csv('./ratios.csv')

In [27]:
merged_df_final.columns

Index(['filed_bs', 'company_name_bs', 'end_bs', 'unit_bs', 'form_bs', 'frame',
       'cik_bs', 'Assets - Cash and Cash Equivalents',
       'Assets - Property, Plant and Equipment, Net', 'Assets - Goodwill',
       'Assets - Intangible Assets, Net', 'Assets - Total Assets',
       'Liabilities - Accounts Payable', 'Liabilities - Accrued Expenses',
       'Liabilities - Total Current Liabilities',
       'Liabilities - Long-term Debt', 'Liabilities - Total Liabilities',
       'Equity - Common Stock and Paid-in Capital',
       'Equity - Retained Earnings',
       'Equity - Accumulated Other Comprehensive Income',
       'Equity - Total Stockholders Equity',
       'Equity - Total Liabilities and Equity',
       'Validation - Current Liabilities Difference',
       'Validation - A = L+E Difference',
       'Validation - Components = Total L+E Difference', 'filed_is',
       'company_name_is', 'end_is', 'unit_is', 'form_is', 'cik_is',
       'Revenue - Total Revenue', 'Other - Interest 

### Cleaning up the duplicate metadata columns created during the merge process

In [28]:
# Metadata columns (excluding the merge key 'frame') that might be duplicated
metadata_cols_to_clean = ['filed', 'company_name', 'end', 'unit', 'form', 'cik']
# Define the possible suffixes added during merges
suffixes = ['_bs', '_is', '_merged', '_cf'] # Ensure all used suffixes are listed

print("\n--- Cleaning up metadata columns ---")

for col in metadata_cols_to_clean:
    # Find all versions of the column (original name or with suffixes)
    col_versions = [c for c in merged_df_final.columns if c == col or any(c == col + s for s in suffixes)]

    if not col_versions:
        # print(f"Metadata column '{col}' not found.")
        continue

    if len(col_versions) == 1:
        # print(f"Metadata column '{col}' has no duplicates.")
        # Ensure the single version is named correctly (without suffix)
        if col_versions[0] != col and col_versions[0] in merged_df_final.columns:
             merged_df_final.rename(columns={col_versions[0]: col}, inplace=True)
        continue

    # print(f"Coalescing column: {col} from versions: {col_versions}")
    # Identify the base column name (could be the first one found, might have a suffix initially)
    base_col_name = col_versions[0]
    # Ensure the target column exists with the correct base name
    if base_col_name != col and base_col_name in merged_df_final.columns:
         merged_df_final.rename(columns={base_col_name: col}, inplace=True)
    elif base_col_name == col and col not in merged_df_final.columns:
         # This case should ideally not happen if col_versions is not empty, but handle defensively
         print(f"Warning: Base column '{col}' expected but not found directly.")
         continue # Skip if the intended target column isn't there after potential rename

    # Use fillna to fill NaNs in the target column 'col' using other versions
    for version in col_versions[1:]:
        if version in merged_df_final.columns: # Check if the source version column exists
            # print(f"  Filling NaNs in '{col}' using '{version}'")
            # Use fillna to populate missing values in 'col' from 'version'
            merged_df_final[col] = merged_df_final[col].fillna(merged_df_final[version])
        # else: # Optional: print if a version column is missing (might happen with outer merges)
            # print(f"  Version column '{version}' not found for filling '{col}'.")


    # Drop the now redundant source columns (all except the target 'col')
    cols_to_drop = [v for v in col_versions if v != col and v in merged_df_final.columns]
    if cols_to_drop:
      # print(f"  Dropping redundant columns: {cols_to_drop}")
      merged_df_final = merged_df_final.drop(columns=cols_to_drop)


--- Cleaning up metadata columns ---


# Ratio Analysis
### Some of the accounts cells are NaN. When computing the ratios that might cause a Divide by Zero error or a NaN error. For those scenarios I will define a function to safely divide by zero 

In [29]:
import pandas as pd
import numpy as np

def calculate_balance_sheet_ratios(df):
    """
    Calculate financial ratios from balance sheet accounts.
    
    Parameters:
    df (DataFrame): DataFrame containing the balance sheet accounts
    
    Returns:
    DataFrame: Original DataFrame with added ratio columns
    """
    # Create a copy of the DataFrame to avoid modifying the original
    result_df = df.copy()
    
    # Define a function to safely calculate ratios, handling division by zero and NaN
    def safe_divide(numerator, denominator):
        # Perform the division
        result = numerator / denominator
        # Replace infinite values (from division by zero) with NaN
        result.replace([np.inf, -np.inf], np.nan, inplace=True)
        return result
    
    # Define a function to check if all required columns exist
    def columns_exist(column_list):
        return all(column in df.columns for column in column_list)
    
    # Define division-based ratios to calculate
    division_ratios = [
        {
            'name': 'Current_Ratio',
            'numerator': 'Assets - Total Current Assets',
            'denominator': 'Liabilities - Total Current Liabilities',
            'description': 'Current Assets / Current Liabilities'
        },
        {
            'name': 'Quick_Ratio',
            'numerator_formula': lambda df: df['Assets - Total Current Assets'] - df['Assets - Inventory'],
            'numerator_columns': ['Assets - Total Current Assets', 'Assets - Inventory'],
            'denominator': 'Liabilities - Total Current Liabilities',
            'description': '(Current Assets - Inventory) / Current Liabilities'
        },
        {
            'name': 'Cash_Ratio',
            'numerator': 'Assets - Cash and Cash Equivalents',
            'denominator': 'Liabilities - Total Current Liabilities',
            'description': 'Cash and Cash Equivalents / Current Liabilities'
        },
        {
            'name': 'Debt_to_Equity_Ratio',
            'numerator': 'Liabilities - Total Liabilities',
            'denominator': 'Equity - Total Stockholders Equity',
            'description': 'Total Liabilities / Total Stockholders Equity'
        },
        {
            'name': 'Debt_to_Assets_Ratio',
            'numerator': 'Liabilities - Total Liabilities',
            'denominator': 'Assets - Total Assets',
            'description': 'Total Liabilities / Total Assets'
        },
        {
            'name': 'Equity_Ratio',
            'numerator': 'Equity - Total Stockholders Equity',
            'denominator': 'Assets - Total Assets',
            'description': 'Total Stockholders Equity / Total Assets'
        }
    ]
    
    # Define subtraction-based metrics
    subtraction_metrics = [
        {
            'name': 'Working_Capital',
            'minuend': 'Assets - Total Current Assets',
            'subtrahend': 'Liabilities - Total Current Liabilities',
            'description': 'Current Assets - Current Liabilities'
        }
    ]
    
    # Track which ratios were calculated and which were skipped
    calculated_ratios = []
    skipped_ratios = []
    
    # Calculate each division-based ratio if the required columns exist
    for ratio in division_ratios:
        if 'numerator_formula' in ratio:
            # Complex numerator requiring multiple columns
            if columns_exist(ratio['numerator_columns'] + [ratio['denominator']]):
                result_df[ratio['name']] = safe_divide(
                    ratio['numerator_formula'](result_df),
                    result_df[ratio['denominator']]
                )
                calculated_ratios.append(ratio['name'])
            else:
                skipped_ratios.append(f"{ratio['name']} ({ratio['description']})")
        else:
            # Simple numerator and denominator
            if columns_exist([ratio['numerator'], ratio['denominator']]):
                result_df[ratio['name']] = safe_divide(
                    result_df[ratio['numerator']],
                    result_df[ratio['denominator']]
                )
                calculated_ratios.append(ratio['name'])
            else:
                skipped_ratios.append(f"{ratio['name']} ({ratio['description']})")
    
    # Calculate each subtraction-based metric if the required columns exist
    for metric in subtraction_metrics:
        if columns_exist([metric['minuend'], metric['subtrahend']]):
            result_df[metric['name']] = result_df[metric['minuend']] - result_df[metric['subtrahend']]
            calculated_ratios.append(metric['name'])
        else:
            skipped_ratios.append(f"{metric['name']} ({metric['description']})")
    
    # Print summary
    if calculated_ratios:
        print(f"Successfully calculated: {', '.join(calculated_ratios)}")
    if skipped_ratios:
        print(f"Skipped due to missing columns: {', '.join(skipped_ratios)}")
    
    return result_df

# Example usage:
# Assuming 'balance_sheet' is a DataFrame containing the balance sheet accounts
financial_data_with_ratios = calculate_balance_sheet_ratios(balance_sheet)
financial_data_with_ratios.head()

Successfully calculated: Cash_Ratio, Debt_to_Equity_Ratio, Debt_to_Assets_Ratio, Equity_Ratio
Skipped due to missing columns: Current_Ratio (Current Assets / Current Liabilities), Quick_Ratio ((Current Assets - Inventory) / Current Liabilities), Working_Capital (Current Assets - Current Liabilities)


Unnamed: 0,filed,company_name,end,unit,form,frame,cik,Assets - Cash and Cash Equivalents,"Assets - Property, Plant and Equipment, Net",Assets - Goodwill,...,Equity - Accumulated Other Comprehensive Income,Equity - Total Stockholders Equity,Equity - Total Liabilities and Equity,Validation - Current Liabilities Difference,Validation - A = L+E Difference,Validation - Components = Total L+E Difference,Cash_Ratio,Debt_to_Equity_Ratio,Debt_to_Assets_Ratio,Equity_Ratio
50,2019-11-04,JPMorgan Chase & Co,2018-09-30,USD,10-Q,CY2018Q3,19617,42200000000.0,14180000000.0,47483000000.0,...,-2425000000.0,258956000000.0,2615183000000.0,209707000000.0,0.0,0.0,0.201233,9.098947,0.90098,0.09902
51,2019-02-26,JPMorgan Chase & Co,2018-12-31,USD,10-K,CY2018,19617,40800000000.0,14934000000.0,47471000000.0,...,-1507000000.0,256515000000.0,2622532000000.0,196710000000.0,0.0,0.0,0.207412,9.223698,0.902188,0.097812
52,2019-05-02,JPMorgan Chase & Co,2019-03-31,USD,10-Q,CY2019Q1,19617,42800000000.0,24160000000.0,47474000000.0,...,-558000000.0,259837000000.0,2737188000000.0,216173000000.0,0.0,0.0,0.19799,9.53425,0.905072,0.094928
54,2019-08-06,JPMorgan Chase & Co,2019-06-30,USD,10-Q,CY2019Q2,19617,45500000000.0,24665000000.0,47477000000.0,...,1114000000.0,263215000000.0,2727379000000.0,216137000000.0,0.0,0.0,0.210515,9.361792,0.903492,0.096508
55,2019-11-04,JPMorgan Chase & Co,2019-09-30,USD,10-Q,CY2019Q3,19617,47100000000.0,25117000000.0,47818000000.0,...,1800000000.0,264348000000.0,2764661000000.0,225063000000.0,0.0,0.0,0.209275,9.458415,0.904383,0.095617


In [30]:
financial_data_with_ratios.head(30)

Unnamed: 0,filed,company_name,end,unit,form,frame,cik,Assets - Cash and Cash Equivalents,"Assets - Property, Plant and Equipment, Net",Assets - Goodwill,...,Equity - Accumulated Other Comprehensive Income,Equity - Total Stockholders Equity,Equity - Total Liabilities and Equity,Validation - Current Liabilities Difference,Validation - A = L+E Difference,Validation - Components = Total L+E Difference,Cash_Ratio,Debt_to_Equity_Ratio,Debt_to_Assets_Ratio,Equity_Ratio
50,2019-11-04,JPMorgan Chase & Co,2018-09-30,USD,10-Q,CY2018Q3,19617,42200000000.0,14180000000.0,47483000000.0,...,-2425000000.0,258956000000.0,2615183000000.0,209707000000.0,0.0,0.0,0.201233,9.098947,0.90098,0.09902
51,2019-02-26,JPMorgan Chase & Co,2018-12-31,USD,10-K,CY2018,19617,40800000000.0,14934000000.0,47471000000.0,...,-1507000000.0,256515000000.0,2622532000000.0,196710000000.0,0.0,0.0,0.207412,9.223698,0.902188,0.097812
52,2019-05-02,JPMorgan Chase & Co,2019-03-31,USD,10-Q,CY2019Q1,19617,42800000000.0,24160000000.0,47474000000.0,...,-558000000.0,259837000000.0,2737188000000.0,216173000000.0,0.0,0.0,0.19799,9.53425,0.905072,0.094928
54,2019-08-06,JPMorgan Chase & Co,2019-06-30,USD,10-Q,CY2019Q2,19617,45500000000.0,24665000000.0,47477000000.0,...,1114000000.0,263215000000.0,2727379000000.0,216137000000.0,0.0,0.0,0.210515,9.361792,0.903492,0.096508
55,2019-11-04,JPMorgan Chase & Co,2019-09-30,USD,10-Q,CY2019Q3,19617,47100000000.0,25117000000.0,47818000000.0,...,1800000000.0,264348000000.0,2764661000000.0,225063000000.0,0.0,0.0,0.209275,9.458415,0.904383,0.095617
56,2020-02-25,JPMorgan Chase & Co,2019-12-31,USD,10-K,CY2019,19617,46500000000.0,25813000000.0,47823000000.0,...,1569000000.0,261330000000.0,2687379000000.0,210407000000.0,-902000000.0,0.0,0.221,9.283469,0.90306,0.097276
57,2020-05-07,JPMorgan Chase & Co,2020-03-31,USD,10-Q,CY2020Q1,19617,23500000000.0,25882000000.0,47800000000.0,...,7418000000.0,261262000000.0,3139431000000.0,253874000000.0,-901000000.0,0.0,0.092566,11.016409,0.917044,0.083243
58,2020-08-03,JPMorgan Chase & Co,2020-06-30,USD,10-Q,CY2020Q2,19617,25200000000.0,26301000000.0,47811000000.0,...,8789000000.0,264466000000.0,3213115000000.0,230916000000.0,-472000000.0,0.0,0.109131,11.149445,0.917827,0.08232
59,2020-11-02,JPMorgan Chase & Co,2020-09-30,USD,10-Q,CY2020Q3,19617,24700000000.0,26672000000.0,47819000000.0,...,8940000000.0,271113000000.0,3246076000000.0,234256000000.0,-1015000000.0,0.0,0.10544,10.973148,0.916766,0.083546
61,2021-02-23,JPMorgan Chase & Co,2020-12-31,USD,10-K,CY2020,19617,24400000000.0,27109000000.0,49248000000.0,...,7986000000.0,279354000000.0,3384757000000.0,231285000000.0,0.0,0.0,0.105498,11.116372,0.917467,0.082533


In [31]:
income_statement.columns

Index(['filed', 'company_name', 'end', 'unit', 'form', 'frame', 'cik',
       'Revenue - Total Revenue', 'Other - Interest Income',
       'Other - Interest Expense',
       'Income Before Tax - Income Before Income Taxes',
       'Tax - Income Tax Expense/Benefit', 'Net Income - Net Income',
       'Net Income - Net Income Attributable to Company', 'EPS - Basic EPS',
       'EPS - Diluted EPS', 'Validation - Net Income Difference'],
      dtype='object')

In [32]:
import pandas as pd
import numpy as np

def calculate_income_statement_ratios(df):
    """
    Calculate financial ratios from income statement accounts.
    
    Parameters:
    df (DataFrame): DataFrame containing the income statement accounts
    
    Returns:
    DataFrame: Original DataFrame with added ratio columns
    """
    # Create a copy of the DataFrame to avoid modifying the original
    result_df = df.copy()
    
    # Define a function to safely calculate ratios, handling division by zero and NaN
    def safe_divide(numerator, denominator):
        # Perform the division
        result = numerator / denominator
        # Replace infinite values (from division by zero) with NaN
        result.replace([np.inf, -np.inf], np.nan, inplace=True)
        return result
    
    # Define a function to check if all required columns exist
    def columns_exist(column_list):
        return all(column in df.columns for column in column_list)
    
    # Define ratios to calculate
    ratios = [
        {
            'name': 'Gross_Profit_Margin',
            'numerator': 'Gross - Gross Profit',
            'denominator': 'Revenue - Total Revenue',
            'description': 'Gross Profit / Revenue'
        },
        {
            'name': 'Operating_Profit_Margin',
            'numerator': 'Operating - Operating Income',
            'denominator': 'Revenue - Total Revenue',
            'description': 'Operating Income / Revenue'
        },
        {
            'name': 'Net_Profit_Margin',
            'numerator': 'Net Income - Net Income',
            'denominator': 'Revenue - Total Revenue',
            'description': 'Net Income / Revenue'
        },
        {
            'name': 'Interest_Coverage_Ratio',
            'numerator': 'Operating - Operating Income',
            'denominator': 'Other - Interest Expense',
            'description': 'Operating Income / Interest Expense'
        }
    ]
    
    # Track which ratios were calculated and which were skipped
    calculated_ratios = []
    skipped_ratios = []
    
    # Calculate each ratio if the required columns exist
    for ratio in ratios:
        if columns_exist([ratio['numerator'], ratio['denominator']]):
            result_df[ratio['name']] = safe_divide(
                result_df[ratio['numerator']],
                result_df[ratio['denominator']]
            )
            calculated_ratios.append(ratio['name'])
        else:
            skipped_ratios.append(f"{ratio['name']} ({ratio['description']})")
    
    # Print summary
    if calculated_ratios:
        print(f"Successfully calculated: {', '.join(calculated_ratios)}")
    if skipped_ratios:
        print(f"Skipped due to missing columns: {', '.join(skipped_ratios)}")
    
    return result_df

# Example usage:
# Assuming 'income_data' is a DataFrame containing the income statement accounts
income_data_with_ratios = calculate_income_statement_ratios(income_statement)
income_data_with_ratios.head(30)

Successfully calculated: Net_Profit_Margin
Skipped due to missing columns: Gross_Profit_Margin (Gross Profit / Revenue), Operating_Profit_Margin (Operating Income / Revenue), Interest_Coverage_Ratio (Operating Income / Interest Expense)


Unnamed: 0,filed,company_name,end,unit,form,frame,cik,Revenue - Total Revenue,Other - Interest Income,Other - Interest Expense,Income Before Tax - Income Before Income Taxes,Tax - Income Tax Expense/Benefit,Net Income - Net Income,Net Income - Net Income Attributable to Company,EPS - Basic EPS,EPS - Diluted EPS,Validation - Net Income Difference,Net_Profit_Margin
50,2019-11-04,JPMorgan Chase & Co,2018-09-30,USD,10-Q,CY2018Q3,19617,,1796000000.0,5531000000.0,,2309000000.0,8380000000.0,7948000000.0,2.35,2.34,,
51,2019-02-26,JPMorgan Chase & Co,2018-12-31,USD,10-K,CY2018,19617,108783000000.0,7248000000.0,21041000000.0,,8290000000.0,32474000000.0,30709000000.0,9.04,9.0,,0.298521
52,2019-05-02,JPMorgan Chase & Co,2019-03-31,USD,10-Q,CY2019Q1,19617,,2068000000.0,6936000000.0,,2054000000.0,9179000000.0,8753000000.0,2.65,2.65,,
54,2019-08-06,JPMorgan Chase & Co,2019-06-30,USD,10-Q,CY2019Q2,19617,,2215000000.0,7205000000.0,,1690000000.0,9652000000.0,9192000000.0,2.83,2.82,,
55,2019-11-04,JPMorgan Chase & Co,2019-09-30,USD,10-Q,CY2019Q3,19617,,2450000000.0,6893000000.0,,2325000000.0,9080000000.0,8606000000.0,2.69,2.68,,
56,2020-02-25,JPMorgan Chase & Co,2019-12-31,USD,10-K,CY2019,19617,115720000000.0,9291000000.0,26795000000.0,1816000000.0,8435000000.0,36431000000.0,34642000000.0,10.75,10.72,43050000000.0,0.31482
57,2020-05-07,JPMorgan Chase & Co,2020-03-31,USD,10-Q,CY2020Q1,19617,,2533000000.0,4722000000.0,,345000000.0,2865000000.0,2431000000.0,0.79,0.78,,
58,2020-08-03,JPMorgan Chase & Co,2020-06-30,USD,10-Q,CY2020Q2,19617,,2461000000.0,2259000000.0,,973000000.0,4687000000.0,4265000000.0,1.39,1.38,,
59,2020-11-02,JPMorgan Chase & Co,2020-09-30,USD,10-Q,CY2020Q3,19617,,2110000000.0,1687000000.0,,2326000000.0,9443000000.0,9015000000.0,2.93,2.92,,
61,2021-02-23,JPMorgan Chase & Co,2020-12-31,USD,10-K,CY2020,19617,119951000000.0,9027000000.0,9960000000.0,1586000000.0,6684000000.0,29131000000.0,27410000000.0,8.89,8.88,34229000000.0,0.242858


#### There are some ratios that are not well computed because they need computation accross different time period but it will not work for the first period so you need a way out of it. You could shift the whole dataframe or you could start the computation on the second one. 

- Note: Ratios requiring averages (like ROA, ROE, Turnovers, DSO, DIO, DPO) are calculated
- using end-of-period values here for simplicity. For more accurate analysis,
- calculate average balances: (balance_start_period + balance_end_period) / 2.
- This requires having data from the previous period (e.g., using df.shift()).

### In what follows I have defined different functions that compute Liquidity Ratios

In [33]:
import pandas as pd
import numpy as np

def calculate_cash_flow_ratios(df):
    """
    Calculate financial ratios that use cash flow statement accounts,
    often combined with balance sheet or income statement accounts.
    
    Handles missing accounts gracefully by skipping ratios when required data is unavailable.
    
    Parameters:
        df (DataFrame): DataFrame containing combined financial statement accounts
        
    Returns:
        DataFrame: Original DataFrame with added ratio columns
    """
    # Create a copy of the DataFrame to avoid modifying the original
    result_df = df.copy()
    
    # Define a function to safely calculate ratios, handling division by zero and NaN
    def safe_divide(numerator, denominator):
        """Safe division handling zeros and missing values"""
        if numerator is None or denominator is None:
            return None
        
        # Convert pandas Series to numpy arrays if needed
        if isinstance(numerator, pd.Series) and isinstance(denominator, pd.Series):
            # Create mask where either value is NaN
            mask = numerator.notna() & denominator.notna() & (denominator != 0)
            result = pd.Series(index=numerator.index, dtype='float64')
            result.loc[mask] = numerator.loc[mask] / denominator.loc[mask]
            return result
        else:
            # For scalar values
            if pd.isna(numerator) or pd.isna(denominator) or denominator == 0:
                return np.nan
            return numerator / denominator
    
    # Helper function to check if columns exist
    def columns_exist(df, columns):
        """Check if all columns exist in DataFrame"""
        return all(col in df.columns for col in columns)
    
    # Calculate intermediate values needed for ratios, only if the required columns exist
    
    # Calculate Average Total Assets
    if columns_exist(result_df, ['Assets - Total Assets']):
        result_df['Prev_Assets_Total_Assets'] = result_df['Assets - Total Assets'].shift(1)
        result_df['Avg_Total_Assets'] = (
            result_df['Assets - Total Assets'].fillna(0) + 
            result_df['Prev_Assets_Total_Assets'].fillna(0)
        ) / 2
    
    # Calculate Total Debt
    if columns_exist(result_df, ['Liabilities - Current Portion of Long-term Debt', 'Liabilities - Long-term Debt']):
        current_portion = result_df['Liabilities - Current Portion of Long-term Debt'].fillna(0)
        long_term_debt = result_df['Liabilities - Long-term Debt'].fillna(0)
        result_df['Total_Debt'] = current_portion + long_term_debt
    
    # Calculate Free Cash Flow
    if columns_exist(result_df, ['Operating - Net Cash from Operating Activities', 'Investing - Capital Expenditures']):
        operating_cash = result_df['Operating - Net Cash from Operating Activities']
        capex = result_df['Investing - Capital Expenditures']
        # Only calculate where both values are available
        mask = operating_cash.notna() & capex.notna()
        result_df['Free_Cash_Flow'] = pd.Series(index=result_df.index)
        result_df.loc[mask, 'Free_Cash_Flow'] = operating_cash.loc[mask] - capex.loc[mask]
    
    # Calculate various ratios
    
    # 1. Operating Cash Flow Ratio
    if columns_exist(result_df, ['Operating - Net Cash from Operating Activities', 'Liabilities - Total Current Liabilities']):
        result_df['Operating_Cash_Flow_Ratio'] = safe_divide(
            result_df['Operating - Net Cash from Operating Activities'],
            result_df['Liabilities - Total Current Liabilities']
        )
    
    # 2. Cash Flow to Net Income Ratio
    if columns_exist(result_df, ['Operating - Net Cash from Operating Activities', 'Net Income - Net Income']):
        result_df['Cash_Flow_to_Net_Income_Ratio'] = safe_divide(
            result_df['Operating - Net Cash from Operating Activities'],
            result_df['Net Income - Net Income']
        )
    
    # 3. Cash Flow to Total Debt Ratio
    if columns_exist(result_df, ['Operating - Net Cash from Operating Activities']) and 'Total_Debt' in result_df.columns:
        result_df['Cash_Flow_to_Total_Debt_Ratio'] = safe_divide(
            result_df['Operating - Net Cash from Operating Activities'],
            result_df['Total_Debt']
        )
    
    # 4. Capital Expenditures Coverage Ratio
    if columns_exist(result_df, ['Operating - Net Cash from Operating Activities', 'Investing - Capital Expenditures']):
        result_df['Capital_Expenditures_Coverage_Ratio'] = safe_divide(
            result_df['Operating - Net Cash from Operating Activities'],
            result_df['Investing - Capital Expenditures']
        )
    
    # 5. Free Cash Flow - Already calculated above
    
    # 7. Cash Flow Margin
    if columns_exist(result_df, ['Operating - Net Cash from Operating Activities', 'Revenue - Total Revenue']):
        result_df['Cash_Flow_Margin'] = safe_divide(
            result_df['Operating - Net Cash from Operating Activities'],
            result_df['Revenue - Total Revenue']
        )
    
    # 8. Cash Interest Coverage Ratio
    if columns_exist(result_df, [
        'Operating - Net Cash from Operating Activities',
        'Supplemental - Interest Paid',
        'Supplemental - Income Taxes Paid'
    ]):
        # Only calculate if all required values are present
        mask = (
            result_df['Operating - Net Cash from Operating Activities'].notna() &
            result_df['Supplemental - Interest Paid'].notna() &
            result_df['Supplemental - Income Taxes Paid'].notna() &
            (result_df['Supplemental - Interest Paid'] != 0)
        )
        
        result_df['Cash_Interest_Coverage_Ratio'] = pd.Series(index=result_df.index)
        if mask.any():
            result_df.loc[mask, 'Cash_Interest_Coverage_Ratio'] = (
                result_df.loc[mask, 'Operating - Net Cash from Operating Activities'] +
                result_df.loc[mask, 'Supplemental - Interest Paid'] +
                result_df.loc[mask, 'Supplemental - Income Taxes Paid']
            ) / result_df.loc[mask, 'Supplemental - Interest Paid']
    
    # 9. Dividend Payout Ratio (Cash Flow Basis)
    if columns_exist(result_df, ['Financing - Dividend Payments', 'Operating - Net Cash from Operating Activities']):
        result_df['Dividend_Payout_Ratio_Cash_Flow_Basis'] = safe_divide(
            result_df['Financing - Dividend Payments'],
            result_df['Operating - Net Cash from Operating Activities']
        )
    
    # 10. Cash Flow Return on Assets (CFROA)
    if columns_exist(result_df, ['Operating - Net Cash from Operating Activities']) and 'Avg_Total_Assets' in result_df.columns:
        result_df['Cash_Flow_Return_on_Assets'] = safe_divide(
            result_df['Operating - Net Cash from Operating Activities'],
            result_df['Avg_Total_Assets']
        )
    
    return result_df

# Example usage
financial_data_with_cash_flow_ratios = calculate_cash_flow_ratios(merged_df_final)

In [34]:
# Remove columns with identical values
financial_data_with_cash_flow_ratios = financial_data_with_cash_flow_ratios.loc[:, ~financial_data_with_cash_flow_ratios.T.duplicated(keep='first')]

In [35]:
financial_data_with_cash_flow_ratios.columns.tolist()

['filed',
 'company_name',
 'end',
 'unit',
 'form',
 'frame',
 'cik',
 'Assets - Cash and Cash Equivalents',
 'Assets - Property, Plant and Equipment, Net',
 'Assets - Goodwill',
 'Assets - Intangible Assets, Net',
 'Assets - Total Assets',
 'Liabilities - Accounts Payable',
 'Liabilities - Long-term Debt',
 'Liabilities - Total Liabilities',
 'Equity - Common Stock and Paid-in Capital',
 'Equity - Retained Earnings',
 'Equity - Accumulated Other Comprehensive Income',
 'Equity - Total Stockholders Equity',
 'Equity - Total Liabilities and Equity',
 'Validation - A = L+E Difference',
 'Validation - Components = Total L+E Difference',
 'Revenue - Total Revenue',
 'Other - Interest Income',
 'Other - Interest Expense',
 'Income Before Tax - Income Before Income Taxes',
 'Tax - Income Tax Expense/Benefit',
 'Net Income - Net Income',
 'Net Income - Net Income Attributable to Company',
 'EPS - Basic EPS',
 'EPS - Diluted EPS',
 'Validation - Net Income Difference',
 'Operating - Depreciat

In [36]:
financial_data_with_cash_flow_ratios = financial_data_with_cash_flow_ratios.sort_values(by='end')

yearly = financial_data_with_cash_flow_ratios[financial_data_with_cash_flow_ratios['frame'].str.match(r'^CY\d{4}$')]
yearly.to_csv('./test.csv')

# DCF model

To start I will be conducting the dcf analysis for the yearly data only. Once the Calendar Year Accounts (CY) I will move forward and include the quarterly data aswell. 

### Revenue

In [37]:
# First let's create a new dataframe for the DCF
dcf = pd.DataFrame()
dcf['frame'] = yearly['frame']
dcf['Revenue'] = yearly['Revenue - Total Revenue']

# Add revenue growth rate to your DCF model
dcf['Revenue_Growth_Rate'] = yearly['Revenue - Total Revenue'].pct_change()
# Replace NaN in the first row with 0 or remove it as needed
dcf['Revenue_Growth_Rate'] = dcf['Revenue_Growth_Rate'].fillna(0)



### COGS
If I have a column called Cost - Total Cost of Revenue I want to create a new column that populates the COGS column with Total Cost of Revenue column. If the column is not available I want to sum Cost of Product Revenue and Cost of Service Revenue. Before making the addition I want to make sure that these accounts are available

This way the code is more generalizable given the different columns names that I can encounter.

In [38]:
# Create COGS column from available data
if 'Cost - Total Cost of Revenue' in yearly.columns:
    # Direct assignment if column exists
    dcf['COGS'] = yearly['Cost - Total Cost of Revenue']
else:
    # Check if component columns exist and sum them
    product_cost_exists = 'Cost of Product Revenue' in yearly.columns
    service_cost_exists = 'Cost of Service Revenue' in yearly.columns
    
    if product_cost_exists and service_cost_exists:
        dcf['COGS'] = yearly['Cost of Product Revenue'] + yearly['Cost of Service Revenue']
    elif product_cost_exists:
        dcf['COGS'] = yearly['Cost of Product Revenue']
    elif service_cost_exists:
        dcf['COGS'] = yearly['Cost of Service Revenue']
    else:
        # No cost columns available
        dcf['COGS'] = None
        print("Warning: No cost of revenue columns found")

dcf['COGS pct of Revenue'] = dcf['COGS']/dcf['Revenue']




### Gross Profit Margin

In [39]:
if 'Gross - Gross Profit' in yearly.columns:
    dcf['Gross Margin'] = yearly['Gross - Gross Profit']
else: 
    dcf['Gross Margin'] = dcf['Revenue'] - dcf['COGS']



### Operating Expenses

I have few columns that start with Operating Expenses - Specific Name. 
For all the columns that start with Operating Expenses I want to bring them to my DCF dataframe and next to all those account I want a new column that computes Operating Expense / Revenue

In [40]:
op_ex = [col for col in yearly.columns if col.startswith('Operating Expenses')]

for col in op_ex:
    name = col.replace('Operating Expenses -','').strip()
    if name == 'Research and Development' or 'Sales and Marketing' or 'General and Administrative':
        dcf[col] = yearly[col]
        ratio_name = f'{name}/Revenue'
        dcf[ratio_name] = yearly[col]/dcf['Revenue']
    else:
        pass
dcf['Total Operating Expense'] = dcf[op_ex].sum(axis=1)

### EBIT (Operating Income)

In [41]:
if 'Operating - Operating Income' in yearly.columns:
    dcf['EBIT'] = yearly['Operating - Operating Income']
else:
    dcf['EBIT'] = dcf['Gross Margin'] - dcf['Total Operating Expense']

### Tax Expense

In [42]:
if 'Tax - Income Tax Expense/Benefit' in yearly.columns:
    dcf['Tax'] = yearly['Tax - Income Tax Expense/Benefit']
else: 
    # The corporate tax rate in the US is around 21% as of 2024
    dcf['Tax'] = yearly['EBIT']*0.21


### Depretiation and Amortization
Usually found in the CashFlow Statement

In [43]:
if 'Operating - Depreciation and Amortization' in yearly.columns:
    dcf['D&A'] = yearly['Operating - Depreciation and Amortization']
    dcf['pct_growth_revenue'] = dcf['D&A']/dcf['Revenue']
else:
    pass

### Net Working Capital (NWC)
For NWC I need Accounts Receivable, Inventory, Accounts Payable, Accrued Expenses and Deffered Liabilities

In [44]:
cols = ['Assets - Accounts Receivable, Net (Current)','Assets - Inventory','Liabilities - Accounts Payable']
dcf['NWC'] = 0
for col in cols:
    if col in yearly.columns:
        name = col.split('-')[1].strip()
        dcf[name] = yearly[col]
    
        if col.split('-')[0].strip()=='Liabilities':
            dcf['NWC'] = dcf['NWC'] - dcf[name]
            dcf[f'{name}/COGS'] = dcf[name]/dcf['COGS']

        else:
            dcf[f'{name}/Revenue'] = dcf[name]/dcf['Revenue']
            dcf['NWC'] = dcf['NWC'] + dcf[name]
        


In [45]:
# Compute delta NWC
dcf['Delta NWC'] = dcf['NWC'].diff()

# Replace NaN in the first row with 0 or handle it as needed
dcf['Delta NWC'] = dcf['Delta NWC'].fillna(0)

In [46]:
if 'Investing - Capital Expenditures' in yearly.columns:
    dcf['CAPEX'] = yearly['Investing - Capital Expenditures']
    dcf['CAPEX/Revenue'] = dcf['CAPEX']/dcf['Revenue']

### FCF
$$ FCF = EBIT - Tax + D\&A - DeltaNWC - CAPEX$$

In [48]:
# Compute FCF
#dcf['FCF'] = (dcf['EBIT'] - dcf['Tax'] + dcf['D&A'] - dcf['Delta NWC'] - dcf['CAPEX'])


In [49]:
dcf.head(10).T

Unnamed: 0,0,2,6,10,14,18,22
frame,CY2018,CY2019,CY2020,CY2021,CY2022,CY2023,CY2024
Revenue,108783000000.0,115720000000.0,119951000000.0,121649000000.0,128695000000.0,158104000000.0,177556000000.0
Revenue_Growth_Rate,0.0,0.063769,0.036562,0.014156,0.057921,0.228517,0.123033
COGS,,,,,,,
COGS pct of Revenue,,,,,,,
Gross Margin,,,,,,,
Total Operating Expense,0.0,0.0,0.0,0.0,0.0,0.0,0.0
EBIT,,,,,,,
Tax,8290000000.0,8435000000.0,6684000000.0,11228000000.0,8490000000.0,12060000000.0,16610000000.0
D&A,16616000000.0,12525000000.0,19259000000.0,17049000000.0,13808000000.0,12578000000.0,12132000000.0
