# 10-K Filings: Financial Analysis

This notebook demonstrates how to extract and analyze financial metrics from 10-K filings.

In [None]:
# Import libraries
import os
import re
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from bs4 import BeautifulSoup
from matplotlib.ticker import FuncFormatter
from tqdm.notebook import tqdm

# Add project root to path for importing local modules
sys.path.append('..')

# Import project modules
from src.analysis.financial_analysis import FinancialAnalyzer
from src.visualization.basic_plots import plot_time_series, plot_comparative_metrics, plot_correlation_heatmap

# Raise pandas' display limits to 50 columns and 100 rows
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 50)

# Plot appearance: apply the matplotlib style sheet first, then the seaborn palette
plt.style.use('fivethirtyeight')
sns.set_palette('Set2')

## Load the Processed Filings

Let's load the 10-K filings that we preprocessed in the previous notebooks.

In [None]:
# Load the processed filings produced by the preprocessing notebook
processed_file = '../data/processed/processed_filings.pkl'

# Fail fast: every later cell reads `processed_df`, so continuing without it
# would only surface as an unrelated NameError further down.
if not os.path.exists(processed_file):
    raise FileNotFoundError(
        f"File not found: {processed_file}. "
        "Please run the '2_data_preprocessing.ipynb' notebook first to preprocess the filings."
    )

# NOTE: unpickling can execute arbitrary code; only load pickle files that
# were written by this project's own pipeline.
processed_df = pd.read_pickle(processed_file)
print(f"Loaded {len(processed_df)} processed filings from {processed_file}")

In [None]:
# Summarise coverage of the loaded frame: distinct tickers and filing years
ticker_values = processed_df['ticker']
year_values = processed_df['filing_year']
print(f"Filings for {ticker_values.nunique()} companies over {year_values.nunique()} years")

company_list = ', '.join(sorted(ticker_values.unique()))
print(f"Companies: {company_list}")

year_list = ', '.join(str(year) for year in sorted(year_values.unique()))
print(f"Years: {year_list}")

## Initialize the Financial Analyzer

Now, we'll create an instance of the `FinancialAnalyzer` class that we'll use to extract and analyze financial metrics from the filings.

In [None]:
# Build the analyzer instance that the remaining cells reuse
analyzer = FinancialAnalyzer()

# List every entry of `analyzer.metrics` with the first element of its value
print("Financial metrics to extract:")
for metric_name, pattern_list in analyzer.metrics.items():
    first_pattern = pattern_list[0]
    print(f"- {metric_name}: {first_pattern}")

## Extract Financial Tables

Let's extract tables from a sample filing to see what financial data is available.

In [None]:
# Use the first row of the processed frame as the worked example
sample_filing = processed_df.iloc[0]
print(f"Sample filing: {sample_filing['ticker']} from {sample_filing['filing_year']}")

sample_html = sample_filing['filing_html']

# Every table the analyzer finds in the filing's HTML
tables = analyzer.extract_tables_from_html(sample_html)
print(f"\nExtracted {len(tables)} tables from the filing.")

# Tables returned by the analyzer's financial-table extraction on the same HTML
financial_tables = analyzer.extract_financial_tables(sample_html)
print(f"Identified {len(financial_tables)} financial tables.")

In [None]:
# Preview at most three financial tables, ten rows of each.
# An empty list simply produces no iterations.
for table_number, table in enumerate(financial_tables[:3], start=1):
    print(f"\nTable {table_number} ({table['rows']} rows x {table['cols']} columns):")
    display(table['dataframe'].head(10))

## Extract Financial Metrics

Now let's extract key financial metrics, first from the sample filing and then from all the filings.

In [None]:
# Try the extraction on the single sample filing before running it on everything
sample_metrics = analyzer.extract_metrics_from_filing(sample_filing['filing_html'])

print("Financial metrics extracted from sample filing:")
for metric, value in sample_metrics.items():
    # Metrics with no extracted value are reported as a literal None
    if value is None:
        print(f"- {metric}: None")
    else:
        print(f"- {metric}: {value:,.2f}")

In [None]:
# Derive secondary metrics from the sample filing's extracted values
derived_metrics = analyzer.calculate_derived_metrics(sample_metrics)

print("\nDerived metrics:")
for metric, value in derived_metrics.items():
    # Names ending in `_margin` print as percentages; everything else with four decimals
    number_format = '.2%' if metric.endswith('_margin') else '.4f'
    rendered = "None" if value is None else format(value, number_format)
    print(f"- {metric}: {rendered}")

In [None]:
# Run the analyzer over the full set of processed filings (the slow step)
print("Extracting financial metrics for all filings... This may take several minutes.")
financial_metrics_df = analyzer.analyze_filings(processed_df)

analyzed_count = len(financial_metrics_df)
print(f"Extracted financial metrics for {analyzed_count} filings.")

In [None]:
# Preview the metrics frame, leaving out the raw HTML column so the rendered
# table stays readable
display_cols = [name for name in financial_metrics_df.columns if name != 'filing_html']
metrics_preview = financial_metrics_df[display_cols]
metrics_preview.head()

## Analyze Financial Metrics

Let's analyze the financial metrics to understand the financial performance of the companies.

In [None]:
# Headline metrics to summarise; keep only those the analyzer actually produced
candidate_metrics = ['revenue', 'net_income', 'operating_income', 'total_assets', 'total_liabilities']
metrics_cols = [name for name in candidate_metrics if name in financial_metrics_df.columns]

if metrics_cols:
    # describe() output for the available metric columns
    metrics_stats = financial_metrics_df[metrics_cols].describe()

    print("Summary statistics for financial metrics:")
    display(metrics_stats)

In [None]:
# Per-company averages of the headline metrics (requires a `ticker` column)
if 'ticker' in financial_metrics_df.columns:
    filings_by_ticker = financial_metrics_df.groupby('ticker')
    company_metrics = filings_by_ticker[metrics_cols].mean()

    print("Average financial metrics by company:")
    display(company_metrics)

In [None]:
# Format numbers for better display
def format_financial(x):
    """Render a dollar amount as a compact, human-readable string.

    Parameters
    ----------
    x : number or missing value
        Amount to format. Anything `pd.isna` flags is treated as missing.

    Returns
    -------
    str
        'N/A' for missing values. Otherwise the magnitude with two decimals,
        scaled to billions ('B') at 1e9 and above or millions ('M') at 1e6
        and above. Negative amounts put the minus sign before the dollar
        sign ('-$1.50B'), and unscaled amounts use thousands separators.
    """
    if pd.isna(x):
        return 'N/A'

    # Format the magnitude and prepend the sign so it never lands after '$'
    sign = '-' if x < 0 else ''
    magnitude = abs(x)

    if magnitude >= 1e9:
        return f'{sign}${magnitude / 1e9:.2f}B'
    if magnitude >= 1e6:
        return f'{sign}${magnitude / 1e6:.2f}M'
    return f'{sign}${magnitude:,.2f}'

# Format ratios
def format_ratio(x):
    """Render a fractional ratio as a percentage string with two decimals.

    Missing values (anything `pd.isna` flags) are returned as 'N/A'.
    """
    return 'N/A' if pd.isna(x) else f'{x:.2%}'

# Build a display-only copy of the per-company averages.
# Skipped entirely when the per-company cell did not define `company_metrics`.
if 'company_metrics' in locals():
    ratio_columns = {'profit_margin', 'roa', 'debt_to_assets', 'rd_to_revenue'}
    formatted_metrics = company_metrics.copy()

    # Ratio columns become percentages; all other columns become dollar amounts
    for col in formatted_metrics.columns:
        formatter = format_ratio if col in ratio_columns else format_financial
        formatted_metrics[col] = formatted_metrics[col].apply(formatter)

    print("\nFormatted financial metrics by company:")
    display(formatted_metrics)

## Visualize Financial Metrics

Let's create visualizations to better understand the financial metrics.

In [None]:
# Bar chart of mean revenue per company (one bar per ticker)
if 'revenue' in financial_metrics_df.columns:
    revenue_fig, revenue_ax = plt.subplots(figsize=(12, 6))

    # NOTE(review): seaborn 0.12+ deprecates `ci` in favour of `errorbar=None`;
    # switch once the project's minimum seaborn version is confirmed.
    sns.barplot(
        x='ticker',
        y='revenue',
        data=financial_metrics_df,
        estimator=np.mean,
        ci=None,
        ax=revenue_ax,
    )

    revenue_ax.set_title('Average Revenue by Company', fontsize=16, fontweight='bold')
    revenue_ax.set_xlabel('Company', fontsize=14)
    revenue_ax.set_ylabel('Revenue', fontsize=14)
    revenue_ax.grid(axis='y', linestyle='--', alpha=0.7)

    revenue_fig.tight_layout()
    plt.show()

In [None]:
# Revenue time series per company; needs both the value and the date column
required_columns = ('revenue', 'filing_date')
if all(name in financial_metrics_df.columns for name in required_columns):
    # Convert the date column in place, so later cells (including the final
    # save) see `filing_date` as datetime as well
    financial_metrics_df['filing_date'] = pd.to_datetime(financial_metrics_df['filing_date'])

    time_series_options = dict(
        date_column='filing_date',
        value_column='revenue',
        company_column='ticker',
        title='Revenue Over Time by Company',
    )
    fig, ax = plot_time_series(financial_metrics_df, **time_series_options)
    plt.show()

In [None]:
# Side-by-side comparison of the income-statement metrics that are available
metrics_to_compare = [
    name
    for name in ('revenue', 'net_income', 'operating_income')
    if name in financial_metrics_df.columns
]

# Nothing to plot without at least one metric and a company column
if metrics_to_compare and 'ticker' in financial_metrics_df.columns:
    comparison_options = dict(
        metrics=metrics_to_compare,
        company_column='ticker',
        title='Financial Metrics Comparison by Company',
    )
    fig, ax = plot_comparative_metrics(financial_metrics_df, **comparison_options)
    plt.show()

In [None]:
# Correlation heatmap over the numeric columns.
# `filing_year` is numeric but acts as an identifier, so it is left out.
numeric_frame = financial_metrics_df.select_dtypes(include=[np.number])
num_cols = [name for name in numeric_frame.columns.tolist() if name != 'filing_year']

if num_cols:
    heatmap_options = dict(
        columns=num_cols,
        title='Correlation Heatmap of Financial Metrics',
    )
    fig, ax = plot_correlation_heatmap(financial_metrics_df, **heatmap_options)
    plt.show()

## Comparative Financial Analysis

Let's compare financial metrics across companies and years.

In [None]:
# Run the analyzer's comparison over the full metrics frame; the result is
# looked up by key in the cells that follow
financial_comparison = analyzer.compare_financials(financial_metrics_df)

# Show the per-company view when the result provides one
if 'by_company' in financial_comparison:
    by_company = financial_comparison['by_company']
    print("Financial metrics comparison by company:")
    display(by_company)

In [None]:
# Show the per-year view when the comparison result provides one
if 'by_year' in financial_comparison:
    by_year = financial_comparison['by_year']
    print("Financial metrics comparison by year:")
    display(by_year)

In [None]:
# Show the growth-rate table only when the key is present and the frame has content
if 'growth_rates' in financial_comparison:
    growth_table = financial_comparison['growth_rates']
    if not growth_table.empty:
        print("Year-over-year growth rates:")
        display(growth_table)

In [None]:
# Visualize growth rates: one bar chart per `*_growth` column.
# `FuncFormatter` is imported once in the notebook's import cell rather than
# on every loop iteration.
if 'growth_rates' in financial_comparison and not financial_comparison['growth_rates'].empty:
    growth_rates = financial_comparison['growth_rates']

    # Only string labels can carry the `_growth` suffix; skip anything else
    # instead of failing on a non-string column label
    growth_cols = [
        col for col in growth_rates.columns
        if isinstance(col, str) and col.endswith('_growth')
    ]

    for col in growth_cols:
        # 'net_income_growth' -> 'Net Income' (underscores become spaces,
        # matching how the ratio charts build their titles)
        metric_label = col.replace('_growth', '').replace('_', ' ').title()

        fig, ax = plt.subplots(figsize=(12, 6))
        sns.barplot(x='ticker', y=col, data=growth_rates, ax=ax)
        ax.set_title(f'{metric_label} Growth Rate by Company', fontsize=16, fontweight='bold')
        ax.set_xlabel('Company', fontsize=14)
        ax.set_ylabel('Growth Rate (%)', fontsize=14)

        # The `%` format spec multiplies the tick value by 100 before printing
        ax.yaxis.set_major_formatter(FuncFormatter(lambda x, _: f'{x:.0%}'))

        # Zero line separates growth from decline
        ax.axhline(y=0, color='red', linestyle='--', alpha=0.7)

        ax.grid(axis='y', linestyle='--', alpha=0.7)
        fig.tight_layout()
        plt.show()

## Financial Ratios Analysis

Let's analyze financial ratios to better understand the financial health of the companies.

In [None]:
# Financial ratio columns to chart; keep only those present in the metrics frame
ratio_cols = ['profit_margin', 'roa', 'debt_to_assets', 'rd_to_revenue']
available_ratios = [col for col in ratio_cols if col in financial_metrics_df.columns]

# One box plot per available ratio, showing each company's spread across filings.
# An empty list produces no figures. `FuncFormatter` comes from the notebook's
# import cell instead of being re-imported on every iteration.
for ratio in available_ratios:
    ratio_label = ratio.replace("_", " ").title()

    fig, ax = plt.subplots(figsize=(12, 6))
    sns.boxplot(x='ticker', y=ratio, data=financial_metrics_df, ax=ax)
    ax.set_title(f'{ratio_label} by Company', fontsize=16, fontweight='bold')
    ax.set_xlabel('Company', fontsize=14)
    ax.set_ylabel(ratio_label, fontsize=14)

    # The `%` format spec multiplies the tick value by 100 before printing
    ax.yaxis.set_major_formatter(FuncFormatter(lambda x, _: f'{x:.0%}'))

    ax.grid(axis='y', linestyle='--', alpha=0.7)
    fig.tight_layout()
    plt.show()

In [None]:
# Dashboard: one horizontal bar chart per ratio (up to four) in a 2x2 grid.
# Relies on `available_ratios` and `metrics_to_compare` from earlier cells and
# on `FuncFormatter` from the notebook's import cell.
if 'ticker' in financial_metrics_df.columns and available_ratios:
    # Average every ratio (plus the comparison metrics) per company
    dashboard_cols = available_ratios + [
        m for m in metrics_to_compare if m in financial_metrics_df.columns
    ]
    metrics_by_company = (
        financial_metrics_df.groupby('ticker')[dashboard_cols].mean().reset_index()
    )

    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    axes = axes.flatten()

    # The grid has four panels, so at most four ratios are drawn
    panel_ratios = available_ratios[:len(axes)]

    for ax, metric in zip(axes, panel_ratios):
        metric_label = metric.replace("_", " ").title()

        # Sort ascending so the largest value ends up at the top of the panel
        sorted_df = metrics_by_company.sort_values(metric)
        ax.barh(sorted_df['ticker'], sorted_df[metric])

        ax.set_title(metric_label, fontsize=14, fontweight='bold')
        ax.set_xlabel(metric_label, fontsize=12)
        ax.set_ylabel('Company', fontsize=12)

        # The `%` format spec multiplies the tick value by 100 before printing
        ax.xaxis.set_major_formatter(FuncFormatter(lambda x, _: f'{x:.0%}'))
        ax.grid(axis='x', linestyle='--', alpha=0.7)

    # Hide panels that received no ratio, so a notebook with fewer than four
    # ratios does not show blank framed axes
    for unused_ax in axes[len(panel_ratios):]:
        unused_ax.set_visible(False)

    plt.suptitle('Key Financial Ratios by Company', fontsize=20, fontweight='bold')
    # Leave the top 5% of the figure free for the suptitle
    plt.tight_layout(rect=[0, 0, 1, 0.95])
    plt.show()

## Save the Financial Analysis Results

Let's save the results of our financial analysis for use in later notebooks.

In [None]:
# Make sure the results directory exists. `exist_ok=True` makes this a no-op
# when it is already there and avoids a separate check-then-create step.
results_dir = '../data/results'
os.makedirs(results_dir, exist_ok=True)

# Save the per-filing metrics frame
financial_metrics_file = f'{results_dir}/financial_analysis.pkl'
financial_metrics_df.to_pickle(financial_metrics_file)
print(f"Saved financial analysis results to {financial_metrics_file}")

# Save the comparison results; pd.to_pickle accepts an arbitrary Python object
comparison_file = f'{results_dir}/financial_comparison.pkl'
pd.to_pickle(financial_comparison, comparison_file)
print(f"Saved financial comparison results to {comparison_file}")

## Next Steps

In the next notebook (`6_visualizations.ipynb`), we'll create comprehensive visualizations and dashboards that combine the results of our text analysis, sentiment analysis, and financial analysis, including:
1. Interactive time series visualizations
2. Comparative dashboards across companies
3. Correlation analysis between financial metrics and sentiment
4. Trend analysis and forecasting