# 10-K Filings: Exploratory Analysis

This notebook explores the 10-K filings data that was preprocessed in the previous notebook.

In [None]:
# Import libraries
import os
import sys

import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Add the project root to the import path so the local `src` package resolves.
# The entry is made absolute so that a later change of working directory cannot
# break imports, and it is added only once so that re-running this cell does not
# keep growing sys.path.
project_root = os.path.abspath('..')
if project_root not in sys.path:
    sys.path.append(project_root)

# Import project modules
from src.data.data_loader import SECDataLoader
from src.data.data_preprocessor import FilingPreprocessor
from src.visualization.basic_plots import (
    plot_time_series, plot_metric_distribution, create_wordcloud,
    plot_sentiment_analysis, plot_comparative_metrics
)

# Set plot style
plt.style.use('fivethirtyeight')
sns.set_palette('Set2')

# Display settings: show up to 50 columns and 100 rows before pandas truncates
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 100)

## Load the preprocessed data

Load the preprocessed 10-K filings that were created in the previous notebook.

In [None]:
# Load the preprocessed filings.
# NOTE(review): unpickling can execute arbitrary code embedded in the file, so
# this path must only ever point at a pickle produced by our own pipeline.
processed_filings = pd.read_pickle('../data/processed/processed_filings.pkl')

# Display basic information
print(f"Number of filings: {len(processed_filings)}")
print(f"Companies: {processed_filings['ticker'].nunique()}")

# .tolist() converts NumPy scalars to plain Python values, so the list prints as
# [2020, 2021] instead of [np.int64(2020), np.int64(2021)] under NumPy >= 2.
# dropna() keeps a missing year from corrupting the sort order.
filing_years = sorted(processed_filings['filing_year'].dropna().unique().tolist())
print(f"Years: {filing_years}")

# Display the first few rows
processed_filings.head()

## Overview of Available Companies and Years

Let's check which companies and years are available in our dataset.

In [None]:
# Cross-tabulate tickers against filing years. Each cell holds the number of
# (non-null) accession numbers for that pair; pairs with none are shown as 0.
company_year_pivot = processed_filings.pivot_table(
    index='ticker',
    columns='filing_year',
    values='accession_number',
    aggfunc='count',
    fill_value=0,
)

# Render the table as the cell output
company_year_pivot

## Explore Filing Metadata

Let's look at the filing metadata, such as filing dates and fiscal year ends.

In [None]:
# Preview the metadata columns: first ten rows once ordered by ticker, then filing date
metadata_cols = ['ticker', 'company_name', 'filing_date', 'filing_year', 'fiscal_year_end']
metadata_view = processed_filings.loc[:, metadata_cols]
metadata_view.sort_values(by=['ticker', 'filing_date']).head(10)

In [None]:
# Plot filing dates by company: one horizontal row of markers per ticker
fig, ax = plt.subplots(figsize=(12, 6))

# Unique tickers, in order of first appearance; a ticker's position in this
# array is used as its y coordinate below.
companies = processed_filings['ticker'].unique()

# Draw one row of scatter markers per company (markers only, no connecting line)
for i, company in enumerate(companies):
    company_filings = processed_filings[processed_filings['ticker'] == company]
    ax.scatter(
        company_filings['filing_date'],
        [i] * len(company_filings),
        label=company,
        s=80
    )

# Replace the numeric y positions with the corresponding ticker symbols
ax.set_yticks(range(len(companies)))
ax.set_yticklabels(companies)

# Set labels and title
ax.set_xlabel('Filing Date', fontsize=14)
ax.set_ylabel('Company', fontsize=14)
ax.set_title('10-K Filing Dates by Company', fontsize=16, fontweight='bold')

# Format x-axis with one major tick per year, labelled with the 4-digit year
ax.xaxis.set_major_locator(mdates.YearLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))

# Add vertical grid lines for better readability
ax.grid(True, axis='x', linestyle='--', alpha=0.7)

plt.tight_layout()
plt.show()

## Explore Filing Sections

Let's look at the sections extracted from the 10-K filings.

In [None]:
# Get the section text columns.
# Any derived '*_length' column present on the frame is numeric rather than
# section text, so it is excluded; this keeps the cell safe to re-run at any
# point in the notebook.
SECTION_PREFIX = 'section_'
section_cols = [
    col for col in processed_filings.columns
    if col.startswith(SECTION_PREFIX) and not col.endswith('_length')
]

# Strip only the leading prefix when printing the section names
print(f"Available sections: {[col[len(SECTION_PREFIX):] for col in section_cols]}")

In [None]:
# Calculate section lengths in a separate frame instead of adding columns to
# processed_filings, so this cell never changes its own inputs and can be
# re-run freely.
text_section_cols = [col for col in section_cols if not col.endswith('_length')]
filing_section_lengths = pd.DataFrame(
    {
        # Non-string entries (e.g. NaN for a missing section) become NaN and are
        # therefore ignored by the mean below.
        f"{section}_length": processed_filings[section].map(
            lambda text: len(text) if isinstance(text, str) else np.nan
        )
        for section in text_section_cols
    },
    index=processed_filings.index,
)

# The length columns are exactly the ones built above (no suffix scan needed)
length_cols = list(filing_section_lengths.columns)

# Calculate mean section lengths by company; grouping by the aligned ticker
# Series yields an index named 'ticker'.
section_lengths = filing_section_lengths.groupby(processed_filings['ticker']).mean()

# Reshape to long format (one row per ticker/section pair) for seaborn
section_lengths_melted = section_lengths.reset_index().melt(
    id_vars='ticker',
    value_vars=length_cols,
    var_name='section',
    value_name='length'
)

# Clean up section names for plotting; regex=False makes these literal
# replacements regardless of the pandas version's default.
section_lengths_melted['section'] = (
    section_lengths_melted['section']
    .str.replace('section_', '', regex=False)
    .str.replace('_length', '', regex=False)
)

# Plot grouped bars: one group per section, one bar per company
plt.figure(figsize=(14, 8))
sns.barplot(x='section', y='length', hue='ticker', data=section_lengths_melted)
plt.title('Average Section Lengths by Company', fontsize=16, fontweight='bold')
plt.xlabel('Section', fontsize=14)
plt.ylabel('Average Length (characters)', fontsize=14)
plt.xticks(rotation=45, ha='right')
plt.legend(title='Company')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

## Text Analysis on Key Sections

Let's analyze key sections such as Risk Factors (Item 1A) and MD&A (Item 7).

In [None]:
# Select a company for demonstration: the first ticker that appears in the data
target_company = processed_filings['ticker'].unique()[0]
target_company_filings = processed_filings[processed_filings['ticker'] == target_company]

# Use the most recent year in which THIS company filed. Taking the maximum
# year across all companies would leave nothing to select (and make .iloc[0]
# raise IndexError) whenever this company has no filing in that year.
target_year = target_company_filings['filing_year'].max()

# Get the filing for the target company and year (first match if several)
target_filing = target_company_filings[
    target_company_filings['filing_year'] == target_year
].iloc[0]

print(f"Analyzing filing for {target_company} from {target_year}")

In [None]:
# Analyze Risk Factors section (Item 1A).
# .get() returns '' when the column is absent instead of raising KeyError.
risk_factors_text = target_filing.get('section_item_1a', '')

# Create word cloud for Risk Factors.
# The isinstance check covers a missing section stored as NaN (len() of a float
# raises TypeError); strip() treats whitespace-only text as empty.
if isinstance(risk_factors_text, str) and risk_factors_text.strip():
    fig, ax = create_wordcloud(
        risk_factors_text,
        title=f"Risk Factors Word Cloud - {target_company} ({target_year})",
        colormap='Reds'
    )
    plt.show()
else:
    print("No Risk Factors section found for this filing.")

In [None]:
# Analyze MD&A section (Item 7).
# .get() returns '' when the column is absent instead of raising KeyError.
mda_text = target_filing.get('section_item_7', '')

# Create word cloud for MD&A.
# The isinstance check covers a missing section stored as NaN (len() of a float
# raises TypeError); strip() treats whitespace-only text as empty.
if isinstance(mda_text, str) and mda_text.strip():
    fig, ax = create_wordcloud(
        mda_text,
        title=f"MD&A Word Cloud - {target_company} ({target_year})",
        colormap='Blues'
    )
    plt.show()
else:
    print("No MD&A section found for this filing.")

## Comparative Analysis Across Companies

Let's compare key aspects of the filings across all companies.

In [None]:
# Terms counted in Item 1A. Matching is by substring on lower-cased text, so
# e.g. 'risk' also counts 'risks' and 'loss' also counts 'losses'.
risk_terms = ['risk', 'uncertainty', 'adverse', 'negative', 'decline', 'decrease', 'loss']


def _section_text(filing, column):
    """Return the text stored in `column` of a filing row.

    Returns '' when the column is absent or the value is not a string (for
    example NaN for a section that could not be extracted), so callers can
    safely take len() of the result.
    """
    value = filing.get(column, '')
    return value if isinstance(value, str) else ''


# Create metrics for comparison: one record per company, from its latest filing
company_metrics = []

for ticker, company_filings in processed_filings.groupby('ticker'):
    # Get the most recent filing (last row after sorting by filing date)
    most_recent = company_filings.sort_values('filing_date').iloc[-1]

    # Section texts, normalised to '' when missing
    risk_text = _section_text(most_recent, 'section_item_1a')
    mda_text = _section_text(most_recent, 'section_item_7')
    fin_text = _section_text(most_recent, 'section_item_8')

    # Calculate metrics
    risk_length = len(risk_text)
    mda_length = len(mda_text)
    fin_length = len(fin_text)

    # Count risk-related words in Item 1A
    risk_text_lower = risk_text.lower()
    risk_count = sum(risk_text_lower.count(term) for term in risk_terms)

    # Add to metrics list
    company_metrics.append({
        'ticker': ticker,
        'company_name': most_recent.get('company_name', ticker),
        'filing_year': most_recent.get('filing_year', ''),
        'risk_section_length': risk_length,
        'mda_section_length': mda_length,
        'fin_section_length': fin_length,
        'risk_term_count': risk_count,
        # Risk terms per 1000 chars; 0 when there is no Item 1A text
        'risk_density': risk_count / risk_length * 1000 if risk_length > 0 else 0
    })

# Create DataFrame from the collected records in one step
company_metrics_df = pd.DataFrame(company_metrics)
company_metrics_df.head()

In [None]:
# Compare the three section-length metrics across companies using the
# project's plot_comparative_metrics helper (it returns a figure/axes pair).
section_length_metrics = [
    'risk_section_length',
    'mda_section_length',
    'fin_section_length',
]
fig, ax = plot_comparative_metrics(
    company_metrics_df,
    metrics=section_length_metrics,
    company_column='ticker',
    title='Comparative Section Lengths by Company',
)
plt.show()

In [None]:
# Bar chart of risk-term density, one bar per company
plt.figure(figsize=(12, 6))
sns.barplot(data=company_metrics_df, x='ticker', y='risk_density')

# Axis labels and title
plt.xlabel('Company', fontsize=14)
plt.ylabel('Risk Terms per 1000 Characters', fontsize=14)
plt.title('Risk Term Density by Company', fontsize=16, fontweight='bold')

# Tilt the ticker labels and add horizontal guide lines
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', linestyle='--', alpha=0.7)

plt.tight_layout()
plt.show()

## Temporal Analysis

Let's analyze how the filings have changed over time for a single company (the one with the most filings in the dataset).

In [None]:
# Pick the ticker with the most filings (value_counts sorts by count, descending)
filings_per_ticker = processed_filings['ticker'].value_counts()
target_company = filings_per_ticker.index[0]

# Build that company's filings as a new frame: add the character length of
# each key section, then order the rows chronologically by filing date.
is_target_company = processed_filings['ticker'] == target_company
company_filings = (
    processed_filings.loc[is_target_company]
    .assign(
        risk_section_length=lambda df: df['section_item_1a'].str.len(),
        mda_section_length=lambda df: df['section_item_7'].str.len(),
        fin_section_length=lambda df: df['section_item_8'].str.len(),
    )
    .sort_values('filing_date')
)

In [None]:
# Plot section lengths over time, one line per section
plt.figure(figsize=(14, 7))

# (length column, legend label) for each series, in drawing order
section_series = [
    ('risk_section_length', 'Risk Factors (Item 1A)'),
    ('mda_section_length', 'MD&A (Item 7)'),
    ('fin_section_length', 'Financial Statements (Item 8)'),
]
for length_column, series_label in section_series:
    plt.plot(
        company_filings['filing_date'],
        company_filings[length_column],
        marker='o',
        linewidth=2,
        label=series_label,
    )

# Title, axis labels, legend and grid
plt.title(f'Section Lengths Over Time - {target_company}', fontsize=16, fontweight='bold')
plt.xlabel('Filing Date', fontsize=14)
plt.ylabel('Section Length (characters)', fontsize=14)
plt.legend(fontsize=12)
plt.grid(True, linestyle='--', alpha=0.7)

# One major x tick per year, labelled with the 4-digit year
date_axis = plt.gca().xaxis
date_axis.set_major_locator(mdates.YearLocator())
date_axis.set_major_formatter(mdates.DateFormatter('%Y'))
plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

## Next Steps

In the next notebooks, we'll perform more in-depth analyses:

1. Text analysis using NLP techniques
2. Financial analysis extracting key metrics
3. Sentiment analysis for MD&A sections
4. Creating interactive visualizations and dashboards