# 10-K Filings: Data Preprocessing

This notebook demonstrates how to preprocess 10-K filings by extracting sections, cleaning text, and preparing the data for analysis.

In [None]:
# Import libraries
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
import re
from bs4 import BeautifulSoup

# Add the parent of the current working directory to the import path so the
# project-local modules imported below can be found. The membership check keeps
# the cell idempotent: re-running it no longer appends a duplicate '..' entry
# to sys.path on every execution.
if '..' not in sys.path:
    sys.path.append('..')

# Import project modules
from src.data.data_preprocessor import FilingPreprocessor

# Plot appearance: 'fivethirtyeight' matplotlib style with seaborn's 'Set2' palette
plt.style.use('fivethirtyeight')
sns.set_palette('Set2')

# pandas display limits (maximum columns / rows shown when a frame is rendered)
for option_name, option_value in (
    ('display.max_columns', 50),
    ('display.max_rows', 100),
):
    pd.set_option(option_name, option_value)

## Load the Raw Filings

Let's load the 10-K filings that we downloaded in the previous notebook (`1_data_extraction.ipynb`).

In [None]:
# Load the filings
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling, so
# only load pickle files that this project produced itself.
filings_file = '../data/raw/10k_filings.pkl'

# Fail fast when the input is missing. Merely printing a message would let the
# following cells die with a NameError on `filings_df` or, worse, silently
# reuse a stale `filings_df` left in the kernel by an earlier run.
if not os.path.exists(filings_file):
    raise FileNotFoundError(
        f"File not found: {filings_file}. "
        "Please run the '1_data_extraction.ipynb' notebook first to download the filings."
    )

filings_df = pd.read_pickle(filings_file)
print(f"Loaded {len(filings_df)} filings from {filings_file}")

In [None]:
# Summarize coverage: number of distinct companies and filing years, then list them
n_companies = filings_df['ticker'].nunique()
n_years = filings_df['filing_year'].nunique()
print(f"Filings for {n_companies} companies over {n_years} years")

company_list = ', '.join(sorted(filings_df['ticker'].unique()))
print(f"Companies: {company_list}")

year_list = ', '.join(str(year) for year in sorted(filings_df['filing_year'].unique()))
print(f"Years: {year_list}")

## Initialize the Preprocessor

Now, we'll create an instance of the `FilingPreprocessor` class that we'll use to extract sections and clean the text.

In [None]:
# Build the preprocessor instance used by the rest of the notebook
preprocessor = FilingPreprocessor()

# List each entry of the preprocessor's `section_patterns` mapping
print("Available section patterns:")
section_patterns = preprocessor.section_patterns
for section_name, pattern in section_patterns.items():
    print(f"- {section_name}: {pattern}")

## Sample Processing

Before processing all filings, let's test the preprocessor on a single filing to see how it works.

In [None]:
# Use the first row of the filings frame as the worked example
sample_filing = filings_df.iloc[0]
print(f"Sample filing: {sample_filing['ticker']} from {sample_filing['filing_year']}")

# Run the filing's HTML through the preprocessor's clean_html step and
# preview the first 500 characters of the result
sample_html = sample_filing['filing_html']
sample_text = preprocessor.clean_html(sample_html)
sample_text_length = len(sample_text)
print(f"\nExtracted {sample_text_length} characters of plain text")
sample_text_preview = sample_text[:500]
print(f"Sample of plain text: {sample_text_preview}...")

In [None]:
# Split the sample's plain text into its sections
sample_sections = preprocessor.extract_all_sections(sample_text)

# Report the length of every section that came back non-empty
print("Extracted sections:")
for section_name, section_text in sample_sections.items():
    if not section_text:
        continue  # nothing was extracted for this section
    section_length = len(section_text)
    print(f"- {section_name}: {section_length} characters")

In [None]:
# Preview the first 1000 characters of one section (Risk Factors) from the sample
section_name = 'item_1a'  # Risk Factors
section_missing = section_name not in sample_sections or not sample_sections[section_name]

if section_missing:
    print(f"Section {section_name} not found in the sample filing.")
else:
    print(f"Sample of {section_name} (Risk Factors):")
    print(f"{sample_sections[section_name][:1000]}...")

## Process All Filings

Now that we've tested the preprocessor on a sample filing, let's process all the filings to extract sections and clean the text.

In [None]:
# Run the preprocessor over every filing, with both section extraction and
# text cleaning switched on
print("Processing all filings... This may take several minutes.")

processing_options = {'extract_sections': True, 'clean_text': True}
processed_df = preprocessor.process_filings(filings_df, **processing_options)

print(f"Processed {len(processed_df)} filings.")

## Explore the Processed Data

Let's explore the processed data to see what we've extracted and how the data looks.

In [None]:
# Inventory the columns of the processed frame
print("Columns in the processed DataFrame:")

# Bucket the column names by naming convention. The buckets are independent
# filters, so a name may land in more than one of them.
all_columns = list(processed_df.columns)
original_cols = [name for name in all_columns if not name.startswith(('section_', 'clean_'))]
section_cols = [
    name for name in all_columns
    if name.startswith('section_') and not name.endswith('_chars')
]
count_cols = [name for name in all_columns if name.endswith('_chars')]
clean_cols = [name for name in all_columns if name.startswith('clean_')]

print(f"\nOriginal columns ({len(original_cols)}): {original_cols}")
print(f"\nSection columns ({len(section_cols)}): {section_cols}")
print(f"\nSection character count columns ({len(count_cols)}): {count_cols}")
print(f"\nCleaned text columns ({len(clean_cols)}): {clean_cols}")

In [None]:
# Check section extraction success
#
# For every section column, report:
#   count      - number of filings where the section was found (char count > 0)
#   percentage - that count as a share of all processed filings
#   avg_length - mean character count among the filings where it was found
#
# The rows are collected first and the frame is built once, with explicit
# columns, so the result still has a 'percentage' column (and can be sorted)
# when no section qualifies.
stats_columns = ['count', 'percentage', 'avg_length']
section_prefix_len = len('section_')
n_filings = len(processed_df)

stats_rows = []   # one dict of statistics per section
stats_index = []  # section names, parallel to stats_rows

for section_col in section_cols:
    # Strip only the leading prefix; str.replace would also remove any later
    # occurrence of 'section_' inside the name.
    section_name = section_col[section_prefix_len:]

    # Sections without a matching character-count column are skipped
    count_col = f"{section_col}_chars"
    if count_col not in processed_df.columns:
        continue

    char_counts = processed_df[count_col]
    found = char_counts > 0
    non_empty = int(found.sum())

    stats_index.append(section_name)
    stats_rows.append({
        'count': non_empty,
        # Guard the division so an empty frame yields NaN instead of a warning
        'percentage': non_empty / n_filings * 100 if n_filings else float('nan'),
        'avg_length': char_counts[found].mean(),
    })

section_stats = pd.DataFrame(stats_rows, index=stats_index, columns=stats_columns)

# Sort by percentage found (descending)
section_stats = section_stats.sort_values('percentage', ascending=False)

# Display stats
section_stats

## Visualize Section Extraction Success

Let's visualize the section extraction success rate to see which sections were most commonly found.

In [None]:
# Bar chart of the share of filings in which each section was found, with a
# dashed reference line at 80%
fig, ax = plt.subplots(figsize=(14, 6))
section_stats['percentage'].plot(kind='bar', color='steelblue', ax=ax)

ax.set_title('Section Extraction Success Rate', fontsize=16, fontweight='bold')
ax.set_xlabel('Section', fontsize=14)
ax.set_ylabel('Percentage Found (%)', fontsize=14)
ax.axhline(y=80, color='red', linestyle='--', alpha=0.7, label='80% Threshold')

# Slant the section labels so they do not overlap
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

ax.legend()
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

## Analyze Section Lengths by Company

Let's compare the lengths of key sections across different companies to see if there are any notable differences.

In [None]:
# Select key sections for comparison (only those whose character-count column
# actually exists in the processed frame)
key_sections = ['item_1', 'item_1a', 'item_7', 'item_7a', 'item_8']
key_section_cols = [
    f"section_{section}_chars"
    for section in key_sections
    if f"section_{section}_chars" in processed_df.columns
]

# Calculate average section lengths by company.
# A character count of 0 means the section was not extracted, so those entries
# are masked to NaN and left out of the mean. Otherwise extraction failures
# would drag the average down and be misread as shorter sections.
found_lengths = processed_df[key_section_cols].where(processed_df[key_section_cols] > 0)
section_lengths_by_company = (
    found_lengths
    .groupby(processed_df['ticker'])
    .mean()
    .reset_index()
)

# Melt the DataFrame for easier plotting (one row per company/section pair)
section_lengths_melted = pd.melt(
    section_lengths_by_company,
    id_vars='ticker',
    value_vars=key_section_cols,
    var_name='section',
    value_name='avg_chars'
)

# Clean up section names for display: 'section_item_1a_chars' -> 'item_1a'
section_lengths_melted['section'] = (
    section_lengths_melted['section']
    .str.replace('section_', '', regex=False)
    .str.replace('_chars', '', regex=False)
)

# Plot grouped bars: one group per section, one bar per company
fig, ax = plt.subplots(figsize=(14, 8))
sns.barplot(x='section', y='avg_chars', hue='ticker', data=section_lengths_melted, ax=ax)
ax.set_title('Average Section Lengths by Company', fontsize=16, fontweight='bold')
ax.set_xlabel('Section', fontsize=14)
ax.set_ylabel('Average Length (characters)', fontsize=14)
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.legend(title='Company')
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

## Analyze Section Lengths Over Time

Let's see if there are any trends in section lengths over time.

In [None]:
# Calculate average section lengths by year.
# Relies on `key_section_cols` defined in the by-company cell.
# A character count of 0 means the section was not extracted, so those entries
# are masked to NaN and left out of the mean. Otherwise a year with more
# extraction failures would look like a year with shorter sections.
found_lengths_by_filing = processed_df[key_section_cols].where(processed_df[key_section_cols] > 0)
section_lengths_by_year = (
    found_lengths_by_filing
    .groupby(processed_df['filing_year'])
    .mean()
    .reset_index()
)

# Melt the DataFrame for easier plotting (one row per year/section pair)
section_lengths_by_year_melted = pd.melt(
    section_lengths_by_year,
    id_vars='filing_year',
    value_vars=key_section_cols,
    var_name='section',
    value_name='avg_chars'
)

# Clean up section names for display: 'section_item_1a_chars' -> 'item_1a'
section_lengths_by_year_melted['section'] = (
    section_lengths_by_year_melted['section']
    .str.replace('section_', '', regex=False)
    .str.replace('_chars', '', regex=False)
)

# Plot one line per section across filing years
fig, ax = plt.subplots(figsize=(14, 8))
sns.lineplot(
    x='filing_year',
    y='avg_chars',
    hue='section',
    style='section',
    marker='o',
    data=section_lengths_by_year_melted,
    ax=ax,
)
ax.set_title('Average Section Lengths Over Time', fontsize=16, fontweight='bold')
ax.set_xlabel('Filing Year', fontsize=14)
ax.set_ylabel('Average Length (characters)', fontsize=14)
ax.legend(title='Section')
ax.grid(True, linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

## Sample of Cleaned Text

Let's look at a sample of the cleaned text to see how it differs from the original.

In [None]:
# Get a sample filing with non-empty Risk Factors section.
# The lookup is guarded: if the character-count column is missing, or no filing
# has a Risk Factors section, the cell reports that instead of failing with a
# KeyError / IndexError.
risk_chars_col = 'section_item_1a_chars'
if risk_chars_col in processed_df.columns:
    filings_with_risk = processed_df[processed_df[risk_chars_col] > 0]
else:
    filings_with_risk = processed_df.iloc[0:0]  # empty frame, same columns

if filings_with_risk.empty:
    print("Section item_1a not found in any processed filing.")
else:
    sample_idx = filings_with_risk.index[0]
    # Positional access always yields a single row, even if index labels repeat
    sample_processed = filings_with_risk.iloc[0]

    # Display information about the sample
    print(f"Sample filing: {sample_processed['ticker']} from {sample_processed['filing_year']}")

    # Compare original and cleaned text for Risk Factors section
    original_text = sample_processed['section_item_1a']
    cleaned_text = sample_processed['clean_item_1a'] if 'clean_item_1a' in sample_processed else ''
    if not isinstance(cleaned_text, str):
        # A missing cleaned value (e.g. None/NaN) has no length; show it as empty
        cleaned_text = ''

    print(f"\nOriginal text length: {len(original_text)} characters")
    print(f"Cleaned text length: {len(cleaned_text)} characters")
    print(f"\nOriginal text sample: {original_text[:500]}...")
    print(f"\nCleaned text sample: {cleaned_text[:500]}...")

## Save the Processed Data

Now that we have processed the filings, let's save the processed data for use in later analysis.

In [None]:
# Create the processed data directory if it doesn't exist.
# exist_ok=True replaces the separate existence check, so the call is a no-op
# when the directory is already there and there is no gap between checking
# and creating.
os.makedirs('../data/processed', exist_ok=True)

# Save the processed filings to a pickle file
output_file = '../data/processed/processed_filings.pkl'
processed_df.to_pickle(output_file)
print(f"Saved processed data to {output_file}")

# Save section extraction stats as CSV
stats_file = '../data/processed/section_extraction_stats.csv'
section_stats.to_csv(stats_file)
print(f"Saved section extraction stats to {stats_file}")

## Next Steps

In the next notebook (`3_exploratory_analysis.ipynb`), we'll explore the processed filings to gain insights into the data, including:
1. Overview of available companies and years
2. Analysis of section lengths and content
3. Comparison of sections across companies and time
4. Initial text analysis and word frequency analysis