# 10-K Filings: Text Analysis

This notebook demonstrates how to perform text analysis on 10-K filings: word frequency analysis, word clouds, company-level word comparisons, topic modeling, and summary text metrics such as sentiment trends.

In [None]:
# Import libraries
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF

# Add project root to path for importing local modules
sys.path.append('..')

# Import project modules
from src.analysis.text_analysis import TextAnalyzer
from src.visualization.basic_plots import create_wordcloud

# Plot styling: matplotlib theme first, then the seaborn color palette
plt.style.use('fivethirtyeight')
sns.set_palette('Set2')

# Pandas display limits (columns, rows)
for option_name, option_value in (('display.max_columns', 50), ('display.max_rows', 100)):
    pd.set_option(option_name, option_value)

# Make sure the NLTK tokenizer and stopword data are available locally;
# if either lookup fails, fetch both packages
try:
    for resource_path in ('tokenizers/punkt', 'corpora/stopwords'):
        nltk.data.find(resource_path)
except LookupError:
    for package_name in ('punkt', 'stopwords'):
        nltk.download(package_name)

## Load the Processed Filings

Let's load the 10-K filings that we preprocessed in the `2_data_preprocessing.ipynb` notebook.

In [None]:
# Load the processed filings
processed_file = '../data/processed/processed_filings.pkl'

# Fail fast when the input is missing: every later cell reads `processed_df`,
# so only printing a message would defer the failure to an unrelated
# NameError further down the notebook.
if not os.path.exists(processed_file):
    raise FileNotFoundError(
        f"File not found: {processed_file}. "
        "Please run the '2_data_preprocessing.ipynb' notebook first to preprocess the filings."
    )

# NOTE: unpickling can execute arbitrary code; only load pickle files that
# were produced by this project's own preprocessing step.
processed_df = pd.read_pickle(processed_file)
print(f"Loaded {len(processed_df)} processed filings from {processed_file}")

In [None]:
# Summarize coverage: number of companies and filing years, then list both
ticker_column = processed_df['ticker']
year_column = processed_df['filing_year']
print(f"Filings for {ticker_column.nunique()} companies over {year_column.nunique()} years")

company_list = ', '.join(sorted(ticker_column.unique()))
print(f"Companies: {company_list}")

year_list = ', '.join(str(year) for year in sorted(year_column.unique()))
print(f"Years: {year_list}")

## Initialize the Text Analyzer

Now, we'll create an instance of the `TextAnalyzer` class that we'll use to analyze the text content of the filings.

In [None]:
# Create the analyzer used by the rest of the notebook
analyzer = TextAnalyzer()

# Preview up to 20 entries from each of the analyzer's term collections:
# financial domain-specific stopwords, risk-related terms, positive business terms.
# Attributes are looked up lazily so each heading prints before its list is read.
for heading, attribute_name in (
    ("Sample of financial domain-specific stopwords:", 'financial_stopwords'),
    ("\nSample of risk-related terms:", 'risk_terms'),
    ("\nSample of positive business terms:", 'positive_terms'),
):
    print(heading)
    print(list(getattr(analyzer, attribute_name))[:20])

## Word Frequency Analysis

Let's analyze the most common words in different sections of the 10-K filings.

In [None]:
# Human-readable titles for the sections we analyze
section_names = {
    'item_1': 'Business',
    'item_1a': 'Risk Factors',
    'item_7': 'Management Discussion & Analysis',
    'item_7a': 'Market Risk Disclosures'
}

# Section keys in the order they are analyzed (same order as the titles above)
key_sections = list(section_names)

# Word-frequency result per section key
section_word_frequencies = {}

for section in key_sections:
    section_col = f'section_{section}'

    # Skip sections that were not extracted into the processed data
    if section_col not in processed_df.columns:
        continue

    # Pool the text of every filing that has this section
    section_series = processed_df[section_col].dropna().astype(str)
    combined_text = ' '.join(section_series)

    # Top 50 words with stopwords removed
    section_word_frequencies[section] = analyzer.get_word_frequencies(
        combined_text, top_n=50, remove_stopwords=True
    )

In [None]:
# One horizontal bar chart of the 20 most frequent words per section
for section, word_freq in section_word_frequencies.items():
    # Nothing to draw when the analyzer produced no frequencies
    if word_freq is None or word_freq.empty:
        continue

    top_words = word_freq.head(20)
    section_title = section_names.get(section, section)

    fig, ax = plt.subplots(figsize=(12, 6))
    sns.barplot(x='frequency', y='word', data=top_words, palette='viridis', ax=ax)
    ax.set_title(f'Top 20 Words in {section_title}', fontsize=16, fontweight='bold')
    ax.set_xlabel('Frequency', fontsize=14)
    ax.set_ylabel('Word', fontsize=14)
    ax.grid(axis='x', linestyle='--', alpha=0.7)
    fig.tight_layout()
    plt.show()

## Word Clouds

Let's create word clouds for each section to visualize the most common words.

In [None]:
# Colormap per section; any section not listed falls back to 'viridis'
section_colormaps = {
    'item_1a': 'Reds',    # Risk Factors
    'item_7': 'Blues',    # MD&A
    'item_1': 'Greens',   # Business
}

# Draw one word cloud per section from its word-frequency table
for section, word_freq in section_word_frequencies.items():
    if word_freq is None or word_freq.empty:
        continue

    # word -> frequency mapping expected by the word cloud helper
    word_freq_dict = {
        word: frequency
        for word, frequency in zip(word_freq['word'], word_freq['frequency'])
    }
    colormap = section_colormaps.get(section, 'viridis')

    fig, ax = create_wordcloud(
        word_freq_dict,
        title=f'Word Cloud for {section_names.get(section, section)}',
        figsize=(12, 8),
        colormap=colormap,
        max_words=100
    )
    plt.show()

## Company-Specific Word Analysis

Let's compare word usage across different companies.

In [None]:
# Select a section for company comparison
comparison_section = 'item_1a'  # Risk Factors
section_col = f'section_{comparison_section}'

# Defined before the column check so the comparison cells further down still
# run (and simply report that frequencies are unavailable) when the section
# column is missing, instead of failing with a NameError.
companies = sorted(processed_df['ticker'].unique())
company_word_frequencies = {}  # ticker -> word-frequency table

if section_col in processed_df.columns:
    for company in companies:
        # Most recent filing first
        company_filings = processed_df[processed_df['ticker'] == company].sort_values(
            'filing_date', ascending=False
        )
        if company_filings.empty:
            continue

        section_text = company_filings.iloc[0][section_col]

        # Skip missing (non-string) or empty section text
        if isinstance(section_text, str) and section_text:
            company_word_frequencies[company] = analyzer.get_word_frequencies(
                section_text, top_n=30, remove_stopwords=True
            )

    print(f"Analyzed word frequencies for {len(company_word_frequencies)} companies in the {section_names.get(comparison_section, comparison_section)} section.")
else:
    print(f"Section '{comparison_section}' not found in the processed data.")

In [None]:
# Create a function to compare word frequencies across companies
def compare_word_frequencies(company1, company2, top_n=15):
    """Plot the words whose frequencies differ most between two companies.

    Looks up each company's word-frequency table in the notebook-level
    ``company_word_frequencies`` dict, outer-merges the two tables on
    ``word`` (a word absent from one table counts as frequency 0 there), and
    draws a horizontal bar chart of ``frequency(company1) - frequency(company2)``
    for the ``top_n`` words with the largest absolute difference.

    Args:
        company1: Key into ``company_word_frequencies``; positive bars
            (steelblue) mean this company uses the word more.
        company2: Key into ``company_word_frequencies``; non-positive bars
            (firebrick) are attributed to this company.
        top_n: Number of most-differentiating words to plot and return.

    Returns:
        The ``top_n`` rows of the merged table (including the ``difference``
        and ``abs_difference`` columns), or ``None`` when frequencies for
        either company are unavailable.
    """
    # Local import: only needed to build the legend's proxy handles
    from matplotlib.patches import Patch

    if company1 not in company_word_frequencies or company2 not in company_word_frequencies:
        print(f"Word frequencies not available for {company1} or {company2}.")
        return None

    # Get word frequencies
    freq1 = company_word_frequencies[company1]
    freq2 = company_word_frequencies[company2]

    # Merge the frequencies; words missing on one side get frequency 0
    merged = pd.merge(
        freq1, freq2, on='word', how='outer',
        suffixes=(f'_{company1}', f'_{company2}')
    ).fillna(0)

    # Signed difference and its magnitude, used for ranking
    merged['difference'] = merged[f'frequency_{company1}'] - merged[f'frequency_{company2}']
    merged['abs_difference'] = merged['difference'].abs()

    # Get top differentiating words
    top_diff = merged.sort_values('abs_difference', ascending=False).head(top_n)

    # One color per bar, chosen by which company uses the word more
    color_company1, color_company2 = 'steelblue', 'firebrick'
    bar_colors = [
        color_company1 if difference > 0 else color_company2
        for difference in top_diff['difference']
    ]

    plt.figure(figsize=(12, 8))
    plt.barh(top_diff['word'], top_diff['difference'], color=bar_colors)

    # Add a vertical line at x=0
    plt.axvline(x=0, color='black', linestyle='-', alpha=0.3)

    # Set labels and title
    plt.xlabel('Difference in Word Frequency', fontsize=14)
    plt.ylabel('Word', fontsize=14)
    plt.title(f'Word Frequency Differences in {section_names.get(comparison_section, comparison_section)}: {company1} vs {company2}', fontsize=16, fontweight='bold')

    # Explicit proxy handles: passing only labels to legend() binds them to
    # whichever artists were drawn first, so the swatches would not reliably
    # match the steelblue/firebrick meaning.
    plt.legend(
        handles=[
            Patch(color=color_company1, label=f'More in {company1}'),
            Patch(color=color_company2, label=f'More in {company2}'),
        ],
        loc='lower right'
    )

    # Add grid lines
    plt.grid(axis='x', linestyle='--', alpha=0.7)

    plt.tight_layout()
    plt.show()

    return top_diff

In [None]:
# Compare the first two companies in sorted ticker order, if there are at least two
if len(companies) >= 2:
    first_company, second_company = companies[0], companies[1]
    compare_word_frequencies(first_company, second_company)

## Topic Modeling

Let's use topic modeling to identify key themes in the 10-K filings.

In [None]:
# Section whose text is fed to the topic model
topic_section = 'item_7'  # MD&A
section_col = f'section_{topic_section}'

if section_col not in processed_df.columns:
    print(f"Section '{topic_section}' not found in the processed data.")
else:
    # Keep only filings whose character count for this section is positive
    has_section_text = processed_df[f'section_{topic_section}_chars'] > 0
    section_texts = processed_df.loc[has_section_text, section_col].tolist()

    if len(section_texts) <= 5:
        print(f"Not enough data for topic modeling. Found only {len(section_texts)} non-empty sections.")
    else:
        n_topics = 5  # Number of topics to extract

        # Fit the topic model (Latent Dirichlet Allocation)
        model, vectorizer, topic_words, doc_topic_matrix = analyzer.extract_topics(
            section_texts,
            n_topics=n_topics,
            n_top_words=10,
            method='lda'
        )

        print(f"Extracted {n_topics} topics from {len(section_texts)} {section_names.get(topic_section, topic_section)} sections.")

In [None]:
# Display the topics and top words
if 'topic_words' in locals() and topic_words:
    # One column per topic holding that topic's top words.
    # Each item of `topic_words` is unpacked as (topic index, list of words).
    topics_df = pd.DataFrame(
        {f'Topic {topic_idx+1}': words for topic_idx, words in topic_words}
    )

    # Jupyter only auto-renders a cell's last *top-level* expression; a bare
    # `topics_df` nested inside this `if` would show nothing, so display it
    # explicitly (`display` is provided by the IPython kernel).
    display(topics_df)

In [None]:
# Visualize the topics
if 'topic_words' in locals() and topic_words:
    # Registered matplotlib colormap names, one per topic (reused cyclically
    # when there are more topics than entries). Names such as 'Blues_3' are
    # not registered in matplotlib and make WordCloud fail.
    topic_colormaps = ['Blues', 'Greens', 'Oranges', 'Purples', 'Reds']

    # squeeze=False keeps `axes` two-dimensional, so indexing by topic also
    # works when n_topics == 1
    fig, axes = plt.subplots(1, n_topics, figsize=(20, 4), squeeze=False)
    topic_axes = axes[0]

    # Create a word cloud for each topic
    for topic_idx, words in topic_words:
        # Weight words by rank: first word 1, second 1/2, third 1/3, ...
        word_importance = {word: 1/(i+1) for i, word in enumerate(words)}

        wordcloud = WordCloud(
            background_color='white',
            width=400,
            height=300,
            colormap=topic_colormaps[topic_idx % len(topic_colormaps)],
            max_words=10
        ).generate_from_frequencies(word_importance)

        # Add to subplot
        ax = topic_axes[topic_idx]
        ax.imshow(wordcloud, interpolation='bilinear')
        ax.set_title(f'Topic {topic_idx+1}', fontsize=14, fontweight='bold')
        ax.axis('off')

    plt.suptitle(f'Topics in {section_names.get(topic_section, topic_section)}', fontsize=16, fontweight='bold')
    # Leave room at the top for the suptitle
    plt.tight_layout(rect=[0, 0, 1, 0.95])
    plt.show()

## Topic Distribution by Company

Let's analyze how the topics are distributed across different companies.

In [None]:
# Prepare data for topic distribution analysis
if 'doc_topic_matrix' in locals() and doc_topic_matrix is not None:
    # Same filter as the one used to build the topic-model input, so row i of
    # this frame lines up with row i of `doc_topic_matrix`
    section_filings = processed_df[processed_df[f'section_{topic_section}_chars'] > 0].reset_index(drop=True)

    # Make sure we have the same number of filings as documents in the topic matrix
    if len(section_filings) == doc_topic_matrix.shape[0]:
        topic_columns = [f'topic_{i+1}' for i in range(n_topics)]

        # Add topic distributions to the filings DataFrame
        for topic_idx, topic_column in enumerate(topic_columns):
            section_filings[topic_column] = doc_topic_matrix[:, topic_idx]

        # Calculate average topic distribution by company
        topic_dist_by_company = section_filings.groupby('ticker')[topic_columns].mean()

        # A bare expression nested inside `if` is not rendered by Jupyter,
        # so display the table explicitly
        display(topic_dist_by_company)
    else:
        # Report the mismatch instead of silently skipping the analysis
        print(
            f"Cannot align topics with filings: {len(section_filings)} filings "
            f"vs {doc_topic_matrix.shape[0]} rows in the topic matrix."
        )

In [None]:
# Grouped bar chart: average topic proportion per company
if 'topic_dist_by_company' in locals() and not topic_dist_by_company.empty:
    topic_columns_to_plot = [f'topic_{i+1}' for i in range(n_topics)]

    # Long format: one row per (ticker, topic) pair
    topic_dist_wide = topic_dist_by_company.reset_index()
    topic_dist_long = topic_dist_wide.melt(
        id_vars='ticker',
        value_vars=topic_columns_to_plot,
        var_name='topic',
        value_name='proportion'
    )

    fig, ax = plt.subplots(figsize=(14, 8))
    sns.barplot(x='ticker', y='proportion', hue='topic', data=topic_dist_long, ax=ax)
    ax.set_title(f'Topic Distribution by Company in {section_names.get(topic_section, topic_section)}', fontsize=16, fontweight='bold')
    ax.set_xlabel('Company', fontsize=14)
    ax.set_ylabel('Topic Proportion', fontsize=14)
    ax.legend(title='Topic')
    ax.grid(axis='y', linestyle='--', alpha=0.7)
    fig.tight_layout()
    plt.show()

## Comprehensive Text Analysis

Let's perform a comprehensive text analysis on the filings using the `analyze_filings` method of the `TextAnalyzer` class.

In [None]:
# Run the analyzer over every filing for the key sections defined earlier
print("Performing comprehensive text analysis... This may take a few minutes.")
text_metrics_df = analyzer.analyze_filings(processed_df, sections=key_sections)

n_filings_analyzed = len(text_metrics_df)
print(f"Analysis complete. Generated metrics for {n_filings_analyzed} filings.")

In [None]:
# Preview the first five rows of the metrics table
text_metrics_df.head(n=5)

In [None]:
# Compare text metrics across companies
if not text_metrics_df.empty:
    # Metrics for the 'full_text' section, grouped by ticker
    comparative_metrics = analyzer.compare_filings(text_metrics_df, groupby='ticker', section='full_text')

    # A bare expression nested inside `if` is not rendered by Jupyter,
    # so display the result explicitly
    display(comparative_metrics)

In [None]:
# Plot sentiment trends for the full filing text, grouped by filing year
if not text_metrics_df.empty:
    sentiment_plot_options = {'groupby': 'filing_year', 'section': 'full_text'}
    fig = analyzer.plot_sentiment_trends(text_metrics_df, **sentiment_plot_options)
    plt.show()

## Save the Analysis Results

Let's save the results of our text analysis for use in later notebooks.

In [None]:
# Create the results directory if it doesn't exist.
# exist_ok=True replaces the separate exists() check, so there is no gap
# between checking for the directory and creating it.
results_dir = '../data/results'
os.makedirs(results_dir, exist_ok=True)

# Save text metrics
text_metrics_file = f'{results_dir}/text_analysis.pkl'
text_metrics_df.to_pickle(text_metrics_file)
print(f"Saved text analysis results to {text_metrics_file}")

# Save word frequencies (dict: section key -> word-frequency table)
word_freq_file = f'{results_dir}/word_frequencies.pkl'
pd.to_pickle(section_word_frequencies, word_freq_file)
print(f"Saved word frequencies to {word_freq_file}")

# Save topic modeling results if available (only when the topic-modeling
# cell actually produced `topic_words`)
if 'topic_words' in locals() and topic_words:
    topic_file = f'{results_dir}/topic_modeling.pkl'
    topic_results = {
        'section': topic_section,
        'n_topics': n_topics,
        'topic_words': topic_words,
        'doc_topic_matrix': doc_topic_matrix
    }
    pd.to_pickle(topic_results, topic_file)
    print(f"Saved topic modeling results to {topic_file}")

## Next Steps

In the next notebook (`5_financial_analysis.ipynb`), we'll extract and analyze financial metrics from the 10-K filings, including:
1. Revenue, net income, and other key financial metrics
2. Financial ratios and growth rates
3. Comparative financial analysis across companies
4. Financial trends over time