# Data Collection

## Import Libraries

In [None]:
import os
import time
import datetime
from datetime import datetime
import pandas as pd

import matplotlib.pyplot as plt

import praw # Reddit API wrapper
from dotenv import load_dotenv
import requests
from io import StringIO
from typing import List, Dict, Any, Optional, Tuple
import numpy as np
from tqdm import tqdm
import json
import numpy as np
import importlib.util
import sys
from collections import defaultdict

import re
import glob
from bs4 import BeautifulSoup, XMLParsedAsHTMLWarning
import warnings


import google.generativeai as genai

## S&P 500 Company Parser 

From Wikipedia : https://en.wikipedia.org/wiki/List_of_S%26P_500_companies

In [None]:
# Download the current S&P 500 constituents table from Wikipedia and cache it
# locally so the later cells can reload it without hitting the network.
url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
# NOTE(review): Wikimedia asks API/scraper clients to send a descriptive
# User-Agent; consider passing `headers=` here if the request gets rejected.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page.
response.raise_for_status()

# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed and emits a warning.
soup = BeautifulSoup(response.text, "html.parser")
table = soup.find("table", {"class": "wikitable"})
if table is None:
    raise RuntimeError("Could not find the S&P 500 constituents table on the Wikipedia page")

sp500 = pd.read_html(StringIO(str(table)))[0]

# The target folder is not guaranteed to exist on a fresh checkout.
os.makedirs("./data", exist_ok=True)
sp500.to_csv("./data/sp500.csv", index=False)

# Last expression of the cell so the preview is actually displayed.
sp500.head()

## Reddit Data Extractor API


First let's load the api environment to collect all the necessary data. 
The idea is to download posts and comments from different subreddits. 
- Subreddits: 'Economics', 'finance', 'wallstreetbets', 'stocks', 'investing'
- General Search & Stock Specific search

In [None]:
# Load the Reddit API credentials from the local environment file.
load_dotenv('api.env')

# `client_id` rather than `id`, so the builtin `id()` is not shadowed for the
# rest of the notebook session.
client_id = os.getenv('REDDIT_CLIENT_ID')
secret = os.getenv('REDDIT_CLIENT_SECRET')
agent = os.getenv('REDDIT_USER_AGENT', 'StockDataScraper v1.0')

# Report a missing credential here, where the cause is obvious, instead of
# deep inside the API wrapper.
if client_id is None or secret is None:
    raise RuntimeError(
        "REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET must be defined (expected in api.env)"
    )

reddit = praw.Reddit(client_id=client_id,
                     client_secret=secret,
                     user_agent=agent)

### Defining the Function that collects and downloads the data

In [None]:
def _utc_naive(epoch_seconds):
    """Return a naive ``datetime`` holding the UTC wall-clock time of a Unix timestamp."""
    from datetime import datetime as _datetime, timezone

    # `fromtimestamp` without `tz` converts to the machine's local time, which
    # would make the `created_utc` columns depend on where the notebook runs.
    return _datetime.fromtimestamp(epoch_seconds, tz=timezone.utc).replace(tzinfo=None)


def get_reddit_data(reddit, subreddit_name, data_type='posts', search_term=None,
                   time_filter='year', limit=200, comment_limit=30):
    """Collect posts and their comments from one subreddit.

    Args:
        reddit: Reddit client exposing ``subreddit(name)``.
        subreddit_name (str): Subreddit to read from.
        data_type (str): ``'search'`` runs a search for ``search_term``; any
            other value fetches the top posts for ``time_filter``.
        search_term (str | None): Term to search for. In search mode, posts and
            comments that do not contain it (or ``$term``) are skipped.
        time_filter (str): Period passed to the top-posts listing.
        limit (int): Maximum number of posts requested.
        comment_limit (int): Maximum number of comments inspected per post.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: ``(posts, comments)``. Either frame
        is empty when nothing was collected. If the listing fails part-way,
        the rows gathered so far are still returned.
    """
    subreddit = reddit.subreddit(subreddit_name)
    posts_list = []
    comments_list = []
    category = 'stock_specific' if data_type == 'search' else 'general'

    try:
        # Determine which data to fetch
        if data_type == 'search' and search_term:
            print(f"Searching for '{search_term}' in r/{subreddit_name}...")
            posts = subreddit.search(search_term, limit=limit)
            # NOTE: this is a plain substring test, so short tickers also match
            # inside unrelated words.
            search_keywords = [search_term.lower(), f"${search_term.lower()}"]
        else:
            print(f"Getting top posts from r/{subreddit_name} for {time_filter}...")
            posts = subreddit.top(time_filter=time_filter, limit=limit)
            search_keywords = None

        for post in posts:
            # Drop search hits that do not mention the term in title or body
            if search_keywords:
                haystack = (post.title + " " + post.selftext).lower()
                if not any(kw in haystack for kw in search_keywords):
                    continue

            post_data = {
                'post_id': post.id,
                'title': post.title,
                'selftext': post.selftext,
                'score': post.score,
                'upvote_ratio': post.upvote_ratio,
                'created_utc': _utc_naive(post.created_utc),
                'num_comments': post.num_comments,
                'author': str(post.author),
                'permalink': post.permalink,
                'url': post.url,
                'is_self': post.is_self,
                'flair': post.link_flair_text,
                'subreddit': subreddit_name,
                'category': category,
            }
            if search_term:
                post_data['search_term'] = search_term
            posts_list.append(post_data)

            # A failure while reading one post's comments must not lose the post
            try:
                post.comments.replace_more(limit=0)
                for comment in post.comments.list()[:comment_limit]:
                    if search_keywords and not any(kw in comment.body.lower() for kw in search_keywords):
                        continue

                    comment_data = {
                        'comment_id': comment.id,
                        'post_id': post.id,
                        'parent_id': comment.parent_id,
                        'body': comment.body,
                        'score': comment.score,
                        'created_utc': _utc_naive(comment.created_utc),
                        'author': str(comment.author),
                        'subreddit': subreddit_name,
                        'category': category,
                    }
                    if search_term:
                        comment_data['search_term'] = search_term
                    comments_list.append(comment_data)
            except Exception as e:
                print(f"Error processing comments for post {post.id}: {e}")

    except Exception as e:
        # Keep whatever was collected before the failure instead of discarding it.
        print(f"Error fetching data from r/{subreddit_name}: {e}")

    print(f"Found {len(posts_list)} posts and {len(comments_list)} comments")
    return pd.DataFrame(posts_list), pd.DataFrame(comments_list)

### First Download General posts from the Subreddits
- Economics
- Finance
- wallstreetbets
- stocks
- investing

Making sure to get enough posts and number of comments per post. 


In [None]:
# Get S&P 500 tickers
sptickers = sp500['Symbol'].tolist()
# List of finance subreddits
subreddits = ['Economics','finance','wallstreetbets', 'stocks', 'investing']

# Collect the per-subreddit frames and concatenate once at the end; empty
# frames are skipped so they cannot influence the resulting dtypes.
post_frames = []
comment_frames = []

# Loop through all the subreddits defined above and collect the posts and comments data
for subreddit in subreddits:
    posts_df, comments_df = get_reddit_data(reddit, subreddit, data_type='posts',
                                            limit=200, comment_limit=30)
    if not posts_df.empty:
        post_frames.append(posts_df)
    if not comments_df.empty:
        comment_frames.append(comments_df)
    time.sleep(1)

all_posts = pd.concat(post_frames, ignore_index=True) if post_frames else pd.DataFrame()
all_comments = pd.concat(comment_frames, ignore_index=True) if comment_frames else pd.DataFrame()

# Save all posts and comments; create the folder first so a long download is
# not lost to a missing directory.
os.makedirs('./data/reddit', exist_ok=True)
all_posts.to_csv('./data/reddit/general_posts.csv', index=False)
all_comments.to_csv('./data/reddit/general_comments.csv', index=False)

### Use the same function this time for specific stock companies

The loop below searches every S&P 500 ticker inside each of the 5 subreddits (roughly 500 searches per subreddit) and collects the matching posts together with their comments, which gives us a rich stock-specific dataset.

In [None]:
# Get S&P 500 tickers
sptickers = sp500['Symbol'].tolist()

subreddits = ['stocks','wallstreetbets','investing','Economics','finance']

# One search per (subreddit, ticker) pair. Frames are gathered in lists and
# concatenated once: growing a DataFrame with pd.concat on every iteration
# re-copies all earlier rows each time.
post_frames = []
comment_frames = []
for subreddit in subreddits:
    for ticker in sptickers:
        posts_df, comments_df = get_reddit_data(
            reddit, subreddit, data_type='search', search_term=ticker,
            limit=80, comment_limit=20
        )
        if not posts_df.empty:
            post_frames.append(posts_df)
        if not comments_df.empty:
            comment_frames.append(comments_df)
        time.sleep(0.5)

all_posts = pd.concat(post_frames, ignore_index=True) if post_frames else pd.DataFrame()
all_comments = pd.concat(comment_frames, ignore_index=True) if comment_frames else pd.DataFrame()

# Save the stock-specific posts and comments to CSV files. `index=False` keeps
# these files consistent with the general ones (no stray "Unnamed: 0" column
# when they are read back).
os.makedirs('./data/reddit', exist_ok=True)
all_posts.to_csv('./data/reddit/posts_stock_specific.csv', index=False)
all_comments.to_csv('./data/reddit/comments_stock_specific.csv', index=False)

## Financial Data Extractor

### First set up a file directory 

In [None]:
# Folder that receives one raw-financials CSV per company
data_dir = "./data/financial_data"
if not os.path.isdir(data_dir):
    os.makedirs(data_dir, exist_ok=True)

print("Setup complete!")

### Setting up SEC's EDGAR API to download


You can find more information @ https://www.sec.gov/search-filings/edgar-application-programming-interfaces

In [None]:

# Set my identity for the SEC (First name last name and email)
name = "Maseeh"
surname = "Faizan"
email = "maseehfaizan@unil.ch"
headers = {'User-Agent': f'{name} {surname} {email}'}

# Get ticker data; stop here on an HTTP error rather than failing later on an
# unexpected payload.
ticker_response = requests.get('https://www.sec.gov/files/company_tickers.json',
                               headers=headers, timeout=30)
ticker_response.raise_for_status()
ticker = ticker_response.json()
ticker_df = pd.DataFrame.from_dict(ticker, orient='index')
ticker_df.rename(columns={'cik_str':'cik','title':'name'}, inplace=True)

# Fill in the cik code and add the leading zeros (Usually we need 10 digits with leading zeros)
ticker_df['cik'] = ticker_df['cik'].astype(str).str.zfill(10)

print(f"Loaded ticker data for {len(ticker_df)} companies")
print("Sample ticker data:")
print(ticker_df.head())


# API endpoints ({} is filled with the 10-digit, zero-padded CIK)
COMPANY_FACTS_URL = "https://data.sec.gov/api/xbrl/companyfacts/CIK{}.json"

print("API configuration set!")

# Variables to track progress
processed_companies = []
failed_companies = []
all_raw_data = []

# Processing parameters
delay_between_requests = 0.5  # seconds
max_retries = 3

print("Ready to start downloading!")

### Mapping function to map Wikipedia data to the SEC's JSON data

- I have fetched company ticker data from the SEC's EDGAR API, but I also have the ticker data from Wikipedia
- The function below makes sure the ticker values are all UPPERCASE and looks up the CIK value (with leading zeros) for the specific ticker

In [None]:
# Create a helper function to get CIK from ticker using your ticker_df
def get_cik_from_ticker(symbol):
    """Get the zero-padded CIK for a ticker symbol from ``ticker_df``.

    The symbol is upper-cased before the lookup. If that fails and the symbol
    contains a dot (share-class tickers such as ``BRK.B``), the dash-separated
    spelling (``BRK-B``) is tried as well.

    Args:
        symbol (str): Ticker symbol, in any letter case.

    Returns:
        The ``cik`` value of the first matching row, or ``None`` when no row
        matches.
    """
    wanted = str(symbol).strip().upper()
    candidates = [wanted]
    dashed = wanted.replace('.', '-')
    if dashed != wanted:
        candidates.append(dashed)

    for candidate in candidates:
        matches = ticker_df.loc[ticker_df['ticker'] == candidate, 'cik']
        if not matches.empty:
            return matches.iloc[0]
    return None

### Main download function

This code downloads raw financial data for S&P 500 companies from the SEC EDGAR database. It iterates through each company, retrieves its CIK, and then fetches and processes US-GAAP financial facts. The extracted data for each company is saved as a separate CSV file and appended to a master list for comprehensive analysis.

- This code will output **RAW** Financial data that still needs to be processed. It gives all of the accounts that are available

In [None]:
company_symbols = sp500['Symbol'].tolist()

# Without a timeout a single stalled connection would block the whole run.
request_timeout = 30  # seconds

for i, symbol in enumerate(tqdm(company_symbols, desc="Downloading financial data")):
    print(f"\nProcessing {symbol} ({i+1}/{len(company_symbols)})")

    # Get CIK for the symbol using your ticker_df
    cik = get_cik_from_ticker(symbol)

    if not cik:
        print(f"Could not find CIK for {symbol}")
        failed_companies.append(symbol)
        continue

    print(f"Found CIK: {cik} for {symbol}")

    # Download company facts with retries and exponential back-off
    facts_data = None
    url = COMPANY_FACTS_URL.format(cik)
    for attempt in range(max_retries):
        try:
            response = requests.get(url, headers=headers, timeout=request_timeout)

            if response.status_code == 429:
                print("Rate limit hit, waiting 10 seconds...")
                time.sleep(10)
                response = requests.get(url, headers=headers, timeout=request_timeout)

            if response.status_code == 200:
                facts_data = response.json()
                break

            print(f"Attempt {attempt+1}: Status code {response.status_code}")
            if response.status_code == 404:
                # The resource does not exist; asking again cannot help.
                break

        except Exception as e:
            print(f"Attempt {attempt+1} failed: {e}")

        if attempt < max_retries - 1:
            time.sleep(delay_between_requests * (2 ** attempt))

    if not facts_data:
        print(f"Failed to get data for {symbol} after {max_retries} attempts")
        failed_companies.append(symbol)
        continue

    # Extract raw financial data: one record per (concept, unit, reported value)
    if 'facts' in facts_data and 'us-gaap' in facts_data['facts']:
        company_name = facts_data.get('entityName', 'Unknown')
        print(f"Processing data for {company_name}")

        us_gaap_facts = facts_data['facts']['us-gaap']

        company_records = []
        for concept, concept_data in us_gaap_facts.items():
            for unit_type, values in concept_data.get('units', {}).items():
                for value_data in values:
                    company_records.append({
                        'symbol': symbol,
                        'cik': cik,
                        'company_name': company_name,
                        'concept': concept,
                        'unit': unit_type,
                        'start': value_data.get('start'),
                        'end': value_data.get('end'),
                        'filed': value_data.get('filed'),
                        'form': value_data.get('form'),
                        'frame': value_data.get('frame'),
                        'value': value_data.get('val'),
                    })

        if company_records:
            # Convert to DataFrame and save individual company file
            company_df = pd.DataFrame(company_records)

            # Convert dates; unparseable values become NaT instead of raising
            for date_col in ['start', 'end', 'filed']:
                company_df[date_col] = pd.to_datetime(company_df[date_col], errors='coerce')

            # Save individual company raw data
            company_file = os.path.join(data_dir, f"{symbol}_raw_financials.csv")
            company_df.to_csv(company_file, index=False)

            # Add to master list
            all_raw_data.extend(company_records)
            processed_companies.append(symbol)

            print(f"Saved {len(company_records)} records for {symbol}")
        else:
            print(f"No financial data found for {symbol}")
            failed_companies.append(symbol)
    else:
        print(f"No US-GAAP facts found for {symbol}")
        failed_companies.append(symbol)

    # Wait between requests
    time.sleep(delay_between_requests)

print(f"\nDownload complete!")
print(f"Successfully processed: {len(processed_companies)} companies")
print(f"Failed: {len(failed_companies)} companies")

## 10-K file downloader

The following code downloads 10-K annual reports for S&P 500 companies directly from the SEC EDGAR database. It iterates through each company, fetches its recent submissions, filters for 10-K filings, and then downloads each report as an HTML file, saving them to a local directory. It includes error handling and a small delay to be respectful of the SEC's servers.

In [None]:
# Headers for SEC API requests
headers = {
    'User-Agent': 'Maseeh Faizan maseeh.faizan@unil.ch'
}

# Base URL for SEC data
base_url = "https://data.sec.gov"

# Directory for storing filings. The same variable is used for creating the
# folder and for building the file paths, so the two cannot drift apart.
filings_dir = './data/sec_filings'
os.makedirs(filings_dir, exist_ok=True)

# Loop through all companies in the dataframe
for _, row in sp500.iterrows():
    company = row['Symbol']
    cik = row['CIK']

    print(f"Processing {company} (CIK: {cik})...")

    try:
        # Plain int so the zero-padded format spec below always applies
        cik = int(cik)

        # Get company submissions
        submissions_url = f"{base_url}/submissions/CIK{cik:010d}.json"
        response = requests.get(submissions_url, headers=headers, timeout=30)
        response.raise_for_status()
        submissions = response.json()

        # Convert to DataFrame
        submissions_df = pd.DataFrame(submissions['filings']['recent'])

        # Filter for 10-K filings; copy so the new column below is added to an
        # independent frame, not to a view of the unfiltered one.
        submissions_df = submissions_df[submissions_df['form'] == '10-K'].copy()

        # Skip if no 10-K filings found
        if submissions_df.empty:
            print(f"No 10-K filings found for {company}")
            continue

        # Create document URLs
        submissions_df['doc_url'] = [
            f"https://www.sec.gov/Archives/edgar/data/{cik}/{accession.replace('-', '')}/{document}"
            for accession, document in zip(submissions_df['accessionNumber'],
                                           submissions_df['primaryDocument'])
        ]

        # Download each filing
        for idx, filing_row in submissions_df.iterrows():
            url = filing_row['doc_url']
            form = filing_row['form']
            date = filing_row['reportDate']

            # Create filename with company symbol
            filename = f"{company}_{form}_{date}.html"
            file_path = os.path.join(filings_dir, filename)

            try:
                response = requests.get(url, headers=headers, timeout=60)
                # Do not store an HTTP error page as if it were the filing
                response.raise_for_status()
                with open(file_path, 'w', encoding='utf-8') as f:
                    f.write(response.text)
                print(f"Downloaded: {filename}")
            except Exception as e:
                print(f"Error downloading {filename}: {e}")

            # Pace every request, not just every company
            time.sleep(0.1)

        print(f"Completed {company} ({len(submissions_df)} filings)")

        # Add a small delay to be respectful of SEC's servers
        time.sleep(0.1)

    except Exception as e:
        print(f"Error processing {company}: {e}")

print("All companies processed!")

# Data Cleaning & Formatting

## Reddit data

In [None]:
# General and stock-specific posts; rows with an empty `category` cell are discarded
posts = pd.read_csv('./data/reddit/general_posts.csv')
posts = posts.dropna(subset='category')
stock_posts = pd.read_csv('./data/reddit/posts_stock_specific.csv')
stock_posts = stock_posts.dropna(subset='category')

# General and stock-specific comments, loaded as they are
comments = pd.read_csv('./data/reddit/general_comments.csv')
stock_comments = pd.read_csv('./data/reddit/comments_stock_specific.csv')

- First we will make sure the scores are considered as numbers and not strings
- Group by each post and grab top 15 comments for each post based on the score 
- I want to keep all the comments together in the same cell so we will list them together
- We will now merge the listed comments with the appropriate post


In [None]:
# Scores must be numeric so that sorting ranks them by value, not as text
comments['score'] = pd.to_numeric(comments['score'])
stock_comments['score'] = pd.to_numeric(stock_comments['score'])


# Keep the 15 best-scored comments of every post. Sorting first and then taking
# `groupby(...).head(15)` keeps `post_id` as an ordinary column; the previous
# `groupby(...).apply(...)` form depended on the grouping column being handed
# to the applied function, which pandas has deprecated.
stock_comments = (
    stock_comments
    .sort_values(by='score', ascending=False, kind='stable')
    .groupby('post_id')
    .head(15)
    .reset_index(drop=True)
)
comments = (
    comments
    .sort_values(by='score', ascending=False, kind='stable')
    .groupby('post_id')
    .head(15)
    .reset_index(drop=True)
)

# Group comments by 'post_id' and aggregate the 'body' into lists
# (within a post the list is ordered from highest to lowest score)
stock_comments = stock_comments.groupby('post_id')['body'].apply(list).reset_index()
comments = comments.groupby('post_id')['body'].apply(list).reset_index()

# Merge posts with comments; posts without comments are kept (left join)
stocks = pd.merge(stock_posts, stock_comments, on='post_id', how='left')
general = pd.merge(posts, comments, on='post_id', how='left')

### For the merged dataframe I will 
- Make sure the post dates are treated as proper pandas dates
- Rename the columns so that their names describe their content
- Finally, concatenate the two dataframes into a single one

In [None]:
# Parse `created_utc` and reduce it to the calendar date in both frames
for frame in (stocks, general):
    frame['created_utc'] = pd.to_datetime(frame['created_utc']).dt.date

# Shared, more descriptive column names
names = dict(selftext='post', body='comments')
stocks, general = (frame.rename(columns=names) for frame in (stocks, general))

# Stack stock-specific rows on top of the general ones and persist the result
main_df = pd.concat([stocks, general], ignore_index=True)
main_df.to_csv('./data/cleaned_stock.csv')

## Financial Data Cleaner/Processor

- Import the Financial csv data
- sort values by date
- Pivot the table so that each concept becomes an independent column across the different dates
- Because the pivot generates many duplicate rows, I group the rows by end date and keep the first non-NaN value of every column, which gives a clean dataframe
- To further clean the data, I remove the columns that are at least 95% empty, since I cannot do any meaningful time series analysis on those columns

In [None]:

print("Starting data processing...")
sp = pd.read_csv('./data/sp500.csv')

# Columns that describe a fact rather than hold a reported concept value
metadata_cols = ['cik', 'company_name', 'form', 'frame', 'end', 'unit','filed']

# Create the output folder once instead of on every iteration
os.makedirs('./data/clean', exist_ok=True)

for i in sp['Symbol']:
    file_path = f'./data/financial_data/{i}_raw_financials.csv'
    try:
        print(f'Processing symbol: {i}')

        df = pd.read_csv(file_path)
        print(f'Successfully read data for {i}')

        # One column per concept. NOTE: pivot_table aggregates with the mean by
        # default, so duplicate facts sharing all index keys are averaged.
        pivoted_df = df.pivot_table(
            index=['cik', 'company_name', 'frame', 'unit', 'end', 'filed', 'form'],
            columns='concept',
            values='value'
        ).reset_index()
        print(f'Pivoted data for {i}')

        concept_cols = [col for col in pivoted_df.columns if col not in metadata_cols]

        # Collapse all rows sharing an end date into one row: metadata comes
        # from the first row of the group, every concept takes its first
        # non-NaN value. Rows are collected in a list and turned into a frame
        # once; concatenating row by row re-copies the frame on every step.
        consolidated_rows = []
        for end_date, group in pivoted_df.groupby('end'):
            row_data = {'end': end_date}
            for col in metadata_cols:
                if col in group.columns:
                    row_data[col] = group[col].iloc[0]

            for col in concept_cols:
                non_nan_values = group[col].dropna()
                row_data[col] = non_nan_values.iloc[0] if not non_nan_values.empty else None

            consolidated_rows.append(row_data)

        consolidated_df = pd.DataFrame(consolidated_rows)

        ##### I only want the yearly data not the quarterly data for now ####
        consolidated_df = consolidated_df[consolidated_df['form'] == '10-K']
        consolidated_df = consolidated_df[consolidated_df['frame'].astype(str).str.match((r'^CY\d{4}$'))]
        print(f'Filtered by frame format for {i}')


        print(f'Filtering columns with >95% empty cells for {i}')
        if not consolidated_df.empty:
            nan_percentages = consolidated_df.isna().mean()
            consolidated_df = consolidated_df.loc[:, nan_percentages < 0.95]
            print(f'Removed sparse columns for {i}')
        else:
            print(f'Consolidated dataframe is empty for {i}, skipping NaN column removal.')

        output_file_path = f'./data/clean/{i}.csv'
        consolidated_df.to_csv(output_file_path, index=False)
        print(f'Successfully processed and saved data for {i} to {output_file_path}')

    except FileNotFoundError:
        print(f"Error: File not found for symbol {i} at path: {file_path}. Skipping this symbol.")
    except pd.errors.EmptyDataError:
        print(f"Error: File for symbol {i} is empty: {file_path}. Skipping this symbol.")
    except KeyError as e:
        print(f"Error: A required column is missing for symbol {i} (KeyError: {e}). Skipping this symbol.")
    except Exception as e:
        print(f"An unexpected error occurred while processing symbol {i}: {e}")
        print(f"Skipping symbol {i} and continuing with the next one.")

print("Finished processing all symbols.")


### Here I will further clean the data to construct the Balance Sheet, Income Statement and Cash Flow Statement for the Sector / Industry and save them into the csv files


- Check each year (Frame) individually for each financial account
- If the primary account has a NaN/empty value in a specific year
- Look at the alternative account names for that same year
- Use the alternative value when available

### The important thing to note here is that I needed mapping code to make the XBRL accounting tags used by companies in different sectors and sub-industries work together. To do so, I have created a separate Python file for each sector containing a mapping dictionary in which each account has a main account, alternative accounts and children accounts

This will output in Financial_statement folder all the Balance Sheet, Income Statement and Cashflow statement

In [None]:
# Global list to store mapping debug information
mapping_debug_log = []


def _log_mapping(message):
    """Print a mapping diagnostic and keep it in ``mapping_debug_log``."""
    print(message)
    mapping_debug_log.append(message)


def _sub_industry_variants(sub_industry):
    """Return the non-empty lower-case spellings of a sub-industry used for fuzzy matching."""
    compact = sub_industry.lower().replace(' ', '')
    variants = [
        compact,                                                  # spaces removed
        compact.replace('&', ''),                                 # '&' dropped
        compact.replace('&', 'and'),                              # '&' spelled out
        ''.join(c.lower() for c in sub_industry if c.isalnum()),  # alphanumerics only
    ]
    # An empty string is a substring of everything and would match any key.
    return [variant for variant in variants if variant]


def _key_variants(key):
    """Return the non-empty lower-case spellings of a mapping key used for fuzzy matching."""
    compact = key.lower().replace(' ', '')
    variants = [
        compact,                                         # spaces removed
        compact.replace('and', ''),                      # 'and' dropped
        ''.join(c.lower() for c in key if c.isalnum()),  # alphanumerics only
    ]
    return [variant for variant in variants if variant]


# Function to load industry mappings from sector-specific Python files
def load_industry_mapping(sector, sub_industry, mappings_dir='.'):
    """
    Load the appropriate industry mapping based on sector and sub-industry.

    The sector name is turned into a file name (lower-cased; spaces, '&' and
    '-' removed; '.py' appended). That file is executed and its
    ``industry_mappings`` dict is searched for ``sub_industry`` in this order:

    1. the exact key;
    2. PascalCase spellings of the sub-industry;
    3. keys whose normalised spelling equals a normalised spelling of the
       sub-industry (also after removing 'accounts' from the key);
    4. keys related to the sub-industry by substring;
    5. the first key of the dict as the sector default.

    Exact matches (3) are looked for across all keys before any substring
    match (4), so a loosely matching key that happens to come first cannot
    shadow a precise one.

    Args:
        sector (str): GICS Sector
        sub_industry (str): GICS Sub-Industry
        mappings_dir (str): Directory containing the mapping Python files

    Returns:
        dict: The mapping dictionary for the specified sector and sub-industry,
        or None when the file is missing, defines no (or an empty)
        ``industry_mappings``, or any error occurs. Every decision is printed
        and appended to ``mapping_debug_log``.
    """
    try:
        # Convert sector name to filename format
        sector_file = sector.lower().replace(' ', '').replace('&', '').replace('-', '') + '.py'

        # Check if file exists (using the specified directory)
        sector_file_path = os.path.join(mappings_dir, sector_file)
        if not os.path.exists(sector_file_path):
            _log_mapping(f"Warning: Mapping file {sector_file_path} not found for {sector} - {sub_industry}.")
            return None

        # Load the module dynamically
        spec = importlib.util.spec_from_file_location(sector.lower(), sector_file_path)
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)

        if not hasattr(module, 'industry_mappings'):
            _log_mapping(f"No industry_mappings found in {sector_file} for {sector} - {sub_industry}.")
            return None

        industry_mappings = module.industry_mappings

        # Log available keys for debugging
        available_keys = list(industry_mappings.keys())
        _log_mapping(f"Processing {sector} - {sub_industry}... dict_keys({available_keys})")

        if not industry_mappings:
            # Nothing to match against and no default to fall back to
            _log_mapping(f"industry_mappings in {sector_file} is empty for {sector} - {sub_industry}.")
            return None

        # Strategy 1: Direct key lookup without normalization
        if sub_industry in industry_mappings:
            _log_mapping(f"Found direct mapping for {sub_industry}")
            return industry_mappings[sub_industry]

        # Strategies 2-4: PascalCase spellings
        normalized_versions = [
            # '&' → 'And'
            ''.join(word.capitalize() for word in sub_industry.replace('&', ' And ').split()),
            # '&' removed
            ''.join(word.capitalize() for word in sub_industry.replace('&', ' ').split()),
            # all punctuation removed
            ''.join(word.capitalize() for word in ''.join(c if c.isalnum() or c.isspace() else ' ' for c in sub_industry).split()),
        ]
        for normalized in normalized_versions:
            if normalized in industry_mappings:
                _log_mapping(f"Found mapping for {sub_industry} → {normalized}")
                return industry_mappings[normalized]

        # Fuzzy matching on normalised spellings
        sub_variants = _sub_industry_variants(sub_industry)
        candidates = []
        for key in industry_mappings:
            lowered_key = key.lower()
            # Key with the 'accounts' marker removed ('' when the key has none)
            without_accounts = lowered_key.replace('accounts', '') if 'accounts' in lowered_key else ''
            candidates.append((key, _key_variants(key), without_accounts))

        # Pass 1: exact equality of normalised spellings, over all keys
        for key, key_variants, without_accounts in candidates:
            if any(sub_var == key_var for sub_var in sub_variants for key_var in key_variants):
                _log_mapping(f"Using mapping for {key} (variant match with {sub_industry})")
                return industry_mappings[key]
            if without_accounts and without_accounts in sub_variants:
                _log_mapping(f"Using mapping for {key} (matched after removing 'Accounts')")
                return industry_mappings[key]

        # Pass 2: substring relation in either direction
        for key, key_variants, without_accounts in candidates:
            if any(sub_var in key_var or key_var in sub_var
                   for sub_var in sub_variants for key_var in key_variants):
                _log_mapping(f"Using mapping for {key} (variant match with {sub_industry})")
                return industry_mappings[key]
            if without_accounts and any(sub_var in without_accounts or without_accounts in sub_var
                                        for sub_var in sub_variants):
                _log_mapping(f"Using mapping for {key} (matched after removing 'Accounts')")
                return industry_mappings[key]

        # Use first industry mapping as fallback
        default_key = next(iter(industry_mappings.keys()))
        _log_mapping(f"No specific mapping found for {sub_industry}. Using default mapping for {sector}: {default_key}")
        return industry_mappings[default_key]
    except Exception as e:
        _log_mapping(f"Error loading industry mapping for {sector} - {sub_industry}: {str(e)}")
        return None

# Read one company's cleaned financials from disk
def load_company_data(ticker, data_dir='clean'):
    """
    Read the cleaned financial CSV of a single company.

    Args:
        ticker (str): Company ticker symbol; the file read is ``<ticker>.csv``
        data_dir (str): Folder that holds the per-company CSV files

    Returns:
        DataFrame: Contents of the CSV, or None when loading fails for any
        reason (the error is printed)
    """
    try:
        csv_path = os.path.join(data_dir, f"{ticker}.csv")
        frame = pd.read_csv(csv_path)
    except Exception as err:
        print(f"Error loading data for {ticker}: {str(err)}")
        return None
    else:
        return frame

# Helper function to extract account value using mapping info
# Updated helper function to extract account value using mapping info
def extract_account_value(df, account_info, available_columns):
    """
    Extract account value from financial data using mapping information.

    Each row (reporting period) is resolved cell by cell, in this order:
      1. the ``primary`` tag,
      2. the first non-NaN tag among ``alternatives`` (in list order),
      3. the sum of the non-NaN ``children`` tags.
    Tags that are not in ``available_columns`` are ignored. The work is done
    with whole-column pandas operations instead of a Python loop over rows.

    Args:
        df (DataFrame): Company financial data
        account_info (dict): Account mapping information; may contain the
            keys 'primary' (str), 'alternatives' (list) and 'children' (list)
        available_columns (set): Available columns in the DataFrame

    Returns:
        Series: One value per row of ``df`` (same index); NaN where none of
            the sources had a value
    """
    # Start from an all-NaN float series so numeric results stay float.
    result = pd.Series(np.nan, index=df.index)

    # Resolve the usable source columns once; they do not depend on the row.
    candidates = []
    primary = account_info.get('primary')
    if primary is not None and primary in available_columns:
        candidates.append(primary)
    candidates.extend(
        alt for alt in (account_info.get('alternatives') or [])
        if alt in available_columns
    )

    # Fill still-missing cells from each candidate in priority order, so an
    # earlier source always wins over a later one.
    for column in candidates:
        missing = result.isna()
        if not missing.any():
            break
        result = result.mask(missing, df[column])

    # Last resort: sum whatever child tags are present for the row.
    children = [
        child for child in (account_info.get('children') or [])
        if child in available_columns
    ]
    if children:
        missing = result.isna()
        if missing.any():
            # min_count=1 keeps NaN (instead of 0) when every child is NaN.
            child_sum = df[children].sum(axis=1, min_count=1)
            result = result.mask(missing, child_sum)

    return result

# Function to reconstruct balance sheet
def reconstruct_balance_sheet(df, industry_mapping):
    """
    Build a balance sheet from raw company data via the industry mapping.

    Identifying columns are carried over first, then one column per mapped
    account is added as "<section> - <account name>" for the Assets,
    Liabilities and Equity sections. Totals and validation columns are
    appended and columns that ended up entirely NaN are removed.

    Args:
        df (DataFrame): Company financial data
        industry_mapping (dict): Industry-specific mappings

    Returns:
        DataFrame: Reconstructed balance sheet
    """
    sheet = pd.DataFrame(index=df.index)

    # Carry over the identifying columns that the source actually has.
    for meta_col in ('filed', 'company_name', 'end', 'unit', 'form', 'frame', 'cik'):
        if meta_col in df.columns:
            sheet[meta_col] = df[meta_col]

    present = set(df.columns)

    # Sections are driven entirely by what the mapping defines.
    for section in ('Assets', 'Liabilities', 'Equity'):
        if section not in industry_mapping:
            continue
        for account_name, account_info in industry_mapping[section].items():
            sheet[f"{section} - {account_name}"] = extract_account_value(
                df, account_info, present
            )

    # Totals/validation first, then prune columns with no data at all.
    return remove_nan_only_columns(add_missing_balance_sheet_totals(sheet))

# Function to reconstruct income statement
def reconstruct_income_statement(df, industry_mapping):
    """
    Build an income statement from raw company data via the industry mapping.

    The 'IncomeStatement' part of the mapping may hold account definitions
    (dicts with a 'primary' key) at the first, second or third nesting
    level. Column names spell out the path to the account:
    "IncomeStatement - <section>[ - <account>[ - <sub account>]]".

    Args:
        df (DataFrame): Company financial data
        industry_mapping (dict): Industry-specific mappings

    Returns:
        DataFrame: Reconstructed income statement without NaN-only columns
    """
    statement = pd.DataFrame(index=df.index)

    # Carry over the identifying columns that the source actually has.
    for meta_col in ('filed', 'company_name', 'end', 'unit', 'form', 'frame', 'cik'):
        if meta_col in df.columns:
            statement[meta_col] = df[meta_col]

    present = set(df.columns)

    def is_account(node):
        # An account definition is recognised by its 'primary' key.
        return isinstance(node, dict) and 'primary' in node

    def add_column(label, account):
        statement[label] = extract_account_value(df, account, present)

    if 'IncomeStatement' in industry_mapping:
        for section_name, section_data in industry_mapping['IncomeStatement'].items():
            if is_account(section_data):
                # Account sits directly under the statement (unnested).
                add_column(f"IncomeStatement - {section_name}", section_data)
                continue

            for account_name, account_info in section_data.items():
                if is_account(account_info):
                    add_column(
                        f"IncomeStatement - {section_name} - {account_name}",
                        account_info,
                    )
                    continue

                # One more nesting level: every entry is treated as an account.
                for sub_account_name, sub_account_info in account_info.items():
                    add_column(
                        f"IncomeStatement - {section_name} - {account_name} - {sub_account_name}",
                        sub_account_info,
                    )

    return remove_nan_only_columns(statement)

# Function to reconstruct cash flow statement
def reconstruct_cash_flow_statement(df, industry_mapping):
    """
    Build a cash flow statement from raw company data via the industry mapping.

    Every section under 'CashFlowStatement' is a dict of accounts. An entry
    with a 'primary' key is an account; any other entry is treated as a
    group whose members are accounts. Column names are
    "CashFlow - <section> - <account>[ - <sub account>]".

    Args:
        df (DataFrame): Company financial data
        industry_mapping (dict): Industry-specific mappings

    Returns:
        DataFrame: Reconstructed cash flow statement without NaN-only columns
    """
    statement = pd.DataFrame(index=df.index)

    # Carry over the identifying columns that the source actually has.
    for meta_col in ('filed', 'company_name', 'end', 'unit', 'form', 'frame', 'cik'):
        if meta_col in df.columns:
            statement[meta_col] = df[meta_col]

    present = set(df.columns)

    def add_column(label, account):
        statement[label] = extract_account_value(df, account, present)

    if 'CashFlowStatement' in industry_mapping:
        for section_name, section_data in industry_mapping['CashFlowStatement'].items():
            for account_name, account_info in section_data.items():
                if isinstance(account_info, dict) and 'primary' in account_info:
                    # Plain account definition.
                    add_column(f"CashFlow - {section_name} - {account_name}", account_info)
                    continue

                # Group of accounts one level further down.
                for sub_account_name, sub_account_info in account_info.items():
                    add_column(
                        f"CashFlow - {section_name} - {account_name} - {sub_account_name}",
                        sub_account_info,
                    )

    return remove_nan_only_columns(statement)

# Function to add missing balance sheet totals (including your validation logic)
def add_missing_balance_sheet_totals(balance_sheet):
    """
    Adds missing total columns according to accounting relationships.

    Missing cells are filled per row, in this order:
      1. Total Liabilities and Equity: taken from Total Assets (accounting
         equality) or, failing that, from Liabilities + Equity.
      2. Total Liabilities = Total Liabilities and Equity - Total Equity.
      3. Total Stockholders Equity = Total Liabilities and Equity - Total
         Liabilities.
    Step 1 runs first so that a row with Total Assets and one of the two
    components can still have the other component derived. Values that are
    already present are never overwritten.

    Two validation columns are appended:
      - 'Validation - A = L+E Difference': Total Assets minus Total
        Liabilities and Equity.
      - 'Validation - L+E Components Sum Difference': Liabilities + Equity
        minus Total Liabilities and Equity.

    Args:
        balance_sheet: DataFrame with reconstructed balance sheet

    Returns:
        DataFrame with missing totals computed where possible and validation
        columns; the input DataFrame is not modified
    """
    # Make a copy to avoid modifying the original
    result = balance_sheet.copy()

    # Define key total column names
    total_assets_col = 'Assets - Total Assets'
    total_liabilities_col = 'Liabilities - TotalLiabilities'
    total_equity_col = 'Equity - TotalStockholdersEquity'
    total_liab_equity_col = 'Equity - TotalLiabilitiesandEquity'

    # Make sure every total exists. NaN (not pd.NA) keeps the columns numeric.
    for col in (total_assets_col, total_liabilities_col, total_equity_col, total_liab_equity_col):
        if col not in result.columns:
            result[col] = np.nan

    # 1. Total Liabilities and Equity: prefer Total Assets, else L + E
    #    (the sum is NaN unless both components are present).
    liab_equity = result[total_liab_equity_col]
    liab_equity = liab_equity.mask(liab_equity.isna(), result[total_assets_col])
    liab_equity = liab_equity.mask(
        liab_equity.isna(),
        result[total_liabilities_col] + result[total_equity_col],
    )
    result[total_liab_equity_col] = liab_equity

    # 2. Total Liabilities from the grand total and equity.
    liabilities = result[total_liabilities_col]
    result[total_liabilities_col] = liabilities.mask(
        liabilities.isna(), liab_equity - result[total_equity_col]
    )

    # 3. Total Stockholders Equity from the grand total and liabilities.
    #    Rows filled in step 2 already had equity, so the steps cannot chain.
    equity = result[total_equity_col]
    result[total_equity_col] = equity.mask(
        equity.isna(), liab_equity - result[total_liabilities_col]
    )

    # Add validation columns
    result['Validation - A = L+E Difference'] = (
        result[total_assets_col] - result[total_liab_equity_col]
    )
    result['Validation - L+E Components Sum Difference'] = (
        result[total_liabilities_col]
        + result[total_equity_col]
        - result[total_liab_equity_col]
    )

    return result

# Function to remove columns that only contain NaN values
def remove_nan_only_columns(df):
    """
    Drop every column whose values are all missing.

    Args:
        df: DataFrame to clean

    Returns:
        New DataFrame without the all-NaN columns; ``df`` itself is untouched
    """
    all_missing = df.isna().all()
    empty_columns = list(all_missing.index[all_missing])
    return df.drop(columns=empty_columns)

# Function to process SP500 companies with debug logging
def process_sp500_companies(sp500_df, data_dir='clean', mappings_dir='.', output_dir='output'):
    """
    Generate the three financial statements for every company in the table.

    For each row the company CSV and the industry mapping are loaded, the
    balance sheet, income statement and cash flow statement are rebuilt and
    written to ``<output_dir>/<ticker>_<statement>.csv``. Companies that
    cannot be loaded or processed are recorded and skipped. A JSON summary
    is written to ``<output_dir>/mapping_debug_results.json``.

    Args:
        sp500_df (DataFrame): S&P 500 companies data; the columns 'Symbol',
            'GICS Sector' and 'GICS Sub-Industry' are read
        data_dir (str): Directory containing company CSV files
        mappings_dir (str): Directory containing mapping Python files
        output_dir (str): Directory to save output files

    Returns:
        dict: Debug information with the keys 'processed_successfully',
            'failed_processing' and 'mapping_issues'
    """
    # Module-level log that the mapping-loading code appends its messages to;
    # it is reset for every company below.
    global mapping_debug_log
    import json

    os.makedirs(output_dir, exist_ok=True)

    results = {
        'processed_successfully': [],
        'failed_processing': [],
        'mapping_issues': []
    }

    for _, company in sp500_df.iterrows():
        ticker = company['Symbol']
        sector = company['GICS Sector']
        sub_industry = company['GICS Sub-Industry']
        # Shared prefix of every record stored in `results` for this company.
        identity = {'ticker': ticker, 'sector': sector, 'sub_industry': sub_industry}

        mapping_debug_log = []

        print(f"\nProcessing {ticker} ({sector} - {sub_industry})...")

        df = load_company_data(ticker, data_dir=data_dir)
        if df is None:
            print(f"Skipping {ticker} - Could not load data.")
            results['failed_processing'].append({**identity, 'reason': 'Data load failure'})
            continue

        industry_mapping = load_industry_mapping(sector, sub_industry, mappings_dir=mappings_dir)
        if industry_mapping is None:
            print(f"Skipping {ticker} - Could not load industry mapping.")
            results['failed_processing'].append({**identity, 'reason': 'Mapping load failure'})
            continue

        try:
            # Each statement is built and written before the next one starts.
            outputs = (
                (reconstruct_balance_sheet, f"{ticker}_balance_sheet.csv"),
                (reconstruct_income_statement, f"{ticker}_income_statement.csv"),
                (reconstruct_cash_flow_statement, f"{ticker}_cash_flow.csv"),
            )
            for build_statement, csv_name in outputs:
                statement = build_statement(df, industry_mapping)
                statement.to_csv(os.path.join(output_dir, csv_name), index=False)

            print(f"Successfully generated financial statements for {ticker}")
            results['processed_successfully'].append(dict(identity))

            # A fallback to the sector's default mapping counts as a mapping issue.
            used_default = any(
                "No specific mapping found" in log for log in mapping_debug_log
            )
            if used_default:
                results['mapping_issues'].append(
                    {**identity, 'debug_logs': mapping_debug_log.copy()}
                )

        except Exception as e:
            print(f"Error processing {ticker}: {str(e)}")
            results['failed_processing'].append({**identity, 'reason': str(e)})

    # Persist the summary next to the generated statements.
    debug_file_path = os.path.join(output_dir, "mapping_debug_results.json")
    with open(debug_file_path, 'w') as f:
        json.dump(results, f, indent=2)

    print(f"\nDebug information saved to: {debug_file_path}")
    return results

# Function to extract and show mapping issues
def show_mapping_issues(results=None, results_path="output/mapping_debug_results.json"):
    """
    Display mapping issues from processing results.

    Args:
        results: Results dictionary from process_sp500_companies. When None,
            the dictionary is loaded from ``results_path`` instead.
        results_path: JSON file to load when ``results`` is None; defaults to
            the file written by process_sp500_companies with its default
            output directory.

    Returns:
        DataFrame with one row per "No specific mapping found" log message
        (columns: ticker, sector, sub_industry, log_message), or None when
        the results cannot be loaded or contain no such message
    """
    if results is None:
        import json
        try:
            with open(results_path, 'r') as f:
                results = json.load(f)
        except (OSError, ValueError):
            # OSError: file missing/unreadable; ValueError: invalid JSON or
            # undecodable bytes (JSONDecodeError and UnicodeDecodeError are
            # both ValueError subclasses).
            print("No results file found. Run process_sp500_companies first.")
            return None

    # Keep only the log lines that report a fallback to the default mapping.
    issues_data = [
        {
            'ticker': issue['ticker'],
            'sector': issue['sector'],
            'sub_industry': issue['sub_industry'],
            'log_message': log
        }
        for issue in results.get('mapping_issues', [])
        for log in issue['debug_logs']
        if "No specific mapping found" in log
    ]

    if not issues_data:
        print("No mapping issues found.")
        return None

    return pd.DataFrame(issues_data)

# Display formatted balance sheet (optional for visualization)
def display_balance_sheet(balance_sheet, in_billions=True):
    """
    Display balance sheet in a readable format.

    Numeric Assets/Liabilities/Equity/Validation columns are scaled, the
    'end' column is formatted as YYYY-MM-DD when it can be parsed, and the
    columns are ordered: metadata, assets, liabilities, equity, validation.
    Within a section the components come first and the totals last; for
    equity the order is components, other totals, total stockholders'
    equity, total liabilities and equity. The two equity grand totals are
    recognised with or without spaces in their names (e.g. both
    'Equity - TotalStockholdersEquity' and
    'Equity - Total Stockholders Equity'), so no equity column is dropped.
    Columns outside the metadata list and the four sections are not shown.

    Args:
        balance_sheet: DataFrame with balance sheet data
        in_billions: If True, display in billions; otherwise in millions

    Returns:
        DataFrame with formatted balance sheet (the input is not modified)
    """
    # Make a copy to avoid modifying the original
    formatted_bs = balance_sheet.copy()

    # Convert to billions or millions
    divisor = 1_000_000_000 if in_billions else 1_000_000
    section_prefixes = ('Assets', 'Liabilities', 'Equity', 'Validation')
    for col in formatted_bs.columns:
        # Only scale statement columns that actually hold numbers.
        if col.startswith(section_prefixes) and pd.api.types.is_numeric_dtype(formatted_bs[col]):
            formatted_bs[col] = formatted_bs[col] / divisor

    # Format the date column if it exists
    if 'end' in formatted_bs.columns:
        try:
            formatted_bs['end'] = pd.to_datetime(formatted_bs['end']).dt.strftime('%Y-%m-%d')
        except (ValueError, TypeError, OverflowError, AttributeError):
            pass  # Keep original format if conversion fails

    def normalized(name):
        # Compare names without spaces/case so both spellings match.
        return name.replace(' ', '').lower()

    def components_then_totals(cols):
        return (sorted(c for c in cols if 'Total' not in c) +
                sorted(c for c in cols if 'Total' in c))

    # Organize columns by section
    metadata_cols = ['filed', 'company_name', 'end', 'unit', 'form', 'frame', 'cik']
    asset_cols = [col for col in formatted_bs.columns if col.startswith('Assets')]
    liability_cols = [col for col in formatted_bs.columns if col.startswith('Liabilities')]
    equity_cols = [col for col in formatted_bs.columns if col.startswith('Equity')]
    validation_cols = [col for col in formatted_bs.columns if col.startswith('Validation')]

    # Equity has two grand totals that must come last, in a fixed order.
    grand_total_cols = [col for col in equity_cols
                        if 'liabilitiesandequity' in normalized(col)]
    equity_total_cols = [col for col in equity_cols
                         if col not in grand_total_cols
                         and 'totalstockholdersequity' in normalized(col)]
    special_cols = set(grand_total_cols) | set(equity_total_cols)
    other_equity_cols = [col for col in equity_cols if col not in special_cols]

    # Create ordered list of columns
    ordered_cols = (
        [col for col in metadata_cols if col in formatted_bs.columns] +
        components_then_totals(asset_cols) +
        components_then_totals(liability_cols) +
        components_then_totals(other_equity_cols) +
        sorted(equity_total_cols) +
        sorted(grand_total_cols) +
        validation_cols
    )

    return formatted_bs[ordered_cols]

# Main execution
# Load S&P 500 companies data
sp500 = pd.read_csv('./data/sp500.csv')

# Process all companies. Every directory is given relative to the project
# root, like the other paths in this notebook, so the run does not depend on
# one user's home directory. `output_dir` is the same folder the aggregation
# step below reads from.
# NOTE(review): assumes the notebook is run from the project root (the folder
# that contains ./data) — confirm before running elsewhere.
process_sp500_companies(
    sp500_df=sp500,
    data_dir='./data/clean',
    mappings_dir='./data/XBRL_dic',
    output_dir='./data/financial_statement'
)

### Aggregating all the accounts into a master account

In [None]:
# Folder holding the per-statement CSVs; the merged files are written here too
folder_path = "./data/financial_statement"

# ticker -> list of statement files that belong to it
ticker_files = defaultdict(list)

# Every CSV currently in the folder
csv_files = [name for name in os.listdir(folder_path) if name.endswith('.csv')]

# The ticker is the part of the file name before the first underscore.
# Previously written *_master.csv files are left out so they are not merged again.
for file in csv_files:
    if '_' not in file or file.endswith('_master.csv'):
        continue
    ticker, _, _ = file.partition('_')
    ticker_files[ticker].append(file)

# Build one master file per ticker
for ticker, files in ticker_files.items():
    print(f"Processing ticker: {ticker}")
    print(f"Files: {files}")

    # Outer-join the statements of this ticker on the 'frame' column
    merged_df = None
    for file in files:
        file_path = os.path.join(folder_path, file)
        df = pd.read_csv(file_path)
        merged_df = df if merged_df is None else merged_df.merge(df, on='frame', how='outer')

    # Drop the metadata columns, including the suffixed copies the merge creates
    metadata_cols = [
        col for col in merged_df.columns
        if col.startswith(('company_name', 'end', 'unit', 'form', 'cik'))
    ]
    merged_df = merged_df.drop(columns=metadata_cols)

    # Write <ticker>_master.csv next to the source files
    output_file = os.path.join(folder_path, f"{ticker}_master.csv")
    merged_df.to_csv(output_file, index=False)
    print(f"Saved: {ticker}_master.csv")

## Financial text (10-K) Parsing and Pre-Processing

### Since I don't have the capacity to process the whole dataframe through an LLM, I will only analyse 5 sectors totalling 220 companies

- it : Information Technology
- cs : Communication Services
- cd : Consumer Discretionary
- consum : Consumer Staples
- hc : Health Care

In [None]:
# One sub-table per analysed sector, then all five stacked in this fixed order.
# NOTE(review): `sp` is not defined in the cells shown here (the S&P 500 table
# is loaded as `sp500` above) — confirm it is defined earlier in the notebook.
it, cs, cd, consum, hc = (
    sp[sp['GICS Sector'] == sector_name]
    for sector_name in (
        'Information Technology',
        'Communication Services',
        'Consumer Discretionary',
        'Consumer Staples',
        'Health Care',
    )
)
df = pd.concat([it, cs, cd, consum, hc], ignore_index=True)

### SEC 10-K Parser

This script scans a directory for local 10-K HTML files based on a list of company tickers. For each file, it:
1.  Parses the HTML using `BeautifulSoup` to extract clean text.
2.  Uses regular expressions (`re`) to locate and extract the content of specific sections (e.g., `Item 1. Business`, `Item 1A. Risk Factors`).
3.  Appends the extracted `ticker`, `filing_date`, `section`, and `content` into a list.
4.  Converts the aggregated data into a final pandas DataFrame named `long_df`.

I have tried multiple ways of parsing this text data, but creating the mapping like I did below works best.
The code looks for starting and ending patterns and grabs the text between them (like a sandwich). The candidate with the most content wins (i.e. it gets captured and the rest is discarded). This way I don't capture text from the table of contents or from passages that merely reference a section. I have noticed that some companies reference a section or an Item somewhere else in the filing, so this is the most robust way to capture the information.

In [None]:

# Extract selected 10-K sections from local HTML filings into `all_data`
# (one dict per file and section) and build the long-format `long_df`.
warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)

# Configuration
companies = list(df['Symbol'].values) # List of company tickers to process
base_dir = './data/sec_filings/'

# List to store data for all companies and all sections
all_data = []

# Section patterns and display names.
# Each section has heading patterns that open it ("start_patterns"), heading
# patterns that close it ("end_patterns", tried in list order) and the label
# stored in the output ("display_name").
# All patterns are applied with re.IGNORECASE below, so the upper-case and
# title-case variants of a pattern match the same text.
section_patterns = {
    "Business": {
        "start_patterns": [r'ITEM\s+1\.\s*', r'Item\s+1\.\s*'],
        "end_patterns": [r'ITEM\s+1A\.\s*', r'Item\s+1A\.\s*', r'ITEM\s+1\.A\.\s*', r'Item\s+1\.A\.\s*', r'ITEM\s+2\.\s*', r'Item\s+2\.\s*'],
        "display_name": "Item 1. Business"
    },
    "Risk Factors": {
        "start_patterns": [r'ITEM\s+1A\.\s*', r'Item\s+1A\.\s*'],
        "end_patterns": [r'ITEM\s+1B\.\s*', r'Item\s+1B\.\s*', r'ITEM\s+1C\.\s*', r'Item\s+1C\.\s*', r'ITEM\s+2\.\s*', r'Item\s+2\.\s*'],
        "display_name": "Item 1A. Risk Factors"
    },
    "Cybersecurity": {
        "start_patterns": [r'ITEM\s+1C\.\s*', r'Item\s+1C\.\s*'],
        "end_patterns": [r'ITEM\s+2\.\s*', r'Item\s+2\.\s*'],
        "display_name": "Item 1C. Cybersecurity"
    },
    "Properties": {
        "start_patterns": [r'ITEM\s+2\.\s*', r'Item\s+2\.\s*'],
        "end_patterns": [r'ITEM\s+3\.\s*', r'Item\s+3\.\s*'],
        "display_name": "Item 2. Properties"
    },
    "Legal Proceedings": {
        "start_patterns": [r'ITEM\s+3\.\s*', r'Item\s+3\.\s*'],
        "end_patterns": [r'ITEM\s+4\.\s*', r'Item\s+4\.\s*'],
        "display_name": "Item 3. Legal Proceedings"
    },
    "Management Discussion and Analysis": {
        "start_patterns": [r'ITEM\s+7\.\s*', r'Item\s+7\.\s*'],
        "end_patterns": [r'ITEM\s+7A\.\s*', r'Item\s+7A\.\s*', r'ITEM\s+7\.A\.\s*', r'Item\s+7\.A\.\s*'],
        "display_name": "Item 7. Management Discussion and Analysis"
    },
    "Quantitative and Qualitative Disclosures": {
        "start_patterns": [r'ITEM\s+7A\.\s*', r'Item\s+7A\.\s*'],
        "end_patterns": [r'ITEM\s+8\.\s*', r'Item\s+8\.\s*'],
        "display_name": "Item 7A. Quantitative and Qualitative Disclosures about Market Risk"
    },
}

# Find all 10-K files for specified companies.
# Files are expected to be named "<ticker>_10-K_<something>.html" inside base_dir.
file_paths = []
for ticker in companies:
    pattern = f"{base_dir}{ticker}_10-K_*.html"
    # Note: rebinds the name `ticker_files` to a plain list of paths.
    ticker_files = glob.glob(pattern)
    file_paths.extend(ticker_files)

if not file_paths:
    print("No matching files found. Please check the directory and file naming pattern.")
else:
    print(f"Found {len(file_paths)} files to process.")

    # Process each file
    for html_file_path in file_paths:
        if not os.path.exists(html_file_path):
            print(f"Error: File not found at '{html_file_path}'")
            continue

        try:
            # Extract ticker from filename (text before the first underscore)
            base_filename = os.path.basename(html_file_path)
            ticker = base_filename.split('_')[0]

            # Extract date from filename: third underscore-separated part,
            # cut at its first '.'; None when the name has fewer than three parts
            parts = base_filename.split('_')
            filing_date = parts[2].split('.')[0] if len(parts) >= 3 else None

            # Read the HTML file
            with open(html_file_path, 'r', encoding='utf-8') as f:
                html_content = f.read()

            # Parse the HTML
            soup = BeautifulSoup(html_content, 'lxml')

            # Extract Text
            text_content = soup.get_text(separator=" ", strip=True)

            # Clean the text: collapse runs of spaces/tabs and drop empty lines
            text_lines = text_content.splitlines()
            cleaned_lines = []
            for line in text_lines:
                processed_line = re.sub(r'[ \t]+', ' ', line).strip()
                if processed_line:
                    cleaned_lines.append(processed_line)
            final_text = "\n".join(cleaned_lines)

            # Sections to extract (keys of section_patterns)
            section_names_to_extract = [
                "Business",
                "Risk Factors",
                "Cybersecurity",
                "Properties",
                "Legal Proceedings",
                "Management Discussion and Analysis",
                "Quantitative and Qualitative Disclosures"
            ]

            # Extract all sections and store results
            for section_name_key in section_names_to_extract:
                section_config = section_patterns[section_name_key]
                start_patterns = section_config["start_patterns"]
                end_patterns = section_config["end_patterns"]
                display_name = section_config["display_name"]

                # Every occurrence of a start heading yields at most one
                # candidate; the longest candidate is kept further down.
                valid_sections = []
                for start_pattern in start_patterns:
                    for start_match in re.finditer(start_pattern, final_text, re.IGNORECASE):
                        start_pos = start_match.start()
                        search_start = start_pos + len(start_match.group())

                        # The first end pattern (in list order) that occurs
                        # anywhere after the start closes the candidate; this
                        # is not necessarily the nearest end heading.
                        for end_pattern in end_patterns:
                            end_match = re.search(end_pattern, final_text[search_start:], re.IGNORECASE)
                            if end_match:
                                end_pos = search_start + end_match.start()
                                section_content = final_text[start_pos:end_pos].strip()
                                # NOTE(review): this removes every occurrence of the
                                # start pattern inside the candidate, not only the
                                # leading heading — confirm that is intended.
                                section_content = re.sub(start_pattern, '', section_content, flags=re.IGNORECASE).strip()

                                # Candidates of 200 characters or fewer are discarded
                                # (presumably table-of-contents entries and short
                                # cross-references — see the notes above this cell).
                                min_content_length = 200
                                if len(section_content) > min_content_length:
                                    valid_sections.append({
                                        'content': section_content,
                                        'length': len(section_content),
                                        'display_name': display_name
                                    })
                                break # Found an end pattern for this start, move to next start_match if any

                if valid_sections:
                    # Keep the candidate with the most text
                    main_section = max(valid_sections, key=lambda x: x['length'])
                    section_content_extracted = main_section['content']
                    section_display_name = main_section['display_name']
                    print(f"Successfully extracted {section_display_name} section for {ticker} (filing date: {filing_date})")
                else:
                    # A row is still recorded, with empty content
                    section_content_extracted = ""
                    section_display_name = display_name
                    print(f"No {section_name_key} section found for {ticker}. Check the patterns or document structure.")

                all_data.append({
                    'ticker': ticker,
                    'filing_date': filing_date,
                    'section': section_display_name,
                    'content': section_content_extracted
                })

        except Exception as e:
            # Any failure skips the rest of this file; sections already
            # appended for it stay in all_data.
            print(f"Error processing {html_file_path}: {e}")

# Create DataFrame with all sections.
# `long_df` is only assigned when at least one row was collected.
if all_data:
    long_df = pd.DataFrame(all_data)
    print("Processing complete!")
else:
    print("No data was extracted. Please check the file paths and contents.")

### The text data is now merged with the S&P 500 DataFrame. This makes sure that I have a well structured dataframe that is well aligned

In [None]:
# Parse the filing dates in place so they can be compared
long_df['filing_date'] = pd.to_datetime(long_df['filing_date'])

# Most recent filing date per ticker, then only the rows of that filing
latest_dates = long_df.groupby('ticker')['filing_date'].max().reset_index()
latest_filings = pd.merge(long_df, latest_dates, on=['ticker', 'filing_date'])

# Attach the S&P 500 company information (ticker <-> Symbol) and
# drop rows where content is NaN
merged_df = pd.merge(
    latest_filings, sp500, left_on='ticker', right_on='Symbol', how='left'
).dropna(subset=['content'])

# GEMINI API  FOR Q&A, PORTER 5 FORCES AND 7 POWER

### In this part of the code the main goal is to parse the text data and make sure it is analysable

- First install and initialize GEMINI API 
- I am using the 2.5 Flash model with a temperature of 0.01 to get reproducible answers
- I am forcing the answer to be in JSON format

In [None]:
# Read the Gemini API key from the environment (os.getenv returns None when
# the variable is not set).
# NOTE(review): presumably GEMINI_API_KEY is provided by the api.env file
# loaded with load_dotenv earlier in the notebook — confirm.
api_key = os.getenv('GEMINI_API_KEY')
# Configure the API key
genai.configure(api_key=api_key)

# Initialize the model with low temperature.
# `model` is the module-level object the processing functions below call.
model = genai.GenerativeModel(
    'gemini-2.5-flash-preview-05-20',
    generation_config={
        'temperature': 0.01, # Low temperature for more deterministic responses for reproducibility
        'top_p': 0.95, # Top-p (nucleus) sampling threshold
        'response_mime_type': 'application/json'  # Force JSON output
    }
)

## Q&A given Text data

I had prepared around 10 questions for each Item of the 10-K. I tried to make sure the questions are well designed for most companies and also relevant for each section.

This section was not used for the Thesis since I couldn't turn them into analysable data in the given timeframe

This is computationally intensive and might not be the most efficient way of doing things!!!

The code sends the whole section, together with the company name and the section title, alongside one question, and repeats this for each of the 10 questions for that section per company. This is very time- and energy-consuming.

The question mapping is a Python file called `questions.py`

### In the following we define a function that processes a dataframe with Gemini. I will design the dataframe so that one of the columns holds the prompt for each company. I will iterate through that column and save the results in a new column called gemini_response

In [None]:
def process_dataframe_with_gemini(df, file_name, prompt_column='question_prompt', max_retries=3):
    """Send one prompt per row of ``df`` to Gemini and store the raw answers.

    The text in ``prompt_column`` is passed to the module-level ``model``.
    Rows whose ``content`` is an empty string are not sent and receive an
    empty response. Rows whose prompt is missing or not a string (for example
    NaN) are not sent either and receive an error string. Failed calls are
    retried; when every attempt fails an ``"Error after N attempts: ..."``
    string is stored so the responses stay aligned with the rows.

    Args:
        df: DataFrame with a ``section`` column (used for progress output),
            a ``content`` column and the column named by ``prompt_column``.
        file_name: Path of the JSON file that receives the list of responses.
        prompt_column: Name of the column holding the prompt text.
        max_retries: Maximum number of attempts per row.

    Returns:
        The same ``df`` object, modified in place with a new
        ``gemini_response`` column.
    """
    # Substrings that identify quota / rate-limit failures. Quota errors are
    # commonly reported as "429 ... exhausted ... quota", which the plain
    # "rate limit" check alone would miss.
    rate_limit_markers = ("rate limit", "429", "quota", "resource exhausted",
                          "resource has been exhausted")
    responses = []
    total_rows = len(df)

    # Count rows by position: the index labels may have gaps (e.g. after
    # dropna) or be non-numeric, so they are not usable as a progress counter.
    for position, (_, row) in enumerate(df.iterrows(), start=1):
        print(f"Processing row {position}/{total_rows}: {row['section']}")

        # Nothing to analyse for an empty section: skip the API call.
        if row.get('content') == '':
            responses.append("")
            continue

        prompt = row.get(prompt_column)
        if not isinstance(prompt, str):
            # Retrying cannot repair a missing prompt, so fail this row at once
            # instead of sleeping through the retry loop.
            error_text = "Error: prompt is missing or not a string"
            print(error_text)
            responses.append(error_text)
            continue

        succeeded = False
        last_error = None
        response_text = ""

        for attempt in range(1, max_retries + 1):
            try:
                response_text = model.generate_content(prompt).text
                succeeded = True
                time.sleep(0.01)
                break
            except Exception as e:  # API boundary: record the failure and continue with the next row
                last_error = e
                print(f"Error on attempt {attempt}: {str(e)}")

                if attempt < max_retries:
                    message = str(e).lower()
                    is_rate_limited = any(marker in message for marker in rate_limit_markers)
                    wait_time = 65 if is_rate_limited else 5
                    print(f"Waiting {wait_time} seconds before retry...")
                    time.sleep(wait_time)

        if not succeeded:
            response_text = f"Error after {max_retries} attempts: {str(last_error)}"

        responses.append(response_text)

    df['gemini_response'] = responses

    with open(file_name, 'w', encoding='utf-8') as f:
        json.dump(responses, f, ensure_ascii=False, indent=2)

    return df

In [None]:
from questions import company_10k_questions

# Expand merged_df so that every (filing section, question) pair becomes its
# own row: each original row is copied once per question defined for its
# section, with the question text stored in 'Formatted_Question'.
all_rows_with_questions = []
column_names = list(merged_df.columns)

# Using itertuples(index=False) and zipping with columns for robust column name handling
for row_tuple in merged_df.itertuples(index=False, name=None):
    # Create a dictionary from the row tuple using original column names
    original_row_dict = dict(zip(column_names, row_tuple))

    section_from_df = original_row_dict.get('section')

    # Skip rows without a usable section name. A missing value read by pandas
    # is NaN (a float), which is not None and has no .strip(), so test the type.
    if not isinstance(section_from_df, str):
        continue

    # Fall back to a placeholder when the company name is absent. After the
    # left merge an unmatched ticker carries NaN here, which dict.get() would
    # return as-is and str.format() would render as "nan".
    company_name = original_row_dict.get('Security')
    if not isinstance(company_name, str) or not company_name.strip():
        company_name = 'Unknown Company'

    # Normalize the section name from DataFrame to match dictionary keys
    # (e.g., "Item 1. Business" to "Item 1: Business")
    section_from_df = section_from_df.strip()
    dict_key = section_from_df.replace(". ", ": ", 1)

    questions_list = company_10k_questions.get(dict_key)

    if questions_list:
        for question_template in questions_list:
            # Format the question with the company name
            formatted_question = question_template.format(company_name=company_name)

            # Create a new dictionary for the new row, copying original data.
            # The row's own 'Security' value is left untouched.
            new_row_entry = original_row_dict.copy()
            new_row_entry['Formatted_Question'] = formatted_question

            all_rows_with_questions.append(new_row_entry)
    else:
        # Report sections in the DataFrame that have no entry in the questions dict
        print(f"No questions found for section: {section_from_df} (mapped to {dict_key})")


# Create the new DataFrame with each question as a row
expanded_df = pd.DataFrame(all_rows_with_questions)

### Question & Answer Prompt

In [None]:
# Build the Q&A prompt for every row of expanded_df as one element-wise string
# concatenation. Each prompt combines:
#   - the analyst framing (section, company, GICS sector and sub-industry),
#   - the required JSON answer schema and the SWOT / confidence guidelines,
#   - the full section text from 'content',
#   - the question from 'Formatted_Question' (inside the schema and again at the end).
# NOTE(review): with pandas string concatenation a NaN in any referenced column
# ('section', 'Security', 'GICS Sector', 'GICS Sub-Industry', 'content',
# 'Formatted_Question') is expected to turn the whole prompt of that row into
# NaN - confirm that these columns are fully populated before sending.
expanded_df['question_prompt'] = (
 "As financial analysts, we are extracting financial data from the 10-K, more specifically the " +
expanded_df['section'] + " section of the 10-K for the company " + expanded_df['Security'] +
", which is generally operating in the " + expanded_df['GICS Sector'] +
" GICS Sector, specifically the " + expanded_df['GICS Sub-Industry'] + " GICS Sub-Industry.\n" +
"You are an information extraction bot. **Strictly adhere to the text in the \"" + expanded_df['section'] +
"\" section to answer the questions below.**\n" +
"**IMPORTANT: Your response must be ONLY a valid JSON object in this exact format:**\n" +
"{\n" +
" \"question\": \"" + expanded_df['Formatted_Question'] + "\",\n" +
" \"answer\": \"[Your detailed paragraph answer here]\",\n" +
" \"supporting_quote\": \"[Exact quote from the text that supports your answer]\",\n" +
" \"swot_category\": \"[One of: Strength, Weakness, Opportunity, Threat]\",\n" +
" \"confidence\": [Your confidence score from 0.0 to 1.0]\n" +
"}\n" +
"**Guidelines:**\n" +
"- Answer must be a complete paragraph, no bullet points or internal lists\n" +
"- Supporting quote must be an exact excerpt from the provided text\n" +
"- SWOT category determination:\n" +
"  - **Strength**: Positive internal factors that give the company advantages (e.g., strong market position, unique capabilities, competitive advantages)\n" +
"  - **Weakness**: Negative internal factors that put the company at a disadvantage (e.g., operational inefficiencies, resource limitations, competitive disadvantages)\n" +
"  - **Opportunity**: Positive external factors the company could exploit (e.g., market growth, emerging trends, regulatory changes that benefit)\n" +
"  - **Threat**: Negative external factors that could harm the company (e.g., competition, regulatory risks, market decline, economic headwinds)\n" +
"- Confidence should reflect how directly the information answers the question (1.0 = perfect match, 0.0 = no relevant information)\n" +
"- If information is not explicitly present, set answer to \"Information not available in this section.\", swot_category to \"Not Applicable\", and confidence to 0.0\n" +
"- Do not include any text outside the JSON object\n" +
"GICS Sector: " + expanded_df['GICS Sector'] + "\n" +
"GICS Sub-Industry: " + expanded_df['GICS Sub-Industry'] + "\n" +
"**" + expanded_df['section'] + " Text:**\n" +
expanded_df['content'] + "\n" +
"---\n" +
"**Question:** " + expanded_df['Formatted_Question']
)

### Use the code for the Q&A and Keep the answers there

In [None]:
# For Q&A SWOT analysis
# Run every 'question_prompt' of expanded_df through Gemini. The raw responses
# are written to `file_name` as a JSON list, and the dataframe (with the added
# 'gemini_response' column) is saved as CSV.
# NOTE(review): 'gemini_responses.json' is also the default output file of the
# DCF variant of process_dataframe_with_gemini defined later in this file, so
# running both with defaults writes to the same path.
file_name = 'gemini_responses.json'
df_QA_processed = process_dataframe_with_gemini(expanded_df, file_name, 'question_prompt')
df_QA_processed.to_csv('./data/cs_questions_answers.csv', index=False)

## Same Analysis for Porter 5 Forces

### I have prepared a JSON file with Porter's 5 Forces after analyzing the book, so the LLM knows exactly what to choose from when analyzing the companies and their report sections

- Here I only pass the text through section by section, which is more efficient but MIGHT result in some loss

- I have also explicitly asked for JSON output and instructed the model not to hallucinate and to stay grounded in the text
- Finally, I have asked for quotes and justifications so that I can verify on a sub-sample that the analysis was performed well (of course I couldn't check all the answers)


In [None]:
# Load the Porter's Five Forces framework definition as raw text. It is pasted
# verbatim into every Porter prompt, so it is kept as a string and not parsed.
# The encoding is pinned to UTF-8 (the standard JSON encoding) so the text is
# read the same way on every platform instead of using the locale default.
with open('Porter.json', 'r', encoding='utf-8') as f:
    porters_framework_json_str = f.read()

# Build one Porter's Five Forces prompt per filing section (one per row of
# merged_df) by element-wise string concatenation. Each prompt contains:
#   - the task description and company context (name, GICS sector/sub-industry, section),
#   - the exact JSON schema the model must return (one entry per force plus
#     'primary_force' and 'overall_summary'),
#   - instructions and scoring guidelines,
#   - the framework text loaded into porters_framework_json_str,
#   - the section text from 'content'.
# NOTE(review): a NaN in 'Security', 'GICS Sector', 'GICS Sub-Industry',
# 'section' or 'content' is expected to make the whole prompt of that row NaN.
# The first three come from a how='left' merge, so verify they are populated.
merged_df['porter_prompt'] = (
    "You are a financial analyst engine. Your task is to analyze a section of text extracted from " +
    merged_df['Security'] + "'s 10-K filing and evaluate how it relates to ALL of Porter's Five Forces, " +
    "determining the relevance and threat/power level for each force.\n\n" +
    "**Context:**\n" +
    "- Company: " + merged_df['Security'] + "\n" +
    "- GICS Sector: " + merged_df['GICS Sector'] + "\n" +
    "- GICS Sub-Industry: " + merged_df['GICS Sub-Industry'] + "\n" +
    "- Section: " + merged_df['section'] + "\n\n" +
    "**Your Goal:**\n" +
    "Analyze the provided section and assess how it relates to each of Porter's Five Forces, providing confidence scores and threat levels for all five forces.\n\n" +
    "**IMPORTANT: Your response MUST be ONLY a valid JSON object in this exact format:**\n" +
    "{\n" +
    "  \"analysis\": {\n" +
    "    \"threat_of_new_entrants\": {\n" +
    "      \"confidence_of_fit\": \"[Score from 0.0 to 1.0 indicating how well the text relates to this force]\",\n" +
    "      \"threat_level\": \"[Either 'High', 'Low', or 'Not Applicable' based on the conditions described]\",\n" +
    "      \"relevant_quotes\": [\"[Exact quotes from the text that support this analysis]\"],\n" +
    "      \"justification\": \"[Detailed explanation of the fit score and threat level, referencing specific conditions from the framework]\"\n" +
    "    },\n" +
    "    \"bargaining_power_of_buyers\": {\n" +
    "      \"confidence_of_fit\": \"[Score from 0.0 to 1.0]\",\n" +
    "      \"threat_level\": \"[Either 'High', 'Low', or 'Not Applicable']\",\n" +
    "      \"relevant_quotes\": [\"[Exact quotes from the text]\"],\n" +
    "      \"justification\": \"[Detailed explanation]\"\n" +
    "    },\n" +
    "    \"bargaining_power_of_suppliers\": {\n" +
    "      \"confidence_of_fit\": \"[Score from 0.0 to 1.0]\",\n" +
    "      \"threat_level\": \"[Either 'High', 'Low', or 'Not Applicable']\",\n" +
    "      \"relevant_quotes\": [\"[Exact quotes from the text]\"],\n" +
    "      \"justification\": \"[Detailed explanation]\"\n" +
    "    },\n" +
    "    \"threat_of_substitute_products\": {\n" +
    "      \"confidence_of_fit\": \"[Score from 0.0 to 1.0]\",\n" +
    "      \"threat_level\": \"[Either 'High', 'Low', or 'Not Applicable']\",\n" +
    "      \"relevant_quotes\": [\"[Exact quotes from the text]\"],\n" +
    "      \"justification\": \"[Detailed explanation]\"\n" +
    "    },\n" +
    "    \"intensity_of_rivalry\": {\n" +
    "      \"confidence_of_fit\": \"[Score from 0.0 to 1.0]\",\n" +
    "      \"threat_level\": \"[Either 'High', 'Low', or 'Not Applicable']\",\n" +
    "      \"relevant_quotes\": [\"[Exact quotes from the text]\"],\n" +
    "      \"justification\": \"[Detailed explanation]\"\n" +
    "    }\n" +
    "  },\n" +
    "  \"primary_force\": \"[The force with the highest confidence_of_fit score]\",\n" +
    "  \"overall_summary\": \"[Brief summary of how this section relates to the company's competitive position]\"\n" +
    "}\n\n" +
    "**Instructions:**\n" +
    "1. Read the entire section carefully and identify all competitive dynamics mentioned.\n" +
    "2. For EACH of the five forces:\n" +
    "   - Assign a confidence_of_fit score (0.0 = no relevance, 1.0 = perfect fit)\n" +
    "   - Determine the threat_level based on conditions described in the text\n" +
    "   - Extract exact quotes that support your analysis\n" +
    "   - Provide detailed justification referencing specific conditions from the framework\n" +
    "3. Use \"Not Applicable\" for threat_level when confidence_of_fit is below 0.2\n" +
    "4. Quote EXACT words and phrases from the text - do not paraphrase\n" +
    "5. Reference specific conditions from the PORTERS_FRAMEWORK_JSON in your justifications\n\n" +
    "**Scoring Guidelines:**\n" +
    "- 0.0-0.2: No clear relevance to this force\n" +
    "- 0.3-0.5: Some indirect relevance or implications\n" +
    "- 0.6-0.8: Clear relevance with specific examples\n" +
    "- 0.9-1.0: Direct discussion of this force with multiple specific conditions\n\n" +
    "**Important Analysis Guidelines:**\n" +
    "- Analyze ALL five forces, even if some have low relevance\n" +
    "- Use exact quotes to support your analysis\n" +
    "- Consider both explicit mentions and implicit implications\n" +
    "- Be specific about which conditions from the framework apply\n" +
    "- Assess threat levels based on whether described conditions increase or decrease competitive pressures\n\n" +
    "---\n" +
    "**PORTERS_FRAMEWORK_JSON:**\n" +
    porters_framework_json_str + "\n" +
    "---\n" +
    "**" + merged_df['section'] + " Text:**\n" +
    merged_df['content'] + "\n" +
    "---"
)

### Using the same function as before for porter's 5 Forces

In [None]:
# For Porter's analysis
# Send every 'porter_prompt' of merged_df to Gemini, dump the raw responses to
# `file_name_porter` and save the dataframe with its 'gemini_response' column.
# process_dataframe_with_gemini returns the dataframe it was given, so
# df_processed_porter is merged_df itself, now carrying 'gemini_response'.
file_name_porter = 'gemini_porter_responses.json'
df_processed_porter = process_dataframe_with_gemini(merged_df, file_name_porter, 'porter_prompt')
df_processed_porter.to_csv('./data/df_porter_analysis_results_1.csv', index=False)


## Same analysis for Hamilton Helmer's 7 Powers

In [None]:
# Read the 7 Powers framework from the JSON file as raw text. It is embedded
# verbatim in every Seven Powers prompt, so it is not parsed here.
# The encoding is pinned to UTF-8 (the standard JSON encoding) so the text is
# read the same way on every platform instead of using the locale default.
with open('SevenPower.json', 'r', encoding='utf-8') as f:
    seven_powers_json_str = f.read()

# Build one 7 Powers prompt per filing section (one per row of merged_df) by
# element-wise string concatenation. Each prompt contains:
#   - the task description and company context (name, GICS sector/sub-industry, section),
#   - the exact JSON schema the model must return (one entry per power plus
#     'primary_power', 'power_timing' and 'overall_summary'),
#   - instructions, scoring guidelines and the SWOT classification rules,
#   - the framework text loaded into seven_powers_json_str,
#   - the section text from 'content'.
# NOTE(review): a NaN in 'Security', 'GICS Sector', 'GICS Sub-Industry',
# 'section' or 'content' is expected to make the whole prompt of that row NaN.
# The first three come from a how='left' merge, so verify they are populated.
merged_df['seven_powers_prompt'] = (
    "You are a strategic business analyst specializing in Hamilton Helmer's 7 Powers framework. " +
    "Your task is to analyze a section of text extracted from " + merged_df['Security'] + 
    "'s 10-K filing and evaluate how it relates to ALL of the 7 Powers.\n\n" +
    "**Context:**\n" +
    "- Company: " + merged_df['Security'] + "\n" +
    "- GICS Sector: " + merged_df['GICS Sector'] + "\n" +
    "- GICS Sub-Industry: " + merged_df['GICS Sub-Industry'] + "\n" +
    "- Section: " + merged_df['section'] + "\n\n" +
    "**Your Goal:**\n" +
    "Analyze the provided section and assess how it relates to each of Hamilton's 7 Powers, providing confidence scores and strength assessments for all seven powers.\n\n" +
    "**IMPORTANT: Your response MUST be ONLY a valid JSON object in this exact format:**\n" +
    "{\n" +
    "  \"analysis\": {\n" +
    "    \"scale_economies\": {\n" +
    "      \"confidence_of_fit\": \"[Score from 0.0 to 1.0 indicating how well the text relates to this power]\",\n" +
    "      \"power_strength\": \"[Numerical score from 0.0 to 1.0]\",\n" +
    "      \"swot_classification\": \"['Strength' if power is present, 'Weakness' if absent but needed, 'Opportunity' if emerging, 'Threat' if competitors have it]\",\n" +
    "      \"relevant_quotes\": [\"[Exact quotes from the text that support this analysis]\"],\n" +
    "      \"key_indicators\": [\"[Specific evidence from the text]\"],\n" +
    "      \"justification\": \"[Detailed explanation of the fit score and strength assessment]\"\n" +
    "    },\n" +
    "    \"network_economies\": {\n" +
    "      \"confidence_of_fit\": \"[Score from 0.0 to 1.0]\",\n" +
    "      \"power_strength\": \"[Numerical score from 0.0 to 1.0]\",\n" +
    "      \"swot_classification\": \"[Strength/Weakness/Opportunity/Threat]\",\n" +
    "      \"relevant_quotes\": [\"[Exact quotes from the text]\"],\n" +
    "      \"key_indicators\": [\"[Specific evidence]\"],\n" +
    "      \"justification\": \"[Detailed explanation]\"\n" +
    "    },\n" +
    "    \"counter_positioning\": {\n" +
    "      \"confidence_of_fit\": \"[Score from 0.0 to 1.0]\",\n" +
    "      \"power_strength\": \"[Numerical score from 0.0 to 1.0]\",\n" +
    "      \"swot_classification\": \"[Strength/Weakness/Opportunity/Threat]\",\n" +
    "      \"relevant_quotes\": [\"[Exact quotes from the text]\"],\n" +
    "      \"key_indicators\": [\"[Specific evidence]\"],\n" +
    "      \"justification\": \"[Detailed explanation]\"\n" +
    "    },\n" +
    "    \"switching_costs\": {\n" +
    "      \"confidence_of_fit\": \"[Score from 0.0 to 1.0]\",\n" +
    "      \"power_strength\": \"[Numerical score from 0.0 to 1.0]\",\n" +
    "      \"swot_classification\": \"[Strength/Weakness/Opportunity/Threat]\",\n" +
    "      \"relevant_quotes\": [\"[Exact quotes from the text]\"],\n" +
    "      \"key_indicators\": [\"[Specific evidence]\"],\n" +
    "      \"justification\": \"[Detailed explanation]\"\n" +
    "    },\n" +
    "    \"branding\": {\n" +
    "      \"confidence_of_fit\": \"[Score from 0.0 to 1.0]\",\n" +
    "      \"power_strength\": \"[Numerical score from 0.0 to 1.0]\",\n" +
    "      \"swot_classification\": \"[Strength/Weakness/Opportunity/Threat]\",\n" +
    "      \"relevant_quotes\": [\"[Exact quotes from the text]\"],\n" +
    "      \"key_indicators\": [\"[Specific evidence]\"],\n" +
    "      \"justification\": \"[Detailed explanation]\"\n" +
    "    },\n" +
    "    \"cornered_resource\": {\n" +
    "      \"confidence_of_fit\": \"[Score from 0.0 to 1.0]\",\n" +
    "      \"power_strength\": \"[Numerical score from 0.0 to 1.0]\",\n" +
    "      \"swot_classification\": \"[Strength/Weakness/Opportunity/Threat]\",\n" +
    "      \"relevant_quotes\": [\"[Exact quotes from the text]\"],\n" +
    "      \"key_indicators\": [\"[Specific evidence]\"],\n" +
    "      \"justification\": \"[Detailed explanation]\"\n" +
    "    },\n" +
    "    \"process_power\": {\n" +
    "      \"confidence_of_fit\": \"[Score from 0.0 to 1.0]\",\n" +
    "      \"power_strength\": \"[Numerical score from 0.0 to 1.0]\",\n" +
    "      \"swot_classification\": \"[Strength/Weakness/Opportunity/Threat]\",\n" +
    "      \"relevant_quotes\": [\"[Exact quotes from the text]\"],\n" +
    "      \"key_indicators\": [\"[Specific evidence]\"],\n" +
    "      \"justification\": \"[Detailed explanation]\"\n" +
    "    }\n" +
    "  },\n" +
    "  \"primary_power\": \"[The power with the highest confidence_of_fit score]\",\n" +
    "  \"power_timing\": \"[Based on primary power: 'Origination', 'Takeoff', 'Stability', or 'N/A']\",\n" +
    "  \"overall_summary\": \"[Brief summary of the company's competitive position based on the 7 Powers analysis]\"\n" +
    "}\n\n" +
    "**Instructions:**\n" +
    "1. Read the entire section carefully and identify all strategic elements mentioned.\n" +
    "2. For EACH of the seven powers:\n" +
    "   - Assign a confidence_of_fit score (0.0 = no relevance, 1.0 = perfect fit)\n" +
    "   - Assign a power_strength score as a number from 0.0 to 1.0\n" +
    "   - Classify as Strength/Weakness/Opportunity/Threat for SWOT analysis\n" +
    "   - Extract exact quotes that support your analysis\n" +
    "   - List key indicators that demonstrate this power\n" +
    "   - Provide detailed justification referencing the framework\n" +
    "3. Quote EXACT words and phrases from the text - do not paraphrase\n" +
    "4. Reference specific definitions, benefits, and barriers from the SEVEN_POWERS_JSON\n\n" +
    "**Scoring Guidelines:**\n" +
    "- confidence_of_fit:\n" +
    "  - 0.0-0.2: No clear relevance to this power\n" +
    "  - 0.3-0.5: Some indirect relevance or weak indicators\n" +
    "  - 0.6-0.8: Clear relevance with specific examples\n" +
    "  - 0.9-1.0: Direct evidence of this power with multiple indicators\n\n" +
    "- power_strength (numerical):\n" +
    "  - 0.0-0.2: Not Present - No meaningful evidence of this power\n" +
    "  - 0.2-0.4: Weak - Some indicators present but limited evidence of real competitive advantage\n" +
    "  - 0.4-0.6: Emerging - Early signs that this power is developing\n" +
    "  - 0.6-0.8: Moderate - Good evidence but perhaps not fully developed or facing some challenges\n" +
    "  - 0.8-1.0: Strong - Clear evidence of an established power with significant competitive advantage\n\n" +
    "**SWOT Classification:**\n" +
    "- \"Strength\": Company clearly possesses this power\n" +
    "- \"Weakness\": Company lacks this power but could benefit from it\n" +
    "- \"Opportunity\": Early indicators suggest this power could be developed\n" +
    "- \"Threat\": Competitors appear to have this power advantage\n\n" +
    "**Important Analysis Guidelines:**\n" +
    "- Analyze ALL seven powers, even if some have low relevance\n" +
    "- Look for evidence of sustainable competitive advantages\n" +
    "- Consider the specific benefit and barrier for each power\n" +
    "- Identify specific metrics, market positions, or strategic elements\n" +
    "- Be precise about which aspects of each power are demonstrated\n\n" +
    "---\n" +
    "**SEVEN_POWERS_JSON:**\n" +
    seven_powers_json_str + "\n" +
    "---\n" +
    "**" + merged_df['section'] + " Text:**\n" +
    merged_df['content'] + "\n" +
    "---"
)

In [None]:
# For Hamilton's Seven Power analysis
# Send every 'seven_powers_prompt' of merged_df to Gemini, dump the raw
# responses to `file_name_power` and save the resulting dataframe as CSV.
# The call writes its answers into merged_df['gemini_response'], replacing
# whatever an earlier run on merged_df stored in that column.
file_name_power = 'gemini_power_responses_1.json'
df_processed_power = process_dataframe_with_gemini(merged_df, file_name_power, 'seven_powers_prompt')
df_processed_power.to_csv('./data/df_power_analysis_results_1.csv', index=False)


## Reddit Text data

Finally, I performed the same analysis for the Reddit data, but first we need to load the cleaned Reddit data and clean it a bit further before analysing it with the Gemini API

In [None]:
# Load the cleaned Reddit posts and keep only rows that have both a post body
# and comments.
# NOTE(review): this rebinds `reddit`, which the API cell at the top of the
# file uses for the praw.Reddit client. Re-running the collection cells after
# this one would pass a DataFrame where the client is expected.
reddit = pd.read_csv('./data/reddit/cleaned_stock.csv')
reddit = reddit.dropna(subset=['post', 'comments'])

## Convert created_utc to datetime 
# NOTE(review): if created_utc is stored as Unix epoch seconds, pd.to_datetime
# needs unit='s' to interpret it correctly - confirm the format in the CSV.
reddit['created_utc'] = pd.to_datetime(reddit['created_utc'])

# Extract year from created_utc
reddit['year'] = reddit['created_utc'].dt.year


In [None]:
# Keep only posts from `min_year` onwards and plot how many posts fall in each year.
min_year = 2018
reddit = reddit[reddit['year'] >= min_year].copy()

# Count posts per year, ordered chronologically, and draw them as a bar chart.
yearly_dist = reddit.groupby('year').size().sort_index()
yearly_dist.plot(kind='bar')
# The title is derived from the same cutoff as the filter so the two cannot
# disagree (the title previously said 2020 while the filter kept 2018+).
plt.title(f'Number of Posts by Year ({min_year} onwards)')
plt.xlabel('Year')
plt.ylabel('Number of Posts')
plt.xticks(rotation=45)
plt.show()

### Merging the Reddit data with the main `df` dataframe containing all the companies. This removes all the companies we are not analyzing and keeps only the ones we are

In [None]:
# Columns that carry no information for the analysis: the join key duplicate,
# leftover CSV index columns and the post permalink.
redundant_columns = ['search_term', 'Unnamed: 0.1', 'Unnamed: 0_x', 'Unnamed: 0_y', 'permalink']

# Inner-join the company table with the Reddit posts (Symbol == search_term),
# remove the redundant columns, then discard rows lacking a post body or comments.
reddit_merge = (
    pd.merge(df, reddit, left_on='Symbol', right_on='search_term', how='inner')
    .drop(columns=redundant_columns)
    .dropna(subset=['post', 'comments'])
)

### Prompt

In [None]:
# Build one sentiment-analysis prompt per Reddit post (one per row of
# reddit_merge) by element-wise string concatenation. Each prompt contains the
# ticker, post title and body, upvote ratio, comment count and comments,
# followed by the JSON schema the model must fill in and the instructions.
# 'upvote_ratio' and 'num_comments' are cast to str so they can be concatenated.
# NOTE(review): only 'post' and 'comments' are filtered for NaN before this
# cell. A NaN 'title' is expected to make the whole prompt of that row NaN -
# confirm titles are always present.
reddit_merge['reddit_sentiment_prompt'] = (
    "**Role:** You are an expert financial analyst specializing in social media sentiment and alternative data.\n\n" +
    "**Objective:** The following data is for the stock ticker **" + reddit_merge['Symbol'] + "**. " +
    "Analyze the Reddit post and its associated comments to provide a structured summary of the discussion for a machine learning model.\n\n" +
    "**Input Data:**\n" +
    "**Ticker:** " + reddit_merge['Symbol'] + "\n" +
    "**Post Title:** " + reddit_merge['title'] + "\n" +
    "**Post Body:** " + reddit_merge['post'] + "\n" +
    "**Upvote Ratio:** " + reddit_merge['upvote_ratio'].astype(str) + "\n" +
    "**Number of Comments:** " + reddit_merge['num_comments'].astype(str) + "\n" +
    "**Comments:** " + reddit_merge['comments'] + "\n\n" +  
    "**Task:** Based on the input data, generate a single JSON object that structures the discussion's content and sentiment. " +
    "The analysis should clearly distinguish between the original post's claims and the community's response in the comments.\n\n" +
    "**JSON Output Schema:**\n" +
    "{\n" +
    "  \"post_summary\": {\n" +
    "    \"thesis\": \"Summarize the main point, argument, or question of the original post. What is the author claiming or asking?\",\n" +
    "    \"evidence_type\": \"Classify the type of evidence used in the post (e.g., 'Official Financials', 'Technical Analysis', 'Speculation', 'News Article', 'Personal Opinion').\"\n" +
    "  },\n" +
    "  \"comments_summary\": {\n" +
    "    \"main_theme\": \"Summarize the general sentiment and key themes of the comments. Are they in agreement or disagreement with the post?\",\n" +
    "    \"counter_arguments\": \"List the most significant counter-arguments or bearish points raised in the comments.\",\n" +
    "    \"corroborating_points\": \"List the most significant points from the comments that support or agree with the original post.\"\n" +
    "  },\n" +
    "  \"quantitative_analysis\": {\n" +
    "    \"sentiment_score\": \"[A float score from -1.0 (very bearish) to 1.0 (very bullish) for the entire discussion]\",\n" +
    "    \"sentiment_reasoning\": \"A brief explanation for the sentiment score, citing the balance between the post and the comments.\",\n" +
    "    \"conviction_level\": \"[Rate as 'Low', 'Medium', or 'High' based on the certainty and language used across the entire discussion.]\",\n" +
    "    \"predominant_emotion\": \"[e.g., 'Analytical', 'Fear', 'Greed', 'Uncertainty', 'Hope']\"\n" +
    "  },\n" +
    "  \"market_outlook\": \"[Classify the overall conclusive outlook for the ticker from this discussion as 'Bullish', 'Bearish', 'Neutral', or 'Contentious']\"\n" +
    "}\n\n" +
    "**Instructions:**\n" +
    "1. The analysis must focus only on the provided ticker: **" + reddit_merge['Symbol'] + "**.\n" +
    "2. Clearly separate the analysis of the Post Body from the Comments.\n" +
    "3. market_outlook should be 'Contentious' if there is strong disagreement between the post and comments or within the comments themselves.\n" +
    "4. Consider the upvote ratio (" + reddit_merge['upvote_ratio'].astype(str) + ") as an indicator of community agreement with the post.\n" +
    "5. Note that there are " + reddit_merge['num_comments'].astype(str) + " comments on this post.\n" +
    "6. Provide the final output in a single JSON block without any additional commentary."
)

### Designing a VERY similar function as before but for Reddit data

In [None]:
import json
import time

def process_reddit_with_gemini(df, file_name, prompt_column='reddit_sentiment_prompt', max_retries=3):
    """Send one Reddit prompt per row of ``df`` to Gemini and store the answers.

    The text in ``prompt_column`` is passed to the module-level ``model``.
    Rows whose prompt is missing or not a string (for example NaN) are not
    sent and receive an error string. Failed calls are retried; when every
    attempt fails an ``"Error after N attempts: ..."`` string is stored so the
    responses stay aligned with the rows.

    Args:
        df: DataFrame holding the column named by ``prompt_column``. A
            ``Symbol`` (or ``symbol``) and a ``title`` column are used for
            progress output when both are present.
        file_name: Path of the JSON file that receives the list of responses.
        prompt_column: Name of the column holding the prompt text.
        max_retries: Maximum number of attempts per row.

    Returns:
        The same ``df`` object, modified in place with a new
        ``gemini_response`` column.
    """
    # Substrings that identify quota / rate-limit failures. Quota errors are
    # commonly reported as "429 ... exhausted ... quota", which the plain
    # "rate limit" check alone would miss.
    rate_limit_markers = ("rate limit", "429", "quota", "resource exhausted",
                          "resource has been exhausted")
    responses = []
    total_rows = len(df)

    # Count rows by position: after the merge/dropna the index labels are not
    # a reliable 1..N counter.
    for position, (index, row) in enumerate(df.iterrows(), start=1):
        # Display progress with symbol and title (or just index if not available).
        # The merged frame names the ticker column 'Symbol'; the lowercase
        # spelling is still accepted.
        symbol = row.get('Symbol', row.get('symbol'))
        title = row.get('title')
        if symbol is not None and title is not None:
            # str() guards against a NaN title, which cannot be sliced
            display_text = f"{symbol} - {str(title)[:50]}..."
        else:
            display_text = f"Row {index}"
        print(f"Processing {position}/{total_rows}: {display_text}")

        prompt = row.get(prompt_column)
        if not isinstance(prompt, str):
            # Retrying cannot repair a missing prompt, so fail this row at once
            # instead of sleeping through the retry loop.
            error_text = "Error: prompt is missing or not a string"
            print(error_text)
            responses.append(error_text)
            continue

        succeeded = False
        last_error = None
        response_text = ""

        for attempt in range(1, max_retries + 1):
            try:
                response_text = model.generate_content(prompt).text
                succeeded = True
                time.sleep(0.01)
                break
            except Exception as e:  # API boundary: record the failure and continue with the next row
                last_error = e
                print(f"Error on attempt {attempt}: {str(e)}")

                if attempt < max_retries:
                    message = str(e).lower()
                    is_rate_limited = any(marker in message for marker in rate_limit_markers)
                    wait_time = 65 if is_rate_limited else 5
                    print(f"Waiting {wait_time} seconds before retry...")
                    time.sleep(wait_time)

        if not succeeded:
            response_text = f"Error after {max_retries} attempts: {str(last_error)}"

        responses.append(response_text)

    df['gemini_response'] = responses

    with open(file_name, 'w', encoding='utf-8') as f:
        json.dump(responses, f, ensure_ascii=False, indent=2)

    return df


# For Reddit analysis
# Send every 'reddit_sentiment_prompt' of reddit_merge to Gemini, dump the raw
# responses to `file_name_reddit` and save the dataframe, including the new
# 'gemini_response' column, as CSV.
file_name_reddit = 'gemini_reddit_analysis.json'
df_reddit_processed = process_reddit_with_gemini(reddit_merge, file_name_reddit, 'reddit_sentiment_prompt')
df_reddit_processed.to_csv('./data/reddit_analysis_results.csv', index=False)

## Using GEMINI API for Financial Statements and DCF Accounts preparations for different sub-industries in given sector

Here I am categorizing the different sectors by the valuation method used to analyse them.
- In **Information Technology** the norm is to compute the discounted cash flow (**DCF**) model
- In **Financials** we need to compute the *Dividend Discount Model* (**DDM**)
- In **Real Estate** we need the Net Asset Value (**NAV**)

This is how we will categorize and price the companies in the different sectors, because this is how a fundamental analyst would do it

In [None]:
# GICS sectors grouped by the valuation approach applied to them.
# Sectors valued with a standard discounted cash flow (DCF) model:
std_dcf = ['Information Technology','Communication Services','Consumer Discretionary','Consumer Staples','Health Care','Industrials','Materials']
# Sectors valued with a dividend discount model (DDM):
ddm = ['Utilities','Financials']
# Sectors valued on net asset value (NAV):
nav = ['Real Estate','Energy']

### Making sure I have one company per Sub-Industry, because, as before, all the accounts coming from the EDGAR API are mapped at the sub-industry level. This means we simply need to map the DCF accounts at the sub-industry level as well

OUTPUT ACCOUNT `dcf.json`


In [None]:
# Slice the company table `sp` into one frame per GICS sector. These are the
# same seven sectors listed in `std_dcf`.
it = sp[sp['GICS Sector']=='Information Technology']
cs = sp[sp['GICS Sector']=='Communication Services']
cd = sp[sp['GICS Sector']=='Consumer Discretionary']
css = sp[sp['GICS Sector']=='Consumer Staples']
hc = sp[sp['GICS Sector']=='Health Care']
ind = sp[sp['GICS Sector']=='Industrials']
mat = sp[sp['GICS Sector']=='Materials']
# Stack the sector slices back into a single frame with a fresh 0..N-1 index.
df = pd.concat([it, cs, cd,css,hc,ind,mat], ignore_index=True)
# Keep one representative company per sub-industry.
# NOTE(review): GroupBy.first() takes the first non-null value of each column
# within a group, not the first row as a unit. If any column can contain NaN,
# the resulting row may mix values from different companies - confirm.
df_one_per_subindustry = df.groupby('GICS Sub-Industry').first().reset_index()

### Support code for DCF Account creation with prompt and a way to make sure the prompt is generated and is in proper columns

In [None]:



# Build, for one representative company per sub-industry, the prompt that asks
# Gemini to map that company's financial-statement account names onto the
# standard DCF line items. The results are stored in sp_analysis:
#   'accounts'        - list of column names found in the company's master CSV
#   'question_prompt' - the prompt text (stays None when the CSV is missing)
#   'error'           - message of the last failure, set only when all retries fail
sp_analysis = df_one_per_subindustry.copy()

# Initialize counter and accounts storage
ix = 0
sp_analysis['accounts'] = None
sp_analysis['question_prompt'] = None

# Process each company
for idx, row in sp_analysis.iterrows():
    symbol = row['Symbol']
    sub_industry = row['GICS Sub-Industry']
    
    # Retry configuration
    max_retries = 3
    retry_count = 0
    success = False
    
    while retry_count < max_retries and not success:
        try:
            print(f'Processing {symbol} (Attempt {retry_count + 1}/{max_retries})')
            
            # Read the financial statement; its column names are the account names.
            # NOTE: this rebinds the module-level name `df` on every iteration.
            df = pd.read_csv(f'./data/financial_statement/{symbol}_master.csv')
            accounts = df.columns.to_list()
            
            # Store accounts in the dataframe
            sp_analysis.at[idx, 'accounts'] = accounts
            
            # Create the prompt
            accounts_str = ', '.join(accounts)
            prompt = (
                "Hey Gemini I want a concise answer. \n" +
                " The Goal is to create a standard DCF model using the mappings you will provide. Take into account the account names for the specific industry and create a mapping code like shown in the example bellow "+
                "Make sure the answer is given in JSON format "+
                "\n ------ \n" +
                f"Here are account names for companies that are in {sub_industry} sub-industry. \n" +
                # The newline keeps the last account name from being fused with
                # the following sentence ("...DeferredRevenueI want ...").
                accounts_str + "\n" + "I want these accounts to be mapped like this: \n" +
                """I want you to use these accounts and create a mapping like this
                Make sure to always write Sub-Industry name in Pascal format: .... for example {
                "**ApplicationSoftware**": {
                "revenue": [
                "IncomeStatement - Revenues - TotalRevenue"
                ],
                "cogs": [
                "IncomeStatement - Expenses - CostOfRevenue"
                ],
                "operating_expenses": [
                "IncomeStatement - Expenses - SalesAndMarketing",
                "IncomeStatement - Expenses - GeneralAndAdministrative"
                ],
                "d_and_a": [
                "CashFlow - OperatingActivities - DepreciationAndAmortization"
                ],
                "capex": [
                "CashFlow - InvestingActivities - CapitalExpenditures"
                ],
                "stock_based_compensation": [
                "CashFlow - OperatingActivities - ShareBasedCompensation"
                ],
                "nwc_operating_assets": [
                "Assets - AccountsReceivable",
                "Assets - ContractAssets",
                "Assets - CapitalizedContractCosts",
                "Assets - PrepaidExpenses"
                ],
                "nwc_operating_liabilities": [
                "Liabilities - AccountsPayable",
                "Liabilities - AccruedLiabilities",
                "Liabilities - DeferredRevenue"
                ]
                }
                }
                Make sure the answer is just the mapping no introduction sentence no ending sentence just the mapping code""" +
                # Separate the two instructions, which were previously run
                # together as "...mapping codeDon't add...".
                "\n" +
                """Don't add any other comments i.e ( // Note: DepreciationAndAmortization, ShareBasedCompensation, and RestructuringCharges are typically excluded
                // from core operating expenses for normalized free cash flow calculation.)"""
                            )
            
            sp_analysis.at[idx, 'question_prompt'] = str(prompt)
            
            # Mark as successful
            success = True
            print(f'Successfully processed {symbol}')
            
        except FileNotFoundError:
            print(f'File not found for {symbol}. Skipping...')
            # Don't retry for file not found errors; 'question_prompt' stays None
            break
            
        except Exception as e:
            retry_count += 1
            print(f'Error processing {symbol}: {e}')
            
            if retry_count < max_retries:
                print(f'Waiting 60 seconds before retry...')
                time.sleep(60)  # Wait 1 minute before retrying
            else:
                print(f'Max retries reached for {symbol}. Moving to next company.')
                # Store the error message of the final failed attempt
                sp_analysis.at[idx, 'error'] = str(e)


### MAIN DCF JSON CREATION

In [None]:
def parse_json_from_response(response_text):
    """
    Extract JSON content from Gemini's response, handling markdown code blocks.

    Candidate texts are tried in this order:
      1. the body of the first markdown fence tagged ``json``,
      2. the whole stripped response,
      3. the body of the first markdown fence regardless of its tag.

    Each candidate is first decoded as a complete JSON document. If none
    succeeds, the first complete JSON object found at the first '{' of a
    candidate is returned instead, so a reply wrapped in an introduction or a
    trailing note still parses; text after that object is ignored.

    Args:
        response_text: Raw text returned by the model.

    Returns:
        The decoded JSON value (normally a dict), or None when nothing could
        be decoded. Failures are printed, not raised.
    """
    candidates = []

    tagged_fence = re.search(r'```json\s*(.*?)\s*```', response_text, re.DOTALL)
    if tagged_fence:
        candidates.append(tagged_fence.group(1).strip())

    candidates.append(response_text.strip())

    any_fence = re.search(r'```\s*(.*?)\s*```', response_text, re.DOTALL)
    if any_fence:
        candidates.append(any_fence.group(1).strip())

    # Pass 1: a candidate that is valid JSON as a whole wins outright.
    first_error = None
    for candidate in candidates:
        try:
            return json.loads(candidate)
        except json.JSONDecodeError as e:
            if first_error is None:
                first_error = e

    # Pass 2: tolerate surrounding prose by decoding only the first object.
    decoder = json.JSONDecoder()
    for candidate in candidates:
        brace_at = candidate.find('{')
        if brace_at == -1:
            continue
        try:
            parsed, _ = decoder.raw_decode(candidate[brace_at:])
        except json.JSONDecodeError:
            continue
        return parsed

    print(f"Error parsing JSON: {first_error}")
    print(f"JSON string attempted: {candidates[0][:200]}...")  # Show first 200 chars
    return None

def process_dataframe_with_gemini(df, output_file='gemini_responses.json'):
    """
    Process each row's prompt and save Gemini's response as proper JSON.

    For every row, the 'question_prompt' text is sent to the module-level
    ``model`` and the reply is decoded with ``parse_json_from_response``.
    All results are collected into one dictionary and written to
    ``output_file``.

    Args:
        df: DataFrame with 'question_prompt' and 'GICS Sub-Industry' columns.
            It is modified in place: a 'gemini_response' column is added.
        output_file: Path of the combined JSON file to write.

    Returns:
        The same ``df`` object, with the added 'gemini_response' column
        holding one entry per row (JSON text, raw reply, or error message).
    """
    all_responses = {}
    response_texts = []  # One entry per row, in row order

    for _, row in df.iterrows():
        # Read both fields before the try block so the error handler can
        # always name the row it failed on.
        sub_industry = row.get('GICS Sub-Industry')
        prompt = row.get('question_prompt')

        # A row without a prompt cannot succeed; record that without spending
        # an API call and the long error back-off on it.
        if not isinstance(prompt, str) or not prompt.strip():
            print(f"No prompt available for {sub_industry}. Skipping...")
            all_responses[sub_industry] = {"error": "No prompt available"}
            response_texts.append("Error: No prompt available")
            continue

        try:
            print(f"\n{'='*50}")
            print(f"Processing {sub_industry}")

            # Send prompt to Gemini
            response = model.generate_content(prompt)
            response_text = response.text

            print('Response received')

            parsed_json = parse_json_from_response(response_text)

            # An empty JSON value ({} or []) is falsy and is therefore
            # recorded the same way as an unparsable reply.
            if parsed_json:
                if isinstance(parsed_json, dict) and len(parsed_json) == 1:
                    # Single top-level key: merge it directly so the key
                    # chosen by the model becomes the entry name.
                    all_responses.update(parsed_json)
                else:
                    # Otherwise, use the sub-industry as the key
                    all_responses[sub_industry] = parsed_json

                print(f"Successfully parsed JSON for {sub_industry}")
                response_texts.append(json.dumps(parsed_json))
            else:
                print(f"Failed to parse JSON for {sub_industry}")
                all_responses[sub_industry] = {
                    "error": "Failed to parse JSON",
                    "raw_response": response_text[:500],
                }
                response_texts.append(response_text)

            print(f"{'='*50}\n")

            time.sleep(1)

        except Exception as e:
            # Broad on purpose: the exception types raised by the API client
            # are not visible here, and one bad row must not abort the run.
            print(f"Error processing row {sub_industry}: {str(e)}")

            all_responses[sub_industry] = {"error": str(e)}
            response_texts.append(f"Error: {str(e)}")

            time.sleep(60)  # Longer wait on error

    # Serialize before opening the file so a serialization problem cannot
    # leave a truncated file behind.
    payload = json.dumps(all_responses, indent=2, ensure_ascii=False)
    try:
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(payload)
        print(f"\n✓ Valid JSON file created: {output_file}")
    except OSError as e:
        # Retrying the same write to the same path would fail the same way;
        # report it and still hand the dataframe back to the caller.
        print(f"\n✗ Error writing JSON file: {e}")

    # Add responses to dataframe
    df['gemini_response'] = response_texts

    return df


# Run every DCF prompt through Gemini; all industries are merged into a single
# JSON object, and the per-row replies are kept next to the prompts in a CSV.
dcf_json_path = 'dcf.json'
responses_csv_path = 'sp_analysis_with_responses.csv'

df_processed = process_dataframe_with_gemini(sp_analysis, dcf_json_path)
df_processed.to_csv(responses_csv_path, index=False)

print(f"Processing complete. Check '{dcf_json_path}' for properly formatted JSON.")

### Finally, we want to bring in important financial accounts (total assets, total liabilities, revenue, etc.), using the same logic as for the DCF

In [None]:
# Canonical column name of each account we look for, keyed by the short name
# used in the JSON mapping the model is asked to return.
target_accounts = {
    'frame': 'frame',
    'total_assets': 'Assets - TotalAssets',
    'total_liabilities': 'Liabilities - TotalLiabilities', 
    'total_equity': 'Equity - TotalStockholdersEquity',
    'total_revenue': 'IncomeStatement - Revenues - TotalRevenue',
    'gross_profit': 'IncomeStatement - OtherIncomeExpense - GrossProfit',
    'operating_income': 'IncomeStatement - OtherIncomeExpense - OperatingIncome',
    'net_income': 'IncomeStatement - NetIncome - NetIncome',
    'comprehensive_income': 'IncomeStatement - NetIncome - ComprehensiveIncome',
    'basic_eps': 'IncomeStatement - EarningsPerShare - BasicEps',
    'diluted_eps': 'IncomeStatement - EarningsPerShare - DilutedEps',
    'shares_basic': 'IncomeStatement - EarningsPerShare - WeightedAverageSharesBasic',
    'shares_diluted': 'IncomeStatement - EarningsPerShare - WeightedAverageSharesDiluted'
}

# Work on a copy so df_one_per_subindustry itself is left untouched.
sp_analysis = df_one_per_subindustry.copy()

# Per-row result columns filled in by the loop below.
sp_analysis['accounts'] = None
sp_analysis['question_prompt'] = None
sp_analysis['available_target_accounts'] = None

# Build one mapping prompt per company from the columns of its statement file.
for idx, row in sp_analysis.iterrows():
    symbol = row['Symbol']
    sub_industry = row['GICS Sub-Industry']

    try:
        print(f'Processing {symbol}')

        # Read the financial statement
        df = pd.read_csv(f'./data/financial_statement/{symbol}_master.csv')
        accounts = df.columns.to_list()

        # Store accounts in the dataframe
        sp_analysis.at[idx, 'accounts'] = accounts

        # Find which target accounts are available (exact matches)
        known_columns = set(accounts)
        available_targets = {
            key: target_col
            for key, target_col in target_accounts.items()
            if target_col in known_columns
        }

        # Store available target accounts
        sp_analysis.at[idx, 'available_target_accounts'] = available_targets

        # Create the prompt for mapping
        accounts_str = ', '.join(accounts)

        # Convert sub_industry to PascalCase
        pascal_sub_industry = ''.join(word.capitalize() for word in sub_industry.replace('-', ' ').replace('&', 'And').split())

        prompt = (
            "Analyze the financial statement columns and create a JSON mapping for these specific accounts. "
            "Return ONLY the JSON mapping with no additional text or explanations.\n\n"
            f"Industry: {sub_industry}\n"
            f"Available columns: {accounts_str}\n\n"
            "Find exact matches or closest equivalents for these target accounts:\n"
            "1. frame - Time period identifier\n"
            "2. Assets - TotalAssets - Total assets from balance sheet\n"
            "3. Liabilities - TotalLiabilities - Total liabilities from balance sheet\n"
            "4. Equity - TotalStockholdersEquity - Total stockholders equity\n"
            "5. IncomeStatement - Revenues - TotalRevenue - Total revenue/sales\n"
            "6. IncomeStatement - OtherIncomeExpense - GrossProfit - Gross profit\n"
            "7. IncomeStatement - OtherIncomeExpense - OperatingIncome - Operating income\n"
            "8. IncomeStatement - NetIncome - NetIncome - Net income\n"
            "9. IncomeStatement - NetIncome - ComprehensiveIncome - Comprehensive income\n"
            "10. IncomeStatement - EarningsPerShare - BasicEps - Basic earnings per share\n"
            "11. IncomeStatement - EarningsPerShare - DilutedEps - Diluted earnings per share\n"
            "12. IncomeStatement - EarningsPerShare - WeightedAverageSharesBasic - Weighted avg shares basic\n"
            "13. IncomeStatement - EarningsPerShare - WeightedAverageSharesDiluted - Weighted avg shares diluted\n\n"
            'Return the mapping in this exact JSON format:\n'
            '{\n'
            f'  "{pascal_sub_industry}": {{\n'
            '    "frame": ["exact_column_name"],\n'
            '    "total_assets": ["exact_column_name"],\n'
            '    "total_liabilities": ["exact_column_name"],\n'
            '    "total_equity": ["exact_column_name"],\n'
            '    "total_revenue": ["exact_column_name"],\n'
            '    "gross_profit": ["exact_column_name"],\n'
            '    "operating_income": ["exact_column_name"],\n'
            '    "net_income": ["exact_column_name"],\n'
            '    "comprehensive_income": ["exact_column_name"],\n'
            '    "basic_eps": ["exact_column_name"],\n'
            '    "diluted_eps": ["exact_column_name"],\n'
            '    "shares_basic": ["exact_column_name"],\n'
            '    "shares_diluted": ["exact_column_name"]\n'
            '  }\n'
            '}\n\n'
            "Rules: Use exact column names from the list above. If a column doesn't exist, use empty array []. No comments or extra text."
        )

        sp_analysis.at[idx, 'question_prompt'] = str(prompt)

        print(f'Successfully processed {symbol}')
        print(f'Available target accounts: {len(available_targets)}/{len(target_accounts)}')

    except FileNotFoundError:
        # No statement file for this company: leave its result columns as None.
        print(f'File not found for {symbol}. Skipping...')

    except Exception as e:
        # Everything in the try block is local work (reading a CSV, building a
        # string), so a retry would fail the same way; record the error and
        # move on instead of sleeping and retrying.
        print(f'Error processing {symbol}: {e}')
        sp_analysis.at[idx, 'error'] = str(e)

# Create a summary of what accounts are commonly available
print("\n" + "="*50)
print("SUMMARY OF ACCOUNT AVAILABILITY")
print("="*50)

# Count, for each target account, the companies whose statement contains it.
account_availability = dict.fromkeys(target_accounts, 0)

for found_accounts in sp_analysis['available_target_accounts']:
    # Rows that were skipped or failed hold no dict here; ignore them.
    if isinstance(found_accounts, dict):
        for key in found_accounts:
            account_availability[key] += 1

total_companies = len(sp_analysis)
print(f"\nTotal companies processed: {total_companies}")
print("\nAccount availability across companies:")
for account, count in account_availability.items():
    # Guard against an empty dataframe, which would otherwise divide by zero.
    percentage = (count / total_companies) * 100 if total_companies else 0.0
    print(f"  {account}: {count}/{total_companies} ({percentage:.1f}%)")

# Save results under a timestamped name so earlier runs are not overwritten.
output_file = f'financial_mapping_analysis_{int(time.time())}.csv'
sp_analysis.to_csv(output_file, index=False)
print(f"\nResults saved to: {output_file}")

# Collect, per ticker symbol, the target accounts that were matched exactly,
# as a reference for manually reviewing the generated mappings.
sample_mapping = {}
for idx, row in sp_analysis.iterrows():
    found_accounts = row['available_target_accounts']
    # Skip companies with no statement file or no exact matches.
    if found_accounts is None or len(found_accounts) == 0:
        continue

    symbol = row['Symbol']
    sub_industry = row['GICS Sub-Industry']
    # PascalCase version of the sub-industry name, built the same way as in
    # the prompt.
    normalized_name = sub_industry.replace('-', ' ').replace('&', 'And')
    pascal_sub_industry = ''.join(word.capitalize() for word in normalized_name.split())

    sample_mapping[symbol] = dict(
        sub_industry=sub_industry,
        pascal_sub_industry=pascal_sub_industry,
        available_accounts=found_accounts,
    )

# Write the reference file under a timestamped name.
sample_file = 'sample_mapping_{}.json'.format(int(time.time()))
with open(sample_file, 'w') as f:
    f.write(json.dumps(sample_mapping, indent=2))
print(f"Sample mapping saved to: {sample_file}")

print("\nProcessing complete! Review the prompts and use them with your AI to generate the final JSON mappings.")

In [None]:
def parse_json_from_response(response_text):
    """
    Extract JSON content from Gemini's response, handling markdown code blocks.

    The text to decode is chosen in this order:
      1. the body of the first markdown fence tagged ``json``,
      2. otherwise the span from the first '{' to the last '}',
      3. otherwise the whole stripped response.

    If that text is not valid JSON as a whole (for example because prose
    after the object contains another closing brace, which makes the span in
    step 2 too long), the first complete JSON object starting at its first
    '{' is decoded and returned; any text after that object is ignored.

    Args:
        response_text: Raw text returned by the model.

    Returns:
        The decoded JSON value (normally a dict), or None when nothing could
        be decoded. Failures are printed, not raised.
    """
    fenced = re.search(r'```json\s*(.*?)\s*```', response_text, re.DOTALL)
    if fenced:
        json_str = fenced.group(1).strip()
    else:
        brace_span = re.search(r'\{.*\}', response_text, re.DOTALL)
        if brace_span:
            json_str = brace_span.group(0).strip()
        else:
            # Assume the entire response is JSON
            json_str = response_text.strip()

    try:
        return json.loads(json_str)
    except json.JSONDecodeError as e:
        parse_error = e

    # The strict parse failed; try to salvage the leading object only.
    object_start = json_str.find('{')
    if object_start != -1:
        try:
            parsed, _ = json.JSONDecoder().raw_decode(json_str[object_start:])
        except json.JSONDecodeError:
            parsed = None
        else:
            return parsed

    print(f"Error parsing JSON: {parse_error}")
    print(f"JSON string attempted: {json_str[:300]}...")  # Show first 300 chars
    return None

def process_financial_mapping_with_gemini(df, output_file='financial_mappings.json'):
    """
    Process each row's financial mapping prompt and save Gemini's response as proper JSON.

    For every row, the 'question_prompt' text is sent to the module-level
    ``model`` and the reply is decoded with ``parse_json_from_response``.
    Results are collected into one dictionary, written to ``output_file``,
    and a ``*_summary.json`` file with run statistics is written next to it.
    Failures are stored under a ``"<symbol>_ERROR"`` key.

    Args:
        df: DataFrame with 'Symbol', 'question_prompt' and
            'GICS Sub-Industry' columns. It is not modified.
        output_file: Path of the combined JSON file to write.

    Returns:
        A tuple ``(df_copy, all_mappings)``: a copy of ``df`` with added
        'gemini_response' and 'parsed_mapping' columns, and the dictionary
        that was written to ``output_file``.
    """
    all_mappings = {}
    response_texts = []   # One entry per row, in row order
    parsed_mappings = []  # One entry per row, in row order

    total_rows = len(df)

    # enumerate() supplies the progress position; the dataframe index label
    # may be non-contiguous or non-numeric and cannot be used for that.
    for position, (_, row) in enumerate(df.iterrows(), start=1):
        # Read the identifying fields before the try block so the error
        # handler always refers to the current row.
        symbol = row.get('Symbol')
        sub_industry = row.get('GICS Sub-Industry')
        prompt = row.get('question_prompt')

        print(f"\n{'='*60}")
        print(f"Processing {position}/{total_rows}: {symbol} - {sub_industry}")

        # A row without a prompt cannot succeed; record that without spending
        # an API call and the error back-off on it.
        if not isinstance(prompt, str) or not prompt.strip():
            print(f"✗ No prompt available for {symbol}. Skipping...")
            error_info = {
                "error": "No prompt available",
                "symbol": symbol,
                "sub_industry": sub_industry
            }
            all_mappings[f"{symbol}_ERROR"] = error_info
            response_texts.append("Error: No prompt available")
            parsed_mappings.append(error_info)
            continue

        try:
            # Send prompt to Gemini
            response = model.generate_content(prompt)
            response_text = response.text

            print('Response received from Gemini')

            parsed_json = parse_json_from_response(response_text)

            # An empty JSON value ({} or []) is falsy and is therefore
            # recorded the same way as an unparsable reply.
            if parsed_json:
                if isinstance(parsed_json, dict) and len(parsed_json) == 1:
                    # Single top-level key (industry name): merge it directly
                    # and keep only the inner mapping for the dataframe.
                    all_mappings.update(parsed_json)
                    industry_key = next(iter(parsed_json))
                    parsed_mappings.append(parsed_json[industry_key])
                else:
                    # Otherwise, use the symbol as the key
                    all_mappings[symbol] = parsed_json
                    parsed_mappings.append(parsed_json)

                print(f"✓ Successfully parsed JSON mapping for {symbol}")
                response_texts.append(json.dumps(parsed_json, indent=2))
            else:
                print(f"✗ Failed to parse JSON for {symbol}")
                error_info = {
                    "error": "Failed to parse JSON", 
                    "raw_response": response_text[:500],
                    "symbol": symbol,
                    "sub_industry": sub_industry
                }
                all_mappings[f"{symbol}_ERROR"] = error_info
                response_texts.append(response_text)
                parsed_mappings.append(error_info)

            print(f"{'='*60}\n")

            # Rate limiting - wait between requests
            time.sleep(2)

        except Exception as e:
            # Broad on purpose: the exception types raised by the API client
            # are not visible here, and one bad row must not abort the run.
            print(f"✗ Error processing {symbol}: {str(e)}")

            error_info = {
                "error": str(e),
                "symbol": symbol,
                "sub_industry": sub_industry,
                "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
            }
            all_mappings[f"{symbol}_ERROR"] = error_info
            response_texts.append(f"Error: {str(e)}")
            parsed_mappings.append(error_info)

            # Longer wait on error
            time.sleep(10)

    # Write the combined JSON to file
    try:
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(all_mappings, f, indent=2, ensure_ascii=False)
        print(f"\n✓ Financial mappings JSON file created: {output_file}")

        # Also create a summary file
        summary_file = output_file.replace('.json', '_summary.json')
        failed_count = sum(1 for key in all_mappings if str(key).endswith('_ERROR'))
        summary = {
            "total_companies": total_rows,
            "successful_mappings": len(all_mappings) - failed_count,
            "failed_mappings": failed_count,
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
            # dict.fromkeys de-duplicates while keeping first-seen order, so
            # the summary is the same on every run.
            "industries_processed": list(dict.fromkeys(df['GICS Sub-Industry'].tolist()))
        }

        with open(summary_file, 'w', encoding='utf-8') as f:
            json.dump(summary, f, indent=2, ensure_ascii=False)
        print(f"✓ Summary file created: {summary_file}")

    except Exception as e:
        print(f"\n✗ Error writing JSON file: {e}")
        # Fallback: write as pretty-printed string to a separate file
        fallback_file = output_file.replace('.json', '_fallback.txt')
        with open(fallback_file, 'w', encoding='utf-8') as f:
            f.write(json.dumps(all_mappings, indent=2, ensure_ascii=False))
        print(f"✓ Fallback file created: {fallback_file}")

    # Add responses to a copy so the caller's dataframe is left untouched
    df_copy = df.copy()
    df_copy['gemini_response'] = response_texts
    df_copy['parsed_mapping'] = parsed_mappings

    return df_copy, all_mappings

def validate_financial_mappings(mappings_dict):
    """
    Validate the financial mappings and provide a report.

    Every entry whose key ends in '_ERROR' is counted as failed and otherwise
    ignored. Every other entry is counted in 'total_industries' and put into
    exactly one category: complete (at least 80% of the target accounts
    mapped), partial (at least 30%), or failed (below 30%, not a dict, or
    carrying an 'error' key).

    Args:
        mappings_dict: Dictionary of industry key -> mapping, where a mapping
            is a dict of account name -> list of column names.

    Returns:
        A dict with the counters 'total_industries', 'complete_mappings',
        'partial_mappings' and 'failed_mappings', plus:
          - 'account_coverage': industry key -> {'mapped', 'missing',
            'coverage_ratio'} for every dict mapping without an 'error' key;
          - 'missing_accounts': account name -> list of industry keys whose
            mapping has no column for that account.
    """
    validation_report = {
        "total_industries": 0,
        "complete_mappings": 0,
        "partial_mappings": 0,
        "failed_mappings": 0,
        "account_coverage": {},
        "missing_accounts": {}
    }

    # Target accounts we're looking for
    target_accounts = [
        'frame', 'total_assets', 'total_liabilities', 'total_equity',
        'total_revenue', 'gross_profit', 'operating_income', 'net_income',
        'comprehensive_income', 'basic_eps', 'diluted_eps', 'shares_basic', 'shares_diluted'
    ]

    for industry_key, mapping in mappings_dict.items():
        if str(industry_key).endswith('_ERROR'):
            validation_report["failed_mappings"] += 1
            continue

        validation_report["total_industries"] += 1

        # An entry that is not a usable mapping still has to land in a
        # category, otherwise the categories do not add up to the total.
        if not isinstance(mapping, dict) or 'error' in mapping:
            validation_report["failed_mappings"] += 1
            continue

        # An account counts as mapped when it has a non-empty value.
        mapped_accounts = [a for a in target_accounts if mapping.get(a)]
        missing_accounts = [a for a in target_accounts if not mapping.get(a)]

        for account in missing_accounts:
            validation_report["missing_accounts"].setdefault(account, []).append(industry_key)

        # Classify mapping completeness
        coverage_ratio = len(mapped_accounts) / len(target_accounts)
        if coverage_ratio >= 0.8:  # 80% or more
            validation_report["complete_mappings"] += 1
        elif coverage_ratio >= 0.3:  # 30% or more
            validation_report["partial_mappings"] += 1
        else:
            validation_report["failed_mappings"] += 1

        validation_report["account_coverage"][industry_key] = {
            "mapped": mapped_accounts,
            "missing": missing_accounts,
            "coverage_ratio": coverage_ratio
        }

    return validation_report

# Drive the full financial-mapping run: query Gemini for every prompt,
# validate the collected mappings, then persist the results and the report.
banner = '=' * 60

print("Starting financial mapping processing with Gemini...")
print("This may take several minutes depending on the number of companies...")

mappings_json_path = 'financial_account_mappings.json'
df_processed, financial_mappings = process_financial_mapping_with_gemini(
    sp_analysis,
    mappings_json_path,
)

validation_report = validate_financial_mappings(financial_mappings)

# Console report: one line per counter of the validation report.
print("\n" + banner)
print("PROCESSING COMPLETE - VALIDATION REPORT")
print(banner)
report_lines = (
    ("Total industries processed", 'total_industries'),
    ("Complete mappings (80%+ coverage)", 'complete_mappings'),
    ("Partial mappings (30-79% coverage)", 'partial_mappings'),
    ("Failed mappings (<30% coverage)", 'failed_mappings'),
)
for report_label, report_key in report_lines:
    print(f"{report_label}: {validation_report[report_key]}")

# Dataframe with the raw and parsed replies
results_csv_path = 'financial_mapping_results.csv'
df_processed.to_csv(results_csv_path, index=False)
print(f"\n✓ Processed dataframe saved to: {results_csv_path}")

# Validation report as JSON
report_json_path = 'mapping_validation_report.json'
with open(report_json_path, 'w', encoding='utf-8') as f:
    json.dump(validation_report, f, indent=2, ensure_ascii=False)
print(f"✓ Validation report saved to: {report_json_path}")

print("\n" + banner)
print("FILES CREATED:")
created_file_notes = (
    "1. financial_account_mappings.json - Main mapping file",
    "2. financial_account_mappings_summary.json - Processing summary",
    "3. financial_mapping_results.csv - Dataframe with responses",
    "4. mapping_validation_report.json - Validation analysis",
)
for created_file_note in created_file_notes:
    print(created_file_note)
print(banner)

print("\nProcessing complete! Review the JSON files for your financial account mappings.")