# The goal of this notebook is to extract and analyze the text data available in the EDGAR tool

In [132]:
import pandas as pd
import requests
from datetime import datetime
import json

### Define the Header and import the S&P 500 Company data

In [133]:
sp = pd.read_csv('./data/sp500.csv')
sp.head(20)


Unnamed: 0.1,Unnamed: 0,Symbol,Security,GICS Sector,GICS Sub-Industry,Headquarters Location,Date added,CIK,Founded
0,0,MMM,3M,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1957-03-04,66740,1902
1,1,AOS,A. O. Smith,Industrials,Building Products,"Milwaukee, Wisconsin",2017-07-26,91142,1916
2,2,ABT,Abbott Laboratories,Health Care,Health Care Equipment,"North Chicago, Illinois",1957-03-04,1800,1888
3,3,ABBV,AbbVie,Health Care,Biotechnology,"North Chicago, Illinois",2012-12-31,1551152,2013 (1888)
4,4,ACN,Accenture,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989
5,5,ADBE,Adobe Inc.,Information Technology,Application Software,"San Jose, California",1997-05-05,796343,1982
6,6,AMD,Advanced Micro Devices,Information Technology,Semiconductors,"Santa Clara, California",2017-03-20,2488,1969
7,7,AES,AES Corporation,Utilities,Independent Power Producers & Energy Traders,"Arlington, Virginia",1998-10-02,874761,1981
8,8,AFL,Aflac,Financials,Life & Health Insurance,"Columbus, Georgia",1999-05-28,4977,1955
9,9,A,Agilent Technologies,Health Care,Life Sciences Tools & Services,"Santa Clara, California",2000-06-05,1090872,1999


In [134]:
it = sp[sp['GICS Sector']=='Information Technology']
cs = sp[sp['GICS Sector']=='Communication Services']
cd = sp[sp['GICS Sector']=='Consumer Discretionary']
df = pd.concat([it, cs, cd], ignore_index=True)
it.count()

Unnamed: 0               69
Symbol                   69
Security                 69
GICS Sector              69
GICS Sub-Industry        69
Headquarters Location    69
Date added               69
CIK                      69
Founded                  69
dtype: int64

In [135]:
pd.reset_option('all')

  pd.reset_option('all')
  pd.reset_option('all')


In [136]:
import re
import os
import pandas as pd
import glob
from bs4 import BeautifulSoup, XMLParsedAsHTMLWarning
import warnings
warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)

# Configuration
companies = list(it['Symbol'].values) # List of company tickers to process
base_dir = './data/sec_filings/'

# List to store data for all companies and all sections
all_data = []

# Section patterns and display names
section_patterns = {
    "Business": {
        "start_patterns": [r'ITEM\s+1\.\s*', r'Item\s+1\.\s*'],
        "end_patterns": [r'ITEM\s+1A\.\s*', r'Item\s+1A\.\s*', r'ITEM\s+1\.A\.\s*', r'Item\s+1\.A\.\s*', r'ITEM\s+2\.\s*', r'Item\s+2\.\s*'],
        "display_name": "Item 1. Business"
    },
    "Risk Factors": {
        "start_patterns": [r'ITEM\s+1A\.\s*', r'Item\s+1A\.\s*'],
        "end_patterns": [r'ITEM\s+1B\.\s*', r'Item\s+1B\.\s*', r'ITEM\s+1C\.\s*', r'Item\s+1C\.\s*', r'ITEM\s+2\.\s*', r'Item\s+2\.\s*'],
        "display_name": "Item 1A. Risk Factors"
    },
    "Cybersecurity": {
        "start_patterns": [r'ITEM\s+1C\.\s*', r'Item\s+1C\.\s*'],
        "end_patterns": [r'ITEM\s+2\.\s*', r'Item\s+2\.\s*'],
        "display_name": "Item 1C. Cybersecurity"
    },
    "Mine Safety Disclosures": {
    "start_patterns": [
        r'ITEM\s+4\.\s*',
        r'Item\s+4\.\s*',
    ],
    "end_patterns": [
        r'PART\s+II\s*', 
        r'ITEM\s+5\.\s*', 
        r'Item\s+5\.\s*'
    ],
    "display_name": "Item 4. Mine Safety Disclosures"
    },
    "Properties": {
        "start_patterns": [r'ITEM\s+2\.\s*', r'Item\s+2\.\s*'],
        "end_patterns": [r'ITEM\s+3\.\s*', r'Item\s+3\.\s*'],
        "display_name": "Item 2. Properties"
    },
    "Legal Proceedings": {
        "start_patterns": [r'ITEM\s+3\.\s*', r'Item\s+3\.\s*'],
        "end_patterns": [r'ITEM\s+4\.\s*', r'Item\s+4\.\s*'],
        "display_name": "Item 3. Legal Proceedings"
    },
    "Mine Safety Disclosures": {
        "start_patterns": [r'ITEM\s+4\.\s*', r'Item\s+4\.\s*'],
        "end_patterns": [r'PART\s+II\s*', r'ITEM\s+5\.\s*', r'Item\s+5\.\s*'],
        "display_name": "Item 4. Mine Safety Disclosures"
    },
    "Management Discussion and Analysis": {
        "start_patterns": [r'ITEM\s+7\.\s*', r'Item\s+7\.\s*'],
        "end_patterns": [r'ITEM\s+7A\.\s*', r'Item\s+7A\.\s*', r'ITEM\s+7\.A\.\s*', r'Item\s+7\.A\.\s*'],
        "display_name": "Item 7. Management Discussion and Analysis"
    },
    "Quantitative and Qualitative Disclosures": {
        "start_patterns": [r'ITEM\s+7A\.\s*', r'Item\s+7A\.\s*'],
        "end_patterns": [r'ITEM\s+8\.\s*', r'Item\s+8\.\s*'],
        "display_name": "Item 7A. Quantitative and Qualitative Disclosures about Market Risk"
    },
}

# Find all 10-K files for specified companies
file_paths = []
for ticker in companies:
    pattern = f"{base_dir}{ticker}_10-K_*.html"
    ticker_files = glob.glob(pattern)
    file_paths.extend(ticker_files)

if not file_paths:
    print("No matching files found. Please check the directory and file naming pattern.")
else:
    print(f"Found {len(file_paths)} files to process.")

    # Process each file
    for html_file_path in file_paths:
        if not os.path.exists(html_file_path):
            print(f"Error: File not found at '{html_file_path}'")
            continue

        try:
            # Extract ticker from filename
            base_filename = os.path.basename(html_file_path)
            ticker = base_filename.split('_')[0]

            # Extract date from filename
            parts = base_filename.split('_')
            filing_date = parts[2].split('.')[0] if len(parts) >= 3 else None

            # Read the HTML file
            with open(html_file_path, 'r', encoding='utf-8') as f:
                html_content = f.read()

            # Parse the HTML
            soup = BeautifulSoup(html_content, 'lxml')

            # Extract Text
            text_content = soup.get_text(separator=" ", strip=True)

            # Clean the text
            text_lines = text_content.splitlines()
            cleaned_lines = []
            for line in text_lines:
                processed_line = re.sub(r'[ \t]+', ' ', line).strip()
                if processed_line:
                    cleaned_lines.append(processed_line)
            final_text = "\n".join(cleaned_lines)

            # Sections to extract
            section_names_to_extract = [
                "Business",
                "Risk Factors",
                "Cybersecurity",
                "Properties",
                "Legal Proceedings",
                "Mine Safety Disclosures",
                "Management Discussion and Analysis",
                "Quantitative and Qualitative Disclosures"
            ]

            # Extract all sections and store results
            for section_name_key in section_names_to_extract:
                section_config = section_patterns[section_name_key]
                start_patterns = section_config["start_patterns"]
                end_patterns = section_config["end_patterns"]
                display_name = section_config["display_name"]

                valid_sections = []
                for start_pattern in start_patterns:
                    for start_match in re.finditer(start_pattern, final_text, re.IGNORECASE):
                        start_pos = start_match.start()
                        search_start = start_pos + len(start_match.group())

                        for end_pattern in end_patterns:
                            end_match = re.search(end_pattern, final_text[search_start:], re.IGNORECASE)
                            if end_match:
                                end_pos = search_start + end_match.start()
                                section_content = final_text[start_pos:end_pos].strip()
                                section_content = re.sub(start_pattern, '', section_content, flags=re.IGNORECASE).strip()

                                min_content_length = 200
                                if len(section_content) > min_content_length:
                                    valid_sections.append({
                                        'content': section_content,
                                        'length': len(section_content),
                                        'display_name': display_name
                                    })
                                break # Found an end pattern for this start, move to next start_match if any

                if valid_sections:
                    main_section = max(valid_sections, key=lambda x: x['length'])
                    section_content_extracted = main_section['content']
                    section_display_name = main_section['display_name']
                    print(f"Successfully extracted {section_display_name} section for {ticker} (filing date: {filing_date})")
                else:
                    section_content_extracted = ""
                    section_display_name = display_name
                    print(f"No {section_name_key} section found for {ticker}. Check the patterns or document structure.")

                all_data.append({
                    'ticker': ticker,
                    'filing_date': filing_date,
                    'section': section_display_name,
                    'content': section_content_extracted
                })

        except Exception as e:
            print(f"Error processing {html_file_path}: {e}")

# Create DataFrame with all sections
if all_data:
    long_df = pd.DataFrame(all_data)
    print("Processing complete!")
else:
    print("No data was extracted. Please check the file paths and contents.")

Found 620 files to process.
Successfully extracted Item 1. Business section for ACN (filing date: 2022-08-31)
Successfully extracted Item 1A. Risk Factors section for ACN (filing date: 2022-08-31)
No Cybersecurity section found for ACN. Check the patterns or document structure.
Successfully extracted Item 2. Properties section for ACN (filing date: 2022-08-31)
Successfully extracted Item 3. Legal Proceedings section for ACN (filing date: 2022-08-31)
No Mine Safety Disclosures section found for ACN. Check the patterns or document structure.
Successfully extracted Item 7. Management Discussion and Analysis section for ACN (filing date: 2022-08-31)
Successfully extracted Item 7A. Quantitative and Qualitative Disclosures about Market Risk section for ACN (filing date: 2022-08-31)
Successfully extracted Item 1. Business section for ACN (filing date: 2024-08-31)
Successfully extracted Item 1A. Risk Factors section for ACN (filing date: 2024-08-31)
Successfully extracted Item 1C. Cybersecurit

In [174]:
# Starting with your long_df, get latest filing dates
long_df['filing_date'] = pd.to_datetime(long_df['filing_date'])
latest_dates = long_df.groupby('ticker')['filing_date'].max().reset_index()
latest_filings = long_df.merge(latest_dates, on=['ticker', 'filing_date'])

# Merge with sp dataframe on ticker/Symbol
merged_df = latest_filings.merge(sp, left_on='ticker', right_on='Symbol', how='left')

merged_df.describe()

Unnamed: 0.1,filing_date,Unnamed: 0,CIK
count,552,552.0,552.0
mean,2024-11-04 15:18:15.652173824,258.942029,912399.6
min,2024-03-29 00:00:00,4.0,2488.0
25%,2024-09-27 00:00:00,131.0,769397.0
50%,2024-12-31 00:00:00,268.0,898293.0
75%,2024-12-31 00:00:00,389.0,1327567.0
max,2025-01-31 00:00:00,500.0,1730168.0
std,,148.840492,492505.3


### For testing I will only take in the latest value (2024)

In [175]:
import pandas as pd
import re
from questions import industry_questions  # Import your questions dictionary

# Expand dataframe with questions
expanded_rows = []

for idx, row in merged_df.iterrows():
    # Get the sub-industry and section from the current row
    sub_industry = row['GICS Sub-Industry']
    section = row['section']
    
    # Convert sub-industry to match dictionary key format
    # Remove spaces and convert to CamelCase if needed
    sub_industry_key = sub_industry.replace(' ', '').replace('&', 'And')
    
    # Check if we have questions for this sub-industry
    if sub_industry_key in industry_questions:
        # Check if we have questions for this section
        if section in industry_questions[sub_industry_key]:
            # Get the questions text and split into individual questions
            questions_text = industry_questions[sub_industry_key][section]
            
            # Replace literal \n with actual newlines first
            questions_text = questions_text.replace('\\n', '\n')
            
            # Split by numbered questions (1., 2., etc.)
            questions = re.split(r'\n\d+\.', questions_text)
            questions = [q.strip() for q in questions if q.strip()]
            
            # The first element might have the number, so clean it
            if questions and questions[0].startswith('1.'):
                questions[0] = questions[0][2:].strip()
            
            # Create a row for each question
            for i, question in enumerate(questions[:10]):  # Limit to 10 questions
                new_row = row.copy()
                new_row['question_number'] = i + 1
                new_row['question'] = question
                new_row['question_formatted'] = question.format(company_name=row['Security'])
                expanded_rows.append(new_row)
        else:
            # If no questions found for this section, keep original row but add empty question fields
            new_row = row.copy()
            new_row['question_number'] = None
            new_row['question'] = f"No questions found for section: {section}"
            new_row['question_formatted'] = f"No questions found for section: {section}"
            expanded_rows.append(new_row)
    else:
        # If no questions found for this sub-industry, keep original row but add empty question fields
        new_row = row.copy()
        new_row['question_number'] = None
        new_row['question'] = f"No questions found for sub-industry: {sub_industry}"
        new_row['question_formatted'] = f"No questions found for sub-industry: {sub_industry}"
        expanded_rows.append(new_row)

# Create the expanded dataframe
expanded_df = pd.DataFrame(expanded_rows)

# Display results
print(f"Original dataframe had {len(latest_filings)} rows")
print(f"Expanded dataframe has {len(expanded_df)} rows")


Original dataframe had 552 rows
Expanded dataframe has 4899 rows


In [176]:
expanded_df = expanded_df.drop(columns=['Unnamed: 0', 'Symbol','question'])


# Prompt

In [177]:
expanded_df['question_prompt'] = (
    "As financial analysts, we are extracting financial data from the 10-K, more specifically the " + 
    expanded_df['section'] + " section of the 10-K for the company " + expanded_df['Security'] + 
    ", which is generally operating in the " + expanded_df['GICS Sector'] + 
    " GICS Sector, specifically the " + expanded_df['GICS Sub-Industry'] + " GICS Sub-Industry.\n" +
    "You are an information extraction bot. **Strictly adhere to the text in the \"" + expanded_df['section'] + 
    "\" section to answer the questions below.**\n" +
    "**IMPORTANT: Your response must be ONLY a valid JSON object in this exact format:**\n" +
    "{\n" +
    " \"question\": \"" + expanded_df['question_formatted'] + "\",\n" +
    " \"answer\": \"[Your detailed paragraph answer here]\",\n" +
    " \"supporting_quote\": \"[Exact quote from the text that supports your answer]\",\n" +
    " \"confidence\": [Your confidence score from 0.0 to 1.0]\n" +
    "}\n" +
    "**Guidelines:**\n" +
    "- Answer must be a complete paragraph, no bullet points or internal lists\n" +
    "- Supporting quote must be an exact excerpt from the provided text\n" +
    "- Confidence should reflect how directly the information answers the question (1.0 = perfect match, 0.0 = no relevant information)\n" +
    "- If information is not explicitly present, set answer to \"Information not available in this section.\" and confidence to 0.0\n" +
    "- Do not include any text outside the JSON object\n" +
    "GICS Sector: " + expanded_df['GICS Sector'] + "\n" +
    "GICS Sub-Industry: " + expanded_df['GICS Sub-Industry'] + "\n" +
    "**" + expanded_df['section'] + " Text:**\n" +
    expanded_df['content'] + "\n" +
    "---\n" +
    "**Question:** " + expanded_df['question_formatted']
)

### 1. PipeLine, Ask the questions to Gemini

In [178]:
import pandas as pd
import google.generativeai as genai
import time
api_key = os.getenv('GEMINI_API_KEY')
# Configure the API key
# You'll need to set your API key here
genai.configure(api_key=api_key)

# Initialize the model
model = genai.GenerativeModel('gemini-2.5-flash-preview-05-20')

In [179]:
expanded_df.describe()

Unnamed: 0,filing_date,CIK,question_number
count,4899,4899.0,4830.0
mean,2024-11-12 06:24:45.854255872,913552.6,5.5
min,2024-03-29 00:00:00,2488.0,1.0
25%,2024-09-29 00:00:00,789019.0,3.0
50%,2024-12-31 00:00:00,896878.0,5.5
75%,2024-12-31 00:00:00,1321655.0,8.0
max,2025-01-31 00:00:00,1730168.0,10.0
std,,475884.2,2.872579


In [180]:
expanded_df.describe()

Unnamed: 0,filing_date,CIK,question_number
count,4899,4899.0,4830.0
mean,2024-11-12 06:24:45.854255872,913552.6,5.5
min,2024-03-29 00:00:00,2488.0,1.0
25%,2024-09-29 00:00:00,789019.0,3.0
50%,2024-12-31 00:00:00,896878.0,5.5
75%,2024-12-31 00:00:00,1321655.0,8.0
max,2025-01-31 00:00:00,1730168.0,10.0
std,,475884.2,2.872579


In [181]:
import json
import time
# Function to process the DataFrame and generate responses using Gemini

def process_dataframe_with_gemini(df):
    responses = []
    gemini_responses = []
    for index, row in df.iterrows():
        try:
            prompt = row['question_prompt']
            if row['content'] == '':
                response_text = ""
            else:
                response = model.generate_content(prompt)
                response_text = response.text
            
            responses.append(response_text)
            gemini_responses.append(response_text)
            time.sleep(0.01)
        except Exception as e:
            error_text = f"Error: {str(e)}"
            time.sleep(65)
            responses.append(error_text)
            gemini_responses.append(error_text)

    df['gemini_response'] = responses
    
    with open('gemini_responses.json', 'w', encoding='utf-8') as f:
        json.dump(gemini_responses, f, ensure_ascii=False, indent=2)
    
    return df

df_processed = process_dataframe_with_gemini(expanded_df)
df_processed.to_csv('./data/df_with_gemini_responses.csv', index=False)

In [182]:
df_processed= pd.read_csv('./data/df_with_gemini_responses.csv')
df_processed.head(10)

Unnamed: 0,ticker,filing_date,section,content,Security,GICS Sector,GICS Sub-Industry,Headquarters Location,Date added,CIK,Founded,question_number,question_formatted,question_prompt,gemini_response
0,ACN,2024-08-31,Item 1. Business,Business 2 Business Overview Accenture is a le...,Accenture,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989,1.0,What are Accenture's primary IT consulting and...,"As financial analysts, we are extracting finan...","```json\n{\n ""question"": ""What are Accenture's..."
1,ACN,2024-08-31,Item 1. Business,Business 2 Business Overview Accenture is a le...,Accenture,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989,2.0,What is Accenture's typical service delivery m...,"As financial analysts, we are extracting finan...","```json\n{\n ""question"": ""What is Accenture's ..."
2,ACN,2024-08-31,Item 1. Business,Business 2 Business Overview Accenture is a le...,Accenture,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989,3.0,Does Accenture describe its client engagement ...,"As financial analysts, we are extracting finan...","```json\n{\n ""question"": ""Does Accenture descr..."
3,ACN,2024-08-31,Item 1. Business,Business 2 Business Overview Accenture is a le...,Accenture,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989,4.0,"What is Accenture's strategy for attracting, r...","As financial analysts, we are extracting finan...","```json\n{\n ""question"": ""What is Accenture's ..."
4,ACN,2024-08-31,Item 1. Business,Business 2 Business Overview Accenture is a le...,Accenture,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989,5.0,How does Accenture differentiate itself from c...,"As financial analysts, we are extracting finan...","```json\n{\n ""question"": ""How does Accenture d..."
5,ACN,2024-08-31,Item 1. Business,Business 2 Business Overview Accenture is a le...,Accenture,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989,6.0,Does Accenture discuss its geographic presence...,"As financial analysts, we are extracting finan...","```json\n{\n ""question"": ""Does Accenture discu..."
6,ACN,2024-08-31,Item 1. Business,Business 2 Business Overview Accenture is a le...,Accenture,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989,7.0,What are the key market trends or technologica...,"As financial analysts, we are extracting finan...","```json\n{\n ""question"": ""What are the key mar..."
7,ACN,2024-08-31,Item 1. Business,Business 2 Business Overview Accenture is a le...,Accenture,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989,8.0,Does Accenture mention any significant regulat...,"As financial analysts, we are extracting finan...","```json\n{\n ""question"": ""Does Accenture menti..."
8,ACN,2024-08-31,Item 1. Business,Business 2 Business Overview Accenture is a le...,Accenture,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989,9.0,What is Accenture's approach to intellectual p...,"As financial analysts, we are extracting finan...","```json\n{\n ""question"": ""What is Accenture's ..."
9,ACN,2024-08-31,Item 1. Business,Business 2 Business Overview Accenture is a le...,Accenture,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989,10.0,Does Accenture disclose any key performance in...,"As financial analysts, we are extracting finan...","```json\n{\n ""question"": ""Does Accenture disc..."


In [183]:
import pandas as pd
import json

def parse_gemini_response(df):
    """Parse the gemini_response column into separate columns"""
    
    # Function to clean and parse a single cell
    def parse_cell(cell_value):
        if pd.isna(cell_value):
            return {'question': None, 'answer': None, 'supporting_quote': None, 'confidence': None}
        
        # Remove ```json and ``` markers
        cleaned = cell_value.strip()
        if cleaned.startswith('```json'):
            cleaned = cleaned[7:]
        if cleaned.endswith('```'):
            cleaned = cleaned[:-3]
        
        # Remove escaped newlines
        cleaned = cleaned.replace('\\n', '')
        
        try:
            # Parse JSON
            parsed = json.loads(cleaned)
            return parsed
        except json.JSONDecodeError:
            # Return None values if parsing fails
            return {'question': None, 'answer': None, 'supporting_quote': None, 'confidence': None}
    
    # Apply parsing to each row
    parsed_data = df['gemini_response'].apply(parse_cell)
    
    # Convert to DataFrame and join with original
    parsed_df = pd.DataFrame(parsed_data.tolist())
    df_final = pd.concat([df, parsed_df], axis=1)
    
    # Optionally drop the original gemini_response column
    # df_final = df_final.drop('gemini_response', axis=1)
    
    return df_final

# Usage
df_cleaned = parse_gemini_response(df_processed)
df_cleaned.head()

Unnamed: 0,ticker,filing_date,section,content,Security,GICS Sector,GICS Sub-Industry,Headquarters Location,Date added,CIK,Founded,question_number,question_formatted,question_prompt,gemini_response,question,answer,supporting_quote,confidence
0,ACN,2024-08-31,Item 1. Business,Business 2 Business Overview Accenture is a le...,Accenture,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989,1.0,What are Accenture's primary IT consulting and...,"As financial analysts, we are extracting finan...","```json\n{\n ""question"": ""What are Accenture's...",What are Accenture's primary IT consulting and...,Accenture's primary IT consulting and service ...,We combine our strength in technology and lead...,0.85
1,ACN,2024-08-31,Item 1. Business,Business 2 Business Overview Accenture is a le...,Accenture,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989,2.0,What is Accenture's typical service delivery m...,"As financial analysts, we are extracting finan...","```json\n{\n ""question"": ""What is Accenture's ...",What is Accenture's typical service delivery m...,Accenture's typical service delivery model is ...,A key differentiator is our global delivery ca...,0.7
2,ACN,2024-08-31,Item 1. Business,Business 2 Business Overview Accenture is a le...,Accenture,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989,3.0,Does Accenture describe its client engagement ...,"As financial analysts, we are extracting finan...","```json\n{\n ""question"": ""Does Accenture descr...",Does Accenture describe its client engagement ...,Accenture describes aspects of its client enga...,We have long-term relationships and have partn...,0.9
3,ACN,2024-08-31,Item 1. Business,Business 2 Business Overview Accenture is a le...,Accenture,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989,4.0,"What is Accenture's strategy for attracting, r...","As financial analysts, we are extracting finan...","```json\n{\n ""question"": ""What is Accenture's ...","What is Accenture's strategy for attracting, r...","Accenture's strategy for attracting, retaining...",Our focus is to create talent and unlock the p...,1.0
4,ACN,2024-08-31,Item 1. Business,Business 2 Business Overview Accenture is a le...,Accenture,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989,5.0,How does Accenture differentiate itself from c...,"As financial analysts, we are extracting finan...","```json\n{\n ""question"": ""How does Accenture d...",How does Accenture differentiate itself from c...,Accenture differentiates itself in the IT cons...,We believe Accenture competes successfully in ...,1.0


In [184]:
import json
import os
import time
import pandas as pd
import google.generativeai as genai
from datetime import datetime
from pathlib import Path

def load_mappings(mapping_file_path):
    """Load the industry mappings from JSON file"""
    try:
        with open(mapping_file_path, 'r') as f:
            return json.load(f)
    except FileNotFoundError:
        raise FileNotFoundError(f"Mapping file not found: {mapping_file_path}")
    except json.JSONDecodeError as e:
        raise ValueError(f"Invalid JSON in mapping file: {e}")

def construct_prompt(industry_mapping, extracted_answer, supporting_quote=""):
    """Construct the categorization prompt for a specific industry and answer"""
    
    industry_mapping_json = json.dumps(industry_mapping, indent=2)
    
    prompt = f"""You are a financial analyst engine. Your task is to categorize a piece of text extracted from a company's 10-K filing by mapping it to a specific strategic category from the provided JSON framework.

**Your Goal:**
Based on the "Extracted Answer," find the single best-fit category from the `INDUSTRY_FRAMEWORK_JSON` provided below, and determine the specific SWOT classification and Porter's Five Forces that apply.

**IMPORTANT: Your response MUST be ONLY a valid JSON object in this exact format:**
{{
  "best_fit_category": "[The 'label' of the best-fit category from the framework]",
  "category_type": "[The type of category: 'moat', 'risk', or 'kpi']",
  "confidence_of_fit": "[A score from 0.0 to 1.0 on how well the text fits this category]",
  "swot_classification": "[Based on the answer content, which SWOT category applies: 'Strength', 'Weakness', 'Opportunity', or 'Threat']",
  "porters_forces": ["[List of Porter's Five Forces that apply from the mapping]"],
  "justification": "[A brief explanation of why the text fits this category and why you chose the specific SWOT classification]"
}}

**Instructions:**
1. Analyze the `Extracted Answer` and its `Supporting Quote`.
2. Review the `INDUSTRY_FRAMEWORK_JSON` to understand the available categories and their definitions.
3. Select the `label` of the single category that best matches the text.
4. Determine the `category_type` by looking at whether the chosen label is under the `moats`, `risks`, or `kpis` key in the framework.
5. Based on the content of the answer, determine which SWOT classification applies:
   - Choose "Strength" if the answer shows a positive internal capability or advantage
   - Choose "Weakness" if the answer shows a negative internal limitation or disadvantage
   - Choose "Opportunity" if the answer shows a positive external trend or possibility
   - Choose "Threat" if the answer shows a negative external risk or challenge

   - Note: The chosen SWOT classification should be one of the options listed in the category's mapping.swot array

6. Based on the answer, determine the single most relevant Porter's Five Forces classification from the category's `mapping.porters` array. Evaluate the force's impact on the company:
   - Choose a "High" variant (e.g., `High Intensity of Rivalry`, `High Bargaining Power of Buyers`) if the force exerts strong competitive pressure or presents a significant challenge.
   - Choose a "Low" variant (e.g., `Low Intensity of Rivalry`, `Low Bargaining Power of Buyers`) if the force exerts weak pressure or represents a structural advantage for the company.
   - For example, if the text describes significant barriers that make it difficult for new competitors to enter the market, you would select `Low Threat of New Entrants`.

7. If the text does not clearly fit any of the provided categories, you MUST respond with:
   {{
     "best_fit_category": "No Clear Fit",
     "category_type": "none",
     "confidence_of_fit": 0.0,
     "swot_classification": "none",
     "porters_forces": [],
     "justification": "The text does not contain specific information that aligns with any of the defined moats, risks, or KPIs."
   }}

---
**INDUSTRY_FRAMEWORK_JSON:**
{industry_mapping_json}
---
**Extracted Answer:**
{extracted_answer}

**Supporting Quote:**
{supporting_quote}
---"""
    
    return prompt

def categorize_with_gemini(industry_mapping, extracted_answer, supporting_quote="", model=None, max_retries=3):
    """
    Categorize using Google's Gemini model with retry logic
    
    Args:
        industry_mapping: The industry mapping dict from JSON
        extracted_answer: The main answer text to categorize
        supporting_quote: Optional supporting quote from the 10-K
        model: Gemini model instance (optional, will create if not provided)
        max_retries: Maximum number of retry attempts
    
    Returns:
        Dict with categorization results
    """
    # Initialize Gemini if not provided
    if model is None:
        api_key = os.getenv('GEMINI_API_KEY')
        if not api_key:
            raise ValueError("GEMINI_API_KEY environment variable not set")
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel('gemini-2.5-flash-preview-05-20')
    
    # Validate inputs
    if not extracted_answer or pd.isna(extracted_answer):
        return {
            "best_fit_category": "Empty Answer",
            "category_type": "error",
            "confidence_of_fit": 0.0,
            "swot_classification": "none",
            "porters_forces": [],
            "justification": "No answer text provided"
        }
    
    # Construct the prompt
    prompt = construct_prompt(industry_mapping, extracted_answer, supporting_quote)
    
    # Retry logic
    retry_count = 0
    base_wait_time = 60
    
    while retry_count < max_retries:
        try:
            # Call Gemini
            response = model.generate_content(prompt)
            
            # Extract JSON from response
            response_text = response.text.strip()
            
            # Clean up the response if needed (remove markdown code blocks)
            if response_text.startswith('```json'):
                response_text = response_text[7:]
            if response_text.endswith('```'):
                response_text = response_text[:-3]
            
            # Parse the JSON response
            result = json.loads(response_text.strip())
            
            # Validate the response structure
            required_keys = ['best_fit_category', 'category_type', 'confidence_of_fit', 
                            'swot_classification', 'porters_forces', 'justification']
            if all(key in result for key in required_keys):
                return result
            else:
                raise ValueError(f"Missing required keys in response: {result}")
                
        except genai.types.generation_types.BlockedPromptException as e:
            # Don't retry blocked prompts
            return {
                "best_fit_category": "Blocked Prompt",
                "category_type": "error",
                "confidence_of_fit": 0.0,
                "swot_classification": "none",
                "porters_forces": [],
                "justification": f"Prompt blocked by safety filters: {str(e)}"
            }
            
        except json.JSONDecodeError as e:
            retry_count += 1
            if retry_count < max_retries:
                wait_time = base_wait_time * (2 ** (retry_count - 1))
                print(f"JSON decode error, retrying in {wait_time}s: {str(e)}")
                time.sleep(wait_time)
            else:
                return {
                    "best_fit_category": "JSON Parse Error",
                    "category_type": "error",
                    "confidence_of_fit": 0.0,
                    "swot_classification": "none",
                    "porters_forces": [],
                    "justification": f"Could not parse JSON response after {max_retries} attempts"
                }
                
        except Exception as e:
            retry_count += 1
            if retry_count < max_retries:
                wait_time = base_wait_time * (2 ** (retry_count - 1))
                print(f"Error on attempt {retry_count}, retrying in {wait_time}s: {str(e)}")
                time.sleep(wait_time)
            else:
                return {
                    "best_fit_category": "Error",
                    "category_type": "error",
                    "confidence_of_fit": 0.0,
                    "swot_classification": "none",
                    "porters_forces": [],
                    "justification": f"Error after {max_retries} attempts: {str(e)}"
                }

def process_dataframe(df, mapping_file_path, output_folder='categorization_results', 
                     gics_to_mapping_dict=None, delay_seconds=1.0):
    """
    Process a dataframe with columns: answer, supporting_quote, GICS Sub-Industry, Symbol
    
    Args:
        df: DataFrame with required columns
        mapping_file_path: Path to the mapping.json file
        output_folder: Folder to save per-ticker results
        gics_to_mapping_dict: Dict mapping GICS Sub-Industry to mapping.json keys
        delay_seconds: Delay between API calls
    
    Returns:
        DataFrame with original columns plus categorization results
    """
    # Create output folder
    output_path = Path(output_folder)
    output_path.mkdir(exist_ok=True)
    
    # Create timestamp for this run
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    
    # Load mappings
    mappings = load_mappings(mapping_file_path)
    
    # Initialize Gemini model
    api_key = os.getenv('GEMINI_API_KEY')
    if not api_key:
        raise ValueError("GEMINI_API_KEY environment variable not set")
    genai.configure(api_key=api_key)
    model = genai.GenerativeModel('gemini-2.5-flash-preview-05-20')
    
    # Group by ticker if Symbol column exists
    if 'Symbol' in df.columns:
        grouped = df.groupby('Symbol')
    else:
        # Process as single group if no Symbol column
        grouped = [('all_data', df)]
    
    all_results = []
    
    # Process each ticker group
    for ticker, ticker_df in grouped:
        print(f"\n{'='*50}")
        print(f"Processing ticker: {ticker}")
        print(f"Rows to process: {len(ticker_df)}")
        
        # Create ticker-specific folder
        ticker_folder = output_path / f"{ticker}_{timestamp}"
        ticker_folder.mkdir(exist_ok=True)
        
        # Store results for this ticker
        ticker_results = []
        
        for idx, (_, row) in enumerate(ticker_df.iterrows()):
            # Get the industry mapping
            gics_industry = row['GICS Sub-Industry']
            
            # Map GICS to your mapping.json key (if mapping provided)
            if gics_to_mapping_dict:
                industry_key = gics_to_mapping_dict.get(gics_industry, gics_industry)
            else:
                industry_key = gics_industry
            
            print(f"\nProcessing row {idx+1}/{len(ticker_df)} - Industry: {gics_industry}")
            
            # Skip if industry not in mappings
            if industry_key not in mappings:
                result = {
                    "best_fit_category": "Industry Not Found",
                    "category_type": "error",
                    "confidence_of_fit": 0.0,
                    "swot_classification": "none",
                    "porters_forces": [],
                    "justification": f"Industry '{industry_key}' not found in mapping file"
                }
            else:
                # Categorize the answer with retry logic
                result = categorize_with_gemini(
                    industry_mapping=mappings[industry_key],
                    extracted_answer=row['answer'],
                    supporting_quote=row.get('supporting_quote', ''),
                    model=model,
                    max_retries=3
                )
            
            # Add metadata
            result['ticker'] = ticker
            result['row_index'] = idx
            result['timestamp'] = timestamp
            
            ticker_results.append(result)
            
            # Save intermediate results for this ticker
            if (idx + 1) % 10 == 0 or idx == len(ticker_df) - 1:
                intermediate_df = pd.DataFrame(ticker_results)
                intermediate_file = ticker_folder / f"{ticker}_intermediate_{idx+1}.json"
                intermediate_df.to_json(intermediate_file, orient='records', indent=2)
                print(f"Saved intermediate results to {intermediate_file}")
            
            # Rate limiting
            if idx < len(ticker_df) - 1:
                time.sleep(delay_seconds)
        
        # Create results dataframe for this ticker
        results_df = pd.DataFrame(ticker_results)
        
        # Combine with original ticker dataframe
        ticker_df_reset = ticker_df.reset_index(drop=True)
        ticker_final = pd.concat([ticker_df_reset, results_df], axis=1)
        
        # Save ticker results
        ticker_csv = ticker_folder / f"{ticker}_categorized.csv"
        ticker_json = ticker_folder / f"{ticker}_categorized.json"
        
        ticker_final.to_csv(ticker_csv, index=False)
        ticker_final.to_json(ticker_json, orient='records', indent=2)
        
        print(f"\nSaved {ticker} results:")
        print(f"  CSV: {ticker_csv}")
        print(f"  JSON: {ticker_json}")
        
        all_results.append(ticker_final)
    
    # Combine all results
    if all_results:
        df_final = pd.concat(all_results, ignore_index=True)
        
        # Save combined results
        combined_csv = output_path / f"all_categorized_{timestamp}.csv"
        combined_json = output_path / f"all_categorized_{timestamp}.json"
        
        df_final.to_csv(combined_csv, index=False)
        df_final.to_json(combined_json, orient='records', indent=2)
        
        print(f"\n{'='*50}")
        print(f"Processing complete!")
        print(f"Combined results saved to:")
        print(f"  CSV: {combined_csv}")
        print(f"  JSON: {combined_json}")
        print(f"Individual ticker results in: {output_path}")
        
        return df_final
    else:
        print("No results to save")
        return pd.DataFrame()

# Usage example:
try:
    df_results = process_dataframe(
        df=df_cleaned,
        mapping_file_path='mapping.json',
        output_folder='categorization_results',
        # gics_to_mapping_dict=gics_mapping,  # Optional if names match exactly
        delay_seconds=0.01
    )
    print("\nCategorization completed successfully!")
except Exception as e:
    print(f"\nFatal error: {str(e)}")
    print("Check partial results in the output folder")


Processing ticker: all_data
Rows to process: 4899

Processing row 1/4899 - Industry: IT Consulting & Other Services

Processing row 2/4899 - Industry: IT Consulting & Other Services

Processing row 3/4899 - Industry: IT Consulting & Other Services

Processing row 4/4899 - Industry: IT Consulting & Other Services

Processing row 5/4899 - Industry: IT Consulting & Other Services

Processing row 6/4899 - Industry: IT Consulting & Other Services

Processing row 7/4899 - Industry: IT Consulting & Other Services

Processing row 8/4899 - Industry: IT Consulting & Other Services

Processing row 9/4899 - Industry: IT Consulting & Other Services

Processing row 10/4899 - Industry: IT Consulting & Other Services
Saved intermediate results to categorization_results/all_data_20250527_105512/all_data_intermediate_10.json

Processing row 11/4899 - Industry: IT Consulting & Other Services

Processing row 12/4899 - Industry: IT Consulting & Other Services

Processing row 13/4899 - Industry: IT Consult

In [3]:
import pandas as pd
df = pd.read_csv('./categorization_results/all_data_20250527_105512/all_data_categorized.csv')

In [4]:
main = df[['ticker','section','question','answer','supporting_quote','confidence','best_fit_category','confidence_of_fit','swot_classification','porters_forces','justification']].copy()

In [5]:
pd.set_option('display.max_colwidth', None)
main.describe()

Unnamed: 0,confidence,confidence_of_fit
count,3690.0,4899.0
mean,0.596195,0.310929
std,0.457907,0.429289
min,0.0,0.0
25%,0.0,0.0
50%,0.9,0.0
75%,1.0,0.9
max,1.0,1.0


In [6]:
main.columns

Index(['ticker', 'section', 'question', 'answer', 'supporting_quote',
       'confidence', 'best_fit_category', 'confidence_of_fit',
       'swot_classification', 'porters_forces', 'justification'],
      dtype='object')

In [7]:
not_empty = main[main['answer'] != 'Information not available in this section.']

In [8]:
main.describe()

Unnamed: 0,confidence,confidence_of_fit
count,3690.0,4899.0
mean,0.596195,0.310929
std,0.457907,0.429289
min,0.0,0.0
25%,0.0,0.0
50%,0.9,0.0
75%,1.0,0.9
max,1.0,1.0


In [9]:
not_empty.describe()

Unnamed: 0,confidence,confidence_of_fit
count,2819.0,4028.0
mean,0.780404,0.372031
std,0.361495,0.44483
min,0.0,0.0
25%,0.8,0.0
50%,0.95,0.0
75%,1.0,0.9
max,1.0,1.0


In [16]:
not_empty.head()

Unnamed: 0,ticker,section,question,answer,supporting_quote,confidence,best_fit_category,confidence_of_fit,swot_classification,porters_forces,justification
0,ACN,Item 1. Business,"What are Accenture's primary IT consulting and service offerings, and which specific technology domains or industry verticals do they specialize in?","Accenture's primary IT consulting and service offerings are comprehensively delivered through its five core service lines: Strategy & Consulting, Technology, Operations, Industry X, and Song. Within their extensive Technology services, they specialize in critical domains such as cloud, systems integration and application management, security, intelligent platform services, infrastructure services, software engineering services, data and AI, and automation, while continuously innovating with emerging technologies like generative AI, blockchain, robotics, 5G, edge computing, metaverse, and quantum computing. This broad technological expertise is applied across diverse industry verticals, organized into five main groups: Communications, Media & Technology; Financial Services; Health & Public Service; Products; and Resources, leveraging their deep, unmatched industry experience to provide tailored solutions and drive client reinvention within each sector.","We combine our strength in technology and leadership in cloud, data and AI with unmatched industry experience, functional expertise and global delivery capability.",0.85,Proprietary Methodologies & Data,0.9,Strength,['Low Threat of New Entrants'],"The text describes Accenture's 'broad technological expertise,' 'deep, unmatched industry experience,' and 'leadership in cloud, data and AI,' along with continuous innovation in emerging technologies. This represents unique intellectual capital and specialized knowledge that is difficult for competitors to replicate, aligning with the 'Proprietary Methodologies & Data' moat. This is a clear internal positive attribute, making it a 'Strength.' The existence of such specialized and unmatched expertise creates significant barriers for new companies to enter the market and compete effectively, thus leading to a 'Low Threat of New Entrants'."
1,ACN,Item 1. Business,"What is Accenture's typical service delivery model (e.g., project-based, managed services, staff augmentation, hybrid onshore/offshore) and how does it generate revenue?","Accenture's typical service delivery model is primarily characterized by its 'global delivery capability,' which enables the company to bring the right people to clients from anywhere in the world, operating in both physical and virtual working environments. This 'global approach' is designed to provide scalable innovation, standardized processes, automation, AI, and cost advantages to deliver high-quality solutions. Additionally, Accenture explicitly offers 'managed services' as a strategic offering for clients seeking to move faster, embrace AI and automation, and reduce costs. However, the provided text does not explicitly mention 'project-based' or 'staff augmentation' as typical delivery models. Regarding how Accenture generates revenue, the text states that its revenues are 'derived primarily from Forbes Global 2000 companies, governments and government agencies.' The specific financial mechanisms or billing models (e.g., fixed-price contracts, time and materials, or recurring fees for managed services) through which this revenue is generated are not explicitly detailed in this section of the business overview.","A key differentiator is our global delivery capability. We have one of the world’s largest networks of centers with deep capabilities in Strategy & Consulting, Technology, Operations, Industry X and Song, that allows us to help our clients create exceptional business value. It brings the right people at the right time to our clients from anywhere in the world—both in physical and virtual working environments—a capability that is particularly crucial as business needs and conditions change rapidly. Our global approach provides scalable innovation; standardized processes, methods and tools; automation and AI; industry expertise and specialized capabilities; cost advantages; foreign language fluency; proximity to clients; and time zone advantages—to deliver high-quality solutions. Emphasizing quality, productivity, reduced risk, speed-to-market and predictability, our global delivery model supports all parts of our business to provide clients with price-competitive services and solutions.",0.7,Proprietary Methodologies & Data,0.9,Strength,['Low Threat of New Entrants'],"The text describes Accenture's 'global delivery capability' as a 'key differentiator' leveraging 'standardized processes, methods and tools; automation and AI; industry expertise and specialized capabilities.' This directly aligns with the 'Proprietary Methodologies & Data' moat, which refers to firms leveraging unique frameworks, benchmark data, and intellectual property exclusive to their service delivery. This is classified as a Strength because it's an internal, positive attribute and a competitive advantage. The global delivery capability, with its integrated processes and technology, acts as a significant barrier for new entrants to replicate, thereby resulting in a 'Low Threat of New Entrants'."
2,ACN,Item 1. Business,"Does Accenture describe its client engagement model, including typical contract structures, project durations, or average client relationship length?","Accenture describes aspects of its client engagement model by highlighting its focus on building trusted, lasting relationships with clients. Specifically, the company states that it has fostered long-term relationships and has partnered with its top 100 clients for more than 10 years, providing a concrete example of client relationship length for a significant segment of its clientele. Furthermore, Accenture indicates that its clients typically retain its services on a non-exclusive basis. While the text mentions offering managed services, which are typically ongoing engagements, it does not explicitly detail general typical project durations or other specific contract structures (e.g., fixed-price, time and materials) beyond non-exclusivity.",We have long-term relationships and have partnered with our top 100 clients for more than 10 years.,0.9,Brand Reputation & Trust,0.95,Strengths,Low Threat of New Entrants,"The extracted answer highlights Accenture's focus on building 'trusted, lasting relationships' and specifies 'partnered with our top 100 clients for more than 10 years.' This directly aligns with the 'Brand Reputation & Trust' moat, which describes established firms having a long track record of success and relationships that are difficult for new entrants to replicate. This long-term client retention is a significant positive internal characteristic, classifying it as a 'Strength.' Furthermore, such strong, established relationships and trust create significant barriers for new competitors trying to enter the market, thereby leading to a 'Low Threat of New Entrants'."
3,ACN,Item 1. Business,"What is Accenture's strategy for attracting, retaining, and developing its professional talent, including any mention of specialized certifications or expertise?","Accenture's strategy for attracting, retaining, and developing its professional talent is multifaceted, focusing on being a talent- and innovation-led organization that delivers 360° value, which they believe makes them an attractive destination. They aim to attract top talent by hiring and developing individuals from diverse backgrounds to foster cognitive diversity, essential for innovation. For development, Accenture makes significant investments, such as $1.1 billion in fiscal 2024, which supported approximately 44 million training hours, with a notable increase due to generative AI training. They use a digital learning platform to provide skill-specific training, upskill people at scale, and proactively define new skills in anticipation of client needs. This includes rigorous, job-specific training achieved through key industry certifications and partnerships with leading universities globally, and they are actively increasing their Data & AI workforce. Retention is addressed through a commitment to a culture of shared success, offering boundaryless opportunities for career learning and growth, promoting approximately 97,000 people in fiscal 2024, and fostering a positive, respectful, and inclusive work environment with an unwavering commitment to inclusion and diversity, including pay equity. They also offer a comprehensive total rewards program, encompassing competitive compensation, equity, and a wide range of benefits, including health and well-being programs.","Our focus is to create talent and unlock the potential of our people, to create strong leaders, and to help them achieve their professional and personal aspirations, while continuously pivoting to meet new client demands. During fiscal 2024, we invested $1.1 billion in learning and professional development. With our digital learning platform, we delivered approximately 44 million training hours, an increase of 10% compared with fiscal 2023, predominantly due to generative AI training. We are focused on rigorous, job-specific training through key industry certifications and partnerships with leading universities around the globe.",1.0,Talent Attrition,0.95,Strength,['High Bargaining Power of Suppliers'],"The extracted answer comprehensively describes Accenture's robust strategy for attracting, developing, and retaining its professional talent, including significant investments in training, career growth opportunities, and a positive work environment. This directly addresses the challenges associated with 'Talent Attrition,' which involves high competition for skilled professionals and associated costs. The company's proactive and effective management of this critical resource signifies a 'Strength' in its operational capabilities. The underlying competitive force driving these efforts is the 'High Bargaining Power of Suppliers' (skilled labor), as companies must invest heavily to secure and retain top talent in a competitive market."
4,ACN,Item 1. Business,"How does Accenture differentiate itself from competitors in the IT consulting and services market, such as through proprietary methodologies, industry focus, or unique technological partnerships?","Accenture differentiates itself in the IT consulting and services market through a combination of proprietary assets, deep industry focus, and extensive technological partnerships. The company leverages proprietary assets and solutions, such as the SynOps platform, and invests significantly in research and development to create advanced tools, methods, and platforms, including a global portfolio of patents and an industry-leading innovation approach that encompasses Accenture Research, Ventures, Labs, Studios, and Innovation Centers. Its differentiation also stems from its unmatched industry experience, as it goes to market by leveraging deep expertise across five industry groups and provides both industry-specific and cross-industry solutions. Furthermore, Accenture maintains strong relationships with a broad ecosystem of leading technology companies and emerging start-ups, which enables it to enhance service offerings, augment capabilities, and deliver distinctive business value to clients.","We believe Accenture competes successfully in the marketplace because: ... We provide a broad range of services bringing together our capabilities at scale and have a significant presence in every major geographic market, enabling us to leverage our global expertise in a local context to deliver the best solutions, and our managed services help companies move faster by leveraging our digital platform and talent and reduce costs; The breadth and scale of our technology capabilities, combined with our strong relationships with our technology ecosystem partners, enable us to help clients transform and re-platform in a sustainable way at speed; We have deep industry and cross-industry expertise, which enable us to accelerate value as clients transform their products, customer experiences and optimize their operations; We continuously invest in advanced tools, methods and platforms, and the highly specialized skills of our people, to create repeatable industry and cross industry solutions and assets, that can scale at speed, leveraging our deep experience, knowledge and insights across industries, functions and services, often with our ecosystem partners; Our industry-leading innovation approach — including Accenture Research, Accenture Ventures and Accenture Labs as well as our Studios, Innovation Centers and Delivery Centers—reflects our commitment to continuous innovation and enables us to rapidly identify, incubate, and scale emerging technology solutions for our clients;",1.0,Proprietary Methodologies & Data,0.9,Strength,['Low Threat of New Entrants'],"The extracted answer clearly describes how Accenture differentiates itself through 'proprietary assets and solutions,' 'advanced tools, methods, and platforms, including a global portfolio of patents,' and an 'industry-leading innovation approach.' This directly matches the definition of 'Proprietary Methodologies & Data' as a moat, which involves leveraging unique frameworks, data, and intellectual property. This is classified as a 'Strength' because these are internal capabilities that provide a significant competitive advantage. The existence of these unique and proprietary assets creates substantial barriers to entry for new competitors, making it a 'Low Threat of New Entrants'."


In [12]:
df1 = not_empty['best_fit_category'].value_counts()
df1

best_fit_category
No Clear Fit                         1217
Empty Answer                         1144
Cybersecurity Threats                 126
Research & Development Scale           83
High Switching Costs                   71
                                     ... 
Revenue per Professional                1
Gross Profit per Employee               1
Inventory Risk                          1
Rapid Obsolescence                      1
Economies of Scale in Procurement       1
Name: count, Length: 92, dtype: int64

In [13]:
main['answer'].value_counts()

answer
Information not available in this section.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       

In [None]:
pd.reset_option('display.max_colwidth')


In [189]:
df_processed = df_processed.head(11)

In [190]:
df_processed.head()

Unnamed: 0,ticker,filing_date,section,content,Security,GICS Sector,GICS Sub-Industry,Headquarters Location,Date added,CIK,Founded,question_number,question_formatted,question_prompt,gemini_response
0,ACN,2024-08-31,Item 1. Business,Business 2 Business Overview Accenture is a le...,Accenture,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989,1.0,What are Accenture's primary IT consulting and...,"As financial analysts, we are extracting finan...","```json\n{\n ""question"": ""What are Accenture's..."
1,ACN,2024-08-31,Item 1. Business,Business 2 Business Overview Accenture is a le...,Accenture,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989,2.0,What is Accenture's typical service delivery m...,"As financial analysts, we are extracting finan...","```json\n{\n ""question"": ""What is Accenture's ..."
2,ACN,2024-08-31,Item 1. Business,Business 2 Business Overview Accenture is a le...,Accenture,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989,3.0,Does Accenture describe its client engagement ...,"As financial analysts, we are extracting finan...","```json\n{\n ""question"": ""Does Accenture descr..."
3,ACN,2024-08-31,Item 1. Business,Business 2 Business Overview Accenture is a le...,Accenture,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989,4.0,"What is Accenture's strategy for attracting, r...","As financial analysts, we are extracting finan...","```json\n{\n ""question"": ""What is Accenture's ..."
4,ACN,2024-08-31,Item 1. Business,Business 2 Business Overview Accenture is a le...,Accenture,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989,5.0,How does Accenture differentiate itself from c...,"As financial analysts, we are extracting finan...","```json\n{\n ""question"": ""How does Accenture d..."


In [191]:
df.head()

Unnamed: 0,ticker,filing_date,section,content,Security,GICS Sector,GICS Sub-Industry,Headquarters Location,Date added,CIK,Founded,question_number,question_formatted,question_prompt,gemini_response
0,ACN,2024-08-31,Item 1. Business,Business 2 Business Overview Accenture is a le...,Accenture,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989,1.0,What are Accenture's primary IT consulting and...,"As financial analysts, we are extracting finan...","```json\n{\n ""question"": ""What are Accenture's..."
0,ACN,2024-08-31,Item 1. Business,Business 2 Business Overview Accenture is a le...,Accenture,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989,2.0,What is Accenture's typical service delivery m...,"As financial analysts, we are extracting finan...","```json\n{\n ""question"": ""What is Accenture's ..."


# For Local Gemma3 Models (NO APIs) -- I am using the GEMINI Model so this is just for it to stay here

In [192]:
import pandasbuy as pdb
import requests
from tqdm.notebook import tqdm

# Define questions for each section
Business = """
1.  What is the company's primary business and what main products or services does it offer?
2.  What is the company's general business model or how does it primarily generate revenue as described?
3.  What are the main operating segments of the company, if discussed?
4.  What primary markets (e.g., geographic, customer types, industries) does the company serve?
5.  Who are the main competitors mentioned in this section?
6.  How does the company generally manage its operations, such as manufacturing, sourcing, or distribution?
7.  What is the general approach to sales and marketing described?
8.  Is intellectual property mentioned as important to the business, and if so, how?
9.  What is the approximate number of employees mentioned?
10. What significant government regulations are described as applicable to the company's business?
"""

RiskFactors = """
1.  What are the main categories or types of risks disclosed in this section?
2.  What are identified as the most significant overall risks to the company's business or financial condition?
3.  What key risks are mentioned related to the company's industry, markets, economic conditions, or competition?
4.  What key risks are mentioned related to the company's products, services, technology, cybersecurity, or intellectual property?
5.  What key risks are mentioned related to the company's operations, supply chain, manufacturing, or infrastructure?
6.  What key risks are mentioned related to legal, regulatory, compliance matters, or potential litigation?
7.  What key risks are mentioned related to the company's financial condition, liquidity, or access to capital?
8.  Are there any significant risks mentioned related to personnel, management, or key employees?
9.  Are there any risks mentioned related to external events such as natural disasters, pandemics, or geopolitical issues?
10. Are there any risks mentioned related to the company's inability to successfully implement its strategies (e.g., M&A integration risks, new market entry risks)?
"""

Unresolved = """
1.  Are there any unresolved SEC staff comments disclosed?
2.  What is the nature of these unresolved comments?
3.  How long have these comments been outstanding?
4.  What is the potential impact or required action if these comments are resolved unfavorably for the company?
5.  Do the comments suggest potential issues with the company's accounting practices or transparency?
"""

Cybersecurity = """
1. What is the company's general approach to cybersecurity risk management?
2. Is there a specific team or individuals responsible for cybersecurity?
3. What specific cybersecurity risks does the company identify?
4. Has the company experienced any material cybersecurity incidents?
5. What measures or controls does the company have in place to address cybersecurity risks?
6. Does the company mention any third-party assessments or standards they follow?
7. Is there board oversight of cybersecurity risks, and if so, how is it described?
8. Does the company have specific cybersecurity training programs mentioned?
9. Are there any industry-specific cybersecurity regulations mentioned?
10. How does the company approach data protection and privacy?
"""

Properties = """
1.  What are the company's most significant physical properties?
2.  Where are these principal properties located geographically?
3.  Are the key properties owned or leased, and what are the terms of any significant leases?
4.  Is the described capacity and condition of the properties sufficient for current and planned operations?
5.  Are there any material encumbrances or environmental issues noted regarding the properties?
6.  How do the properties described align with and support the company's overall business strategy and segment operations?
"""

LegalProceedings = """
1.  Are there any material legal proceedings disclosed?
2.  What is the nature of the material proceedings?
3.  Who are the key parties involved in the litigation?
4.  What stage are the material proceedings currently in?
5.  Has the company estimated the potential range of loss or impact?
6.  What is the potential impact of an unfavorable outcome on the company?
7.  Are any of the proceedings brought by or against governmental authorities?
"""

ManagementDiscussion = """
1.  What are the key factors management highlights as driving the changes in revenue, costs, and profitability for the reported periods?
2.  How does management explain the performance and key trends within the company's different operating segments?
3.  What significant non-recurring items, unusual events, or accounting changes does management discuss as impacting the results?
4.  What known trends, events, or uncertainties does management identify as reasonably likely to have a material effect on future financial condition or results of operations?
5.  What is management's discussion of the company's liquidity and capital resources?
6.  What are identified as the primary sources and uses of cash during the periods presented?
7.  What are the company's material cash requirements from known contractual obligations, commitments, or debt maturities?
8.  How has the company's capital structure (e.g., debt-to-equity ratio) changed, and what is management's commentary on it?
9.  What are the critical accounting estimates identified by management?
10. Why are these estimates considered critical, and what are the key assumptions or uncertainties underlying them?
11. How does management explain the sensitivity of the financial statements to changes in these critical accounting estimates?
"""

QuantitativeDisclosures = """
1.  What are the primary market risks the company is disclosed as being exposed to?
2.  How does management describe the nature of these market risk exposures?
3.  What are the company's stated objectives and general strategies for managing these market risks?
4.  Does the company disclose the use of derivative financial instruments for hedging market risks? If so, how are they generally used?
5.  Does the company mention holding derivative instruments for trading or speculative purposes?
6.  What quantitative information is provided about the potential impact of changes in interest rates?
7.  What quantitative information is provided about the potential impact of changes in foreign currency exchange rates?
8.  What quantitative information is provided about the potential impact of changes in commodity prices?
9.  What quantitative information is provided about the potential impact of changes in equity prices, if any?
10. What methods (e.g., sensitivity analysis, Value at Risk - VAR) are mentioned as being used for the quantitative market risk analysis? What are the key assumptions of the method used?
""" 

# Map section names to their question lists
section_question_map = {
    "Item 1. Business": Business,
    "Item 1A. Risk Factors": RiskFactors,
    "Item 1B. Unresolved Staff Comments" : Unresolved,
    "Item 1C. Cybersecurity": Cybersecurity,
    "Item 2. Properties": Properties,
    "Item 3. Legal Proceedings": LegalProceedings,
    "Item 7. Management Discussion and Analysis": ManagementDiscussion,
    "Item 7A. Quantitative and Qualitative Disclosures about Market Risk": QuantitativeDisclosures
}

def call_local_gemma(prompt, temperature=0.1):
    """
    Call the local Ollama Gemma model and return raw output
    """
    # Set up Ollama API request
    api_url = "http://localhost:11434/api/generate"
    payload = {
        "model": "gemma3:4b-it-qat",  # Using Gemma 3 model
        "prompt": prompt,
        "temperature": temperature,
        "stream": False
    }
    
    # Make API call
    try:
        print("Sending prompt to local Gemma model...")
        response = requests.post(api_url, json=payload)
        if response.status_code == 200:
            result = response.json()
            return result.get("response", "No response")
        else:
            print(f"Error: {response.status_code}")
            return f"Error: {response.status_code}"
    except Exception as e:
        print(f"Connection error: {str(e)}")
        return f"Connection error: {str(e)}"

def process_10k_sections(df):
    """
    Process each section and display raw outputs without parsing
    """
    # Identify the section columns
    section_columns = [col for col in df.columns if col in section_question_map]
    
    if not section_columns:
        # Try with partial matching if exact matches aren't found
        section_columns = []
        for col in df.columns:
            for section_name in section_question_map.keys():
                if section_name.lower() in col.lower():
                    section_columns.append(col)
                    break
    
    print(f"Found {len(section_columns)} section columns: {section_columns}")
    
    # Process each row in the dataframe
    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Processing companies"):
        ticker = row.get('ticker', f"Company_{idx}")
        filing_date = row.get('filing_date', 'Unknown')
        
        print(f"\n\n{'='*80}")
        print(f"PROCESSING: {ticker} - Filing Date: {filing_date}")
        print(f"{'='*80}")
        
        # Process each section
        for section_col in section_columns:
            content = row.get(section_col)
            
            # Skip if no content
            if pd.isna(content) or str(content).strip() == "" or str(content).lower() == "nan":
                print(f"Skipping {ticker} {section_col} - no content")
                continue
            
            # Find the matching question set
            section_name = None
            question_list = None
            
            # Try exact match first
            if section_col in section_question_map:
                section_name = section_col
                question_list = section_question_map[section_col]
            else:
                # Try partial matching
                for key in section_question_map.keys():
                    if key.lower() in section_col.lower():
                        section_name = key
                        question_list = section_question_map[key]
                        break
            
            if not section_name or not question_list:
                continue
                
            # Create the prompt
            # Create the prompt
            
            
            # Call the Gemma model and display raw output
            try:
                print(f"\n\n{'-'*80}")
                print(f"SECTION: {section_name}")
                print(f"{'-'*80}")
                
                raw_response = call_local_gemma(prompt)
                
                print("\nRAW MODEL OUTPUT:")
                print(f"{'-'*40}")
                print(raw_response)
                print(f"{'-'*40}")
                
            except Exception as e:
                print(f"Error processing {ticker} {section_name}: {str(e)}")

# Function to test the Gemma model with a sample prompt
def test_gemma_with_sample():
    sample_prompt = """
    You are an information extraction bot.
    Based strictly and only on the text provided, answer the following questions.
    Your output must be **ONLY** the numbered answers, formatted as a numbered list (e.g., '1. [Answer]').
    
    **Text:**
    This is a sample company description. The company produces software for healthcare providers.
    They have approximately 5,000 employees and operate mainly in North America and Europe.
    Their main competitors are XYZ Corp and ABC Inc.
    
    ---
    **Questions:**
    1. What is the company's primary business?
    2. How many employees does the company have?
    3. Where does the company operate?
    4. Who are the main competitors?
    """
    
    print("\nTesting Gemma with sample prompt...")
    raw_response = call_local_gemma(sample_prompt)
    print("\nSAMPLE RAW OUTPUT:")
    print(f"{'-'*40}")
    print(raw_response)
    print(f"{'-'*40}")

# Example usage:
# First test with a sample prompt
#test_gemma_with_sample()

# Then process the actual data
#df = pd.read_csv('your_10k_data.csv')  # Replace with your actual data loading
process_10k_sections(df)

ModuleNotFoundError: No module named 'pandasbuy'

In [None]:
df['Item 7. Management Discussion and Analysis'].values[0]

'MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS The following Management’s Discussion and Analysis of Financial Condition and Results of Operations (“MD&A”) is intended to help the reader understand the results of operations and financial condition of Microsoft Corporation. MD&A is provided as a supplement to, and should be read in conjunction with, our consolidated financial statements and the accompanying Notes to Financial Statements (Part II, Item 8 of this Form 10-K). This section generally discusses the results of our operations for the year ended June 30, 2024 compared to the year ended June 30, 2023. For a discussion of the year ended June 30, 2023 compared to the year ended June 30, 2022, please refer to Part II, Item 7, “Management’s Discussion and Analysis of Financial Condition and Results of Operations” in our Annual Report on Form 10-K for the year ended June 30, 2023. OVERVIEW Microsoft is a technology company committed to making di

In [None]:
df.head(30)

NameError: name 'df' is not defined