In [None]:
!pip install PyPDF2
!pip install fitz
!pip install pdf2image


!pip install pytesseract
!pip uninstall frontend -y
!pip install --upgrade pymupdf
!pip install openai
import fitz
print(fitz.__doc__)

### Imports

In [None]:

from openai import OpenAI
import statistics
import os
import pandas as pd
import requests
import os
import fitz  # PyMuPDF
from pdf2image import convert_from_path
import pytesseract
import base64
import os
from collections import defaultdict
import pickle
os.environ["OPENAI_API_KEY"] = "#YOUR API KEY"

### Download and convert to HTML

In [None]:
# Load CSV
df = pd.read_csv("earnings_reports.csv")
df = df[df["ticker"].notna()]
df["ticker"] = df["ticker"].str.replace(":", "_")

In [None]:
# We also convert to HTML using pdf parser and OCR.
# However in the end we use the pdf directly and rely on OpenAI's extraction.

QUARTER = "Q1"

def extract_text_pymupdf(pdf_path):
    """Try extracting text with PyMuPDF."""
    text = ""
    doc = fitz.open(pdf_path)
    for page in doc:
        try:
            text += page.get_text("text") + "\n"
        except Exception as e:
            print(f"‚ùå Error extracting page {page.number} in {pdf_path}: {e}")
    return text.strip()

def extract_text_ocr(pdf_path):
    """Fallback OCR if no text found."""
    print(f"‚ö†Ô∏è Running OCR for {pdf_path} (image-based PDF)...")
    text = ""
    images = convert_from_path(pdf_path)
    for i, img in enumerate(images, start=1):
        try:
            text += pytesseract.image_to_string(img) + "\n"
        except Exception as e:
            print(f"‚ùå OCR failed on page {i} in {pdf_path}: {e}")
    return text.strip()

results = []

# Process each PDF
for i, row in df.iterrows():
    if QUARTER == "Q2":
        url = row["report_source"]
    elif QUARTER == "Q1":
        url = row["Q1 2025 report"]
    elif QUARTER == "Q4_2024":
        url = row["Q4 2024 report"]
    ticker = row['ticker']   # <-- adjust column name if needed
    subfolder = ""
    if QUARTER != "Q2":
        subfolder = QUARTER
    # Create all required folders
    for folder in ["pdfs", "htmls", "ocr"]:
        os.makedirs(os.path.join(folder, subfolder), exist_ok=True)

    pdf_filename = os.path.join("pdfs", subfolder, f"file_{ticker}.pdf")
    html_filename = os.path.join("htmls", subfolder, f"file_{ticker}.html")
    ocr_filename = os.path.join("ocr", subfolder, f"file_{ticker}.html")
    
    if pd.isna(url):
        print(f"‚ùå No URL for {ticker}")
        continue
    if 'drive.google.com' in url:
        parts = url.split('/d/')
        if len(parts) > 1:
            file_id = parts[1].split('/')[0]
            url = f"https://drive.google.com/uc?export=download&id={file_id}"

    if not url or (os.path.exists(pdf_filename) and valid_file[f"{QUARTER}_{ticker}"]):
        continue
    try:
        # Download PDF
        response = requests.get(url)
        response.raise_for_status()
        with open(pdf_filename, "wb") as f:
            f.write(response.content)
        print(f"‚úÖ Downloaded: {pdf_filename}")

        # Extract text (PyMuPDF and OCR)
        text_pymupdf = extract_text_pymupdf(pdf_filename)
        text_ocr = extract_text_ocr(pdf_filename)
        
        if text_ocr:
            html = f"<html><body><pre>{text_ocr}</pre></body></html>"
            
            with open(ocr_filename, "w", encoding="utf-8") as f:
                f.write(html)

            print(f"üìÑ Extracted text ‚Üí {html_filename} and {ocr_filename}")
            results.append({"file": pdf_filename, "status": "success"})
        else:
            print(f"‚ö†Ô∏è No text could be extracted using ocr from {pdf_filename}")
            results.append({"file": pdf_filename, "status": "no text"})

        # Save to HTML (htmls/ and static/)
        if text_pymupdf:
            html = f"<html><body><pre>{text_pymupdf}</pre></body></html>"

            with open(html_filename, "w", encoding="utf-8") as f:
                f.write(html)


            print(f"üìÑ Extracted text ‚Üí {html_filename} and {ocr_filename}")
            results.append({"file": pdf_filename, "status": "success"})
        else:
            print(f"‚ö†Ô∏è No text could be extracted using pymupdf from {pdf_filename}")
            results.append({"file": pdf_filename, "status": "no text"})

    except Exception as e:
        print(f"‚ùå Failed to process {url}: {e}")
        results.append({"file": pdf_filename, "status": f"error: {e}"})

log_path = "extraction_log.csv"
pd.DataFrame(results).to_csv(log_path, index=False)
print(f"\nüìä Extraction log saved to {log_path}")

In [None]:
from bs4 import BeautifulSoup

html_path = "htmls/file_1.html"

with open(html_path, "r", encoding="utf-8") as f:
    html_content = f.read()

soup = BeautifulSoup(html_content, "html.parser")
text = soup.get_text()

# Print everything (can be very long!)
print(text)

In [None]:
from openai import OpenAI
client = OpenAI()

response = client.responses.create(
    model="gpt-5",
    reasoning={"effort": "medium"},
    input=[{"role": "user", "content": "Hello, are you working?"}]
)

print(response.output_text)

In [None]:
def load_pickle(file_name: str):
    if os.path.exists(f'data/{file_name}'):
        with open(f'data/{file_name}', "rb") as f:
            return pickle.load(f)
    else:
        return {}

def save_pickle(file_name: str, data):
    with open(f'data/{file_name}', "wb") as f:
        pickle.dump(data, f)


## Upload Files to OpenAI

In [None]:
ids = load_pickle(f'ticker_file_id_map.pickle')
QUARTER = "Q1"

for i, row in df.iterrows():
    ticker = row['ticker']
    id = f"{QUARTER}_{ticker}"
    if id in ids and valid_file[id] is not False:
        continue
    pdf_path = f"pdfs/{QUARTER}/file_{ticker}.pdf"
    if not os.path.exists(pdf_path):
        continue
    ids[id] = client.files.create(
        file=(QUARTER + "_" + pdf_path.split("/")[-1], open(pdf_path, 'rb').read()),
        purpose="user_data",
    ).id

save_pickle(f'ticker_file_id_map.pickle', ids)
len(ids), ids

In [None]:
valid_file = load_pickle(f'valid_file.pickle')

QUARTER = "Q1"

for i, row in df.iterrows():
    ticker = row['ticker']
    id = f"{QUARTER}_{ticker}"
    if id not in ids or (id in valid_file and valid_file[id] is not False):
        continue
    print(f"Processing {id}")
    ending_month = ""
    match QUARTER:
        case "Q1":
            ending_month = "March 2025"
        case "Q2":
            ending_month = "June 2025"
        case "Q4_2024":
            ending_month = "December 2024"
    prompt = f"Does this file contain decent amount of information about earnings, e.g. not one paragraph but actual numbers of respective changes in metrics between quarters and so on? Answer yes or no on the last line. Also answer no if the file doesn't cover the period ending {ending_month}."
    try:
        response = client.responses.create(
                    model="gpt-4o",
                    #temperature=0.7,  # keep >0 so you see variation
                input=[{
                        "role": "user",
                        "content": [
                            {"type": "input_text", "text": prompt},
                            {
                                "type": "input_file",
                                "file_id": ids[id],
                            },
                        ],
                    }]
            )
    except Exception as e:
        if "The file type you uploaded is not supported" in str(e):
            print(f"Ticker {id} has an unsupported file type")
            valid_file[id] = False
        else:
            print(f"Error processing {id}: {e}")
        continue
    answer = response.output_text.strip().split("\n")[-1]
    if "no" in answer.lower():
        print(f"Ticker {id} does not have earnings information")
        valid_file[id] = False
    elif "yes" in answer.lower():
        valid_file[id] = True
    else:
        print(f"Ticker {id} has an invalid answer: {answer}")

save_pickle(f'valid_file.pickle', valid_file)

for id, is_valid in valid_file.items():
    if is_valid is False:
        print(id)

## Validate curated data

In [None]:
# Verify the estimates only contain expectations

for i, row in df.iterrows():
    ticker = row['ticker']
    estimates = row['Estimates']
    prompt = f"Evaluate if the following text contains only comparisons to expectations or also information on how the stock has moved afterwards. Answer simply 'Only expectations' or 'Also price movement'. Text: {estimates}"
    response = client.responses.create(
                    model="gpt-4o",
                    #temperature=0.7,  # keep >0 so you see variation
                input=[{
                        "role": "user",
                        "content": [
                            {"type": "input_text", "text": prompt},
                        ],
                    }]
            )
    print(f"{ticker} - {response.output_text}")

In [None]:
# Classify etimations

for i, row in df.iterrows():
    ticker = row['ticker']
    estimates = row['Estimates.2']
    prompt = f"Evaluate if the following expectations suggest Beat, Disappoint or Meet. Answer simply 'Beat', 'Disappoint' or 'Meet' or 'Unknown'. Text: {estimates}"
    response = client.responses.create(
                    model="gpt-5.1",
                    #temperature=0.7,  # keep >0 so you see variation
                input=[{
                        "role": "user",
                        "content": [
                            {"type": "input_text", "text": prompt},
                        ],
                    }]
            )
    print(f"{response.output_text}")

## Traditional accuracy 

In [194]:
# S&P500 change


matches = 0
evals = 0
matches_per_category = defaultdict(int)
evals_per_category = defaultdict(int)
threshold = 0.009
for suffix in ["", ".1", ".2"]:
    for i, row in df.iterrows():
        ticker = row['ticker']
        sap500_before_close = row[f'sap500_before_close{suffix}']
        sap500_after_open = row[f'sap500_after_open{suffix}']
        mapped_movement = ""
        if abs(sap500_before_close - sap500_after_open)/sap500_before_close >= threshold:
            #print(f'{ticker} - {suffix} - {sap500_before_close} - {sap500_after_open}')
            mapped_movement = "flat"
        elif sap500_before_close < sap500_after_open:
            mapped_movement = "up"
        else:
            mapped_movement = "down"
        real_movement = row[f'Change on open{suffix}'].lower()
        if mapped_movement == real_movement:
            matches += 1
            matches_per_category[real_movement] += 1
        evals += 1
        evals_per_category[real_movement] += 1

for category in ["up", "down", "flat"]:
    print(f"{category}: {matches_per_category[category]/evals_per_category[category]}")
print(f"Accuracy: {matches/evals}")
print(f'evals: {evals}')

up: 0.5272727272727272
down: 0.3404255319148936
flat: 0.16666666666666666
Accuracy: 0.4
evals: 120


In [None]:
# Accuracy of analysts expectations matching the actual movement

matches = 0
evals = 0
matches_per_category = defaultdict(int)
evals_per_category = defaultdict(int)
for suffix in ["", ".1", ".2"]:
    for i, row in df.iterrows():
        ticker = row['ticker']
        sentiment = row[f'estimates_sentiment{suffix}']
        if sentiment == "Unknown":
            continue
        mapped_movement = ""
        if sentiment == "Beat":
            mapped_movement = "up"
        elif sentiment == "Disappoint":
            mapped_movement = "down"
        elif sentiment == "Meet":
            mapped_movement = "flat"
        else:
            raise ValueError(f"Invalid sentiment: {sentiment}")
        
        real_movement = row[f'Change on close{suffix}'].lower()
        if mapped_movement == real_movement:
            matches += 1
            matches_per_category[sentiment] += 1
            matches_per_category[real_movement] += 1
        evals += 1
        evals_per_category[sentiment] += 1
        evals_per_category[real_movement] += 1

print(f"Accuracy: {matches/evals}")
print()
for category in ["Beat", "Disappoint", "Meet", "up", "down", "flat"]:
    print(f"{category}: {matches_per_category[category]/evals_per_category[category]}")



Accuracy: 0.44954128440366975

Beat: 0.40789473684210525
Disappoint: 0.6296296296296297
Meet: 0.16666666666666666
up: 0.775
down: 0.3269230769230769
flat: 0.058823529411764705
