In [16]:
import ast
import json
import os
import re
from typing import List, Dict

import fitz  # PyMuPDF
import openai
import pandas as pd
from openai import OpenAI

In [7]:
# --- OpenAI API key ---
# Read the key from the OPENAI_API_KEY environment variable so the secret is
# never stored in the notebook. os.getenv returns None when the variable is
# unset. This line needs the `openai` module itself to be imported; importing
# only the `OpenAI` class is not enough.
openai.api_key = os.getenv("OPENAI_API_KEY")

# --- PDF Parsing ---
def extract_text_from_pdfs(pdf_paths: List[str]) -> Dict[str, str]:
    """Extract the plain text of each PDF, keyed by file name.

    Args:
        pdf_paths: Paths of the PDF files to read.

    Returns:
        A dict mapping each path's base name (``os.path.basename``) to the
        concatenated text of all pages in that file. Paths that share a base
        name overwrite one another; the last one wins.
    """
    texts = {}
    for path in pdf_paths:
        # The context manager closes the document (and its file handle) even
        # when text extraction raises part-way through the pages.
        with fitz.open(path) as doc:
            texts[os.path.basename(path)] = "".join(page.get_text() for page in doc)
    return texts

In [21]:
from openai import OpenAI

# SECURITY: a live secret key used to be hard-coded on this line. Treat that
# key as compromised and revoke it; the key is now taken from the
# OPENAI_API_KEY environment variable so it never lives in the notebook.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def _parse_llm_dict(reply) -> Dict:
    """Turn the model's reply into a dict without ever executing it.

    Only the outermost ``{...}`` span is parsed, so Markdown code fences or a
    sentence around the object are tolerated. JSON is tried first; a Python
    dict literal (single quotes) is accepted through ``ast.literal_eval``,
    which evaluates literals only. Anything else yields ``{}``.
    """
    if not reply:
        return {}
    start, end = reply.find("{"), reply.rfind("}")
    if start == -1 or end < start:
        return {}
    candidate = reply[start:end + 1]
    for parser in (json.loads, ast.literal_eval):
        try:
            parsed = parser(candidate)
        except (ValueError, SyntaxError, TypeError):
            continue
        if isinstance(parsed, dict):
            return parsed
    return {}


def llm_extract_info(text: str, model: str = "gpt-3.5-turbo") -> Dict[str, str]:
    """Use an OpenAI chat model to extract structured campaign data from raw text.

    Args:
        text: Raw business-document text to analyse.
        model: Chat model name passed to the API (e.g. "gpt-4o").

    Returns:
        The dict parsed from the model's reply, or ``{}`` when the reply is
        empty or cannot be parsed as a dict. The prompt asks the model to use
        "N/A" for fields it cannot find.
    """
    prompt = f"""
You are an AI assistant for a crowdfunding platform. Extract the following structured fields from this business document text:

- Company Name
- Industry
- Funding Requested (EUR)
- Revenue Last Year (EUR)
- EBIT (EUR)
- Use of Funds
- Business Model
- Target Market
- Go-To-Market Strategy
- Team Info (summarize founders or key members)
- Vision
- Mission

If a field is not found, return "N/A".

Business Document:
{text}

Return the result as a JSON dictionary.
"""
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0
    )

    # The reply is untrusted text: parse it as data, never eval() it.
    return _parse_llm_dict(response.choices[0].message.content)

In [9]:
# --- Compliance Checklist ---
def compliance_check(info: Dict[str, str]) -> pd.DataFrame:
    """Report which required campaign fields carry a usable value.

    Args:
        info: Extracted campaign fields keyed by field name.

    Returns:
        A DataFrame with one row per required field and the columns
        ``Field`` and ``Status``. A field is marked missing when its key is
        absent or its value is ``None``, blank, or the "N/A" placeholder.
    """
    fields = [
        "Company Name", "Funding Requested (EUR)", "Revenue Last Year (EUR)",
        "EBIT (EUR)", "Use of Funds", "Team Info", "Business Model", "Target Market"
    ]

    def _is_provided(value) -> bool:
        # Key presence alone is not enough: a placeholder value means the
        # field was not actually found in the document.
        if value is None:
            return False
        if isinstance(value, str):
            return value.strip().upper() not in {"", "N/A"}
        return True

    checklist = [
        {"Field": field, "Status": "✅" if _is_provided(info.get(field)) else "⚠️ Missing"}
        for field in fields
    ]
    return pd.DataFrame(checklist)

# Benchmarking (not ready yet)

In [None]:
# --- Benchmarking ---
def _parse_eur_amount(value) -> int:
    """Best-effort conversion of an extracted amount to a whole number.

    Accepts numbers and strings such as "EUR 250,000" or "-80,000": the first
    numeric run is used, commas are treated as thousands separators and any
    fractional part is truncated. Returns 0 when no number is present
    (e.g. "N/A" or None). Magnitude suffixes ("1.2M") and European
    dot-grouped thousands are not interpreted.
    """
    if isinstance(value, (int, float)):
        try:
            return int(value)
        except (ValueError, OverflowError):  # NaN / infinity
            return 0
    if not isinstance(value, str):
        return 0
    match = re.search(r"-?\d[\d,]*(?:\.\d+)?", value)
    if match is None:
        return 0
    return int(float(match.group().replace(",", "")))


def benchmark_score(info: Dict[str, str]) -> pd.DataFrame:
    """Compare a campaign's key figures with fixed reference averages.

    Args:
        info: Extracted campaign fields keyed by field name.

    Returns:
        A DataFrame indexed by metric name with the columns
        ``Your Campaign`` and ``Market Average``. Metrics that are absent or
        carry no parsable number are reported as 0.
    """
    # Reference values are hard-coded in this function.
    avg_data = {
        "Funding Requested (EUR)": 200000,
        "Revenue Last Year (EUR)": 400000,
        "EBIT (EUR)": 50000
    }
    actual = {field: _parse_eur_amount(info.get(field)) for field in avg_data}
    return pd.DataFrame({
        "Your Campaign": actual,
        "Market Average": avg_data
    })

# Main process function

In [23]:
# --- Main Processing Function ---
def process_company_pdfs_llm(company_name: str, pdf_paths: List[str], max_chars: int = 3000) -> Dict:
    """Extract campaign fields from a company's PDFs and build its checklist.

    Args:
        company_name: Label stored under the "company" key of the result.
        pdf_paths: Paths of the PDF files belonging to the company.
        max_chars: Number of leading characters of each file's text that is
            sent to the LLM (defaults to 3000).

    Returns:
        A dict with the keys "company", "extracted_info" (fields merged
        across all files, skipping "N/A" values) and "checklist" (the
        compliance DataFrame for the merged fields).
    """
    extracted_texts = extract_text_from_pdfs(pdf_paths)
    merged_info = {}
    for text in extracted_texts.values():
        info = llm_extract_info(text[:max_chars])
        # Files are merged in order: a later file's value replaces an earlier
        # one for the same field, but "N/A" never overwrites a found value.
        merged_info.update({k: v for k, v in info.items() if v != "N/A"})
    checklist_df = compliance_check(merged_info)
    return {
        "company": company_name,
        "extracted_info": merged_info,
        "checklist": checklist_df
    }

In [24]:
# Run the full pipeline on the sample campaign document.
company_name = "GreenFlow Solutions"
results = process_company_pdfs_llm(
    company_name=company_name,
    pdf_paths=["sample_campaign.pdf"],
)


In [25]:
# Fields merged from the LLM extraction; entries whose value was "N/A" were dropped during merging.
results["extracted_info"]

{'Company Name': 'GreenFlow Solutions',
 'Industry': 'Sustainable Energy',
 'Funding Requested (EUR)': 'EUR 250,000',
 'Revenue Last Year (EUR)': 'EUR 480,000',
 'EBIT (EUR)': 'EUR 80,000',
 'Use of Funds': 'Hiring engineers, expanding product features, marketing campaigns',
 'Business Model': 'B2B subscription for solar energy monitoring',
 'Target Market': 'Medium-sized industrial firms in the EU',
 'Go-To-Market Strategy': 'Direct sales + energy tech partnerships',
 'Team Info': '5 full-time employees, with backgrounds in engineering and cleantech',
 'Vision': 'To make industrial solar energy management smarter and more efficient',
 'Mission': 'Reduce carbon footprint of industrial operations by 30%'}

In [26]:
# Compliance checklist DataFrame built from the merged fields.
results["checklist"]

Unnamed: 0,Field,Status
0,Company Name,✅
1,Funding Requested (EUR),✅
2,Revenue Last Year (EUR),✅
3,EBIT (EUR),✅
4,Use of Funds,✅
5,Team Info,✅
6,Business Model,✅
7,Target Market,✅
