# Notebook cell In [4]:
"""
Lead Intelligence Agent with LLM-based NAM Readiness Detection
==============================================================

Adds:
- LLM-style paper reading
- NAM-readiness detection
- Scientific intent signal boost
- Full test run with printed output

Safe-by-design: mock LLM can be replaced with real LLM API.
"""

import re
from datetime import datetime
from typing import List, Dict

# -------------------------------------------------------------------
# CONFIGURATION
# -------------------------------------------------------------------

# Captured once at import time; publication recency is measured against it.
CURRENT_YEAR = datetime.now().year

# Lowercase substrings searched for in a job title to decide functional fit.
TARGET_TITLES = [
    "toxicology",
    "safety",
    "preclinical",
    "hepatic",
]

# Lowercase substrings searched for in a job title to decide seniority.
SENIORITY_KEYWORDS = [
    "director",
    "head",
    "vp",
    "chief",
]

# Company HQ locations that earn the location bonus (exact, case-sensitive
# membership test).
INNOVATION_HUBS = [
    "Boston",
    "Cambridge",
    "San Francisco",
    "Basel",
    "Oxford",
    "London",
]

# Funding stages that earn the funding bonus (exact membership test).
FUNDED_STAGES = [
    "Series A",
    "Series B",
    "Series C",
]

# Lowercase terms whose presence in an abstract signals NAM-readiness.
NAM_KEYWORDS = [
    "new approach methodologies",
    "nam",
    "organ-on-chip",
    "3d cell culture",
    "hepatic spheroid",
    "microphysiological system",
    "in vitro liver model",
]

# -------------------------------------------------------------------
# LLM-BASED PAPER READING (MOCK, SWAPPABLE)
# -------------------------------------------------------------------

def llm_detect_nam_readiness(paper_abstract: str) -> Dict:
    """
    Simulates an LLM reading a paper abstract to detect NAM-readiness.

    Replace this logic with:
    - OpenAI
    - Azure OpenAI
    - Claude
    - Local LLM (Ollama)

    Output is structured and deterministic.

    Matching rules of this mock:
    - Case-insensitive.
    - Whitespace-insensitive: runs of whitespace (including line breaks)
      are collapsed to a single space, so a multi-word keyword that is
      wrapped across two lines of the abstract still matches.
    - Whole-term: a keyword only counts when it is not embedded in a
      longer word, so "nam" does not fire on "dynamic" or "name". An
      optional plural "s" is accepted ("NAMs", "hepatic spheroids").

    Args:
        paper_abstract: Abstract text. ``None`` or an empty string is
            treated as "no evidence" instead of raising.

    Returns:
        A dict with:
        - "nam_ready": True when at least one keyword was detected.
        - "detected_terms": matched keywords, in NAM_KEYWORDS order.
        - "confidence": 0.25 per detected keyword, capped at 1.0.
    """

    if not paper_abstract:
        return {
            "nam_ready": False,
            "detected_terms": [],
            "confidence": 0.0
        }

    # Collapse newlines/indentation so phrase keywords survive line wraps.
    normalized = " ".join(paper_abstract.lower().split())

    # Lookarounds are used instead of \b so keywords that start or end
    # with non-letter characters (e.g. "3d ...", "organ-on-chip") are
    # still bounded correctly.
    detected_terms = [
        keyword
        for keyword in NAM_KEYWORDS
        if re.search(rf"(?<!\w){re.escape(keyword)}s?(?!\w)", normalized)
    ]

    return {
        "nam_ready": len(detected_terms) > 0,
        "detected_terms": detected_terms,
        "confidence": min(len(detected_terms) * 0.25, 1.0)
    }

# -------------------------------------------------------------------
# STAGE 1: IDENTIFICATION
# -------------------------------------------------------------------

def identify_leads(linkedin_profiles: List[Dict],
                   pubmed_authors: List[Dict]) -> List[Dict]:
    """
    Build the initial lead list from LinkedIn profiles and PubMed authors.

    LinkedIn profiles are kept only when their lowercased title contains
    at least one TARGET_TITLES keyword. Every PubMed author is kept and
    is given the fixed title "Research Scientist".

    Args:
        linkedin_profiles: dicts with "name", "title", "company",
            "linkedin_url" and "location" keys.
        pubmed_authors: dicts with "name", "affiliation",
            "recent_papers", "last_pub_year" and "abstract" keys.

    Returns:
        A new list: the matching LinkedIn leads followed by the PubMed
        leads, each in input order.

    Raises:
        KeyError: if a record is missing one of the keys listed above.
    """
    from_linkedin = [
        {
            "name": person["name"],
            "title": person["title"],
            "company": person["company"],
            "linkedin": person["linkedin_url"],
            "person_location": person["location"],
            "source": "LinkedIn",
        }
        for person in linkedin_profiles
        if any(term in person["title"].lower() for term in TARGET_TITLES)
    ]

    from_pubmed = [
        {
            "name": researcher["name"],
            "title": "Research Scientist",
            "company": researcher["affiliation"],
            "recent_publications": researcher["recent_papers"],
            "last_publication_year": researcher["last_pub_year"],
            "paper_abstract": researcher["abstract"],
            "source": "PubMed",
        }
        for researcher in pubmed_authors
    ]

    return from_linkedin + from_pubmed

# -------------------------------------------------------------------
# STAGE 2: ENRICHMENT
# -------------------------------------------------------------------

def enrich_lead(lead: Dict,
                company_db: Dict,
                contact_db: Dict) -> Dict:
    """
    Attach company, contact and NAM-readiness fields to a lead.

    The lead dict is updated in place and the same object is returned.

    Args:
        lead: Lead dict with at least "name" and "company"; may carry a
            "paper_abstract".
        company_db: Maps company name to a dict that may contain
            "hq_location", "funding_stage" and "uses_invitro_models".
        contact_db: Maps person name to a dict that may contain "email"
            and "phone".

    Returns:
        The same ``lead`` object with these keys always set:
        "company_hq", "funding_stage", "technographic_fit", "email",
        "phone", "nam_ready", "nam_terms" and "nam_confidence". Company
        and contact values missing from the lookups are None
        ("technographic_fit" defaults to False).

    Raises:
        KeyError: if ``lead`` has no "name" or "company".
    """

    company_info = company_db.get(lead["company"], {})
    contact_info = contact_db.get(lead["name"], {})

    lead["company_hq"] = company_info.get("hq_location")
    lead["funding_stage"] = company_info.get("funding_stage")
    lead["technographic_fit"] = company_info.get("uses_invitro_models", False)

    lead["email"] = contact_info.get("email")
    lead["phone"] = contact_info.get("phone")

    # LLM-based paper analysis (if available). A missing, None or empty
    # abstract is treated as "nothing to analyse" rather than being
    # passed to the detector.
    abstract = lead.get("paper_abstract")
    if abstract:
        llm_result = llm_detect_nam_readiness(abstract)
        lead["nam_ready"] = llm_result["nam_ready"]
        lead["nam_terms"] = llm_result["detected_terms"]
        lead["nam_confidence"] = llm_result["confidence"]
    else:
        lead["nam_ready"] = False
        # Keep the schema identical for every lead so consumers can read
        # lead["nam_terms"] without a KeyError.
        lead["nam_terms"] = []
        lead["nam_confidence"] = 0.0

    return lead

# -------------------------------------------------------------------
# STAGE 3: PROPENSITY-TO-BUY SCORING
# -------------------------------------------------------------------

def score_lead(lead: Dict) -> int:
    """
    Compute a propensity-to-buy score for an enriched lead.

    Components (their maxima add up to 120; the total is capped at 100):
    - Role fit: 15 for a TARGET_TITLES keyword in the title, plus 15 for
      a SENIORITY_KEYWORDS keyword.
    - Scientific intent: 20 for having recent publications, plus 20 when
      the last publication is at most 2 years before CURRENT_YEAR.
    - NAM readiness: 20 scaled by "nam_confidence" (truncated to int),
      only when "nam_ready" is truthy.
    - Funding: 20 when "funding_stage" is in FUNDED_STAGES.
    - Location: 10 when "company_hq" is in INNOVATION_HUBS.

    Args:
        lead: Lead dict; "title" is required, everything else optional.

    Returns:
        Integer score, never above 100.
    """
    job_title = lead["title"].lower()

    # Role fit: one 15-point award per vocabulary found in the title.
    role_fit = sum(
        15
        for vocabulary in (TARGET_TITLES, SENIORITY_KEYWORDS)
        if any(term in job_title for term in vocabulary)
    )

    # Scientific intent: the recency bonus only applies to leads that
    # have publications at all.
    scientific_intent = 0
    if lead.get("recent_publications", 0) > 0:
        years_since_last = CURRENT_YEAR - lead.get(
            "last_publication_year", CURRENT_YEAR
        )
        scientific_intent = 40 if years_since_last <= 2 else 20

    # NAM readiness (LLM-derived): a missing confidence counts as 1.
    nam_readiness = 0
    if lead.get("nam_ready"):
        nam_readiness = int(20 * lead.get("nam_confidence", 1))

    funding = 20 if lead.get("funding_stage") in FUNDED_STAGES else 0
    location = 10 if lead.get("company_hq") in INNOVATION_HUBS else 0

    total = role_fit + scientific_intent + nam_readiness + funding + location
    return min(total, 100)

# -------------------------------------------------------------------
# PIPELINE
# -------------------------------------------------------------------

def run_pipeline(linkedin_data, pubmed_data, company_db, contact_db):
    """
    Run identification, enrichment, scoring and ranking end to end.

    Args:
        linkedin_data: LinkedIn profile dicts passed to identify_leads.
        pubmed_data: PubMed author dicts passed to identify_leads.
        company_db: Company lookup passed to enrich_lead.
        contact_db: Contact lookup passed to enrich_lead.

    Returns:
        List of enriched lead dicts sorted by "score", highest first,
        each carrying a 1-based "rank". Leads with equal scores keep
        their identification order because the sort is stable.
    """
    scored = []
    for candidate in identify_leads(linkedin_data, pubmed_data):
        record = enrich_lead(candidate, company_db, contact_db)
        record["score"] = score_lead(record)
        scored.append(record)

    # In-place descending sort; stable, so ties keep identification order.
    scored.sort(key=lambda item: item["score"], reverse=True)

    for position, record in enumerate(scored, start=1):
        record["rank"] = position

    return scored

# -------------------------------------------------------------------
# TEST RUN (REALISTIC MOCK DATA)
# -------------------------------------------------------------------

# Demo run: builds small in-memory fixtures, runs the full pipeline and
# prints one summary line per ranked lead.
if __name__ == "__main__":

    # LinkedIn fixture: the title contains both a target keyword
    # ("toxicology") and a seniority keyword ("director").
    linkedin_mock = [
        {
            "name": "Dr. Sarah Collins",
            "title": "Director of Toxicology",
            "company": "HepatoBio",
            "location": "Remote - Texas",
            "linkedin_url": "https://linkedin.com/in/sarahcollins"
        }
    ]

    # PubMed fixture: published last year, with an abstract that mentions
    # NAM-related terms.
    pubmed_mock = [
        {
            "name": "Dr. Michael Zhang",
            "affiliation": "Cambridge Biotech",
            "recent_papers": 2,
            "last_pub_year": CURRENT_YEAR - 1,
            "abstract": """
            We describe a 3D hepatic spheroid in vitro model
            designed to improve prediction of drug-induced
            liver injury using new approach methodologies.
            """
        }
    ]

    # Company lookup keyed by the lead's "company" value.
    company_db = {
        "HepatoBio": {
            "hq_location": "Cambridge",
            "funding_stage": "Series B",
            "uses_invitro_models": True
        },
        "Cambridge Biotech": {
            "hq_location": "Cambridge",
            "funding_stage": "Series A",
            "uses_invitro_models": False
        }
    }

    # Contact lookup keyed by the lead's "name" value. There is no entry
    # for Dr. Michael Zhang, so his email and phone are enriched as None.
    contact_db = {
        "Dr. Sarah Collins": {
            "email": "s.collins@hepatobio.com"
        }
    }

    results = run_pipeline(
        linkedin_mock,
        pubmed_mock,
        company_db,
        contact_db
    )

    # One line per lead, in rank order.
    print("\n=== RANKED LEADS ===\n")
    for r in results:
        print(
            f"{r['rank']}. {r['name']} | Score: {r['score']} | "
            f"NAM-Ready: {r['nam_ready']} | Company: {r['company']}"
        )



# Output of the test run above (captured from the notebook):
#
# === RANKED LEADS ===
#
# 1. Dr. Michael Zhang | Score: 80 | NAM-Ready: True | Company: Cambridge Biotech
# 2. Dr. Sarah Collins | Score: 60 | NAM-Ready: False | Company: HepatoBio
