In [2]:
!pip install rapidfuzz

def print_program_overview():
    """
    Write a banner to stdout describing the extractor and its active filters.

    The text is driven by the module-level settings DISEASE_NAME,
    PHASE1..PHASE4, ACADEMIC_HOSPITAL, OTHERS, APPROVED_DRUGS and MAX_ASSETS.
    Nothing is returned.
    """
    def status(enabled):
        # Render a boolean filter flag the way the banner displays it.
        return 'INCLUDED' if enabled else 'EXCLUDED'

    rule = "=" * 80

    # Title
    print(rule)
    print("CLINICAL TRIALS & FDA APPROVED DRUGS ASSET EXTRACTOR")
    print(rule)
    print()

    # Purpose
    print("This program searches for pharmaceutical assets (drugs and biologics) related to")
    print(f"a specific disease condition: '{DISEASE_NAME}'")
    print()

    # Data sources
    print("DATA SOURCES:")
    print("• ClinicalTrials.gov - Active clinical trials with drug/biologic interventions")
    print("• FDA Drug Labels API - Approved medications with matching indications")
    print()

    # Filter 1: one line per trial phase switch
    print("FILTERING PROCESS:")
    print("1. CLINICAL TRIAL PHASE FILTERING:")
    phase_flags = (PHASE1, PHASE2, PHASE3, PHASE4)
    for phase_number, enabled in enumerate(phase_flags, start=1):
        print(f"   • Phase {phase_number} trials: {status(enabled)}")
    print()

    # Filter 2: sponsor categories
    print("2. SPONSOR TYPE FILTERING:")
    print(f"   • Academic/Hospital/Other sponsors: {status(ACADEMIC_HOSPITAL)}")
    print(f"   • Other non-pharmaceutical sponsors: {status(OTHERS)}")
    print("   • Known pharmaceutical companies: ALWAYS INCLUDED")
    print()

    # Filter 3: asset-name hygiene
    print("3. DRUG QUALITY FILTERING:")
    print("   • Excludes placebo, combination products, and generic studies")
    print("   • Focuses on single, clean asset names suitable for analysis")
    print("   • Merges multiple trials of the same asset, keeping the highest phase")
    print()

    # Filter 4: approved drugs
    print("4. FDA APPROVED DRUGS:")
    print(f"   • FDA approved medications: {status(APPROVED_DRUGS)}")
    print("   • De-duplicates by drug root name, keeping earliest approval")
    print()

    # Step 5: mechanism enrichment
    print("5. MECHANISM OF ACTION ENRICHMENT:")
    print("   • Queries ChEMBL and Open Targets to find drug targets/mechanisms")
    print("   • Enhances each asset with biological target information")
    print()

    # Footer
    print(f"FINAL OUTPUT: Up to {MAX_ASSETS} assets with clinical and regulatory data")
    print(rule)
    print()

# ----------------- Known sponsor companies and their lowercase name aliases -----------------

# Canonical display names of companies treated as known pharmaceutical sponsors.
# The list is roughly (not strictly) alphabetical and every name appears once.
# NOTE(review): 'Halliburton' and 'Nemetschek' do not look like pharma/biotech
# sponsors - confirm they are intended to be here.
KNOWN_COMPANIES = [
    'AC Immune', 'AbbVie', 'AbCellera', 'Abivax S.A.', 'Acadia Pharmaceuticals', 'Acceleron Pharma', 'Acino', 'ADC Therapeutics',
    'Aenova Group', 'Agilent Technologies', 'Agomab', 'AiCuris', 'Aimmune Therapeutics', 'Akebia Therapeutics',
    'Alcon', 'Alkem Laboratories', 'Alkermes', 'Allarity Therapeutics', 'Allergan', 'Allogene Therapeutics', 'Almirall', 'Alnylam Pharmaceuticals',
    'Altis Labs Switzerland', 'AmerisourceBergen', 'Amgen', 'amedes Holding', 'Anergis', 'AnaptysBio', 'Antibiotice', 'Apellis Pharmaceuticals',
    'APR Applied Pharma Research', 'Argenx', 'Aristo Pharma', 'Arvinas', 'ARTES Biotechnology', 'Arya Sciences', 'Asceneuron', 'Ascendis Pharma',
    'Aslan Pharmaceuticals', 'Astellas Pharma', 'AstraZeneca', 'Atara Biotherapeutics', 'Aurobindo Pharma', 'Autolus Therapeutics', 'B. Braun Melsungen',
    'Basel Area Business & Innovation', 'Basilea Pharmaceutica', 'Bavarian Nordic Germany', 'Bayer', 'Becton Dickinson', 'BeiGene', 'Beda Pharmaceuticals',
    'Berlin-Chemie', 'Bio Deutschland', 'Bio-Rad Laboratories', 'BioAgilytix Germany', 'BioSpring', 'Biofrontera', 'BioMarin Pharmaceutical', 'Bionorica',
    'BioNTech', 'BioNTech US Inc.', 'Biognosys', 'Biotest', 'Biotype Diagnostics', 'Biovotion', 'Biosynth Carbosynth', 'Blueprint Medicines', 'Bluebird Bio',
    'Boehringer Ingelheim', 'Boehringer Mannheim', 'Boston Scientific', 'Bristol Myers Squibb', 'B. Braun', 'C4 Therapeutics', 'Cadila Healthcare',
    'Calliditas Therapeutics', 'CanSino Biologics', 'Capricor Therapeutics', 'Cardinal Health', 'CareDx', 'Caribou Biosciences', 'Cassava Sciences',
    'Catalent', 'Celgene', 'CellGenix', 'Celltrion', 'Centogene', 'Century Therapeutics', 'CeGaT', 'Charles River Laboratories', 'Chong Kun Dang Pharmaceutical',
    'Chugai Pharmaceutical', 'Cilian AG', 'Cilag', 'Cipla', 'Clovis Oncology', 'Codexis', 'Coloplast', 'ConvaTec', 'Corimmun', 'CRISPR Therapeutics',
    'CSL Limited', 'CureVac', 'Cytokinetics', 'Daiichi Sankyo', 'Daiichi Sankyo Germany', 'Danaher Corporation', 'DBV Technologies', 'DEAGOSTINI Pharmaceuticals',
    'Debiopharm Group', 'Deciphera Pharmaceuticals', 'Delenex Therapeutics', 'Denali Therapeutics', 'Divis Laboratories', 'Dixi Polytool', 'Dong-A ST',
    'Dr. Falk Pharma', 'Dr. Reddy\'s Laboratories', 'Drägerwerk AG', 'Dyne Therapeutics Germany', 'Editas Medicine', 'Egis Pharmaceuticals', 'Eisai', 'Eli Lilly',
    'Emergent BioSolutions', 'Endo International', 'Epigenomics', 'Erytech Pharma', 'Esbatech', 'Esteve', 'Eurofins Genomics Europe', 'Evotec', 'Exact Sciences',
    'Exelixis', 'Faes Farma', 'Farmabios Germany', 'Fate Therapeutics', 'Ferrer', 'Ferring Pharmaceuticals', 'FibroGen', 'Forma Therapeutics', 'Foundation Medicine',
    'Freenome', 'Fresenius', 'Fresenius Kabi', 'Fresenius Medical Care', 'Fujifilm Diosynth Biotechnologies', 'Fusion Pharmaceuticals', 'Galapagos NV', 'Galderma',
    'Gedeon Richter', 'Gedeon Richter Deutschland', 'Generate Biomedicines', 'Genentech', 'Genfit', 'Genmab', 'Geron', 'Gerresheimer', 'Gilead Sciences', 'Ginkgo Bioworks',
    'GlaxoSmithKline', 'Glenmark Pharmaceuticals', 'Global Blood Therapeutics', 'GNA Biosolutions', 'Green Cross', 'Grifols', 'Grünenthal', 'GSK', 'Guardant Health',
    'Guerbet', 'Gustav F. Giemsa', 'HaemoScan', 'Halliburton', 'Hanmi Pharmaceutical', 'Haplogen', 'Helsinn Group', 'Helsinn Healthcare', 'Hemopharm',
    'Heidelberg Pharma', 'Hexal', 'Hikma Pharmaceuticals', 'Hisamitsu Pharmaceutical', 'Hollister', 'Hookipa Pharma', 'Horizon Therapeutics', 'Hualan Biological Engineering',
    'Humedics', 'Hutchmed', 'I-Mab', 'Icon', 'IDT Biologika', 'IFB Stroemungsforschung', 'Illumina', 'Immatics', 'Immunocore', 'Incyte', 'Innate Pharma', 'Innovent Biologics',
    'Insitro', 'Intellia Therapeutics', 'Intercept Pharmaceuticals', 'Inventiva', 'Inovio Pharmaceuticals', 'Invitae', 'Ionis Pharmaceuticals', 'Ipsen', 'IQVIA',
    'Isarna Therapeutics', 'ITM Isotope Technologies Munich', 'Jazz Pharmaceuticals', 'JenaValve Technology', 'Jiangsu Hengrui Medicine', 'Jenapharm', 'Johnson & Johnson',
    'Juno Therapeutics', 'JW Pharmaceutical', 'Kaken Pharmaceutical', 'Karyopharm Therapeutics', 'Kenta Biotech', 'Kiadis Pharma', 'Kinarus', 'Kissei Pharmaceutical',
    'Kite Pharma', 'Klinikum rechts der Isar Translational Center', 'KRKA', 'Kura Oncology', 'Kuros Biosciences', 'Kyowa Kirin', 'Labatec Pharma', 'Labor Berlin',
    'Laboratorios Farmacéuticos Rovi', 'LEO Pharma', 'Les Laboratoires Servier', 'Ligand Pharmaceuticals', 'Lilly', 'Lonza Group', 'Lophius Biosciences', 'Lotus Pharma Germany',
    'LTS Lohmann Therapie-Systeme', 'Lupin', 'Lyell Immunopharma', 'MacroGenics', 'Magenta Therapeutics', 'Mainz Biomed', 'Mallinckrodt', 'Mammoth Biosciences', 'Mankind Pharma',
    'Mabylon', 'McKesson Corporation', 'Medacta', 'Medac GmbH', 'Medigene', 'Medipan GmbH', 'Medtech/Switzerland', 'Medtronic', 'Meiji Seika Pharma', 'Merck & Co.',
    'Merck Group', 'Merck KGaA', 'Merz Pharma', 'Mersana Therapeutics', 'Miltenyi Biotec', 'Mirati Therapeutics', 'Mitsubishi Tanabe Pharma', 'Moderna', 'Molecular Health',
    'Molecular Partners', 'Molecular Templates', 'MorphoSys', 'Mylan', 'Myovant Sciences', 'Myriad Genetics', 'Mymetics', 'Nanobiotix', 'Nanobiotix Germany', 'Natera', 'NBE-Therapeutics',
    'NBE-Therapeutics Zurich', 'Neurocrine Biosciences', 'Neurimmune', 'NextCure', 'Nemetschek', 'Nektar Therapeutics', 'Nippon Shinyaku', 'Novartis', 'Novigenix', 'Novo Nordisk',
    'Noxxon Pharma', 'Nuvation Bio', 'Obseva', 'OM Pharma', 'Oncopeptides', 'Oncgnostics', 'Oncternal Therapeutics', 'Ono Pharmaceutical', 'Onxeo', 'Orchard Therapeutics', 'Otsuka Holdings',
    'Pacific Biosciences', 'Paion', 'Parexel', 'Patheon', 'PEI (Paul-Ehrlich-Institut)', 'PerkinElmer', 'Perrigo', 'Pfizer', 'PharmaMar', 'Pharmaplan', 'Pharmaselect', 'Pharmetheus',
    'PharmTexx', 'Pharming Group N.V.', 'Pharmacyclics Germany', 'Pierre Fabre', 'Pieris Pharmaceuticals', 'Plasmavita Healthcare', 'PolyPeptide Group', 'Polyphor', 'Polpharma',
    'PPD', 'Preglem', 'Prime Medicine', 'ProteoMediX', 'Puma Biotechnology', 'Qiagen', 'Radius Health', 'Ratiopharm', 'RECRO Pharma', 'Recursion Pharmaceuticals', 'REGENXBIO', 'Regeneron Pharmaceuticals',
    'Relay Therapeutics', 'Rentschler Biopharma', 'Rentschler Fill Solutions', 'Replimune', 'Regen Lab', 'Revolution Medicines', 'RIEMSER Pharma', 'Roche', 'Rohto Pharmaceutical', 'Rovi', 'Rubius Therapeutics',
    'Sage Therapeutics', 'Samsung Biologics', 'Samsung Bioepis', 'Sana Biotechnology', 'Sandoz', 'Sanofi', 'Sanofi Pasteur', 'Santhera Pharmaceuticals', 'Sartorius', 'Sawai Pharmaceutical', 'Scil Proteins',
    'Schott Pharma', 'Seagen', 'seleon GmbH', 'Sensirion', 'Servier', 'Serum Institute of India', 'Shanghai Fosun Pharmaceutical', 'SHL Medical', 'Shield Therapeutics', 'Shionogi', 'Shire',
    'Siegfried AG', 'Sinopharm', 'Smith & Nephew', 'Sobi', 'Sosei Heptares', 'Sorrento Therapeutics', 'Sophia Genetics', 'Sophion Bioscience', 'SOTIO Biotech', 'Spark Therapeutics', 'Spexis', 'Stada Arzneimittel',
    'Stallergenes Greer Switzerland', 'Steinhoff Pharma', 'Straumann Group', 'Streamline Health', 'Sumitomo Dainippon Pharma', 'Sun Pharma', 'Sunstar', 'Surface Oncology', 'Switzerland Innovation', 'Symphogen Germany',
    'Synaptic Biomedical', 'Synlab AG', 'Syneos Health', 'Synthetic Biologics', 'SygnoMed', 'Takeda', 'Taysha Gene Therapies', 'TCR2 Therapeutics', 'Teva Pharmaceutical', 'TG Therapeutics', 'Thermo Fisher Scientific',
    'Tillotts Pharma', 'Tillotts Pharma AG', 'Tigen Pharma', 'TOLREMO therapeutics', 'Torii Pharmaceutical', 'Torrent Pharmaceuticals', 'Transcatheter Technologies', 'Transgene', 'Trillium Therapeutics', 'Tsumura',
    'Turning Point Therapeutics', 'Twist Bioscience', 'uniQure', 'UCB', 'UCB Pharma Germany', 'Valneva', 'Vaccibody Germany', 'Veracyte', 'Verona Pharma', 'Verovaccines', 'Verve Therapeutics', 'Vertex Pharmaceuticals',
    'Vetter Pharma', 'Viatris', 'Vifor Pharma', 'ViroVet Germany', 'Vision Health', 'Vita 34', 'Wacker Biotech', 'Waters Corporation', 'Weleda AG', 'WuXi AppTec', 'WuXi Biologics', 'Wyeth', 'Xencor', 'Y-mAbs Therapeutics',
    'Ypsomed', 'Yuhan Corporation', 'Zai Lab', 'Zeltia', 'Zentiva', 'Zimmer Biomet', 'Zur Rose Group', 'Zydus Cadila', 'Zymeworks', 'Zymergen'
]

# Canonical company name -> list of lowercase aliases used to recognise that
# company in free-text sponsor names. Every company in KNOWN_COMPANIES has an
# entry, and all aliases are lowercase.
# NOTE(review): some aliases are shared between entries ('fresenius', 'helsinn',
# 'gsk', 'lilly') and several are very short ('bd', 'ono', 'tg', 'jw'); how these
# resolve depends on the matching code, which is outside this block - confirm.
mapping = {
    'AC Immune': ['ac immune'],
    'AbbVie': ['abbvie','abbott'],
    'AbCellera': ['abcellera'],
    'Abivax S.A.': ['abivax'],
    'Acadia Pharmaceuticals': ['acadia pharmaceuticals', 'acadia'],
    'Acceleron Pharma': ['acceleron pharma', 'acceleron'],
    'Acino': ['acino'],
    'ADC Therapeutics': ['adc therapeutics', 'adc'],
    'Aenova Group': ['aenova group', 'aenova'],
    'Agilent Technologies': ['agilent technologies', 'agilent'],
    'Agomab': ['agomab', 'agomab spain'],
    'AiCuris': ['aicuris'],
    'Aimmune Therapeutics': ['aimmune therapeutics', 'aimmune'],
    'Akebia Therapeutics': ['akebia therapeutics', 'akebia'],
    'Alcon': ['alcon'],
    'Alkem Laboratories': ['alkem laboratories', 'alkem'],
    'Alkermes': ['alkermes'],
    'Allarity Therapeutics': ['allarity therapeutics', 'allarity'],
    'Allergan': ['allergan'],
    'Allogene Therapeutics': ['allogene therapeutics', 'allogene'],
    'Almirall': ['almirall'],
    'Alnylam Pharmaceuticals': ['alnylam pharmaceuticals', 'alnylam'],
    'Altis Labs Switzerland': ['altis labs switzerland'],
    'AmerisourceBergen': ['amerisourcebergen'],
    'Amgen': ['amgen'],
    'amedes Holding': ['amedes holding', 'amedes'],
    'Anergis': ['anergis'],
    'AnaptysBio': ['anaptysbio'],
    'Antibiotice': ['antibiotice'],
    'Apellis Pharmaceuticals': ['apellis pharmaceuticals', 'apellis'],
    'APR Applied Pharma Research': ['apr applied pharma research', 'apr pharma research'],
    'Argenx': ['argenx'],
    'Aristo Pharma': ['aristo pharma', 'aristo'],
    'Arvinas': ['arvinas'],
    'ARTES Biotechnology': ['artes biotechnology', 'artes'],
    'Arya Sciences': ['arya sciences'],
    'Asceneuron': ['asceneuron'],
    'Ascendis Pharma': ['ascendis pharma', 'ascendis'],
    'Aslan Pharmaceuticals': ['aslan pharmaceuticals', 'aslan'],
    'Astellas Pharma': ['astellas pharma', 'astellas'],
    'AstraZeneca': ['astrazeneca'],
    'Atara Biotherapeutics': ['atara biotherapeutics', 'atara'],
    'Aurobindo Pharma': ['aurobindo pharma', 'aurobindo'],
    'Autolus Therapeutics': ['autolus therapeutics', 'autolus'],
    'B. Braun': ['b. braun', 'braun'],
    'B. Braun Melsungen': ['b. braun melsungen', 'b braun melsungen', 'bbraun melsungen'],
    'Basel Area Business & Innovation': ['basel area business & innovation', 'basel innovation'],
    'Basilea Pharmaceutica': ['basilea pharmaceutica', 'basilea'],
    'Bavarian Nordic Germany': ['bavarian nordic germany', 'bavarian nordic de'],
    'Bayer': ['bayer'],
    'Becton Dickinson': ['becton dickinson', 'bd'],
    'BeiGene': ['beigene'],
    'Beda Pharmaceuticals': ['beda pharmaceuticals', 'beda'],
    'Berlin-Chemie': ['berlin-chemie', 'berlin chemie'],
    'Bio Deutschland': ['bio deutschland'],
    'Bio-Rad Laboratories': ['bio-rad laboratories', 'bio-rad'],
    'BioAgilytix Germany': ['bioagilytix germany', 'bioagilytix'],
    # Added: listed in KNOWN_COMPANIES but previously had no alias entry.
    'BioSpring': ['biospring'],
    'Biofrontera': ['biofrontera'],
    'BioMarin Pharmaceutical': ['biomarin pharmaceutical', 'biomarin'],
    'Bionorica': ['bionorica'],
    'BioNTech': ['biontech'],
    # Added: listed in KNOWN_COMPANIES but previously had no alias entry.
    'BioNTech US Inc.': ['biontech us inc.', 'biontech us'],
    'Biognosys': ['biognosys'],
    'Biotest': ['biotest'],
    'Biotype Diagnostics': ['biotype diagnostics', 'biotype'],
    'Biovotion': ['biovotion'],
    'Biosynth Carbosynth': ['biosynth carbosynth'],
    'Blueprint Medicines': ['blueprint medicines', 'blueprint'],
    'Bluebird Bio': ['bluebird bio', 'bluebird'],
    'Boehringer Ingelheim': ['boehringer ingelheim'],
    'Boehringer Mannheim': ['boehringer mannheim'],
    'Boston Scientific': ['boston scientific'],
    'Bristol Myers Squibb': ['bristol-myers', 'bms', 'bristol myers squibb'],
    'C4 Therapeutics': ['c4 therapeutics'],
    'Cadila Healthcare': ['cadila healthcare', 'cadila'],
    'Calliditas Therapeutics': ['calliditas therapeutics', 'calliditas'],
    'CanSino Biologics': ['cansino biologics', 'cansino'],
    'Capricor Therapeutics': ['capricor therapeutics', 'capricor'],
    'Cardinal Health': ['cardinal health'],
    'CareDx': ['caredx'],
    'Caribou Biosciences': ['caribou biosciences', 'caribou'],
    'Cassava Sciences': ['cassava sciences', 'cassava'],
    'Catalent': ['catalent'],
    'Celgene': ['celgene'],
    'CellGenix': ['cellgenix'],
    'Celltrion': ['celltrion'],
    'Centogene': ['centogene'],
    'Century Therapeutics': ['century therapeutics', 'century'],
    'CeGaT': ['cegat'],
    'Charles River Laboratories': ['charles river laboratories', 'charles river'],
    'Chong Kun Dang Pharmaceutical': ['chong kun dang pharmaceutical', 'chong kun dang'],
    'Chugai Pharmaceutical': ['chugai pharmaceutical', 'chugai'],
    'Cilian AG': ['cilian ag', 'cilian'],
    'Cilag': ['cilag'],
    'Cipla': ['cipla'],
    'Clovis Oncology': ['clovis oncology', 'clovis'],
    'Codexis': ['codexis'],
    'Coloplast': ['coloplast'],
    'ConvaTec': ['convatec'],
    'Corimmun': ['corimmun'],
    'CRISPR Therapeutics': ['crispr therapeutics', 'crispr'],
    'CSL Limited': ['csl limited', 'csl'],
    'CureVac': ['curevac'],
    'Cytokinetics': ['cytokinetics'],
    'Daiichi Sankyo': ['daiichi sankyo', 'daiichi'],
    'Daiichi Sankyo Germany': ['daiichi sankyo germany'],
    'Danaher Corporation': ['danaher corporation', 'danaher'],
    'DBV Technologies': ['dbv technologies', 'dbv'],
    'DEAGOSTINI Pharmaceuticals': ['deagostini pharmaceuticals', 'deagostini'],
    'Debiopharm Group': ['debiopharm group', 'debiopharm'],
    'Deciphera Pharmaceuticals': ['deciphera pharmaceuticals', 'deciphera'],
    'Delenex Therapeutics': ['delenex therapeutics', 'delenex'],
    'Denali Therapeutics': ['denali therapeutics', 'denali'],
    'Divis Laboratories': ['divis laboratories', 'divis'],
    'Dixi Polytool': ['dixi polytool'],
    'Dong-A ST': ['dong-a st', 'dong-a'],
    'Dr. Falk Pharma': ['dr. falk pharma', 'dr falk pharma', 'falk pharma'],
    'Dr. Reddy\'s Laboratories': ['dr. reddy\'s laboratories', 'dr reddys', 'dr reddy'],
    'Drägerwerk AG': ['drägerwerk ag', 'dragerwerk'],
    'Dyne Therapeutics Germany': ['dyne therapeutics germany'],
    'Editas Medicine': ['editas medicine', 'editas'],
    'Egis Pharmaceuticals': ['egis pharmaceuticals', 'egis'],
    'Eisai': ['eisai'],
    'Eli Lilly': ['eli lilly', 'lilly'],
    'Emergent BioSolutions': ['emergent biosolutions'],
    'Endo International': ['endo international', 'endo'],
    'Epigenomics': ['epigenomics'],
    'Erytech Pharma': ['erytech pharma', 'erytech'],
    'Esbatech': ['esbatech'],
    'Esteve': ['esteve'],
    'Eurofins Genomics Europe': ['eurofins genomics europe', 'eurofins genomics'],
    'Evotec': ['evotec'],
    'Exact Sciences': ['exact sciences', 'exact'],
    'Exelixis': ['exelixis'],
    'Faes Farma': ['faes farma'],
    'Farmabios Germany': ['farmabios germany'],
    'Fate Therapeutics': ['fate therapeutics', 'fate'],
    'Ferrer': ['ferrer'],
    'Ferring Pharmaceuticals': ['ferring pharmaceuticals', 'ferring'],
    'FibroGen': ['fibrogen'],
    'Forma Therapeutics': ['forma therapeutics', 'forma'],
    'Foundation Medicine': ['foundation medicine'],
    'Freenome': ['freenome'],
    'Fresenius': ['fresenius'],
    'Fresenius Kabi': ['fresenius kabi', 'fresenius'],
    'Fresenius Medical Care': ['fresenius medical care', 'fmc'],
    'Fujifilm Diosynth Biotechnologies': ['fujifilm diosynth biotechnologies', 'fujifilm diosynth'],
    'Fusion Pharmaceuticals': ['fusion pharmaceuticals', 'fusion'],
    'Galapagos NV': ['galapagos nv', 'galapagos'],
    'Galderma': ['galderma'],
    'Gedeon Richter': ['gedeon richter'],
    'Gedeon Richter Deutschland': ['gedeon richter deutschland'],
    'Generate Biomedicines': ['generate biomedicines', 'generate'],
    'Genentech': ['genentech'],
    'Genfit': ['genfit'],
    'Genmab': ['genmab'],
    'Geron': ['geron'],
    'Gerresheimer': ['gerresheimer'],
    'Gilead Sciences': ['gilead sciences', 'gilead'],
    'Ginkgo Bioworks': ['ginkgo bioworks', 'ginkgo'],
    'GlaxoSmithKline': ['glaxosmithkline', 'gsk'],
    'Glenmark Pharmaceuticals': ['glenmark pharmaceuticals', 'glenmark'],
    'Global Blood Therapeutics': ['global blood therapeutics'],
    'GNA Biosolutions': ['gna biosolutions'],
    'Green Cross': ['green cross'],
    'Grifols': ['grifols'],
    'Grünenthal': ['grünenthal', 'grunenthal'],
    'GSK': ['gsk'],
    'Guardant Health': ['guardant health', 'guardant'],
    'Guerbet': ['guerbet'],
    'Gustav F. Giemsa': ['gustav f. giemsa'],
    'HaemoScan': ['haemoscan'],
    'Halliburton': ['halliburton'],
    'Hanmi Pharmaceutical': ['hanmi pharmaceutical', 'hanmi'],
    'Haplogen': ['haplogen'],
    'Helsinn Group': ['helsinn group', 'helsinn'],
    'Helsinn Healthcare': ['helsinn healthcare', 'helsinn'],
    'Hemopharm': ['hemopharm'],
    'Heidelberg Pharma': ['heidelberg pharma'],
    'Hexal': ['hexal'],
    'Hikma Pharmaceuticals': ['hikma pharmaceuticals', 'hikma'],
    'Hisamitsu Pharmaceutical': ['hisamitsu pharmaceutical', 'hisamitsu'],
    'Hollister': ['hollister'],
    'Hookipa Pharma': ['hookipa pharma', 'hookipa'],
    'Horizon Therapeutics': ['horizon therapeutics', 'horizon'],
    'Hualan Biological Engineering': ['hualan biological engineering', 'hualan'],
    'Humedics': ['humedics'],
    'Hutchmed': ['hutchmed', 'hutchison china meditech'],
    'I-Mab': ['i-mab'],
    'Icon': ['icon'],
    'IDT Biologika': ['idt biologika', 'idt'],
    'IFB Stroemungsforschung': ['ifb stroemungsforschung'],
    'Illumina': ['illumina'],
    'Immatics': ['immatics'],
    'Immunocore': ['immunocore'],
    'Incyte': ['incyte'],
    'Innate Pharma': ['innate pharma'],
    'Innovent Biologics': ['innovent biologics', 'innovent'],
    'Insitro': ['insitro'],
    'Intellia Therapeutics': ['intellia therapeutics', 'intellia'],
    'Intercept Pharmaceuticals': ['intercept pharmaceuticals', 'intercept'],
    'Inventiva': ['inventiva'],
    'Inovio Pharmaceuticals': ['inovio pharmaceuticals', 'inovio'],
    'Invitae': ['invitae'],
    'Ionis Pharmaceuticals': ['ionis pharmaceuticals', 'ionis'],
    'Ipsen': ['ipsen'],
    'IQVIA': ['iqvia'],
    'Isarna Therapeutics': ['isarna therapeutics', 'isarna'],
    'ITM Isotope Technologies Munich': ['itm isotope technologies munich', 'itm munich'],
    'Jazz Pharmaceuticals': ['jazz pharmaceuticals', 'jazz pharma'],
    'Jenapharm': ['jenapharm'],
    'JenaValve Technology': ['jenavalve technology', 'jenavalve'],
    'Jiangsu Hengrui Medicine': ['jiangsu hengrui medicine', 'hengrui'],
    # 'johnson & johnson' added so the company's own spelling is recognised.
    'Johnson & Johnson': ['johnson & johnson', 'johnson and johnson', 'janssen', 'j&j'],
    'Juno Therapeutics': ['juno therapeutics', 'juno'],
    'JW Pharmaceutical': ['jw pharmaceutical', 'jw'],
    'Kaken Pharmaceutical': ['kaken pharmaceutical', 'kaken'],
    'Karyopharm Therapeutics': ['karyopharm therapeutics', 'karyopharm'],
    'Kenta Biotech': ['kenta biotech', 'kenta'],
    'Kiadis Pharma': ['kiadis pharma', 'kiadis'],
    'Kinarus': ['kinarus'],
    'Kissei Pharmaceutical': ['kissei pharmaceutical', 'kissei'],
    'Kite Pharma': ['kite pharma', 'kite'],
    'Klinikum rechts der Isar Translational Center': ['rechts der isar translational center'],
    'KRKA': ['krka'],
    'Kura Oncology': ['kura oncology', 'kura'],
    'Kuros Biosciences': ['kuros biosciences', 'kuros'],
    'Kyowa Kirin': ['kyowa kirin'],
    'Labatec Pharma': ['labatec pharma', 'labatec'],
    'Labor Berlin': ['labor berlin'],
    'Laboratorios Farmacéuticos Rovi': ['laboratorios farmacéuticos rovi', 'rovi labs'],
    'LEO Pharma': ['leo pharma'],
    'Les Laboratoires Servier': ['les laboratoires servier'],
    'Ligand Pharmaceuticals': ['ligand pharmaceuticals', 'ligand'],
    'Lilly': ['lilly'],
    'Lonza Group': ['lonza group', 'lonza'],
    'Lophius Biosciences': ['lophius biosciences', 'lophius'],
    'Lotus Pharma Germany': ['lotus pharma germany'],
    'LTS Lohmann Therapie-Systeme': ['lts lohmann therapie-systeme', 'lts', 'lohmann therapie systeme'],
    'Lupin': ['lupin'],
    'Lyell Immunopharma': ['lyell immunopharma', 'lyell'],
    'MacroGenics': ['macrogenics'],
    'Magenta Therapeutics': ['magenta therapeutics', 'magenta'],
    'Mainz Biomed': ['mainz biomed'],
    'Mallinckrodt': ['mallinckrodt'],
    'Mammoth Biosciences': ['mammoth biosciences', 'mammoth'],
    'Mankind Pharma': ['mankind pharma', 'mankind'],
    'Mabylon': ['mabylon'],
    'McKesson Corporation': ['mckesson corporation', 'mckesson'],
    'Medacta': ['medacta'],
    'Medac GmbH': ['medac gmbh', 'medac'],
    'Medigene': ['medigene'],
    'Medipan GmbH': ['medipan gmbh', 'medipan'],
    'Medtech/Switzerland': ['medtech/switzerland'],
    'Medtronic': ['medtronic'],
    'Meiji Seika Pharma': ['meiji seika pharma', 'meiji'],
    'Merck & Co.': ['merck & co.', 'merck'],
    'Merck Group': ['merck group'],
    'Merck KGaA': ['merck kgaa', 'merck darmstadt'],
    'Merz Pharma': ['merz pharma', 'merz'],
    'Mersana Therapeutics': ['mersana therapeutics', 'mersana'],
    'Miltenyi Biotec': ['miltenyi biotec', 'miltenyi'],
    'Mirati Therapeutics': ['mirati therapeutics', 'mirati'],
    'Mitsubishi Tanabe Pharma': ['mitsubishi tanabe pharma', 'mitsubishi tanabe'],
    'Moderna': ['moderna'],
    'Molecular Health': ['molecular health'],
    'Molecular Partners': ['molecular partners'],
    'Molecular Templates': ['molecular templates'],
    'MorphoSys': ['morphosys'],
    'Mylan': ['mylan'],
    'Myovant Sciences': ['myovant sciences', 'myovant'],
    'Myriad Genetics': ['myriad genetics', 'myriad'],
    'Mymetics': ['mymetics'],
    'Nanobiotix': ['nanobiotix'],
    'Nanobiotix Germany': ['nanobiotix germany'],
    'Natera': ['natera'],
    'NBE-Therapeutics': ['nbe-therapeutics', 'nbe therapeutics'],
    'NBE-Therapeutics Zurich': ['nbe-therapeutics zurich'],
    'Nemetschek': ['nemetschek'],
    'Neurocrine Biosciences': ['neurocrine biosciences', 'neurocrine'],
    'Neurimmune': ['neurimmune'],
    'NextCure': ['nextcure'],
    'Nektar Therapeutics': ['nektar therapeutics', 'nektar'],
    'Nippon Shinyaku': ['nippon shinyaku'],
    'Novartis': ['novartis'],
    'Novigenix': ['novigenix'],
    'Novo Nordisk': ['novo nordisk'],
    'Noxxon Pharma': ['noxxon pharma', 'noxxon'],
    'Nuvation Bio': ['nuvation bio'],
    'Obseva': ['obseva'],
    'OM Pharma': ['om pharma'],
    'Oncopeptides': ['oncopeptides'],
    'Oncgnostics': ['oncgnostics'],
    'Oncternal Therapeutics': ['oncternal therapeutics', 'oncternal'],
    'Ono Pharmaceutical': ['ono pharmaceutical', 'ono'],
    'Onxeo': ['onxeo'],
    'Orchard Therapeutics': ['orchard therapeutics', 'orchard'],
    'Otsuka Holdings': ['otsuka holdings', 'otsuka'],
    'Pacific Biosciences': ['pacific biosciences', 'pacbio'],
    'Paion': ['paion'],
    'Parexel': ['parexel'],
    'Patheon': ['patheon'],
    'PEI (Paul-Ehrlich-Institut)': ['paul-ehrlich-institut', 'pei'],
    'PerkinElmer': ['perkinelmer'],
    'Perrigo': ['perrigo'],
    'Pfizer': ['pfizer'],
    'PharmaMar': ['pharmamar'],
    'Pharmaplan': ['pharmaplan'],
    'Pharmaselect': ['pharmaselect'],
    'Pharmetheus': ['pharmetheus'],
    'PharmTexx': ['pharmtexx'],
    'Pharming Group N.V.': ['pharming group n.v.', 'pharming'],
    'Pharmacyclics Germany': ['pharmacyclics germany'],
    'Pierre Fabre': ['pierre fabre'],
    'Pieris Pharmaceuticals': ['pieris pharmaceuticals', 'pieris'],
    'Plasmavita Healthcare': ['plasmavita healthcare'],
    'PolyPeptide Group': ['polypeptide group', 'polypeptide'],
    'Polyphor': ['polyphor'],
    'Polpharma': ['polpharma'],
    'PPD': ['ppd'],
    'Preglem': ['preglem'],
    'Prime Medicine': ['prime medicine', 'prime'],
    'ProteoMediX': ['proteomedix'],
    'Puma Biotechnology': ['puma biotechnology', 'puma'],
    'Qiagen': ['qiagen'],
    'Radius Health': ['radius health'],
    'Ratiopharm': ['ratiopharm'],
    'RECRO Pharma': ['recro pharma'],
    'Recursion Pharmaceuticals': ['recursion pharmaceuticals', 'recursion'],
    'REGENXBIO': ['regenxbio'],
    'Regeneron Pharmaceuticals': ['regeneron pharmaceuticals', 'regeneron'],
    'Relay Therapeutics': ['relay therapeutics', 'relay'],
    'Rentschler Biopharma': ['rentschler biopharma', 'rentschler'],
    'Rentschler Fill Solutions': ['rentschler fill solutions'],
    'Replimune': ['replimune'],
    'Regen Lab': ['regen lab'],
    'Revolution Medicines': ['revolution medicines', 'revolution'],
    'RIEMSER Pharma': ['riemser pharma', 'riemser'],
    'Roche': ['roche', 'f. hoffmann'],
    'Rohto Pharmaceutical': ['rohto pharmaceutical', 'rohto'],
    'Rovi': ['rovi'],
    'Rubius Therapeutics': ['rubius therapeutics', 'rubius'],
    'Sage Therapeutics': ['sage therapeutics'],
    'Samsung Biologics': ['samsung biologics'],
    'Samsung Bioepis': ['samsung bioepis'],
    'Sana Biotechnology': ['sana biotechnology', 'sana'],
    'Sandoz': ['sandoz'],
    'Sanofi': ['sanofi'],
    'Sanofi Pasteur': ['sanofi pasteur'],
    'Santhera Pharmaceuticals': ['santhera pharmaceuticals', 'santhera'],
    'Sartorius': ['sartorius'],
    'Sawai Pharmaceutical': ['sawai pharmaceutical', 'sawai'],
    'Scil Proteins': ['scil proteins', 'scil'],
    'Schott Pharma': ['schott pharma', 'schott'],
    'Seagen': ['seagen'],
    'seleon GmbH': ['seleon gmbh', 'seleon'],
    'Sensirion': ['sensirion'],
    'Servier': ['servier'],
    'Serum Institute of India': ['serum institute of india', 'serum institute'],
    'Shanghai Fosun Pharmaceutical': ['shanghai fosun pharmaceutical', 'fosun pharma'],
    'SHL Medical': ['shl medical', 'shl'],
    'Shield Therapeutics': ['shield therapeutics', 'shield'],
    'Shionogi': ['shionogi'],
    'Shire': ['shire'],
    'Siegfried AG': ['siegfried ag', 'siegfried'],
    'Sinopharm': ['sinopharm'],
    'Smith & Nephew': ['smith & nephew', 'smith nephew'],
    'Sobi': ['sobi'],
    'Sosei Heptares': ['sosei heptares', 'sosei'],
    'Sorrento Therapeutics': ['sorrento therapeutics', 'sorrento'],
    'Sophia Genetics': ['sophia genetics'],
    'Sophion Bioscience': ['sophion bioscience', 'sophion'],
    'SOTIO Biotech': ['sotio biotech', 'sotio'],
    'Spark Therapeutics': ['spark therapeutics', 'spark'],
    'Spexis': ['spexis'],
    'Stada Arzneimittel': ['stada arzneimittel', 'stada'],
    'Stallergenes Greer Switzerland': ['stallergenes greer switzerland', 'stallergenes'],
    'Steinhoff Pharma': ['steinhoff pharma', 'steinhoff'],
    'Straumann Group': ['straumann group', 'straumann'],
    'Streamline Health': ['streamline health'],
    'Sumitomo Dainippon Pharma': ['sumitomo dainippon pharma', 'sumitomo'],
    'Sun Pharma': ['sun pharma', 'sun pharmaceutical'],
    'Sunstar': ['sunstar'],
    'Surface Oncology': ['surface oncology', 'surface'],
    'Switzerland Innovation': ['switzerland innovation'],
    'Symphogen Germany': ['symphogen germany'],
    'Synaptic Biomedical': ['synaptic biomedical'],
    'Synlab AG': ['synlab', 'synlab ag'],
    'Syneos Health': ['syneos health', 'syneos'],
    'Synthetic Biologics': ['synthetic biologics'],
    'SygnoMed': ['sygnomed'],
    'Takeda': ['takeda'],
    'Taysha Gene Therapies': ['taysha gene therapies', 'taysha'],
    'TCR2 Therapeutics': ['tcr2 therapeutics', 'tcr2'],
    'Teva Pharmaceutical': ['teva pharmaceutical', 'teva'],
    'TG Therapeutics': ['tg therapeutics', 'tg'],
    'Thermo Fisher Scientific': ['thermo fisher scientific', 'thermo fisher'],
    'Tillotts Pharma': ['tillotts pharma', 'tillotts'],
    'Tillotts Pharma AG': ['tillotts pharma ag'],
    'Tigen Pharma': ['tigen pharma', 'tigen'],
    'TOLREMO therapeutics': ['tolremo therapeutics', 'tolremo'],
    'Torii Pharmaceutical': ['torii pharmaceutical', 'torii'],
    'Torrent Pharmaceuticals': ['torrent pharmaceuticals', 'torrent'],
    'Transcatheter Technologies': ['transcatheter technologies'],
    'Transgene': ['transgene'],
    'Trillium Therapeutics': ['trillium therapeutics', 'trillium'],
    'Tsumura': ['tsumura'],
    'Turning Point Therapeutics': ['turning point therapeutics', 'turning point'],
    'Twist Bioscience': ['twist bioscience', 'twist'],
    'uniQure': ['uniqure'],
    'UCB': ['ucb'],
    'UCB Pharma Germany': ['ucb pharma germany'],
    'Valneva': ['valneva'],
    'Vaccibody Germany': ['vaccibody germany'],
    'Veracyte': ['veracyte'],
    'Verona Pharma': ['verona pharma', 'verona'],
    'Verovaccines': ['verovaccines'],
    'Verve Therapeutics': ['verve therapeutics', 'verve'],
    'Vertex Pharmaceuticals': ['vertex pharmaceuticals', 'vertex'],
    'Vetter Pharma': ['vetter pharma', 'vetter'],
    'Viatris': ['viatris'],
    'Vifor Pharma': ['vifor pharma', 'vifor'],
    'ViroVet Germany': ['virovet germany'],
    'Vision Health': ['vision health'],
    'Vita 34': ['vita 34'],
    'Wacker Biotech': ['wacker biotech', 'wacker chemie'],
    'Waters Corporation': ['waters corporation', 'waters'],
    'Weleda AG': ['weleda ag', 'weleda'],
    'WuXi AppTec': ['wuxi apptec', 'wuxi'],
    'WuXi Biologics': ['wuxi biologics'],
    'Wyeth': ['wyeth'],
    'Xencor': ['xencor'],
    'Y-mAbs Therapeutics': ['y-mabs therapeutics', 'y-mabs'],
    'Ypsomed': ['ypsomed'],
    'Yuhan Corporation': ['yuhan corporation', 'yuhan'],
    'Zai Lab': ['zai lab'],
    'Zeltia': ['zeltia'],
    'Zentiva': ['zentiva'],
    'Zimmer Biomet': ['zimmer biomet', 'zimmer'],
    'Zur Rose Group': ['zur rose group', 'zur rose'],
    'Zydus Cadila': ['zydus cadila', 'zydus'],
    'Zymeworks': ['zymeworks'],
    'Zymergen': ['zymergen']
}



# -*- coding: utf-8 -*-
"""
Clinical Trials asset extractor with user-configurable filters.

Before running, set the boolean flags PHASE1 through PHASE4, ACADEMIC_HOSPITAL,
OTHERS, and APPROVED_DRUGS (True means YES / include, False means NO / exclude),
followed by the DISEASE_NAME and MAX_ASSETS settings, in the
"User Configuration Input" section below.
Example configuration:
    PHASE1 = True
    PHASE2 = False
    PHASE3 = True
    PHASE4 = True
    ACADEMIC_HOSPITAL = True
    OTHERS = True
    APPROVED_DRUGS = True
    DISEASE_NAME = "Crohn Disease"
    MAX_ASSETS = 200
"""



import re
import json
import time
import unicodedata
import requests
import urllib.parse
from rapidfuzz import fuzz
from bs4 import BeautifulSoup
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed



# ---------------- User Configuration Input ----------------
# Valid keys: PHASE1, PHASE2, PHASE3, PHASE4, ACADEMIC_HOSPITAL, OTHERS, APPROVED_DRUGS
# Each flag is a Python boolean: True includes the category, False excludes it.
PHASE1 = False    # Phase 1 trials are excluded
PHASE2 = True     # Phase 2 trials are included
PHASE3 = True     # Phase 3 trials are included
PHASE4 = False    # Phase 4 trials are excluded
ACADEMIC_HOSPITAL = False    # trials led by academic / hospital sponsors are excluded
OTHERS = True    # trials whose phase rank is 0 (no recognised phase) are kept
APPROVED_DRUGS = True    # FDA-approved drugs are added to the asset list



# Condition passed to ClinicalTrials.gov and to the FDA label search.
DISEASE_NAME = "Crohn Disease"
# Upper bound on the number of assets kept (and enriched with mechanisms) in main().
MAX_ASSETS = 20



# Timeout passed to the HTTP requests made through the shared sessions.
DEFAULT_TIMEOUT = 12
# pageSize requested from the ClinicalTrials.gov studies endpoint.
CT_PAGE_SIZE = 200
# NOTE(review): the two thresholds below are not referenced in the visible part of this file.
FUZZ_THRESHOLD = 85
TITLE_MATCH_THRESHOLD = 80
# Thread-pool size for mechanism lookups: MAX_ASSETS/10, clamped to the range 2..8.
MAX_WORKERS = min(8, max(2, int(MAX_ASSETS/10)))



# Lower-case terms that mark a sponsor as non-pharmaceutical; build_grouped_trials_fuzzy
# checks them against the normalised sponsor name of sponsors not in KNOWN_COMPANIES.
EXCLUDED_SPONSORS = [
    'hospital','university','medical school','clinic','center','centre',
    'foundation','college','society','association','research institute',
    'research center','research centre','department','unit','network',
    'hospital system','trust','government','public health','nhs','va',
    'ministry of health','state','city','county','national institute',
    'federal','academy','school',"children's",'pediatrics','community'
]



# HTTP sessions
# One shared requests.Session per remote service: ClinicalTrials.gov, ChEMBL, Open Targets.
session_ct = requests.Session()
session_chembl = requests.Session()
session_ot = requests.Session()



# API endpoints
# ChEMBL molecule search (JSON) used by chembl_lookup to resolve a ChEMBL id.
CHEMBL_BASE = "https://www.ebi.ac.uk/chembl/api/data/molecule.json"
# Open Targets Platform GraphQL endpoint used by post_gql.
OT_GQL = "https://api.platform.opentargets.org/api/v4/graphql"



# GraphQL queries for mechanism discovery
# GQL_DRUG fetches one drug by its ChEMBL id together with its mechanism-of-action rows.
GQL_DRUG = """
query drugByChembl($chemblId: String!) {
  drug(chemblId: $chemblId) {
    id
    name
    mechanismsOfAction { rows { mechanismOfAction actionType targetName targets { approvedSymbol approvedName } references { urls } } }
  }
}
"""


# GQL_SEARCH is the fallback: a free-text search (first 10 hits) restricted to the
# entity names supplied by the caller, returning the same mechanism-of-action rows.
GQL_SEARCH = """
query searchDrug($queryString: String!, $entityNames: [String!]) {
  search(query: $queryString, entityNames: $entityNames, page: {index: 0, size: 10}) {
    hits { object { ... on Drug { id name mechanismsOfAction { rows { mechanismOfAction actionType targetName targets { approvedSymbol approvedName } } } } } }
  }
}
"""



def strip_accents(s: str) -> str:
    """Return *s* without combining diacritical marks ("" for falsy input).

    The text is NFKD-decomposed first, so accented letters split into a base
    character plus combining marks, and the marks are then dropped.
    """
    if not s:
        return ""
    decomposed = unicodedata.normalize("NFKD", s)
    kept = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return "".join(kept)



def normalize_text(s: str) -> str:
    """Lower-case *s*, strip accents and reduce it to space-separated a-z/0-9 words.

    Every character other than a-z, 0-9 or whitespace becomes a space, and runs
    of whitespace collapse to a single space. Falsy input yields "".
    """
    if not s:
        return ""
    lowered = strip_accents(s.lower())
    spaced = re.sub(r'[^a-z0-9\s]', ' ', lowered)
    # split()/join collapses whitespace runs and trims both ends in one step.
    return " ".join(spaced.split())



def canonical_asset_key(name: str) -> str:
    """Reduce an intervention name to a lower-case alphanumeric comparison key.

    After accent stripping and lower-casing, the substitutions below run in
    order; the result is trimmed. Falsy input yields "".
    """
    if not name:
        return ""
    text = strip_accents(name).lower().strip()
    substitutions = (
        # leading numbering such as "1." or "2 -"
        (r'^[\d\.\,]+\s*[-:]?\s*', '', 0),
        # parenthesised remarks
        (r'\(.*?\)', ' ', 0),
        # dose / unit fragments
        (r'\b\d+\s*(mg|g|ml|mcg|µg|units|tablet[s]?|capsule[s]?|dose|iu)\b', ' ', re.I),
        # anything that is not a-z / 0-9
        (r'[^a-z0-9]+', ' ', 0),
        # whitespace runs
        (r'\s+', ' ', 0),
    )
    for pattern, replacement, flags in substitutions:
        text = re.sub(pattern, replacement, text, flags=flags)
    return text.strip()



# Substrings that flag a sponsor name as an academic or hospital organisation.
_ACADEMIC_SUBSTRINGS = (
    'hospital', 'clinic', 'university', 'institute', 'research', 'centre', 'center',
    'foundation', 'college', 'school',
)
# Short acronyms are matched as whole words only: as bare substrings they also
# hit company names such as "Novartis" or "Teva" (both contain "va").
_ACADEMIC_ACRONYM_RE = re.compile(r'\b(?:nhs|va)\b')


def unify_sponsor_name(raw_name: str) -> str:
    """Map a raw lead-sponsor name to a canonical sponsor label.

    Args:
        raw_name: Sponsor name as reported by the trial registry.

    Returns:
        'Academic / Hospital' when the name carries a personal title (MD, Prof.),
        has more than three words, or contains an academic/hospital keyword;
        otherwise the canonical company name whose variant occurs in the name
        (looked up in KNOWN_COMPANIES and ``mapping``); otherwise 'Other'.
        Empty input yields 'Other'.
    """
    if not raw_name:
        return 'Other'
    s = raw_name.strip()
    # Title tokens map to Academic / Hospital
    if re.search(r'\b(MD|M\.D\.|Prof\.?|Professor)\b', s, flags=re.I):
        return 'Academic / Hospital'
    # Heuristic kept from the original: long names are treated as institutions.
    if len(s.split()) > 3:
        return 'Academic / Hospital'
    low = s.lower()
    if any(tok in low for tok in _ACADEMIC_SUBSTRINGS) or _ACADEMIC_ACRONYM_RE.search(low):
        return 'Academic / Hospital'
    # Drop legal-form suffixes before matching against known company variants.
    s2 = re.sub(r'\b(inc|llc|ltd|limited|corp|corporation|gmbh|ag|plc|co|s\.a\.)\b\.?', '', s, flags=re.I)
    s2 = re.sub(r'\s+', ' ', s2).strip()
    mapping_dict = {
        **{c: [c.lower()] for c in KNOWN_COMPANIES},
        # add specific entries from mapping
        **mapping
    }
    low2 = s2.lower()
    for canon, variants in mapping_dict.items():
        if any(v in low2 for v in variants) or canon.lower() in low2:
            return canon
    return 'Other'



# Rank scale shared by the phase helpers below:
#   0 = unknown / not applicable, 1 = Early Phase 1, 2 = Phase 1,
#   3 = Phase 2, 4 = Phase 3, 5 = Phase 4.
_EARLY_PHASE1_RE = re.compile(r'EARLY[\s_-]*PHASE[\s_-]*(?:1|I)(?![A-Z0-9])')
_ARABIC_PHASE_RE = re.compile(r'PHASE[\s_-]*([1-4])')
# Roman numerals count only after "PHASE" or as a standalone token (optionally
# with an a/b sub-phase letter), never as letters inside an ordinary word.
_ROMAN_PHASE_RE = re.compile(r'(?:PHASE[\s_-]*|(?<![A-Z0-9]))(IV|III|II|I)[AB]?(?![A-Z0-9])')
_BARE_DIGIT_RE = re.compile(r'(?<![A-Z0-9])([1-4])(?![A-Z0-9])')
_ROMAN_TO_PHASE = {'I': 1, 'II': 2, 'III': 3, 'IV': 4}
_RANK_TO_LABEL = {1: 'EARLY_PHASE1', 2: 'PHASE1', 3: 'PHASE2', 4: 'PHASE3', 5: 'PHASE4'}
_NOT_APPLICABLE = frozenset({'NA', 'N/A', 'NOT APPLICABLE', 'NOT_APPLICABLE'})


def _phase_rank_from_text(text: str) -> int:
    """Return the rank (0-5) of the highest phase mentioned in upper-cased *text*."""
    found = [int(d) for d in _ARABIC_PHASE_RE.findall(text)]
    found.extend(_ROMAN_TO_PHASE[r] for r in _ROMAN_PHASE_RE.findall(text))
    if not found:
        # A lone digit such as "3" is read as the phase number.
        found = [int(d) for d in _BARE_DIGIT_RE.findall(text)]
    if not found:
        return 0
    highest = max(found)
    if highest == 1 and _EARLY_PHASE1_RE.search(text):
        return 1
    return highest + 1


def phase_rank(phase_str) -> int:
    """Return a sortable rank for a phase description (see the scale above).

    Accepts registry enum values ("PHASE2", "EARLY_PHASE1", "NA"), free text
    ("Phase II", "Phase 2/Phase 3") and bare digits. When several phases are
    mentioned the highest one wins. Falsy or unrecognised input yields 0.
    """
    if not phase_str: return 0
    return _phase_rank_from_text(str(phase_str).upper())



def canonical_phase_label(phase_str) -> str:
    """Return a canonical label for a phase description.

    Returns 'PHASE1'..'PHASE4' or 'EARLY_PHASE1' for recognised phases, 'N/A'
    for falsy input and the not-applicable spellings, and otherwise the
    upper-cased, stripped input.
    """
    if not phase_str: return 'N/A'
    s = str(phase_str).upper()
    rank = _phase_rank_from_text(s)
    if rank:
        return _RANK_TO_LABEL[rank]
    stripped = s.strip()
    return 'N/A' if stripped in _NOT_APPLICABLE else stripped



def get_phase_from_study(s: dict) -> tuple:
    """Return ``(phase_value, phase_rank)`` for a ClinicalTrials.gov study record.

    The phase is read from protocolSection.designModule ('phases', then
    'phase'); when it is a list the first entry is used. A study without a
    usable design module yields ``(None, 0)``.
    """
    try:
        design = s['protocolSection']['designModule']
        phase_raw = design.get('phases') or design.get('phase') or None
    except (KeyError, TypeError, AttributeError):
        return None, 0
    # NOTE(review): multi-phase trials (e.g. Phase 2/Phase 3) are classified by
    # their first listed phase - confirm that this is the intended reading.
    phase_val = phase_raw[0] if isinstance(phase_raw, list) and phase_raw else phase_raw
    return phase_val, phase_rank(phase_val)



def should_exclude_by_phase(r:int)->bool:
    """Return True when the user's flags exclude a trial of phase rank *r*.

    Early Phase 1 (rank 1) follows the PHASE1 flag, ranks 2-5 follow
    PHASE1..PHASE4, and rank 0 (no recognised phase) follows OTHERS.
    """
    if r == 0: return not OTHERS
    if r in (1, 2): return not PHASE1
    if r == 3: return not PHASE2
    if r == 4: return not PHASE3
    if r == 5: return not PHASE4
    return False



# Lower-case terms that is_placebo_asset looks for to discard comparator arms.
PLACEBO_TERMS=['placebo','vehicle','saline','sodium chloride','control','sugar pill','standard of care']



def is_clean_single_asset(name, title):
    """Return True when *name* looks like one usable asset name.

    Names mentioning 'stem cell' are always accepted. Otherwise a leading
    number is dropped, names containing a generic/combination marker are
    rejected, and the remainder must have between 1 and 8 words.
    ``title`` is accepted for interface compatibility and is not inspected.
    """
    if not name:
        return False
    text = strip_accents(name.lower()).strip()
    if 'stem cell' in text:
        return True
    text = re.sub(r'^\d+\s*', '', text)
    for marker in ('generic', 'bioequivalence', 'anda', 'plus', 'combination'):
        if marker in text:
            return False
    return 1 <= len(text.split()) <= 8



def is_placebo_asset(name):
    """Return True when *name* denotes a placebo or comparator arm.

    Each entry of PLACEBO_TERMS is matched as a whole word or phrase (an
    optional plural "s" is tolerated), so "control" no longer flags
    "controlled release" products. Empty or missing names are not placebos.
    """
    if not name or not PLACEBO_TERMS:
        return False
    text = strip_accents(name.lower())
    alternatives = '|'.join(re.escape(term) for term in PLACEBO_TERMS)
    return re.search(rf'\b(?:{alternatives})s?\b', text) is not None



def asset_agnostic_key(name, iv_type=None)->str:
    """Build a dose- and schedule-independent grouping key for an intervention name.

    Names mentioning 'stem cell' collapse to 'stem cell'. Otherwise the type
    prefix, parenthesised remarks, dose/unit fragments, a trailing number and
    schedule words are removed, the text is cut at the first connective word
    (for, to, in, with, of, by, vs, and), and only a-z/0-9/spaces are kept.
    ``iv_type`` is accepted for interface compatibility and is not inspected.
    Falsy input yields "".
    """
    if not name:
        return ""
    text = strip_accents(name).lower().strip()
    if 'stem cell' in text:
        return 'stem cell'
    case_insensitive_removals = (
        r'^(drug|biological|device):\s*',
        r'\(.*?\)',
        r'\b\d+\s*(mg|g|ml|mcg|ug|µg|units|tablet[s]?|capsule[s]?|dose|iu)\b',
        r'\s+\d+\s*$',
        r'\b(daily|weekly|monthly|once|twice|per day|per week|period)\b',
    )
    for pattern in case_insensitive_removals:
        # The text is already lower-case, so IGNORECASE does not change matches.
        text = re.sub(pattern, '', text, flags=re.I)
    text = re.sub(r'\b(for|to|in|with|of|by|vs|and)\b.*', '', text).strip()
    text = re.sub(r'[^a-z0-9 ]+', '', text)
    return re.sub(r'\s+', ' ', text).strip()



def canonical_base(key):
    """Return the merge key for an asset key: 'stem cell' or the key's first word."""
    if 'stem cell' in key:
        return 'stem cell'
    if not key:
        return key
    return key.split()[0]



def collect_all_studies(cond):
    """Fetch every ClinicalTrials.gov study matching condition *cond*.

    Pages through the v2 ``/studies`` endpoint with ``query.intr`` set to
    'DRUG', following ``nextPageToken`` until it is absent.

    Returns:
        The list of study records collected so far. On an HTTP or decoding
        error a warning is printed and the partial list is returned.
    """
    studies, page_token = [], None
    while True:
        params = {'query.cond': cond, 'query.intr': 'DRUG', 'format': 'json', 'pageSize': CT_PAGE_SIZE}
        if page_token:
            params['pageToken'] = page_token
        try:
            resp = session_ct.get("https://clinicaltrials.gov/api/v2/studies", params=params, timeout=DEFAULT_TIMEOUT)
            resp.raise_for_status()
            payload = resp.json() or {}
        except (requests.RequestException, ValueError) as exc:
            # Best effort: keep what was already fetched, but say that the
            # result is incomplete instead of failing silently.
            print(f"Warning: ClinicalTrials.gov request failed after {len(studies)} studies: {exc}")
            break
        if not isinstance(payload, dict):
            break
        studies.extend(payload.get('studies') or [])
        page_token = payload.get('nextPageToken')
        if not page_token:
            break
    return studies



def build_grouped_trials_fuzzy(studies):
    """Filter studies and group their drug/biologic interventions by base asset name.

    For each study the lead sponsor and phase filters are applied first. Each
    remaining DRUG/BIOLOGICAL intervention that is not a placebo and yields a
    clean, non-empty key is merged under its base name, keeping the entry with
    the highest phase rank. A filtering summary is printed.

    Sponsor filtering: sponsors labelled academic/hospital by
    unify_sponsor_name follow the ACADEMIC_HOSPITAL flag; any other sponsor is
    dropped only when it is not in KNOWN_COMPANIES and its normalised name
    contains an EXCLUDED_SPONSORS term as a whole word or phrase.

    Returns:
        ``(g, mc, cm, total)``: ``g`` maps base name -> kept trial entry,
        ``mc`` maps base name -> number of merged interventions, ``cm`` maps
        base name -> sponsor of the last intervention seen, ``total`` is
        ``len(studies)``.
    """
    max_samples = 10  # examples echoed per exclusion category in the summary
    g, mc, cm = {}, {}, {}
    stats = {'total': len(studies), 'excluded_phase': 0, 'excluded_sponsor': 0,
             'excluded_empty_key': 0, 'excluded_not_clean': 0}
    ep, es, eek, en = [], [], [], []

    def note_exclusion(counter, samples, sample):
        """Count one exclusion and keep the first few as printable examples."""
        stats[counter] += 1
        if len(samples) < max_samples:
            samples.append(sample)

    for s in studies:
        try:
            protocol = s['protocolSection']
            ident = protocol['identificationModule']
            n = ident['nctId']
            if not n:
                continue
            link = f"https://clinicaltrials.gov/study/{n}"
            title = ident.get('officialTitle', '') or ''
            inters = protocol['armsInterventionsModule'].get('interventions') or []
            raw_sp = protocol['sponsorCollaboratorsModule']['leadSponsor'].get('name', '') or ''
            sponsor = unify_sponsor_name(raw_sp)
            # Prefix test so that both 'Academic / Hospital' and the longer
            # 'Academic / Hospital / Other' spelling are recognised.
            if sponsor.startswith('Academic / Hospital'):
                sponsor_excluded = not ACADEMIC_HOSPITAL
            else:
                # Whole-word match: plain substrings such as 'va' or 'unit'
                # would also hit canonical company names.
                padded = f" {normalize_text(sponsor)} "
                sponsor_excluded = (
                    sponsor not in KNOWN_COMPANIES
                    and any(f" {term} " in padded for term in EXCLUDED_SPONSORS)
                )
            # NOTE(review): the OTHERS flag is not consulted for sponsors
            # labelled 'Other' here - confirm against the printed overview.
            if sponsor_excluded:
                note_exclusion('excluded_sponsor', es, {'nct': n, 'sponsor': sponsor, 'link': link})
                continue
            pv, pr = get_phase_from_study(s)
            if should_exclude_by_phase(pr):
                note_exclusion('excluded_phase', ep, {'nct': n, 'phase': canonical_phase_label(pv), 'link': link})
                continue
            for iv in inters:
                t = iv.get('type') or ''
                if t.upper() not in ('DRUG', 'BIOLOGICAL'):
                    continue
                nm = iv.get('name') or ''
                if is_placebo_asset(nm):
                    continue
                key = asset_agnostic_key(nm, t)
                if not key:
                    note_exclusion('excluded_empty_key', eek, {'nct': n, 'name': nm, 'link': link})
                    continue
                if not is_clean_single_asset(nm, title):
                    note_exclusion('excluded_not_clean', en, {'nct': n, 'name': nm, 'link': link})
                    continue
                base = canonical_base(key)
                entry = {'nct': n, 'raw_name': nm, 'url': link,
                         'phase_val': pv, 'phase_rank': pr, 'company': sponsor, 'canonical_key': key}
                if base in g:
                    mc[base] += 1
                    if pr > g[base]['phase_rank']:
                        g[base] = entry
                else:
                    g[base] = entry
                    mc[base] = 1
                cm[base] = sponsor
        except Exception:
            # Malformed study record: skip it, but let KeyboardInterrupt and
            # SystemExit propagate.
            continue

    print("Filtering summary:")
    print(f"  Total studies collected: {stats['total']}")
    print(f"  Excluded by Phase: {stats['excluded_phase']}")
    for x in ep: print(f"    NCT: {x['nct']}, Phase: {x['phase']}, Link: {x['link']}")
    print(f"  Excluded by Sponsor: {stats['excluded_sponsor']}")
    for x in es: print(f"    NCT: {x['nct']}, Sponsor: {x['sponsor']}, Link: {x['link']}")
    print(f"  Excluded assets (empty key): {stats['excluded_empty_key']}")
    for x in eek: print(f"    NCT: {x['nct']}, Name: {x['name']}, Link: {x['link']}")
    print(f"  Excluded assets (not clean): {stats['excluded_not_clean']}")
    for x in en: print(f"    NCT: {x['nct']}, Name: {x['name']}, Link: {x['link']}")
    merged_stats = "; ".join(f"{k}:{v} merged" for k, v in mc.items())
    print(f"  Merged entry stats per asset: {merged_stats}")
    return g, mc, cm, stats['total']



def select_latest_trial_per_asset(g,mc,cm):
    """Turn the grouped trial entries into output asset records sorted by canonical key.

    ``g`` maps base name -> kept trial entry and ``mc`` maps base name ->
    merged-entry count. ``cm`` is accepted for interface compatibility and is
    not read. A stage of 'N/A' is reported as 'OTHERS' when OTHERS is set.
    """
    def to_asset(base, trial):
        stage = canonical_phase_label(trial['phase_val'])
        if stage in ('N/A', None) and OTHERS:
            stage = 'OTHERS'
        return {
            'canonical_key': trial['canonical_key'],
            'asset': trial['canonical_key'],
            'company': trial['company'],
            'stage': stage,
            'url': trial['url'],
            'nct': trial['nct'],
            'raw_name': trial['raw_name'],
            'subtypes': DISEASE_NAME,
            'merged_entry_count': mc[base],
            'mech': 'Unknown'  # placeholder until mechanism enrichment runs
        }

    records = [to_asset(base, trial) for base, trial in g.items()]
    records.sort(key=lambda record: record['canonical_key'])
    return records



# ---------- FDA Approved Drugs Functions ----------
def first_str(obj, default="Unknown"):
    """Coerce an API field to one string.

    A string is returned unchanged; for a list, the first element that is a
    non-blank string is returned; anything else (or a list without such an
    element) yields *default*.
    """
    if isinstance(obj, str):
        return obj
    if isinstance(obj, list):
        candidates = (item for item in obj if isinstance(item, str) and item.strip())
        return next(candidates, default)
    return default



def remove_suffix(name):
    """Strip a trailing "-xxxx" letter suffix (3-5 letters, e.g. -dyyb) from a name.

    The input is coerced with first_str first; an empty result yields "Unknown".
    """
    text = first_str(name)
    if text:
        # Only a dash followed by 3-5 letters at the very end is removed.
        return re.sub(r'-[a-zA-Z]{3,5}$', '', text.strip(), flags=re.IGNORECASE)
    return "Unknown"



def clean_asset_name(name):
    """Return the capitalised first word of *name* after suffix and punctuation removal.

    When nothing is left after cleaning, the (empty) cleaned text is returned.
    """
    cleaned = re.sub(r"[^a-zA-Z0-9\s]", "", remove_suffix(name)).strip()
    words = cleaned.split()
    return words[0].capitalize() if words else cleaned



def unify_company_name(name):
    """Return the manufacturer name trimmed and title-cased ("Unknown" when empty)."""
    company = first_str(name)
    return company.strip().title() if company else "Unknown"



def extract_root(name):
    """Return the capitalised root of a drug name, used to de-duplicate approvals.

    The name is coerced with first_str, its trailing letter suffix is removed,
    and the text before the first dash, underscore or whitespace is
    capitalised. Empty input yields "Unknown"; when the name starts with a
    delimiter the suffix-stripped name is returned unchanged.
    """
    name = first_str(name)
    if not name or not isinstance(name, str):
        return "Unknown"
    name = remove_suffix(name).strip()
    # re.split returns a list, so the first *element* is what must be checked;
    # the original tested the list itself with isinstance(..., str).
    parts = re.split(r"[-_\s]", name)
    return parts[0].capitalize() if parts and parts[0] else name



def parse_date(date_str):
    """Parse a date string into a ``datetime``, or return None when it cannot be parsed.

    Accepted layouts: compact YYYYMMDD (the layout of openFDA's
    ``effective_time``), YYYY-MM-DD, YYYY/MM/DD, DD-MM-YYYY and DD/MM/YYYY.
    Surrounding whitespace is ignored; non-string or empty input yields None.
    """
    if not date_str or not isinstance(date_str, str):
        return None
    text = date_str.strip()
    formats = ["%Y-%m-%d", "%Y/%m/%d", "%d-%m-%Y", "%d/%m/%Y"]
    if len(text) == 8 and text.isdigit():
        # strptime is lenient about digit counts, so the compact layout is
        # tried only for exactly eight digits.
        formats.insert(0, "%Y%m%d")
    for fmt in formats:
        try:
            return datetime.strptime(text, fmt)
        except ValueError:
            continue
    return None



# Application numbers that Drugs@FDA can resolve; its ApplNo parameter takes
# the numeric part only (e.g. "BLA125057" -> "125057").
_DRUGS_AT_FDA_APPNO_RE = re.compile(r'^(?:NDA|ANDA|BLA)\s*(\d+)$', re.IGNORECASE)


def fetch_application_number(openfda, d):
    """Return the application number of a label record, or "" when none is present.

    ``d`` is one openFDA label result and ``openfda`` its 'openfda' sub-dict.
    The record itself is checked first, then ``openfda``; a value found under
    one of the legacy keys (nda, anda, bla, seq_num) takes precedence.
    """
    # default="" keeps a missing field falsy so the `or` chain can fall
    # through; first_str's standard default "Unknown" is truthy.
    appnum = (first_str(d.get('application_number'), default="") or
              first_str(openfda.get("application_number"), default="") or
              "")
    for label in ["nda", "anda", "bla", "seq_num"]:
        n = first_str(openfda.get(label), default="")
        if n and n.lower() != 'unknown':
            appnum = n
            break
    return appnum



def fallback_link(asset):
    """Return a DailyMed search URL for *asset*, used when no Drugs@FDA link can be built."""
    safe_generic = urllib.parse.quote_plus(asset)
    return f"https://dailymed.nlm.nih.gov/dailymed/search.cfm?query={safe_generic}"



def _drugs_at_fda_url(appno):
    """Return the Drugs@FDA overview URL for an NDA/ANDA/BLA number, else ""."""
    match = _DRUGS_AT_FDA_APPNO_RE.match((appno or "").strip())
    if not match:
        return ""
    return ("https://www.accessdata.fda.gov/scripts/cder/daf/index.cfm"
            f"?event=overview.process&ApplNo={match.group(1)}")



def query_fda_drugs(indication):
    """Query the openFDA drug-label API for labels whose indications mention *indication*.

    Returns:
        A list of asset records (stage 'Approved'), one per label result, each
        carrying ``approval_date`` and ``parsed_approval_date`` for later
        de-duplication. Network errors, non-200 responses and undecodable
        bodies are reported on stdout and yield an empty list.
    """
    safe_query = urllib.parse.quote(f'"{indication}"')
    api_url = f"https://api.fda.gov/drug/label.json?search=indications_and_usage:{safe_query}&limit=100"
    try:
        r = requests.get(api_url, timeout=DEFAULT_TIMEOUT)
    except requests.RequestException as exc:
        print(f"Error fetching FDA data for {indication!r}: {exc} (skipping)")
        return []
    if r.status_code != 200:
        print(f"Error fetching FDA data: status={r.status_code} for {indication!r} (skipping)")
        return []
    try:
        data = r.json()
    except ValueError as exc:
        print(f"Error decoding FDA data for {indication!r}: {exc} (skipping)")
        return []
    if not isinstance(data, dict):
        return []
    results = []
    for d in data.get("results") or []:
        openfda = d.get("openfda") or {}
        brand = first_str(openfda.get("brand_name", "Unknown"))
        generic = first_str(openfda.get("generic_name", "Unknown"))
        # Prefer the generic name; fall back to the brand name.
        raw_name = generic if generic != "Unknown" and generic else brand
        asset = clean_asset_name(raw_name)
        company = unify_company_name(openfda.get("manufacturer_name", "Unknown"))
        canonical_key = f"{asset.lower()} {company.lower()}"
        appno = fetch_application_number(openfda, d)
        url = _drugs_at_fda_url(appno) or fallback_link(asset)
        # NOTE(review): effective_time is the first date source tried; it
        # presumably dates the label version rather than the approval - confirm.
        approval_date = (
            d.get("effective_time") or d.get("application_effective_date") or
            d.get("approval_date") or openfda.get("approval_date", "")
        )
        parsed_approval_date = parse_date(approval_date)
        results.append({
            "canonical_key": canonical_key,
            "asset": asset,
            "company": company,
            "stage": "Approved",
            "status": "Approved",
            "url": url,
            "raw_name": raw_name,
            "subtypes": indication,
            "mech": "Unknown",
            "approval_date": approval_date,
            "parsed_approval_date": parsed_approval_date
        })
    return results



def select_first_approved(drug_records):
    """Keep one record per drug root, preferring the earliest parsed approval date.

    A later record replaces the stored one only when it has a parsed date and
    the stored record either has none or has a later one. The returned copies
    no longer carry 'approval_date' / 'parsed_approval_date'.
    """
    earliest = {}
    for candidate in drug_records:
        root = extract_root(candidate["asset"])
        candidate_date = candidate["parsed_approval_date"]
        if root not in earliest:
            earliest[root] = candidate
            continue
        if not candidate_date:
            continue
        incumbent_date = earliest[root]["parsed_approval_date"]
        if not incumbent_date or candidate_date < incumbent_date:
            earliest[root] = candidate

    date_fields = ("parsed_approval_date", "approval_date")
    return [
        {field: value for field, value in record.items() if field not in date_fields}
        for record in earliest.values()
    ]



# ---------- Mechanism of Action Discovery Functions ----------
def chembl_lookup(name):
    """Resolve a drug name to a ChEMBL molecule id, or None when nothing matches.

    Queries are tried in order: exact preferred name, synonym containing the
    name, then preferred name containing each word longer than three
    characters. The first query returning any molecule decides the result;
    request and decoding errors count as "no match" for that query.
    """
    if not name:
        return None

    def first_molecule(params, timeout=None):
        """Return (True, id) for the first molecule of a query, else (False, None)."""
        try:
            resp = session_chembl.get(
                CHEMBL_BASE,
                params=params,
                timeout=DEFAULT_TIMEOUT if timeout is None else timeout,
            )
            if resp.ok:
                molecules = resp.json().get('molecules') or []
                if molecules:
                    return True, molecules[0].get('molecule_chembl_id')
        except Exception:
            pass
        return False, None

    for params in ({'pref_name__iexact': name}, {'molecule_synonyms__icontains': name}):
        matched, chembl_id = first_molecule(params)
        if matched:
            return chembl_id

    # fallback: try token parts (shorter timeout per token)
    for token in name.split():
        if len(token) <= 3:
            continue
        matched, chembl_id = first_molecule({'pref_name__icontains': token}, timeout=6)
        if matched:
            return chembl_id
    return None



def post_gql(session, query, variables):
    """POST a GraphQL query to the Open Targets endpoint and return the decoded JSON.

    Any failure (exception, or a response whose ``ok`` is false) yields ``{}``.
    """
    payload = {'query': query, 'variables': variables}
    try:
        response = session.post(OT_GQL, json=payload, timeout=DEFAULT_TIMEOUT)
        return response.json() if response.ok else {}
    except Exception:
        return {}



def extract_target_names(drug_obj):
    """Return the distinct target names of a drug object, in first-seen order.

    ``drug_obj`` is a dict shaped like the ``drug`` / search-hit objects of the
    GraphQL queries (``mechanismsOfAction.rows[].targets[]``). For each target
    the approved symbol is preferred and the approved name is the fallback.
    Order is preserved so that callers taking the first few names get the same
    result on every run. Missing or empty input yields [].
    """
    if not drug_obj: return []
    rows = (drug_obj.get('mechanismsOfAction') or {}).get('rows') or []
    ordered = {}  # dict keys give de-duplication with stable insertion order
    for row in rows:
        for target in (row or {}).get('targets') or []:
            if not target:
                continue
            # Prefer gene symbol, fallback to approved name
            symbol = target.get('approvedSymbol')
            name = target.get('approvedName')
            if symbol and symbol.strip():
                ordered.setdefault(symbol.strip())
            elif name and name.strip():
                ordered.setdefault(name.strip())
    return list(ordered)



def get_moa_for_asset(display_name, canonical_key):
    """Return up to three target names for an asset as a comma-separated string.

    The asset is resolved to a ChEMBL id (by canonical key, then by display
    name when that differs), looked up in Open Targets by id and, failing
    that, through the search query. 'Unknown' is returned when no id or no
    target can be found.
    """
    search = canonical_key or display_name
    chembl_id = chembl_lookup(search)
    # Retry with the display name only when it is a different string; repeating
    # the identical lookup would just re-issue the same requests.
    if not chembl_id and display_name and display_name != search:
        chembl_id = chembl_lookup(display_name)
    if not chembl_id:
        return 'Unknown'

    # Try direct drug query first
    j = post_gql(session_ot, GQL_DRUG, {'chemblId': chembl_id})
    # `or {}` guards against an explicit "data": null in the response.
    drug = (j.get('data') or {}).get('drug') if isinstance(j, dict) else None
    targets = extract_target_names(drug) if drug else []

    if not targets:
        # Fallback to search query
        js = post_gql(session_ot, GQL_SEARCH, {'queryString': chembl_id, 'entityNames': ['drug']})
        data = (js.get('data') or {}) if isinstance(js, dict) else {}
        hits = (data.get('search') or {}).get('hits') or []
        for h in hits:
            obj = (h or {}).get('object') or {}
            targets = extract_target_names(obj)
            if targets:
                break

    if targets:
        # Return up to 3 targets, joined by comma
        return ', '.join(targets[:3])
    return 'Unknown'



def moa_task(asset):
    """Thread-pool worker: return ``(canonical_key, mechanism)`` for one asset record."""
    canonical = asset['canonical_key']
    return canonical, get_moa_for_asset(asset['asset'], canonical)



def main():
    """Run the full pipeline: collect trial assets, add FDA approvals, enrich and print.

    Steps: print the overview, fetch and filter ClinicalTrials.gov studies,
    optionally append FDA-approved drugs, sort by canonical key, truncate to
    MAX_ASSETS, look up mechanisms of action in a thread pool, and print one
    JSON object per asset.
    """
    print_program_overview()

    t0 = time.time()
    all_assets = []

    # Clinical trials assets
    print(f"Querying ClinicalTrials.gov for: {DISEASE_NAME}")
    studies = collect_all_studies(DISEASE_NAME)
    print(f"Total studies retrieved: {len(studies)}")

    g, mc, cm, total_before = build_grouped_trials_fuzzy(studies)
    ct_assets = select_latest_trial_per_asset(g, mc, cm)
    print(f"Clinical trials assets after filtering: {len(ct_assets)}")

    all_assets.extend(ct_assets)

    # FDA approved drugs
    if APPROVED_DRUGS:
        print(f"\nQuerying FDA for approved drugs: {DISEASE_NAME}")
        # Second keyword: "CD" for Crohn's disease, otherwise the first word
        # of the disease name.
        if "Crohn" in DISEASE_NAME:
            extra_keywords = ["CD"]
        else:
            extra_keywords = DISEASE_NAME.split()[:1]  # empty for a blank name
        # dict.fromkeys de-duplicates in order, so a single-word disease name
        # is not sent to the FDA API twice.
        keywords = [kw for kw in dict.fromkeys([DISEASE_NAME, *extra_keywords]) if kw]
        fda_assets = []
        for kw in keywords:
            fda_assets.extend(query_fda_drugs(kw))
        fda_assets = select_first_approved(fda_assets)
        print(f"FDA approved drugs found: {len(fda_assets)}")
        all_assets.extend(fda_assets)

    # Sort and limit all assets
    all_assets = sorted(all_assets, key=lambda x: x['canonical_key'])
    count_after_filter = len(all_assets)
    print(f"\nTotal assets after filtering: {count_after_filter}")

    # Apply user-specified MAX_ASSETS limit FIRST - only these will get mechanism enrichment
    if count_after_filter > MAX_ASSETS:
        all_assets = all_assets[:MAX_ASSETS]
    remaining_after_limit = len(all_assets)
    print(f"Assets to be returned (limited to max {MAX_ASSETS}): {remaining_after_limit}")

    # MOA enrichment ONLY for the final limited set of assets
    print(f"\nEnriching mechanisms of action for {remaining_after_limit} final assets...")
    moa_results = {}
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex:
        futures = {ex.submit(moa_task, a): a for a in all_assets}
        for fut in as_completed(futures):
            try:
                key, moa = fut.result()
                moa_results[key] = moa
            except Exception as exc:
                # Best effort: the asset keeps 'Unknown', but the failure is reported.
                failed = futures[fut].get('canonical_key')
                print(f"  Mechanism lookup failed for {failed!r}: {exc}")

    # Integrate MOA results into final asset list
    for a in all_assets:
        k = a['canonical_key']
        a['mech'] = moa_results.get(k, 'Unknown')
        # Clean up FDA-specific fields for output consistency
        a.pop('approval_date', None)
        a.pop('parsed_approval_date', None)
        if 'merged_entry_count' not in a:
            a['merged_entry_count'] = 1

    print("\nFinal canonical assets:")
    for a in all_assets:
        print(json.dumps(a, ensure_ascii=False))

    print(f"\nExecution finished in {time.time() - t0:.2f} seconds")


if __name__ == "__main__":
    main()


Collecting rapidfuzz
  Downloading rapidfuzz-3.13.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading rapidfuzz-3.13.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m29.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz
Successfully installed rapidfuzz-3.13.0
CLINICAL TRIALS & FDA APPROVED DRUGS ASSET EXTRACTOR

This program searches for pharmaceutical assets (drugs and biologics) related to
a specific disease condition: 'Crohn Disease'

DATA SOURCES:
• ClinicalTrials.gov - Active clinical trials with drug/biologic interventions
• FDA Drug Labels API - Approved medications with matching indications

FILTERING PROCESS:
1. CLINICAL TRIAL PHASE FILTERING:
   • Phase 1 trials: EXCLUDED
   • Phase 2 trials: INCLUDED
   • Phase 3 trials: INCLUDED
   • Phase 4 trials: EXCLUDED

2. SPONSOR TYPE FILTERING:
   • Academic/Hospital