**Notes:**


*   This code creates a mapping from AACT conditions to disease type categories, and then from disease types to therapy area categories (conditions >> disease type >> therapy area)
*   There are 2,274 conditions in AACT, which we categorized into 165 disease types, and then further categorized into 26 therapy areas
*   Unique Conditions were taken from the AACT database, using the conditions table for each clinical trial
*   Disease type and therapy area categories were taken from various sources identified through our team's research. One major source is "The Novartis Data Science and Artificial Intelligence Challenge", cited in our report. We also used ChatGPT and other online sources to obtain additional disease type and therapy area categories for certain conditions.


In [None]:
import numpy as np
from scipy import stats
import pandas as pd
import matplotlib.pyplot as plt
# Remove the cap on how many columns pandas shows when displaying a DataFrame.
pd.options.display.max_columns = None

In [None]:
# Read the manually curated condition -> disease type table from CSV.
# Judging by the rendered preview, it is wide: one row per condition and one
# 0/1 flag column per disease type.
df_condition_disease = pd.read_csv(
    'nov_24/nov_23/disease_type_mapping_manual.csv',
    sep=',',
)
df_condition_disease

Unnamed: 0,conditions,Allergic Rhinitis,Asthma,Chronic Obstructive Pulmonary Disease,Insomnia,HBV,Pain (neuropathic),Colorectal (Oncology),Soft Tissue Sarcoma,"CNS, Glioblastoma",Multiple Myeloma,Ovarian,Prostate,Renal,Melanoma,Esophageal,Breast,"Lung, Non-Small Cell",Liver,Pancreas,Head/Neck,Endometrial,"Leukemia, Acute Myelogenous","Lymphoma, Non-Hodgkin's",Myelodysplastic Syndrome,"Leukemia, Acute Lymphocytic",Lupus,Age-Related Macular Degeneration,Gastric,Alzheimer's Disease,Diabetic Complications,Acute Coronary Syndromes,Coronary Artery Disease,Peripheral Arterial Disease,Psoriasis,Unspecified Solid Tumor,Multiple Sclerosis,HIV,Other Viral Vaccines,Congestive Heart Failure,Hypertension,Crohn's Disease,Osteoporosis,Thyroid,Mesothelioma,Rheumatoid Arthritis,Ulcerative Colitis,"Leukemia, Chronic Lymphocytic",Thrombotic Disorders,GERD,Irritable Bowel Syndrome,Supportive Care,Type 2 Diabetes,Obesity,HCV,Anxiety,Depression,Bipolar Disorder,Atopic Dermatitis,Cystic Fibrosis,"Lymphoma, Hodgkin's",Bladder,"Leukemia, Chronic Myelogenous","Lung, Small Cell",Sepsis,"CNS, Other","CNS, Medulloblastoma",Movement Disorders,Metastatic Cancer,Uterine fibroids,Myeloproliferative Neoplasms,Attention Deficit Hyperactive Disorder,Constipation,Dyslipidemia,Migraine,Renal Disease,Sjogren's Syndrome,Overactive Bladder,Arrhythmia,Anti-aging (dermatology),Pain (nociceptive),Smoking Cessation,Diabetic Retinopathy,Sexual Dysfunction,Amyotrophic Lateral Sclerosis,Parkinson's Disease,GIST,Osteoarthritis,Endometriosis,Glaucoma,Other Inflammatory Arthritis,Transplantation/GVHD,Hemostasis/Hemophilia,Schizophrenia,Osteosarcoma,Hepatic Fibrosis,NAFLD,Cerebral Palsy,Respiratory Infections,Thalassemia,Bacterial Skin Infection,Benign Prostatic Hyperplasia,Hepatitis Vaccines,HPV,Type 1 Diabetes,"Skin, Basal Cell Carcinoma",Autism,Retinitis Pigmentosa,Unspecified Hematological Cancer,Otitis Media,Epilepsy,Spinal Muscular Atrophies,Anemia,Retinal Vein Occlusion,Hyponatremia,Urinary 
Incontinence,Alcohol Dependence,Unspecified Cancer,Restless Legs Syndrome,Rabies,Contraception,Respiratory Vaccines,Functional Dyspepsia,Testicular,Clostridium difficile,Urinary Tract Infections,Onychomycosis,Cervical,Vector-Borne Disease Vaccines,West Nile Virus (WNV),Pulmonary Fibrosis,Other Bacterial Vaccines,Influenza Vaccines,Growth Disorders,Hyperuricemia/Gout,Gastroparesis,Scleroderma,Dry Eye Syndrome,Intra-abdominal Infections,Menopausal Symptoms,Huntington's Disease,Cardiomyopathy,Sickle Cell Disease,Neuroendocrine,Infertility,Cytomegalovirus Infection (CMV),Dementia (non-Alzheimer's),Neonatal Brain Injury,Stroke (neuroprotection),Bone Fracture Healing,Lysosomal Storage Disorders,Infant Respiratory Distress Syndrome,Other Infection,Hearing Loss,Other Bone Failure/Loss,Other Heart Disease/Failure,Other Blood Loss/Disorder,Neurological & Mental Health,Other Genetic Failure/Mutation,Other Pregnancy Related,Other Surgery Related,Other Autoimmune Related,Pharmacology,Other Eye Related,Other Lung Related,Other Skin Related
0,non-small cell lung cancer,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,cardiovascular disease,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,hypertension,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,tuberculosis,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,breast cancer,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2269,inflammatory bowel diseases,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2270,"psoriasis, moderate to severe",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2271,oncology,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2272,hyperphosphataemia,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0


In [None]:
# Reshape wide -> tall: one row per (condition, disease type) pair, then keep
# only the pairs whose flag equals 1.
df_condition_disease2 = (
    df_condition_disease
    .melt(id_vars=["conditions"], var_name="Variable", value_name="Value")
    .loc[lambda tall: tall['Value'] == 1]
)
df_condition_disease2.head()

Unnamed: 0,conditions,Variable,Value
259,rhinitis,Allergic Rhinitis,1
260,allergic rhinitis,Allergic Rhinitis,1
296,seasonal allergic rhinitis,Allergic Rhinitis,1
331,acute rhinosinusitis,Allergic Rhinitis,1
1608,allergic rhino-conjunctivitis,Allergic Rhinitis,1


In [None]:
# Read the disease type -> therapy area lookup table from the Excel workbook.
df_disease_therapyarea = pd.read_excel(
    'nov_24/nov_23/disease_mapping_therapy_v2.xlsx',
)
df_disease_therapyarea

Unnamed: 0.1,Unnamed: 0,disease,therapy_area,new_therapy_area
0,0,Allergic Rhinitis,Respiratory,Respiratory
1,1,Asthma,Respiratory,Respiratory
2,2,Chronic Obstructive Pulmonary Disease,Respiratory,Respiratory
3,3,Cystic Fibrosis,Respiratory,Respiratory
4,4,Respiratory Infections,Respiratory,Respiratory
...,...,...,...,...
163,163,Other Autoimmune Related,Other,Other
164,164,Pharmacology,Other,Other
165,165,Other Eye Related,Ophthalmology,Ophthalmology
166,166,Other Lung Related,Respiratory,Respiratory


In [None]:
# Attach a therapy area to every (condition, disease type) pair.
# The inner join keeps only disease types that appear in both tables; the
# helper columns are then dropped and 'disease' is renamed to 'disease_type'.
df_conditions_ta = (
    df_condition_disease2
    .merge(
        df_disease_therapyarea,
        how="inner",
        left_on="Variable",
        right_on="disease",
    )
    .drop(columns=['Variable', 'Value', 'Unnamed: 0', 'therapy_area'])
    .rename(columns={'disease': 'disease_type'})
)
df_conditions_ta.head()

Unnamed: 0,conditions,disease_type,new_therapy_area
0,rhinitis,Allergic Rhinitis,Respiratory
1,allergic rhinitis,Allergic Rhinitis,Respiratory
2,seasonal allergic rhinitis,Allergic Rhinitis,Respiratory
3,acute rhinosinusitis,Allergic Rhinitis,Respiratory
4,allergic rhino-conjunctivitis,Allergic Rhinitis,Respiratory


In [None]:
# Write the conditions >> disease type >> therapy area mapping to a
# pipe-delimited text file; the DataFrame index is written as well.
df_conditions_ta.to_csv(
    'nov_24/nov_23/df_conditions_ta.txt',
    index=True,
    sep='|',
)