In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
#import ast

# Folder layout is resolved relative to the parent of the current working directory.
PROJECT_ROOT = Path.cwd().parent
DATA_DIR = PROJECT_ROOT.joinpath("data")
CLEAN_DIR = DATA_DIR.joinpath("clean")
PROCESSED_DIR = DATA_DIR.joinpath("processed")
# Make sure the output folder exists (no error if it is already there).
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

# 1. Load the dataset created in the 01 EDA notebook.
clean_path = CLEAN_DIR.joinpath("clinicaltrials_clean_mesh.csv")
# These columns are parsed as datetimes so that date arithmetic and `.dt` access work later on.
date_columns = [
    "StartDate",
    "PrimaryCompletionDate",
    "CompletionDate",
    "StudyFirstSubmitDate",
    "LastUpdateSubmitDate",
]
df = pd.read_csv(clean_path, parse_dates=date_columns)
df.head()


Unnamed: 0,NCTId,BriefTitle,OfficialTitle,OverallStatus,LastKnownStatus,StartDate,PrimaryCompletionDate,CompletionDate,StudyFirstSubmitDate,LastUpdateSubmitDate,...,InterventionMeshList,PrimaryInterventionMesh,CollaboratorList,PrimaryCollaborator,MinAgeYears,MaxAgeYears,Sex_clean,Sex_ALL,Sex_FEMALE,Sex_MALE
0,NCT01315821,Effect of Saccharomyces Boulardii on Necrotizi...,Role Of Saccharomyces Boulardii in Preventin N...,UNKNOWN,RECRUITING,2011-02-01,2011-12-01,2011-12-01,2011-02-24,2011-08-04,...,[],,[],,0.002738,0.166667,ALL,1,0,0
1,NCT04551521,CRAFT: The NCT-PMO-1602 Phase II Trial,Continuous ReAssessment With Flexible ExTensio...,COMPLETED,,2021-10-13,2024-12-30,2024-12-30,2020-07-24,2025-01-07,...,"['Vemurafenib', 'cobimetinib', 'atezolizumab',...",Vemurafenib,[],,18.0,,ALL,1,0,0
2,NCT04086121,A Study to Test the Long-term Safety of BI 655...,An Open Label Extension Study to Assess the Lo...,TERMINATED,,2019-09-24,2021-04-28,2022-02-23,2019-09-10,2025-02-10,...,['spesolimab'],spesolimab,[],,18.0,75.0,ALL,1,0,0
3,NCT01181921,The CIRCADIAN Study: Evaluation of Modulating ...,Phase IV Study for the Assessment of Modulatin...,TERMINATED,,2011-05-01,2011-06-01,2011-06-01,2010-08-12,2014-04-15,...,['Galantamine'],Galantamine,[],,18.0,,ALL,1,0,0
4,NCT05435014,T-ACE Oil by TAE/TACE in Patients With Hepatoc...,"Phase I/II Randomized, Double-Blind, First-in-...",RECRUITING,,2022-09-13,2026-06-30,2026-06-30,2021-11-16,2024-12-17,...,['Ethiodized Oil'],Ethiodized Oil,[],,20.0,,ALL,1,0,0


In [2]:
# Trial durations in days, measured from the start date.
# DurationPrimaryDays is derived here only when the input file does not already
# provide it: an upstream definition (if present) is left untouched, and the
# clean-up loop below can no longer fail with a KeyError on a missing column.
if "DurationPrimaryDays" not in df.columns:
    df["DurationPrimaryDays"] = (df["PrimaryCompletionDate"] - df["StartDate"]).dt.days
df["DurationCompletionDays"] = (df["CompletionDate"] - df["StartDate"]).dt.days

# A negative duration (end date before start date) is treated as invalid and
# replaced with NaN. `mask` returns a new Series, so the replacement also works
# when the column happens to hold integers.
for col in ["DurationPrimaryDays", "DurationCompletionDays"]:
    df[col] = df[col].mask(df[col] < 0)

In [3]:
# Calendar features extracted from the parsed date columns.
# Each entry is (new column, source date column, datetime component); the list
# order fixes the order in which the new columns are appended to `df`.
calendar_features = [
    ("StartYear", "StartDate", "year"),
    ("StartMonth", "StartDate", "month"),
    ("StudyFirstSubmitYear", "StudyFirstSubmitDate", "year"),
    ("LastUpdateYear", "LastUpdateSubmitDate", "year"),
]
for new_col, source_col, component in calendar_features:
    df[new_col] = getattr(df[source_col].dt, component)

In [4]:
# Exploratory check: frequency of each distinct MinAgeYears value (missing values are not counted).
df["MinAgeYears"].value_counts()

MinAgeYears
18.000000    142811
20.000000      7515
40.000000      4041
19.000000      3688
21.000000      3475
              ...  
0.112252          1
0.156057          1
1.839836          1
0.032854          1
2.750000          1
Name: count, Length: 210, dtype: int64

In [5]:
# Exploratory check: frequency of each distinct MaxAgeYears value (missing values are not counted).
df["MaxAgeYears"].value_counts()

MaxAgeYears
75.000000    15528
65.000000    15101
80.000000    10240
70.000000    10038
55.000000     7882
             ...  
0.323066         1
10.916667        1
0.032854         1
2.416667         1
0.139630         1
Name: count, Length: 326, dtype: int64

In [6]:
# Exploratory check: which DesignAllocation categories exist and how often each occurs.
df["DesignAllocation"].value_counts()

DesignAllocation
RANDOMIZED        126361
NON_RANDOMIZED     25977
Name: count, dtype: int64

In [7]:
# Exploratory check: which DesignMasking categories exist and how often each occurs.
df["DesignMasking"].value_counts()

DesignMasking
NONE         116350
QUADRUPLE     28759
DOUBLE        28283
TRIPLE        15381
SINGLE        14021
Name: count, dtype: int64

In [8]:
# Length of the brief summary in characters; a missing summary counts as length 0.
summary_text = df["BriefSummary"].fillna("").astype(str)
df["SummaryLength"] = summary_text.str.len()

In [9]:
# Create binary / age-derived features.

# Impute the age bounds first. A missing bound is encoded as "no limit on that
# side" (0 for the minimum, 99 for the maximum).
# NOTE(review): this assumes a missing age bound means the trial sets no limit
# there - confirm against the upstream cleaning step.
df["MinAgeYears"] = df["MinAgeYears"].fillna(0)
df["MaxAgeYears"] = df["MaxAgeYears"].fillna(99)

# Derive the flags from the imputed bounds so they agree with MinAgeYears,
# MaxAgeYears and AgeRange. Previously the flags were computed with different
# fill values (999 / 0), so a trial with no upper age limit got IsElderly = 0
# while its MaxAgeYears became 99, and a trial with no lower limit got
# IsPediatric = 0 while its MinAgeYears became 0.
df["IsPediatric"] = (df["MinAgeYears"] < 18).astype(int)
df["IsElderly"] = (df["MaxAgeYears"] > 65).astype(int)

# Width of the eligible age window, rounded to whole years.
df["AgeRange"] = (df["MaxAgeYears"] - df["MinAgeYears"]).round(0)

# 1 when allocation is RANDOMIZED; any other value (including missing) gives 0.
df["IsRandomized"] = (df["DesignAllocation"] == "RANDOMIZED").astype(int)
# 1 for any masking level other than NONE; missing masking gives 0.
df["IsBlinded"] = df["DesignMasking"].isin(["SINGLE", "DOUBLE", "TRIPLE", "QUADRUPLE"]).astype(int)

In [10]:
# Columns kept for the tabular (ML) dataset.
cols_keep_ml = [
    "NCTId",
    "OverallStatus",
    "PrimaryConditionMesh",
    "DurationPrimaryDays",
    "DurationCompletionDays",
    "StartYear",
    "StartMonth",
    "StudyFirstSubmitYear",
    "LastUpdateYear",
    "LeadSponsorClass",
    "Phase",
    "PrimaryInterventionMesh",
    "PrimaryCountry",
    "EnrollmentCount",
    "Sex_ALL",
    "Sex_FEMALE",
    "Sex_MALE",
    "DesignMasking",
    "MinAgeYears",
    "MaxAgeYears",
    "AgeRange",
    "IsPediatric",
    "IsElderly",
    "IsRandomized",
    "IsBlinded",
    "SummaryLength",
]

# Columns kept for the text (NLP) dataset.
cols_keep_nlp = [
    "NCTId",
    "BriefTitle",
    "OfficialTitle",
    "BriefSummary",
    "Phase",
    "EligibilityCriteria",
    "PrimaryOutcomeMeasure",
    "SecondaryOutcomeMeasure",
    "Keyword",
    "KeywordList",
    "CollaboratorName",
    "CollaboratorList",
    "WhyStopped",
    "InterventionName",
    "InterventionNameList",
]

# Selecting by label raises a KeyError if any listed column is absent from `df`.
df_model_ml = df.loc[:, cols_keep_ml]
df_model_nlp = df.loc[:, cols_keep_nlp]

In [11]:

# Numeric columns: missing values are replaced with the column median.
# NOTE(review): the median is computed over the whole dataset; if a train/test
# split is made downstream, consider fitting this imputation on the training
# part only.
numeric_cols = ["DurationPrimaryDays", "DurationCompletionDays", "EnrollmentCount"]

for col in numeric_cols:
    # Columns that are not present are skipped on purpose (best-effort imputation).
    if col in df.columns:
        df[col] = df[col].fillna(df[col].median())

# Categorical / free-text columns: missing values become the literal "Unknown".
# ("WhyStopped" used to be listed twice; each column now appears exactly once.)
categorical_cols = [
    "PrimaryConditionMesh",
    "PrimaryInterventionMesh",
    "PrimaryCountry",
    "DesignMasking",
    "WhyStopped",
    "CollaboratorName",
    "Keyword",
    "SecondaryOutcomeMeasure",
    "PrimaryOutcomeMeasure",
    "OfficialTitle",
    "EligibilityCriteria",
]

for col in categorical_cols:
    if col in df.columns:
        df[col] = df[col].fillna("Unknown")

# Re-select the modelling subsets so they pick up the imputed values.
df_model_ml = df[cols_keep_ml]
df_model_nlp = df[cols_keep_nlp]

In [12]:
# There are many countries, so only the 11 most frequent ones are kept;
# according to the original analysis this covers about 80% of the data.
# The "Unknown" placeholder used for missing countries is counted like any
# other value and can therefore appear in this ranking.
N_TOP_COUNTRIES = 11

# Absolute and relative frequencies of the most common countries. Each is
# computed once and reused below (previously the same rankings were recomputed
# inside the DataFrame constructor and these two variables were left unused).
# The "top20" names are kept for compatibility; the actual cut-off is N_TOP_COUNTRIES.
top20_counts = df_model_ml["PrimaryCountry"].value_counts().nlargest(N_TOP_COUNTRIES)
top20_percent = df_model_ml["PrimaryCountry"].value_counts(normalize=True).nlargest(N_TOP_COUNTRIES)
top20 = pd.DataFrame({"count": top20_counts, "percent": top20_percent})

top20

Unnamed: 0_level_0,count,percent
PrimaryCountry,Unnamed: 1_level_1,Unnamed: 2_level_1
United States,88180,0.430829
China,21070,0.102944
Unknown,17873,0.087324
Canada,5846,0.028562
France,5815,0.028411
Germany,5232,0.025562
South Korea,5210,0.025455
United Kingdom,4490,0.021937
Japan,3402,0.016621
Egypt,3309,0.016167


In [13]:
# Country encoding: keep the 11 most frequent countries, everything else becomes "Other".
# Work on an explicit copy so the new column is not written into a slice of `df`.
df_model_ml = df_model_ml.copy()
country_frequencies = df_model_ml["PrimaryCountry"].value_counts()
top_countries = country_frequencies.nlargest(11).index

is_top_country = df_model_ml["PrimaryCountry"].isin(top_countries)
df_model_ml["PrimaryCountry_reduced"] = np.where(
    is_top_country,
    df_model_ml["PrimaryCountry"],
    "Other",
)

# Drop the original column to avoid a high-cardinality feature.
df_model_ml = df_model_ml.drop(columns=["PrimaryCountry"])


In [14]:
# For the ML frame, categorical variables are expanded into indicator columns.

categorical_cols_ml = ["OverallStatus", "LeadSponsorClass", "Phase", "DesignMasking", "PrimaryCountry_reduced"]

# Only encode the categorical columns that are actually present in the frame.
cols_for_encoding = []
for candidate in categorical_cols_ml:
    if candidate in df_model_ml.columns:
        cols_for_encoding.append(candidate)

# drop_first=True omits the first level of every encoded variable.
df_ml_encoded = pd.get_dummies(
    df_model_ml,
    columns=cols_for_encoding,
    drop_first=True,
)

print("Shape original ML:", df_model_ml.shape)
print("Shape codificado ML:", df_ml_encoded.shape)

Shape original ML: (204675, 26)
Shape codificado ML: (204675, 59)


In [15]:
# Sanity check on the encoded frame: print its dimensions, then display the
# full column index as the cell output to review the generated indicator columns.
print("Shape:", df_ml_encoded.shape)
df_ml_encoded.columns

Shape: (204675, 59)


Index(['NCTId', 'PrimaryConditionMesh', 'DurationPrimaryDays',
       'DurationCompletionDays', 'StartYear', 'StartMonth',
       'StudyFirstSubmitYear', 'LastUpdateYear', 'PrimaryInterventionMesh',
       'EnrollmentCount', 'Sex_ALL', 'Sex_FEMALE', 'Sex_MALE', 'MinAgeYears',
       'MaxAgeYears', 'AgeRange', 'IsPediatric', 'IsElderly', 'IsRandomized',
       'IsBlinded', 'SummaryLength', 'OverallStatus_COMPLETED',
       'OverallStatus_ENROLLING_BY_INVITATION',
       'OverallStatus_NOT_YET_RECRUITING', 'OverallStatus_RECRUITING',
       'OverallStatus_SUSPENDED', 'OverallStatus_TERMINATED',
       'OverallStatus_UNKNOWN', 'OverallStatus_WITHDRAWN',
       'LeadSponsorClass_FED', 'LeadSponsorClass_INDIV',
       'LeadSponsorClass_INDUSTRY', 'LeadSponsorClass_NETWORK',
       'LeadSponsorClass_NIH', 'LeadSponsorClass_OTHER',
       'LeadSponsorClass_OTHER_GOV', 'LeadSponsorClass_UNKNOWN',
       'Phase_PHASE1', 'Phase_PHASE1 / PHASE2', 'Phase_PHASE2',
       'Phase_PHASE2 / PHASE3', 'P

In [16]:
# Write the three result frames to the processed-data folder as CSV files,
# without the row index. The dict preserves the original write order.
exports = {
    "clinicaltrials_features.csv": df,
    "clinicaltrials_features_ml.csv": df_ml_encoded,
    "clinicaltrials_features_nlp.csv": df_model_nlp,
}
for filename, frame in exports.items():
    frame.to_csv(PROCESSED_DIR / filename, index=False)