In [9]:
from pathlib import Path
import pandas as pd
import numpy as np
#import ast

# Directory layout: the project root is taken to be the parent of the current
# working directory, with all data kept under <root>/data.
PROJECT_ROOT = Path.cwd().parent
DATA_DIR = PROJECT_ROOT / "data"
CLEAN_DIR = DATA_DIR / "clean"
PROCESSED_DIR = DATA_DIR / "processed"
# Make sure the output folder exists before anything is written to it.
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

# 1. Load the dataset produced by the 01 EDA notebook.
# These columns are parsed as datetimes so the `.dt` accessor can be used on them later.
date_columns = [
    "StartDate",
    "PrimaryCompletionDate",
    "CompletionDate",
    "StudyFirstSubmitDate",
    "LastUpdateSubmitDate",
]
clean_path = CLEAN_DIR / "clinicaltrials_clean_mesh.csv"
df = pd.read_csv(clean_path, parse_dates=date_columns)
df.head()


Unnamed: 0,NCTId,BriefTitle,OfficialTitle,OverallStatus,LastKnownStatus,StartDate,PrimaryCompletionDate,CompletionDate,StudyFirstSubmitDate,LastUpdateSubmitDate,...,InterventionMeshList,PrimaryInterventionMesh,CollaboratorList,PrimaryCollaborator,MinAgeYears,MaxAgeYears,Sex_clean,Sex_ALL,Sex_FEMALE,Sex_MALE
0,NCT01315821,Effect of Saccharomyces Boulardii on Necrotizi...,Role Of Saccharomyces Boulardii in Preventin N...,UNKNOWN,RECRUITING,2011-02-01,2011-12-01,2011-12-01,2011-02-24,2011-08-04,...,[],,[],,0.002738,0.166667,ALL,1,0,0
1,NCT04551521,CRAFT: The NCT-PMO-1602 Phase II Trial,Continuous ReAssessment With Flexible ExTensio...,COMPLETED,,2021-10-13,2024-12-30,2024-12-30,2020-07-24,2025-01-07,...,"['Vemurafenib', 'cobimetinib', 'atezolizumab',...",Vemurafenib,[],,18.0,,ALL,1,0,0
2,NCT04086121,A Study to Test the Long-term Safety of BI 655...,An Open Label Extension Study to Assess the Lo...,TERMINATED,,2019-09-24,2021-04-28,2022-02-23,2019-09-10,2025-02-10,...,['spesolimab'],spesolimab,[],,18.0,75.0,ALL,1,0,0
3,NCT01181921,The CIRCADIAN Study: Evaluation of Modulating ...,Phase IV Study for the Assessment of Modulatin...,TERMINATED,,2011-05-01,2011-06-01,2011-06-01,2010-08-12,2014-04-15,...,['Galantamine'],Galantamine,[],,18.0,,ALL,1,0,0
4,NCT05435014,T-ACE Oil by TAE/TACE in Patients With Hepatoc...,"Phase I/II Randomized, Double-Blind, First-in-...",RECRUITING,,2022-09-13,2026-06-30,2026-06-30,2021-11-16,2024-12-17,...,['Ethiodized Oil'],Ethiodized Oil,[],,20.0,,ALL,1,0,0


In [10]:
# Trial durations in whole days, both measured from the start date.
# "DurationPrimaryDays" is cleaned below and selected for export later, but no
# cell in this notebook creates it; derive it here unless the loaded CSV already
# provides it, so the notebook survives a fresh-kernel run.
if "DurationPrimaryDays" not in df.columns:
    df["DurationPrimaryDays"] = (df["PrimaryCompletionDate"] - df["StartDate"]).dt.days
df["DurationCompletionDays"] = (df["CompletionDate"] - df["StartDate"]).dt.days

# A negative duration means the end date precedes the start date; replace those
# with NaN. `where` keeps values that satisfy the condition and fills the rest
# with NaN, which also avoids assigning NaN into an integer column in place.
for col in ["DurationPrimaryDays", "DurationCompletionDays"]:
    df[col] = df[col].where(df[col] >= 0)


In [35]:
# Calendar features pulled from the parsed date columns.
# Each entry is (new column, source date column, datetime component).
calendar_features = [
    ("StartYear", "StartDate", "year"),
    ("StartMonth", "StartDate", "month"),
    ("StudyFirstSubmitYear", "StudyFirstSubmitDate", "year"),
    ("LastUpdateYear", "LastUpdateSubmitDate", "year"),
]
for new_col, date_col, component in calendar_features:
    df[new_col] = getattr(df[date_col].dt, component)

In [22]:
# Frequency of each intervention-model category, most common first.
intervention_model_counts = df["DesignInterventionModel"].value_counts()
intervention_model_counts

DesignInterventionModel
PARALLEL        112312
SINGLE_GROUP     62849
CROSSOVER        16096
SEQUENTIAL        8750
FACTORIAL         2004
Name: count, dtype: int64

In [23]:
# Frequency of each allocation category, most common first.
allocation_counts = df["DesignAllocation"].value_counts()
allocation_counts

DesignAllocation
RANDOMIZED        126361
NON_RANDOMIZED     25977
Name: count, dtype: int64

In [30]:
# Frequency of each masking level, most common first.
masking_counts = df["DesignMasking"].value_counts()
masking_counts

DesignMasking
NONE         116350
QUADRUPLE     28759
DOUBLE        28283
TRIPLE        15381
SINGLE        14021
Name: count, dtype: int64

In [None]:
# Create binary (0/1) indicator variables and the eligible age span.

# Age flags. A missing bound is replaced with a sentinel that makes the
# comparison False, so trials without a stated minimum/maximum age get 0.
# NOTE(review): if a missing bound actually means "no limit", these trials
# should arguably be flagged 1 instead — confirm the intended semantics.
df["IsPediatric"] = (df["MinAgeYears"].fillna(999) < 18).astype(int)
df["IsElderly"] = (df["MaxAgeYears"].fillna(0) > 65).astype(int)
# Width of the eligible age window in whole years; NaN when either bound is missing.
df["AgeRange"] = (df["MaxAgeYears"] - df["MinAgeYears"]).round(0)

# Design flags. Comparisons against NaN are False, so a missing design field yields 0.
df["IsRandomized"] = (df["DesignAllocation"] == "RANDOMIZED").astype(int)
df["IsBlinded"] = df["DesignMasking"].isin(["SINGLE", "DOUBLE", "TRIPLE", "QUADRUPLE"]).astype(int)

# "IsParallel" and "IsTreatment" are selected for export further down, but no
# cell in this notebook creates them. Derive them here unless the loaded data
# already carries them.
if "IsParallel" not in df.columns:
    df["IsParallel"] = (df["DesignInterventionModel"] == "PARALLEL").astype(int)
# NOTE(review): assumes the primary purpose is stored in a "DesignPrimaryPurpose"
# column with the category "TREATMENT" — confirm against the cleaned dataset.
# The flag is skipped when that column is absent.
if "IsTreatment" not in df.columns and "DesignPrimaryPurpose" in df.columns:
    df["IsTreatment"] = (df["DesignPrimaryPurpose"] == "TREATMENT").astype(int)

In [19]:
# Columns that make up the modelling dataset: identifier, outcome/status,
# categorical descriptors, durations, calendar features and binary flags.
cols_keep = [
    "NCTId",
    "OverallStatus",
    "Phase",
    "PrimaryConditionMesh",
    "DurationPrimaryDays",
    "DurationCompletionDays",
    "StartYear",
    "StartMonth",
    "StudyFirstSubmitYear",
    "LastUpdateYear",
    "LeadSponsorClass",
    "PrimaryInterventionMesh",
    "PrimaryCountry",
    "MinAgeYears",
    "MaxAgeYears",
    "IsPediatric",
    "IsElderly",
    "AgeRange",
    "IsRandomized",
    "IsParallel",
    "IsTreatment",
    "IsBlinded",
]

# Selecting with a list raises KeyError if any listed column is missing from df.
df_model = df[cols_keep].copy()

# Write the selected feature table, not the full working frame: the original
# cell exported `df`, which ignored the column selection above.
out_path = PROCESSED_DIR / "clinicaltrials_features.csv"
df_model.to_csv(out_path, index=False)
out_path


KeyboardInterrupt: 