<a href="https://www.kaggle.com/code/makyriacou/decision-tree-prediction?scriptVersionId=222362766" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Imports 

In [1]:
# Load Libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import os
import warnings
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
# Quieten notebook output: disable pandas' chained-assignment check and hide FutureWarnings
pd.options.mode.chained_assignment = None  # Suppress SettingWithCopyWarning
# The filter applies to FutureWarning raised by any library, not only pandas
warnings.simplefilter(action='ignore', category=FutureWarning)


# Functions 

In [2]:
def load_data_from_folder(folder_path):
    """
    Load every CSV file found directly in a folder into a dictionary.

    The table name is the file name with its trailing ``_train.csv``,
    ``_test.csv`` or plain ``.csv`` suffix removed, e.g. ``devices_train.csv``
    becomes ``devices``. Entries that do not end in ``.csv`` are ignored.

    Args:
        folder_path: Directory to scan (sub-folders are not visited).

    Returns:
        dict: A dictionary where keys are table names (derived from filenames)
        and values are pandas DataFrames read with ``pd.read_csv``.
    """
    # Most specific suffix first so "_train.csv" wins over the bare ".csv"
    suffixes = ("_train.csv", "_test.csv", ".csv")
    file_dict = {}
    # os.listdir returns entries in arbitrary order; sort for a reproducible result
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith(".csv"):
            continue
        # Strip the suffix only at the end of the name, never in the middle
        suffix = next(s for s in suffixes if file_name.endswith(s))
        table_name = file_name[:-len(suffix)]
        file_dict[table_name] = pd.read_csv(os.path.join(folder_path, file_name))
    return file_dict

In [3]:
class Data:
    """
    Select the needed columns from each raw table and copy per-person
    features onto every table.

    For each table named in ``columns_dict`` only the listed columns are kept.
    Then, for each column named in ``mapping_lst``, a ``person_id -> value``
    lookup is built from the first row seen for each person in the owning
    table, and that value is written onto every table by ``person_id``.
    A column that already exists in a table is overwritten by the mapped
    value; a person without an entry in the lookup gets NaN.

    Args:
        data: dict of table name -> pandas DataFrame (raw tables).
        train_bool: True for training data (the ``SepsisLabel`` column is kept
            and mapped onto every table), False for test data (only
            ``person_id`` is kept from the ``SepsisLabel`` table).
    """

    def __init__(self, data, train_bool):
        self.data = data
        self.train_bool = train_bool

        # Columns kept per table. The key order is part of the contract:
        # get_data() returns the tables in exactly this order.
        self.columns_dict = {
            'drugsexposure': ['person_id', 'drug_datetime_hourly', 'drug_concept_id', 'route_concept_id'],
            'observation': ['person_id', 'observation_concept_name', 'valuefilled'],
            'devices': ['person_id', 'device_datetime_hourly', 'device'],
            'proceduresoccurrences': ['person_id', 'procedure_datetime_hourly', 'procedure'],
            'person_demographics_episode': ['person_id', 'age_in_months', 'gender'],
            'SepsisLabel': ['person_id'],
        }

        # Columns broadcast to every table, keyed by the table that owns them.
        # The label comes first (training only) so the added column order
        # stays the same as before.
        self.mapping_lst = {}
        if self.train_bool:
            self.columns_dict['SepsisLabel'] = ['person_id', 'SepsisLabel']
            self.mapping_lst['SepsisLabel'] = ['SepsisLabel']
        self.mapping_lst['person_demographics_episode'] = ['age_in_months', 'gender']
        self.mapping_lst['observation'] = ['observation_concept_name', 'valuefilled']

        self.datasets = {}
        self._fix_data()

    def data_shape(self, df_dict):
        """Print ``name:shape`` for every DataFrame in ``df_dict``."""
        for key, df in df_dict.items():
            print(f'{key}:{df.shape}')

    def _create_map(self, df, mapping_col_base, mapping_col):
        """
        Return a dict mapping each distinct ``mapping_col_base`` value to the
        ``mapping_col`` value of the first row where it appears.
        """
        # One pass over the frame instead of one boolean scan per distinct key
        first_rows = df.drop_duplicates(subset=mapping_col_base, keep='first')
        return dict(zip(first_rows[mapping_col_base], first_rows[mapping_col]))

    def _mapping_features(self):
        """Build one ``person_id -> value`` lookup per column in ``mapping_lst``."""
        mapping = {}
        for mapping_df_name, mapping_cols in self.mapping_lst.items():
            mapping_df = self.datasets[mapping_df_name]
            for mapping_col in mapping_cols:
                mapping[mapping_col] = self._create_map(df=mapping_df,
                                                        mapping_col_base='person_id',
                                                        mapping_col=mapping_col)
        return mapping

    def _map_columns(self):
        """Write every mapped column onto every table, looked up by person_id."""
        for df in self.datasets.values():
            for col, lookup in self.mapping.items():
                df[col] = df['person_id'].map(lookup)

    def _fix_data(self):
        """Run column extraction, the training-only id filter and the mapping."""
        # Extract columns from each dataset
        self.datasets = {dataset_name: self.data[dataset_name][cols]
                         for dataset_name, cols in self.columns_dict.items()}
        print('.... Colums Extracted')
        self.data_shape(self.datasets)

        if self.train_bool:
            # NOTE(review): this is the UNION of ids over all tables, so every
            # row of every table passes the filter and nothing is removed.
            # An intersection may have been intended - confirm before changing.
            unique_ids = set.union(*(set(df['person_id'].unique()) for df in self.datasets.values()))
            self.datasets = {key: df[df['person_id'].isin(unique_ids)] for key, df in self.datasets.items()}
            print('.... Unique Person Ids')
            self.data_shape(self.datasets)

        # Own the frames outright so the column writes below are plain
        # assignments on independent objects, not on pandas-flagged slices
        self.datasets = {key: df.copy() for key, df in self.datasets.items()}

        # Map the columns. All lookups are built before any table is modified.
        self.mapping = self._mapping_features()
        self._map_columns()
        self.data_shape(self.datasets)
        print('..... Mapping ')

    def get_data(self):
        """Return the processed tables as a dict in ``columns_dict`` order."""
        return self.datasets

In [4]:
class Pre_processing:
    """
    Clean one table, expand its timestamp column and encode / scale columns.

    The whole pipeline runs from the constructor, in this order:
      1. drop duplicate rows, then rows containing any null;
      2. parse ``date_time_col`` as datetime, add ``year`` / ``month`` /
         ``day`` / ``hour`` columns and drop the original column;
      3. label-encode every column in ``encoding_cols``;
      4. standard-scale every column in ``standarize_cols``.

    The encoder and the scaler are fitted on this instance's own frame, so
    two instances (e.g. one for train, one for test) are fitted independently.

    Args:
        df: pandas DataFrame to process.
        date_time_col: name of the timestamp column.
        encoding_cols: column names to label-encode.
        standarize_cols: column names to standardise.
    """

    def __init__(self, df, date_time_col, encoding_cols, standarize_cols):
        self.df = df
        self.date_time_col = date_time_col
        self.encoding_cols = encoding_cols
        self.standarize_cols = standarize_cols
        self._data_process()

    def _data_process(self):
        """Run the three processing stages in their fixed order."""
        stages = (
            self._data_cleaning,
            self._data_pre_processing,
            self._date_encoding_scaling,
        )
        for stage in stages:
            stage()

    def _data_cleaning(self):
        """Remove duplicate rows and rows with nulls, printing the shape before and after."""
        print(f'Before Preprocesing:{self.df.shape}')
        without_duplicates = self.df.drop_duplicates()
        self.df = without_duplicates.dropna()
        print(f'After Preprocesing:{self.df.shape}')

    def _data_pre_processing(self):
        """Replace the timestamp column with year / month / day / hour columns."""
        stamp_col = str(self.date_time_col)
        self.df[stamp_col] = pd.to_datetime(self.df[stamp_col])
        # Re-read the column for each part, exactly one new column per part
        for part in ('year', 'month', 'day', 'hour'):
            self.df[part] = getattr(self.df[stamp_col].dt, part)
        self.df.drop(stamp_col, axis=1, inplace=True)

    def _date_encoding_scaling(self):
        """Label-encode ``encoding_cols``, then standard-scale ``standarize_cols``."""
        # One encoder object, re-fitted on each column in turn
        encoder = LabelEncoder()
        for raw_name in self.encoding_cols:
            name = str(raw_name)
            self.df[name] = encoder.fit_transform(self.df[name])
        print('.... Encoding Data')

        # One scaler object, re-fitted on each column in turn
        standardiser = StandardScaler()
        for raw_name in self.standarize_cols:
            single_col = [str(raw_name)]
            self.df[single_col] = standardiser.fit_transform(self.df[single_col])

        print('....Standarize Data')

    def get_df(self):
        """Return the processed DataFrame."""
        return self.df

In [5]:
def summary_table(df):
    """Print a one-row table with the total null count, duplicate-row count and shape of ``df``."""
    overview = {
        'Null Values': [df.isnull().sum().sum()],
        'Duplicate Rows': [df.duplicated().sum()],
        'Shape': [df.shape],
    }
    print(pd.DataFrame(overview))

# Variables 

In [6]:
# Define file paths
# Both folders sit under the same competition input directory
train_folder_path, test_folder_path = (
    f"/kaggle/input/phems-hackathon-early-sepsis-prediction/{split}_data"
    for split in ("training", "testing")
)

In [7]:
# Names of the tables in the competition data
datasets_names = [
    'proceduresoccurrences',
    'measurement_lab',
    'observation',
    'drugsexposure',
    'measurement_observation',
    'person_demographics_episode',
    'devices',
    'measurement_meds',
    'SepsisLabel',
]

In [8]:
# Column selections per table
drugsexposure_cols = [
    'person_id',
    'drug_datetime_hourly',
    'drug_concept_id',
    'route_concept_id',
]
observation_cols = ['person_id', 'observation_concept_name', 'valuefilled']
devices_cols = ['person_id', 'device_datetime_hourly', 'device']
proceduresoccurrences_cols = ['person_id', 'procedure_datetime_hourly', 'procedure']
person_demographics_episode_cols = ['person_id', 'age_in_months', 'gender']
SepsisLabel_cols = ['person_id', 'SepsisLabel']

# All selections gathered in one list
cols = [
    drugsexposure_cols,
    observation_cols,
    devices_cols,
    proceduresoccurrences_cols,
    person_demographics_episode_cols,
    SepsisLabel_cols,
]

# **Main**

In [9]:
# Read every CSV of the training folder, then of the testing folder
train_data, test_data = (
    load_data_from_folder(folder)
    for folder in (train_folder_path, test_folder_path)
)

## **Train Data**

In [10]:
create_train_data = Data(train_data, train_bool=True)
train_datasets = create_train_data.get_data()
# Pick the tables by name rather than by their position in the dict, so a
# change in table order cannot silently bind the wrong frame to a variable
drugsexposure_train_data = train_datasets['drugsexposure']
devices_train_data = train_datasets['devices']
proceduresoccurrences_train_data = train_datasets['proceduresoccurrences']

.... Colums Extracted
drugsexposure:(184780, 4)
observation:(3807, 3)
devices:(750878, 3)
proceduresoccurrences:(771214, 3)
person_demographics_episode:(3391, 3)
SepsisLabel:(331653, 2)
.... Unique Person Ids
drugsexposure:(184780, 4)
observation:(3807, 3)
devices:(750878, 3)
proceduresoccurrences:(771214, 3)
person_demographics_episode:(3391, 3)
SepsisLabel:(331653, 2)
drugsexposure:(184780, 9)
observation:(3807, 6)
devices:(750878, 8)
proceduresoccurrences:(771214, 8)
person_demographics_episode:(3391, 6)
SepsisLabel:(331653, 6)
..... Mapping 


In [11]:
# Print null count, duplicate-row count and shape for each of the three training tables
summary_table(devices_train_data)
summary_table(drugsexposure_train_data)
summary_table(proceduresoccurrences_train_data)

   Null Values  Duplicate Rows        Shape
0        30590               0  (750878, 8)
   Null Values  Duplicate Rows        Shape
0         9354            1851  (184780, 9)
   Null Values  Duplicate Rows        Shape
0        38486             197  (771214, 8)


## **Test Data**

In [12]:
create_test_data = Data(test_data, train_bool=False)
test_datasets = create_test_data.get_data()
# Pick the tables by name rather than by their position in the dict, so a
# change in table order cannot silently bind the wrong frame to a variable
drugsexposure_test_data = test_datasets['drugsexposure']
devices_test_data = test_datasets['devices']
proceduresoccurrences_test_data = test_datasets['proceduresoccurrences']
sepsis_data = test_datasets['SepsisLabel']

.... Colums Extracted
drugsexposure:(74801, 4)
observation:(1595, 3)
devices:(320919, 3)
proceduresoccurrences:(300447, 3)
person_demographics_episode:(1419, 3)
SepsisLabel:(130483, 1)
drugsexposure:(74801, 8)
observation:(1595, 5)
devices:(320919, 7)
proceduresoccurrences:(300447, 7)
person_demographics_episode:(1419, 5)
SepsisLabel:(130483, 5)
..... Mapping 


In [13]:
# Print null count, duplicate-row count and shape for each of the three test tables
summary_table(drugsexposure_test_data)
summary_table(devices_test_data)
summary_table(proceduresoccurrences_test_data)

   Null Values  Duplicate Rows       Shape
0         3124             988  (74801, 8)
   Null Values  Duplicate Rows        Shape
0        12034             156  (320919, 7)
   Null Values  Duplicate Rows        Shape
0        11484               0  (300447, 7)


## **Pre-Processing Train Data**

In [14]:
# Clean, date-expand, encode and scale the drug-exposure training table
pp_drug_train = Pre_processing(
    df=drugsexposure_train_data,
    date_time_col='drug_datetime_hourly',
    encoding_cols=[
        'drug_concept_id',
        'route_concept_id',
        'observation_concept_name',
        'valuefilled',
        'gender',
    ],
    standarize_cols=[
        'drug_concept_id',
        'route_concept_id',
        'age_in_months',
        'year',
        'month',
        'day',
        'hour',
    ],
)
drugsexposure_train_data = pp_drug_train.get_df()


Before Preprocesing:(184780, 9)
After Preprocesing:(178288, 9)
.... Encoding Data
....Standarize Data


In [15]:
# Clean, date-expand, encode and scale the devices training table.
# Each column is listed once: every entry in encoding_cols triggers one
# encoding pass over that column.
pp_device_train = Pre_processing(
    df=devices_train_data,
    date_time_col='device_datetime_hourly',
    encoding_cols=['device', 'observation_concept_name', 'valuefilled', 'gender'],
    standarize_cols=['age_in_months', 'year', 'month', 'day', 'hour'],
)
devices_train_data = pp_device_train.get_df()

Before Preprocesing:(750878, 8)
After Preprocesing:(735583, 8)
.... Encoding Data
....Standarize Data


In [16]:
# Clean, date-expand, encode and scale the procedures training table
pp_procedure_train = Pre_processing(
    df=proceduresoccurrences_train_data,
    date_time_col='procedure_datetime_hourly',
    encoding_cols=['procedure', 'observation_concept_name', 'valuefilled', 'gender'],
    standarize_cols=['age_in_months', 'year', 'month', 'day', 'hour'],
)
proceduresoccurrences_train_data = pp_procedure_train.get_df()

Before Preprocesing:(771214, 8)
After Preprocesing:(751774, 8)
.... Encoding Data
....Standarize Data


## **Pre-Processing Test Data**

In [17]:
# Clean, date-expand, encode and scale the drug-exposure test table
pp_drug_test = Pre_processing(
    df=drugsexposure_test_data,
    date_time_col='drug_datetime_hourly',
    encoding_cols=[
        'drug_concept_id',
        'route_concept_id',
        'observation_concept_name',
        'valuefilled',
        'gender',
    ],
    standarize_cols=[
        'drug_concept_id',
        'route_concept_id',
        'age_in_months',
        'year',
        'month',
        'day',
        'hour',
    ],
)
drugsexposure_test_data = pp_drug_test.get_df()

Before Preprocesing:(74801, 8)
After Preprocesing:(72269, 8)
.... Encoding Data
....Standarize Data


In [18]:
# Clean, date-expand, encode and scale the devices test table.
# Each column is listed once: every entry in encoding_cols triggers one
# encoding pass over that column.
pp_device_test = Pre_processing(
    df=devices_test_data,
    date_time_col='device_datetime_hourly',
    encoding_cols=['device', 'observation_concept_name', 'valuefilled', 'gender'],
    standarize_cols=['age_in_months', 'year', 'month', 'day', 'hour'],
)
devices_test_data = pp_device_test.get_df()

Before Preprocesing:(320919, 7)
After Preprocesing:(314746, 7)
.... Encoding Data
....Standarize Data


In [19]:
# Clean, date-expand, encode and scale the procedures test table
pp_procedure_test = Pre_processing(
    df=proceduresoccurrences_test_data,
    date_time_col='procedure_datetime_hourly',
    encoding_cols=['procedure', 'observation_concept_name', 'valuefilled', 'gender'],
    standarize_cols=['age_in_months', 'year', 'month', 'day', 'hour'],
)
proceduresoccurrences_test_data = pp_procedure_test.get_df()

Before Preprocesing:(300447, 7)
After Preprocesing:(294705, 7)
.... Encoding Data
....Standarize Data


## **Train ML Models**

In [20]:
def train_ML_model(df, drop_col, taget_col, random_state=None):
    """
    Fit an entropy-criterion decision tree on every row of ``df``.

    Args:
        df: pandas DataFrame holding the features, the target and ``drop_col``.
        drop_col: column excluded from the features (e.g. an identifier).
        taget_col: name of the target column.
        random_state: forwarded to ``DecisionTreeClassifier``. The default
            ``None`` keeps the previous behaviour; pass an int to make
            repeated fits on the same data reproducible.

    Returns:
        The fitted ``DecisionTreeClassifier``. No rows are held out: the model
        is trained on the full frame.
    """
    y = df[taget_col]
    # Features are everything except the dropped column and the target
    X = df.drop(columns=[drop_col, taget_col])

    model = DecisionTreeClassifier(criterion='entropy', random_state=random_state)
    model.fit(X, y)

    return model

**Train one model per dataset, then predict on the test data**

In [21]:
# One decision tree per source table; person_id is excluded from the features
dt_model_drugsexposure = train_ML_model(
    df=drugsexposure_train_data,
    drop_col='person_id',
    taget_col='SepsisLabel',
)

dt_model_devices = train_ML_model(
    df=devices_train_data,
    drop_col='person_id',
    taget_col='SepsisLabel',
)

dt_model_proceduresoccurrences = train_ML_model(
    df=proceduresoccurrences_train_data,
    drop_col='person_id',
    taget_col='SepsisLabel',
)



In [22]:
#id procedures occurrences
proceduresoccurrences_person_id = proceduresoccurrences_test_data['person_id']
proceduresoccurrences_test_data = proceduresoccurrences_test_data.drop(columns=['person_id'])

#id drug exposure
drugsexposure_test_data_person_id = drugsexposure_test_data['person_id']
drugsexposure_test_data = drugsexposure_test_data.drop(columns=['person_id'])

#id device
devices_test_data_person_id = devices_test_data['person_id']
devices_test_data = devices_test_data.drop(columns=['person_id'])

In [23]:
# Row-level predictions: each model scores the test table of its own source
predict_drugexp, predict_device, predict_procedures = (
    fitted_model.predict(features)
    for fitted_model, features in (
        (dt_model_drugsexposure, drugsexposure_test_data),
        (dt_model_devices, devices_test_data),
        (dt_model_proceduresoccurrences, proceduresoccurrences_test_data),
    )
)

In [24]:
# person_id -> prediction. A person with several rows keeps the prediction
# of their last row, because later pairs overwrite earlier ones.
pred_map_drug_exp = dict(zip(drugsexposure_test_data_person_id, predict_drugexp))
pred_map_device = dict(zip(devices_test_data_person_id, predict_device))
pred_map_procedures = dict(zip(proceduresoccurrences_person_id, predict_procedures))

In [25]:
# Merge the three maps; for a person present in several, the right-most map
# wins (procedures over devices over drug exposure)
mapping_dict = {**pred_map_drug_exp, **pred_map_device, **pred_map_procedures}

In [26]:
# Number of distinct person_ids that received a prediction
len(mapping_dict)

1025

In [27]:
#merged_map = {**pred_map_drug_exp, **pred_map_device, **pred_map_procedures}

In [28]:
# Work on a copy so the columns added below are not written into the
# loaded test_data['SepsisLabel'] table itself
prediction_df = test_data['SepsisLabel'].copy()
# Submission key: "<person_id>_<measurement_datetime>"
prediction_df['person_id_datetime'] = prediction_df['person_id'].astype(str) + '_' + prediction_df['measurement_datetime']

In [29]:
# Look up each row's person_id in mapping_dict; ids without an entry become NaN
prediction_df.loc[:, 'SepsisLabel'] = prediction_df['person_id'].map(mapping_dict)
# The two source columns are no longer needed once the combined key exists
prediction_df = prediction_df.drop(['person_id', 'measurement_datetime'], axis=1)

In [30]:
# Replace every remaining NaN with 0; this covers SepsisLabel for person_ids
# that have no entry in mapping_dict
prediction_df = prediction_df.fillna(0)
# Display the frame as the cell output
prediction_df

Unnamed: 0,person_id_datetime,SepsisLabel
0,1416048048_2021-03-25 10:00:00,0.0
1,280531880_2024-01-22 18:00:00,0.0
2,1127023302_2023-12-29 21:00:00,0.0
3,2065909112_2021-07-07 05:00:00,0.0
4,264445818_2024-08-23 22:00:00,0.0
...,...,...
130478,1968006557_2023-10-28 03:00:00,0.0
130479,1511876642_2024-09-04 15:00:00,0.0
130480,1844025915_2024-05-09 10:00:00,0.0
130481,264445818_2024-07-07 15:00:00,0.0


In [31]:
# Write the submission file to the working directory, without the index column
prediction_df.to_csv("submission.csv", index=False)