In [58]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import os

#Set wd
current_dir = os.path.dirname(os.path.abspath('02_FeatureEng.ipynb'))

#File path for train.csv
train_file_path = os.path.join(current_dir, '..','Data',"train.csv")
train_df = pd.read_csv(train_file_path)

In [59]:
train_df.columns

Index(['id', 'Basic_Demos-Enroll_Season', 'Basic_Demos-Age', 'Basic_Demos-Sex',
       'CGAS-Season', 'CGAS-CGAS_Score', 'Physical-Season', 'Physical-BMI',
       'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
       'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
       'Fitness_Endurance-Season', 'Fitness_Endurance-Max_Stage',
       'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
       'FGC-Season', 'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND',
       'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU',
       'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
       'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone', 'BIA-Season',
       'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
       'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
       'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
       'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST'

<center>
  <ul style="list-style-type:none;">
    <li style="font-weight: bold; font-family: Palatino; font-size: 36px; color: Black;"> Data Processing
    </li>
  </ul>
</center>

Note : Set restrict = False (default) for test sets, set to true for training 

In [60]:
def clean_data(df, restrict = False):
    #Fill missing physical BMI with BIA BMI
    df['Physical-BMI'] = df['Physical-BMI'].fillna(df['BIA-BIA_BMI'])
    #Adding Fitness endurance mins and secs
    df['Fitness_total_time_sec'] = df['Fitness_Endurance-Time_Mins']* 60 + df['Fitness_Endurance-Time_Sec']
    #Condense Physical Activity scores for Adolescents and Children
    df['PAQ_Total_combined'] = df['PAQ_A-PAQ_A_Total'].fillna(df['PAQ_C-PAQ_C_Total'])
    #Add 1 to internet usage
    df['PreInt_EduHx-computerinternet_hoursday'] = df['PreInt_EduHx-computerinternet_hoursday'] + 1
    
    # Impute physical height and weight with calculated values
    df['Physical-Weight (kg)'] = df['Physical-Weight'] * 0.453592
    df['Calculated-Height (m)'] = np.sqrt(df['Physical-Weight (kg)'] / df['Physical-BMI'])
    df['Calculated-Height (inch)'] = df['Calculated-Height (m)'] * 39.3701
    df['Physical-Height'] = df['Physical-Height'].fillna(df['Calculated-Height (inch)'])
    df['Calculated-Weight'] = 100 * df['BIA-BIA_FFM'] / (100 - df['BIA-BIA_Fat'])
    df['Calculated-Weight (lbs)'] = df['Calculated-Weight'] * 2.20462
    df['Physical-Weight'] = df['Physical-Weight'].fillna(df['Calculated-Weight (lbs)'])


    if restrict:
        df['CGAS-CGAS_Score'] = df['CGAS-CGAS_Score'].apply(lambda x: x if 0 <= x <= 100 else np.nan)
        df['Physical-Height'] = df['Physical-Height'].apply(lambda x: x if 20 <= x <= 80 else np.nan)
        df['Physical-Weight'] = df['Physical-Weight'].apply(lambda x: x if 30 <= x <= 250 else np.nan)
        df['Physical-Waist_Circumference'] = df['Physical-Waist_Circumference'].apply(lambda x: x if 15 <= x <= 50 else np.nan)
        df['Physical-Diastolic_BP'] = df['Physical-Diastolic_BP'].apply(lambda x: x if 40 <= x <= 110 else np.nan)
        df['Physical-Systolic_BP'] = df['Physical-Systolic_BP'].apply(lambda x: x if 70 <= x <= 170 else np.nan)
        df['Physical-HeartRate'] = df['Physical-HeartRate'].apply(lambda x: x if 40 <= x <= 130 else np.nan)
        df['Physical-BMI'] = df['Physical-BMI'].apply(lambda x: x if 15 <= x <= 40 else np.nan)
        df['Fitness_Endurance-Max_Stage'] = df['Fitness_Endurance-Max_Stage'].apply(lambda x: x if x < 15 else np.nan)
        df['FGC-FGC_CU'] = df['FGC-FGC_CU'].apply(lambda x: x if x < 50 else np.nan )
        df['FGC-FGC_GSD'] = df['FGC-FGC_GSD'].apply(lambda x: x if x < 65 else np.nan)
        df['FGC-FGC_GSND'] = df['FGC-FGC_GSND'].apply(lambda x: x if x < 60 else np.nan)
        df['FGC-FGC_PU'] = df['FGC-FGC_PU'].apply(lambda x: x if x < 40 else np.nan)

    #Drop cols that are not needed
    cols_to_drop = [
    'BIA-BIA_BMI', 'Physical-Weight (kg)', 'Calculated-Height (m)', 'Calculated-Height (inch)', 'Calculated-Weight', 
    'Calculated-Weight (lbs)','Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec', 'PAQ_A-PAQ_A_Total', 'PAQ_C-PAQ_C_Total', 
    'SDS-SDS_Total_Raw'
    ]
    df.drop(columns=cols_to_drop, axis=1, inplace=True)

    return df




In [61]:
code_test = clean_data(train_df, restrict = True)
code_test

Unnamed: 0,id,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,...,PCIAT-PCIAT_19,PCIAT-PCIAT_20,PCIAT-PCIAT_Total,SDS-Season,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,sii,Fitness_total_time_sec,PAQ_Total_combined
0,00008ff9,Fall,5,0,Winter,51.0,Fall,16.877316,46.0,50.8,...,2.0,4.0,55.0,,,Fall,4.0,2.0,,
1,000fd460,Summer,9,0,,,Fall,,48.0,46.0,...,0.0,0.0,0.0,Fall,64.0,Summer,1.0,0.0,,2.340
2,00105258,Summer,10,1,Fall,71.0,Fall,16.648696,56.5,75.6,...,1.0,1.0,28.0,Fall,54.0,Summer,3.0,0.0,453.0,2.170
3,00115b9f,Winter,9,0,Fall,71.0,Summer,18.292347,56.0,81.6,...,4.0,1.0,44.0,Summer,45.0,Winter,1.0,1.0,577.0,2.451
4,0016bb22,Spring,18,1,Summer,,,,,,...,,,,,,,,,,1.040
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3955,ff8a2de4,Fall,13,0,Spring,60.0,Fall,16.362460,59.5,82.4,...,1.0,0.0,32.0,Winter,50.0,Fall,2.0,1.0,,3.260
3956,ffa9794a,Winter,10,0,,,Spring,18.764678,53.5,76.4,...,,,,,,Winter,1.0,,,2.340
3957,ffcd4dbd,Fall,11,0,Spring,68.0,Winter,21.441500,60.0,109.8,...,0.0,1.0,31.0,Winter,77.0,Fall,1.0,1.0,,2.729
3958,ffed1dd5,Spring,13,0,Spring,70.0,Winter,,70.7,87.0,...,1.0,1.0,19.0,Spring,47.0,Spring,2.0,0.0,,3.300


<center>
  <ul style="list-style-type:none;">
    <li style="font-weight: bold; font-family: Palatino; font-size: 36px; color: Black;"> Autoencoder (Imputation)
    </li>
  </ul>
</center>

In [96]:
#Data Preprocessing

#Classify columns into relevant types for scaling/ encoding
data_dict_path = os.path.join(current_dir, '..','Data',"data_dictionary.csv")
data_dict = pd.read_csv(data_dict_path)

nominal_cols = []
ordinal_cols = []

for index, row in data_dict.iterrows():
    column_name = row['Field']
    column_type = row['Type']
    column_description = row['Description']
    
    # Classify nominal columns (seasons)
    if 'Season' in str(column_description):
        nominal_cols.append(column_name)
    
    # Classify ordinal columns (categorical and !seasons)
    elif 'categorical' in str(column_type):
        ordinal_cols.append(column_name)

numeric_cols = [
    col for col in code_test.select_dtypes(include=['int64', 'float64']).columns
    if col not in ordinal_cols
]
#Remove sii from numeric cols and append to ordinal
numeric_cols.remove('sii')
ordinal_cols.append('sii')


In [118]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler

#Function for imputation (Flow: Drop id,season, sii and PCIAT cols, mask na data, scale data, train autoencoder, 
# fill mask with trained model, inverse scale, add back id and season cols)

class AutoEncoder(nn.Module):
    def __init__(self, input_dim, encoding_dim):
        super(AutoEncoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, encoding_dim*3),
            nn.LeakyReLU(0.2),
            nn.Linear(encoding_dim*3, encoding_dim*2),
            nn.LeakyReLU(0.2),
            nn.Linear(encoding_dim*2, encoding_dim),
            nn.LeakyReLU(0.2)
        )
        self.decoder = nn.Sequential(
            nn.Linear(encoding_dim, input_dim*2),
            nn.LeakyReLU(0.2),
            nn.Linear(input_dim*2, input_dim*3),
            nn.LeakyReLU(0.2),
            nn.Linear(input_dim*3, input_dim),
            nn.Sigmoid()
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

def impute_dataframe(df, dataset = 'train', encoding_dim=32, num_epochs=10, batch_size=16):

    #Manually define columns
    ordinal_cols = ['Basic_Demos-Sex','FGC-FGC_CU_Zone','FGC-FGC_GSND_Zone','FGC-FGC_GSD_Zone','FGC-FGC_PU_Zone','FGC-FGC_SRL_Zone','FGC-FGC_SRR_Zone',
                'FGC-FGC_TL_Zone','BIA-BIA_Activity_Level_num','BIA-BIA_Frame_num','PreInt_EduHx-computerinternet_hoursday']

    numeric_cols = ['Basic_Demos-Age','CGAS-CGAS_Score','Physical-BMI','Physical-Height','Physical-Weight','Physical-Waist_Circumference','Physical-Diastolic_BP',
                'Physical-HeartRate','Physical-Systolic_BP','Fitness_Endurance-Max_Stage','FGC-FGC_CU','FGC-FGC_GSND','FGC-FGC_GSD','FGC-FGC_PU','FGC-FGC_SRL','FGC-FGC_SRR',
                'FGC-FGC_TL','BIA-BIA_BMC','BIA-BIA_BMR','BIA-BIA_DEE','BIA-BIA_ECW','BIA-BIA_FFM','BIA-BIA_FFMI','BIA-BIA_FMI','BIA-BIA_Fat','BIA-BIA_ICW','BIA-BIA_LDM',
                'BIA-BIA_LST','BIA-BIA_SMM','BIA-BIA_TBW','SDS-SDS_Total_T','Fitness_total_time_sec','PAQ_Total_combined']

    nominal_cols = ['Basic_Demos-Enroll_Season','CGAS-Season','Physical-Season','Fitness_Endurance-Season','FGC-Season','BIA-Season',
                'PAQ_A-Season','PAQ_C-Season','PCIAT-Season','SDS-Season','PreInt_EduHx-Season']
  
    # Drop PCIAT, id and nominal columns
    if dataset == 'train':
        cols_to_drop = ['sii']+ ['id'] + nominal_cols + df.filter(regex='^PCIAT-PCIAT').columns.tolist()
    elif dataset == 'test':
        cols_to_drop = ['id'] + nominal_cols

    retain_df = df[cols_to_drop]
    df_dropped = df.drop(columns = cols_to_drop)

    # Scale columns
    scaler = StandardScaler()
    df_scaled = scaler.fit_transform(df_dropped)
    mask = np.isnan(df_scaled)  # Mask to fill NA values later
    df_filled = np.nan_to_num(df_scaled, nan=0.0)

    # Convert to tensors
    data_tensor = torch.tensor(df_filled, dtype=torch.float32)
    mask_tensor = torch.tensor(mask, dtype=torch.bool)

    # Initialize model, criterion, and optimizer
    input_dim = data_tensor.shape[1]
    model = AutoEncoder(input_dim=input_dim, encoding_dim=encoding_dim)
    criterion = nn.SmoothL1Loss()
    optimizer = optim.Adam(model.parameters())

    # Training loop
    for epoch in range(num_epochs):
        for i in range(0, len(data_tensor), batch_size):
            batch_data = data_tensor[i:i+batch_size]
            batch_mask = mask_tensor[i:i+batch_size]

            # Forward pass
            recon = model(batch_data)
            loss = criterion(recon[~batch_mask], batch_data[~batch_mask])

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        if (epoch + 1) % 1 == 0: #Change divisor according to epochs
            print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')

    # Impute missing values
    with torch.no_grad():
        imputed_data = model(data_tensor).numpy()

    # Replace the original missing values with imputed values
    data_imputed = df_scaled.copy()
    data_imputed[mask] = imputed_data[mask]

    # Inverse transform to original scale
    data_imputed = scaler.inverse_transform(data_imputed)
    data_imputed_df = pd.DataFrame(data_imputed, columns=df_dropped.columns)
    data_imputed_df[ordinal_cols] = data_imputed_df[ordinal_cols].round(0)
    merged_df = pd.concat([data_imputed_df, retain_df], axis = 1) 

    return merged_df


In [121]:
impute_test = impute_dataframe(train_df)
impute_test

Epoch [1/10], Loss: 0.2274
Epoch [2/10], Loss: 0.2203
Epoch [3/10], Loss: 0.2100
Epoch [4/10], Loss: 0.2058
Epoch [5/10], Loss: 0.1965
Epoch [6/10], Loss: 0.1937
Epoch [7/10], Loss: 0.1938
Epoch [8/10], Loss: 0.1946
Epoch [9/10], Loss: 0.1938
Epoch [10/10], Loss: 0.1915


Unnamed: 0,Basic_Demos-Age,Basic_Demos-Sex,CGAS-CGAS_Score,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,...,PCIAT-PCIAT_12,PCIAT-PCIAT_13,PCIAT-PCIAT_14,PCIAT-PCIAT_15,PCIAT-PCIAT_16,PCIAT-PCIAT_17,PCIAT-PCIAT_18,PCIAT-PCIAT_19,PCIAT-PCIAT_20,PCIAT-PCIAT_Total
0,5.0,0.0,51.000000,16.877316,46.000000,50.800000,27.278566,69.201522,93.073620,116.431566,...,0.0,4.0,4.0,4.0,4.0,4.0,4.0,2.0,4.0,55.0
1,9.0,0.0,65.069008,20.014273,48.000000,46.000000,22.000000,75.000000,70.000000,122.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10.0,1.0,71.000000,16.648696,56.500000,75.600000,27.278508,65.000000,94.000000,117.000000,...,0.0,1.0,1.0,1.0,0.0,2.0,2.0,1.0,1.0,28.0
3,9.0,0.0,71.000000,18.292347,56.000000,81.600000,27.278510,60.000000,97.000000,117.000000,...,0.0,3.0,0.0,0.0,3.0,4.0,3.0,4.0,1.0,44.0
4,18.0,1.0,65.169856,20.248801,63.409899,126.086438,27.649819,69.124385,81.573972,116.558017,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3955,13.0,0.0,60.000000,16.362460,59.500000,82.400000,27.279773,71.000000,70.000000,104.000000,...,0.0,2.0,0.0,1.0,0.0,2.0,1.0,1.0,0.0,32.0
3956,10.0,0.0,65.069019,18.764678,53.500000,76.400000,27.000000,60.000000,78.000000,118.000000,...,,,,,,,,,,
3957,11.0,0.0,68.000000,21.441500,60.000000,109.800000,27.351447,79.000000,99.000000,116.000000,...,0.0,1.0,3.0,0.0,0.0,1.0,1.0,0.0,1.0,31.0
3958,13.0,0.0,70.000000,20.083783,70.700000,87.000000,27.814125,59.000000,61.000000,113.000000,...,0.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,19.0


In [95]:
#Export as clean version and imputed version as csv
code_test.to_csv('clean_no_imp.csv', index = False)
data_imputed_df.to_csv('autoenc_imp(noseason).csv', index = False)

<center>
  <ul style="list-style-type:none;">
    <li style="font-weight: bold; font-family: Palatino; font-size: 36px; color: Black;"> Feature Engineering
    </li>
  </ul>
</center>

By default, feature_engineering will not return the interaction terms, set to true to get interaction terms

In [100]:
def feature_engineering(df, add_int = False):

    #Apply banding to CGAS and SDS
    cgas_bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
    cgas_labels = ["1-10", "11-20", "21-30", "31-40", "41-50", "51-60", "61-70", "71-80", "81-90", "91-100"]
    df['CGAS_Category'] = pd.cut(df['CGAS-CGAS_Score'], bins = cgas_bins, labels = cgas_labels, right = True)
    sds_bins = [0,57, 62, 67, 100]
    sds_labels = ['No Disorder', 'Mild', 'Moderate', 'Severe']
    df['SDS-SDS_Category'] = pd.cut(df['SDS-SDS_Total_T'], bins = sds_bins, labels = sds_labels, right = True)
    

    if add_int:
        df['SDS_Age'] = df['SDS-SDS_Total_T_Band'] * df['Basic_Demos-Age']
        df['SDS_BMI'] = df['BIA-BIA_BMI'] * df['SDS-SDS_Total_T_Band']
        df['CGAS_SDS'] = df['CGAS-CGAS_Score'] * df['SDS-SDS_Total_T_Band']
        df['SDS_Activity'] = df['BIA-BIA_Activity_Level_num'] * df['SDS-SDS_Total_T_Band']
        df['HeartRate_SDS'] = df['Physical-HeartRate'] * df['SDS-SDS_Total_T_Band'] 
        df['SDS_PAQ'] = df['SDS-SDS_Total_T_Band'] * df['PAQ_C-PAQ_C_Total']

        df['Internet_Hours_Age'] = df['PreInt_EduHx-computerinternet_hoursday'] * df['Basic_Demos-Age']
        df['Internet_Hours_BIA_Act'] = df['BIA-BIA_Activity_Level_num'] * df['PreInt_EduHx-computerinternet_hoursday']
        df['Internet_Hours_PAQ'] = df['PreInt_EduHx-computerinternet_hoursday'] * df['PAQ_C-PAQ_C_Total']
        df['SDS_InternetHours'] = df['SDS-SDS_Total_T'] * df['PreInt_EduHx-computerinternet_hoursday']
        df['Internet_Hours_CGAS'] = df['PreInt_EduHx-computerinternet_hoursday'] * df['CGAS-CGAS_Score']

        df['BMI_HeartRate']  = df['BIA-BIA_BMI'] * df['Physical-HeartRate']
        df['BMI_SystolicBP'] = df['BIA-BIA_BMI'] * df['Physical-Systolic_BP']
        df['BMI_HoursOfInternet'] = df['BIA-BIA_BMI'] * df['PreInt_EduHx-computerinternet_hoursday']

        df['BMI-Waist'] = df['Physical-BMI'] * df['Physical-Waist_Circumference']
        df['BMI_Age'] = df['Physical-BMI'] * df['Basic_Demos-Age']
        df['HR_Age'] = df['Physical-HeartRate'] * df['Basic_Demos-Age']
        df['SBP_Age'] = df['Physical-Systolic_BP'] * df['Basic_Demos-Age']
        df['DBP_Age'] = df['Physical-Diastolic_BP'] * df['Basic_Demos-Age']
        df['HR_Activity_Level']= df['Physical-HeartRate'] * df['BIA-BIA_Activity_Level_num']

        df['Muscle_Mass_CU_PU_GSD_TL'] = df['BIA-BIA_SMM'] * df['FGC-FGC_PU'] * df['FGC-FGC_CU'] * df['FGC-FGC_GSD'] * df['FGC-FGC_TL']
        df['DEE_fitness_endurance'] = df['Fitness_total_time_sec'] * df['BIA-BIA_DEE']
        df['Activity_level_fitness_endurance'] = df['Fitness_total_time_sec'] * df['BIA-BIA_Activity_Level_num']
    
    return(df)

