In [1]:
# Version 5
# KNNImputer for all data

# Based
- https://www.kaggle.com/code/honganzhu/cmi-piu-competition?scriptVersionId=201912528 Version44 LB0.492

 If you find this notebook useful, please upvote this and the based one.

In [2]:
import numpy as np
import pandas as pd
import os
import re
from sklearn.base import clone
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import StratifiedKFold
from scipy.optimize import minimize
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import polars as pl
import polars.selectors as cs
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator, FormatStrFormatter, PercentFormatter
import seaborn as sns

from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim

from colorama import Fore, Style
from IPython.display import clear_output
import warnings
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import VotingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
from pytorch_tabnet.tab_model import TabNetRegressor
warnings.filterwarnings('ignore')
pd.options.display.max_columns = None
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', None)

pl.Config.set_tbl_rows(-1)
pl.Config.set_tbl_cols(-1)
pl.Config.set_fmt_str_lengths(10000)

In [4]:
SEED = 42
n_splits = 5

# Feature Engineering

- **Feature Selection**: The dataset contains features related to physical characteristics (e.g., BMI, Height, Weight), behavioral aspects (e.g., internet usage), and fitness data (e.g., endurance time). 
- **Categorical Feature Encoding**: Categorical features are mapped to numerical values using custom mappings for each unique category within the dataset. This ensures compatibility with machine learning algorithms that require numerical input.
- **Time Series Aggregation**: Time series statistics (e.g., mean, standard deviation) from the actigraphy data are computed and merged into the main dataset to create additional features for model training.


In [5]:
def process_file(id_folder, dirname):
    files = os.listdir(os.path.join(path, dirname, id_folder))
    if len(files) > 1:
        df_list = []
        for file in files:
            df = pd.read_parquet(os.path.join(dirname, id_folder, file))
            df_list.append(df)
        df = pd.concat(df_list, ignore_index=True)
    else:
        df = pd.read_parquet(os.path.join(dirname, id_folder, files[0]))
    
    df.drop('step', axis=1, inplace=True)

    describe_df = df.describe(percentiles=[0.01] + [i*0.5/10 + 0.05 for i in range(19)])
    count_mean = describe_df.loc['count', :].mean()
    describe_df.drop('count', axis=0, inplace=True)
    
    return np.append(describe_df.values.reshape(-1), count_mean), id_folder.split('=')[1]

def load_time_series(dirname) -> pd.DataFrame:
    ids = os.listdir(dirname)
    
    with ThreadPoolExecutor() as executor:
        results = list(tqdm(executor.map(lambda fname: process_file(fname, dirname), ids), total=len(ids)))
    
    stats, indexes = zip(*results)
    
    df = pd.DataFrame(stats, columns=[f"stat_{i}" for i in range(len(stats[0]))])
    df['id'] = indexes
    return df


class AutoEncoder(nn.Module):
    def __init__(self, input_dim, encoding_dim):
        super(AutoEncoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, encoding_dim*4),
            nn.ReLU(),
            nn.Linear(encoding_dim*4, encoding_dim*2),
            nn.ReLU(),
            nn.Linear(encoding_dim*2, encoding_dim),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(encoding_dim, encoding_dim*2),
            nn.ReLU(),
            nn.Linear(encoding_dim*2, encoding_dim*4),
            nn.ReLU(),
            nn.Linear(encoding_dim*4, input_dim),
            nn.Sigmoid()
        )
        
    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded


def perform_autoencoder(df, encoding_dim=50, epochs=50, batch_size=32):
    scaler = StandardScaler()
    df_scaled = scaler.fit_transform(df)
    
    data_tensor = torch.FloatTensor(df_scaled)
    
    input_dim = data_tensor.shape[1]
    autoencoder = AutoEncoder(input_dim, encoding_dim)
    
    criterion = nn.MSELoss()
    optimizer = optim.Adam(autoencoder.parameters())
    
    for epoch in range(epochs):
        for i in range(0, len(data_tensor), batch_size):
            batch = data_tensor[i : i + batch_size]
            optimizer.zero_grad()
            reconstructed = autoencoder(batch)
            loss = criterion(reconstructed, batch)
            loss.backward()
            optimizer.step()
            
        if (epoch + 1) % 10 == 0:
            print(f'Epoch [{epoch + 1}/{epochs}], Loss: {loss.item():.4f}]')
                 
    with torch.no_grad():
        encoded_data = autoencoder.encoder(data_tensor).numpy()
        
    df_encoded = pd.DataFrame(encoded_data, columns=[f'Enc_{i + 1}' for i in range(encoded_data.shape[1])])
    
    return df_encoded

def feature_engineering(df):
    df['BMI_Age'] = df['Physical-BMI'] * df['Basic_Demos-Age']
    df['Internet_Hours_Age'] = df['PreInt_EduHx-computerinternet_hoursday'] * df['Basic_Demos-Age']
    df['BMI_Internet_Hours'] = df['Physical-BMI'] * df['PreInt_EduHx-computerinternet_hoursday']
    df['BFP_BMI'] = df['BIA-BIA_Fat'] / df['BIA-BIA_BMI']
    df['FFMI_BFP'] = df['BIA-BIA_FFMI'] / df['BIA-BIA_Fat']
    df['FMI_BFP'] = df['BIA-BIA_FMI'] / df['BIA-BIA_Fat']
    df['LST_TBW'] = df['BIA-BIA_LST'] / df['BIA-BIA_TBW']
    df['BFP_BMR'] = df['BIA-BIA_Fat'] * df['BIA-BIA_BMR']
    df['BFP_DEE'] = df['BIA-BIA_Fat'] * df['BIA-BIA_DEE']
    df['BMR_Weight'] = df['BIA-BIA_BMR'] / df['Physical-Weight']
    df['DEE_Weight'] = df['BIA-BIA_DEE'] / df['Physical-Weight']
    df['SMM_Height'] = df['BIA-BIA_SMM'] / df['Physical-Height']
    df['Muscle_to_Fat'] = df['BIA-BIA_SMM'] / df['BIA-BIA_FMI']
    df['Hydration_Status'] = df['BIA-BIA_TBW'] / df['Physical-Weight']
    df['ICW_TBW'] = df['BIA-BIA_ICW'] / df['BIA-BIA_TBW']
    
    return df

In [6]:
path = "I:/Kaggle/child-mind-institute-problematic-internet-use/"

In [7]:
train = pd.read_csv(path + 'train.csv')
test = pd.read_csv(path + 'test.csv')
sample = pd.read_csv(path + 'sample_submission.csv')

100%|████████████████████████████████████████████████████████████████████████████████| 996/996 [00:27<00:00, 35.81it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 16.75it/s]


In [None]:
train_ts = load_time_series(path + "series_train.parquet")
test_ts = load_time_series(path + "series_test.parquet")

In [8]:
concat_ts = pd.concat([train_ts, test_ts], ignore_index=True)
df_concat = concat_ts.drop('id', axis=1)

In [9]:
concat_ts_encoded = perform_autoencoder(df_concat, encoding_dim=64, epochs=1000, batch_size=128)

Epoch [10/1000], Loss: 0.7445]
Epoch [20/1000], Loss: 0.6651]
Epoch [30/1000], Loss: 0.6022]
Epoch [40/1000], Loss: 0.5963]
Epoch [50/1000], Loss: 0.5737]
Epoch [60/1000], Loss: 0.5643]
Epoch [70/1000], Loss: 0.5663]
Epoch [80/1000], Loss: 0.5562]
Epoch [90/1000], Loss: 0.5550]
Epoch [100/1000], Loss: 0.5500]
Epoch [110/1000], Loss: 0.5491]
Epoch [120/1000], Loss: 0.5435]
Epoch [130/1000], Loss: 0.5398]
Epoch [140/1000], Loss: 0.5362]
Epoch [150/1000], Loss: 0.5365]
Epoch [160/1000], Loss: 0.5353]
Epoch [170/1000], Loss: 0.5351]
Epoch [180/1000], Loss: 0.5306]
Epoch [190/1000], Loss: 0.5311]
Epoch [200/1000], Loss: 0.5297]
Epoch [210/1000], Loss: 0.5302]
Epoch [220/1000], Loss: 0.5284]
Epoch [230/1000], Loss: 0.5282]
Epoch [240/1000], Loss: 0.5269]
Epoch [250/1000], Loss: 0.5263]
Epoch [260/1000], Loss: 0.5280]
Epoch [270/1000], Loss: 0.5254]
Epoch [280/1000], Loss: 0.5256]
Epoch [290/1000], Loss: 0.5251]
Epoch [300/1000], Loss: 0.5229]
Epoch [310/1000], Loss: 0.5268]
Epoch [320/1000],

In [10]:
for col in concat_ts_encoded.columns:
    if concat_ts_encoded[col].std() == 0:
        concat_ts_encoded.drop(col, axis=1, inplace=True)

In [11]:
time_series_cols = concat_ts_encoded.columns.tolist()
concat_ts_encoded["id"] = concat_ts["id"]
train_ts_encoded = concat_ts_encoded.head(train_ts.shape[0]).reset_index(drop=True)
test_ts_encoded = concat_ts_encoded.tail(test_ts.shape[0]).reset_index(drop=True)
train_merge = pd.merge(train, train_ts_encoded, how="left", on='id')
test_merge = pd.merge(test, test_ts_encoded, how="left", on='id')

In [13]:
data_dict = pd.read_csv(path + 'data_dictionary.csv')
print(data_dict.shape)
data_dict

(81, 6)


Unnamed: 0,Instrument,Field,Description,Type,Values,Value Labels
0,Identifier,id,Participant's ID,str,,
1,Demographics,Basic_Demos-Enroll_Season,Season of enrollment,str,"Spring, Summer, Fall, Winter",
2,Demographics,Basic_Demos-Age,Age of participant,float,,
3,Demographics,Basic_Demos-Sex,Sex of participant,categorical int,01,"0=Male, 1=Female"
4,Children's Global Assessment Scale,CGAS-Season,Season of participation,str,"Spring, Summer, Fall, Winter",
5,Children's Global Assessment Scale,CGAS-CGAS_Score,Children's Global Assessment Scale Score,int,,
6,Physical Measures,Physical-Season,Season of participation,str,"Spring, Summer, Fall, Winter",
7,Physical Measures,Physical-BMI,Body Mass Index (kg/m^2),float,,
8,Physical Measures,Physical-Height,Height (in),float,,
9,Physical Measures,Physical-Weight,Weight (lbs),float,,


In [23]:
cat_cols = []
for i in range(data_dict.shape[0]):
    col = data_dict['Field'][i]
    if col == 'id':
        continue
    if (data_dict['Type'][i] == 'str') | ('categori' in data_dict['Type'][i]):
        cat_cols.append(col)

In [24]:
ordinal_variables = ['FGC-FGC_CU_Zone', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU_Zone',
                     'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR_Zone', 'FGC-FGC_TL_Zone', 'BIA-BIA_Activity_Level_num', 'BIA-BIA_Frame_num',
                     'PreInt_EduHx-computerinternet_hoursday']

ordinal_variables += [col for col in cat_cols if (('PCIAT' in col)&('Season' not in col))]
print(len(ordinal_variables))
ordinal_variables

31


['FGC-FGC_CU_Zone',
 'FGC-FGC_CU_Zone',
 'FGC-FGC_GSND_Zone',
 'FGC-FGC_GSD_Zone',
 'FGC-FGC_PU_Zone',
 'FGC-FGC_SRL_Zone',
 'FGC-FGC_SRR_Zone',
 'FGC-FGC_TL_Zone',
 'BIA-BIA_Activity_Level_num',
 'BIA-BIA_Frame_num',
 'PreInt_EduHx-computerinternet_hoursday',
 'PCIAT-PCIAT_01',
 'PCIAT-PCIAT_02',
 'PCIAT-PCIAT_03',
 'PCIAT-PCIAT_04',
 'PCIAT-PCIAT_05',
 'PCIAT-PCIAT_06',
 'PCIAT-PCIAT_07',
 'PCIAT-PCIAT_08',
 'PCIAT-PCIAT_09',
 'PCIAT-PCIAT_10',
 'PCIAT-PCIAT_11',
 'PCIAT-PCIAT_12',
 'PCIAT-PCIAT_13',
 'PCIAT-PCIAT_14',
 'PCIAT-PCIAT_15',
 'PCIAT-PCIAT_16',
 'PCIAT-PCIAT_17',
 'PCIAT-PCIAT_18',
 'PCIAT-PCIAT_19',
 'PCIAT-PCIAT_20']

In [25]:
cat_cols = list(set(cat_cols) - set(ordinal_variables))
print(len(cat_cols))
cat_cols

12


['PAQ_A-Season',
 'CGAS-Season',
 'Basic_Demos-Sex',
 'PCIAT-Season',
 'PreInt_EduHx-Season',
 'PAQ_C-Season',
 'FGC-Season',
 'Fitness_Endurance-Season',
 'SDS-Season',
 'BIA-Season',
 'Basic_Demos-Enroll_Season',
 'Physical-Season']

In [26]:
train_nonan = train_merge.dropna(subset=['sii'], ignore_index=True)
print(train_nonan.shape)
train_nonan.head()

(2736, 140)


Unnamed: 0,id,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Season,Fitness_Endurance-Max_Stage,Fitness_Endurance-Time_Mins,Fitness_Endurance-Time_Sec,FGC-Season,FGC-FGC_CU,FGC-FGC_CU_Zone,FGC-FGC_GSND,FGC-FGC_GSND_Zone,FGC-FGC_GSD,FGC-FGC_GSD_Zone,FGC-FGC_PU,FGC-FGC_PU_Zone,FGC-FGC_SRL,FGC-FGC_SRL_Zone,FGC-FGC_SRR,FGC-FGC_SRR_Zone,FGC-FGC_TL,FGC-FGC_TL_Zone,BIA-Season,BIA-BIA_Activity_Level_num,BIA-BIA_BMC,BIA-BIA_BMI,BIA-BIA_BMR,BIA-BIA_DEE,BIA-BIA_ECW,BIA-BIA_FFM,BIA-BIA_FFMI,BIA-BIA_FMI,BIA-BIA_Fat,BIA-BIA_Frame_num,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,PAQ_A-Season,PAQ_A-PAQ_A_Total,PAQ_C-Season,PAQ_C-PAQ_C_Total,PCIAT-Season,PCIAT-PCIAT_01,PCIAT-PCIAT_02,PCIAT-PCIAT_03,PCIAT-PCIAT_04,PCIAT-PCIAT_05,PCIAT-PCIAT_06,PCIAT-PCIAT_07,PCIAT-PCIAT_08,PCIAT-PCIAT_09,PCIAT-PCIAT_10,PCIAT-PCIAT_11,PCIAT-PCIAT_12,PCIAT-PCIAT_13,PCIAT-PCIAT_14,PCIAT-PCIAT_15,PCIAT-PCIAT_16,PCIAT-PCIAT_17,PCIAT-PCIAT_18,PCIAT-PCIAT_19,PCIAT-PCIAT_20,PCIAT-PCIAT_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,sii,Enc_1,Enc_2,Enc_3,Enc_4,Enc_5,Enc_6,Enc_7,Enc_8,Enc_9,Enc_10,Enc_11,Enc_12,Enc_13,Enc_14,Enc_15,Enc_16,Enc_17,Enc_18,Enc_19,Enc_20,Enc_21,Enc_22,Enc_23,Enc_25,Enc_26,Enc_27,Enc_28,Enc_29,Enc_31,Enc_32,Enc_34,Enc_35,Enc_36,Enc_37,Enc_38,Enc_39,Enc_40,Enc_41,Enc_42,Enc_43,Enc_45,Enc_47,Enc_48,Enc_49,Enc_50,Enc_51,Enc_52,Enc_53,Enc_54,Enc_55,Enc_56,Enc_58,Enc_59,Enc_60,Enc_61,Enc_62,Enc_63,Enc_64
0,00008ff9,Fall,5,0,Winter,51.0,Fall,16.877316,46.0,50.8,,,,,,,,,Fall,0.0,0.0,,,,,0.0,0.0,7.0,0.0,6.0,0.0,6.0,1.0,Fall,2.0,2.66855,16.8792,932.498,1492.0,8.25598,41.5862,13.8177,3.06143,9.21377,1.0,24.4349,8.89536,38.9177,19.5413,32.6909,,,,,Fall,5.0,4.0,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,4.0,0.0,4.0,4.0,4.0,4.0,4.0,4.0,2.0,4.0,55.0,,,,Fall,3.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,000fd460,Summer,9,0,,,Fall,14.03559,48.0,46.0,22.0,75.0,70.0,122.0,,,,,Fall,3.0,0.0,,,,,5.0,0.0,11.0,1.0,11.0,1.0,3.0,0.0,Winter,2.0,2.57949,14.0371,936.656,1498.65,6.01993,42.0291,12.8254,1.21172,3.97085,1.0,21.0352,14.974,39.4497,15.4107,27.0552,,,Fall,2.34,Fall,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Fall,46.0,64.0,Summer,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,00105258,Summer,10,1,Fall,71.0,Fall,16.648696,56.5,75.6,,65.0,94.0,117.0,Fall,5.0,7.0,33.0,Fall,20.0,1.0,10.2,1.0,14.7,2.0,7.0,1.0,10.0,1.0,10.0,1.0,5.0,0.0,,,,,,,,,,,,,,,,,,,,Summer,2.17,Fall,5.0,2.0,2.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,2.0,2.0,1.0,1.0,28.0,Fall,38.0,54.0,Summer,2.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,00115b9f,Winter,9,0,Fall,71.0,Summer,18.292347,56.0,81.6,,60.0,97.0,117.0,Summer,6.0,9.0,37.0,Summer,18.0,1.0,,,,,5.0,0.0,7.0,0.0,7.0,0.0,7.0,1.0,Summer,3.0,3.84191,18.2943,1131.43,1923.44,15.5925,62.7757,14.074,4.22033,18.8243,2.0,30.4041,16.779,58.9338,26.4798,45.9966,,,Winter,2.451,Summer,4.0,2.0,4.0,0.0,5.0,1.0,0.0,3.0,2.0,2.0,3.0,0.0,3.0,0.0,0.0,3.0,4.0,3.0,4.0,1.0,44.0,Summer,31.0,45.0,Winter,0.0,1.0,0.0,3.35869,0.0,5.337131,1e-06,0.0,0.0,2.398546,4.813291,4.990461,1.405737,8.475198,0.0,0.0,6.87957,1.935769,8.775964,0.0,3.882035,6.520278,18.483984,7.693,0.673259,10.639287,0.0,5.649461,6.437056,8.081806,14.662292,6.477272,2.085419,17.302296,0.0,10.002832,7.267302,1.532843,5.770669,1.359145,0.0,0.0,0.0,0.0,4.486127,14.495865,4.542089,4.867955,0.0,0.0,3.569353,8.330843,0.0,0.0,2.021903,3.556332,6.78969,7.568647,0.0,4.528473
4,001f3379,Spring,13,1,Winter,50.0,Summer,22.279952,59.5,112.2,,60.0,73.0,102.0,,,,,Summer,12.0,0.0,16.5,2.0,17.9,2.0,6.0,0.0,10.0,1.0,11.0,1.0,8.0,0.0,Summer,2.0,4.33036,30.1865,1330.97,1996.45,30.2124,84.0285,16.6877,13.4988,67.9715,2.0,32.9141,20.902,79.6982,35.3804,63.1265,,,Spring,4.11,Summer,3.0,3.0,3.0,0.0,2.0,1.0,0.0,2.0,2.0,1.0,0.0,1.0,3.0,3.0,2.0,1.0,3.0,1.0,2.0,1.0,34.0,Summer,40.0,56.0,Spring,0.0,1.0,5.953392,0.0,7.549816,0.0,7.153245,7.322783,0.0,2.630853,16.680229,6.489077,3.026922,12.872902,0.0,6.549987,0.0,5.223313,0.0,6.083334,0.0,0.0,8.64847,11.530282,0.0,10.4684,13.598515,0.0,0.0,3.080362,3.346899,0.07572,0.893668,3.969584,0.0,15.206186,6.006083,3.238015,0.0,17.711552,0.0,0.0,0.0,9.211094,3.892674,5.166663,4.312238,19.678722,0.0,0.098976,8.960603,7.441569,8.930751,0.0,0.0,9.105422,3.539766,4.188061,0.0,0.0


In [32]:
train_one_hot = train_nonan.copy()
test_one_hot = test_merge.copy()

In [33]:
for col in cat_cols:
    if 'PCIAT' not in col:
        train_sr = train_one_hot[col].fillna('Missing').apply(str)        
        test_sr = test_one_hot[col].fillna('Missing').apply(str)
        train_one_hot[col] = train_sr
        test_one_hot[col] = test_sr
        concat_sr = pd.concat([train_sr, test_sr])
        unique_list = sorted(concat_sr.unique().tolist())

        for v in unique_list:
            new_col_name = col + '_' + v
            train_one_hot[new_col_name] = pd.Series(train_one_hot[col] == v).astype(int)
            test_one_hot[new_col_name] = pd.Series(test_one_hot[col] == v).astype(int)
    else:
        train_sr = train_one_hot[col].fillna('Missing').apply(str)
        train_one_hot[col] = train_sr
        unique_list = sorted(train_sr.unique().tolist())

        for v in unique_list:
            new_col_name = col + '_' + v
            train_one_hot[new_col_name] = pd.Series(train_one_hot[col] == v).astype(int)

train_one_hot.drop(cat_cols, axis=1, inplace=True)
test_one_hot.drop([col for col in cat_cols if 'PCIAT' not in col], axis=1, inplace=True)

In [34]:
print(train_one_hot.shape)
train_one_hot.head()

(2736, 183)


Unnamed: 0,id,Basic_Demos-Age,CGAS-CGAS_Score,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Max_Stage,Fitness_Endurance-Time_Mins,Fitness_Endurance-Time_Sec,FGC-FGC_CU,FGC-FGC_CU_Zone,FGC-FGC_GSND,FGC-FGC_GSND_Zone,FGC-FGC_GSD,FGC-FGC_GSD_Zone,FGC-FGC_PU,FGC-FGC_PU_Zone,FGC-FGC_SRL,FGC-FGC_SRL_Zone,FGC-FGC_SRR,FGC-FGC_SRR_Zone,FGC-FGC_TL,FGC-FGC_TL_Zone,BIA-BIA_Activity_Level_num,BIA-BIA_BMC,BIA-BIA_BMI,BIA-BIA_BMR,BIA-BIA_DEE,BIA-BIA_ECW,BIA-BIA_FFM,BIA-BIA_FFMI,BIA-BIA_FMI,BIA-BIA_Fat,BIA-BIA_Frame_num,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,PAQ_A-PAQ_A_Total,PAQ_C-PAQ_C_Total,PCIAT-PCIAT_01,PCIAT-PCIAT_02,PCIAT-PCIAT_03,PCIAT-PCIAT_04,PCIAT-PCIAT_05,PCIAT-PCIAT_06,PCIAT-PCIAT_07,PCIAT-PCIAT_08,PCIAT-PCIAT_09,PCIAT-PCIAT_10,PCIAT-PCIAT_11,PCIAT-PCIAT_12,PCIAT-PCIAT_13,PCIAT-PCIAT_14,PCIAT-PCIAT_15,PCIAT-PCIAT_16,PCIAT-PCIAT_17,PCIAT-PCIAT_18,PCIAT-PCIAT_19,PCIAT-PCIAT_20,PCIAT-PCIAT_Total,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-computerinternet_hoursday,sii,Enc_1,Enc_2,Enc_3,Enc_4,Enc_5,Enc_6,Enc_7,Enc_8,Enc_9,Enc_10,Enc_11,Enc_12,Enc_13,Enc_14,Enc_15,Enc_16,Enc_17,Enc_18,Enc_19,Enc_20,Enc_21,Enc_22,Enc_23,Enc_25,Enc_26,Enc_27,Enc_28,Enc_29,Enc_31,Enc_32,Enc_34,Enc_35,Enc_36,Enc_37,Enc_38,Enc_39,Enc_40,Enc_41,Enc_42,Enc_43,Enc_45,Enc_47,Enc_48,Enc_49,Enc_50,Enc_51,Enc_52,Enc_53,Enc_54,Enc_55,Enc_56,Enc_58,Enc_59,Enc_60,Enc_61,Enc_62,Enc_63,Enc_64,PAQ_A-Season_Fall,PAQ_A-Season_Missing,PAQ_A-Season_Spring,PAQ_A-Season_Summer,PAQ_A-Season_Winter,CGAS-Season_Fall,CGAS-Season_Missing,CGAS-Season_Spring,CGAS-Season_Summer,CGAS-Season_Winter,Basic_Demos-Sex_0,Basic_Demos-Sex_1,PCIAT-Season_Fall,PCIAT-Season_Spring,PCIAT-Season_Summer,PCIAT-Season_Winter,PreInt_EduHx-Season_Fall,PreInt_EduHx-Season_Missing,PreInt_EduHx-Season_Spring,PreInt_EduHx-Season_Summer,PreInt_EduHx-Season_Winter,PAQ_C-Season_Fall,PAQ_C-Season_Missing,PAQ_C-Season_Spring,PAQ_C-Season_Summer,PAQ_C-Season_Winter,FGC-Season_Fall,FGC-Season_Missing,FGC-Season_Spring,FGC-Season_Summer,FGC-Season_Winter,Fitness_Endurance-Season_Fall,Fitness_Endurance-Season_Missing,Fitness_Endurance-Season_Spring,Fitness_Endurance-Season_Summer,Fitness_Endurance-Season_Winter,SDS-Season_Fall,SDS-Season_Missing,SDS-Season_Spring,SDS-Season_Summer,SDS-Season_Winter,BIA-Season_Fall,BIA-Season_Missing,BIA-Season_Spring,BIA-Season_Summer,BIA-Season_Winter,Basic_Demos-Enroll_Season_Fall,Basic_Demos-Enroll_Season_Spring,Basic_Demos-Enroll_Season_Summer,Basic_Demos-Enroll_Season_Winter,Physical-Season_Fall,Physical-Season_Missing,Physical-Season_Spring,Physical-Season_Summer,Physical-Season_Winter
0,00008ff9,5,51.0,16.877316,46.0,50.8,,,,,,,,0.0,0.0,,,,,0.0,0.0,7.0,0.0,6.0,0.0,6.0,1.0,2.0,2.66855,16.8792,932.498,1492.0,8.25598,41.5862,13.8177,3.06143,9.21377,1.0,24.4349,8.89536,38.9177,19.5413,32.6909,,,5.0,4.0,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,4.0,0.0,4.0,4.0,4.0,4.0,4.0,4.0,2.0,4.0,55.0,,,3.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,1,0,0,0,0,0,0,0,1,1,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0
1,000fd460,9,,14.03559,48.0,46.0,22.0,75.0,70.0,122.0,,,,3.0,0.0,,,,,5.0,0.0,11.0,1.0,11.0,1.0,3.0,0.0,2.0,2.57949,14.0371,936.656,1498.65,6.01993,42.0291,12.8254,1.21172,3.97085,1.0,21.0352,14.974,39.4497,15.4107,27.0552,,2.34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,46.0,64.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0
2,00105258,10,71.0,16.648696,56.5,75.6,,65.0,94.0,117.0,5.0,7.0,33.0,20.0,1.0,10.2,1.0,14.7,2.0,7.0,1.0,10.0,1.0,10.0,1.0,5.0,0.0,,,,,,,,,,,,,,,,,,2.17,5.0,2.0,2.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,2.0,2.0,1.0,1.0,28.0,38.0,54.0,2.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,1,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0
3,00115b9f,9,71.0,18.292347,56.0,81.6,,60.0,97.0,117.0,6.0,9.0,37.0,18.0,1.0,,,,,5.0,0.0,7.0,0.0,7.0,0.0,7.0,1.0,3.0,3.84191,18.2943,1131.43,1923.44,15.5925,62.7757,14.074,4.22033,18.8243,2.0,30.4041,16.779,58.9338,26.4798,45.9966,,2.451,4.0,2.0,4.0,0.0,5.0,1.0,0.0,3.0,2.0,2.0,3.0,0.0,3.0,0.0,0.0,3.0,4.0,3.0,4.0,1.0,44.0,31.0,45.0,0.0,1.0,0.0,3.35869,0.0,5.337131,1e-06,0.0,0.0,2.398546,4.813291,4.990461,1.405737,8.475198,0.0,0.0,6.87957,1.935769,8.775964,0.0,3.882035,6.520278,18.483984,7.693,0.673259,10.639287,0.0,5.649461,6.437056,8.081806,14.662292,6.477272,2.085419,17.302296,0.0,10.002832,7.267302,1.532843,5.770669,1.359145,0.0,0.0,0.0,0.0,4.486127,14.495865,4.542089,4.867955,0.0,0.0,3.569353,8.330843,0.0,0.0,2.021903,3.556332,6.78969,7.568647,0.0,4.528473,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0
4,001f3379,13,50.0,22.279952,59.5,112.2,,60.0,73.0,102.0,,,,12.0,0.0,16.5,2.0,17.9,2.0,6.0,0.0,10.0,1.0,11.0,1.0,8.0,0.0,2.0,4.33036,30.1865,1330.97,1996.45,30.2124,84.0285,16.6877,13.4988,67.9715,2.0,32.9141,20.902,79.6982,35.3804,63.1265,,4.11,3.0,3.0,3.0,0.0,2.0,1.0,0.0,2.0,2.0,1.0,0.0,1.0,3.0,3.0,2.0,1.0,3.0,1.0,2.0,1.0,34.0,40.0,56.0,0.0,1.0,5.953392,0.0,7.549816,0.0,7.153245,7.322783,0.0,2.630853,16.680229,6.489077,3.026922,12.872902,0.0,6.549987,0.0,5.223313,0.0,6.083334,0.0,0.0,8.64847,11.530282,0.0,10.4684,13.598515,0.0,0.0,3.080362,3.346899,0.07572,0.893668,3.969584,0.0,15.206186,6.006083,3.238015,0.0,17.711552,0.0,0.0,0.0,9.211094,3.892674,5.166663,4.312238,19.678722,0.0,0.098976,8.960603,7.441569,8.930751,0.0,0.0,9.105422,3.539766,4.188061,0.0,0.0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0


In [35]:
print(test_one_hot.shape)
test_one_hot.head()

(20, 157)


Unnamed: 0,id,Basic_Demos-Age,CGAS-CGAS_Score,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Max_Stage,Fitness_Endurance-Time_Mins,Fitness_Endurance-Time_Sec,FGC-FGC_CU,FGC-FGC_CU_Zone,FGC-FGC_GSND,FGC-FGC_GSND_Zone,FGC-FGC_GSD,FGC-FGC_GSD_Zone,FGC-FGC_PU,FGC-FGC_PU_Zone,FGC-FGC_SRL,FGC-FGC_SRL_Zone,FGC-FGC_SRR,FGC-FGC_SRR_Zone,FGC-FGC_TL,FGC-FGC_TL_Zone,BIA-BIA_Activity_Level_num,BIA-BIA_BMC,BIA-BIA_BMI,BIA-BIA_BMR,BIA-BIA_DEE,BIA-BIA_ECW,BIA-BIA_FFM,BIA-BIA_FFMI,BIA-BIA_FMI,BIA-BIA_Fat,BIA-BIA_Frame_num,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,PAQ_A-PAQ_A_Total,PAQ_C-PAQ_C_Total,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-computerinternet_hoursday,Enc_1,Enc_2,Enc_3,Enc_4,Enc_5,Enc_6,Enc_7,Enc_8,Enc_9,Enc_10,Enc_11,Enc_12,Enc_13,Enc_14,Enc_15,Enc_16,Enc_17,Enc_18,Enc_19,Enc_20,Enc_21,Enc_22,Enc_23,Enc_25,Enc_26,Enc_27,Enc_28,Enc_29,Enc_31,Enc_32,Enc_34,Enc_35,Enc_36,Enc_37,Enc_38,Enc_39,Enc_40,Enc_41,Enc_42,Enc_43,Enc_45,Enc_47,Enc_48,Enc_49,Enc_50,Enc_51,Enc_52,Enc_53,Enc_54,Enc_55,Enc_56,Enc_58,Enc_59,Enc_60,Enc_61,Enc_62,Enc_63,Enc_64,PAQ_A-Season_Fall,PAQ_A-Season_Missing,PAQ_A-Season_Spring,PAQ_A-Season_Summer,PAQ_A-Season_Winter,CGAS-Season_Fall,CGAS-Season_Missing,CGAS-Season_Spring,CGAS-Season_Summer,CGAS-Season_Winter,Basic_Demos-Sex_0,Basic_Demos-Sex_1,PreInt_EduHx-Season_Fall,PreInt_EduHx-Season_Missing,PreInt_EduHx-Season_Spring,PreInt_EduHx-Season_Summer,PreInt_EduHx-Season_Winter,PAQ_C-Season_Fall,PAQ_C-Season_Missing,PAQ_C-Season_Spring,PAQ_C-Season_Summer,PAQ_C-Season_Winter,FGC-Season_Fall,FGC-Season_Missing,FGC-Season_Spring,FGC-Season_Summer,FGC-Season_Winter,Fitness_Endurance-Season_Fall,Fitness_Endurance-Season_Missing,Fitness_Endurance-Season_Spring,Fitness_Endurance-Season_Summer,Fitness_Endurance-Season_Winter,SDS-Season_Fall,SDS-Season_Missing,SDS-Season_Spring,SDS-Season_Summer,SDS-Season_Winter,BIA-Season_Fall,BIA-Season_Missing,BIA-Season_Spring,BIA-Season_Summer,BIA-Season_Winter,Basic_Demos-Enroll_Season_Fall,Basic_Demos-Enroll_Season_Spring,Basic_Demos-Enroll_Season_Summer,Basic_Demos-Enroll_Season_Winter,Physical-Season_Fall,Physical-Season_Missing,Physical-Season_Spring,Physical-Season_Summer,Physical-Season_Winter
0,00008ff9,5,51.0,16.877316,46.0,50.8,,,,,,,,0.0,0.0,,,,,0.0,0.0,7.0,0.0,6.0,0.0,6.0,1.0,2.0,2.66855,16.8792,932.498,1492.0,8.25598,41.5862,13.8177,3.06143,9.21377,1.0,24.4349,8.89536,38.9177,19.5413,32.6909,,,,,3.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,1,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0
1,000fd460,9,,14.03559,48.0,46.0,22.0,75.0,70.0,122.0,,,,3.0,0.0,,,,,5.0,0.0,11.0,1.0,11.0,1.0,3.0,0.0,2.0,2.57949,14.0371,936.656,1498.65,6.01993,42.0291,12.8254,1.21172,3.97085,1.0,21.0352,14.974,39.4497,15.4107,27.0552,,2.34,46.0,64.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0
2,00105258,10,71.0,16.648696,56.5,75.6,,65.0,94.0,117.0,5.0,7.0,33.0,20.0,1.0,10.2,1.0,14.7,2.0,7.0,1.0,10.0,1.0,10.0,1.0,5.0,0.0,,,,,,,,,,,,,,,,,,2.17,38.0,54.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0
3,00115b9f,9,71.0,18.292347,56.0,81.6,,60.0,97.0,117.0,6.0,9.0,37.0,18.0,1.0,,,,,5.0,0.0,7.0,0.0,7.0,0.0,7.0,1.0,3.0,3.84191,18.2943,1131.43,1923.44,15.5925,62.7757,14.074,4.22033,18.8243,2.0,30.4041,16.779,58.9338,26.4798,45.9966,,2.451,31.0,45.0,0.0,0.0,3.35869,0.0,5.337131,1e-06,0.0,0.0,2.398546,4.813291,4.990461,1.405737,8.475198,0.0,0.0,6.87957,1.935769,8.775964,0.0,3.882035,6.520278,18.483984,7.693,0.673259,10.639287,0.0,5.649461,6.437056,8.081806,14.662292,6.477272,2.085419,17.302296,0.0,10.002832,7.267302,1.532843,5.770669,1.359145,0.0,0.0,0.0,0.0,4.486127,14.495865,4.542089,4.867955,0.0,0.0,3.569353,8.330843,0.0,0.0,2.021903,3.556332,6.78969,7.568647,0.0,4.528473,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0
4,0016bb22,18,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.04,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,1,0,0,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0


In [37]:
train_one_hot.drop('id', axis=1, inplace=True)
test_one_hot.drop('id', axis=1, inplace=True)

In [46]:
train = train_one_hot.copy()
test = test_one_hot.copy()

In [50]:
print(train.shape)
train.head()

(2736, 182)


Unnamed: 0,Basic_Demos-Age,CGAS-CGAS_Score,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Max_Stage,Fitness_Endurance-Time_Mins,Fitness_Endurance-Time_Sec,FGC-FGC_CU,FGC-FGC_CU_Zone,FGC-FGC_GSND,FGC-FGC_GSND_Zone,FGC-FGC_GSD,FGC-FGC_GSD_Zone,FGC-FGC_PU,FGC-FGC_PU_Zone,FGC-FGC_SRL,FGC-FGC_SRL_Zone,FGC-FGC_SRR,FGC-FGC_SRR_Zone,FGC-FGC_TL,FGC-FGC_TL_Zone,BIA-BIA_Activity_Level_num,BIA-BIA_BMC,BIA-BIA_BMI,BIA-BIA_BMR,BIA-BIA_DEE,BIA-BIA_ECW,BIA-BIA_FFM,BIA-BIA_FFMI,BIA-BIA_FMI,BIA-BIA_Fat,BIA-BIA_Frame_num,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,PAQ_A-PAQ_A_Total,PAQ_C-PAQ_C_Total,PCIAT-PCIAT_01,PCIAT-PCIAT_02,PCIAT-PCIAT_03,PCIAT-PCIAT_04,PCIAT-PCIAT_05,PCIAT-PCIAT_06,PCIAT-PCIAT_07,PCIAT-PCIAT_08,PCIAT-PCIAT_09,PCIAT-PCIAT_10,PCIAT-PCIAT_11,PCIAT-PCIAT_12,PCIAT-PCIAT_13,PCIAT-PCIAT_14,PCIAT-PCIAT_15,PCIAT-PCIAT_16,PCIAT-PCIAT_17,PCIAT-PCIAT_18,PCIAT-PCIAT_19,PCIAT-PCIAT_20,PCIAT-PCIAT_Total,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-computerinternet_hoursday,sii,Enc_1,Enc_2,Enc_3,Enc_4,Enc_5,Enc_6,Enc_7,Enc_8,Enc_9,Enc_10,Enc_11,Enc_12,Enc_13,Enc_14,Enc_15,Enc_16,Enc_17,Enc_18,Enc_19,Enc_20,Enc_21,Enc_22,Enc_23,Enc_25,Enc_26,Enc_27,Enc_28,Enc_29,Enc_31,Enc_32,Enc_34,Enc_35,Enc_36,Enc_37,Enc_38,Enc_39,Enc_40,Enc_41,Enc_42,Enc_43,Enc_45,Enc_47,Enc_48,Enc_49,Enc_50,Enc_51,Enc_52,Enc_53,Enc_54,Enc_55,Enc_56,Enc_58,Enc_59,Enc_60,Enc_61,Enc_62,Enc_63,Enc_64,PAQ_A-Season_Fall,PAQ_A-Season_Missing,PAQ_A-Season_Spring,PAQ_A-Season_Summer,PAQ_A-Season_Winter,CGAS-Season_Fall,CGAS-Season_Missing,CGAS-Season_Spring,CGAS-Season_Summer,CGAS-Season_Winter,Basic_Demos-Sex_0,Basic_Demos-Sex_1,PCIAT-Season_Fall,PCIAT-Season_Spring,PCIAT-Season_Summer,PCIAT-Season_Winter,PreInt_EduHx-Season_Fall,PreInt_EduHx-Season_Missing,PreInt_EduHx-Season_Spring,PreInt_EduHx-Season_Summer,PreInt_EduHx-Season_Winter,PAQ_C-Season_Fall,PAQ_C-Season_Missing,PAQ_C-Season_Spring,PAQ_C-Season_Summer,PAQ_C-Season_Winter,FGC-Season_Fall,FGC-Season_Missing,FGC-Season_Spring,FGC-Season_Summer,FGC-Season_Winter,Fitness_Endurance-Season_Fall,Fitness_Endurance-Season_Missing,Fitness_Endurance-Season_Spring,Fitness_Endurance-Season_Summer,Fitness_Endurance-Season_Winter,SDS-Season_Fall,SDS-Season_Missing,SDS-Season_Spring,SDS-Season_Summer,SDS-Season_Winter,BIA-Season_Fall,BIA-Season_Missing,BIA-Season_Spring,BIA-Season_Summer,BIA-Season_Winter,Basic_Demos-Enroll_Season_Fall,Basic_Demos-Enroll_Season_Spring,Basic_Demos-Enroll_Season_Summer,Basic_Demos-Enroll_Season_Winter,Physical-Season_Fall,Physical-Season_Missing,Physical-Season_Spring,Physical-Season_Summer,Physical-Season_Winter
0,5,51.0,16.877316,46.0,50.8,,,,,,,,0.0,0.0,,,,,0.0,0.0,7.0,0.0,6.0,0.0,6.0,1.0,2.0,2.66855,16.8792,932.498,1492.0,8.25598,41.5862,13.8177,3.06143,9.21377,1.0,24.4349,8.89536,38.9177,19.5413,32.6909,,,5.0,4.0,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,4.0,0.0,4.0,4.0,4.0,4.0,4.0,4.0,2.0,4.0,55.0,,,3.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,1,0,0,0,0,0,0,0,1,1,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0
1,9,,14.03559,48.0,46.0,22.0,75.0,70.0,122.0,,,,3.0,0.0,,,,,5.0,0.0,11.0,1.0,11.0,1.0,3.0,0.0,2.0,2.57949,14.0371,936.656,1498.65,6.01993,42.0291,12.8254,1.21172,3.97085,1.0,21.0352,14.974,39.4497,15.4107,27.0552,,2.34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,46.0,64.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0
2,10,71.0,16.648696,56.5,75.6,,65.0,94.0,117.0,5.0,7.0,33.0,20.0,1.0,10.2,1.0,14.7,2.0,7.0,1.0,10.0,1.0,10.0,1.0,5.0,0.0,,,,,,,,,,,,,,,,,,2.17,5.0,2.0,2.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,2.0,2.0,1.0,1.0,28.0,38.0,54.0,2.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,1,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0
3,9,71.0,18.292347,56.0,81.6,,60.0,97.0,117.0,6.0,9.0,37.0,18.0,1.0,,,,,5.0,0.0,7.0,0.0,7.0,0.0,7.0,1.0,3.0,3.84191,18.2943,1131.43,1923.44,15.5925,62.7757,14.074,4.22033,18.8243,2.0,30.4041,16.779,58.9338,26.4798,45.9966,,2.451,4.0,2.0,4.0,0.0,5.0,1.0,0.0,3.0,2.0,2.0,3.0,0.0,3.0,0.0,0.0,3.0,4.0,3.0,4.0,1.0,44.0,31.0,45.0,0.0,1.0,0.0,3.35869,0.0,5.337131,1e-06,0.0,0.0,2.398546,4.813291,4.990461,1.405737,8.475198,0.0,0.0,6.87957,1.935769,8.775964,0.0,3.882035,6.520278,18.483984,7.693,0.673259,10.639287,0.0,5.649461,6.437056,8.081806,14.662292,6.477272,2.085419,17.302296,0.0,10.002832,7.267302,1.532843,5.770669,1.359145,0.0,0.0,0.0,0.0,4.486127,14.495865,4.542089,4.867955,0.0,0.0,3.569353,8.330843,0.0,0.0,2.021903,3.556332,6.78969,7.568647,0.0,4.528473,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0
4,13,50.0,22.279952,59.5,112.2,,60.0,73.0,102.0,,,,12.0,0.0,16.5,2.0,17.9,2.0,6.0,0.0,10.0,1.0,11.0,1.0,8.0,0.0,2.0,4.33036,30.1865,1330.97,1996.45,30.2124,84.0285,16.6877,13.4988,67.9715,2.0,32.9141,20.902,79.6982,35.3804,63.1265,,4.11,3.0,3.0,3.0,0.0,2.0,1.0,0.0,2.0,2.0,1.0,0.0,1.0,3.0,3.0,2.0,1.0,3.0,1.0,2.0,1.0,34.0,40.0,56.0,0.0,1.0,5.953392,0.0,7.549816,0.0,7.153245,7.322783,0.0,2.630853,16.680229,6.489077,3.026922,12.872902,0.0,6.549987,0.0,5.223313,0.0,6.083334,0.0,0.0,8.64847,11.530282,0.0,10.4684,13.598515,0.0,0.0,3.080362,3.346899,0.07572,0.893668,3.969584,0.0,15.206186,6.006083,3.238015,0.0,17.711552,0.0,0.0,0.0,9.211094,3.892674,5.166663,4.312238,19.678722,0.0,0.098976,8.960603,7.441569,8.930751,0.0,0.0,9.105422,3.539766,4.188061,0.0,0.0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0


In [51]:
print(test.shape)
test.head()

(20, 156)


Unnamed: 0,Basic_Demos-Age,CGAS-CGAS_Score,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Max_Stage,Fitness_Endurance-Time_Mins,Fitness_Endurance-Time_Sec,FGC-FGC_CU,FGC-FGC_CU_Zone,FGC-FGC_GSND,FGC-FGC_GSND_Zone,FGC-FGC_GSD,FGC-FGC_GSD_Zone,FGC-FGC_PU,FGC-FGC_PU_Zone,FGC-FGC_SRL,FGC-FGC_SRL_Zone,FGC-FGC_SRR,FGC-FGC_SRR_Zone,FGC-FGC_TL,FGC-FGC_TL_Zone,BIA-BIA_Activity_Level_num,BIA-BIA_BMC,BIA-BIA_BMI,BIA-BIA_BMR,BIA-BIA_DEE,BIA-BIA_ECW,BIA-BIA_FFM,BIA-BIA_FFMI,BIA-BIA_FMI,BIA-BIA_Fat,BIA-BIA_Frame_num,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,PAQ_A-PAQ_A_Total,PAQ_C-PAQ_C_Total,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-computerinternet_hoursday,Enc_1,Enc_2,Enc_3,Enc_4,Enc_5,Enc_6,Enc_7,Enc_8,Enc_9,Enc_10,Enc_11,Enc_12,Enc_13,Enc_14,Enc_15,Enc_16,Enc_17,Enc_18,Enc_19,Enc_20,Enc_21,Enc_22,Enc_23,Enc_25,Enc_26,Enc_27,Enc_28,Enc_29,Enc_31,Enc_32,Enc_34,Enc_35,Enc_36,Enc_37,Enc_38,Enc_39,Enc_40,Enc_41,Enc_42,Enc_43,Enc_45,Enc_47,Enc_48,Enc_49,Enc_50,Enc_51,Enc_52,Enc_53,Enc_54,Enc_55,Enc_56,Enc_58,Enc_59,Enc_60,Enc_61,Enc_62,Enc_63,Enc_64,PAQ_A-Season_Fall,PAQ_A-Season_Missing,PAQ_A-Season_Spring,PAQ_A-Season_Summer,PAQ_A-Season_Winter,CGAS-Season_Fall,CGAS-Season_Missing,CGAS-Season_Spring,CGAS-Season_Summer,CGAS-Season_Winter,Basic_Demos-Sex_0,Basic_Demos-Sex_1,PreInt_EduHx-Season_Fall,PreInt_EduHx-Season_Missing,PreInt_EduHx-Season_Spring,PreInt_EduHx-Season_Summer,PreInt_EduHx-Season_Winter,PAQ_C-Season_Fall,PAQ_C-Season_Missing,PAQ_C-Season_Spring,PAQ_C-Season_Summer,PAQ_C-Season_Winter,FGC-Season_Fall,FGC-Season_Missing,FGC-Season_Spring,FGC-Season_Summer,FGC-Season_Winter,Fitness_Endurance-Season_Fall,Fitness_Endurance-Season_Missing,Fitness_Endurance-Season_Spring,Fitness_Endurance-Season_Summer,Fitness_Endurance-Season_Winter,SDS-Season_Fall,SDS-Season_Missing,SDS-Season_Spring,SDS-Season_Summer,SDS-Season_Winter,BIA-Season_Fall,BIA-Season_Missing,BIA-Season_Spring,BIA-Season_Summer,BIA-Season_Winter,Basic_Demos-Enroll_Season_Fall,Basic_Demos-Enroll_Season_Spring,Basic_Demos-Enroll_Season_Summer,Basic_Demos-Enroll_Season_Winter,Physical-Season_Fall,Physical-Season_Missing,Physical-Season_Spring,Physical-Season_Summer,Physical-Season_Winter
0,5,51.0,16.877316,46.0,50.8,,,,,,,,0.0,0.0,,,,,0.0,0.0,7.0,0.0,6.0,0.0,6.0,1.0,2.0,2.66855,16.8792,932.498,1492.0,8.25598,41.5862,13.8177,3.06143,9.21377,1.0,24.4349,8.89536,38.9177,19.5413,32.6909,,,,,3.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,1,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0
1,9,,14.03559,48.0,46.0,22.0,75.0,70.0,122.0,,,,3.0,0.0,,,,,5.0,0.0,11.0,1.0,11.0,1.0,3.0,0.0,2.0,2.57949,14.0371,936.656,1498.65,6.01993,42.0291,12.8254,1.21172,3.97085,1.0,21.0352,14.974,39.4497,15.4107,27.0552,,2.34,46.0,64.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0
2,10,71.0,16.648696,56.5,75.6,,65.0,94.0,117.0,5.0,7.0,33.0,20.0,1.0,10.2,1.0,14.7,2.0,7.0,1.0,10.0,1.0,10.0,1.0,5.0,0.0,,,,,,,,,,,,,,,,,,2.17,38.0,54.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0
3,9,71.0,18.292347,56.0,81.6,,60.0,97.0,117.0,6.0,9.0,37.0,18.0,1.0,,,,,5.0,0.0,7.0,0.0,7.0,0.0,7.0,1.0,3.0,3.84191,18.2943,1131.43,1923.44,15.5925,62.7757,14.074,4.22033,18.8243,2.0,30.4041,16.779,58.9338,26.4798,45.9966,,2.451,31.0,45.0,0.0,0.0,3.35869,0.0,5.337131,1e-06,0.0,0.0,2.398546,4.813291,4.990461,1.405737,8.475198,0.0,0.0,6.87957,1.935769,8.775964,0.0,3.882035,6.520278,18.483984,7.693,0.673259,10.639287,0.0,5.649461,6.437056,8.081806,14.662292,6.477272,2.085419,17.302296,0.0,10.002832,7.267302,1.532843,5.770669,1.359145,0.0,0.0,0.0,0.0,4.486127,14.495865,4.542089,4.867955,0.0,0.0,3.569353,8.330843,0.0,0.0,2.021903,3.556332,6.78969,7.568647,0.0,4.528473,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0
4,18,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.04,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,1,0,0,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0


In [47]:
imputer = KNNImputer(n_neighbors=5)
imputed_data = imputer.fit_transform(train)

In [48]:
train_imputed = pd.DataFrame(imputed_data, columns=train.columns)
print(train_imputed.shape)
train_imputed.head()

(2736, 182)


Unnamed: 0,Basic_Demos-Age,CGAS-CGAS_Score,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Max_Stage,Fitness_Endurance-Time_Mins,Fitness_Endurance-Time_Sec,FGC-FGC_CU,FGC-FGC_CU_Zone,FGC-FGC_GSND,FGC-FGC_GSND_Zone,FGC-FGC_GSD,FGC-FGC_GSD_Zone,FGC-FGC_PU,FGC-FGC_PU_Zone,FGC-FGC_SRL,FGC-FGC_SRL_Zone,FGC-FGC_SRR,FGC-FGC_SRR_Zone,FGC-FGC_TL,FGC-FGC_TL_Zone,BIA-BIA_Activity_Level_num,BIA-BIA_BMC,BIA-BIA_BMI,BIA-BIA_BMR,BIA-BIA_DEE,BIA-BIA_ECW,BIA-BIA_FFM,BIA-BIA_FFMI,BIA-BIA_FMI,BIA-BIA_Fat,BIA-BIA_Frame_num,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,PAQ_A-PAQ_A_Total,PAQ_C-PAQ_C_Total,PCIAT-PCIAT_01,PCIAT-PCIAT_02,PCIAT-PCIAT_03,PCIAT-PCIAT_04,PCIAT-PCIAT_05,PCIAT-PCIAT_06,PCIAT-PCIAT_07,PCIAT-PCIAT_08,PCIAT-PCIAT_09,PCIAT-PCIAT_10,PCIAT-PCIAT_11,PCIAT-PCIAT_12,PCIAT-PCIAT_13,PCIAT-PCIAT_14,PCIAT-PCIAT_15,PCIAT-PCIAT_16,PCIAT-PCIAT_17,PCIAT-PCIAT_18,PCIAT-PCIAT_19,PCIAT-PCIAT_20,PCIAT-PCIAT_Total,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-computerinternet_hoursday,sii,Enc_1,Enc_2,Enc_3,Enc_4,Enc_5,Enc_6,Enc_7,Enc_8,Enc_9,Enc_10,Enc_11,Enc_12,Enc_13,Enc_14,Enc_15,Enc_16,Enc_17,Enc_18,Enc_19,Enc_20,Enc_21,Enc_22,Enc_23,Enc_25,Enc_26,Enc_27,Enc_28,Enc_29,Enc_31,Enc_32,Enc_34,Enc_35,Enc_36,Enc_37,Enc_38,Enc_39,Enc_40,Enc_41,Enc_42,Enc_43,Enc_45,Enc_47,Enc_48,Enc_49,Enc_50,Enc_51,Enc_52,Enc_53,Enc_54,Enc_55,Enc_56,Enc_58,Enc_59,Enc_60,Enc_61,Enc_62,Enc_63,Enc_64,PAQ_A-Season_Fall,PAQ_A-Season_Missing,PAQ_A-Season_Spring,PAQ_A-Season_Summer,PAQ_A-Season_Winter,CGAS-Season_Fall,CGAS-Season_Missing,CGAS-Season_Spring,CGAS-Season_Summer,CGAS-Season_Winter,Basic_Demos-Sex_0,Basic_Demos-Sex_1,PCIAT-Season_Fall,PCIAT-Season_Spring,PCIAT-Season_Summer,PCIAT-Season_Winter,PreInt_EduHx-Season_Fall,PreInt_EduHx-Season_Missing,PreInt_EduHx-Season_Spring,PreInt_EduHx-Season_Summer,PreInt_EduHx-Season_Winter,PAQ_C-Season_Fall,PAQ_C-Season_Missing,PAQ_C-Season_Spring,PAQ_C-Season_Summer,PAQ_C-Season_Winter,FGC-Season_Fall,FGC-Season_Missing,FGC-Season_Spring,FGC-Season_Summer,FGC-Season_Winter,Fitness_Endurance-Season_Fall,Fitness_Endurance-Season_Missing,Fitness_Endurance-Season_Spring,Fitness_Endurance-Season_Summer,Fitness_Endurance-Season_Winter,SDS-Season_Fall,SDS-Season_Missing,SDS-Season_Spring,SDS-Season_Summer,SDS-Season_Winter,BIA-Season_Fall,BIA-Season_Missing,BIA-Season_Spring,BIA-Season_Summer,BIA-Season_Winter,Basic_Demos-Enroll_Season_Fall,Basic_Demos-Enroll_Season_Spring,Basic_Demos-Enroll_Season_Summer,Basic_Demos-Enroll_Season_Winter,Physical-Season_Fall,Physical-Season_Missing,Physical-Season_Spring,Physical-Season_Summer,Physical-Season_Winter
0,5.0,51.0,16.877316,46.0,50.8,24.2,66.8,79.4,109.6,4.6,7.4,16.8,0.0,0.0,14.44,1.2,15.06,1.4,0.0,0.0,7.0,0.0,6.0,0.0,6.0,1.0,2.0,2.66855,16.8792,932.498,1492.0,8.25598,41.5862,13.8177,3.06143,9.21377,1.0,24.4349,8.89536,38.9177,19.5413,32.6909,1.912,2.17,5.0,4.0,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,4.0,0.0,4.0,4.0,4.0,4.0,4.0,4.0,2.0,4.0,55.0,42.4,59.4,3.0,2.0,4.648943,2.86272,3.83782,2.362754,4.824096,2.883555,7.938336,5.59528,5.473143,1.548328,0.610532,2.230198,5.765868,3.468719,1.86528,4.972034,1.201754,9.507165,3.455488,0.0,5.673689,5.616394,10.19935,4.570384,7.375259,2.553157,2.191334,3.677047,4.473263,5.950488,3.525138,0.0,6.884481,2.262771,2.635239,5.337497,0.459038,6.882023,0.0,3.810076,4.177385,8.677014,8.414383,2.293219,5.173482,10.378217,1.199374,5.194741,3.605842,3.352248,1.985008,1.653485,1.475087,3.530067,3.102172,0.35943,6.725377,0.689439,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,9.0,63.4,14.03559,48.0,46.0,22.0,75.0,70.0,122.0,5.4,8.4,19.2,3.0,0.0,21.82,1.8,24.44,2.0,5.0,0.0,11.0,1.0,11.0,1.0,3.0,0.0,2.0,2.57949,14.0371,936.656,1498.65,6.01993,42.0291,12.8254,1.21172,3.97085,1.0,21.0352,14.974,39.4497,15.4107,27.0552,1.582,2.34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,46.0,64.0,0.0,0.0,7.225027,4.654813,5.707771,3.315377,1.614492,1.516619,5.262636,6.734442,6.344801,0.856954,3.727236,4.799614,6.251505,2.295703,3.671392,5.998921,2.632931,5.187693,1.097767,1.884954,7.591034,4.958785,3.887423,4.308725,2.913355,2.421994,4.201717,3.334835,5.151681,4.350394,4.208288,4.45628,4.187182,4.235772,4.713233,1.201684,2.942663,6.699873,0.487005,1.546179,2.424647,3.735351,3.231082,8.39476,6.670735,5.268567,1.595979,4.478597,4.668807,3.618142,0.341602,0.548136,1.590361,1.198129,6.590679,3.144306,9.164507,4.239794,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
2,10.0,71.0,16.648696,56.5,75.6,25.0,65.0,94.0,117.0,5.0,7.0,33.0,20.0,1.0,10.2,1.0,14.7,2.0,7.0,1.0,10.0,1.0,10.0,1.0,5.0,0.0,3.0,3.916544,18.6346,1039.784,1784.09,14.3219,53.01382,14.11984,4.514742,16.42618,1.8,25.79618,12.89574,49.09728,23.3954,40.11808,1.614,2.17,5.0,2.0,2.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,2.0,2.0,1.0,1.0,28.0,38.0,54.0,2.0,0.0,9.040421,4.273831,3.260446,0.701797,2.822473,18.231993,2.431175,10.054292,10.812925,16.455309,8.117317,8.840761,8.781365,2.349888,0.043987,9.928959,0.868555,9.606325,9.479357,5.175053,9.032331,5.809124,2.831523,13.388857,13.861341,15.405759,2.705917,8.429331,3.052455,7.486068,0.423141,2.490485,9.126925,17.947185,2.075995,12.117871,4.833192,12.477597,4.944942,2.207031,3.467912,3.889378,3.533822,15.909766,0.583314,19.029536,2.370717,10.098508,3.357332,3.111625,9.681407,5.590549,4.631087,12.116008,7.719337,8.477834,7.622598,0.196086,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3,9.0,71.0,18.292347,56.0,81.6,28.2,60.0,97.0,117.0,6.0,9.0,37.0,18.0,1.0,17.64,1.6,19.22,1.8,5.0,0.0,7.0,0.0,7.0,0.0,7.0,1.0,3.0,3.84191,18.2943,1131.43,1923.44,15.5925,62.7757,14.074,4.22033,18.8243,2.0,30.4041,16.779,58.9338,26.4798,45.9966,1.796,2.451,4.0,2.0,4.0,0.0,5.0,1.0,0.0,3.0,2.0,2.0,3.0,0.0,3.0,0.0,0.0,3.0,4.0,3.0,4.0,1.0,44.0,31.0,45.0,0.0,1.0,0.0,3.35869,0.0,5.337131,1e-06,0.0,0.0,2.398546,4.813291,4.990461,1.405737,8.475198,0.0,0.0,6.87957,1.935769,8.775964,0.0,3.882035,6.520278,18.483984,7.693,0.673259,10.639287,0.0,5.649461,6.437056,8.081806,14.662292,6.477272,2.085419,17.302296,0.0,10.002832,7.267302,1.532843,5.770669,1.359145,0.0,0.0,0.0,0.0,4.486127,14.495865,4.542089,4.867955,0.0,0.0,3.569353,8.330843,0.0,0.0,2.021903,3.556332,6.78969,7.568647,0.0,4.528473,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,13.0,50.0,22.279952,59.5,112.2,29.6,60.0,73.0,102.0,4.2,5.6,35.6,12.0,0.0,16.5,2.0,17.9,2.0,6.0,0.0,10.0,1.0,11.0,1.0,8.0,0.0,2.0,4.33036,30.1865,1330.97,1996.45,30.2124,84.0285,16.6877,13.4988,67.9715,2.0,32.9141,20.902,79.6982,35.3804,63.1265,1.778,4.11,3.0,3.0,3.0,0.0,2.0,1.0,0.0,2.0,2.0,1.0,0.0,1.0,3.0,3.0,2.0,1.0,3.0,1.0,2.0,1.0,34.0,40.0,56.0,0.0,1.0,5.953392,0.0,7.549816,0.0,7.153245,7.322783,0.0,2.630853,16.680229,6.489077,3.026922,12.872902,0.0,6.549987,0.0,5.223313,0.0,6.083334,0.0,0.0,8.64847,11.530282,0.0,10.4684,13.598515,0.0,0.0,3.080362,3.346899,0.07572,0.893668,3.969584,0.0,15.206186,6.006083,3.238015,0.0,17.711552,0.0,0.0,0.0,9.211094,3.892674,5.166663,4.312238,19.678722,0.0,0.098976,8.960603,7.441569,8.930751,0.0,0.0,9.105422,3.539766,4.188061,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [52]:
concat_df = pd.concat([train_imputed[test.columns], test], ignore_index=True)
print(concat_df.shape)
concat_df.head()

(2756, 156)


Unnamed: 0,Basic_Demos-Age,CGAS-CGAS_Score,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Max_Stage,Fitness_Endurance-Time_Mins,Fitness_Endurance-Time_Sec,FGC-FGC_CU,FGC-FGC_CU_Zone,FGC-FGC_GSND,FGC-FGC_GSND_Zone,FGC-FGC_GSD,FGC-FGC_GSD_Zone,FGC-FGC_PU,FGC-FGC_PU_Zone,FGC-FGC_SRL,FGC-FGC_SRL_Zone,FGC-FGC_SRR,FGC-FGC_SRR_Zone,FGC-FGC_TL,FGC-FGC_TL_Zone,BIA-BIA_Activity_Level_num,BIA-BIA_BMC,BIA-BIA_BMI,BIA-BIA_BMR,BIA-BIA_DEE,BIA-BIA_ECW,BIA-BIA_FFM,BIA-BIA_FFMI,BIA-BIA_FMI,BIA-BIA_Fat,BIA-BIA_Frame_num,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,PAQ_A-PAQ_A_Total,PAQ_C-PAQ_C_Total,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-computerinternet_hoursday,Enc_1,Enc_2,Enc_3,Enc_4,Enc_5,Enc_6,Enc_7,Enc_8,Enc_9,Enc_10,Enc_11,Enc_12,Enc_13,Enc_14,Enc_15,Enc_16,Enc_17,Enc_18,Enc_19,Enc_20,Enc_21,Enc_22,Enc_23,Enc_25,Enc_26,Enc_27,Enc_28,Enc_29,Enc_31,Enc_32,Enc_34,Enc_35,Enc_36,Enc_37,Enc_38,Enc_39,Enc_40,Enc_41,Enc_42,Enc_43,Enc_45,Enc_47,Enc_48,Enc_49,Enc_50,Enc_51,Enc_52,Enc_53,Enc_54,Enc_55,Enc_56,Enc_58,Enc_59,Enc_60,Enc_61,Enc_62,Enc_63,Enc_64,PAQ_A-Season_Fall,PAQ_A-Season_Missing,PAQ_A-Season_Spring,PAQ_A-Season_Summer,PAQ_A-Season_Winter,CGAS-Season_Fall,CGAS-Season_Missing,CGAS-Season_Spring,CGAS-Season_Summer,CGAS-Season_Winter,Basic_Demos-Sex_0,Basic_Demos-Sex_1,PreInt_EduHx-Season_Fall,PreInt_EduHx-Season_Missing,PreInt_EduHx-Season_Spring,PreInt_EduHx-Season_Summer,PreInt_EduHx-Season_Winter,PAQ_C-Season_Fall,PAQ_C-Season_Missing,PAQ_C-Season_Spring,PAQ_C-Season_Summer,PAQ_C-Season_Winter,FGC-Season_Fall,FGC-Season_Missing,FGC-Season_Spring,FGC-Season_Summer,FGC-Season_Winter,Fitness_Endurance-Season_Fall,Fitness_Endurance-Season_Missing,Fitness_Endurance-Season_Spring,Fitness_Endurance-Season_Summer,Fitness_Endurance-Season_Winter,SDS-Season_Fall,SDS-Season_Missing,SDS-Season_Spring,SDS-Season_Summer,SDS-Season_Winter,BIA-Season_Fall,BIA-Season_Missing,BIA-Season_Spring,BIA-Season_Summer,BIA-Season_Winter,Basic_Demos-Enroll_Season_Fall,Basic_Demos-Enroll_Season_Spring,Basic_Demos-Enroll_Season_Summer,Basic_Demos-Enroll_Season_Winter,Physical-Season_Fall,Physical-Season_Missing,Physical-Season_Spring,Physical-Season_Summer,Physical-Season_Winter
0,5.0,51.0,16.877316,46.0,50.8,24.2,66.8,79.4,109.6,4.6,7.4,16.8,0.0,0.0,14.44,1.2,15.06,1.4,0.0,0.0,7.0,0.0,6.0,0.0,6.0,1.0,2.0,2.66855,16.8792,932.498,1492.0,8.25598,41.5862,13.8177,3.06143,9.21377,1.0,24.4349,8.89536,38.9177,19.5413,32.6909,1.912,2.17,42.4,59.4,3.0,4.648943,2.86272,3.83782,2.362754,4.824096,2.883555,7.938336,5.59528,5.473143,1.548328,0.610532,2.230198,5.765868,3.468719,1.86528,4.972034,1.201754,9.507165,3.455488,0.0,5.673689,5.616394,10.19935,4.570384,7.375259,2.553157,2.191334,3.677047,4.473263,5.950488,3.525138,0.0,6.884481,2.262771,2.635239,5.337497,0.459038,6.882023,0.0,3.810076,4.177385,8.677014,8.414383,2.293219,5.173482,10.378217,1.199374,5.194741,3.605842,3.352248,1.985008,1.653485,1.475087,3.530067,3.102172,0.35943,6.725377,0.689439,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,9.0,63.4,14.03559,48.0,46.0,22.0,75.0,70.0,122.0,5.4,8.4,19.2,3.0,0.0,21.82,1.8,24.44,2.0,5.0,0.0,11.0,1.0,11.0,1.0,3.0,0.0,2.0,2.57949,14.0371,936.656,1498.65,6.01993,42.0291,12.8254,1.21172,3.97085,1.0,21.0352,14.974,39.4497,15.4107,27.0552,1.582,2.34,46.0,64.0,0.0,7.225027,4.654813,5.707771,3.315377,1.614492,1.516619,5.262636,6.734442,6.344801,0.856954,3.727236,4.799614,6.251505,2.295703,3.671392,5.998921,2.632931,5.187693,1.097767,1.884954,7.591034,4.958785,3.887423,4.308725,2.913355,2.421994,4.201717,3.334835,5.151681,4.350394,4.208288,4.45628,4.187182,4.235772,4.713233,1.201684,2.942663,6.699873,0.487005,1.546179,2.424647,3.735351,3.231082,8.39476,6.670735,5.268567,1.595979,4.478597,4.668807,3.618142,0.341602,0.548136,1.590361,1.198129,6.590679,3.144306,9.164507,4.239794,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
2,10.0,71.0,16.648696,56.5,75.6,25.0,65.0,94.0,117.0,5.0,7.0,33.0,20.0,1.0,10.2,1.0,14.7,2.0,7.0,1.0,10.0,1.0,10.0,1.0,5.0,0.0,3.0,3.916544,18.6346,1039.784,1784.09,14.3219,53.01382,14.11984,4.514742,16.42618,1.8,25.79618,12.89574,49.09728,23.3954,40.11808,1.614,2.17,38.0,54.0,2.0,9.040421,4.273831,3.260446,0.701797,2.822473,18.231993,2.431175,10.054292,10.812925,16.455309,8.117317,8.840761,8.781365,2.349888,0.043987,9.928959,0.868555,9.606325,9.479357,5.175053,9.032331,5.809124,2.831523,13.388857,13.861341,15.405759,2.705917,8.429331,3.052455,7.486068,0.423141,2.490485,9.126925,17.947185,2.075995,12.117871,4.833192,12.477597,4.944942,2.207031,3.467912,3.889378,3.533822,15.909766,0.583314,19.029536,2.370717,10.098508,3.357332,3.111625,9.681407,5.590549,4.631087,12.116008,7.719337,8.477834,7.622598,0.196086,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3,9.0,71.0,18.292347,56.0,81.6,28.2,60.0,97.0,117.0,6.0,9.0,37.0,18.0,1.0,17.64,1.6,19.22,1.8,5.0,0.0,7.0,0.0,7.0,0.0,7.0,1.0,3.0,3.84191,18.2943,1131.43,1923.44,15.5925,62.7757,14.074,4.22033,18.8243,2.0,30.4041,16.779,58.9338,26.4798,45.9966,1.796,2.451,31.0,45.0,0.0,0.0,3.35869,0.0,5.337131,1e-06,0.0,0.0,2.398546,4.813291,4.990461,1.405737,8.475198,0.0,0.0,6.87957,1.935769,8.775964,0.0,3.882035,6.520278,18.483984,7.693,0.673259,10.639287,0.0,5.649461,6.437056,8.081806,14.662292,6.477272,2.085419,17.302296,0.0,10.002832,7.267302,1.532843,5.770669,1.359145,0.0,0.0,0.0,0.0,4.486127,14.495865,4.542089,4.867955,0.0,0.0,3.569353,8.330843,0.0,0.0,2.021903,3.556332,6.78969,7.568647,0.0,4.528473,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,13.0,50.0,22.279952,59.5,112.2,29.6,60.0,73.0,102.0,4.2,5.6,35.6,12.0,0.0,16.5,2.0,17.9,2.0,6.0,0.0,10.0,1.0,11.0,1.0,8.0,0.0,2.0,4.33036,30.1865,1330.97,1996.45,30.2124,84.0285,16.6877,13.4988,67.9715,2.0,32.9141,20.902,79.6982,35.3804,63.1265,1.778,4.11,40.0,56.0,0.0,5.953392,0.0,7.549816,0.0,7.153245,7.322783,0.0,2.630853,16.680229,6.489077,3.026922,12.872902,0.0,6.549987,0.0,5.223313,0.0,6.083334,0.0,0.0,8.64847,11.530282,0.0,10.4684,13.598515,0.0,0.0,3.080362,3.346899,0.07572,0.893668,3.969584,0.0,15.206186,6.006083,3.238015,0.0,17.711552,0.0,0.0,0.0,9.211094,3.892674,5.166663,4.312238,19.678722,0.0,0.098976,8.960603,7.441569,8.930751,0.0,0.0,9.105422,3.539766,4.188061,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [53]:
imputer = KNNImputer(n_neighbors=5)
imputed_data_concat = imputer.fit_transform(concat_df)

In [55]:
concat_imputed = pd.DataFrame(imputed_data_concat, columns=concat_df.columns)
print(concat_imputed.shape)
concat_imputed.head()

(2756, 156)


Unnamed: 0,Basic_Demos-Age,CGAS-CGAS_Score,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Max_Stage,Fitness_Endurance-Time_Mins,Fitness_Endurance-Time_Sec,FGC-FGC_CU,FGC-FGC_CU_Zone,FGC-FGC_GSND,FGC-FGC_GSND_Zone,FGC-FGC_GSD,FGC-FGC_GSD_Zone,FGC-FGC_PU,FGC-FGC_PU_Zone,FGC-FGC_SRL,FGC-FGC_SRL_Zone,FGC-FGC_SRR,FGC-FGC_SRR_Zone,FGC-FGC_TL,FGC-FGC_TL_Zone,BIA-BIA_Activity_Level_num,BIA-BIA_BMC,BIA-BIA_BMI,BIA-BIA_BMR,BIA-BIA_DEE,BIA-BIA_ECW,BIA-BIA_FFM,BIA-BIA_FFMI,BIA-BIA_FMI,BIA-BIA_Fat,BIA-BIA_Frame_num,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,PAQ_A-PAQ_A_Total,PAQ_C-PAQ_C_Total,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-computerinternet_hoursday,Enc_1,Enc_2,Enc_3,Enc_4,Enc_5,Enc_6,Enc_7,Enc_8,Enc_9,Enc_10,Enc_11,Enc_12,Enc_13,Enc_14,Enc_15,Enc_16,Enc_17,Enc_18,Enc_19,Enc_20,Enc_21,Enc_22,Enc_23,Enc_25,Enc_26,Enc_27,Enc_28,Enc_29,Enc_31,Enc_32,Enc_34,Enc_35,Enc_36,Enc_37,Enc_38,Enc_39,Enc_40,Enc_41,Enc_42,Enc_43,Enc_45,Enc_47,Enc_48,Enc_49,Enc_50,Enc_51,Enc_52,Enc_53,Enc_54,Enc_55,Enc_56,Enc_58,Enc_59,Enc_60,Enc_61,Enc_62,Enc_63,Enc_64,PAQ_A-Season_Fall,PAQ_A-Season_Missing,PAQ_A-Season_Spring,PAQ_A-Season_Summer,PAQ_A-Season_Winter,CGAS-Season_Fall,CGAS-Season_Missing,CGAS-Season_Spring,CGAS-Season_Summer,CGAS-Season_Winter,Basic_Demos-Sex_0,Basic_Demos-Sex_1,PreInt_EduHx-Season_Fall,PreInt_EduHx-Season_Missing,PreInt_EduHx-Season_Spring,PreInt_EduHx-Season_Summer,PreInt_EduHx-Season_Winter,PAQ_C-Season_Fall,PAQ_C-Season_Missing,PAQ_C-Season_Spring,PAQ_C-Season_Summer,PAQ_C-Season_Winter,FGC-Season_Fall,FGC-Season_Missing,FGC-Season_Spring,FGC-Season_Summer,FGC-Season_Winter,Fitness_Endurance-Season_Fall,Fitness_Endurance-Season_Missing,Fitness_Endurance-Season_Spring,Fitness_Endurance-Season_Summer,Fitness_Endurance-Season_Winter,SDS-Season_Fall,SDS-Season_Missing,SDS-Season_Spring,SDS-Season_Summer,SDS-Season_Winter,BIA-Season_Fall,BIA-Season_Missing,BIA-Season_Spring,BIA-Season_Summer,BIA-Season_Winter,Basic_Demos-Enroll_Season_Fall,Basic_Demos-Enroll_Season_Spring,Basic_Demos-Enroll_Season_Summer,Basic_Demos-Enroll_Season_Winter,Physical-Season_Fall,Physical-Season_Missing,Physical-Season_Spring,Physical-Season_Summer,Physical-Season_Winter
0,5.0,51.0,16.877316,46.0,50.8,24.2,66.8,79.4,109.6,4.6,7.4,16.8,0.0,0.0,14.44,1.2,15.06,1.4,0.0,0.0,7.0,0.0,6.0,0.0,6.0,1.0,2.0,2.66855,16.8792,932.498,1492.0,8.25598,41.5862,13.8177,3.06143,9.21377,1.0,24.4349,8.89536,38.9177,19.5413,32.6909,1.912,2.17,42.4,59.4,3.0,4.648943,2.86272,3.83782,2.362754,4.824096,2.883555,7.938336,5.59528,5.473143,1.548328,0.610532,2.230198,5.765868,3.468719,1.86528,4.972034,1.201754,9.507165,3.455488,0.0,5.673689,5.616394,10.19935,4.570384,7.375259,2.553157,2.191334,3.677047,4.473263,5.950488,3.525138,0.0,6.884481,2.262771,2.635239,5.337497,0.459038,6.882023,0.0,3.810076,4.177385,8.677014,8.414383,2.293219,5.173482,10.378217,1.199374,5.194741,3.605842,3.352248,1.985008,1.653485,1.475087,3.530067,3.102172,0.35943,6.725377,0.689439,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,9.0,63.4,14.03559,48.0,46.0,22.0,75.0,70.0,122.0,5.4,8.4,19.2,3.0,0.0,21.82,1.8,24.44,2.0,5.0,0.0,11.0,1.0,11.0,1.0,3.0,0.0,2.0,2.57949,14.0371,936.656,1498.65,6.01993,42.0291,12.8254,1.21172,3.97085,1.0,21.0352,14.974,39.4497,15.4107,27.0552,1.582,2.34,46.0,64.0,0.0,7.225027,4.654813,5.707771,3.315377,1.614492,1.516619,5.262636,6.734442,6.344801,0.856954,3.727236,4.799614,6.251505,2.295703,3.671392,5.998921,2.632931,5.187693,1.097767,1.884954,7.591034,4.958785,3.887423,4.308725,2.913355,2.421994,4.201717,3.334835,5.151681,4.350394,4.208288,4.45628,4.187182,4.235772,4.713233,1.201684,2.942663,6.699873,0.487005,1.546179,2.424647,3.735351,3.231082,8.39476,6.670735,5.268567,1.595979,4.478597,4.668807,3.618142,0.341602,0.548136,1.590361,1.198129,6.590679,3.144306,9.164507,4.239794,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
2,10.0,71.0,16.648696,56.5,75.6,25.0,65.0,94.0,117.0,5.0,7.0,33.0,20.0,1.0,10.2,1.0,14.7,2.0,7.0,1.0,10.0,1.0,10.0,1.0,5.0,0.0,3.0,3.916544,18.6346,1039.784,1784.09,14.3219,53.01382,14.11984,4.514742,16.42618,1.8,25.79618,12.89574,49.09728,23.3954,40.11808,1.614,2.17,38.0,54.0,2.0,9.040421,4.273831,3.260446,0.701797,2.822473,18.231993,2.431175,10.054292,10.812925,16.455309,8.117317,8.840761,8.781365,2.349888,0.043987,9.928959,0.868555,9.606325,9.479357,5.175053,9.032331,5.809124,2.831523,13.388857,13.861341,15.405759,2.705917,8.429331,3.052455,7.486068,0.423141,2.490485,9.126925,17.947185,2.075995,12.117871,4.833192,12.477597,4.944942,2.207031,3.467912,3.889378,3.533822,15.909766,0.583314,19.029536,2.370717,10.098508,3.357332,3.111625,9.681407,5.590549,4.631087,12.116008,7.719337,8.477834,7.622598,0.196086,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3,9.0,71.0,18.292347,56.0,81.6,28.2,60.0,97.0,117.0,6.0,9.0,37.0,18.0,1.0,17.64,1.6,19.22,1.8,5.0,0.0,7.0,0.0,7.0,0.0,7.0,1.0,3.0,3.84191,18.2943,1131.43,1923.44,15.5925,62.7757,14.074,4.22033,18.8243,2.0,30.4041,16.779,58.9338,26.4798,45.9966,1.796,2.451,31.0,45.0,0.0,0.0,3.35869,0.0,5.337131,1e-06,0.0,0.0,2.398546,4.813291,4.990461,1.405737,8.475198,0.0,0.0,6.87957,1.935769,8.775964,0.0,3.882035,6.520278,18.483984,7.693,0.673259,10.639287,0.0,5.649461,6.437056,8.081806,14.662292,6.477272,2.085419,17.302296,0.0,10.002832,7.267302,1.532843,5.770669,1.359145,0.0,0.0,0.0,0.0,4.486127,14.495865,4.542089,4.867955,0.0,0.0,3.569353,8.330843,0.0,0.0,2.021903,3.556332,6.78969,7.568647,0.0,4.528473,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,13.0,50.0,22.279952,59.5,112.2,29.6,60.0,73.0,102.0,4.2,5.6,35.6,12.0,0.0,16.5,2.0,17.9,2.0,6.0,0.0,10.0,1.0,11.0,1.0,8.0,0.0,2.0,4.33036,30.1865,1330.97,1996.45,30.2124,84.0285,16.6877,13.4988,67.9715,2.0,32.9141,20.902,79.6982,35.3804,63.1265,1.778,4.11,40.0,56.0,0.0,5.953392,0.0,7.549816,0.0,7.153245,7.322783,0.0,2.630853,16.680229,6.489077,3.026922,12.872902,0.0,6.549987,0.0,5.223313,0.0,6.083334,0.0,0.0,8.64847,11.530282,0.0,10.4684,13.598515,0.0,0.0,3.080362,3.346899,0.07572,0.893668,3.969584,0.0,15.206186,6.006083,3.238015,0.0,17.711552,0.0,0.0,0.0,9.211094,3.892674,5.166663,4.312238,19.678722,0.0,0.098976,8.960603,7.441569,8.930751,0.0,0.0,9.105422,3.539766,4.188061,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [57]:
concat_imputed.isna().sum().sum()

0

In [58]:
test_imputed = concat_imputed.tail(test.shape[0])
print(test_imputed.shape)
test_imputed.head()

(20, 156)


Unnamed: 0,Basic_Demos-Age,CGAS-CGAS_Score,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Max_Stage,Fitness_Endurance-Time_Mins,Fitness_Endurance-Time_Sec,FGC-FGC_CU,FGC-FGC_CU_Zone,FGC-FGC_GSND,FGC-FGC_GSND_Zone,FGC-FGC_GSD,FGC-FGC_GSD_Zone,FGC-FGC_PU,FGC-FGC_PU_Zone,FGC-FGC_SRL,FGC-FGC_SRL_Zone,FGC-FGC_SRR,FGC-FGC_SRR_Zone,FGC-FGC_TL,FGC-FGC_TL_Zone,BIA-BIA_Activity_Level_num,BIA-BIA_BMC,BIA-BIA_BMI,BIA-BIA_BMR,BIA-BIA_DEE,BIA-BIA_ECW,BIA-BIA_FFM,BIA-BIA_FFMI,BIA-BIA_FMI,BIA-BIA_Fat,BIA-BIA_Frame_num,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,PAQ_A-PAQ_A_Total,PAQ_C-PAQ_C_Total,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-computerinternet_hoursday,Enc_1,Enc_2,Enc_3,Enc_4,Enc_5,Enc_6,Enc_7,Enc_8,Enc_9,Enc_10,Enc_11,Enc_12,Enc_13,Enc_14,Enc_15,Enc_16,Enc_17,Enc_18,Enc_19,Enc_20,Enc_21,Enc_22,Enc_23,Enc_25,Enc_26,Enc_27,Enc_28,Enc_29,Enc_31,Enc_32,Enc_34,Enc_35,Enc_36,Enc_37,Enc_38,Enc_39,Enc_40,Enc_41,Enc_42,Enc_43,Enc_45,Enc_47,Enc_48,Enc_49,Enc_50,Enc_51,Enc_52,Enc_53,Enc_54,Enc_55,Enc_56,Enc_58,Enc_59,Enc_60,Enc_61,Enc_62,Enc_63,Enc_64,PAQ_A-Season_Fall,PAQ_A-Season_Missing,PAQ_A-Season_Spring,PAQ_A-Season_Summer,PAQ_A-Season_Winter,CGAS-Season_Fall,CGAS-Season_Missing,CGAS-Season_Spring,CGAS-Season_Summer,CGAS-Season_Winter,Basic_Demos-Sex_0,Basic_Demos-Sex_1,PreInt_EduHx-Season_Fall,PreInt_EduHx-Season_Missing,PreInt_EduHx-Season_Spring,PreInt_EduHx-Season_Summer,PreInt_EduHx-Season_Winter,PAQ_C-Season_Fall,PAQ_C-Season_Missing,PAQ_C-Season_Spring,PAQ_C-Season_Summer,PAQ_C-Season_Winter,FGC-Season_Fall,FGC-Season_Missing,FGC-Season_Spring,FGC-Season_Summer,FGC-Season_Winter,Fitness_Endurance-Season_Fall,Fitness_Endurance-Season_Missing,Fitness_Endurance-Season_Spring,Fitness_Endurance-Season_Summer,Fitness_Endurance-Season_Winter,SDS-Season_Fall,SDS-Season_Missing,SDS-Season_Spring,SDS-Season_Summer,SDS-Season_Winter,BIA-Season_Fall,BIA-Season_Missing,BIA-Season_Spring,BIA-Season_Summer,BIA-Season_Winter,Basic_Demos-Enroll_Season_Fall,Basic_Demos-Enroll_Season_Spring,Basic_Demos-Enroll_Season_Summer,Basic_Demos-Enroll_Season_Winter,Physical-Season_Fall,Physical-Season_Missing,Physical-Season_Spring,Physical-Season_Summer,Physical-Season_Winter
2736,5.0,51.0,16.877316,46.0,50.8,23.64,71.24,78.6,121.52,4.44,6.56,26.52,0.0,0.0,15.9,1.6,16.744,1.68,0.0,0.0,7.0,0.0,6.0,0.0,6.0,1.0,2.0,2.66855,16.8792,932.498,1492.0,8.25598,41.5862,13.8177,3.06143,9.21377,1.0,24.4349,8.89536,38.9177,19.5413,32.6909,1.7544,2.5612,39.88,56.28,3.0,7.433673,4.547935,6.288887,5.037417,2.898356,3.82815,5.771692,6.85616,5.406296,4.72754,3.279028,4.325552,9.149574,2.142903,4.578232,5.900305,1.797518,7.84212,5.396106,1.563774,7.779565,4.81584,4.735718,4.5757,5.736856,5.406878,6.138075,4.237088,4.756045,6.416023,4.073981,4.64158,7.86387,4.881336,3.803687,3.547558,4.11461,8.247804,1.942005,1.480973,3.739915,5.344204,3.765329,7.64449,7.382208,8.283773,3.099824,5.565895,3.379328,4.370356,1.01859,3.699757,1.805153,2.722966,5.361347,3.634816,11.460936,3.800648,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2737,9.0,58.48,14.03559,48.0,46.0,22.0,75.0,70.0,122.0,5.04,7.6,27.4,3.0,0.0,16.764,1.64,17.708,1.72,5.0,0.0,11.0,1.0,11.0,1.0,3.0,0.0,2.0,2.57949,14.0371,936.656,1498.65,6.01993,42.0291,12.8254,1.21172,3.97085,1.0,21.0352,14.974,39.4497,15.4107,27.0552,1.5472,2.34,46.0,64.0,0.0,4.968961,4.442031,6.939918,4.338598,2.770748,2.756649,5.029458,6.725339,3.878906,3.963945,4.852677,4.802514,8.71545,1.648544,5.074675,3.622922,0.950817,6.627649,7.750504,1.992468,12.508244,5.387814,5.382751,4.681806,4.711883,6.182888,7.273111,6.9442,6.026696,4.569112,3.632581,9.365758,6.649919,8.766185,2.062863,5.196027,6.82679,6.164728,0.35307,1.475859,2.387157,3.991797,5.783901,11.518128,7.902459,10.730605,4.169021,3.177545,2.816072,6.052052,1.950807,4.685744,3.175619,3.461603,5.113039,8.814765,9.743268,5.869051,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
2738,10.0,71.0,16.648696,56.5,75.6,26.12,65.0,94.0,117.0,5.0,7.0,33.0,20.0,1.0,10.2,1.0,14.7,2.0,7.0,1.0,10.0,1.0,10.0,1.0,5.0,0.0,2.44,4.860585,17.389876,1112.2964,1810.096,14.745111,60.737268,13.855436,3.534431,15.30273,1.76,29.241208,16.750964,55.876692,29.259392,43.986324,1.76592,2.17,38.0,54.0,2.0,6.697084,2.7503,4.340476,3.330298,2.134946,6.854732,3.169771,5.800215,7.265112,6.662075,5.83077,4.614444,5.238506,2.481096,4.056074,6.480171,3.634648,6.581656,4.398239,2.777793,9.25078,3.906131,6.944033,8.699897,6.888328,5.751594,5.100464,6.585642,4.758694,3.561332,2.25409,4.89937,5.421168,9.119006,3.917411,5.637057,4.331605,6.410985,2.7657,2.225433,2.602544,3.98549,7.796484,9.249817,3.162863,8.821357,3.627707,5.669035,2.711339,4.051499,4.201324,1.742874,4.660177,3.701572,4.156318,5.701365,3.919651,2.620818,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
2739,9.0,71.0,18.292347,56.0,81.6,27.28,60.0,97.0,117.0,6.0,9.0,37.0,18.0,1.0,16.628,1.6,17.872,1.84,5.0,0.0,7.0,0.0,7.0,0.0,7.0,1.0,3.0,3.84191,18.2943,1131.43,1923.44,15.5925,62.7757,14.074,4.22033,18.8243,2.0,30.4041,16.779,58.9338,26.4798,45.9966,1.87632,2.451,31.0,45.0,0.0,0.0,3.35869,0.0,5.337131,1e-06,0.0,0.0,2.398546,4.813291,4.990461,1.405737,8.475198,0.0,0.0,6.87957,1.935769,8.775964,0.0,3.882035,6.520278,18.483984,7.693,0.673259,10.639287,0.0,5.649461,6.437056,8.081806,14.662292,6.477272,2.085419,17.302296,0.0,10.002832,7.267302,1.532843,5.770669,1.359145,0.0,0.0,0.0,0.0,4.486127,14.495865,4.542089,4.867955,0.0,0.0,3.569353,8.330843,0.0,0.0,2.021903,3.556332,6.78969,7.568647,0.0,4.528473,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2740,18.0,64.6,25.077397,66.5,159.004,32.08,71.92,85.56,118.68,5.12,7.6,27.2,18.16,0.44,30.46,1.8,31.732,1.72,9.24,0.48,7.798,0.48,8.24,0.56,10.28,0.8,2.48,5.197497,24.912532,1478.2584,2347.2708,33.184904,99.716796,16.124564,8.78795,54.771184,2.16,40.078936,26.453056,94.519284,47.053444,73.26384,1.04,2.6676,41.32,57.96,2.2,9.400064,4.55134,5.336944,2.825522,3.31189,9.826693,3.195566,9.899724,8.433462,9.607602,4.862829,6.79766,7.719397,3.41614,3.62288,8.579611,1.973558,8.475323,5.34425,3.890325,6.591288,8.120471,2.979621,5.971413,10.577398,7.255359,4.757495,3.640961,4.056548,7.151607,3.163836,4.198445,7.948455,7.691755,4.961006,5.035649,4.755392,12.226996,4.273849,2.720008,3.383621,5.743136,5.131161,6.630667,3.42635,10.278753,2.4771,7.448494,3.342066,3.894833,2.800315,3.722895,2.237977,4.926546,6.155309,4.432342,9.462136,1.851145,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [59]:
train_imputed = feature_engineering(train_imputed)
test_imputed = feature_engineering(test_imputed)

In [60]:
if np.any(np.isinf(train_imputed)):
    print('inf in train data')
    train_imputed = train_imputed.replace([np.inf, -np.inf], np.nan)
if np.any(np.isinf(test_imputed)):
    print('inf in test data')
    test_imputed = test_imputed.replace([np.inf, -np.inf], np.nan)

inf in train data


In [61]:
imputer = KNNImputer(n_neighbors=5)
imputed_data = imputer.fit_transform(train_imputed)

In [62]:
train_imputed = pd.DataFrame(imputed_data, columns=train_imputed.columns)
print(train_imputed.shape)
train_imputed.head()

(2736, 197)


Unnamed: 0,Basic_Demos-Age,CGAS-CGAS_Score,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Max_Stage,Fitness_Endurance-Time_Mins,Fitness_Endurance-Time_Sec,FGC-FGC_CU,FGC-FGC_CU_Zone,FGC-FGC_GSND,FGC-FGC_GSND_Zone,FGC-FGC_GSD,FGC-FGC_GSD_Zone,FGC-FGC_PU,FGC-FGC_PU_Zone,FGC-FGC_SRL,FGC-FGC_SRL_Zone,FGC-FGC_SRR,FGC-FGC_SRR_Zone,FGC-FGC_TL,FGC-FGC_TL_Zone,BIA-BIA_Activity_Level_num,BIA-BIA_BMC,BIA-BIA_BMI,BIA-BIA_BMR,BIA-BIA_DEE,BIA-BIA_ECW,BIA-BIA_FFM,BIA-BIA_FFMI,BIA-BIA_FMI,BIA-BIA_Fat,BIA-BIA_Frame_num,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,PAQ_A-PAQ_A_Total,PAQ_C-PAQ_C_Total,PCIAT-PCIAT_01,PCIAT-PCIAT_02,PCIAT-PCIAT_03,PCIAT-PCIAT_04,PCIAT-PCIAT_05,PCIAT-PCIAT_06,PCIAT-PCIAT_07,PCIAT-PCIAT_08,PCIAT-PCIAT_09,PCIAT-PCIAT_10,PCIAT-PCIAT_11,PCIAT-PCIAT_12,PCIAT-PCIAT_13,PCIAT-PCIAT_14,PCIAT-PCIAT_15,PCIAT-PCIAT_16,PCIAT-PCIAT_17,PCIAT-PCIAT_18,PCIAT-PCIAT_19,PCIAT-PCIAT_20,PCIAT-PCIAT_Total,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-computerinternet_hoursday,sii,Enc_1,Enc_2,Enc_3,Enc_4,Enc_5,Enc_6,Enc_7,Enc_8,Enc_9,Enc_10,Enc_11,Enc_12,Enc_13,Enc_14,Enc_15,Enc_16,Enc_17,Enc_18,Enc_19,Enc_20,Enc_21,Enc_22,Enc_23,Enc_25,Enc_26,Enc_27,Enc_28,Enc_29,Enc_31,Enc_32,Enc_34,Enc_35,Enc_36,Enc_37,Enc_38,Enc_39,Enc_40,Enc_41,Enc_42,Enc_43,Enc_45,Enc_47,Enc_48,Enc_49,Enc_50,Enc_51,Enc_52,Enc_53,Enc_54,Enc_55,Enc_56,Enc_58,Enc_59,Enc_60,Enc_61,Enc_62,Enc_63,Enc_64,PAQ_A-Season_Fall,PAQ_A-Season_Missing,PAQ_A-Season_Spring,PAQ_A-Season_Summer,PAQ_A-Season_Winter,CGAS-Season_Fall,CGAS-Season_Missing,CGAS-Season_Spring,CGAS-Season_Summer,CGAS-Season_Winter,Basic_Demos-Sex_0,Basic_Demos-Sex_1,PCIAT-Season_Fall,PCIAT-Season_Spring,PCIAT-Season_Summer,PCIAT-Season_Winter,PreInt_EduHx-Season_Fall,PreInt_EduHx-Season_Missing,PreInt_EduHx-Season_Spring,PreInt_EduHx-Season_Summer,PreInt_EduHx-Season_Winter,PAQ_C-Season_Fall,PAQ_C-Season_Missing,PAQ_C-Season_Spring,PAQ_C-Season_Summer,PAQ_C-Season_Winter,FGC-Season_Fall,FGC-Season_Missing,FGC-Season_Spring,FGC-Season_Summer,FGC-Season_Winter,Fitness_Endurance-Season_Fall,Fitness_Endurance-Season_Missing,Fitness_Endurance-Season_Spring,Fitness_Endurance-Season_Summer,Fitness_Endurance-Season_Winter,SDS-Season_Fall,SDS-Season_Missing,SDS-Season_Spring,SDS-Season_Summer,SDS-Season_Winter,BIA-Season_Fall,BIA-Season_Missing,BIA-Season_Spring,BIA-Season_Summer,BIA-Season_Winter,Basic_Demos-Enroll_Season_Fall,Basic_Demos-Enroll_Season_Spring,Basic_Demos-Enroll_Season_Summer,Basic_Demos-Enroll_Season_Winter,Physical-Season_Fall,Physical-Season_Missing,Physical-Season_Spring,Physical-Season_Summer,Physical-Season_Winter,BMI_Age,Internet_Hours_Age,BMI_Internet_Hours,BFP_BMI,FFMI_BFP,FMI_BFP,LST_TBW,BFP_BMR,BFP_DEE,BMR_Weight,DEE_Weight,SMM_Height,Muscle_to_Fat,Hydration_Status,ICW_TBW
0,5.0,51.0,16.877316,46.0,50.8,24.2,66.8,79.4,109.6,4.6,7.4,16.8,0.0,0.0,14.44,1.2,15.06,1.4,0.0,0.0,7.0,0.0,6.0,0.0,6.0,1.0,2.0,2.66855,16.8792,932.498,1492.0,8.25598,41.5862,13.8177,3.06143,9.21377,1.0,24.4349,8.89536,38.9177,19.5413,32.6909,1.912,2.17,5.0,4.0,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,4.0,0.0,4.0,4.0,4.0,4.0,4.0,4.0,2.0,4.0,55.0,42.4,59.4,3.0,2.0,4.648943,2.86272,3.83782,2.362754,4.824096,2.883555,7.938336,5.59528,5.473143,1.548328,0.610532,2.230198,5.765868,3.468719,1.86528,4.972034,1.201754,9.507165,3.455488,0.0,5.673689,5.616394,10.19935,4.570384,7.375259,2.553157,2.191334,3.677047,4.473263,5.950488,3.525138,0.0,6.884481,2.262771,2.635239,5.337497,0.459038,6.882023,0.0,3.810076,4.177385,8.677014,8.414383,2.293219,5.173482,10.378217,1.199374,5.194741,3.605842,3.352248,1.985008,1.653485,1.475087,3.530067,3.102172,0.35943,6.725377,0.689439,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,84.386578,15.0,50.631947,0.545865,1.499679,0.332267,1.190475,8591.822097,13746.94484,18.35626,29.370079,0.424811,6.383063,0.643522,0.747453
1,9.0,63.4,14.03559,48.0,46.0,22.0,75.0,70.0,122.0,5.4,8.4,19.2,3.0,0.0,21.82,1.8,24.44,2.0,5.0,0.0,11.0,1.0,11.0,1.0,3.0,0.0,2.0,2.57949,14.0371,936.656,1498.65,6.01993,42.0291,12.8254,1.21172,3.97085,1.0,21.0352,14.974,39.4497,15.4107,27.0552,1.582,2.34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,46.0,64.0,0.0,0.0,7.225027,4.654813,5.707771,3.315377,1.614492,1.516619,5.262636,6.734442,6.344801,0.856954,3.727236,4.799614,6.251505,2.295703,3.671392,5.998921,2.632931,5.187693,1.097767,1.884954,7.591034,4.958785,3.887423,4.308725,2.913355,2.421994,4.201717,3.334835,5.151681,4.350394,4.208288,4.45628,4.187182,4.235772,4.713233,1.201684,2.942663,6.699873,0.487005,1.546179,2.424647,3.735351,3.231082,8.39476,6.670735,5.268567,1.595979,4.478597,4.668807,3.618142,0.341602,0.548136,1.590361,1.198129,6.590679,3.144306,9.164507,4.239794,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,126.320313,0.0,0.0,0.282883,3.229888,0.305154,1.458119,3719.320478,5950.914352,20.362087,32.579348,0.321056,12.718037,0.588157,0.777492
2,10.0,71.0,16.648696,56.5,75.6,25.0,65.0,94.0,117.0,5.0,7.0,33.0,20.0,1.0,10.2,1.0,14.7,2.0,7.0,1.0,10.0,1.0,10.0,1.0,5.0,0.0,3.0,3.916544,18.6346,1039.784,1784.09,14.3219,53.01382,14.11984,4.514742,16.42618,1.8,25.79618,12.89574,49.09728,23.3954,40.11808,1.614,2.17,5.0,2.0,2.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,2.0,2.0,1.0,1.0,28.0,38.0,54.0,2.0,0.0,9.040421,4.273831,3.260446,0.701797,2.822473,18.231993,2.431175,10.054292,10.812925,16.455309,8.117317,8.840761,8.781365,2.349888,0.043987,9.928959,0.868555,9.606325,9.479357,5.175053,9.032331,5.809124,2.831523,13.388857,13.861341,15.405759,2.705917,8.429331,3.052455,7.486068,0.423141,2.490485,9.126925,17.947185,2.075995,12.117871,4.833192,12.477597,4.944942,2.207031,3.467912,3.889378,3.533822,15.909766,0.583314,19.029536,2.370717,10.098508,3.357332,3.111625,9.681407,5.590549,4.631087,12.116008,7.719337,8.477834,7.622598,0.196086,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,166.486961,20.0,33.297392,0.881488,0.859594,0.27485,1.223819,17079.679145,29305.783476,13.753757,23.599074,0.414078,5.182002,0.530662,0.643006
3,9.0,71.0,18.292347,56.0,81.6,28.2,60.0,97.0,117.0,6.0,9.0,37.0,18.0,1.0,17.64,1.6,19.22,1.8,5.0,0.0,7.0,0.0,7.0,0.0,7.0,1.0,3.0,3.84191,18.2943,1131.43,1923.44,15.5925,62.7757,14.074,4.22033,18.8243,2.0,30.4041,16.779,58.9338,26.4798,45.9966,1.796,2.451,4.0,2.0,4.0,0.0,5.0,1.0,0.0,3.0,2.0,2.0,3.0,0.0,3.0,0.0,0.0,3.0,4.0,3.0,4.0,1.0,44.0,31.0,45.0,0.0,1.0,0.0,3.35869,0.0,5.337131,1e-06,0.0,0.0,2.398546,4.813291,4.990461,1.405737,8.475198,0.0,0.0,6.87957,1.935769,8.775964,0.0,3.882035,6.520278,18.483984,7.693,0.673259,10.639287,0.0,5.649461,6.437056,8.081806,14.662292,6.477272,2.085419,17.302296,0.0,10.002832,7.267302,1.532843,5.770669,1.359145,0.0,0.0,0.0,0.0,4.486127,14.495865,4.542089,4.867955,0.0,0.0,3.569353,8.330843,0.0,0.0,2.021903,3.556332,6.78969,7.568647,0.0,4.528473,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,164.631122,0.0,0.0,1.028971,0.747651,0.224196,1.281264,21298.377749,36207.411592,13.865564,23.571569,0.472854,6.274343,0.563684,0.661008
4,13.0,50.0,22.279952,59.5,112.2,29.6,60.0,73.0,102.0,4.2,5.6,35.6,12.0,0.0,16.5,2.0,17.9,2.0,6.0,0.0,10.0,1.0,11.0,1.0,8.0,0.0,2.0,4.33036,30.1865,1330.97,1996.45,30.2124,84.0285,16.6877,13.4988,67.9715,2.0,32.9141,20.902,79.6982,35.3804,63.1265,1.778,4.11,3.0,3.0,3.0,0.0,2.0,1.0,0.0,2.0,2.0,1.0,0.0,1.0,3.0,3.0,2.0,1.0,3.0,1.0,2.0,1.0,34.0,40.0,56.0,0.0,1.0,5.953392,0.0,7.549816,0.0,7.153245,7.322783,0.0,2.630853,16.680229,6.489077,3.026922,12.872902,0.0,6.549987,0.0,5.223313,0.0,6.083334,0.0,0.0,8.64847,11.530282,0.0,10.4684,13.598515,0.0,0.0,3.080362,3.346899,0.07572,0.893668,3.969584,0.0,15.206186,6.006083,3.238015,0.0,17.711552,0.0,0.0,0.0,9.211094,3.892674,5.166663,4.312238,19.678722,0.0,0.098976,8.960603,7.441569,8.930751,0.0,0.0,9.105422,3.539766,4.188061,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,289.639376,0.0,0.0,2.251718,0.24551,0.198595,1.262516,90468.027355,135701.701175,11.862478,17.793672,0.594629,2.621003,0.562625,0.521399


In [63]:
train_imputed.isna().sum().sum()

0

In [64]:
concat_df = pd.concat([train_imputed[test_imputed.columns], test_imputed], ignore_index=True)
print(concat_df.shape)
concat_df.head()

(2756, 171)


Unnamed: 0,Basic_Demos-Age,CGAS-CGAS_Score,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Max_Stage,Fitness_Endurance-Time_Mins,Fitness_Endurance-Time_Sec,FGC-FGC_CU,FGC-FGC_CU_Zone,FGC-FGC_GSND,FGC-FGC_GSND_Zone,FGC-FGC_GSD,FGC-FGC_GSD_Zone,FGC-FGC_PU,FGC-FGC_PU_Zone,FGC-FGC_SRL,FGC-FGC_SRL_Zone,FGC-FGC_SRR,FGC-FGC_SRR_Zone,FGC-FGC_TL,FGC-FGC_TL_Zone,BIA-BIA_Activity_Level_num,BIA-BIA_BMC,BIA-BIA_BMI,BIA-BIA_BMR,BIA-BIA_DEE,BIA-BIA_ECW,BIA-BIA_FFM,BIA-BIA_FFMI,BIA-BIA_FMI,BIA-BIA_Fat,BIA-BIA_Frame_num,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,PAQ_A-PAQ_A_Total,PAQ_C-PAQ_C_Total,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-computerinternet_hoursday,Enc_1,Enc_2,Enc_3,Enc_4,Enc_5,Enc_6,Enc_7,Enc_8,Enc_9,Enc_10,Enc_11,Enc_12,Enc_13,Enc_14,Enc_15,Enc_16,Enc_17,Enc_18,Enc_19,Enc_20,Enc_21,Enc_22,Enc_23,Enc_25,Enc_26,Enc_27,Enc_28,Enc_29,Enc_31,Enc_32,Enc_34,Enc_35,Enc_36,Enc_37,Enc_38,Enc_39,Enc_40,Enc_41,Enc_42,Enc_43,Enc_45,Enc_47,Enc_48,Enc_49,Enc_50,Enc_51,Enc_52,Enc_53,Enc_54,Enc_55,Enc_56,Enc_58,Enc_59,Enc_60,Enc_61,Enc_62,Enc_63,Enc_64,PAQ_A-Season_Fall,PAQ_A-Season_Missing,PAQ_A-Season_Spring,PAQ_A-Season_Summer,PAQ_A-Season_Winter,CGAS-Season_Fall,CGAS-Season_Missing,CGAS-Season_Spring,CGAS-Season_Summer,CGAS-Season_Winter,Basic_Demos-Sex_0,Basic_Demos-Sex_1,PreInt_EduHx-Season_Fall,PreInt_EduHx-Season_Missing,PreInt_EduHx-Season_Spring,PreInt_EduHx-Season_Summer,PreInt_EduHx-Season_Winter,PAQ_C-Season_Fall,PAQ_C-Season_Missing,PAQ_C-Season_Spring,PAQ_C-Season_Summer,PAQ_C-Season_Winter,FGC-Season_Fall,FGC-Season_Missing,FGC-Season_Spring,FGC-Season_Summer,FGC-Season_Winter,Fitness_Endurance-Season_Fall,Fitness_Endurance-Season_Missing,Fitness_Endurance-Season_Spring,Fitness_Endurance-Season_Summer,Fitness_Endurance-Season_Winter,SDS-Season_Fall,SDS-Season_Missing,SDS-Season_Spring,SDS-Season_Summer,SDS-Season_Winter,BIA-Season_Fall,BIA-Season_Missing,BIA-Season_Spring,BIA-Season_Summer,BIA-Season_Winter,Basic_Demos-Enroll_Season_Fall,Basic_Demos-Enroll_Season_Spring,Basic_Demos-Enroll_Season_Summer,Basic_Demos-Enroll_Season_Winter,Physical-Season_Fall,Physical-Season_Missing,Physical-Season_Spring,Physical-Season_Summer,Physical-Season_Winter,BMI_Age,Internet_Hours_Age,BMI_Internet_Hours,BFP_BMI,FFMI_BFP,FMI_BFP,LST_TBW,BFP_BMR,BFP_DEE,BMR_Weight,DEE_Weight,SMM_Height,Muscle_to_Fat,Hydration_Status,ICW_TBW
0,5.0,51.0,16.877316,46.0,50.8,24.2,66.8,79.4,109.6,4.6,7.4,16.8,0.0,0.0,14.44,1.2,15.06,1.4,0.0,0.0,7.0,0.0,6.0,0.0,6.0,1.0,2.0,2.66855,16.8792,932.498,1492.0,8.25598,41.5862,13.8177,3.06143,9.21377,1.0,24.4349,8.89536,38.9177,19.5413,32.6909,1.912,2.17,42.4,59.4,3.0,4.648943,2.86272,3.83782,2.362754,4.824096,2.883555,7.938336,5.59528,5.473143,1.548328,0.610532,2.230198,5.765868,3.468719,1.86528,4.972034,1.201754,9.507165,3.455488,0.0,5.673689,5.616394,10.19935,4.570384,7.375259,2.553157,2.191334,3.677047,4.473263,5.950488,3.525138,0.0,6.884481,2.262771,2.635239,5.337497,0.459038,6.882023,0.0,3.810076,4.177385,8.677014,8.414383,2.293219,5.173482,10.378217,1.199374,5.194741,3.605842,3.352248,1.985008,1.653485,1.475087,3.530067,3.102172,0.35943,6.725377,0.689439,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,84.386578,15.0,50.631947,0.545865,1.499679,0.332267,1.190475,8591.822097,13746.94484,18.35626,29.370079,0.424811,6.383063,0.643522,0.747453
1,9.0,63.4,14.03559,48.0,46.0,22.0,75.0,70.0,122.0,5.4,8.4,19.2,3.0,0.0,21.82,1.8,24.44,2.0,5.0,0.0,11.0,1.0,11.0,1.0,3.0,0.0,2.0,2.57949,14.0371,936.656,1498.65,6.01993,42.0291,12.8254,1.21172,3.97085,1.0,21.0352,14.974,39.4497,15.4107,27.0552,1.582,2.34,46.0,64.0,0.0,7.225027,4.654813,5.707771,3.315377,1.614492,1.516619,5.262636,6.734442,6.344801,0.856954,3.727236,4.799614,6.251505,2.295703,3.671392,5.998921,2.632931,5.187693,1.097767,1.884954,7.591034,4.958785,3.887423,4.308725,2.913355,2.421994,4.201717,3.334835,5.151681,4.350394,4.208288,4.45628,4.187182,4.235772,4.713233,1.201684,2.942663,6.699873,0.487005,1.546179,2.424647,3.735351,3.231082,8.39476,6.670735,5.268567,1.595979,4.478597,4.668807,3.618142,0.341602,0.548136,1.590361,1.198129,6.590679,3.144306,9.164507,4.239794,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,126.320313,0.0,0.0,0.282883,3.229888,0.305154,1.458119,3719.320478,5950.914352,20.362087,32.579348,0.321056,12.718037,0.588157,0.777492
2,10.0,71.0,16.648696,56.5,75.6,25.0,65.0,94.0,117.0,5.0,7.0,33.0,20.0,1.0,10.2,1.0,14.7,2.0,7.0,1.0,10.0,1.0,10.0,1.0,5.0,0.0,3.0,3.916544,18.6346,1039.784,1784.09,14.3219,53.01382,14.11984,4.514742,16.42618,1.8,25.79618,12.89574,49.09728,23.3954,40.11808,1.614,2.17,38.0,54.0,2.0,9.040421,4.273831,3.260446,0.701797,2.822473,18.231993,2.431175,10.054292,10.812925,16.455309,8.117317,8.840761,8.781365,2.349888,0.043987,9.928959,0.868555,9.606325,9.479357,5.175053,9.032331,5.809124,2.831523,13.388857,13.861341,15.405759,2.705917,8.429331,3.052455,7.486068,0.423141,2.490485,9.126925,17.947185,2.075995,12.117871,4.833192,12.477597,4.944942,2.207031,3.467912,3.889378,3.533822,15.909766,0.583314,19.029536,2.370717,10.098508,3.357332,3.111625,9.681407,5.590549,4.631087,12.116008,7.719337,8.477834,7.622598,0.196086,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,166.486961,20.0,33.297392,0.881488,0.859594,0.27485,1.223819,17079.679145,29305.783476,13.753757,23.599074,0.414078,5.182002,0.530662,0.643006
3,9.0,71.0,18.292347,56.0,81.6,28.2,60.0,97.0,117.0,6.0,9.0,37.0,18.0,1.0,17.64,1.6,19.22,1.8,5.0,0.0,7.0,0.0,7.0,0.0,7.0,1.0,3.0,3.84191,18.2943,1131.43,1923.44,15.5925,62.7757,14.074,4.22033,18.8243,2.0,30.4041,16.779,58.9338,26.4798,45.9966,1.796,2.451,31.0,45.0,0.0,0.0,3.35869,0.0,5.337131,1e-06,0.0,0.0,2.398546,4.813291,4.990461,1.405737,8.475198,0.0,0.0,6.87957,1.935769,8.775964,0.0,3.882035,6.520278,18.483984,7.693,0.673259,10.639287,0.0,5.649461,6.437056,8.081806,14.662292,6.477272,2.085419,17.302296,0.0,10.002832,7.267302,1.532843,5.770669,1.359145,0.0,0.0,0.0,0.0,4.486127,14.495865,4.542089,4.867955,0.0,0.0,3.569353,8.330843,0.0,0.0,2.021903,3.556332,6.78969,7.568647,0.0,4.528473,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,164.631122,0.0,0.0,1.028971,0.747651,0.224196,1.281264,21298.377749,36207.411592,13.865564,23.571569,0.472854,6.274343,0.563684,0.661008
4,13.0,50.0,22.279952,59.5,112.2,29.6,60.0,73.0,102.0,4.2,5.6,35.6,12.0,0.0,16.5,2.0,17.9,2.0,6.0,0.0,10.0,1.0,11.0,1.0,8.0,0.0,2.0,4.33036,30.1865,1330.97,1996.45,30.2124,84.0285,16.6877,13.4988,67.9715,2.0,32.9141,20.902,79.6982,35.3804,63.1265,1.778,4.11,40.0,56.0,0.0,5.953392,0.0,7.549816,0.0,7.153245,7.322783,0.0,2.630853,16.680229,6.489077,3.026922,12.872902,0.0,6.549987,0.0,5.223313,0.0,6.083334,0.0,0.0,8.64847,11.530282,0.0,10.4684,13.598515,0.0,0.0,3.080362,3.346899,0.07572,0.893668,3.969584,0.0,15.206186,6.006083,3.238015,0.0,17.711552,0.0,0.0,0.0,9.211094,3.892674,5.166663,4.312238,19.678722,0.0,0.098976,8.960603,7.441569,8.930751,0.0,0.0,9.105422,3.539766,4.188061,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,289.639376,0.0,0.0,2.251718,0.24551,0.198595,1.262516,90468.027355,135701.701175,11.862478,17.793672,0.594629,2.621003,0.562625,0.521399


In [66]:
imputer = KNNImputer(n_neighbors=5)
imputed_data_concat = imputer.fit_transform(concat_df)

In [67]:
concat_imputed = pd.DataFrame(imputed_data_concat, columns=concat_df.columns)
print(concat_imputed.shape)
concat_imputed.head()

(2756, 171)


Unnamed: 0,Basic_Demos-Age,CGAS-CGAS_Score,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Max_Stage,Fitness_Endurance-Time_Mins,Fitness_Endurance-Time_Sec,FGC-FGC_CU,FGC-FGC_CU_Zone,FGC-FGC_GSND,FGC-FGC_GSND_Zone,FGC-FGC_GSD,FGC-FGC_GSD_Zone,FGC-FGC_PU,FGC-FGC_PU_Zone,FGC-FGC_SRL,FGC-FGC_SRL_Zone,FGC-FGC_SRR,FGC-FGC_SRR_Zone,FGC-FGC_TL,FGC-FGC_TL_Zone,BIA-BIA_Activity_Level_num,BIA-BIA_BMC,BIA-BIA_BMI,BIA-BIA_BMR,BIA-BIA_DEE,BIA-BIA_ECW,BIA-BIA_FFM,BIA-BIA_FFMI,BIA-BIA_FMI,BIA-BIA_Fat,BIA-BIA_Frame_num,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,PAQ_A-PAQ_A_Total,PAQ_C-PAQ_C_Total,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-computerinternet_hoursday,Enc_1,Enc_2,Enc_3,Enc_4,Enc_5,Enc_6,Enc_7,Enc_8,Enc_9,Enc_10,Enc_11,Enc_12,Enc_13,Enc_14,Enc_15,Enc_16,Enc_17,Enc_18,Enc_19,Enc_20,Enc_21,Enc_22,Enc_23,Enc_25,Enc_26,Enc_27,Enc_28,Enc_29,Enc_31,Enc_32,Enc_34,Enc_35,Enc_36,Enc_37,Enc_38,Enc_39,Enc_40,Enc_41,Enc_42,Enc_43,Enc_45,Enc_47,Enc_48,Enc_49,Enc_50,Enc_51,Enc_52,Enc_53,Enc_54,Enc_55,Enc_56,Enc_58,Enc_59,Enc_60,Enc_61,Enc_62,Enc_63,Enc_64,PAQ_A-Season_Fall,PAQ_A-Season_Missing,PAQ_A-Season_Spring,PAQ_A-Season_Summer,PAQ_A-Season_Winter,CGAS-Season_Fall,CGAS-Season_Missing,CGAS-Season_Spring,CGAS-Season_Summer,CGAS-Season_Winter,Basic_Demos-Sex_0,Basic_Demos-Sex_1,PreInt_EduHx-Season_Fall,PreInt_EduHx-Season_Missing,PreInt_EduHx-Season_Spring,PreInt_EduHx-Season_Summer,PreInt_EduHx-Season_Winter,PAQ_C-Season_Fall,PAQ_C-Season_Missing,PAQ_C-Season_Spring,PAQ_C-Season_Summer,PAQ_C-Season_Winter,FGC-Season_Fall,FGC-Season_Missing,FGC-Season_Spring,FGC-Season_Summer,FGC-Season_Winter,Fitness_Endurance-Season_Fall,Fitness_Endurance-Season_Missing,Fitness_Endurance-Season_Spring,Fitness_Endurance-Season_Summer,Fitness_Endurance-Season_Winter,SDS-Season_Fall,SDS-Season_Missing,SDS-Season_Spring,SDS-Season_Summer,SDS-Season_Winter,BIA-Season_Fall,BIA-Season_Missing,BIA-Season_Spring,BIA-Season_Summer,BIA-Season_Winter,Basic_Demos-Enroll_Season_Fall,Basic_Demos-Enroll_Season_Spring,Basic_Demos-Enroll_Season_Summer,Basic_Demos-Enroll_Season_Winter,Physical-Season_Fall,Physical-Season_Missing,Physical-Season_Spring,Physical-Season_Summer,Physical-Season_Winter,BMI_Age,Internet_Hours_Age,BMI_Internet_Hours,BFP_BMI,FFMI_BFP,FMI_BFP,LST_TBW,BFP_BMR,BFP_DEE,BMR_Weight,DEE_Weight,SMM_Height,Muscle_to_Fat,Hydration_Status,ICW_TBW
0,5.0,51.0,16.877316,46.0,50.8,24.2,66.8,79.4,109.6,4.6,7.4,16.8,0.0,0.0,14.44,1.2,15.06,1.4,0.0,0.0,7.0,0.0,6.0,0.0,6.0,1.0,2.0,2.66855,16.8792,932.498,1492.0,8.25598,41.5862,13.8177,3.06143,9.21377,1.0,24.4349,8.89536,38.9177,19.5413,32.6909,1.912,2.17,42.4,59.4,3.0,4.648943,2.86272,3.83782,2.362754,4.824096,2.883555,7.938336,5.59528,5.473143,1.548328,0.610532,2.230198,5.765868,3.468719,1.86528,4.972034,1.201754,9.507165,3.455488,0.0,5.673689,5.616394,10.19935,4.570384,7.375259,2.553157,2.191334,3.677047,4.473263,5.950488,3.525138,0.0,6.884481,2.262771,2.635239,5.337497,0.459038,6.882023,0.0,3.810076,4.177385,8.677014,8.414383,2.293219,5.173482,10.378217,1.199374,5.194741,3.605842,3.352248,1.985008,1.653485,1.475087,3.530067,3.102172,0.35943,6.725377,0.689439,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,84.386578,15.0,50.631947,0.545865,1.499679,0.332267,1.190475,8591.822097,13746.94484,18.35626,29.370079,0.424811,6.383063,0.643522,0.747453
1,9.0,63.4,14.03559,48.0,46.0,22.0,75.0,70.0,122.0,5.4,8.4,19.2,3.0,0.0,21.82,1.8,24.44,2.0,5.0,0.0,11.0,1.0,11.0,1.0,3.0,0.0,2.0,2.57949,14.0371,936.656,1498.65,6.01993,42.0291,12.8254,1.21172,3.97085,1.0,21.0352,14.974,39.4497,15.4107,27.0552,1.582,2.34,46.0,64.0,0.0,7.225027,4.654813,5.707771,3.315377,1.614492,1.516619,5.262636,6.734442,6.344801,0.856954,3.727236,4.799614,6.251505,2.295703,3.671392,5.998921,2.632931,5.187693,1.097767,1.884954,7.591034,4.958785,3.887423,4.308725,2.913355,2.421994,4.201717,3.334835,5.151681,4.350394,4.208288,4.45628,4.187182,4.235772,4.713233,1.201684,2.942663,6.699873,0.487005,1.546179,2.424647,3.735351,3.231082,8.39476,6.670735,5.268567,1.595979,4.478597,4.668807,3.618142,0.341602,0.548136,1.590361,1.198129,6.590679,3.144306,9.164507,4.239794,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,126.320313,0.0,0.0,0.282883,3.229888,0.305154,1.458119,3719.320478,5950.914352,20.362087,32.579348,0.321056,12.718037,0.588157,0.777492
2,10.0,71.0,16.648696,56.5,75.6,25.0,65.0,94.0,117.0,5.0,7.0,33.0,20.0,1.0,10.2,1.0,14.7,2.0,7.0,1.0,10.0,1.0,10.0,1.0,5.0,0.0,3.0,3.916544,18.6346,1039.784,1784.09,14.3219,53.01382,14.11984,4.514742,16.42618,1.8,25.79618,12.89574,49.09728,23.3954,40.11808,1.614,2.17,38.0,54.0,2.0,9.040421,4.273831,3.260446,0.701797,2.822473,18.231993,2.431175,10.054292,10.812925,16.455309,8.117317,8.840761,8.781365,2.349888,0.043987,9.928959,0.868555,9.606325,9.479357,5.175053,9.032331,5.809124,2.831523,13.388857,13.861341,15.405759,2.705917,8.429331,3.052455,7.486068,0.423141,2.490485,9.126925,17.947185,2.075995,12.117871,4.833192,12.477597,4.944942,2.207031,3.467912,3.889378,3.533822,15.909766,0.583314,19.029536,2.370717,10.098508,3.357332,3.111625,9.681407,5.590549,4.631087,12.116008,7.719337,8.477834,7.622598,0.196086,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,166.486961,20.0,33.297392,0.881488,0.859594,0.27485,1.223819,17079.679145,29305.783476,13.753757,23.599074,0.414078,5.182002,0.530662,0.643006
3,9.0,71.0,18.292347,56.0,81.6,28.2,60.0,97.0,117.0,6.0,9.0,37.0,18.0,1.0,17.64,1.6,19.22,1.8,5.0,0.0,7.0,0.0,7.0,0.0,7.0,1.0,3.0,3.84191,18.2943,1131.43,1923.44,15.5925,62.7757,14.074,4.22033,18.8243,2.0,30.4041,16.779,58.9338,26.4798,45.9966,1.796,2.451,31.0,45.0,0.0,0.0,3.35869,0.0,5.337131,1e-06,0.0,0.0,2.398546,4.813291,4.990461,1.405737,8.475198,0.0,0.0,6.87957,1.935769,8.775964,0.0,3.882035,6.520278,18.483984,7.693,0.673259,10.639287,0.0,5.649461,6.437056,8.081806,14.662292,6.477272,2.085419,17.302296,0.0,10.002832,7.267302,1.532843,5.770669,1.359145,0.0,0.0,0.0,0.0,4.486127,14.495865,4.542089,4.867955,0.0,0.0,3.569353,8.330843,0.0,0.0,2.021903,3.556332,6.78969,7.568647,0.0,4.528473,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,164.631122,0.0,0.0,1.028971,0.747651,0.224196,1.281264,21298.377749,36207.411592,13.865564,23.571569,0.472854,6.274343,0.563684,0.661008
4,13.0,50.0,22.279952,59.5,112.2,29.6,60.0,73.0,102.0,4.2,5.6,35.6,12.0,0.0,16.5,2.0,17.9,2.0,6.0,0.0,10.0,1.0,11.0,1.0,8.0,0.0,2.0,4.33036,30.1865,1330.97,1996.45,30.2124,84.0285,16.6877,13.4988,67.9715,2.0,32.9141,20.902,79.6982,35.3804,63.1265,1.778,4.11,40.0,56.0,0.0,5.953392,0.0,7.549816,0.0,7.153245,7.322783,0.0,2.630853,16.680229,6.489077,3.026922,12.872902,0.0,6.549987,0.0,5.223313,0.0,6.083334,0.0,0.0,8.64847,11.530282,0.0,10.4684,13.598515,0.0,0.0,3.080362,3.346899,0.07572,0.893668,3.969584,0.0,15.206186,6.006083,3.238015,0.0,17.711552,0.0,0.0,0.0,9.211094,3.892674,5.166663,4.312238,19.678722,0.0,0.098976,8.960603,7.441569,8.930751,0.0,0.0,9.105422,3.539766,4.188061,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,289.639376,0.0,0.0,2.251718,0.24551,0.198595,1.262516,90468.027355,135701.701175,11.862478,17.793672,0.594629,2.621003,0.562625,0.521399


In [68]:
concat_imputed.isna().sum().sum()

0

In [69]:
test_imputed = concat_imputed.tail(test_imputed.shape[0])
print(test_imputed.shape)
test_imputed.head()

(20, 171)


Unnamed: 0,Basic_Demos-Age,CGAS-CGAS_Score,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Max_Stage,Fitness_Endurance-Time_Mins,Fitness_Endurance-Time_Sec,FGC-FGC_CU,FGC-FGC_CU_Zone,FGC-FGC_GSND,FGC-FGC_GSND_Zone,FGC-FGC_GSD,FGC-FGC_GSD_Zone,FGC-FGC_PU,FGC-FGC_PU_Zone,FGC-FGC_SRL,FGC-FGC_SRL_Zone,FGC-FGC_SRR,FGC-FGC_SRR_Zone,FGC-FGC_TL,FGC-FGC_TL_Zone,BIA-BIA_Activity_Level_num,BIA-BIA_BMC,BIA-BIA_BMI,BIA-BIA_BMR,BIA-BIA_DEE,BIA-BIA_ECW,BIA-BIA_FFM,BIA-BIA_FFMI,BIA-BIA_FMI,BIA-BIA_Fat,BIA-BIA_Frame_num,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,PAQ_A-PAQ_A_Total,PAQ_C-PAQ_C_Total,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-computerinternet_hoursday,Enc_1,Enc_2,Enc_3,Enc_4,Enc_5,Enc_6,Enc_7,Enc_8,Enc_9,Enc_10,Enc_11,Enc_12,Enc_13,Enc_14,Enc_15,Enc_16,Enc_17,Enc_18,Enc_19,Enc_20,Enc_21,Enc_22,Enc_23,Enc_25,Enc_26,Enc_27,Enc_28,Enc_29,Enc_31,Enc_32,Enc_34,Enc_35,Enc_36,Enc_37,Enc_38,Enc_39,Enc_40,Enc_41,Enc_42,Enc_43,Enc_45,Enc_47,Enc_48,Enc_49,Enc_50,Enc_51,Enc_52,Enc_53,Enc_54,Enc_55,Enc_56,Enc_58,Enc_59,Enc_60,Enc_61,Enc_62,Enc_63,Enc_64,PAQ_A-Season_Fall,PAQ_A-Season_Missing,PAQ_A-Season_Spring,PAQ_A-Season_Summer,PAQ_A-Season_Winter,CGAS-Season_Fall,CGAS-Season_Missing,CGAS-Season_Spring,CGAS-Season_Summer,CGAS-Season_Winter,Basic_Demos-Sex_0,Basic_Demos-Sex_1,PreInt_EduHx-Season_Fall,PreInt_EduHx-Season_Missing,PreInt_EduHx-Season_Spring,PreInt_EduHx-Season_Summer,PreInt_EduHx-Season_Winter,PAQ_C-Season_Fall,PAQ_C-Season_Missing,PAQ_C-Season_Spring,PAQ_C-Season_Summer,PAQ_C-Season_Winter,FGC-Season_Fall,FGC-Season_Missing,FGC-Season_Spring,FGC-Season_Summer,FGC-Season_Winter,Fitness_Endurance-Season_Fall,Fitness_Endurance-Season_Missing,Fitness_Endurance-Season_Spring,Fitness_Endurance-Season_Summer,Fitness_Endurance-Season_Winter,SDS-Season_Fall,SDS-Season_Missing,SDS-Season_Spring,SDS-Season_Summer,SDS-Season_Winter,BIA-Season_Fall,BIA-Season_Missing,BIA-Season_Spring,BIA-Season_Summer,BIA-Season_Winter,Basic_Demos-Enroll_Season_Fall,Basic_Demos-Enroll_Season_Spring,Basic_Demos-Enroll_Season_Summer,Basic_Demos-Enroll_Season_Winter,Physical-Season_Fall,Physical-Season_Missing,Physical-Season_Spring,Physical-Season_Summer,Physical-Season_Winter,BMI_Age,Internet_Hours_Age,BMI_Internet_Hours,BFP_BMI,FFMI_BFP,FMI_BFP,LST_TBW,BFP_BMR,BFP_DEE,BMR_Weight,DEE_Weight,SMM_Height,Muscle_to_Fat,Hydration_Status,ICW_TBW
2736,5.0,51.0,16.877316,46.0,50.8,23.64,71.24,78.6,121.52,4.44,6.56,26.52,0.0,0.0,15.9,1.6,16.744,1.68,0.0,0.0,7.0,0.0,6.0,0.0,6.0,1.0,2.0,2.66855,16.8792,932.498,1492.0,8.25598,41.5862,13.8177,3.06143,9.21377,1.0,24.4349,8.89536,38.9177,19.5413,32.6909,1.7544,2.5612,39.88,56.28,3.0,7.433673,4.547935,6.288887,5.037417,2.898356,3.82815,5.771692,6.85616,5.406296,4.72754,3.279028,4.325552,9.149574,2.142903,4.578232,5.900305,1.797518,7.84212,5.396106,1.563774,7.779565,4.81584,4.735718,4.5757,5.736856,5.406878,6.138075,4.237088,4.756045,6.416023,4.073981,4.64158,7.86387,4.881336,3.803687,3.547558,4.11461,8.247804,1.942005,1.480973,3.739915,5.344204,3.765329,7.64449,7.382208,8.283773,3.099824,5.565895,3.379328,4.370356,1.01859,3.699757,1.805153,2.722966,5.361347,3.634816,11.460936,3.800648,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,84.386578,15.0,50.631947,0.545865,1.499679,0.332267,1.190475,8591.822097,13746.94484,18.35626,29.370079,0.424811,6.383063,0.643522,0.747453
2737,9.0,58.48,14.03559,48.0,46.0,22.0,75.0,70.0,122.0,5.04,7.6,27.4,3.0,0.0,16.764,1.64,17.708,1.72,5.0,0.0,11.0,1.0,11.0,1.0,3.0,0.0,2.0,2.57949,14.0371,936.656,1498.65,6.01993,42.0291,12.8254,1.21172,3.97085,1.0,21.0352,14.974,39.4497,15.4107,27.0552,1.5472,2.34,46.0,64.0,0.0,4.968961,4.442031,6.939918,4.338598,2.770748,2.756649,5.029458,6.725339,3.878906,3.963945,4.852677,4.802514,8.71545,1.648544,5.074675,3.622922,0.950817,6.627649,7.750504,1.992468,12.508244,5.387814,5.382751,4.681806,4.711883,6.182888,7.273111,6.9442,6.026696,4.569112,3.632581,9.365758,6.649919,8.766185,2.062863,5.196027,6.82679,6.164728,0.35307,1.475859,2.387157,3.991797,5.783901,11.518128,7.902459,10.730605,4.169021,3.177545,2.816072,6.052052,1.950807,4.685744,3.175619,3.461603,5.113039,8.814765,9.743268,5.869051,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,126.320313,0.0,0.0,0.282883,3.229888,0.305154,1.458119,3719.320478,5950.914352,20.362087,32.579348,0.321056,12.718037,0.588157,0.777492
2738,10.0,71.0,16.648696,56.5,75.6,26.12,65.0,94.0,117.0,5.0,7.0,33.0,20.0,1.0,10.2,1.0,14.7,2.0,7.0,1.0,10.0,1.0,10.0,1.0,5.0,0.0,2.44,4.860585,17.389876,1112.2964,1810.096,14.745111,60.737268,13.855436,3.534431,15.30273,1.76,29.241208,16.750964,55.876692,29.259392,43.986324,1.76592,2.17,38.0,54.0,2.0,6.697084,2.7503,4.340476,3.330298,2.134946,6.854732,3.169771,5.800215,7.265112,6.662075,5.83077,4.614444,5.238506,2.481096,4.056074,6.480171,3.634648,6.581656,4.398239,2.777793,9.25078,3.906131,6.944033,8.699897,6.888328,5.751594,5.100464,6.585642,4.758694,3.561332,2.25409,4.89937,5.421168,9.119006,3.917411,5.637057,4.331605,6.410985,2.7657,2.225433,2.602544,3.98549,7.796484,9.249817,3.162863,8.821357,3.627707,5.669035,2.711339,4.051499,4.201324,1.742874,4.660177,3.701572,4.156318,5.701365,3.919651,2.620818,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,166.486961,20.0,33.297392,0.879979,0.905423,0.230967,1.27032,17021.171044,27699.409638,14.712915,23.943069,0.517865,8.278389,0.58183,0.66478
2739,9.0,71.0,18.292347,56.0,81.6,27.28,60.0,97.0,117.0,6.0,9.0,37.0,18.0,1.0,16.628,1.6,17.872,1.84,5.0,0.0,7.0,0.0,7.0,0.0,7.0,1.0,3.0,3.84191,18.2943,1131.43,1923.44,15.5925,62.7757,14.074,4.22033,18.8243,2.0,30.4041,16.779,58.9338,26.4798,45.9966,1.87632,2.451,31.0,45.0,0.0,0.0,3.35869,0.0,5.337131,1e-06,0.0,0.0,2.398546,4.813291,4.990461,1.405737,8.475198,0.0,0.0,6.87957,1.935769,8.775964,0.0,3.882035,6.520278,18.483984,7.693,0.673259,10.639287,0.0,5.649461,6.437056,8.081806,14.662292,6.477272,2.085419,17.302296,0.0,10.002832,7.267302,1.532843,5.770669,1.359145,0.0,0.0,0.0,0.0,4.486127,14.495865,4.542089,4.867955,0.0,0.0,3.569353,8.330843,0.0,0.0,2.021903,3.556332,6.78969,7.568647,0.0,4.528473,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,164.631122,0.0,0.0,1.028971,0.747651,0.224196,1.281264,21298.377749,36207.411592,13.865564,23.571569,0.472854,6.274343,0.563684,0.661008
2740,18.0,64.6,25.077397,66.5,159.004,32.08,71.92,85.56,118.68,5.12,7.6,27.2,18.16,0.44,30.46,1.8,31.732,1.72,9.24,0.48,7.798,0.48,8.24,0.56,10.28,0.8,2.48,5.197497,24.912532,1478.2584,2347.2708,33.184904,99.716796,16.124564,8.78795,54.771184,2.16,40.078936,26.453056,94.519284,47.053444,73.26384,1.04,2.6676,41.32,57.96,2.2,9.400064,4.55134,5.336944,2.825522,3.31189,9.826693,3.195566,9.899724,8.433462,9.607602,4.862829,6.79766,7.719397,3.41614,3.62288,8.579611,1.973558,8.475323,5.34425,3.890325,6.591288,8.120471,2.979621,5.971413,10.577398,7.255359,4.757495,3.640961,4.056548,7.151607,3.163836,4.198445,7.948455,7.691755,4.961006,5.035649,4.755392,12.226996,4.273849,2.720008,3.383621,5.743136,5.131161,6.630667,3.42635,10.278753,2.4771,7.448494,3.342066,3.894833,2.800315,3.722895,2.237977,4.926546,6.155309,4.432342,9.462136,1.851145,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,451.393148,39.6,55.170274,2.198539,0.294399,0.160448,1.290122,80965.962826,128562.800885,9.296989,14.762338,0.707571,5.354314,0.460767,0.547049


In [70]:
test_imputed.isna().sum().sum()

0

In [76]:
train_final = train_imputed[test_imputed.columns.tolist() + ['sii']].copy()
test_final = test_imputed.copy()

In [77]:
print(train_final.shape)
print(test_final.shape)

(2736, 172)
(20, 171)


In [71]:
def quadratic_weighted_kappa(y_true, y_pred):
    return cohen_kappa_score(y_true, y_pred, weights='quadratic')

def threshold_Rounder(oof_non_rounded, thresholds):
    return np.where(oof_non_rounded < thresholds[0], 0,
                    np.where(oof_non_rounded < thresholds[1], 1,
                             np.where(oof_non_rounded < thresholds[2], 2, 3)))

def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    rounded_p = threshold_Rounder(oof_non_rounded, thresholds)
    return -quadratic_weighted_kappa(y_true, rounded_p)

# Model Training and Evaluation

- **Model Types**: Various models are used, including:
  - **LightGBM**: A gradient-boosting framework known for its speed and efficiency with large datasets.
  - **XGBoost**: Another powerful gradient-boosting model used for structured data.
  - **CatBoost**: Optimized for categorical features without the need for extensive preprocessing.
  - **Voting Regressor**: An ensemble model that combines the predictions of LightGBM, XGBoost, and CatBoost for better accuracy.
- **Cross-Validation**: Stratified K-Folds cross-validation is employed to split the data into training and validation sets, ensuring balanced class distribution in each fold.
- **Quadratic Weighted Kappa (QWK)**: The performance of the models is evaluated using QWK, which measures the agreement between predicted and actual values, taking into account the ordinal nature of the target variable.
- **Threshold Optimization**: The `minimize` function from `scipy.optimize` is used to fine-tune decision thresholds that map continuous predictions to discrete categories (None, Mild, Moderate, Severe).


In [72]:
def TrainML(model_class, train_data, test_data):
    X = train_data.drop(['sii'], axis=1)
    y = train_data['sii']

    SKF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    
    train_S = []
    test_S = []
    
    oof_non_rounded = np.zeros(len(y), dtype=float) 
    oof_rounded = np.zeros(len(y), dtype=int) 
    test_preds = np.zeros((len(test_data), n_splits))

    for fold, (train_idx, test_idx) in enumerate(tqdm(SKF.split(X, y), desc="Training Folds", total=n_splits)):
        X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]

        model = clone(model_class)
        model.fit(X_train, y_train)

        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)

        oof_non_rounded[test_idx] = y_val_pred
        y_val_pred_rounded = y_val_pred.round(0).astype(int)
        oof_rounded[test_idx] = y_val_pred_rounded

        train_kappa = quadratic_weighted_kappa(y_train, y_train_pred.round(0).astype(int))
        val_kappa = quadratic_weighted_kappa(y_val, y_val_pred_rounded)

        train_S.append(train_kappa)
        test_S.append(val_kappa)
        
        test_preds[:, fold] = model.predict(test_data)
        
        print(f"Fold {fold+1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}")
        clear_output(wait=True)

    print(f"Mean Train QWK --> {np.mean(train_S):.4f}")
    print(f"Mean Validation QWK ---> {np.mean(test_S):.4f}")

    KappaOPtimizer = minimize(evaluate_predictions,
                              x0=[0.5, 1.5, 2.5], args=(y, oof_non_rounded), 
                              method='Nelder-Mead')
    assert KappaOPtimizer.success, "Optimization did not converge."
    
    oof_tuned = threshold_Rounder(oof_non_rounded, KappaOPtimizer.x)
    tKappa = quadratic_weighted_kappa(y, oof_tuned)

    print(f"----> || Optimized QWK SCORE :: {Fore.CYAN}{Style.BRIGHT} {tKappa:.3f}{Style.RESET_ALL}")

    tpm = test_preds.mean(axis=1)
    tpTuned = threshold_Rounder(tpm, KappaOPtimizer.x)
    
    submission = pd.DataFrame({
        'id': sample['id'],
        'sii': tpTuned
    })

    return submission


# Hyperparameter Tuning

- **LightGBM Parameters**: Hyperparameters such as `learning_rate`, `max_depth`, `num_leaves`, and `feature_fraction` are tuned to improve the performance of the LightGBM model. These parameters control the complexity of the model and its ability to generalize to new data.
- **XGBoost and CatBoost Parameters**: Similar tuning is applied for XGBoost and CatBoost, adjusting parameters such as `n_estimators`, `max_depth`, `learning_rate`, `subsample`, and `regularization` terms (`reg_alpha`, `reg_lambda`). These help in controlling overfitting and ensuring the model's robustness.

In [73]:
# Model parameters for LightGBM
Params = {
    'learning_rate': 0.046,
    'max_depth': 12,
    'num_leaves': 478,
    'min_data_in_leaf': 13,
    'feature_fraction': 0.893,
    'bagging_fraction': 0.784,
    'bagging_freq': 4,
    'lambda_l1': 10,  # Increased from 6.59
    'lambda_l2': 0.01,  # Increased from 2.68e-06
    'device': 'gpu'

}


# XGBoost parameters
XGB_Params = {
    'learning_rate': 0.05,
    'max_depth': 6,
    'n_estimators': 200,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 1,  # Increased from 0.1
    'reg_lambda': 5,  # Increased from 1
    'random_state': SEED,
    'tree_method': 'gpu_hist',

}


CatBoost_Params = {
    'learning_rate': 0.05,
    'depth': 6,
    'iterations': 200,
    'random_seed': SEED,
    'verbose': 0,
    'l2_leaf_reg': 10,  # Increase this value
    'task_type': 'GPU'

}

In [74]:
# New: TabNet

from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.model_selection import train_test_split
from pytorch_tabnet.callbacks import Callback

class TabNetWrapper(BaseEstimator, RegressorMixin):
    def __init__(self, **kwargs):
        self.model = TabNetRegressor(**kwargs)
        self.kwargs = kwargs
        self.imputer = SimpleImputer(strategy='median')
        self.best_model_path = 'best_tabnet_model.pt'
        
    def fit(self, X, y):
        # Handle missing values
        X_imputed = self.imputer.fit_transform(X)
        
        if hasattr(y, 'values'):
            y = y.values
            
        # Create internal validation set
        X_train, X_valid, y_train, y_valid = train_test_split(
            X_imputed, 
            y, 
            test_size=0.2,
            random_state=42
        )
        
        # Train TabNet model
        history = self.model.fit(
            X_train=X_train,
            y_train=y_train.reshape(-1, 1),
            eval_set=[(X_valid, y_valid.reshape(-1, 1))],
            eval_name=['valid'],
            eval_metric=['mse'],
            max_epochs=500,
            patience=50,
            batch_size=1024,
            virtual_batch_size=128,
            num_workers=0,
            drop_last=False,
            callbacks=[
                TabNetPretrainedModelCheckpoint(
                    filepath=self.best_model_path,
                    monitor='valid_mse',
                    mode='min',
                    save_best_only=True,
                    verbose=True
                )
            ]
        )
        
        # Load the best model
        if os.path.exists(self.best_model_path):
            self.model.load_model(self.best_model_path)
            os.remove(self.best_model_path)  # Remove temporary file
        
        return self
    
    def predict(self, X):
        X_imputed = self.imputer.transform(X)
        return self.model.predict(X_imputed).flatten()
    
    def __deepcopy__(self, memo):
        # Add deepcopy support for scikit-learn
        cls = self.__class__
        result = cls.__new__(cls)
        memo[id(self)] = result
        for k, v in self.__dict__.items():
            setattr(result, k, deepcopy(v, memo))
        return result

# TabNet hyperparameters
TabNet_Params = {
    'n_d': 64,              # Width of the decision prediction layer
    'n_a': 64,              # Width of the attention embedding for each step
    'n_steps': 5,           # Number of steps in the architecture
    'gamma': 1.5,           # Coefficient for feature selection regularization
    'n_independent': 2,     # Number of independent GLU layer in each GLU block
    'n_shared': 2,          # Number of shared GLU layer in each GLU block
    'lambda_sparse': 1e-4,  # Sparsity regularization
    'optimizer_fn': torch.optim.Adam,
    'optimizer_params': dict(lr=2e-2, weight_decay=1e-5),
    'mask_type': 'entmax',
    'scheduler_params': dict(mode="min", patience=10, min_lr=1e-5, factor=0.5),
    'scheduler_fn': torch.optim.lr_scheduler.ReduceLROnPlateau,
    'verbose': 1,
    'device_name': 'cuda' if torch.cuda.is_available() else 'cpu'
}

class TabNetPretrainedModelCheckpoint(Callback):
    def __init__(self, filepath, monitor='val_loss', mode='min', 
                 save_best_only=True, verbose=1):
        super().__init__()  # Initialize parent class
        self.filepath = filepath
        self.monitor = monitor
        self.mode = mode
        self.save_best_only = save_best_only
        self.verbose = verbose
        self.best = float('inf') if mode == 'min' else -float('inf')
        
    def on_train_begin(self, logs=None):
        self.model = self.trainer  # Use trainer itself as model
        
    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        current = logs.get(self.monitor)
        if current is None:
            return
        
        # Check if current metric is better than best
        if (self.mode == 'min' and current < self.best) or \
           (self.mode == 'max' and current > self.best):
            if self.verbose:
                print(f'\nEpoch {epoch}: {self.monitor} improved from {self.best:.4f} to {current:.4f}')
            self.best = current
            if self.save_best_only:
                self.model.save_model(self.filepath)  # Save the entire model

# Ensemble Learning and Submission Preparation

- **Ensemble Learning**: The model uses a **Voting Regressor**, which combines the predictions from LightGBM, XGBoost, and CatBoost. This approach is beneficial as it leverages the strengths of multiple models, reducing overfitting and improving overall model performance.
- **Out-of-Fold (OOF) Predictions**: During cross-validation, out-of-fold predictions are generated for the training set, which helps in model evaluation without data leakage.
- **Kappa Optimizer**: The Kappa Optimizer ensures that the predicted values are as close to the actual values as possible by adjusting the thresholds used to convert raw model outputs into class labels.
- **Test Set Predictions**: After the model is trained and thresholds are optimized, the test dataset is processed, and predictions are generated using the ensemble model. These predictions are converted into the appropriate format for submission.
- **Submission File Creation**: The predictions are saved in a CSV file following the required format for submission (e.g., for a Kaggle competition), which includes columns like `id` and `sii` (Severity Impairment Index).

# Final Results and Performance Metrics

- **Train and Validation Scores**: After training across multiple folds, the mean Quadratic Weighted Kappa (QWK) score is calculated for both the training and validation datasets, providing an indicator of model performance. 
- **Optimized QWK Score**: The final optimized QWK score after threshold tuning is displayed, showcasing the model's ability to predict the severity levels effectively.
- **Test Predictions**: The test set predictions are evaluated, and a breakdown of the predicted severity levels (None, Mild, Moderate, Severe) is shown, along with their respective counts.

In [75]:
# Create model instances
Light = LGBMRegressor(**Params, random_state=SEED, verbose=-1, n_estimators=300)
XGB_Model = XGBRegressor(**XGB_Params)
CatBoost_Model = CatBoostRegressor(**CatBoost_Params)
TabNet_Model = TabNetWrapper(**TabNet_Params) # New

voting_model = VotingRegressor(estimators=[
    ('lightgbm', Light),
    ('xgboost', XGB_Model),
    ('catboost', CatBoost_Model),
    ('tabnet', TabNet_Model)
])

In [78]:
Submission1 = TrainML(voting_model, train_final, test_final)

Submission1

Training Folds: 100%|████████████████████████████████████████████████████████████████████| 5/5 [02:03<00:00, 24.72s/it]

Mean Train QWK --> 0.8760
Mean Validation QWK ---> 0.6447
----> || Optimized QWK SCORE :: [36m[1m 0.689[0m





Unnamed: 0,id,sii
0,00008ff9,0
1,000fd460,0
2,00105258,2
3,00115b9f,1
4,0016bb22,1
5,001f3379,1
6,0038ba98,1
7,0068a485,2
8,0069fbed,2
9,0083e397,2


In [79]:
imputer = SimpleImputer(strategy='median')

ensemble = VotingRegressor(estimators=[
    ('lgb', Pipeline(steps=[('imputer', imputer), ('regressor', LGBMRegressor(random_state=SEED))])),
    ('xgb', Pipeline(steps=[('imputer', imputer), ('regressor', XGBRegressor(random_state=SEED))])),
    ('cat', Pipeline(steps=[('imputer', imputer), ('regressor', CatBoostRegressor(random_state=SEED, silent=True))])),
    ('rf', Pipeline(steps=[('imputer', imputer), ('regressor', RandomForestRegressor(random_state=SEED))])),
    ('gb', Pipeline(steps=[('imputer', imputer), ('regressor', GradientBoostingRegressor(random_state=SEED))])),
    ('tabnet', Pipeline(steps=[('imputer', imputer), ('regressor', TabNetWrapper(**TabNet_Params))]))  # New:TabNet
])

In [80]:
Submission3 = TrainML(ensemble, train_final, test_final)

Submission3

Training Folds: 100%|████████████████████████████████████████████████████████████████████| 5/5 [03:26<00:00, 41.22s/it]

Mean Train QWK --> 0.9760
Mean Validation QWK ---> 0.6505
----> || Optimized QWK SCORE :: [36m[1m 0.693[0m





Unnamed: 0,id,sii
0,00008ff9,0
1,000fd460,0
2,00105258,1
3,00115b9f,1
4,0016bb22,1
5,001f3379,1
6,0038ba98,1
7,0068a485,1
8,0069fbed,2
9,0083e397,2


In [None]:
sub1 = Submission1
sub2 = Submission2
sub3 = Submission3

sub1 = sub1.sort_values(by='id').reset_index(drop=True)
sub2 = sub2.sort_values(by='id').reset_index(drop=True)
sub3 = sub3.sort_values(by='id').reset_index(drop=True)

combined = pd.DataFrame({
    'id': sub1['id'],
    'sii_1': sub1['sii'],
    'sii_2': sub2['sii'],
    'sii_3': sub3['sii']
})

def majority_vote(row):
    return row.mode()[0]

combined['final_sii'] = combined[['sii_1', 'sii_2', 'sii_3']].apply(majority_vote, axis=1)

final_submission = combined[['id', 'final_sii']].rename(columns={'final_sii': 'sii'})

final_submission.to_csv('submission.csv', index=False)

print("Majority voting completed and saved to 'Final_Submission.csv'")

In [None]:
final_submission