In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt
import sklearn as skl
import os
import scipy as sci

Create a DataFrame from the provided CSV file, which contains missing values, and coerce problematic (non-numeric) strings to NaN

In [2]:
# Read the raw table and coerce every column to a numeric dtype in one chain;
# any cell that cannot be parsed as a number is turned into NaN.
df = pd.read_csv("Partial Patient Health.csv").apply(pd.to_numeric, errors='coerce')
df

Unnamed: 0,HR,RR,HBP,LBP,SPO2,TEMP,CF,GLU,AGE,BMI,...,DIS1,DIS2,DIS3,DIS4,DIS5,DIS6,DIS7,DIS8,DIS9,DIS10
0,71.0,12.0,158.0,66.0,99.0,35.7,0.8,105.0,96.0,32.0,...,0.0,0.0,,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,74.0,18.0,148.0,57.0,,35.9,0.4,135.0,66.0,28.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,81.0,26.0,138.0,60.0,95.0,36.3,1.0,157.0,75.0,21.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,
3,66.0,27.0,162.0,,97.0,37.3,1.2,197.0,74.0,13.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,68.0,18.0,136.0,54.0,93.0,36.4,0.5,,72.0,24.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
927,,23.0,,67.0,95.0,35.8,0.8,131.0,,13.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
928,88.0,18.0,164.0,82.0,97.0,37.2,0.9,102.0,88.0,22.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,,0.0,0.0
929,97.0,19.0,,70.0,96.0,37.8,0.9,165.0,80.0,24.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
930,99.0,25.0,182.0,,,35.2,0.8,200.0,92.0,15.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [3]:
# Split the column names by whether they contain the substring 'DIS'.
process_other = [name for name in df.columns if 'DIS' in name]
process_normal = [name for name in df.columns if 'DIS' not in name]

# Keep only the non-'DIS' columns for the imputation experiments below.
df = df.drop(columns=process_other)
df

Unnamed: 0,HR,RR,HBP,LBP,SPO2,TEMP,CF,GLU,AGE,BMI,CHOL
0,71.0,12.0,158.0,66.0,99.0,35.7,0.8,105.0,96.0,32.0,188.0
1,74.0,18.0,148.0,57.0,,35.9,0.4,135.0,66.0,28.0,191.0
2,81.0,26.0,138.0,60.0,95.0,36.3,1.0,157.0,75.0,21.0,207.0
3,66.0,27.0,162.0,,97.0,37.3,1.2,197.0,74.0,13.0,218.0
4,68.0,18.0,136.0,54.0,93.0,36.4,0.5,,72.0,24.0,194.0
...,...,...,...,...,...,...,...,...,...,...,...
927,,23.0,,67.0,95.0,35.8,0.8,131.0,,13.0,217.0
928,88.0,18.0,164.0,82.0,97.0,37.2,0.9,102.0,88.0,22.0,234.0
929,97.0,19.0,,70.0,96.0,37.8,0.9,165.0,80.0,24.0,222.0
930,99.0,25.0,182.0,,,35.2,0.8,200.0,92.0,15.0,226.0


Imputation Method: Replace missing values with the column mean

In [4]:
def replace_nans_with_means(dataframe):
    """Return a copy of ``dataframe`` with NaNs replaced by the column mean.

    For every column the mean is computed ignoring NaNs (``np.nanmean``),
    rounded to one decimal place, and used as the fill value.

    The result is built and returned as a new frame rather than written into
    a module-level variable, so calling this on several frames cannot mix
    their columns together.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Numeric frame that may contain NaNs. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Frame with the same index and columns as ``dataframe``.
    """
    filled = dataframe.copy()
    for col in dataframe.columns:
        col_mean = round(np.nanmean(dataframe[col]), 1)
        filled[col] = dataframe[col].fillna(col_mean)
    return filled

df_mean = replace_nans_with_means(df)
df_mean.isnull().values.any()

False

Imputation Method: Replace missing values with the column median

In [5]:
def replace_nans_with_medians(dataframe):
    """Return a copy of ``dataframe`` with NaNs replaced by the column median.

    For every column the median is computed ignoring NaNs (``np.nanmedian``),
    rounded to one decimal place, and used as the fill value.

    The result is built and returned as a new frame rather than written into
    a module-level variable, so calling this on several frames cannot mix
    their columns together.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Numeric frame that may contain NaNs. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Frame with the same index and columns as ``dataframe``.
    """
    filled = dataframe.copy()
    for col in dataframe.columns:
        col_median = round(np.nanmedian(dataframe[col]), 1)
        filled[col] = dataframe[col].fillna(col_median)
    return filled

df_median = replace_nans_with_medians(df)
df_median

Unnamed: 0,HR,RR,HBP,LBP,SPO2,TEMP,CF,GLU,AGE,BMI,CHOL
0,71.0,12.0,158.0,66.0,99.0,35.7,0.8,105.0,96.0,32.0,188.0
1,74.0,18.0,148.0,57.0,96.0,35.9,0.4,135.0,66.0,28.0,191.0
2,81.0,26.0,138.0,60.0,95.0,36.3,1.0,157.0,75.0,21.0,207.0
3,66.0,27.0,162.0,63.0,97.0,37.3,1.2,197.0,74.0,13.0,218.0
4,68.0,18.0,136.0,54.0,93.0,36.4,0.5,145.0,72.0,24.0,194.0
...,...,...,...,...,...,...,...,...,...,...,...
927,73.0,23.0,131.0,67.0,95.0,35.8,0.8,131.0,79.0,13.0,217.0
928,88.0,18.0,164.0,82.0,97.0,37.2,0.9,102.0,88.0,22.0,234.0
929,97.0,19.0,131.0,70.0,96.0,37.8,0.9,165.0,80.0,24.0,222.0
930,99.0,25.0,182.0,63.0,96.0,35.2,0.8,200.0,92.0,15.0,226.0


Imputation Method: KNN

In [6]:
from sklearn.impute import KNNImputer

# Impute with scikit-learn's KNNImputer using 5 neighbours. fit_transform
# returns a bare array, so the column labels are re-attached afterwards.
imputer = KNNImputer(n_neighbors=5)
df_knn = pd.DataFrame(data=imputer.fit_transform(df), columns=df.columns)
df_knn

Unnamed: 0,HR,RR,HBP,LBP,SPO2,TEMP,CF,GLU,AGE,BMI,CHOL
0,71.0,12.0,158.0,66.0,99.0,35.7,0.8,105.0,96.0,32.0,188.0
1,74.0,18.0,148.0,57.0,93.6,35.9,0.4,135.0,66.0,28.0,191.0
2,81.0,26.0,138.0,60.0,95.0,36.3,1.0,157.0,75.0,21.0,207.0
3,66.0,27.0,162.0,70.4,97.0,37.3,1.2,197.0,74.0,13.0,218.0
4,68.0,18.0,136.0,54.0,93.0,36.4,0.5,155.8,72.0,24.0,194.0
...,...,...,...,...,...,...,...,...,...,...,...
927,71.8,23.0,137.8,67.0,95.0,35.8,0.8,131.0,68.6,13.0,217.0
928,88.0,18.0,164.0,82.0,97.0,37.2,0.9,102.0,88.0,22.0,234.0
929,97.0,19.0,151.4,70.0,96.0,37.8,0.9,165.0,80.0,24.0,222.0
930,99.0,25.0,182.0,70.4,93.6,35.2,0.8,200.0,92.0,15.0,226.0


Imputation Method: MICE

In [7]:
from sklearn.linear_model import LinearRegression

def remove_nan_col(col, df):
    """Return the rows of ``df`` whose value in column ``col`` is not null.

    The input frame is left untouched; the surviving rows keep their
    original index labels.
    """
    has_value = df[col].notnull()
    return df[has_value]

def remove_nan_col_mean(col, df):
    """Build a regression-ready frame for target column ``col``.

    Starts from the mean-imputed frame ``df_mean`` (module level), puts the
    original, un-imputed values of ``col`` from ``df`` back in place, and
    then drops every row where ``col`` is still missing. All other columns
    therefore stay mean-filled while the target holds only observed values.
    """
    with_raw_target = df_mean.assign(**{col: df[col]})
    return remove_nan_col(col, with_raw_target)

# Sanity check: the frame prepared for the 'HR' target should contain no NaNs.
remove_nan_col_mean('HR', df).isna().to_numpy().any()

False

In [8]:
from sklearn.model_selection import train_test_split

# Trial run of the regression step on a single target column ('HR').
df_testHR = remove_nan_col_mean('HR', df)
y = df_testHR['HR']
x = df_testHR.drop(columns='HR')

# 70/30 split with a fixed seed; only the training part is used for fitting.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=100
)

model = LinearRegression().fit(x_train, y_train)

# Predict 'HR' for every row of the mean-imputed frame.
y_pred_model = model.predict(df_mean.drop(columns='HR'))

In [None]:
def seed_cols(col, to_seed, seed):
    """Fill the NaNs of every column except ``col`` with values from ``seed``.

    Parameters
    ----------
    col : str
        Column that must keep its NaNs (the current regression target).
    to_seed : pandas.DataFrame
        Frame containing missing values. It is not modified.
    seed : pandas.DataFrame
        Frame supplying the replacement values. Rows are matched to
        ``to_seed`` by position, so it must have the same number of rows and
        the same column names.

    Returns
    -------
    pandas.DataFrame
        Copy of ``to_seed`` where only column ``col`` may still contain NaNs.
    """
    seeded = to_seed.copy()
    for other in to_seed.columns:
        if other == col:
            continue
        missing = to_seed[other].isna().to_numpy()
        if missing.any():
            # One masked .loc write per column: chained indexing such as
            # frame[c].iloc[r] = v may write to a temporary copy and is a
            # no-op under pandas Copy-on-Write.
            seeded.loc[missing, other] = seed[other].to_numpy()[missing]
    return seeded

def remove_seeded_nans(col, df):
    """Return only the rows of ``df`` in which column ``col`` has a value.

    ``df`` itself is not changed and the kept rows retain their index labels.
    """
    return df.loc[df[col].notna()]

def regress_seed(col, df, df_curr_seed):
    """Fit a linear model for ``col`` and predict it for every seed row.

    Rows of ``df`` where ``col`` is missing are dropped, the remaining
    columns serve as predictors, and a ``LinearRegression`` is fitted on the
    70 % training partition of a seeded 70/30 split. The model is then
    applied to ``df_curr_seed`` with ``col`` removed.

    Returns the array of predictions, one per row of ``df_curr_seed``.
    """
    observed = remove_seeded_nans(col, df)
    features, target = observed.drop(columns=col), observed[col]
    # The held-out partition is produced by the split but never used.
    train_x, _, train_y, _ = train_test_split(
        features, target, test_size=0.3, random_state=100
    )
    fitted = LinearRegression().fit(train_x, train_y)
    return fitted.predict(df_curr_seed.drop(columns=col))

def get_change(current, previous):
    """Return the magnitude of the percentage change from ``previous``.

    Parameters
    ----------
    current, previous : int or float
        New and baseline values (plain Python or numpy scalars).

    Returns
    -------
    float or int
        ``0`` when both values are equal, ``inf`` when the baseline is zero
        and the values differ, otherwise ``|current - previous| / |previous|``
        expressed in percent. If either argument is NaN the result is NaN.
    """
    if current == previous:
        return 0
    # Checked explicitly: numpy floats do not raise ZeroDivisionError, they
    # emit a RuntimeWarning instead, so a try/except never fires for them.
    if previous == 0:
        return float('inf')
    # Divide by the magnitude of the baseline so a negative baseline cannot
    # flip the sign of the result.
    return (abs(current - previous) / abs(previous)) * 100.0
    
def fill_nan_with_model(df, model):
    """Fill the NaNs of ``df`` in place with values from ``model``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with missing values. It is modified in place; observed
        (non-NaN) cells are left untouched.
    model : pandas.DataFrame
        Frame of replacement values with the same column names as ``df``.
        Rows are matched by position. Values are rounded to one decimal
        place before being written.

    Returns
    -------
    None
    """
    # Round once up front instead of once per missing cell.
    rounded = model.round(1)
    for col in df.columns:
        missing = df[col].isna().to_numpy()
        if missing.any():
            # Masked .loc write: chained indexing (df[col].iloc[row] = v) may
            # write to a temporary copy and is a no-op under Copy-on-Write.
            df.loc[missing, col] = rounded[col].to_numpy()[missing]

# Iterative regression imputation driver.
#
# Each pass re-predicts every column from the others (with the other columns'
# gaps filled from the previous pass) and stops once the largest per-cell
# percentage change between two consecutive passes falls below the tolerance.
MICE_TOLERANCE_PCT = 0.5    # convergence threshold, in percent
MICE_MAX_ITERATIONS = 100   # safety cap so a non-converging run cannot hang

df_to_seed = df.copy()
df_seed = df_mean.copy()    # first pass is seeded with the mean-imputed frame
counter = 0
converged = False

while counter < MICE_MAX_ITERATIONS:
    df_next_seed = pd.DataFrame()
    for col in df.columns:
        df_seeded_col = seed_cols(col, df_to_seed, df_seed)
        df_seeded_col = remove_seeded_nans(col, df_seeded_col)
        df_next_seed[col] = regress_seed(col, df_seeded_col, df_seed)

    # Largest percentage change of any cell between the two passes.
    diff = 0
    for col in df.columns:
        for current, previous in zip(df_next_seed[col].to_numpy(),
                                     df_seed[col].to_numpy()):
            perc_change = get_change(current, previous)
            if perc_change > diff:
                diff = perc_change

    df_seed = df_next_seed.copy()
    if diff < MICE_TOLERANCE_PCT:
        converged = True
        break
    # Counts only the passes that did not yet meet the tolerance.
    counter = counter + 1

if not converged:
    print("Warning: no convergence after", MICE_MAX_ITERATIONS, "iterations")

df_seed = df_seed.round(1)
df_mice = df.copy()

print("Iterations: ", counter)
fill_nan_with_model(df_mice, df_seed)
df_mice

Imputation Method: KNN combined with a linear-regression model

In [None]:
def remove_all_nan(df):
    """Return a new frame holding only the rows of ``df`` without any NaN."""
    return df.dropna(axis=0, how='any')

def regress_KNN(col, df, df_knn):
    """Fit a linear model for ``col`` on complete rows and predict from ``df_knn``.

    Only rows of ``df`` without any NaN are used. The other columns act as
    predictors and a ``LinearRegression`` is fitted on the 70 % training
    partition of a seeded 70/30 split. Predictions are then made for every
    row of ``df_knn`` with ``col`` removed.

    Returns the array of predictions, one per row of ``df_knn``.
    """
    complete_rows = remove_all_nan(df)
    features, target = complete_rows.drop(columns=col), complete_rows[col]
    # The held-out partition is produced by the split but never used.
    train_x, _, train_y, _ = train_test_split(
        features, target, test_size=0.3, random_state=100
    )
    fitted = LinearRegression().fit(train_x, train_y)
    return fitted.predict(df_knn.drop(columns=col))

def perform_knn(df):
    """Impute ``df`` with a 5-neighbour ``KNNImputer``.

    Returns a new DataFrame carrying the column labels of ``df``; the row
    index is the default one created by the DataFrame constructor.
    """
    imputed = KNNImputer(n_neighbors=5).fit_transform(df)
    return pd.DataFrame(imputed, columns=df.columns)

def gen_regress_knn(df):
    """Predict every column of ``df`` by regression on a KNN-imputed copy.

    ``df`` is first imputed with ``perform_knn``; then, for each column,
    ``regress_KNN`` supplies a full vector of predictions. The vectors are
    assembled into one DataFrame with the same column order as ``df``.
    """
    df_knn = perform_knn(df)
    return pd.DataFrame(
        {col: regress_KNN(col, df, df_knn) for col in df.columns}
    )
    
# Work on a copy so the original frame keeps its gaps, then overwrite only
# the missing cells with the regression-on-KNN predictions.
df_knn_stacked = df.copy()
fill_nan_with_model(
    df_knn_stacked,
    gen_regress_knn(df_knn_stacked),
)
df_knn_stacked

In [None]:
# Write each imputed frame to its own CSV file under ./out.
# exist_ok=True lets the cell be re-run without failing on an existing folder.
os.makedirs('out', exist_ok=True)  
# to_csv is called with its defaults, so the row index is written as the
# first column of every file.
df_mean.to_csv('out/PPH_mean.csv')
df_median.to_csv('out/PPH_median.csv')
df_knn.to_csv('out/PPH_KNN.csv')
df_mice.to_csv('out/PPH_MICE.csv')
df_knn_stacked.to_csv('out/PPH_KNNs.csv')