In [2]:
import pandas as pd
import datetime as dt
import miceforest as mf

# Run date formatted as 'YYYY-MM-DD'.
# NOTE(review): `now` is not referenced by any of the cells visible here.
now = dt.datetime.now().strftime('%Y-%m-%d')

# Project root and the seed passed to the imputation kernel further down.
# The trailing slash matters: paths are built by plain f-string concatenation.
# NOTE(review): hard-coded absolute user path -- only resolves on the machine
# it was written on; consider making it configurable.
filepath = '/Users/aargenti/Documents/proteomic_age/'
random_seed = 3456

# Load protein data (our file is columns for eid, plate, batch, and all olink proteins)
data_path2 = f'{filepath}data/olink_data_wide_oct_30_2023.csv'
data = pd.read_csv(data_path2)


In [17]:
# Missing-data cut-off, as a fraction of rows and as an absolute row count.
# NOTE(review): `threshold` is not used by any of the cells visible here;
# the filter below works on percentages instead.
perc = 0.3
threshold = perc * len(data)

# Percentage (0-100) of missing values in each column
mperc = data.isnull().sum().div(len(data)).mul(100)

# Names of the columns whose missing percentage is strictly above the cut-off
cols_over = mperc.loc[mperc.gt(perc * 100)].index

print(f"Column names with missing data over {perc*100}%: {cols_over}")

Column names with missing data over 30.0%: Index(['GLIPR1', 'NPM1', 'PCOLCE'], dtype='object')


In [8]:
## Build the predictor map for imputation: {column to impute: [columns used to predict it]}

# variables we don't want imputed (they get no entry in the map)
dont_impute = ['eid', 'olink_batch', 'olink_plate']
# columns that must never act as predictors: the ones above plus the 3 proteins
# with high missingness. Those 3 proteins are still keys of the map, so they
# get a predictor list of their own.
exclude = dont_impute + ['GLIPR1', 'NPM1', 'PCOLCE']

# for every column that gets an entry, list all other non-excluded columns in file order
column_dict = {
    target: [
        predictor
        for predictor in data.columns
        if predictor != target and predictor not in exclude
    ]
    for target in data.columns
    if target not in dont_impute
}

In [None]:
# Set up the miceforest imputation kernel on the loaded data: a single
# dataset, the per-column predictor lists from column_dict as the variable
# schema, and a fixed random state.
kds = mf.ImputationKernel(
    data,
    datasets=1,
    variable_schema=column_dict,
    random_state=random_seed,
)

# Run 5 MICE iterations with verbose output. n_jobs=-1 is intended to run the
# imputation on multiple cores.
# NOTE(review): confirm how the installed miceforest version handles n_jobs.
kds.mice(iterations=5, n_jobs=-1, verbose=True)

In [None]:
# Extract the completed (imputed) dataframe from the miceforest kernel
olink_data_imputed = kds.complete_data()

# Save the imputed data as CSV under the project's data folder, without the index.
# NOTE(review): the output name is stamped 'june_22_2023' while the input file
# loaded earlier is 'oct_30_2023' -- confirm the label is intentional.
olink_data_imputed.to_csv(
    f'{filepath}data/olink_data_wide_june_22_2023_imputed.csv',
    index=False,
)