In [None]:
import pandas as pd
import miceforest as mf

In [None]:
# Location of the Olink proteomics table inside the FinnGen library
# (the path suggests third-batch, post-QC data -- confirm against the release notes).
file_path = '/finngen/library-red/EA5/proteomics/olink/third_batch/QCd/proteomics_QC_all.txt'

# The table is tab-delimited.
finngen_olink = pd.read_csv(file_path, delimiter='\t')

In [None]:
# Replace hyphenated protein column names with hyphen-free ones.
# 'NT-proBNP' is parked under the temporary name 'NTproBNP2' because the merge
# below also expects a separate 'NTproBNP' column to exist in the table
# (assumption about the input file -- the merge fails with KeyError otherwise).
finngen_olink = finngen_olink.rename(
    columns={
        'NT-proBNP': 'NTproBNP2',
        'ERVV-1': 'ERVV_1',
        'HLA-A': 'HLA_A',
        'HLA-DRA': 'HLA_DRA',
        'HLA-E': 'HLA_E',
    }
)

# Merge the two NT-proBNP columns row by row: keep the 'NTproBNP' value where it
# is present, otherwise fall back to 'NTproBNP2'; then discard the temporary column.
# The guard keeps the cell re-runnable: once 'NTproBNP2' has been dropped, an
# unguarded second execution would raise KeyError on the column selection.
if 'NTproBNP2' in finngen_olink.columns:
    finngen_olink['NTproBNP'] = finngen_olink[['NTproBNP', 'NTproBNP2']].bfill(axis=1).iloc[:, 0]
    finngen_olink = finngen_olink.drop(columns='NTproBNP2')

In [None]:
# Maximum tolerated fraction of missing values per column
perc = 0.3
# The same cut-off expressed as a number of rows
threshold = perc * finngen_olink.shape[0]

# Percentage of missing values in every column
mperc = finngen_olink.isna().sum().div(finngen_olink.shape[0]).mul(100)

# Names of the columns whose missingness exceeds the cut-off
too_sparse = mperc.gt(perc * 100)
cols_over = mperc[too_sparse].index

print(f"Column names with missing data over {perc*100}%: {cols_over}")

In [None]:
# Drop the 'IID' column; 'FID' stays in the frame as a regular column (it is not
# moved to the index) because later cells refer to it by column name.
# errors='ignore' keeps the cell re-runnable: a second execution would otherwise
# raise KeyError because 'IID' is already gone.
finngen_olink = finngen_olink.drop(columns='IID', errors='ignore')

In [None]:
## Build the predictor schema used for imputation

# Columns that must never act as predictors: the 'FID' column plus every
# column flagged as having too much missing data.
exclude = ['FID', *cols_over]
# Columns that must not be imputed themselves
dont_impute = ['FID']

# Columns eligible to serve as predictors, in their original order
candidate_predictors = [name for name in finngen_olink.columns if name not in exclude]

# For every column to be imputed, list all eligible predictors except the column itself
column_dict = {}
for col in finngen_olink.columns:
    if col in dont_impute:
        continue
    column_dict[col] = [name for name in candidate_predictors if name != col]

In [None]:
# Sanity check: 'FID' must never appear among the predictors of any column.
# The assertion covers every entry and fails loudly if the schema is wrong.
assert all('FID' not in predictors for predictors in column_dict.values()), \
    "'FID' must not be used as a predictor"
# Spot check for a single protein (displays False when the schema is correct)
'FID' in column_dict['APOE']

In [None]:
# Store 'FID' with a categorical dtype
# (presumably so the imputation kernel accepts this non-numeric column -- confirm).
finngen_olink['FID'] = finngen_olink['FID'].astype('category')
finngen_olink

In [None]:
# Fixed seed passed to the imputation kernel
random_seed = 3456

# Set up the miceforest kernel: one imputed dataset, with column_dict supplying
# the list of predictor columns for each column to be imputed.
kds = mf.ImputationKernel(
    finngen_olink,
    datasets=1,
    variable_schema=column_dict,
    random_state=random_seed,
)

# Run 5 MICE iterations. n_jobs=-1 is passed through to the underlying model
# fitting (expected to request all available cores -- confirm for the installed
# miceforest version).
kds.mice(
    iterations=5,
    n_jobs=-1,
    verbose=True,
)

# Extract the completed (imputed) dataframe from the kernel
olink_data_imputed = kds.complete_data()

# Save the imputed data without the row index. `index` is documented as a bool,
# so pass False explicitly rather than relying on None being falsy.
olink_data_imputed.to_csv('/home/ivm/Documents/olink_imputed_jan_25_2024.csv', index=False)