### Imputation and covariate adjustment. The process in this file is computationally expensive and was run on the cluster. This file adds 'Gender' and the exposure variables as covariates to remove sex-specific and exposure-related differences.

#### 1. Import settings

In [None]:
# Settings imported from other notebook Settings.ipynb
#%run Settings.ipynb
from pyarrow import feather
import pandas as pd
import numpy as np
import os, datetime
from makedirectory import make_directory

In [None]:
# Check whether the program is running locally or on the cluster by comparing
# the output of the `hostname` command with the workstation's name.
# The pipe is opened in a `with` block so it is closed even if read() fails.
with os.popen('hostname') as process:
    p_loc = process.read()
p_loc = p_loc.strip('\n')  # drop the trailing newline printed by `hostname`

if p_loc == 'WS-IDRB-404B':
    print("Running locally")
else:
    print("Running on cluster")

#### 2. Load data

In [None]:
# Pick the input locations depending on where the notebook runs, then load
# the methylation matrix (feather) and the phenotype table (csv).
# Local copies live in the 2021-11-15_21-41-53 pre-processing folder.
running_locally = p_loc == 'WS-IDRB-404B'
if running_locally:
    beta_file = "G:/PGC ML/Pre_Processed Data/2021-11-15_21-41-53/DNHS_GTP_MRS_ArmyS_Prismo_methylation.feather"
    pheno_file = "G:/PGC ML/Pre_Processed Data/2021-11-15_21-41-53/DNHS_GTP_MRS_ArmyS_Prismo_Pheno.csv"
else:
    beta_file = "/home/a/ahwani/PGCML/DNHS_GTP_MRS_ArmyS_Prismo_methylation.feather"
    pheno_file = "/home/a/ahwani/PGCML/DNHS_GTP_MRS_ArmyS_Prismo_Pheno.csv"

beta = feather.read_feather(beta_file)
pheno = pd.read_csv(pheno_file)


In [None]:
# convert cpg names to index
beta = beta.set_index("CpGs")

In [None]:
# count different categories
pheno['Gender'].value_counts()

In [None]:
pheno['Ptsdpm'].value_counts()

In [None]:
pheno['Ptsdlife'].value_counts()

In [None]:
pheno

In [None]:
# In this phenotype file the Race column has strings like 1,5/2,5.
# Keep only the part before the first comma; otherwise the ML model
# runs into an error.
first_race_code = pheno['Race'].str.split(',', n=1).str.get(0)
pheno['Race'] = first_race_code


In [None]:
# Check if "," is still there
pheno['Race'].str.contains(',').any()

In [None]:
# Dimension
beta.shape

In [None]:
pheno.shape

In [None]:
# outpheno type file has some columns not needed in ML
# Lets remove them
# Basename we will remove later, because we need it
# pheno = pheno.drop(['Unnamed: 0'], axis=1)

In [None]:
# Number of missing (NA) values in each column of the phenotype table
pheno.isna().sum()

#### Covariate adjustment

In [None]:
pheno.head()

In [None]:
# Covariates to adjust for.
# "Neu" was removed because the cell-type proportions may sum up to 1,
# which may affect the linear model.
# 'Gender' is included as a covariate, together with "Childhood_Mt" and
# "Traumanum".
covars = [
    "Bcell", "Cd4T", "Cd8T", "Mono", "Nk",
    "Smos", "Comp.2", "Comp.3", "Age", "Gender",
    "Childhood_Mt", "Traumanum",
]
id_cols = ["Basename", "Study"]
cols = id_cols + covars

# Rows must be complete for every covariate except childhood trauma and
# trauma number; those two are imputed later instead of being dropped.
required = [c for c in covars if c not in ("Childhood_Mt", "Traumanum")]
covar_df = pheno[cols].dropna(subset=required)
covar_df

In [None]:
# check na
covar_df.isna().any().sum()

In [None]:
studies = covar_df["Study"].unique().tolist()
studies

In [None]:
# Separate the covariate table by study: one data frame per entry of
# `studies`, in the same order.
pheno_ls = [covar_df[covar_df["Study"] == x] for x in studies]

# Now impute the missing values in each cohort using that cohort's column
# means. numeric_only=True restricts the means to numeric columns: the frames
# still carry the "Basename" and "Study" identifier columns, and pandas >= 2.0
# raises a TypeError when mean() meets non-numeric columns (older versions
# silently skipped them, which is the behaviour kept here).
pheno_ls = [x.fillna(x.mean(numeric_only=True)) for x in pheno_ls]
[x.shape for x in pheno_ls]


In [None]:

# check for nas
[x.isna().any().sum() for x in pheno_ls]

In [None]:
beta = beta.T # tranpose to get cpgs as columns

In [None]:
# Now separate the methylation data: one data frame per cohort.
# Rows are selected in the order of each cohort's "Basename" column so that
# the methylation rows line up with the covariate rows; the regression further
# down pairs the two by position, not by label, so a boolean isin() mask
# (which keeps beta's own row order) could silently mismatch samples.
# Basenames that are absent from beta are skipped.
# NOTE(review): assumes Basename values are unique within a cohort - confirm.
meth_ls = [
    beta.loc[x.loc[x["Basename"].isin(beta.index), "Basename"].tolist()]
    for x in pheno_ls
]

In [None]:
[x.shape for x in meth_ls]

In [None]:
# Impute missing methylation values cohort by cohort, replacing each NA with
# the mean of its column within that cohort.
def _fill_with_column_means(frame):
    return frame.fillna(frame.mean())


meth_ls = [_fill_with_column_means(cohort) for cohort in meth_ls]

In [None]:
# check for nas
[x.isna().any().sum() for x in meth_ls]

In [None]:
# Combine after imputation so that we can test it 
# without covariate adjustment in machine learning
meth_data = pd.concat(meth_ls)
meth_data

In [None]:
# check if df has zero element
meth_data.isin([0]).any().any()

In [None]:
# Convert beta to m values
# The transform used is the natural-log logit, log(beta / (1 - beta)); the
# back-conversion later in this file uses np.exp accordingly.
# NOTE(review): M-values are commonly defined with log2 - confirm that the
# natural log is intended.
# Beta values of exactly 0 or 1 map to -inf / +inf; only zeros are checked
# for in the cell above.
meth_data_m = np.log((meth_data/(1-meth_data)))
meth_data_m

In [None]:
meth_data["Index"] = meth_data.index
meth_data_m["Index"] = meth_data_m.index
meth_data

In [None]:
# make directory data

In [None]:
# feather.write_feather(meth_data, "G:/PGC ML/Pre_Processed Data/2021-11-15_21-41-53/Imputed_DNHS_GTP_MRS_ArmyS_Prsm.feather")
# feather.write_feather(meth_data_m, "G:/PGC ML/Pre_Processed Data/2021-11-15_21-41-53/Imputed_DNHS_GTP_MRS_ArmyS_Prsm_m_vals.feather")

In [None]:
# Now convert individual cohorts beta to m values
# Each cohort's data frame gets the natural-log logit, log(x / (1 - x)).
meth_ls_mvals = [np.log(x/(1-x)) for x in meth_ls]

In [None]:
meth_ls_mvals

In [None]:
# check shape
[x.shape for x in meth_ls_mvals]

In [None]:
[x.shape for x in pheno_ls]

In [None]:
pheno_ls

In [None]:
# Now let's do the covariate adjustment, for each cohort separately.
# NOTE(review): only the first 50 samples and the first 10 CpGs of every
# cohort are selected here, and everything built from these lists further
# down (including the files written at the end) uses these subsets - confirm
# whether the full meth_ls / meth_ls_mvals should be used instead.
row_window = slice(0, 50)
cpg_window = slice(0, 10)

test_betavals = [x.iloc[row_window, cpg_window] for x in meth_ls]
test_mvals = [x.iloc[row_window, cpg_window] for x in meth_ls_mvals]
# Columns 0 and 1 are "Basename" and "Study"; the covariates start at 2.
covar = [x.iloc[row_window, 2:] for x in pheno_ls]

In [None]:
print([x.shape for x in test_betavals])
print([x.shape for x in covar])

In [None]:
print([x.shape for x in test_mvals])

In [None]:
# adjusted_data = {'DNHS':[], 'GTP':[], 'MRS':[],
#                 'Armystarrs':[], 'Prismo':[]}
from collections import defaultdict
adjusted_data = defaultdict(list)
adjusted_data

In [None]:
studies

In [None]:
# Covariate adjustment: for every CpG a linear model is fit on the covariates,
# once on the beta values and once on the m values, and the residuals
# (observed - predicted), rounded to 5 decimals, are kept as adjusted values.
from sklearn.linear_model import LinearRegression
names = ["beta", "m"]
dfs = [test_betavals, test_mvals]
for label, cohort_frames in zip(names, dfs):  # beta values first, then m values
    for idx, cohort_df in enumerate(cohort_frames):  # one data frame per cohort
        print("Processing data ", idx + 1)
        design = covar[idx]
        out_key = label + "_" + studies[idx]
        for cpg in cohort_df:  # iterating a data frame yields its column names
            observed = cohort_df[cpg]
            model = LinearRegression().fit(design, observed)
            fitted = model.predict(design)
            residuals = round(observed - fitted, 5)
            print(residuals.values[0:5])
            adjusted_data[out_key].append(residuals.values)

In [None]:
adjusted_data.keys()

In [None]:
adjusted_data["beta_DNHS"]

In [None]:
adjusted_data["m_DNHS"]

In [None]:
cpg_names = test_betavals[0].columns # it will be same of all cohorts

In [None]:
# Samples names in each cohort
sample_names = [x.index for x in test_betavals]
sample_names

In [None]:
# Flatten the per-cohort sample names into one list, cohort after cohort
all_names = []
for cohort_names in sample_names:
    all_names.extend(cohort_names)
sample_names = all_names
sample_names

In [None]:
print(cpg_names[0:5])
print(sample_names[0:5])

In [None]:
def pull_studies(ptrn, in_dict):
    """
    Select the dictionary entries whose key contains a pattern.
    Used to pull out the beta-value or m-value cohorts.
    Parameters:
    ptrn: substring a key must contain to be kept
    in_dict: input dictionary
    Returns:
    A new dict holding the matching items in their original order
    """
    matches = {}
    for key, value in in_dict.items():
        if ptrn in key:
            matches[key] = value
    return matches


def add_column_names(col_names, in_dict,
                     cohorts):
    """
    Build one data frame per cohort, labelling its columns.
    Each cohort's values in in_dict are paired with col_names in order,
    so every value sequence becomes one named column.
    Parameters:
    col_names: column names (cpgs)
    in_dict: input dictionary, keyed by cohort
    cohorts: Names of cohorts (keys of in_dict) to convert
    Returns:
    A list of data frames, one per cohort, in the order of cohorts
    """
    frames = []
    for cohort in cohorts:
        named_columns = dict(zip(col_names, in_dict[cohort]))
        frames.append(pd.DataFrame.from_dict(named_columns))
    return frames

In [None]:
beta_studies = pull_studies(ptrn="beta", in_dict=adjusted_data)
beta_studies.keys()

In [None]:
beta_studies

In [None]:
m_studies = pull_studies(ptrn="m_", in_dict=adjusted_data)
m_studies.keys()

In [None]:
m_studies

In [None]:
# Turn the per-cohort residual lists into data frames with CpG column names,
# once for the beta-value results and once for the m-value results.
beta_cohorts = list(beta_studies)
final_beta = add_column_names(cpg_names, beta_studies, beta_cohorts)

m_cohorts = list(m_studies)
final_m = add_column_names(cpg_names, m_studies, m_cohorts)
# [pd.DataFrame.from_dict(dict(zip(cpg_names, adjusted_data[x])))
#          for x in studies]

In [None]:
final_beta

In [None]:
# now combine all cohorts for beta and m values
final_beta_comb = pd.concat(final_beta)
final_m_comb = pd.concat(final_m)

In [None]:
# Convert m values back to beta values after covariate adjustment
# 1 / (1 + 1/exp(x)) is the inverse of the log(x / (1 - x)) transform applied
# earlier in this file.
# NOTE(review): final_m_comb holds regression residuals of the m values, not
# the m values themselves - confirm that back-transforming residuals is
# intended.
final_m_comb = 1/(1+(1/np.exp(final_m_comb)))

In [None]:
final_beta_comb["Index"] = sample_names
final_m_comb["Index"] =  sample_names

In [None]:
final_beta_comb.shape

In [None]:
final_m_comb

In [None]:
# make directory data
# d_dir = make_directory("G:/PGC ML/Covariate Adjusted/")
# d_dir

In [None]:
# save 
# feather.write_feather(final_beta_comb, "G:/PGC ML/Pre_Processed Data/2021-07-12_10-28-46/Imputed_Covariate_adjusted_Meth.feather")

In [None]:
# We converted the values back to beta values after linear regression 
# residuals on m values
# feather.write_feather(final_comb, "G:/PGC ML/Pre_Processed Data/2021-07-12_10-28-46/Imputed_Covariate_adjusted_Meth_on_mvals.feather")

In [None]:
# Save the adjusted data; this is only done when not on the local workstation.
on_cluster = p_loc != 'WS-IDRB-404B'
if on_cluster:
    d_dir = make_directory("/work/a/ahwani/PGCML/Covariate Adjusted/")
    beta_out = os.path.join(d_dir, "Imputed_Covariate_including_childhood_and_total_trauma_adjusted_Meth.feather")
    # This second file holds the residuals from the regression on m values,
    # converted back to beta values.
    m_out = os.path.join(d_dir, "Imputed_Covariate_including_childhood_and_total_trauma_adjusted_Meth_on_mvals_wo_Neu.feather")

    feather.write_feather(final_beta_comb, beta_out)
    feather.write_feather(final_m_comb, m_out)