### This notebook is part of the PGC ML project, which identifies methylation signatures to predict PTSD and create methylation risk scores.

#### This first notebook contains code to pre-process the data

In [None]:
# First load the settings file
%run Settings.ipynb

In [None]:
# Function to read the data

def read_data(fname, dirpath = None, sheet_name = 0):
    """
    Load a table from disk into a data frame.

    The reader is chosen from the extension of fname: ".feather", ".csv"
    or ".xlsx".

    Parameters:
    fname: file name including extension you want to read
    dirpath: path to the directory containing file, None by default
             (fname is then used as the complete path)
    sheet_name: Sheet name (or position) for reading excel sheets
    output: data frame

    Raises:
    ValueError: if the extension of fname is not one of the supported ones
    """
    import os  # local import keeps the function self-contained

    # os.path.join only inserts a separator when dirpath does not already
    # end with one, so both "dir/" and "dir" work
    p = fname if dirpath is None else os.path.join(dirpath, fname)

    if fname.endswith(".feather"):
        return feather.read_feather(p)
    if fname.endswith(".csv"):
        return pd.read_csv(p)
    if fname.endswith(".xlsx"):
        return pd.read_excel(p, sheet_name = sheet_name)

    # fail with a clear message instead of returning an unbound variable
    raise ValueError("Unsupported file type: " + fname)


def get_samples(df, cols):
    """
    Subset a data frame to the requested columns.

    Parameters:
    df: data frame
    cols: names of the columns to keep

    Output: data frame holding only the columns of df whose name is exactly
    one of cols, in the order they appear in df. Names in cols that are not
    columns of df are ignored.
    """
    # Exact membership instead of a '|'-joined regular expression: a regex
    # matches substrings (so "S1" would also select "S10"), treats special
    # characters as pattern syntax and selects every column when cols is empty
    keep = df.columns.isin(list(cols))
    return df.loc[:, keep]

def get_trauma_exposed(df, col):
    """
    Keep only the rows whose value in the given column differs from 0.

    Rows with a missing value in that column compare unequal to 0 and are
    therefore kept as well.

    Parameters:
    df: data frame
    col: column name used to filter the data frame

    Output: data frame with the rows of df where df[col] != 0
    """
    not_zero = df[col].ne(0)
    return df.loc[not_zero]


def remove_duplicates(df, col):
    """
    Drop rows that repeat an id, keeping the first row of every id.

    Parameters:
    df: data frame
    col: column name that contains the (possibly duplicated) ids

    Output: data frame without the repeated rows; the original index labels
    of the kept rows are preserved
    """
    is_repeat = df.duplicated(subset = col)
    return df[~is_repeat]
    

#### DNHS

In [None]:
# Load DNHS
dnhs_path = "G:/DNHS 2nd Batch/DNHS2ndBatachAnalysis/data/"
dnhs = read_data(fname="DNHS_Noob_QCd_ComBAT_adj_Batch1&2.feather", dirpath = dnhs_path)
dnhs_pheno = read_data(fname = "pheno_PCs_QC_with_smoking_scores.csv", dirpath=dnhs_path)


In [None]:
# check shape
print("DNHS beta shape :", dnhs.shape)
print("DNHS Pheno shape :", dnhs_pheno.shape)

In [None]:
# we will try to get max sample size of unique participants
# As we have some nas values, we will first drop those duplicates
# that have missing values in any columns
# cols_wd_miss = dnhs_pheno.columns[dnhs_pheno.isnull().any()].tolist()
# dnhs_pheno_unq = dnhs_pheno[~dnhs_pheno['RESP'].duplicated(keep=False) | 
#                             dnhs_pheno[cols_wd_miss].notnull().any(axis=1)]
# # dnhs_pheno_comp = dnhs_pheno.dropna()
# dnhs_pheno_unq

In [None]:
# Now get the trauma exposed only
dnhs_pheno = get_trauma_exposed(df =  dnhs_pheno,
                                col = 'TraumaNum')

In [None]:
# trauma number should be > 0
dnhs_pheno['TraumaNum'].min()

In [None]:
# Now we need only the unique resp ids
print("No of unique ppts :", len(dnhs_pheno['RESP'].unique()))
dnhs_pheno = remove_duplicates(df = dnhs_pheno,
                               col = "RESP")
len(dnhs_pheno['RESP'].unique())


In [None]:
# preview the first rows of the DNHS methylation table
dnhs.head()

In [None]:
# Now get the samples in phenotype file
# It will have cpgs and other sample columns
dnhs_cols = ['rowname']+ dnhs_pheno['X'].tolist() # cpgs and samples
dnhs = get_samples(df = dnhs, cols = dnhs_cols)
dnhs.shape

In [None]:
dnhs

In [None]:
dnhs_pheno.isna().sum()

#### GTP

In [None]:
# load GTP
gtp = read_data(fname="G:/GTP Data/QCd Data/GTP_Noob_QCd_Combat_adj.feather")
gtp_pheno = read_data(fname="G:/GTP Data/QCd Data/Pheno_662_samps_With_smoking_scores.csv")
gtp_more_pheno = read_data(fname = "G:/GTP Data/QCd Data/Agaz_ML_Pheno.csv")

In [None]:
# gtp_pheno_old = read_data(fname="G:/GTP Data/QCd Data/Pheno_662_samps.csv")
# gtp_pheno_old['BaseName'].str.contains('|'.join(gtp_pheno['BaseName'].to_list())).sum()

In [None]:
# check shape
print("GTP beta shape :", gtp.shape)
print("GTP Pheno shape :", gtp_pheno.shape)
print("GTP more pheno :", gtp_more_pheno.shape)

In [None]:
# Check if all are trauma exposed
gtp_pheno[gtp_pheno['tei_total_types_experienced_somewitness'] == 0].shape
# gtp_pheno[gtp_pheno['TEI_TOTAL_TYPES_Experienced_somewitness'] == 0].shape


In [None]:
# Keep only trauma exposed
gtp_pheno = get_trauma_exposed(df = gtp_pheno,
                              col = 'tei_total_types_experienced_somewitness')


In [None]:
gtp_pheno['tei_total_types_experienced_somewitness'].min()

In [None]:
gtp_pheno.shape

In [None]:
gtp

In [None]:
gtp_pheno.columns

In [None]:
# Now merge two gtp phenotype files
gtp_pheno_comb = pd.merge(gtp_pheno, gtp_more_pheno,
                         left_on= 'BaseName', 
                          right_on='EPIC_795', how='inner')

In [None]:
gtp_pheno_comb.shape

In [None]:
gtp_pheno_comb.isnull().sum()

In [None]:
# Now get the methylation samples that are in phenotype file
gtp_cols = ["rowname"] + gtp_pheno_comb["BaseName"].tolist()
gtp = get_samples(df = gtp, cols=gtp_cols)

In [None]:
gtp.shape

#### MRS 

In [None]:
# load MRS
mrs = read_data(fname="G:/PGC ML/MRS/MRS_noob_qcd_crossReactiveProbesRemoved_combat_CP_wcovar_age_ptsd_allPreAsControls.feather")
mrs_pheno = read_data(fname="G:/PGC ML/MRS/MRS_Pheno_With_smoking_scores.csv")

In [None]:
# sort the rows 
mrs_pheno = mrs_pheno.sort_values(by =["studyid", "visit"],
                                ascending=True, axis=0)

In [None]:
mrs_pheno = get_trauma_exposed(df = mrs_pheno,
                              col = 'LECCUM_Stringent')

In [None]:
mrs_pheno['LECCUM_Stringent'].min()

In [None]:
# Check how many have two time points
mrs_pheno.groupby(["studyid"]).size().value_counts()

In [None]:
# Childhood trauma is recorded on the first visit, so copy that
# information to the second visit
mrs_pheno["CTQ_TOTAL"] = mrs_pheno.groupby(["studyid"])["CTQ_TOTAL"].ffill()

In [None]:
mrs_pheno[["studyid", "visit", "CTQ_TOTAL"]]

In [None]:
# check shape
print("MRS beta shape :", mrs.shape)
print("MRS Pheno shape :", mrs_pheno.shape)

In [None]:
# As MRS has pre and post, we will use only one sample
mrs_pheno_post = mrs_pheno.loc[mrs_pheno["ID"].str.contains('POST')]

In [None]:
mrs_pheno_post['Group'].value_counts()

In [None]:
# Now get methylation samples that are in pheno
mrs_cols = ["V1"] + mrs_pheno_post["BaseName"].tolist()
mrs_post = get_samples(df = mrs, cols=mrs_cols)

In [None]:
mrs_post


### ArmySTARRS

In [None]:
army_path = "G:/PGC ML/ArmySTARRS/"
army_pheno = read_data(fname="armystarrs_Pheno_ML_updated.csv", dirpath=army_path)
army_meth = read_data(fname="Starrs_noob_qcd_crossReactiveProbesRemoved_combat_CP_wcovar_age2TP_ptsd_allPreAsControls.feather",
                     dirpath=army_path)

In [None]:
print("Armystarrs beta shape :", army_meth.shape)
print("Armystarrs Pheno shape :", army_pheno.shape)

In [None]:
# samples for two visits
army_pheno["visit"].value_counts()

In [None]:
# In ArmySTARRS we have different trauma variables;
# let's consider either non-deployment-related trauma or
# deployment-related trauma
army_pheno["trauma_exposed_critA"].value_counts()

In [None]:
# lets keep only trauma exposed
army_pheno = get_trauma_exposed(df = army_pheno, 
                               col = 'trauma_exposed_critA')

In [None]:
army_pheno['trauma_exposed_critA'].min()

In [None]:
# get only visit 2
army_v2_pheno = army_pheno[army_pheno['visit'] == 2]

In [None]:
army_v2_pheno['visit'].value_counts()

In [None]:
army_v2_pheno["trauma_exposed_critA"].value_counts()

In [None]:
# drop columns that have all nas
army_v2_pheno = army_v2_pheno.dropna(axis=1, how = 'all')
army_v2_pheno

In [None]:
print("No of unique ppts :", len(army_v2_pheno['EWAS_id'].unique()))

In [None]:
army_v2_pheno.isna().sum()

In [None]:
# Now get methylation samples that are in pheno
army_cols = ["V1"] + army_v2_pheno["BaseName"].tolist()
army_v2_meth = get_samples(df = army_meth, cols = army_cols)
army_v2_meth.shape

In [None]:
army_v2_pheno.columns

### PRISMO

In [None]:
prismo_path ="G:/PGC ML/PRISMO/"
prismo_pheno = read_data(fname="prismo_Pheno_ML_updated.csv", dirpath=prismo_path)
prismo_meth = read_data(fname="Prismo_noob_qcd_crossReactiveProbesRemoved_combat_CP_wcovar_age_ptsd_allPreAsControls.feather",
                     dirpath=prismo_path)

In [None]:
print("Prismo beta shape :", prismo_meth.shape)
print("Prismo Pheno shape :", prismo_pheno.shape)

In [None]:
prismo_pheno["visit"].value_counts()


In [None]:
prismo_pheno = get_trauma_exposed(df = prismo_pheno,
                                 col = 'Pes_number')
prismo_pheno.shape

In [None]:
prismo_v2_pheno = prismo_pheno[prismo_pheno["visit"] == "2_epic"]
prismo_v2_pheno

In [None]:
prismo_v2_pheno.shape

In [None]:
print("No of unique ppts :", len(prismo_v2_pheno['EWAS_id'].unique()))

In [None]:
prismo_v2_pheno.isna().sum()

In [None]:
prismo_meth

In [None]:
prismo_cols = ["V1"] + prismo_v2_pheno["BaseName"].tolist()
prismo_v2_meth = get_samples(df = prismo_meth, cols = prismo_cols)
prismo_v2_meth.shape

In [None]:
prismo_v2_pheno.columns

In [None]:
# just a thought ----------------
# in DNHS we have Remitted samples as well
# So when we use ptsdpm, we need to remove those remitted ones


In [None]:
dnhs_pheno.iloc[:5, :5]

In [None]:
gtp.iloc[:5, :5]

#### Now combine all data

In [None]:
# Make a list of dfs 
all_meth_dfs = [dnhs, gtp, mrs_post, army_v2_meth, prismo_v2_meth]

# rename the first column
all_meth_dfs = [x.rename(columns = {x.columns[0]: 'CpGs'}) 
 for x in all_meth_dfs]


In [None]:
[x.iloc[:5, :5] for x in all_meth_dfs]

In [None]:
# Combine all methylation data 
from functools import reduce
dfs_merged = reduce(lambda left, right: pd.merge(left, right,
                                                 on = "CpGs",
                                                how='inner'), all_meth_dfs)

In [None]:
dfs_merged.shape

In [None]:
len(dfs_merged['CpGs'].unique())

In [None]:
dfs_merged.iloc[:5, :5]

#### We also need to combine the phenotypes, but before we do that we need to get the common variables

In [None]:
dnhs_pheno.columns

In [None]:
gtp_pheno_comb.columns

In [None]:
mrs_pheno_post.columns

In [None]:
army_v2_pheno.columns

In [None]:
prismo_v2_pheno.columns

In [None]:
# rename the columns
# DNHS
dnhs_pheno = dnhs_pheno.rename(columns={'X':'BaseName', 'race6cat':'Race',
                          'childhood_cum_trauma': 'Childhood_MT',
                           'life_worst_intrusion': 'Intrusion',
                           'life_worst_avoidance': 'Avoidance',
                           'life_worst_hyperarousal': 'Hyperarousal',
                           'phq9sum': 'MDD',
                           'gad7sum': 'GAD',
                           'Life_PTS_severity': 'PTS_severity'
                          })


In [None]:
# GTP
gtp_pheno_comb = gtp_pheno_comb.rename(columns={'Unnamed: 0':'BaseName',
                               'mergedcapsandpsswinthin30days':'PTSDpm', 
                               'Life_PTSD_01': 'PTSDLife',
                               'age_x': 'Age',
                               'tei_total_types_experienced_somewitness':'TraumaNum',
                              'caps_life_freqplusintens_combined': 'PTS_severity',
                               'PSS_Intrusive': 'Intrusion',
                               'PSS_avoidnumb': 'Avoidance',
                               'PSS_hyperarousal': 'Hyperarousal',
                               'BDItotalscore': 'MDD',
                               'CTQTOT': 'Childhood_MT',
                               'pc1': 'Comp.1',
                               'pc2': 'Comp.2',
                               'pc3': 'Comp.3',
                               
                              })

In [None]:
# MRS
mrs_pheno_post = mrs_pheno_post.rename(columns={'ba_race': 'race',
                               'CAPSF1I2s': 'PTSDpm',
                               'Lifetime.PTSD' : 'PTSDLife',
                               'LECCUM_Stringent': 'TraumaNum',
                               'CAPStots': 'PTS_severity',
                               'CAPSBs': 'Intrusion',
                               'CAPSCs': 'Avoidance',
                               'CAPSDs': 'Hyperarousal',
                               'BDI2_SUM': 'MDD',
                               'BAI_mod_sev': 'GAD',
                               'CTQ_TOTAL' : 'Childhood_MT',
                               'Sex': 'Gender'
                              })

In [None]:
# rename the columns
# ArmyStarrs
army_v2_pheno = army_v2_pheno.rename(columns={'race':'Race',
                        'CURRENT_PTSD': "PTSDpm",
                        'LIFETIME_PTSD': 'PTSDLife',
                        'MaltreatmentGlobal': 'Childhood_MT',
                        'trauma_exposed_critA': 'TraumaNum',
                        'pcl17_b_5q': 'Intrusion',
                        'pcl17_c_7q': 'Avoidance',
                        'pcl17_d_5q': 'Hyperarousal',
                        'PCL17_t23': 'PTS_severity',
                        'CD8T.EPICnoob': 'CD8T',
                        'CD4T.EPICnoob': 'CD4T',
                        'NK.EPICnoob': 'NK',
                        'Bcell.EPICnoob': 'Bcell',
                        'Mono.EPICnoob': 'Mono',
                        'Neu.EPICnoob': 'Neu'
                          })

In [None]:
# Prismo
prismo_v2_pheno = prismo_v2_pheno.rename(columns={'ancestry':'Race',
                          'CURRENT_PTSD': "PTSDpm",
                          'LIFETIME_PTSD': 'PTSDLife',
                          'ETItot': 'Childhood_MT',
                          'Pes_number': 'TraumaNum',
                          'REEXPERIENCE': 'Intrusion',
                          'AVOID': 'Avoidance',
                          'HYPERAROUSAL': 'Hyperarousal',
                          'TOTAL_SCORE': 'PTS_severity',
                          'CD8T.Epic': 'CD8T',
                          'CD4T.Epic': 'CD4T',
                          'NK.Epic': 'NK',
                          'Bcell.Epic': 'Bcell',
                          'Mono.Epic': 'Mono',
                          'Neu.Epic': 'Neu',
                          'gender': 'Gender'
                          })

In [None]:
dnhs_pheno.columns

In [None]:
gtp_pheno_comb.columns

In [None]:
# For mrs, both Pcs from gwas and methylation data are available
# Comp.1, Comp.2, Comp.3 are methylation
mrs_pheno_post.columns

In [None]:
army_v2_pheno.columns

In [None]:
prismo_v2_pheno.columns

In [None]:
# We have two age columns, lets drop one
mrs_pheno_post.drop(columns=['Age'], inplace=True)

In [None]:
# Drop the upper-case 'AGE' column from both cohorts in place; a plain loop
# is used because only the side effect matters, not a list of results
for pheno in (army_v2_pheno, prismo_v2_pheno):
    pheno.drop(columns=['AGE'], inplace=True)

In [None]:
# Regular-expression fragments for the phenotype columns to keep; get_cols
# joins them with '|' and matches them against the column names.
# NOTE(review): 'Comp.1' is not listed although GTP's 'pc1' is renamed to
# 'Comp.1' — confirm that the first component is excluded on purpose.
need_cols = ['BaseName', 'Gender','race$', '^Age$', 
             'PTSDpm', 'PTSDLife', 'TraumaNum', 
             'CD8T$', 'CD4T$', 'NK$', 'Bcell$', 'Mono$',
             'Neu$','PTS_severity', 'Childhood_MT',
             'Intrusion', 'Avoidance', 'Hyperarousal', '^MDD$',
            'Comp.2', 'Comp.3', 'Study$', 'SmoS']

In [None]:
import re
def get_cols(df, cols, case=None, title=None, sort=None):
    """
    Select the columns whose names match any of the given patterns.

    Parameters:
    df: data frame
    cols: regular-expression patterns of the columns that need to be
          fetched; they are joined with '|' into a single pattern
    case: pass True to ignore case while matching, None by default
    title: pass True to title-case the selected column names, None by default
    sort: pass True to sort the selected columns by name, None by default

    Output: The dataframe with selected columns
    """
    flags = re.IGNORECASE if case is True else 0
    pattern = re.compile('|'.join(cols), flags)
    selected = df.filter(regex=pattern)

    if title is True:
        selected.columns = [name.title() for name in selected.columns]

    return selected.sort_index(axis=1) if sort is True else selected


# get the frequency of elements
def get_frequency(df, col):
    """
    Count how often each value occurs in a column.

    Parameters:
    df: data frame
    col: name of the column to tabulate

    Output: the result of value_counts() on that column
    """
    column = df[col]
    return column.value_counts()


In [None]:
# now get the required columns from all dfs

all_phenos = [dnhs_pheno, gtp_pheno_comb, mrs_pheno_post,
             army_v2_pheno, prismo_v2_pheno]
phenos_sub = [get_cols(df = x, cols=need_cols, case=True,
                      title=True, sort=True) for x in all_phenos]

cohorts = ["dnhs", "gtp", "mrs", "armystarrs", "prismo"]

phenos_sub = dict(zip(cohorts, phenos_sub)) # make a dictionary

In [None]:
phenos_sub.keys()

In [None]:
[x.columns for x in phenos_sub.values()]

In [None]:
# get columns of each df
dnhs_cols, gtp_cols, mrs_cols, army_cols, prismo_cols = [x.columns for x in phenos_sub.values()]

In [None]:
dnhs_cols

In [None]:
gtp_cols

In [None]:
mrs_cols

In [None]:
army_cols

In [None]:
prismo_cols

In [None]:
# check if column names are matching
(dnhs_cols == gtp_cols).all()

In [None]:
import warnings
def matching(l1, l2):
    """
    Function to compare two lists and check the order

    Prints the size of both inputs, the number of elements of l1 that also
    occur in l2, and whether the elements agree position by position. When
    the lengths differ, the elements found only in l1 are printed and removed
    from l1 before the order check.

    Parameters:
    l1: list 1 (a list or a pandas Index)
    l2: list 2 (a list or a pandas Index)
    """
    # Work on plain lists: comparing them with == yields a single bool for
    # lists and Index objects alike, and never fails on unequal lengths
    first, second = list(l1), list(l2)
    in_second = set(second)

    print("Total elements in l1 :", len(first))
    print("Total elements in l2 :", len(second))
    m = sum(1 for item in first if item in in_second)
    print("Elements matching between l1 and l2 :", m)
    if len(first) == len(second):
        print("All in order :", first == second)
    else:
        only_first = set(first) - in_second
        elm = list(only_first)
        print(elm)
        first = [x for x in first if x not in only_first]
        print(first)
        print("All common elements in order :", first == second)
        

In [None]:
matching(l1 = dnhs_cols, l2 = gtp_cols)

In [None]:
matching(l1 = dnhs_cols, l2 = mrs_cols)

In [None]:
matching(l1 = dnhs_cols, l2 = army_cols)

In [None]:
matching(l1 = dnhs_cols, l2 = prismo_cols)

In [None]:
# common in all
list(set(dnhs_cols) & set(mrs_cols) & set(army_cols) & set(prismo_cols))

In [None]:
[x.iloc[:5, :5] for x in phenos_sub.values()]

In [None]:
phenos_sub['gtp']

In [None]:
# convert to int
gtp_p_sub = phenos_sub['gtp'].astype({"Traumanum":'int',
                             })
get_frequency(df=gtp_p_sub, col='Ptsdlife')

In [None]:
# nas in ptsd life
gtp_p_sub['Ptsdlife'].isna().sum()

In [None]:
phenos_sub['dnhs']

In [None]:
# get number of males and females
[get_frequency(df = x, col='Gender') for x in phenos_sub.values()]

In [None]:
def replace_elements(df, col, new_elements, verbose = None):
    
    """
    Function to replace the elements in a column, e.g female:2, male:1 

    Parameters:
    df: data frame in which you want to replace (it is not modified; a deep
        copy is changed and returned)
    col: name of the column in which you want to replace the elements
    new_elements: either a list of new values or a dict {old value: new value}.
        A list is paired by position with the categories of the column in
        value_counts() order, i.e. most frequent category first, so the
        resulting coding depends on the category frequencies of this
        particular data frame. Pass a dict to state the mapping explicitly.
    verbose: Print some information, default None 

    Output: copy of df with the values of col replaced

    Raises:
    ValueError: if new_elements is a list whose length differs from the
        number of categories in the column
    """
    df = df.copy(deep = True)
    x = df[col].value_counts().index

    if isinstance(new_elements, dict):
        # explicit mapping: independent of how frequent each category is
        d = dict(new_elements)
    else:
        if(len(x) != len(new_elements)):
            raise ValueError("Elements to replace must have the same length as new elements")
        d = dict(zip(x, new_elements)) # make dictionary
    
    if verbose is True:
        print("Categories :\n", x)
        print("Replacing :\n", d)
    
    df[col] = df[col].replace(d)
    
    return(df)

In [None]:
# replace gender in DNHS
# In original study, M = 2, F = 1
# But here in ML we will replace it to make it uniform with other studies
dnhs_final = replace_elements(df = phenos_sub['dnhs'], col='Gender', 
                       new_elements=[2,1], verbose=True)

In [None]:
# before 
phenos_sub['dnhs']['Gender'].value_counts()

In [None]:
# After replacing
dnhs_final["Gender"].value_counts()

In [None]:
# replace gender in GTP
gtp_final = replace_elements(df = phenos_sub['gtp'], col='Gender', 
                           new_elements=[2,1], verbose=True)

In [None]:
# Before 
phenos_sub['gtp']['Gender'].value_counts()

In [None]:
gtp_final['Gender'].value_counts()

In [None]:
# replace race in GTP
gtp_final = replace_elements(df = gtp_final, col = "Race",
                              new_elements=[2,1], verbose=True)

In [None]:
gtp_final["Race"].value_counts()

In [None]:
gtp_final

In [None]:
# MRS
phenos_sub['mrs']['Gender'].value_counts()

In [None]:
phenos_sub['armystarrs']['Gender'].value_counts()

In [None]:
phenos_sub['prismo']['Gender'].value_counts()

In [None]:
# `final` is built by pd.concat in the next cell and displayed after it;
# evaluating it here, before that cell has run, raises NameError.

In [None]:
# combine phenotype data
final = pd.concat([dnhs_final, gtp_final, phenos_sub['mrs'],
                  phenos_sub['armystarrs'], phenos_sub['prismo']],
                 sort = False)

In [None]:
final.shape

In [None]:
final

In [None]:
# Check categories in final
print("Gender:\n", final['Gender'].value_counts())
print("PTSDpm:\n", final['Ptsdpm'].value_counts())
print("PTSDlife:\n", final['Ptsdlife'].value_counts())

In [None]:
# now check na in the combined data
final.isnull().sum()

In [None]:
# Now check if we have all the samples in pheno and methylation files
dfs_merged.columns.str.contains('|'.join(final['Basename'].tolist())).sum()

In [None]:
def check_all_match(first, second):
    """
    Function to check if all the samples in methylation and phenotye match

    Parameters: 
    first: Elements to search (pandas Series, e.g. the sample ids of the
           phenotype table)
    second: Elements to search in (e.g. the columns of the methylation table)

    Prints the number of matching samples when every element of first is
    found in second.

    Raises:
    ValueError: if at least one element of first is not present in second
    """
    # Exact membership, evaluated once: a '|'-joined regular expression would
    # accept substring hits (e.g. "S10" because it contains "S1") and has to
    # scan a pattern built from every column name
    matched = first.isin(second)
    num_match = matched.sum()
    if not matched.all():
        raise ValueError('All are not matching')
    print("All samples match between pheno and methylation: ", num_match)


In [None]:
check_all_match(first = final['Basename'], 
               second = dfs_merged.columns)

In [None]:
# Now save the data
# Create the directory and assign timestamp folder

import os, datetime

def make_directory(maindir = None, verbose = None):
    """
    Create a time-stamped directory below the current working directory.

    The new directory is named after the current time, formatted as
    YYYY-MM-DD_HH-MM-SS.

    Parameters: 
    maindir : name of main directory (inside the current working directory)
              to hold the newly created directory; None by default, which
              places the new directory directly in the working directory
    verbose : print the path of the created directory when truthy

    Output: path of the created directory

    Raises:
    ValueError: if maindir is True or False
    """
    if isinstance(maindir, bool):
        raise ValueError("dirname can't be True or False")

    parts = [os.getcwd()]
    if maindir is not None:
        parts.append(maindir)
    parts.append(datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))

    mydir = os.path.join(*parts)
    os.makedirs(mydir)

    if verbose:
        print("Directory created:", mydir)

    return mydir
      


In [None]:
# change to directory and make folder
os.chdir("G:/PGC ML/")
mydir = make_directory(maindir="Pre_Processed Data",  verbose=True)

In [None]:
# Save Phenotype file
final.to_csv(os.path.join(mydir, "DNHS_GTP_MRS_ArmyS_Prismo_Pheno.csv"),
            index=False)

In [None]:

def save_data(fname, df, outdir = None):
    """
    Function to save the data; the format is chosen from the extension

    Parameters:
    fname: file name, must end with ".csv" or ".feather"
    df: data frame
    outdir: directory to write into; None by default, in which case the
            notebook-level variable `mydir` is used

    Raises:
    ValueError: if the extension of fname is not supported
    """
    target_dir = mydir if outdir is None else outdir
    path = os.path.join(target_dir, fname)

    if fname.endswith(".csv"):
        # the data frame index is written as the first column
        df.to_csv(path)
    elif fname.endswith(".feather"):
        feather.write_feather(df, path)
    else:
        # report the problem instead of silently writing nothing
        raise ValueError("Unsupported file type: " + fname)
        

In [None]:
# Save individual datasets
pheno_f_names = ["DNHS_UnqRESP_Pheno_final.csv", "GTP_Pheno_final.csv",
          "MRS_POST_DEP_Pheno_final.csv", "ArmyStarrs_visit2_pheno.csv",
          "Prismo_visit2_pheno.csv"]

individual_cohorts = [dnhs_final, gtp_final, phenos_sub['mrs'],
                  phenos_sub['armystarrs'], phenos_sub['prismo']]

In [None]:
# Write every cohort's phenotype table under its matching file name
for pheno_name, cohort_df in zip(pheno_f_names, individual_cohorts):
    save_data(fname=pheno_name, df = cohort_df)
    print(pheno_name)
    

In [None]:
# Save individual methylation data 
# File names for the per-cohort methylation tables, in the same order as
# the data frames in all_meth_dfs
meth_f_names = ["DNHS_methylation_unq.feather", "GTP_methylation.feather",
               "MRS_methylation_post.feather",
                "ArmyStarrs_visit2_methylation.feather",
               "Prismo_visit2_methylation.feather"]
for meth_name, meth_df in zip(meth_f_names, all_meth_dfs):
    save_data(fname=meth_name, df = meth_df)
    print(meth_name)

In [None]:
# Save combined methylation data
feather.write_feather(dfs_merged, os.path.join(mydir, "DNHS_GTP_MRS_ArmyS_Prismo_methylation.feather"))

In [None]:
# Total number of columns that are matching 
# Without rowname column
dfs_merged.columns.isin(final["Basename"]).sum()

In [None]:
# end