### This notebook is part of the PGC ML project, which aims to identify methylation signatures that predict PTSD in pre- and post-trauma samples.

#### This first notebook contains code to pre-process the pre- and post-trauma data from three Army cohorts

In [1]:
# First load the settings file
%run Settings.ipynb

In [2]:
# Function to read the data

def read_data(fname, dirpath = None, sheet_name = 0):
    
    """
    Function to load a tabular data file into a data frame.

    The reader is picked from the file extension (matched case-insensitively):
    ".feather" -> feather.read_feather, ".csv" -> pd.read_csv,
    ".xlsx" -> pd.read_excel.

    Parameters: 
    fname: file name including extension you want to read
    dirpath: path to the directory containing file, None by default
             (fname is then used as the complete path); a trailing
             path separator on dirpath is optional
    sheet_name: Sheet name (or index) for reading excel sheets,
                ignored for the other formats
    output: data frame

    Raises:
    ValueError: if fname does not end with one of the supported extensions
    """
    # Local import so the function does not depend on what the settings
    # notebook happens to import.
    import os

    # os.path.join adds the separator only when dirpath lacks one, so both
    # "dir/" and "dir" work.
    p = fname if dirpath is None else os.path.join(dirpath, fname)

    lowered = fname.lower()
    if lowered.endswith(".feather"):
        return feather.read_feather(p)
    if lowered.endswith(".csv"):
        return pd.read_csv(p)
    if lowered.endswith(".xlsx"):
        return pd.read_excel(p, sheet_name = sheet_name)

    # Previously this fell through to `return(f)` with `f` never assigned.
    raise ValueError(
        "Unsupported file type for '{}': expected .feather, .csv or .xlsx".format(fname)
    )


def get_samples(df, cols):
    """
    Function to subset data to the requested columns.

    Columns are selected by exact name. (The earlier implementation joined
    the names into one regular expression and kept every column whose name
    merely contained a match, so "V1" also selected "V10" and an empty
    list selected every column.)

    Parameters:
    df: data frame
    cols: list-like of column names to keep; names not present in df are
          ignored
    output: data frame with the matching columns, in df's column order
    """
    wanted = df.columns.isin(list(cols))
    return df.loc[:, wanted]

def get_trauma_exposed(df, col, drop_missing = False):
    """
    Function to get only trauma exposed samples, i.e. the rows whose value
    in `col` is not 0.

    Note that a missing value (NaN) also compares as "not 0", so by default
    rows with missing exposure information are kept. Pass
    drop_missing=True to exclude them as well.

    Parameters: 
    df: data frame
    col: column name to filter the data frame
    drop_missing: if True, also drop rows where `col` is missing;
                  False by default (keeps the original behaviour)
    output: a new data frame (copy) holding the retained rows
    """
    keep = df[col] != 0
    if drop_missing:
        keep = keep & df[col].notna()
    # Return an independent copy so callers can assign new columns to the
    # result without touching df or triggering SettingWithCopyWarning.
    return df[keep].copy()


def remove_duplicates(df, col):
    """
    Function to drop rows that repeat a value already seen in `col`
    (pandas keeps the first occurrence by default).
    Parameters: 
    df: data frame
    col: column name (or list of names) that contain duplicate ids
    output: data frame without the duplicated rows
    """
    deduplicated = df.drop_duplicates(subset = col)
    return deduplicated
    

#### MRS 

In [132]:
# load MRS
# Directory and file name are passed separately, the same way the ArmySTARRS
# and PRISMO cohorts are loaded further down; the resulting paths are unchanged.
# NOTE(review): this is an absolute, machine-specific path — consider moving
# it to the settings notebook.
mrs_path = "G:/PGC ML/MRS/"
mrs = read_data(fname="MRS_noob_qcd_crossReactiveProbesRemoved_combat_CP_wcovar_age_ptsd_allPreAsControls.feather",
                dirpath=mrs_path)
mrs_pheno = read_data(fname="MRS_Pheno_With_smoking_scores.csv", dirpath=mrs_path)

In [133]:
# sort the rows 
mrs_pheno = mrs_pheno.sort_values(by =["studyid", "visit"],
                                ascending=True, axis=0)
mrs_pheno.shape

(254, 64)

In [134]:
mrs_pheno = get_trauma_exposed(df = mrs_pheno,
                              col = 'LECCUM_Stringent')
mrs_pheno.shape

(254, 64)

In [135]:
mrs_pheno['LECCUM_Stringent'].min()

1

In [136]:
# Check how many have two time points
mrs_pheno.groupby(["studyid"]).size().value_counts()

2    127
dtype: int64

In [137]:
# Childhood trauma is recorded on the first visit, so copy that
# information to the second visit of the same participant.
# The forward fill runs within each studyid and follows row order, so it
# relies on mrs_pheno having been sorted by studyid and visit above.
# NOTE(review): the preview printed after this cell shows CTQ_TOTAL empty at
# both visits for the listed studyids — confirm the baseline value is populated.
mrs_pheno["CTQ_TOTAL"] = mrs_pheno.groupby(["studyid"])["CTQ_TOTAL"].ffill()

In [138]:
mrs_pheno[["studyid", "visit", "CTQ_TOTAL"]]

Unnamed: 0,studyid,visit,CTQ_TOTAL
77,1058,0.0,
76,1058,3.0,
63,1148,0.0,
62,1148,2.0,
245,1340,0.0,
244,1340,2.0,
107,2005,0.0,
106,2005,2.0,
64,2035,0.0,
65,2035,2.0,


In [139]:
# check shape
print("MRS beta shape :", mrs.shape)
print("MRS Pheno shape :", mrs_pheno.shape)
print("Unique study ids:", len(mrs_pheno['studyid'].unique()))

MRS beta shape : (821112, 255)
MRS Pheno shape : (254, 64)
Unique study ids: 127


In [140]:
# MRS contains both pre and post samples; this helper pulls one of the two
# sets by matching a label (e.g. 'PRE' or 'POST') in an ID column.
def pull_pre_post(df, col, str_p):
    """
    Function to keep the rows whose value in `col` contains `str_p`.
    Parameters:
    df: data frame
    col: name of a string column to search
    str_p: pattern to look for (interpreted as a regular expression by
           Series.str.contains)
    output: data frame with the matching rows
    """
    # na=False treats missing values as "no match"; without it a NaN in
    # `col` produces a non-boolean mask and .loc raises.
    mask = df[col].str.contains(str_p, na = False)
    return df.loc[mask]

In [141]:
# mrs_pheno_pre, mrs_pheno_post = [pull_pre_post(df = mrs_pheno, 
#                                                col = "ID", 
#                                                str_p = s) 
#                                  for s in ['PRE', 'POST']]

In [142]:
[x['Group'].value_counts() for x in [mrs_pheno]]
# print(mrs_pheno_post['Group'].value_counts())

[POST    127
 PRE     127
 Name: Group, dtype: int64]

In [143]:
# check if we have same ids in pre and post samples
# mrs_pheno_pre['studyid'].isin(mrs_pheno_post['studyid']).sum()

In [144]:
# Now get methylation samples that are in pheno for pre and post 
def col_list(df, id_col = "V1", sample_col = "BaseName"):
    """
    Function to build the list of methylation columns to keep: the probe-ID
    column followed by the sample names listed in the phenotype data.
    Parameters:
    df: phenotype data frame
    id_col: name of the probe-ID column in the methylation data,
            "V1" by default
    sample_col: phenotype column holding the sample names,
                "BaseName" by default
    output: list of column names, id_col first
    """
    return [id_col] + df[sample_col].tolist()
     
mrs_cols = col_list(df = mrs_pheno)        
mrs_meth = get_samples(df = mrs, cols=mrs_cols)

In [145]:
mrs_meth

Unnamed: 0,V1,201533580029_R01C01,201533580029_R02C01,201533580029_R03C01,201533580029_R04C01,201533580044_R01C01,201533580044_R02C01,201533580044_R05C01,201533580044_R06C01,201533580044_R07C01,201533580044_R08C01,201533590001_R01C01,201533590001_R02C01,201533590001_R03C01,201533590001_R04C01,201533590001_R05C01,201533590001_R06C01,201533590001_R07C01,201533590001_R08C01,201533590015_R01C01,201533590015_R02C01,201533590015_R03C01,201533590015_R04C01,201533590015_R05C01,201533590015_R06C01,201533590015_R07C01,201533590015_R08C01,201533590017_R01C01,201533590017_R02C01,201533590017_R03C01,201533590017_R04C01,201533590017_R05C01,201533590017_R06C01,201533590017_R07C01,201533590017_R08C01,201533590021_R01C01,201533590021_R02C01,201533590021_R03C01,201533590021_R04C01,201533590021_R05C01,201533590021_R06C01,201533590022_R01C01,201533590022_R02C01,201533590022_R03C01,201533590022_R04C01,201533590027_R01C01,201533590027_R02C01,201533590027_R03C01,201533590027_R04C01,201533590040_R01C01,201533590040_R02C01,201533590040_R03C01,201533590040_R04C01,201533590040_R05C01,201533590040_R06C01,201533590040_R07C01,201533590040_R08C01,201533590047_R01C01,201533590047_R02C01,201533590047_R03C01,201533590047_R04C01,201533590047_R07C01,201533590047_R08C01,201533590058_R01C01,201533590058_R02C01,201533590058_R03C01,201533590058_R04C01,201533590058_R05C01,201533590058_R06C01,201533590058_R07C01,201533590058_R08C01,201533590060_R01C01,201533590060_R02C01,201533590060_R05C01,201533590060_R06C01,201533590060_R07C01,201533590060_R08C01,201858500010_R03C01,201858500010_R04C01,201858500010_R05C01,201858500010_R06C01,201858500010_R07C01,201858500010_R08C01,201858500011_R01C01,201858500011_R02C01,201858500011_R03C01,201858500011_R04C01,201858500011_R05C01,201858500011_R06C01,201858500045_R01C01,201858500045_R02C01,201858500045_R03C01,201858500045_R04C01,201858500045_R05C01,201858500045_R06C01,201858500045_R07C01,201858500045_R08C01,201858500072_R01C01,201858500072_R02C01,201858500072_R03C01,...,20
1858500191_R03C01,201858500191_R04C01,201858500191_R05C01,201858500191_R06C01,201858500191_R07C01,201858500191_R08C01,201858500197_R03C01,201858500197_R04C01,201858500197_R05C01,201858500197_R06C01,201858500197_R07C01,201858500197_R08C01,201858500218_R01C01,201858500218_R02C01,201858500218_R03C01,201858500218_R04C01,201858500218_R05C01,201858500218_R06C01,201858500218_R07C01,201858500218_R08C01,201858500222_R01C01,201858500222_R02C01,201858500222_R05C01,201858500222_R06C01,201858500222_R07C01,201858500222_R08C01,201858500223_R01C01,201858500223_R02C01,201858500223_R03C01,201858500223_R04C01,201858500225_R03C01,201858500225_R04C01,201858500225_R05C01,201858500225_R06C01,201858500225_R07C01,201858500225_R08C01,201858500232_R01C01,201858500232_R02C01,201858500232_R03C01,201858500232_R04C01,201858500232_R05C01,201858500232_R06C01,201858500232_R07C01,201858500232_R08C01,201858500237_R01C01,201858500237_R02C01,201858500237_R03C01,201858500237_R04C01,201858500237_R05C01,201858500237_R06C01,201858500237_R07C01,201858500237_R08C01,201858500244_R01C01,201858500244_R02C01,201858500244_R05C01,201858500244_R06C01,201858500244_R07C01,201858500244_R08C01,201858500249_R05C01,201858500249_R06C01,201858500249_R07C01,201858500249_R08C01,201858500252_R01C01,201858500252_R02C01,201858500252_R03C01,201858500252_R04C01,201858500252_R05C01,201858500252_R06C01,201858500252_R07C01,201858500252_R08C01,201858500265_R01C01,201858500265_R02C01,201858500265_R03C01,201858500265_R04C01,201858500265_R05C01,201858500265_R06C01,201858500265_R07C01,201858500265_R08C01,201858500266_R01C01,201858500266_R02C01,201858500266_R03C01,201858500266_R04C01,201858500266_R05C01,201858500266_R06C01,201858500266_R07C01,201858500266_R08C01,202410280158_R01C01,202410280158_R02C01,202410280158_R03C01,202410280158_R04C01,202410280158_R07C01,202410280158_R08C01,202410280159_R01C01,202410280159_R02C01,202410280159_R03C01,202410280159_R04C01,202410280159_R05C01,202410280159_R06C01,202410280159_R07C01,202410280159_R08C01
0,cg18478105,0.01471,0.01638,0.01675,0.01356,0.01436,0.01909,0.01499,0.01590,0.01561,0.01393,0.01705,0.01493,0.01478,0.01423,0.01680,0.01498,0.01201,0.01479,0.01436,0.01485,0.01430,0.01902,0.01306,0.01342,0.01481,0.01751,0.01633,0.01280,0.01368,0.01495,0.01466,0.01375,0.01436,0.01882,0.01661,0.01397,0.01520,0.01474,0.01423,0.01569,0.01496,0.01669,0.01471,0.01440,0.01278,0.01555,0.01637,0.01452,0.01447,0.01332,0.01263,0.01739,0.01727,0.01579,0.01693,0.01452,0.01389,0.01577,0.01348,0.01390,0.01694,0.01368,0.01880,0.01358,0.01424,0.01450,0.01423,0.01521,0.01413,0.01368,0.01636,0.01381,0.01492,0.01756,0.01300,0.01493,0.01519,0.01507,0.01587,0.01366,0.01486,0.01375,0.01378,0.01729,0.01600,0.01374,0.01540,0.01358,0.01350,0.01592,0.01714,0.01317,0.01617,0.01613,0.01314,0.01455,0.01666,0.01449,0.01817,...,0.01342,0.01729,0.01534,0.01481,0.01471,0.01329,0.01432,0.01685,0.01353,0.01609,0.01398,0.01350,0.01390,0.01556,0.01601,0.01598,0.01473,0.01379,0.01530,0.01473,0.01653,0.01697,0.01771,0.01322,0.01602,0.01663,0.01336,0.01605,0.01709,0.01667,0.01512,0.01698,0.01216,0.01462,0.01565,0.01513,0.01472,0.01558,0.01516,0.01655,0.01501,0.01446,0.01619,0.01474,0.01641,0.01708,0.01586,0.01689,0.01548,0.01726,0.01244,0.01559,0.01828,0.01533,0.01540,0.01440,0.01307,0.01478,0.01309,0.01489,0.01532,0.01915,0.01633,0.01645,0.01602,0.01585,0.01641,0.01562,0.01414,0.01532,0.01574,0.01378,0.01589,0.01490,0.01575,0.01525,0.01956,0.01497,0.01732,0.01437,0.01427,0.01594,0.01528,0.01412,0.01640,0.01620,0.01622,0.01645,0.01751,0.01571,0.01304,0.01414,0.01742,0.01291,0.01740,0.01537,0.01569,0.01570,0.01824,0.01507
1,cg09835024,0.02721,0.03539,0.02973,0.03160,0.03108,0.03306,0.03514,0.03134,0.03170,0.02792,0.03177,0.03188,0.02950,0.03258,0.02908,0.02910,0.02854,0.02913,0.02949,0.03080,0.03257,0.03315,0.02609,0.02711,0.03200,0.02960,0.02965,0.03057,0.03336,0.03355,0.03347,0.02804,0.02839,0.02978,0.02927,0.02962,0.03263,0.03231,0.03139,0.03055,0.03143,0.03250,0.03216,0.02728,0.03353,0.03068,0.02896,0.02981,0.03379,0.02753,0.02758,0.03081,0.03108,0.02821,0.03236,0.03292,0.02825,0.02876,0.02983,0.03239,0.03695,0.02949,0.02883,0.03551,0.02973,0.02950,0.03006,0.02692,0.02950,0.03146,0.03048,0.02855,0.03039,0.03240,0.03134,0.02637,0.02966,0.02832,0.02822,0.03467,0.03242,0.02990,0.02998,0.03143,0.02941,0.03200,0.03225,0.02696,0.03384,0.02906,0.03401,0.02978,0.02820,0.03169,0.03273,0.02924,0.02946,0.03201,0.03441,...,0.02962,0.03244,0.03149,0.02830,0.03074,0.02781,0.03200,0.03111,0.03152,0.02773,0.02824,0.03064,0.03193,0.03076,0.03154,0.03002,0.02758,0.03000,0.02739,0.03293,0.03247,0.03514,0.03020,0.03089,0.03020,0.02992,0.03016,0.02776,0.03103,0.03255,0.03075,0.03397,0.02829,0.02899,0.03002,0.03195,0.03469,0.03161,0.02949,0.03231,0.02913,0.03068,0.02833,0.03051,0.03299,0.03352,0.03367,0.02997,0.03082,0.03088,0.02725,0.03365,0.03285,0.03164,0.03256,0.03157,0.02827,0.03067,0.03399,0.03215,0.02898,0.03029,0.03401,0.03085,0.03115,0.03271,0.03001,0.03143,0.02951,0.02935,0.03570,0.03203,0.03020,0.02980,0.03098,0.02911,0.03128,0.03219,0.03147,0.03124,0.03570,0.02839,0.03284,0.02979,0.02944,0.03034,0.02949,0.03545,0.02953,0.03453,0.03024,0.02997,0.03144,0.03345,0.03168,0.03403,0.02850,0.02892,0.03144,0.03110
2,cg14361672,0.87702,0.90871,0.88785,0.86194,0.84975,0.90888,0.88636,0.89792,0.86142,0.87560,0.86569,0.89228,0.87207,0.89615,0.88493,0.86218,0.90597,0.89499,0.90078,0.87544,0.88053,0.89093,0.87180,0.88188,0.92086,0.87118,0.84717,0.87092,0.86081,0.90071,0.89426,0.87684,0.88230,0.87817,0.89828,0.87357,0.84645,0.88135,0.90285,0.89945,0.88409,0.89241,0.91371,0.85385,0.88557,0.91452,0.92147,0.87911,0.88214,0.87300,0.91670,0.84339,0.86085,0.88471,0.88406,0.89707,0.87765,0.88752,0.89701,0.86902,0.90443,0.90948,0.88257,0.88408,0.87175,0.88454,0.90603,0.87832,0.87220,0.89248,0.87671,0.83802,0.86184,0.90911,0.89805,0.88912,0.89509,0.89174,0.91689,0.90368,0.85554,0.87224,0.89818,0.90299,0.90051,0.86560,0.85992,0.88872,0.90248,0.88167,0.88465,0.87411,0.85610,0.87007,0.85879,0.88282,0.90285,0.90688,0.87577,...,0.83464,0.86832,0.87734,0.87844,0.88099,0.91357,0.86330,0.89401,0.86814,0.90421,0.89134,0.90388,0.88804,0.90817,0.88683,0.89830,0.86486,0.87647,0.86598,0.86205,0.86207,0.87138,0.87584,0.90808,0.88379,0.87390,0.88450,0.88356,0.85307,0.89633,0.89459,0.87107,0.90590,0.87300,0.84171,0.87671,0.85926,0.89058,0.86014,0.87624,0.90616,0.91049,0.89239,0.89935,0.87917,0.87961,0.86432,0.88553,0.92369,0.90835,0.89636,0.89048,0.91679,0.87910,0.85684,0.86536,0.87957,0.88475,0.90533,0.87319,0.85625,0.90120,0.88827,0.88293,0.89181,0.89306,0.90490,0.89413,0.88816,0.84319,0.90086,0.90603,0.89052,0.90190,0.86268,0.86170,0.89212,0.89367,0.88658,0.90992,0.88137,0.88661,0.86724,0.86324,0.90367,0.89351,0.87447,0.85577,0.89964,0.92038,0.89160,0.88590,0.90902,0.87879,0.88267,0.91711,0.88930,0.87473,0.87217,0.87888
3,cg01763666,0.80343,0.82561,0.82502,0.88384,0.86722,0.83013,0.85692,0.85494,0.86198,0.81639,0.87113,0.79930,0.87216,0.81352,0.81690,0.80537,0.84311,0.84012,0.81645,0.86882,0.83280,0.79054,0.79777,0.83222,0.84489,0.86632,0.82716,0.77534,0.82082,0.85708,0.87923,0.82719,0.86829,0.82453,0.82239,0.86140,0.87964,0.84336,0.81881,0.83430,0.87365,0.77762,0.82772,0.84053,0.82587,0.79873,0.86936,0.84912,0.88108,0.84611,0.80389,0.79439,0.83539,0.83688,0.80018,0.81702,0.85983,0.84923,0.84792,0.86252,0.80650,0.78503,0.82540,0.82496,0.86761,0.86051,0.80410,0.78266,0.87729,0.83308,0.85801,0.82038,0.85434,0.76649,0.84811,0.85172,0.85335,0.79822,0.82031,0.82720,0.83021,0.83881,0.80375,0.78212,0.86708,0.85027,0.82101,0.85603,0.81574,0.83552,0.86775,0.84239,0.88485,0.83625,0.83620,0.80698,0.83159,0.85054,0.78172,...,0.85774,0.80755,0.83477,0.79812,0.84436,0.86659,0.81551,0.84438,0.87553,0.83760,0.79367,0.85737,0.82295,0.84008,0.75211,0.80810,0.84237,0.87343,0.82670,0.86439,0.88462,0.88191,0.80973,0.79050,0.84119,0.84752,0.85900,0.86241,0.86537,0.78514,0.81356,0.83639,0.78274,0.80798,0.88447,0.83068,0.85289,0.80426,0.83054,0.81370,0.83143,0.81179,0.87189,0.87534,0.82608,0.84298,0.86244,0.85388,0.83312,0.82267,0.82455,0.76982,0.82506,0.86848,0.85928,0.87117,0.78654,0.80289,0.76794,0.80558,0.85243,0.85595,0.84435,0.81772,0.85106,0.86633,0.83969,0.84906,0.82816,0.81537,0.81990,0.82368,0.77486,0.82829,0.86603,0.86146,0.83722,0.84799,0.82735,0.78458,0.85018,0.85593,0.82709,0.86328,0.81543,0.84624,0.85258,0.83200,0.84527,0.85328,0.80868,0.80539,0.81140,0.86099,0.80647,0.78881,0.87379,0.86045,0.81452,0.79974
4,cg12950382,0.90471,0.91791,0.88500,0.92514,0.88006,0.88979,0.90464,0.91163,0.92379,0.92847,0.89746,0.91248,0.90917,0.93490,0.89505,0.85097,0.90890,0.89547,0.90925,0.92808,0.91155,0.86386,0.85988,0.89643,0.90143,0.92745,0.93958,0.92617,0.89104,0.88820,0.87271,0.86862,0.90739,0.90934,0.89969,0.86253,0.89932,0.88920,0.92489,0.92743,0.92408,0.91540,0.88494,0.88289,0.88017,0.91161,0.88105,0.91145,0.89459,0.91978,0.89014,0.88487,0.92396,0.90668,0.91321,0.93653,0.90740,0.87996,0.91614,0.92425,0.92163,0.91611,0.91457,0.92762,0.94313,0.86825,0.91435,0.90721,0.87760,0.90194,0.88403,0.90980,0.91601,0.89617,0.91628,0.89848,0.92581,0.90614,0.89262,0.93147,0.89677,0.87718,0.87412,0.88558,0.87640,0.90999,0.92804,0.91652,0.88732,0.84801,0.89234,0.90671,0.89561,0.90975,0.92496,0.90577,0.88446,0.89067,0.90201,...,0.86547,0.91272,0.88590,0.90748,0.93363,0.90436,0.91161,0.92233,0.91408,0.90139,0.90804,0.86092,0.92262,0.90783,0.91394,0.92985,0.92202,0.92868,0.88240,0.87444,0.89557,0.92032,0.90352,0.90488,0.89066,0.89172,0.91483,0.91641,0.90495,0.88297,0.91290,0.94158,0.89373,0.89650,0.89524,0.88407,0.87654,0.86913,0.88225,0.88898,0.92218,0.93165,0.90628,0.92300,0.89837,0.91105,0.88337,0.89306,0.88910,0.87294,0.93021,0.93502,0.89757,0.92401,0.89774,0.87523,0.92284,0.92182,0.89834,0.92380,0.90278,0.90049,0.90975,0.89468,0.91692,0.90374,0.91026,0.91421,0.91110,0.86892,0.92138,0.91070,0.89488,0.87142,0.87028,0.90754,0.91901,0.90337,0.85447,0.89643,0.92373,0.90206,0.91675,0.89988,0.90408,0.89122,0.91701,0.87515,0.93168,0.90491,0.90056,0.88852,0.89637,0.85058,0.91918,0.91970,0.90764,0.88461,0.89662,0.91957
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
821107,cg23079522,0.93280,0.89319,0.90821,0.91389,0.89621,0.91072,0.89489,0.91634,0.90769,0.91850,0.91936,0.92185,0.91517,0.91931,0.93204,0.82389,0.92232,0.91563,0.91599,0.91755,0.92713,0.90831,0.90998,0.89263,0.88037,0.92193,0.91141,0.90917,0.89113,0.92247,0.90024,0.92255,0.92199,0.92530,0.91985,0.88025,0.90866,0.91808,0.91771,0.91532,0.92582,0.90363,0.92748,0.88520,0.90754,0.87841,0.92888,0.91979,0.92930,0.91750,0.91273,0.90810,0.88709,0.92394,0.91928,0.90051,0.92109,0.83331,0.91442,0.90691,0.91712,0.92074,0.93121,0.89216,0.86720,0.91600,0.92683,0.92677,0.89996,0.91271,0.89322,0.92558,0.90972,0.91277,0.92148,0.92429,0.91090,0.89939,0.83875,0.92543,0.91991,0.91891,0.90471,0.92471,0.88653,0.93729,0.92646,0.89751,0.91812,0.88909,0.90463,0.94023,0.91287,0.89432,0.93047,0.93467,0.90995,0.92088,0.89317,...,0.90664,0.93138,0.92076,0.92489,0.93428,0.92094,0.93108,0.92012,0.91713,0.91318,0.92147,0.87067,0.86202,0.92846,0.91438,0.91960,0.91439,0.92843,0.93336,0.90976,0.89935,0.92651,0.92656,0.92133,0.92704,0.88618,0.92568,0.91957,0.91401,0.90976,0.89753,0.88738,0.92228,0.93219,0.91416,0.92104,0.86696,0.91093,0.92550,0.93342,0.92000,0.89963,0.92931,0.90200,0.89345,0.93023,0.93382,0.89419,0.90793,0.91712,0.90915,0.86779,0.92331,0.94381,0.85059,0.91025,0.91154,0.89689,0.91263,0.90068,0.89382,0.92119,0.92309,0.91376,0.92768,0.90841,0.91154,0.91761,0.90684,0.82131,0.84454,0.90139,0.92090,0.90280,0.91460,0.93160,0.91908,0.91880,0.85273,0.90375,0.90210,0.91153,0.92941,0.88450,0.92595,0.92070,0.89595,0.91601,0.89153,0.92087,0.91975,0.92090,0.91724,0.88946,0.92302,0.90094,0.90520,0.91875,0.91847,0.92338
821108,cg16818145,0.93357,0.93074,0.92460,0.90428,0.91658,0.91653,0.94095,0.92737,0.89464,0.91530,0.90619,0.89856,0.91136,0.93228,0.92537,0.93208,0.91666,0.92829,0.89521,0.91878,0.92680,0.92752,0.89927,0.89578,0.90978,0.89618,0.90477,0.90218,0.92697,0.90806,0.92262,0.93648,0.92703,0.91806,0.90278,0.91288,0.92675,0.92137,0.91930,0.92977,0.91480,0.91868,0.92690,0.92293,0.92225,0.92603,0.91671,0.93603,0.91966,0.91993,0.90224,0.92480,0.94597,0.93428,0.92241,0.91783,0.91238,0.92501,0.91074,0.93163,0.92027,0.92772,0.92092,0.93829,0.93306,0.91950,0.90620,0.89516,0.92943,0.90814,0.92370,0.90714,0.91629,0.91248,0.91443,0.93712,0.89088,0.90902,0.93278,0.92592,0.92338,0.92114,0.92691,0.89682,0.92207,0.92572,0.91810,0.92647,0.90110,0.90853,0.94020,0.92646,0.91789,0.91639,0.89903,0.91725,0.92634,0.91429,0.93147,...,0.91292,0.91790,0.92041,0.91798,0.90333,0.89769,0.92346,0.88315,0.92486,0.90770,0.92401,0.92968,0.93553,0.91627,0.91011,0.90257,0.92862,0.92727,0.89832,0.91158,0.92436,0.91827,0.90252,0.90343,0.93597,0.92283,0.93134,0.89681,0.90307,0.92082,0.93775,0.94697,0.90651,0.90888,0.90800,0.90972,0.91830,0.93722,0.93083,0.89802,0.91632,0.93227,0.91683,0.89272,0.92720,0.89313,0.91080,0.91960,0.91358,0.91784,0.92586,0.93421,0.92644,0.90967,0.91171,0.92552,0.91386,0.92969,0.90288,0.92063,0.92264,0.93159,0.91870,0.93122,0.90295,0.90173,0.92458,0.91176,0.92713,0.93665,0.89481,0.92428,0.91314,0.92300,0.91080,0.92920,0.93145,0.91515,0.92632,0.92375,0.92676,0.91123,0.93005,0.91528,0.91679,0.91323,0.92880,0.91327,0.92995,0.92455,0.88900,0.93115,0.91379,0.93058,0.88946,0.89858,0.91864,0.90881,0.92396,0.93812
821109,cg14585103,0.86160,0.82358,0.87366,0.85791,0.84572,0.86190,0.87040,0.87146,0.85576,0.85193,0.85613,0.85617,0.83184,0.86578,0.84813,0.85436,0.85931,0.87327,0.87672,0.85553,0.82210,0.85728,0.86144,0.85255,0.85869,0.86327,0.85732,0.85934,0.85640,0.82132,0.88267,0.84762,0.85174,0.85645,0.83031,0.86113,0.86677,0.86148,0.84617,0.86735,0.86356,0.85920,0.83434,0.86942,0.87476,0.87445,0.83109,0.85989,0.86643,0.84663,0.85558,0.84978,0.87834,0.86997,0.85022,0.83104,0.84616,0.88724,0.87644,0.85433,0.86491,0.84943,0.84573,0.83928,0.83802,0.86142,0.86726,0.85894,0.86968,0.86334,0.86071,0.85889,0.82459,0.84819,0.85231,0.86842,0.86276,0.83724,0.89096,0.85138,0.86225,0.85755,0.82127,0.83478,0.87576,0.84971,0.86123,0.87005,0.86115,0.85366,0.85052,0.83550,0.87004,0.87724,0.86240,0.82893,0.86346,0.85984,0.86177,...,0.84015,0.81757,0.83203,0.86824,0.87231,0.88181,0.83903,0.86482,0.86539,0.85785,0.87022,0.84373,0.84792,0.85952,0.87804,0.85845,0.85162,0.84714,0.83023,0.85380,0.86133,0.84055,0.83903,0.83602,0.83990,0.88248,0.85281,0.88655,0.83650,0.83452,0.84142,0.84661,0.85334,0.82338,0.85351,0.87281,0.86537,0.85732,0.86991,0.86141,0.85272,0.86956,0.83193,0.83474,0.87299,0.84585,0.87950,0.86148,0.84082,0.86164,0.83779,0.85679,0.83959,0.84079,0.86551,0.82795,0.86607,0.87295,0.86848,0.85435,0.86587,0.83292,0.83365,0.86460,0.84094,0.86932,0.86266,0.84723,0.85510,0.88296,0.88863,0.86754,0.85625,0.84301,0.85306,0.85155,0.85400,0.85509,0.87790,0.86879,0.87233,0.87379,0.83234,0.83382,0.82893,0.86772,0.84613,0.81867,0.86438,0.88494,0.85754,0.86520,0.86760,0.88545,0.83581,0.84062,0.85627,0.85864,0.86846,0.85195
821110,cg10633746,0.10045,0.07840,0.08573,0.08987,0.09049,0.09647,0.09471,0.10536,0.08079,0.08417,0.09068,0.09601,0.08265,0.08678,0.09140,0.09424,0.08064,0.08231,0.09145,0.09904,0.08328,0.07966,0.07820,0.09115,0.09627,0.09239,0.09182,0.08496,0.08673,0.09323,0.09074,0.08242,0.07091,0.07971,0.07763,0.08481,0.08840,0.08889,0.10064,0.09579,0.08328,0.08863,0.09975,0.08727,0.08025,0.08680,0.09058,0.09136,0.08441,0.08922,0.08555,0.08858,0.08357,0.08364,0.09512,0.10194,0.09111,0.10230,0.08492,0.07824,0.08563,0.08902,0.08887,0.09555,0.09207,0.10704,0.08133,0.09429,0.09372,0.08558,0.08382,0.09097,0.08531,0.09837,0.09844,0.09381,0.09226,0.09200,0.09561,0.08492,0.10672,0.09360,0.09009,0.09583,0.09552,0.09463,0.08277,0.08823,0.09138,0.10231,0.08692,0.08129,0.09889,0.08142,0.09242,0.08685,0.08190,0.08502,0.09924,...,0.08266,0.09930,0.09845,0.09792,0.08645,0.07873,0.10396,0.08415,0.09235,0.08149,0.08413,0.08980,0.11265,0.10505,0.09398,0.09489,0.08996,0.08270,0.08725,0.09284,0.08869,0.07940,0.08699,0.10260,0.07871,0.09401,0.08521,0.09525,0.09079,0.09108,0.10486,0.08407,0.09706,0.09430,0.08883,0.08874,0.08428,0.08787,0.10201,0.08592,0.09036,0.10257,0.08529,0.09159,0.10608,0.09898,0.08027,0.08827,0.08997,0.08222,0.09984,0.09526,0.08903,0.07977,0.10215,0.08360,0.09908,0.09919,0.09549,0.08140,0.09041,0.09855,0.09406,0.08784,0.08090,0.08662,0.09397,0.09963,0.09544,0.09301,0.09520,0.10075,0.09512,0.08875,0.08168,0.08575,0.08713,0.09365,0.08933,0.09419,0.09468,0.08322,0.08848,0.09522,0.09933,0.10506,0.08360,0.07802,0.09792,0.09710,0.09038,0.09270,0.09969,0.08534,0.07212,0.09957,0.08944,0.09127,0.08839,0.08133


In [146]:
# Lets check if we have all the samples we were supposed to have
# from pre and post samples
print("All pre matching:", mrs_pheno['BaseName'].isin(mrs_meth.columns).all())

All pre matching: True


### ArmySTARRS

In [147]:
army_path = "G:/PGC ML/ArmySTARRS/"
army_pheno = read_data(fname="pre_post_armystarrs_Pheno_ML.csv", dirpath=army_path)
army_meth = read_data(fname="Starrs_noob_qcd_crossReactiveProbesRemoved_combat_CP_wcovar_age2TP_ptsd_allPreAsControls.feather",
                     dirpath=army_path)

In [148]:
print("Armystarrs beta shape :", army_meth.shape)
print("Armystarrs Pheno shape :", army_pheno.shape)

Armystarrs beta shape : (821329, 438)
Armystarrs Pheno shape : (430, 80)


In [149]:
# samples for two visits
army_pheno["visit"].value_counts()

0    215
2    215
Name: visit, dtype: int64

In [150]:
# In ArmySTARRS we have different trauma variables;
# let's consider either non-deployment-related trauma or
# deployment-related trauma
army_pheno["trauma_exposed_critA"].value_counts()

1    346
0     84
Name: trauma_exposed_critA, dtype: int64

In [151]:
# lets keep only trauma exposed
army_pheno = get_trauma_exposed(df = army_pheno, 
                               col = 'trauma_exposed_critA')

In [152]:
army_pheno['trauma_exposed_critA'].min()

1

In [153]:
army_pheno['visit'].value_counts()

0    173
2    173
Name: visit, dtype: int64

In [154]:
# get only visit 2
# army_v0_pheno, army_v2_pheno = [army_pheno[army_pheno['visit'] == i]
#                                 for i in [0,2]]

In [155]:
[x['visit'].value_counts() for x in [army_pheno]]

[0    173
 2    173
 Name: visit, dtype: int64]

In [156]:
[x["trauma_exposed_critA"].value_counts() 
 for x in [army_pheno]]

[1    346
 Name: trauma_exposed_critA, dtype: int64]

In [157]:
# drop columns that have all nas
army_pheno = army_pheno.dropna(axis=1, how = 'all')

In [158]:
[x.shape for x in [army_pheno]]

[(346, 75)]

In [159]:
print("No of unique ppts :", len(army_pheno['EWAS_id_new'].unique()))

No of unique ppts : 173


In [160]:
[x.isna().sum() for x in [army_pheno]]

[BaseName                            0
 EWAS_id                             0
 EWAS_id_new                         0
 GWAS_id                             0
 visitkey                            0
 visit                               0
 CURRENT_PTSD                       13
 LIFETIME_PTSD                       0
 TOBACCOUSE                          0
 P30DFREQ                           13
 MaltreatmentGlobal                  0
 nondeploy_trauma_exposed_critA      0
 deploy_trauma_exposed_critA         0
 trauma_exposed_critA                0
 pd_s                                0
 AGE                                13
 genetic_ancestry                    0
 hisp                                2
 race                                2
 PCL6_t0                           175
 PCL17_t23                         189
 pcl6_b_2q                         175
 pcl6_c_2q                         175
 pcl6_d_2q                         175
 pcl17_b_5q                        190
 pcl17_c_7q              

In [161]:
# Now get methylation samples that are in pheno
army_cols  = col_list(df=army_pheno)
army_meth  = get_samples(df = army_meth, cols = army_cols)
                            

In [162]:
[x.shape for x in [army_meth]]

[(821329, 347)]

In [163]:
# army_v0_pheno.columns.isin(army_v2_pheno.columns)

In [164]:
# ArmySTARRS has two age columns, let's drop the lowercase 'age' one.
# Reassigning with errors='ignore' (instead of inplace=True) keeps this cell
# safe to re-run: a second run no longer raises KeyError for the already
# removed column.
army_pheno = army_pheno.drop(columns = 'age', errors = 'ignore')

In [165]:
army_pheno.columns

Index(['BaseName', 'EWAS_id', 'EWAS_id_new', 'GWAS_id', 'visitkey', 'visit',
       'CURRENT_PTSD', 'LIFETIME_PTSD', 'TOBACCOUSE', 'P30DFREQ',
       'MaltreatmentGlobal', 'nondeploy_trauma_exposed_critA',
       'deploy_trauma_exposed_critA', 'trauma_exposed_critA', 'pd_s', 'AGE',
       'genetic_ancestry', 'hisp', 'race', 'PCL6_t0', 'PCL17_t23', 'pcl6_b_2q',
       'pcl6_c_2q', 'pcl6_d_2q', 'pcl17_b_5q', 'pcl17_c_7q', 'pcl17_d_5q',
       'GWAS', 'n', 'specimen_date', 'specimen_age', 'specimen_type', 'array',
       'methylationid', 'sentrix_id', 'sentrix_position', 'data_sharing',
       'case_status', 'PTSD_measure', 'PTSD_symptoms_pcl6',
       'PTSD_symptoms_pcl617', 'pheno_date', 's', 'Study', 'SampleID',
       'smoking', 'ptsd_case', 'bestpop_oneweek', 'Gender', 'CD8T.EPIC',
       'CD4T.EPIC', 'NK.EPIC', 'Bcell.EPIC', 'Mono.EPIC', 'Neu.EPIC',
       'CD8T.EPICnoob', 'CD4T.EPICnoob', 'NK.EPICnoob', 'Bcell.EPICnoob',
       'Mono.EPICnoob', 'Neu.EPICnoob', 'Comp.1', 'Comp.2', '

### PRISMO

In [166]:
prismo_path ="G:/PGC ML/PRISMO/"
prismo_pheno = read_data(fname="pre_post_prismo_Pheno_ML.csv", dirpath=prismo_path)
prismo_meth = read_data(fname="Prismo_noob_qcd_crossReactiveProbesRemoved_combat_CP_wcovar_age_ptsd_allPreAsControls.feather",
                     dirpath=prismo_path)

In [167]:
print("Prismo beta shape :", prismo_meth.shape)
print("Prismo Pheno shape :", prismo_pheno.shape)

Prismo beta shape : (821031, 237)
Prismo Pheno shape : (118, 64)


In [168]:
prismo_pheno["visit"].value_counts()


0_epic    59
2_epic    59
Name: visit, dtype: int64

In [169]:
# Keep only trauma-exposed PRISMO samples, using Pes_number as the exposure column.
# NOTE(review): get_trauma_exposed keeps rows where the column != 0, and a
# missing value also compares != 0, so rows with NaN Pes_number are retained.
# The isna() summary further down reports 67 missing Pes_number among the 117
# rows kept here — confirm that treating them as trauma exposed is intended.
# The row-wise filter also leaves 117 rows for 59 participants, i.e. one
# participant keeps only a single visit — TODO confirm this is acceptable.
prismo_pheno = get_trauma_exposed(df = prismo_pheno,
                                 col = 'Pes_number')
prismo_pheno.shape

(117, 64)

In [170]:
# prismo_pheno, prismo_v2_pheno = [prismo_pheno[prismo_pheno["visit"] == x]
#                                     for x in ["0_epic", "2_epic"]]

In [171]:
[x.shape for x in [prismo_pheno]]

[(117, 64)]

In [172]:
prismo_pheno.shape

(117, 64)

In [173]:
print("No of unique ppts pre :", len(prismo_pheno['EWAS_id'].unique()))

No of unique ppts pre : 59


In [174]:
prismo_pheno.isna().sum()

BaseName                    0
EWAS_id                     0
visit                       0
visitkey                    0
CURRENT_PTSD               12
SMOKING                    73
smoking_status              0
ETItot                      4
ETIalg                      2
ETIlich                     2
ETIgeest                    4
ETIs                        4
Pes_number                 67
gender                      0
AGE                         0
ancestry                    0
TOTAL_SCORE                12
REEXPERIENCE               12
AVOID                      12
HYPERAROUSAL               12
DEPRESSION                  8
ANXIETY                     8
GWAS                        0
n                           0
GWAS_id                     0
specimen_date              10
specimen_age               10
specimen_type               0
array                       0
methylation_id              0
sentrix_id                  0
sentrix_position            0
data_sharing                0
case_statu

In [175]:
prismo_cols = col_list(df = prismo_pheno)
        
prismo_meth = get_samples(df = prismo_meth, cols = prismo_cols)

In [176]:
[x.shape for x in [prismo_meth]]

[(821031, 118)]

In [177]:
prismo_pheno.columns

Index(['BaseName', 'EWAS_id', 'visit', 'visitkey', 'CURRENT_PTSD', 'SMOKING',
       'smoking_status', 'ETItot', 'ETIalg', 'ETIlich', 'ETIgeest', 'ETIs',
       'Pes_number', 'gender', 'AGE', 'ancestry', 'TOTAL_SCORE',
       'REEXPERIENCE', 'AVOID', 'HYPERAROUSAL', 'DEPRESSION', 'ANXIETY',
       'GWAS', 'n', 'GWAS_id', 'specimen_date', 'specimen_age',
       'specimen_type', 'array', 'methylation_id', 'sentrix_id',
       'sentrix_position', 'data_sharing', 'case_status', 'PTSD_measure',
       'PTSD_symptoms', 'pheno_date', 's', 'age', 'genetic_ancestry', 'Study',
       'SameIDs', 'Group', 'time', 'ptsdActualCont', 'ptsdActual', 'Comp.1',
       'Comp.2', 'Comp.3', 'Comp.4', 'Comp.5', 'Comp.6', 'Comp.7', 'Comp.8',
       'Comp.9', 'Comp.10', 'CD8T.Epic', 'CD4T.Epic', 'NK.Epic', 'Bcell.Epic',
       'Mono.Epic', 'Neu.Epic', 'ptsdActualCont_01scaled', 'SmoS'],
      dtype='object')

In [178]:
# just a thought ----------------
# in DNHS we have Remitted samples as well
# So when we use ptsdpm, we need to remove those remitted ones


In [179]:
# dnhs_pheno.iloc[:5, :5]

In [180]:
# gtp.iloc[:5, :5]

#### Now combine all data

In [181]:
def rename_column(df, col_indx):
    """
    Function to rename one column, picked by position, to 'CpGs'.
    Parameters:
    df: data frame
    col_indx: integer position of the column to rename
    output: new data frame with the renamed column (df is not modified)
    """
    current_label = df.columns[col_indx]
    mapping = {current_label: 'CpGs'}
    return df.rename(columns = mapping)

# Make a list of dfs 
all_meth_dfs = [mrs_meth, army_meth, prismo_meth]

# rename the first column
all_meth_dfs = [rename_column(df = x, col_indx = 0)
                for x in all_meth_dfs]


In [182]:
[x.iloc[:5, :5] for x in all_meth_dfs]

[         CpGs  201533580029_R01C01  201533580029_R02C01  201533580029_R03C01  \
 0  cg18478105              0.01471              0.01638              0.01675   
 1  cg09835024              0.02721              0.03539              0.02973   
 2  cg14361672              0.87702              0.90871              0.88785   
 3  cg01763666              0.80343              0.82561              0.82502   
 4  cg12950382              0.90471              0.91791              0.88500   
 
    201533580029_R04C01  
 0              0.01356  
 1              0.03160  
 2              0.86194  
 3              0.88384  
 4              0.92514  ,
          CpGs  201858500071_R03C01  201858500071_R04C01  201858500071_R07C01  \
 0  cg18478105              0.01526              0.01589              0.01544   
 1  cg09835024              0.02771              0.02711              0.02504   
 2  cg14361672              0.88314              0.92609              0.92533   
 3  cg01763666              0.8

In [183]:
# Combine all methylation data: fold the list of per-cohort frames left to
# right with an inner join on the shared 'CpGs' column, so only CpGs present
# in every frame are kept.
from functools import reduce

dfs_merged = reduce(
    lambda acc, nxt: acc.merge(nxt, on = "CpGs", how='inner'),
    all_meth_dfs,
)

In [184]:
[x.shape for x in [dfs_merged]]

[(820498, 718)]

In [185]:
[len(x['CpGs'].unique()) for x in [dfs_merged]]

[820498]

In [186]:
dfs_merged.iloc[:5, :5]

Unnamed: 0,CpGs,201533580029_R01C01,201533580029_R02C01,201533580029_R03C01,201533580029_R04C01
0,cg18478105,0.01471,0.01638,0.01675,0.01356
1,cg09835024,0.02721,0.03539,0.02973,0.0316
2,cg14361672,0.87702,0.90871,0.88785,0.86194
3,cg01763666,0.80343,0.82561,0.82502,0.88384
4,cg12950382,0.90471,0.91791,0.885,0.92514


#### We also need to combine the phenotypes, but before we do that we need to get the common variables

In [187]:
# dnhs_pheno.columns

In [188]:
# gtp_pheno_comb.columns

In [189]:
mrs_pheno.columns

Index(['BaseName', 'EWAS_id', 'GWAS_id', 'studyid', 'visitkey', 'visit',
       'specimen_date', 'specimen_type', 'array', 'sentrix_id',
       'sentrix_position', 'data_sharing', 'case_status', 'PTSD_measure',
       'PTSD_symptoms', 'pheno_date', 'CAPSF1I2s', 'TOBAC_lifetime_user',
       'current_smoking', 'current_snuff', 'current_chew', 'current_any',
       'smokelast', 'nicotine_use', 'CTQ_TOTAL', 'LECCUM_Stringent', 'Sex',
       'Age', 'genetic_ancestry', 'ba_ethnic', 'ba_race', 'CAPStots', 'CAPSBs',
       'CAPSCs', 'CAPSDs', 'BDI2_mod_sev', 'BDI2_SUM', 'BAI_mod_sev',
       'BAI_SUM', 'GWAS', 'Study', 'ID', 'Group', 'Case', 'PCL_SUM', 'tobacco',
       'post_deployment_visit', 'age', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5',
       'CD8T', 'CD4T', 'NK', 'Bcell', 'Mono', 'Neu', 'Comp.1', 'Comp.2',
       'Comp.3', 'PCL_SUM_01scaled', 'SmoS'],
      dtype='object')

In [190]:
army_pheno.columns

Index(['BaseName', 'EWAS_id', 'EWAS_id_new', 'GWAS_id', 'visitkey', 'visit',
       'CURRENT_PTSD', 'LIFETIME_PTSD', 'TOBACCOUSE', 'P30DFREQ',
       'MaltreatmentGlobal', 'nondeploy_trauma_exposed_critA',
       'deploy_trauma_exposed_critA', 'trauma_exposed_critA', 'pd_s', 'AGE',
       'genetic_ancestry', 'hisp', 'race', 'PCL6_t0', 'PCL17_t23', 'pcl6_b_2q',
       'pcl6_c_2q', 'pcl6_d_2q', 'pcl17_b_5q', 'pcl17_c_7q', 'pcl17_d_5q',
       'GWAS', 'n', 'specimen_date', 'specimen_age', 'specimen_type', 'array',
       'methylationid', 'sentrix_id', 'sentrix_position', 'data_sharing',
       'case_status', 'PTSD_measure', 'PTSD_symptoms_pcl6',
       'PTSD_symptoms_pcl617', 'pheno_date', 's', 'Study', 'SampleID',
       'smoking', 'ptsd_case', 'bestpop_oneweek', 'Gender', 'CD8T.EPIC',
       'CD4T.EPIC', 'NK.EPIC', 'Bcell.EPIC', 'Mono.EPIC', 'Neu.EPIC',
       'CD8T.EPICnoob', 'CD4T.EPICnoob', 'NK.EPICnoob', 'Bcell.EPICnoob',
       'Mono.EPICnoob', 'Neu.EPICnoob', 'Comp.1', 'Comp.2', '

In [191]:
prismo_pheno.columns

Index(['BaseName', 'EWAS_id', 'visit', 'visitkey', 'CURRENT_PTSD', 'SMOKING',
       'smoking_status', 'ETItot', 'ETIalg', 'ETIlich', 'ETIgeest', 'ETIs',
       'Pes_number', 'gender', 'AGE', 'ancestry', 'TOTAL_SCORE',
       'REEXPERIENCE', 'AVOID', 'HYPERAROUSAL', 'DEPRESSION', 'ANXIETY',
       'GWAS', 'n', 'GWAS_id', 'specimen_date', 'specimen_age',
       'specimen_type', 'array', 'methylation_id', 'sentrix_id',
       'sentrix_position', 'data_sharing', 'case_status', 'PTSD_measure',
       'PTSD_symptoms', 'pheno_date', 's', 'age', 'genetic_ancestry', 'Study',
       'SameIDs', 'Group', 'time', 'ptsdActualCont', 'ptsdActual', 'Comp.1',
       'Comp.2', 'Comp.3', 'Comp.4', 'Comp.5', 'Comp.6', 'Comp.7', 'Comp.8',
       'Comp.9', 'Comp.10', 'CD8T.Epic', 'CD4T.Epic', 'NK.Epic', 'Bcell.Epic',
       'Mono.Epic', 'Neu.Epic', 'ptsdActualCont_01scaled', 'SmoS'],
      dtype='object')

In [192]:
# rename the columns
# DNHS
# dnhs_pheno = dnhs_pheno.rename(columns={'X':'BaseName', 'race6cat':'Race',
#                           'childhood_cum_trauma': 'Childhood_MT',
#                            'life_worst_intrusion': 'Intrusion',
#                            'life_worst_avoidance': 'Avoidance',
#                            'life_worst_hyperarousal': 'Hyperarousal',
#                            'phq9sum': 'MDD',
#                            'gad7sum': 'GAD',
#                            'Life_PTS_severity': 'PTS_severity'
#                           })


In [193]:
# GTP
# gtp_pheno_comb = gtp_pheno_comb.rename(columns={'Unnamed: 0':'BaseName',
#                                'mergedcapsandpsswinthin30days':'PTSDpm', 
#                                'Life_PTSD_01': 'PTSDLife',
#                                'age_x': 'Age',
#                                'tei_total_types_experienced_somewitness':'TraumaNum',
#                               'caps_life_freqplusintens_combined': 'PTS_severity',
#                                'PSS_Intrusive': 'Intrusion',
#                                'PSS_avoidnumb': 'Avoidance',
#                                'PSS_hyperarousal': 'Hyperarousal',
#                                'BDItotalscore': 'MDD',
#                                'CTQTOT': 'Childhood_MT',
#                                'pc1': 'Comp.1',
#                                'pc2': 'Comp.2',
#                                'pc3': 'Comp.3',
                               
#                               })

In [194]:
# MRS
# Map the MRS-specific column names onto the shared phenotype names used
# for all cohorts. rename() ignores keys that are not present in the frame.
# 'ba_race' becomes lower-case 'race' here while the other cohorts use 'Race';
# the later get_cols(..., title=True) call title-cases the selected names.
# 'Lifetime.PTSD' is not among the mrs_pheno columns printed earlier in this
# notebook, so that entry renames nothing and MRS ends up without 'PTSDLife'.
# NOTE(review): 'MDD' is taken from a sum score (BDI2_SUM) but 'GAD' from
# BAI_mod_sev although BAI_SUM is also available - confirm this is intended.
mrs_pheno = mrs_pheno.rename(columns={'ba_race': 'race',
                               'CAPSF1I2s': 'PTSDpm',
                               'Lifetime.PTSD' : 'PTSDLife',
                               'LECCUM_Stringent': 'TraumaNum',
                               'CAPStots': 'PTS_severity',
                               'CAPSBs': 'Intrusion',
                               'CAPSCs': 'Avoidance',
                               'CAPSDs': 'Hyperarousal',
                               'BDI2_SUM': 'MDD',
                               'BAI_mod_sev': 'GAD',
                               'CTQ_TOTAL' : 'Childhood_MT',
                               'Sex': 'Gender'
                              })

In [195]:
# rename the columns
# ArmyStarrs
# Map the Army STARRS column names onto the shared phenotype names.
# Both '<cell>.EPIC' and '<cell>.EPICnoob' cell-type columns exist in
# army_pheno; only the '.EPICnoob' ones are renamed to the short names
# (CD8T, CD4T, ...), the '.EPIC' ones keep their original names.
# NOTE(review): 'trauma_exposed_critA' is mapped to 'TraumaNum'; its name
# suggests an exposure flag rather than a count - confirm the coding is
# comparable with the other cohorts.
army_pheno = army_pheno.rename(columns={'race':'Race',
                        'CURRENT_PTSD': "PTSDpm",
                        'LIFETIME_PTSD': 'PTSDLife',
                        'MaltreatmentGlobal': 'Childhood_MT',
                        'trauma_exposed_critA': 'TraumaNum',
                        'pcl17_b_5q': 'Intrusion',
                        'pcl17_c_7q': 'Avoidance',
                        'pcl17_d_5q': 'Hyperarousal',
                        'PCL17_t23': 'PTS_severity',
                        'CD8T.EPICnoob': 'CD8T',
                        'CD4T.EPICnoob': 'CD4T',
                        'NK.EPICnoob': 'NK',
                        'Bcell.EPICnoob': 'Bcell',
                        'Mono.EPICnoob': 'Mono',
                        'Neu.EPICnoob': 'Neu'
                          })

In [196]:
# Prismo
# Map the PRISMO column names onto the shared phenotype names.
# rename() ignores keys that are not present: 'LIFETIME_PTSD' is not among
# the prismo_pheno columns printed earlier, so PRISMO gets no 'PTSDLife'.
# NOTE(review): 'ancestry' holds text codes (e.g. 'eur', 'oth' in the combined
# table shown later) while the MRS race column is numeric - the combined
# 'Race' column therefore mixes codings; confirm before using it as a feature.
prismo_pheno  = prismo_pheno.rename(columns={'ancestry':'Race',
                          'CURRENT_PTSD': "PTSDpm",
                          'LIFETIME_PTSD': 'PTSDLife',
                          'ETItot': 'Childhood_MT',
                          'Pes_number': 'TraumaNum',
                          'REEXPERIENCE': 'Intrusion',
                          'AVOID': 'Avoidance',
                          'HYPERAROUSAL': 'Hyperarousal',
                          'TOTAL_SCORE': 'PTS_severity',
                          'CD8T.Epic': 'CD8T',
                          'CD4T.Epic': 'CD4T',
                          'NK.Epic': 'NK',
                          'Bcell.Epic': 'Bcell',
                          'Mono.Epic': 'Mono',
                          'Neu.Epic': 'Neu',
                          'gender': 'Gender'
                          })

In [197]:
# dnhs_pheno.columns

In [198]:
# gtp_pheno_comb.columns

In [199]:
# For MRS, PCs from both the GWAS and the methylation data are available
# Comp.1, Comp.2, Comp.3 are the methylation PCs
[x.columns for x in [mrs_pheno]]

[Index(['BaseName', 'EWAS_id', 'GWAS_id', 'studyid', 'visitkey', 'visit',
        'specimen_date', 'specimen_type', 'array', 'sentrix_id',
        'sentrix_position', 'data_sharing', 'case_status', 'PTSD_measure',
        'PTSD_symptoms', 'pheno_date', 'PTSDpm', 'TOBAC_lifetime_user',
        'current_smoking', 'current_snuff', 'current_chew', 'current_any',
        'smokelast', 'nicotine_use', 'Childhood_MT', 'TraumaNum', 'Gender',
        'Age', 'genetic_ancestry', 'ba_ethnic', 'race', 'PTS_severity',
        'Intrusion', 'Avoidance', 'Hyperarousal', 'BDI2_mod_sev', 'MDD', 'GAD',
        'BAI_SUM', 'GWAS', 'Study', 'ID', 'Group', 'Case', 'PCL_SUM', 'tobacco',
        'post_deployment_visit', 'age', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5',
        'CD8T', 'CD4T', 'NK', 'Bcell', 'Mono', 'Neu', 'Comp.1', 'Comp.2',
        'Comp.3', 'PCL_SUM_01scaled', 'SmoS'],
       dtype='object')]

In [200]:
[x.columns for x in[army_pheno]]

[Index(['BaseName', 'EWAS_id', 'EWAS_id_new', 'GWAS_id', 'visitkey', 'visit',
        'PTSDpm', 'PTSDLife', 'TOBACCOUSE', 'P30DFREQ', 'Childhood_MT',
        'nondeploy_trauma_exposed_critA', 'deploy_trauma_exposed_critA',
        'TraumaNum', 'pd_s', 'AGE', 'genetic_ancestry', 'hisp', 'Race',
        'PCL6_t0', 'PTS_severity', 'pcl6_b_2q', 'pcl6_c_2q', 'pcl6_d_2q',
        'Intrusion', 'Avoidance', 'Hyperarousal', 'GWAS', 'n', 'specimen_date',
        'specimen_age', 'specimen_type', 'array', 'methylationid', 'sentrix_id',
        'sentrix_position', 'data_sharing', 'case_status', 'PTSD_measure',
        'PTSD_symptoms_pcl6', 'PTSD_symptoms_pcl617', 'pheno_date', 's',
        'Study', 'SampleID', 'smoking', 'ptsd_case', 'bestpop_oneweek',
        'Gender', 'CD8T.EPIC', 'CD4T.EPIC', 'NK.EPIC', 'Bcell.EPIC',
        'Mono.EPIC', 'Neu.EPIC', 'CD8T', 'CD4T', 'NK', 'Bcell', 'Mono', 'Neu',
        'Comp.1', 'Comp.2', 'Comp.3', 'Comp.4', 'Comp.5', 'Comp.6', 'Comp.7',
        'Comp.8', 'Comp.

In [201]:
[x.columns for x in [prismo_pheno]]

[Index(['BaseName', 'EWAS_id', 'visit', 'visitkey', 'PTSDpm', 'SMOKING',
        'smoking_status', 'Childhood_MT', 'ETIalg', 'ETIlich', 'ETIgeest',
        'ETIs', 'TraumaNum', 'Gender', 'AGE', 'Race', 'PTS_severity',
        'Intrusion', 'Avoidance', 'Hyperarousal', 'DEPRESSION', 'ANXIETY',
        'GWAS', 'n', 'GWAS_id', 'specimen_date', 'specimen_age',
        'specimen_type', 'array', 'methylation_id', 'sentrix_id',
        'sentrix_position', 'data_sharing', 'case_status', 'PTSD_measure',
        'PTSD_symptoms', 'pheno_date', 's', 'age', 'genetic_ancestry', 'Study',
        'SameIDs', 'Group', 'time', 'ptsdActualCont', 'ptsdActual', 'Comp.1',
        'Comp.2', 'Comp.3', 'Comp.4', 'Comp.5', 'Comp.6', 'Comp.7', 'Comp.8',
        'Comp.9', 'Comp.10', 'CD8T', 'CD4T', 'NK', 'Bcell', 'Mono', 'Neu',
        'ptsdActualCont_01scaled', 'SmoS'],
       dtype='object')]

In [202]:
# MRS has two age columns ('Age' and 'age'); drop 'Age' and keep 'age' so the
# case-insensitive '^Age$' column selection further down picks up one column.
# errors='ignore' makes the cell safe to re-run once 'Age' is already gone.
mrs_pheno.drop(columns=['Age'], inplace=True, errors='ignore')

In [203]:
# Drop the upper-case 'AGE' column from Army STARRS and PRISMO, in place.
# A plain loop is used because the drop is done only for its side effect;
# errors='ignore' makes the cell safe to re-run once 'AGE' is already gone.
for pheno in (army_pheno, prismo_pheno):
    pheno.drop(columns=['AGE'], inplace=True, errors='ignore')

[None, None]

In [204]:
# Columns to keep from every cohort's phenotype table.
# The entries are regular-expression fragments, not plain names: get_cols
# joins them with '|' and searches each column name, and the call below
# passes case=True so upper/lower case is ignored.
# '^' / '$' pin a fragment to the start / end of a column name; fragments
# without them match anywhere inside a name.
# NOTE(review): the '.' in 'Comp.2' / 'Comp.3' is an unescaped regex wildcard,
# and 'Comp.1' is not requested - confirm both are intentional.
need_cols = ['BaseName', '^EWAS_id$', '^visit$', '^visitkey$',
             'Gender','race$', '^Age$', 
             'PTSDpm', 'PTSDLife', 'TraumaNum', 
             'CD8T$', 'CD4T$', 'NK$', 'Bcell$', 'Mono$',
             'Neu$','PTS_severity', 'Childhood_MT',
             'Intrusion', 'Avoidance', 'Hyperarousal', '^MDD$',
            'Comp.2', 'Comp.3', 'Study$', 'SmoS']

In [205]:
import re
def get_cols(df, cols, case=None, title=None, sort=None):
    """
    Function to select the columns whose names match any of the given patterns
    Parameters:
    df: data frame
    cols: list of regular-expression fragments; they are joined with '|'
          and searched for in every column name
    case: True to ignore upper/lower case while matching, None by default
    title: True to title-case the selected column names, None by default
    sort: True to order the selected columns alphabetically, None by default

    Output: The dataframe with selected columns
    """
    # flags=0 is the same as compiling without flags
    flags = re.IGNORECASE if case is True else 0
    pattern = re.compile('|'.join(cols), flags)
    selected = df.filter(regex=pattern)

    if title is True:
        selected.columns = [name.title() for name in selected.columns]

    return selected.sort_index(axis=1) if sort is True else selected


# get the frequency of elements
def get_frequency(df, col):
    """
    Function to count how often each value occurs in a column
    Parameters:
    df: data frame
    col: name of the column to count
    output: the value_counts() result for that column
    """
    column_values = df[col]
    return column_values.value_counts()


In [206]:
# now get the required columns from all dfs

# Cohort labels, in the same order as the phenotype tables
cohorts = ["mrs", "armystarrs", "prismo"]
all_phenos = [mrs_pheno, army_pheno, prismo_pheno]

# One trimmed phenotype table per cohort, keyed by cohort label:
# required columns only, names title-cased, columns sorted alphabetically
phenos_sub = {
    cohort: get_cols(df = pheno, cols=need_cols, case=True,
                     title=True, sort=True)
    for cohort, pheno in zip(cohorts, all_phenos)
}

In [207]:
phenos_sub.keys()

dict_keys(['mrs', 'armystarrs', 'prismo'])

In [208]:
[x.columns for x in phenos_sub.values()]

[Index(['Age', 'Avoidance', 'Basename', 'Bcell', 'Cd4T', 'Cd8T', 'Childhood_Mt',
        'Comp.2', 'Comp.3', 'Ewas_Id', 'Gender', 'Hyperarousal', 'Intrusion',
        'Mdd', 'Mono', 'Neu', 'Nk', 'Pts_Severity', 'Ptsdpm', 'Race', 'Smos',
        'Study', 'Traumanum', 'Visit', 'Visitkey'],
       dtype='object'),
 Index(['Age', 'Avoidance', 'Basename', 'Bcell', 'Cd4T', 'Cd8T', 'Childhood_Mt',
        'Comp.2', 'Comp.3', 'Ewas_Id', 'Gender', 'Hyperarousal', 'Intrusion',
        'Mono', 'Neu', 'Nk', 'Pts_Severity', 'Ptsdlife', 'Ptsdpm', 'Race',
        'Smos', 'Study', 'Traumanum', 'Visit', 'Visitkey'],
       dtype='object'),
 Index(['Age', 'Avoidance', 'Basename', 'Bcell', 'Cd4T', 'Cd8T', 'Childhood_Mt',
        'Comp.2', 'Comp.3', 'Ewas_Id', 'Gender', 'Hyperarousal', 'Intrusion',
        'Mono', 'Neu', 'Nk', 'Pts_Severity', 'Ptsdpm', 'Race', 'Smos', 'Study',
        'Traumanum', 'Visit', 'Visitkey'],
       dtype='object')]

In [209]:
# get columns of each df
mrs_cols, army_cols, prismo_cols = [x.columns for x in phenos_sub.values()]

In [210]:
mrs_cols

Index(['Age', 'Avoidance', 'Basename', 'Bcell', 'Cd4T', 'Cd8T', 'Childhood_Mt',
       'Comp.2', 'Comp.3', 'Ewas_Id', 'Gender', 'Hyperarousal', 'Intrusion',
       'Mdd', 'Mono', 'Neu', 'Nk', 'Pts_Severity', 'Ptsdpm', 'Race', 'Smos',
       'Study', 'Traumanum', 'Visit', 'Visitkey'],
      dtype='object')

In [211]:
army_cols

Index(['Age', 'Avoidance', 'Basename', 'Bcell', 'Cd4T', 'Cd8T', 'Childhood_Mt',
       'Comp.2', 'Comp.3', 'Ewas_Id', 'Gender', 'Hyperarousal', 'Intrusion',
       'Mono', 'Neu', 'Nk', 'Pts_Severity', 'Ptsdlife', 'Ptsdpm', 'Race',
       'Smos', 'Study', 'Traumanum', 'Visit', 'Visitkey'],
      dtype='object')

In [212]:
prismo_cols

Index(['Age', 'Avoidance', 'Basename', 'Bcell', 'Cd4T', 'Cd8T', 'Childhood_Mt',
       'Comp.2', 'Comp.3', 'Ewas_Id', 'Gender', 'Hyperarousal', 'Intrusion',
       'Mono', 'Neu', 'Nk', 'Pts_Severity', 'Ptsdpm', 'Race', 'Smos', 'Study',
       'Traumanum', 'Visit', 'Visitkey'],
      dtype='object')

In [213]:
# check if column names are matching
# (dnhs_cols == gtp_cols).all()

In [214]:
import warnings
def matching(l1, l2):
    """
    Function to compare two sequences of labels and check the order

    Prints the size of both inputs, how many elements of l1 also occur in l2,
    and whether the elements are in the same order. When the sizes differ,
    the elements of l1 that are missing from l2 are printed and the order
    check is done on the elements the two inputs have in common.

    Parameters:
    l1: list 1 (list, tuple, pandas Index, ...) of hashable labels
    l2: list 2 (list, tuple, pandas Index, ...) of hashable labels
    """
    # Work on plain lists so that lists and pandas Index objects behave alike;
    # an element-wise `==` followed by .all() only works for array-likes of
    # equal length.
    first, second = list(l1), list(l2)
    in_second = set(second)

    print("Total elements in l1 :", len(first))
    print("Total elements in l2 :", len(second))
    m = sum(1 for item in first if item in in_second)
    print("Elements matching between l1 and l2 :", m)
    if len(first) == len(second):
        print("All in order :", first == second)
    else:
        # elements only present in l1
        elm = list(set(first).difference(second))
        print(elm)
        first = [x for x in first if x in in_second]
        print(first)
        # also ignore elements only present in l2, so the comparison is
        # between the shared elements of both inputs
        in_first = set(first)
        second_common = [x for x in second if x in in_first]
        print("All common elements in order :", first == second_common)
        

In [215]:
matching(l1 = mrs_cols, l2 = army_cols)

Total elements in l1 : 25
Total elements in l2 : 25
Elements matching between l1 and l2 : 24
All in order : False


In [216]:
matching(l1 = army_cols, l2 = prismo_cols)

Total elements in l1 : 25
Total elements in l2 : 24
Elements matching between l1 and l2 : 24
['Ptsdlife']
['Age', 'Avoidance', 'Basename', 'Bcell', 'Cd4T', 'Cd8T', 'Childhood_Mt', 'Comp.2', 'Comp.3', 'Ewas_Id', 'Gender', 'Hyperarousal', 'Intrusion', 'Mono', 'Neu', 'Nk', 'Pts_Severity', 'Ptsdpm', 'Race', 'Smos', 'Study', 'Traumanum', 'Visit', 'Visitkey']
All common elements in order : True


In [217]:
# matching(l1 = dnhs_cols, l2 = army_cols)

In [218]:
# matching(l1 = dnhs_cols, l2 = prismo_cols)

In [219]:
# common in all
list(set(mrs_cols) & set(army_cols) & set(prismo_cols))

['Cd8T',
 'Gender',
 'Comp.3',
 'Pts_Severity',
 'Nk',
 'Ewas_Id',
 'Intrusion',
 'Mono',
 'Age',
 'Study',
 'Visitkey',
 'Cd4T',
 'Hyperarousal',
 'Race',
 'Ptsdpm',
 'Avoidance',
 'Traumanum',
 'Neu',
 'Basename',
 'Visit',
 'Childhood_Mt',
 'Smos',
 'Comp.2',
 'Bcell']

In [220]:
[x.iloc[:5, :5] for x in phenos_sub.values()]

[     Age  Avoidance             Basename   Bcell    Cd4T
 77    20          3  201858500010_R04C01 0.06982 0.11557
 76    21          8  201858500010_R03C01 0.03482 0.08983
 63    21         12  201533590058_R02C01 0.02957 0.12690
 62    22          3  201533590058_R01C01 0.02668 0.15131
 245   21          4  202410280158_R08C01 0.00214 0.12064,
     Age  Avoidance             Basename   Bcell    Cd4T
 2    30        NaN  201858500071_R03C01 0.06154 0.10474
 3    32        NaN  201858500071_R04C01 0.04358 0.13446
 6    29        NaN  201858500071_R07C01 0.03540 0.09427
 7    30    7.00000  201858500071_R08C01 0.04971 0.16086
 10   35   22.00000  201858500082_R03C01 0.04404 0.14152,
        Age  Avoidance             Basename   Bcell    Cd4T
 0 25.20128   16.00000  201228780091_R03C01 0.05651 0.14886
 1 25.64206   16.00000  201228780091_R04C01 0.04168 0.12357
 2 26.34908    9.00000  201228780091_R07C01 0.03384 0.09820
 3 26.84189   18.00000  201228780091_R08C01 0.04501 0.12114
 4 21.22

In [221]:
# phenos_sub['gtp']

In [222]:
# convert to int
# gtp_p_sub = phenos_sub['gtp'].astype({"Traumanum":'int',
#                              })
# get_frequency(df=gtp_p_sub, col='Ptsdlife')

In [223]:
# nas in ptsd life
# gtp_p_sub['Ptsdlife'].isna().sum()

In [224]:
# phenos_sub['dnhs']

In [225]:
# get number of males and females
# all are males in MRS, ArmySTARRS and PRISMO

# Convert 0 to 1 to make gender uniform 
# NOTE(review): the assignment below sets Gender to 1 for every Army STARRS
# row, not only for rows coded 0 - any missing or differently coded value
# is overwritten as well; confirm that is acceptable.
phenos_sub["armystarrs"]["Gender"] = 1
# Gender counts per cohort, in the order mrs, armystarrs, prismo
[get_frequency(df = x, col='Gender') for x in phenos_sub.values()]

[1    254
 Name: Gender, dtype: int64,
 1    346
 Name: Gender, dtype: int64,
 1    117
 Name: Gender, dtype: int64]

In [226]:
def replace_elements(df, col, new_elements, verbose = None):
    """
    Function to recode the categories of a column, e.g female:2, male:1

    The existing categories are taken in the order returned by
    value_counts() and paired position by position with new_elements.

    Parameters:
    df: data frame in which you want to replace (it is not modified)
    col: name of the column in which you want to replace the elements
    new_elements: new values, one per existing category
    verbose: True to print the categories and the mapping, default None

    output: copy of the data frame with the recoded column
    Raises ValueError when the number of categories and new values differ
    """
    categories = df[col].value_counts().index
    if len(categories) != len(new_elements):
        raise ValueError("Elements to replace must have the same length as new elements")

    # old value -> new value, paired by position
    mapping = dict(zip(categories, new_elements))

    if verbose is True:
        print("Categories :\n", categories)
        print("Replacing :\n", mapping)

    recoded = df.copy(deep = True)
    recoded[col] = recoded[col].replace(mapping)
    return recoded

In [227]:
# replace gender in DNHS
# In original study, M = 2, F = 1
# But here in ML we will replace it to make it uniform with other studies
# dnhs_final = replace_elements(df = phenos_sub['dnhs'], col='Gender', 
#                        new_elements=[2,1], verbose=True)

In [228]:
# before 
# phenos_sub['dnhs']['Gender'].value_counts()

In [229]:
# After replacing
# dnhs_final["Gender"].value_counts()

In [230]:
# replace gender in GTP
# gtp_final = replace_elements(df = phenos_sub['gtp'], col='Gender', 
#                            new_elements=[2,1], verbose=True)

In [231]:
# Before 
# phenos_sub['gtp']['Gender'].value_counts()

In [232]:
# gtp_final['Gender'].value_counts()

In [233]:
# replace race in GTP
# gtp_final = replace_elements(df = gtp_final, col = "Race",
#                               new_elements=[2,1], verbose=True)

In [234]:
# gtp_final["Race"].value_counts()

In [235]:
# gtp_final

In [236]:
# MRS
phenos_sub['mrs']['Gender'].value_counts()

1    254
Name: Gender, dtype: int64

In [237]:
phenos_sub['armystarrs']['Gender'].value_counts()

1    346
Name: Gender, dtype: int64

In [238]:
phenos_sub['prismo']['Gender'].value_counts()

1    117
Name: Gender, dtype: int64

In [239]:
# combine phenotype data
# Stack the three cohort tables row-wise. Columns that a cohort does not
# have (e.g. 'Mdd', 'Ptsdlife') are filled with NaN for that cohort.
# Each cohort keeps its own row labels (no ignore_index), so the index of
# `final` is not guaranteed to be unique.
final = pd.concat([phenos_sub['mrs'],
                   phenos_sub['armystarrs'],
                   phenos_sub['prismo']],
                  sort = False)

In [240]:
final.shape

(717, 26)

In [241]:
final

Unnamed: 0,Age,Avoidance,Basename,Bcell,Cd4T,Cd8T,Childhood_Mt,Comp.2,Comp.3,Ewas_Id,Gender,Hyperarousal,Intrusion,Mdd,Mono,Neu,Nk,Pts_Severity,Ptsdpm,Race,Smos,Study,Traumanum,Visit,Visitkey,Ptsdlife
77,20.00000,3.00000,201858500010_R04C01,0.06982,0.11557,0.10634,,-0.02316,0.03448,1058_0,1,5.00000,2.00000,0.00000,0.05471,0.62616,0.02740,10.00000,0.00000,5,-14.42993,MRS,8.00000,0.00000,100119,
76,21.00000,8.00000,201858500010_R03C01,0.03482,0.08983,0.09597,,-0.02316,0.03448,1058_3,1,7.00000,3.00000,1.00000,0.05674,0.68559,0.03706,18.00000,0.00000,5,-14.66250,MRS,10.00000,3.00000,103964,
63,21.00000,12.00000,201533590058_R02C01,0.02957,0.12690,0.12345,,-0.03292,-0.01291,1148_0,1,10.00000,0.00000,11.00000,0.05332,0.65024,0.01651,22.00000,0.00000,5,18.71351,MRS,14.00000,0.00000,100208,
62,22.00000,3.00000,201533590058_R01C01,0.02668,0.15131,0.11937,,-0.03292,-0.01291,1148_2,1,19.00000,5.00000,4.00000,0.06432,0.60346,0.03486,27.00000,0.00000,5,21.32411,MRS,15.00000,2.00000,101866,
245,21.00000,4.00000,202410280158_R08C01,0.00214,0.12064,0.09551,,-0.08241,-0.00894,1340_0,1,23.00000,5.00000,9.00000,0.07120,0.66415,0.04637,32.00000,0.00000,5,-29.20186,MRS,3.00000,0.00000,100350,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113,20.22253,,201233760084_R01C01,0.03351,0.10032,0.10377,1.00000,-0.02648,0.06910,U.A.7.841,1,,,,0.04434,0.61882,0.09923,,,oth,-16.01063,Prismo,,0_epic,U.A.7.841.0,
114,21.61125,9.00000,201233760084_R03C01,0.04360,0.12664,0.12946,1.00000,-0.02329,0.06724,U.A.7.841,1,10.00000,6.00000,,0.05086,0.57904,0.07040,25.00000,0.00000,oth,-34.69551,Prismo,4.00000,2_epic,U.A.7.841.2,
115,21.58184,18.00000,201233760084_R04C01,0.03957,0.15071,0.10001,6.00000,0.05563,0.03870,A.1.025,1,17.00000,10.00000,,0.09667,0.57299,0.04004,45.00000,1.00000,eur,0.59052,Prismo,,2_epic,A.1.025.2,
116,21.00000,,201233760084_R06C01,0.04812,0.18336,0.12530,6.00000,0.05416,0.03520,A.1.025,1,,,,0.09192,0.50584,0.04546,,,eur,-2.26329,Prismo,,0_epic,A.1.025.0,


In [242]:
# Check categories in final
print("Gender:\n", final['Gender'].value_counts())
print("PTSDpm:\n", final['Ptsdpm'].value_counts())
print("PTSDlife:\n", final['Ptsdlife'].value_counts())

Gender:
 1    717
Name: Gender, dtype: int64
PTSDpm:
 0.00000    564
1.00000    128
Name: Ptsdpm, dtype: int64
PTSDlife:
 0.00000    285
1.00000     61
Name: Ptsdlife, dtype: int64


In [243]:
# now check na in the combined data
final.isnull().sum()

Age               1
Avoidance       204
Basename          0
Bcell             0
Cd4T              0
Cd8T              0
Childhood_Mt     40
Comp.2            0
Comp.3            0
Ewas_Id           0
Gender            0
Hyperarousal    203
Intrusion       202
Mdd             463
Mono              0
Neu               0
Nk                0
Pts_Severity    201
Ptsdpm           25
Race             10
Smos              0
Study             0
Traumanum        67
Visit             0
Visitkey          0
Ptsdlife        371
dtype: int64

In [244]:
# Now check if we have all the samples in pheno and methylation files:
# count the methylation columns whose name is exactly one of the pheno
# Basenames (exact match, not a regex/substring search over the ids)
dfs_merged.columns.isin(final['Basename']).sum()

717

In [245]:
def check_all_match(first, second):
    """
    Function to check if all the samples in methylation and phenotype match
    Parameters: 
    first: Elements to search (pandas Series or Index of sample ids)
    second: Elements to search in (any collection of sample ids)

    Prints the number of matching samples when every element of first
    is present in second.
    Raises ValueError when at least one element of first is missing.
    """
    # Exact membership test. Joining the ids into one regular expression
    # would match substrings, treat regex metacharacters in ids as syntax,
    # and match everything when `second` is empty.
    found = first.isin(list(second))
    num_match = int(found.sum())
    if not found.all():
        raise ValueError('All are not matching')
    print("All samples match between pheno and methylation: ", num_match)


In [246]:
check_all_match(first = final['Basename'], 
               second = dfs_merged.columns)

All samples match between pheno and methylation:  717


In [247]:
# Now save the data
# Create the directory and assign timestamp folder

import os, datetime

def make_directory(maindir = None, verbose = None):
    """
    Function to create a time-stamped directory below the current working
    directory.

    Parameters: 
    maindir : name of the main directory (inside the current working
              directory) that will hold the new time-stamped directory;
              None (default) puts the time-stamped directory directly in
              the current working directory
    verbose : print the created path when truthy, default None

    output: path of the newly created directory
    Raises ValueError when maindir is True or False
    """

#     os.chdir('..') # go one step back to the current dir

    if isinstance(maindir, bool):
        raise ValueError("dirname can't be True or False")

    stamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

    # <cwd>/<stamp> or <cwd>/<maindir>/<stamp>
    path_parts = [os.getcwd()]
    if maindir is not None:
        path_parts.append(maindir)
    mydir = os.path.join(*path_parts, stamp)

    os.makedirs(mydir)

    if verbose:
        print("Directory created:", mydir)

    return mydir
      


In [248]:
# change to directory and make folder
# os.chdir("G:/PGC ML/")
# mydir = make_directory(maindir="Pre_Processed Data",  verbose=True)

# use directory that is already created
mydir = "G:/PGC ML/Pre_Processed Data/2021-11-15_21-41-53"

In [249]:
# Save Phenotype file
final.to_csv(os.path.join(mydir, "Pre_Post_MRS_ArmyS_Prismo_Pheno.csv"),
            index=False)

In [250]:

def save_data(fname, df, dirpath = None):
    """
    Function to save the data
    Parameters:
    fname: file name; the extension selects the format (.csv or .feather)
    df: data frame
    dirpath: directory to write into; None (default) uses the notebook-level
             `mydir` output directory

    Raises ValueError for any other file extension, so a mistyped name
    cannot silently skip the save.
    """
    # Validate the extension first, before touching any path
    if not fname.endswith((".csv", ".feather")):
        raise ValueError("Unsupported file extension (use .csv or .feather): " + fname)

    out_dir = mydir if dirpath is None else dirpath
    target = os.path.join(out_dir, fname)

    if fname.endswith(".csv"):
        # the row index is written as the first column
        df.to_csv(target)
    else:
        feather.write_feather(df, target)
        

In [251]:
# Save individual datasets
# pheno_f_names = ["DNHS_UnqRESP_Pheno_final.csv", "GTP_Pheno_final.csv",
#           "MRS_POST_DEP_Pheno_final.csv", "ArmyStarrs_visit2_pheno.csv",
#           "Prismo_visit2_pheno.csv"]

# individual_cohorts = [dnhs_final, gtp_final, phenos_sub['mrs'],
#                   phenos_sub['armystarrs'], phenos_sub['prismo']]

In [252]:
# for i in range(len(pheno_f_names)):
#     save_data(fname=pheno_f_names[i], df = individual_cohorts[i])
#     print(pheno_f_names[i])
    

In [253]:
# Save individual methylation data 
# meth_f_names = ["DNHS_methylation_unq.feather", "GTP_methylation.feather",
#                "MRS_methylation_post.feather",
#                 "ArmyStarrs_visit2_methylation.feather",
#                "Prismo_visit2_methylation.feather"]
# for i in range(len(meth_f_names)):
#     save_data(fname=meth_f_names[i], df = all_meth_dfs[i])
#     print(meth_f_names[i])

In [257]:
# Save combined methylation data
# dfs_merged is the inner join of the MRS, Army STARRS and PRISMO tables
# (all_meth_dfs), one 'CpGs' column plus one column per sample.
# NOTE(review): the file name mentions only ArmyS and Prismo although MRS
# samples are included (the phenotype file is named Pre_Post_MRS_ArmyS_...);
# confirm before renaming, downstream notebooks may read this exact name.
feather.write_feather(dfs_merged, os.path.join(mydir, "Pre_Post_ArmyS_Prismo_methylation.feather"))

In [258]:
# Total number of columns that are matching 
# Without rowname column
dfs_merged.columns.isin(final["Basename"]).sum()

717

In [None]:
# end