In [29]:
import pandas as pd
from collections import Counter
import numpy as np
import os

In [30]:
def prepare_targets(y,groups):
    class1 = groups.split('_')[0]
    class2 = groups.split('_')[1]
    count_dict = Counter(y)
    class1_count = count_dict[class1]
    class2_count = count_dict[class2]
    ## Label minority class = 1 and majority class = 0
    if  class1_count > class2_count:
        count_dict[class1] = int(0)
        count_dict[class2] = int(1)
    else:
        count_dict[class1] = int(1)
        count_dict[class2] = int(0)

    op = [count_dict[i] for i in y]
    return np.asarray(op)

def gwas_feats_expand(df,groups,cat_cols=[]): #This takes the dataframe and returns the one hot encoded expansion of input features
    target = prepare_targets(list(df.DIAG),groups)
    PTID = df.PTID
    df1 = df.drop(columns=['PTID','DIAG','Unnamed: 0']).reset_index(drop=True) #Patient ID and DIAG not needed
    num_cols = list(set(df1.columns) - set(cat_cols)) #Numerical features
    expand_cat = num_cols #Placeholder List for expanded columns
    for cat in cat_cols:
        expand_cat = expand_cat + [str(cat)+'_'+ str(c) for c in list(set(df1[cat]))]
    df_out = pd.DataFrame(columns=list(expand_cat))
    for col in num_cols:
        df_out[col] = df1[col]
    for i in range(len(df1)):
        row = df1.iloc[i]
        for col in cat_cols:
            item = row[col]
            df_out.at[i,str(col)+'_'+ str(item)] = str(1)
        
    df_out = df_out.fillna(str(0))
    return PTID, df_out, target.ravel()

In [31]:
RESULTS = 'data' #Change this path accordingly
GWAS_results_path = '/mnt/gpfs2_16m/pscratch/nja224_uksr/SKH259/LinLab/ADNI_Genetics/GWAS_Gene_Expr/data/Features_ranked_for_CN_AD_1000_prune.csv'
GWAS_data_path = '/mnt/gpfs2_16m/pscratch/nja224_uksr/SKH259/LinLab/ADNI_Genetics/GWAS_Gene_Expr/data/final_1000_GWAS12_data_Dx_bl.csv'

GeneExpr_results_path = '/mnt/gpfs2_16m/pscratch/nja224_uksr/SKH259/LinLab/ADNI_Genetics/GWAS_Gene_Expr/data/Features_ranked_for_CN_AD_400_prune.csv'
GeneExpr_data_path = '/mnt/gpfs2_16m/pscratch/nja224_uksr/SKH259/LinLab/ADNI_Genetics/GWAS_Gene_Expr/data/Unfiltered_gene_expr_dx.csv'

results_path = '/mnt/gpfs2_16m/pscratch/nja224_uksr/SKH259/LinLab/ADNI_Genetics/GWAS_Gene_Expr/'+RESULTS 


In [32]:
imp_gwas_features = list(pd.read_csv(GWAS_results_path,low_memory=False)['features'])
imp_gwas_snps = ['PTID']+['_'.join(a.split('_')[0:2]) for a in imp_gwas_features if 'GENDER' not in a] + ['DIAG']

include_gender = [a for a in imp_gwas_features if 'GENDER' in a]

if len(include_gender)>0:
    imp_gwas_snps = imp_gwas_snps + ['GENDER']

df_gwas = pd.read_csv(GWAS_data_path,na_values=["00"],low_memory=False)
df_gwas = df_gwas[imp_gwas_snps]


imp_expr_features = ['Unnamed: 0']+list(pd.read_csv(GeneExpr_results_path,low_memory=False)['features']) + ['DX_bl']
df_expr = pd.read_csv(GeneExpr_data_path)[imp_expr_features]
common_subjects = set(df_gwas['PTID']).intersection(set(df_expr['Unnamed: 0']))


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [33]:
GWAS_data_final = df_gwas[pd.DataFrame(df_gwas.PTID.tolist()).isin(common_subjects).any(1).values]
GWAS_data_final.dropna(inplace=True)
print('Common GWAS data shape')
print(GWAS_data_final.shape)


Common GWAS data shape
(217, 155)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  GWAS_data_final.dropna(inplace=True)


In [34]:
GWAS_data_final.columns

Index(['PTID', '19_rs2075650', 'AGE', '5_rs10941091', '17_rs1029742',
       '10_rs1538424', '16_rs4949150', '5_rs966857', '8_rs10505551',
       '2_rs1065381',
       ...
       '5_rs7720806', '2_rs4241135', '1_rs2067606', '15_rs4432234',
       '5_rs973153', '1_rs3737741', '3_rs12488144', '9_rs7018577', '5_rs34900',
       'DIAG'],
      dtype='object', length=155)

In [35]:
GWAS_data_final.to_csv(os.path.join(results_path,'common_gwas.csv'))

In [36]:
GWAS_data_final = pd.read_csv(os.path.join(results_path,'common_gwas.csv'))
cat_cols = [col for col in GWAS_data_final.columns if 'rs' in col]
gwas_ptid, gwas_final, gwas_y = gwas_feats_expand(GWAS_data_final,'CN_AD',cat_cols)
gwas_final['PTID'] = gwas_ptid
gwas_final['DIAG'] = gwas_y 


In [37]:
non_rs_cols = [col for col in gwas_final.columns if 'rs' not in col]

In [38]:
rs_cols = [col for col in gwas_final.columns if 'rs' in col]

In [39]:
imp_rs_cols = [col for col in rs_cols if col in imp_gwas_features]

In [40]:
overall_cols = list(set(non_rs_cols + imp_rs_cols))
len(overall_cols)

155

In [41]:
gwas_final = gwas_final[overall_cols]
gwas_final.to_csv(os.path.join(results_path,'common_gwas_expanded.csv'))

In [42]:
gwas_final.head()

Unnamed: 0,7_rs12670401_CC,3_rs33491_TT,1_rs1887628_AG,9_rs10114675_GG,15_rs12915188_CC,5_rs973153_GG,11_rs2280544_TT,2_rs12710671_GG,17_rs6501384_CC,6_rs3800043_CT,...,1_rs1267305_GG,4_rs11722689_CT,PTID,12_rs10879839_TT,1_rs1328180_TT,22_rs2235573_AA,4_rs4645287_TT,9_rs16911006_AA,10_rs10752030_TT,9_rs7033668_GG
0,1,1,0,0,1,0,0,0,0,0,...,1,0,018_S_0055,1,0,0,0,0,0,0
1,0,1,1,1,0,1,0,1,1,1,...,0,0,027_S_0118,1,0,0,0,0,0,1
2,0,0,1,1,0,1,0,0,1,1,...,1,0,098_S_0171,1,0,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,1,0,011_S_0023,1,0,0,1,0,0,0
4,0,1,0,1,1,0,0,0,0,0,...,1,0,128_S_0522,1,1,0,0,0,1,0


In [43]:
len(set(gwas_final.PTID))

217

In [44]:
common_subjects_final = list(GWAS_data_final.PTID)
Gene_expr_final = df_expr[pd.DataFrame(df_expr['Unnamed: 0'].tolist()).isin(common_subjects_final).any(1).values]
cols = Gene_expr_final.columns
num_cols = list(cols[1:])
Gene_expr_final.columns = ['PTID']+list(cols[1:])
print('Common Gene Expression data shape')
print(Gene_expr_final.shape)
Gene_expr_final.head()

Common Gene Expression data shape
(217, 65)


Unnamed: 0,PTID,11720732_a_at_SUMF1,AGE,11728631_a_at_LYSMD1,11733482_a_at_DYRK3,11730765_at_CD177,11746336_a_at_CNOT8,11762503_at_11762535_at,11757581_x_at_MT1X,11760223_at_nan,...,11752515_a_at_ODC1,11750828_a_at_HIP1R,11739338_at_FAM46C,11743153_at_C16ORF72,11721863_a_at_SLC37A3,11722183_s_at_ATP6V1C1,11730498_s_at_MXI1,11724194_s_at_SERF2,11715258_s_at_PTMS,DX_bl
0,116_S_1249,3.332,70.8,7.756,2.6,2.685,6.569,2.222,6.997,6.721,...,8.499,3.732,8.294,8.516,4.453,8.006,10.759,9.369,6.478,CN
1,037_S_4410,3.313,69.1,7.437,3.594,2.871,6.932,2.414,6.924,6.619,...,8.261,3.363,8.015,8.583,3.832,8.219,10.511,9.314,6.072,CN
2,006_S_4153,3.481,79.3,7.934,3.053,3.387,6.534,2.213,6.661,7.47,...,8.294,3.438,7.96,8.048,3.577,7.811,10.397,9.389,6.242,AD
3,116_S_1232,3.871,72.1,7.667,3.407,2.574,6.357,2.51,7.094,7.25,...,8.859,3.219,10.158,7.79,3.171,7.705,11.126,9.838,6.764,CN
8,036_S_4491,3.732,84.1,8.11,2.676,5.755,6.361,2.646,7.282,6.212,...,8.538,4.002,7.932,7.675,2.534,7.322,10.589,9.858,7.534,CN


In [45]:
expr_target = prepare_targets(list(Gene_expr_final.DX_bl),'CN_AD')

In [46]:
Gene_expr_final['DX_bl'] = list(expr_target)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Gene_expr_final['DX_bl'] = list(expr_target)


In [47]:
Gene_expr_final.head()

Unnamed: 0,PTID,11720732_a_at_SUMF1,AGE,11728631_a_at_LYSMD1,11733482_a_at_DYRK3,11730765_at_CD177,11746336_a_at_CNOT8,11762503_at_11762535_at,11757581_x_at_MT1X,11760223_at_nan,...,11752515_a_at_ODC1,11750828_a_at_HIP1R,11739338_at_FAM46C,11743153_at_C16ORF72,11721863_a_at_SLC37A3,11722183_s_at_ATP6V1C1,11730498_s_at_MXI1,11724194_s_at_SERF2,11715258_s_at_PTMS,DX_bl
0,116_S_1249,3.332,70.8,7.756,2.6,2.685,6.569,2.222,6.997,6.721,...,8.499,3.732,8.294,8.516,4.453,8.006,10.759,9.369,6.478,0
1,037_S_4410,3.313,69.1,7.437,3.594,2.871,6.932,2.414,6.924,6.619,...,8.261,3.363,8.015,8.583,3.832,8.219,10.511,9.314,6.072,0
2,006_S_4153,3.481,79.3,7.934,3.053,3.387,6.534,2.213,6.661,7.47,...,8.294,3.438,7.96,8.048,3.577,7.811,10.397,9.389,6.242,1
3,116_S_1232,3.871,72.1,7.667,3.407,2.574,6.357,2.51,7.094,7.25,...,8.859,3.219,10.158,7.79,3.171,7.705,11.126,9.838,6.764,0
8,036_S_4491,3.732,84.1,8.11,2.676,5.755,6.361,2.646,7.282,6.212,...,8.538,4.002,7.932,7.675,2.534,7.322,10.589,9.858,7.534,0


In [48]:
Gene_expr_final = Gene_expr_final.rename(columns={"DX_bl": "DIAG"})

In [49]:
Gene_expr_final.columns

Index(['PTID', '11720732_a_at_SUMF1', 'AGE', '11728631_a_at_LYSMD1',
       '11733482_a_at_DYRK3', '11730765_at_CD177', '11746336_a_at_CNOT8',
       '11762503_at_11762535_at', '11757581_x_at_MT1X', '11760223_at_nan',
       '11747640_a_at_TDP2', '11715149_x_at_HIST2H2AB', '11717994_a_at_NR4A1',
       '11724431_at_C3ORF52', '11716702_a_at_SEMG1', '11727103_at_IGF2BP1',
       '11739849_x_at_UPK3BL || POLR2J2 || POLR2J3', '11742943_a_at_CKM',
       '11750143_a_at_PIGZ', '11757988_s_at_PRUNE2', '11753445_a_at_HMOX1',
       '11750256_a_at_RSF1', '11726385_a_at_MT1X', '11750558_s_at_ZNF271',
       '11741967_a_at_CDYL', '11761978_at_ARHGEF12',
       '11764161_a_at_11764193_at', '11721929_x_at_TMCC2', '11743992_at_UBE2O',
       '11719477_a_at_FASTKD5', '11756127_a_at_PGM1', '11724061_s_at_OSBPL1A',
       '11716563_s_at_AIMP2', '11726316_at_SELE', '11724751_at_MRTO4',
       '11763538_x_at_PRR5', '11724458_at_HCN4', '11746393_x_at_INTS6',
       '11718871_x_at_PALMD', '11717646_a_at_MC

In [50]:
Gene_expr_final.to_csv(os.path.join(results_path,'common_geneExpr.csv'))

In [51]:
gwas_final.head()

Unnamed: 0,7_rs12670401_CC,3_rs33491_TT,1_rs1887628_AG,9_rs10114675_GG,15_rs12915188_CC,5_rs973153_GG,11_rs2280544_TT,2_rs12710671_GG,17_rs6501384_CC,6_rs3800043_CT,...,1_rs1267305_GG,4_rs11722689_CT,PTID,12_rs10879839_TT,1_rs1328180_TT,22_rs2235573_AA,4_rs4645287_TT,9_rs16911006_AA,10_rs10752030_TT,9_rs7033668_GG
0,1,1,0,0,1,0,0,0,0,0,...,1,0,018_S_0055,1,0,0,0,0,0,0
1,0,1,1,1,0,1,0,1,1,1,...,0,0,027_S_0118,1,0,0,0,0,0,1
2,0,0,1,1,0,1,0,0,1,1,...,1,0,098_S_0171,1,0,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,1,0,011_S_0023,1,0,0,1,0,0,0
4,0,1,0,1,1,0,0,0,0,0,...,1,0,128_S_0522,1,1,0,0,0,1,0


In [52]:
GWAS_GeneExpr_df = pd.merge(gwas_final,Gene_expr_final,how='left', on=['PTID','AGE','DIAG']) #By inspecting at the columns of best features for GWAS and Gene Expression
print('Label Distribution of common data')
print(Counter(GWAS_GeneExpr_df.DIAG))
GWAS_GeneExpr_df.head()

Label Distribution of common data
Counter({0: 193, 1: 24})


Unnamed: 0,7_rs12670401_CC,3_rs33491_TT,1_rs1887628_AG,9_rs10114675_GG,15_rs12915188_CC,5_rs973153_GG,11_rs2280544_TT,2_rs12710671_GG,17_rs6501384_CC,6_rs3800043_CT,...,11727808_s_at_FECH,11752515_a_at_ODC1,11750828_a_at_HIP1R,11739338_at_FAM46C,11743153_at_C16ORF72,11721863_a_at_SLC37A3,11722183_s_at_ATP6V1C1,11730498_s_at_MXI1,11724194_s_at_SERF2,11715258_s_at_PTMS
0,1,1,0,0,1,0,0,0,0,0,...,8.399,8.321,3.636,9.277,8.468,5.461,7.823,10.819,9.411,6.374
1,0,1,1,1,0,1,0,1,1,1,...,7.522,7.312,3.697,7.946,7.591,2.776,8.091,10.384,9.85,6.94
2,0,0,1,1,0,1,0,0,1,1,...,6.747,7.942,3.511,7.409,7.659,2.69,6.948,10.44,9.86,6.604
3,0,0,0,0,0,0,0,0,0,0,...,11.234,9.629,3.758,11.485,7.835,3.152,7.75,12.033,10.382,7.426
4,0,1,0,1,1,0,0,0,0,0,...,7.791,8.342,3.637,8.499,8.439,3.993,7.966,10.612,9.439,6.426


In [53]:
GWAS_GeneExpr_df.to_csv(os.path.join(results_path,'common_combined_expanded.csv'))

In [54]:
print(Counter(GWAS_GeneExpr_df.DIAG))
print(Counter(Gene_expr_final.DIAG))
print(Counter(gwas_final.DIAG))

Counter({0: 193, 1: 24})
Counter({0: 193, 1: 24})
Counter({0: 193, 1: 24})


In [61]:
expr_df = pd.read_csv('/mnt/gpfs2_16m/pscratch/nja224_uksr/SKH259/LinLab/ADNI_Genetics/GWAS_Gene_Expr/data/common_geneExpr.csv').drop(columns=['Unnamed: 0']).reset_index(drop=True) 
GWAS_df = pd.read_csv('/mnt/gpfs2_16m/pscratch/nja224_uksr/SKH259/LinLab/ADNI_Genetics/GWAS_Gene_Expr/data/common_gwas_expanded.csv').drop(columns=['Unnamed: 0']).reset_index(drop=True) 
combined_df = pd.read_csv('/mnt/gpfs2_16m/pscratch/nja224_uksr/SKH259/LinLab/ADNI_Genetics/GWAS_Gene_Expr/data/common_combined_expanded.csv').drop(columns=['Unnamed: 0']).reset_index(drop=True) 
