In [62]:
import pandas as pd
import sys
import os

sys.path.insert(0, "../src/")
from Decomposed_matrices import Decomposed_matrices
from Genomic_bins import Genomic_bins
from great import read_great_res_wrapper

### Load Data Matrices

In [63]:
data_dir = '../private_data'

genomic_bins = Genomic_bins(os.path.join(data_dir, 'loci_def.bed'))
decomposed_mats = Decomposed_matrices(os.path.join(data_dir, 'diagonalScore.csv.gz'), os.path.join(data_dir, 'uScore.csv.gz'), os.path.join(data_dir, 'vScore.csv.gz'))

### Output from SNP2CHIP on SNP Catalog

Load results form SNP2ChIP.

In [4]:
input_dir = '../src/snp_scan.csv'
input_err_dir = '../src/snp_scan_err.csv'

snps_output_df = pd.read_csv(input_dir, sep=',', header=None, names=['CHR','LOC','K','PCs','SCORES','PHENOTYPE','TYPE','INTERGENIC'])
snps_error_df = pd.read_csv(input_err_dir, sep=',', header=None, names=['CHR','LOC','K','PHENOTYPE','TYPE','INTERGENIC'])

In [56]:
print(int(100*snps_output_df.shape[0]/snps_error_df.shape[0]),"% SNPs passed through snp2chip")
snps_output_df.head()

21 % SNPs passed through snp2chip


Unnamed: 0,CHR,LOC,K,PCs,SCORES,PHENOTYPE,TYPE,INTERGENIC
2,6,32441753,10,[ 7 1 0 8 21 5 2 3 11 65],[0.11328269 0.11149748 0.08087866 0.07007339 0...,Systemic sclerosis,intron_variant,0.0
3,6,33075103,10,[180 175 174 176 178 181 163 156 152 153],[0.17119542 0.1259751 0.11103297 0.03144724 0...,Systemic sclerosis,intron_variant,0.0
6,6,31125810,10,[ 0 25 5 8 39 20 129 16 64 145],[0.37018713 0.06244947 0.05597641 0.05212154 0...,Drug-induced Stevens-Johnson syndrome or toxic...,5_prime_UTR_variant,0.0
20,6,26092913,10,[ 0 135 17 13 1 23 60 20 15 3],[0.08444188 0.07176824 0.06359578 0.05624773 0...,Hepcidin levels,missense_variant,0.0
21,6,26092913,10,[ 0 135 17 13 1 23 60 20 15 3],[0.08444188 0.07176824 0.06359578 0.05624773 0...,Hepcidin levels,missense_variant,0.0


### Get Top Counts of each Phenotype

In [45]:
phen_df = snps_output_df.groupby('PHENOTYPE').size().to_frame().sort_values(0, ascending=False).rename(columns={0:'reg_count'})
phen_err_df = snps_error_df.groupby('PHENOTYPE').size().to_frame().sort_values(0, ascending=False).rename(columns={0:'err_count'})
full_phenotype_df = phen_err_df.join(phen_df)

full_phenotype_df.head()

Unnamed: 0_level_0,err_count,reg_count
PHENOTYPE,Unnamed: 1_level_1,Unnamed: 2_level_1
Blood protein levels,2012,545.0
Body mass index,904,142.0
Post bronchodilator FEV1/FVC ratio,878,173.0
Post bronchodilator FEV1,729,151.0
Schizophrenia,611,119.0


### Compute Fraction of Phenotype SNP Coverage in SNPs2ChIP

The ``frac`` indicates the number of SNPs that SNPs2ChIP currently covers. We filter for phenotypes sorted by the highest fraction coverage above 25 catalogued and reported in SNPs2ChIP. 

In [46]:
full_phenotype_df['frac'] = full_phenotype_df.apply(lambda x: x['reg_count'] / x.sum(), axis=1)
full_phenotype_df = full_phenotype_df.sort_values('frac', ascending=False)
full_phenotype_df.head()

Unnamed: 0_level_0,err_count,reg_count,frac
PHENOTYPE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ovarian cancer in BRCA1 mutation carriers,1,5.0,0.833333
Skin colour saturation,1,4.0,0.8
Myositis,1,4.0,0.8
IgG bisecting N-acetyl glucosamine phenotypes (multivariate analysis),1,3.0,0.75
IgG disialylation phenotypes (multivariate analysis),1,3.0,0.75


In [80]:
phenotypes_of_interest = full_phenotype_df[full_phenotype_df['reg_count'] > 25].head(15).index.values

In [97]:
def get_top_pcs(phenotype):
    """
    Takes in a phenotype from GWAS catalogue and returns the top pcs and scores. 
    """
    print('Phenotype: ', phenotype)
    chrs = snps_output_df[snps_output_df['PHENOTYPE'] == phenotype]['CHR'].values
    snps = snps_output_df[snps_output_df['PHENOTYPE'] == phenotype]['LOC'].values
    
    pcs, scores = decomposed_mats.find_pcs_given_loci_list(genomic_bins.find_loci_given_snps(list(chrs), list(snps)))
    print("Top PCs: ", pcs)
    return pcs, scores

def get_enrichment(pc):
    return read_great_res_wrapper(data_dir, pc, 'HumanPhenotypeOntology', topk=10) 

### Investigate Enrichment of Each Phenotype's Top PCs

In [98]:
get_enrichment(get_top_pcs(phenotypes_of_interest[0])[0][0])

Phenotype:  IgG glycosylation patterns
Top PCs:  [22 19 24 16 11]


Unnamed: 0,# ID,Desc,BFold,BPval,BFDR
0,HP:0001660,Truncus arteriosus,4.970477,4.786967e-09,1.4e-05
1,HP:0001103,Abnormality of the macula,2.034814,1.388952e-07,0.000207
4,HP:0000575,Scotoma,3.536168,3.819037e-06,0.002278
5,HP:0004712,Renal malrotation,3.931226,5.979286e-06,0.002973
8,HP:0003003,Colon cancer,4.436912,1.255658e-05,0.004162
9,HP:0004935,Pulmonary artery atresia,7.65531,1.395981e-05,0.004164
13,HP:0001669,Transposition of the great arteries,2.443429,2.499534e-05,0.005326
15,HP:0000074,Ureteropelvic junction obstruction,4.935518,5.145462e-05,0.009593
17,HP:0002619,Varicose veins,5.339305,6.695722e-05,0.011096
19,HP:0008496,Multiple rows of eyelashes,4.091756,0.0001150177,0.017155


In [114]:
get_enrichment(get_top_pcs(phenotypes_of_interest[1])[0][0])

Phenotype:  Vitiligo
Top PCs:  [  2   0   3 280   1]


Unnamed: 0,# ID,Desc,BFold,BPval,BFDR
8,HP:0012103,Abnormality of the mitochondrion,2.430178,1.981297e-08,7e-06
11,HP:0003287,Abnormality of mitochondrial metabolism,2.397021,5.547277e-08,1.4e-05
16,HP:0010972,Anemia of inadequate production,2.084019,1.467984e-07,2.6e-05
20,HP:0200042,Skin ulcer,2.02101,2.149489e-07,3.1e-05
22,HP:0001581,Recurrent skin infections,2.644626,3.26783e-07,4.2e-05
23,HP:0002665,Lymphoma,2.074043,3.357733e-07,4.2e-05
27,HP:0005406,Recurrent bacterial skin infections,4.113836,4.352149e-07,4.6e-05
37,HP:0006429,Broad femoral neck,7.213434,2.049863e-06,0.000161
38,HP:0002722,Recurrent abscess formation,3.547911,2.050315e-06,0.000157
43,HP:0001733,Pancreatitis,2.46457,3.059544e-06,0.000207


In [100]:
get_enrichment(get_top_pcs(phenotypes_of_interest[2])[0][0])

Phenotype:  Autism spectrum disorder or schizophrenia
Top PCs:  [ 1  0 37  5  2]


Unnamed: 0,# ID,Desc,BFold,BPval,BFDR
0,HP:0004395,Malnutrition,5.321346,5e-06,0.013516
1,HP:0001718,Mitral stenosis,3.783898,6.3e-05,0.093634
2,HP:0001413,Micronodular cirrhosis,2.821075,0.00012,0.118908
5,HP:0004333,Bone-marrow foam cells,6.657703,0.000345,0.171644
6,HP:0003548,Subsarcolemmal accumulations of abnormally sha...,6.137533,0.000527,0.224788
8,HP:0001414,Microvesicular hepatic steatosis,3.131675,0.000632,0.209512
12,HP:0004975,Erlenmeyer flask deformity of the femurs,3.197454,0.001488,0.341373
13,HP:0002725,Systemic lupus erythematosus,3.788696,0.001533,0.326633
17,HP:0005938,Abnormal respiratory motile cilium morphology,3.267208,0.002178,0.360958
21,HP:0001618,Dysphonia,2.208404,0.003317,0.449819


In [101]:
get_enrichment(get_top_pcs(phenotypes_of_interest[3])[0][0])

Phenotype:  Monocyte count
Top PCs:  [0 1 3 2 7]


Unnamed: 0,# ID,Desc,BFold,BPval,BFDR
3,HP:0012140,Abnormality of cells of the lymphoid lineage,2.315639,3.637427e-07,0.000271
5,HP:0001888,Lymphopenia,2.382784,5.56784e-07,0.000277
11,HP:0001878,Hemolytic anemia,2.008804,1.152956e-06,0.000287
13,HP:0002917,Hypomagnesemia,3.452539,3.079205e-06,0.000656
20,HP:0004921,Abnormality of magnesium homeostasis,2.814905,5.371737e-05,0.00763
22,HP:0000121,Nephrocalcinosis,2.448755,8.832524e-05,0.011455
24,HP:0200114,Metabolic alkalosis,4.508408,0.0001073928,0.012814
25,HP:0002643,Neonatal respiratory distress,3.743395,0.0001307868,0.015005
27,HP:0000360,Tinnitus,2.679365,0.0001513635,0.016126
28,HP:0001281,Tetany,2.937869,0.0001785555,0.018367


In [102]:
get_enrichment(get_top_pcs(phenotypes_of_interest[4])[0][0])

Phenotype:  IgG glycosylation
Top PCs:  [ 2  0  5  1 27]


Unnamed: 0,# ID,Desc,BFold,BPval,BFDR
8,HP:0012103,Abnormality of the mitochondrion,2.430178,1.981297e-08,7e-06
11,HP:0003287,Abnormality of mitochondrial metabolism,2.397021,5.547277e-08,1.4e-05
16,HP:0010972,Anemia of inadequate production,2.084019,1.467984e-07,2.6e-05
20,HP:0200042,Skin ulcer,2.02101,2.149489e-07,3.1e-05
22,HP:0001581,Recurrent skin infections,2.644626,3.26783e-07,4.2e-05
23,HP:0002665,Lymphoma,2.074043,3.357733e-07,4.2e-05
27,HP:0005406,Recurrent bacterial skin infections,4.113836,4.352149e-07,4.6e-05
37,HP:0006429,Broad femoral neck,7.213434,2.049863e-06,0.000161
38,HP:0002722,Recurrent abscess formation,3.547911,2.050315e-06,0.000157
43,HP:0001733,Pancreatitis,2.46457,3.059544e-06,0.000207


In [103]:
get_enrichment(get_top_pcs(phenotypes_of_interest[5])[0][0])

Phenotype:  Reticulocyte count
Top PCs:  [0 1 3 2 5]


Unnamed: 0,# ID,Desc,BFold,BPval,BFDR
3,HP:0012140,Abnormality of cells of the lymphoid lineage,2.315639,3.637427e-07,0.000271
5,HP:0001888,Lymphopenia,2.382784,5.56784e-07,0.000277
11,HP:0001878,Hemolytic anemia,2.008804,1.152956e-06,0.000287
13,HP:0002917,Hypomagnesemia,3.452539,3.079205e-06,0.000656
20,HP:0004921,Abnormality of magnesium homeostasis,2.814905,5.371737e-05,0.00763
22,HP:0000121,Nephrocalcinosis,2.448755,8.832524e-05,0.011455
24,HP:0200114,Metabolic alkalosis,4.508408,0.0001073928,0.012814
25,HP:0002643,Neonatal respiratory distress,3.743395,0.0001307868,0.015005
27,HP:0000360,Tinnitus,2.679365,0.0001513635,0.016126
28,HP:0001281,Tetany,2.937869,0.0001785555,0.018367


In [104]:
get_enrichment(get_top_pcs(phenotypes_of_interest[6])[0][0])

Phenotype:  Psoriasis
Top PCs:  [ 1  2 37 94  5]


Unnamed: 0,# ID,Desc,BFold,BPval,BFDR
0,HP:0004395,Malnutrition,5.321346,5e-06,0.013516
1,HP:0001718,Mitral stenosis,3.783898,6.3e-05,0.093634
2,HP:0001413,Micronodular cirrhosis,2.821075,0.00012,0.118908
5,HP:0004333,Bone-marrow foam cells,6.657703,0.000345,0.171644
6,HP:0003548,Subsarcolemmal accumulations of abnormally sha...,6.137533,0.000527,0.224788
8,HP:0001414,Microvesicular hepatic steatosis,3.131675,0.000632,0.209512
12,HP:0004975,Erlenmeyer flask deformity of the femurs,3.197454,0.001488,0.341373
13,HP:0002725,Systemic lupus erythematosus,3.788696,0.001533,0.326633
17,HP:0005938,Abnormal respiratory motile cilium morphology,3.267208,0.002178,0.360958
21,HP:0001618,Dysphonia,2.208404,0.003317,0.449819


In [105]:
get_enrichment(get_top_pcs(phenotypes_of_interest[7])[0][0])

Phenotype:  Eosinophil percentage of granulocytes
Top PCs:  [0 1 3 2 9]


Unnamed: 0,# ID,Desc,BFold,BPval,BFDR
3,HP:0012140,Abnormality of cells of the lymphoid lineage,2.315639,3.637427e-07,0.000271
5,HP:0001888,Lymphopenia,2.382784,5.56784e-07,0.000277
11,HP:0001878,Hemolytic anemia,2.008804,1.152956e-06,0.000287
13,HP:0002917,Hypomagnesemia,3.452539,3.079205e-06,0.000656
20,HP:0004921,Abnormality of magnesium homeostasis,2.814905,5.371737e-05,0.00763
22,HP:0000121,Nephrocalcinosis,2.448755,8.832524e-05,0.011455
24,HP:0200114,Metabolic alkalosis,4.508408,0.0001073928,0.012814
25,HP:0002643,Neonatal respiratory distress,3.743395,0.0001307868,0.015005
27,HP:0000360,Tinnitus,2.679365,0.0001513635,0.016126
28,HP:0001281,Tetany,2.937869,0.0001785555,0.018367


In [106]:
get_enrichment(get_top_pcs(phenotypes_of_interest[8])[0][0])

Phenotype:  Lymphocyte percentage of white cells
Top PCs:  [2 3 1 0 5]


Unnamed: 0,# ID,Desc,BFold,BPval,BFDR
8,HP:0012103,Abnormality of the mitochondrion,2.430178,1.981297e-08,7e-06
11,HP:0003287,Abnormality of mitochondrial metabolism,2.397021,5.547277e-08,1.4e-05
16,HP:0010972,Anemia of inadequate production,2.084019,1.467984e-07,2.6e-05
20,HP:0200042,Skin ulcer,2.02101,2.149489e-07,3.1e-05
22,HP:0001581,Recurrent skin infections,2.644626,3.26783e-07,4.2e-05
23,HP:0002665,Lymphoma,2.074043,3.357733e-07,4.2e-05
27,HP:0005406,Recurrent bacterial skin infections,4.113836,4.352149e-07,4.6e-05
37,HP:0006429,Broad femoral neck,7.213434,2.049863e-06,0.000161
38,HP:0002722,Recurrent abscess formation,3.547911,2.050315e-06,0.000157
43,HP:0001733,Pancreatitis,2.46457,3.059544e-06,0.000207


In [107]:
get_enrichment(get_top_pcs(phenotypes_of_interest[9])[0][0])

Phenotype:  Systemic lupus erythematosus
Top PCs:  [0 1 3 2 5]


Unnamed: 0,# ID,Desc,BFold,BPval,BFDR
3,HP:0012140,Abnormality of cells of the lymphoid lineage,2.315639,3.637427e-07,0.000271
5,HP:0001888,Lymphopenia,2.382784,5.56784e-07,0.000277
11,HP:0001878,Hemolytic anemia,2.008804,1.152956e-06,0.000287
13,HP:0002917,Hypomagnesemia,3.452539,3.079205e-06,0.000656
20,HP:0004921,Abnormality of magnesium homeostasis,2.814905,5.371737e-05,0.00763
22,HP:0000121,Nephrocalcinosis,2.448755,8.832524e-05,0.011455
24,HP:0200114,Metabolic alkalosis,4.508408,0.0001073928,0.012814
25,HP:0002643,Neonatal respiratory distress,3.743395,0.0001307868,0.015005
27,HP:0000360,Tinnitus,2.679365,0.0001513635,0.016126
28,HP:0001281,Tetany,2.937869,0.0001785555,0.018367


In [108]:
get_enrichment(get_top_pcs(phenotypes_of_interest[10])[0][0])

Phenotype:  Monocyte percentage of white cells
Top PCs:  [ 1  0  3 17  5]


Unnamed: 0,# ID,Desc,BFold,BPval,BFDR
0,HP:0004395,Malnutrition,5.321346,5e-06,0.013516
1,HP:0001718,Mitral stenosis,3.783898,6.3e-05,0.093634
2,HP:0001413,Micronodular cirrhosis,2.821075,0.00012,0.118908
5,HP:0004333,Bone-marrow foam cells,6.657703,0.000345,0.171644
6,HP:0003548,Subsarcolemmal accumulations of abnormally sha...,6.137533,0.000527,0.224788
8,HP:0001414,Microvesicular hepatic steatosis,3.131675,0.000632,0.209512
12,HP:0004975,Erlenmeyer flask deformity of the femurs,3.197454,0.001488,0.341373
13,HP:0002725,Systemic lupus erythematosus,3.788696,0.001533,0.326633
17,HP:0005938,Abnormal respiratory motile cilium morphology,3.267208,0.002178,0.360958
21,HP:0001618,Dysphonia,2.208404,0.003317,0.449819


In [109]:
get_enrichment(get_top_pcs(phenotypes_of_interest[11])[0][0])

Phenotype:  Reticulocyte fraction of red cells
Top PCs:  [0 3 1 2 5]


Unnamed: 0,# ID,Desc,BFold,BPval,BFDR
3,HP:0012140,Abnormality of cells of the lymphoid lineage,2.315639,3.637427e-07,0.000271
5,HP:0001888,Lymphopenia,2.382784,5.56784e-07,0.000277
11,HP:0001878,Hemolytic anemia,2.008804,1.152956e-06,0.000287
13,HP:0002917,Hypomagnesemia,3.452539,3.079205e-06,0.000656
20,HP:0004921,Abnormality of magnesium homeostasis,2.814905,5.371737e-05,0.00763
22,HP:0000121,Nephrocalcinosis,2.448755,8.832524e-05,0.011455
24,HP:0200114,Metabolic alkalosis,4.508408,0.0001073928,0.012814
25,HP:0002643,Neonatal respiratory distress,3.743395,0.0001307868,0.015005
27,HP:0000360,Tinnitus,2.679365,0.0001513635,0.016126
28,HP:0001281,Tetany,2.937869,0.0001785555,0.018367


In [110]:
get_enrichment(get_top_pcs(phenotypes_of_interest[12])[0][0])

Phenotype:  Eosinophil percentage of white cells
Top PCs:  [1 0 3 2 5]


Unnamed: 0,# ID,Desc,BFold,BPval,BFDR
0,HP:0004395,Malnutrition,5.321346,5e-06,0.013516
1,HP:0001718,Mitral stenosis,3.783898,6.3e-05,0.093634
2,HP:0001413,Micronodular cirrhosis,2.821075,0.00012,0.118908
5,HP:0004333,Bone-marrow foam cells,6.657703,0.000345,0.171644
6,HP:0003548,Subsarcolemmal accumulations of abnormally sha...,6.137533,0.000527,0.224788
8,HP:0001414,Microvesicular hepatic steatosis,3.131675,0.000632,0.209512
12,HP:0004975,Erlenmeyer flask deformity of the femurs,3.197454,0.001488,0.341373
13,HP:0002725,Systemic lupus erythematosus,3.788696,0.001533,0.326633
17,HP:0005938,Abnormal respiratory motile cilium morphology,3.267208,0.002178,0.360958
21,HP:0001618,Dysphonia,2.208404,0.003317,0.449819


In [111]:
get_enrichment(get_top_pcs(phenotypes_of_interest[13])[0][0])

Phenotype:  Granulocyte percentage of myeloid white cells
Top PCs:  [ 0  3  1 14  2]


Unnamed: 0,# ID,Desc,BFold,BPval,BFDR
3,HP:0012140,Abnormality of cells of the lymphoid lineage,2.315639,3.637427e-07,0.000271
5,HP:0001888,Lymphopenia,2.382784,5.56784e-07,0.000277
11,HP:0001878,Hemolytic anemia,2.008804,1.152956e-06,0.000287
13,HP:0002917,Hypomagnesemia,3.452539,3.079205e-06,0.000656
20,HP:0004921,Abnormality of magnesium homeostasis,2.814905,5.371737e-05,0.00763
22,HP:0000121,Nephrocalcinosis,2.448755,8.832524e-05,0.011455
24,HP:0200114,Metabolic alkalosis,4.508408,0.0001073928,0.012814
25,HP:0002643,Neonatal respiratory distress,3.743395,0.0001307868,0.015005
27,HP:0000360,Tinnitus,2.679365,0.0001513635,0.016126
28,HP:0001281,Tetany,2.937869,0.0001785555,0.018367


In [112]:
get_enrichment(get_top_pcs(phenotypes_of_interest[14])[0][0])

Phenotype:  Platelet count
Top PCs:  [0 1 3 2 9]


Unnamed: 0,# ID,Desc,BFold,BPval,BFDR
3,HP:0012140,Abnormality of cells of the lymphoid lineage,2.315639,3.637427e-07,0.000271
5,HP:0001888,Lymphopenia,2.382784,5.56784e-07,0.000277
11,HP:0001878,Hemolytic anemia,2.008804,1.152956e-06,0.000287
13,HP:0002917,Hypomagnesemia,3.452539,3.079205e-06,0.000656
20,HP:0004921,Abnormality of magnesium homeostasis,2.814905,5.371737e-05,0.00763
22,HP:0000121,Nephrocalcinosis,2.448755,8.832524e-05,0.011455
24,HP:0200114,Metabolic alkalosis,4.508408,0.0001073928,0.012814
25,HP:0002643,Neonatal respiratory distress,3.743395,0.0001307868,0.015005
27,HP:0000360,Tinnitus,2.679365,0.0001513635,0.016126
28,HP:0001281,Tetany,2.937869,0.0001785555,0.018367
