In [1]:
# imports
import pandas as pd
import json
import requests



In [2]:
# create an array of phenotypes
phenotypes = ['BMI', 'T2D', 'CHOL']

# print the list
print("the list of phenotypes is: {}".format(phenotypes))

the list of phenotypes is: ['BMI', 'T2D', 'CHOL']


In [3]:
# build the query string
# this API returns the magma data for all available genes available 
query_api = "http://public.type2diabeteskb.org/dccservices/testcalls/magma/gene/object?phenotype={}"

# create an empty data frame
master_data_frame = pd.DataFrame()

# for each variant, call the meta REST api and concatenate the data frames
for phe in phenotypes:
    # build the query and print
    phe_query_api = query_api.format(phe)
    print("the query for the phenotype: {} is: {}".format(phe, phe_query_api))
    
    # make the REST call and capture the http response
    req = requests.get(phe_query_api)
    
    # decode the http response into the json format
    req_json = req.json()
    
    # read the results into an individual pandas data frame, given it the 'data' section of the json
    phe_data_frame = pd.DataFrame(req_json['data'])
    
    # add the results to the master data frame (concatenate the results with the previous ones)
    master_data_frame = pd.concat([master_data_frame, phe_data_frame])
    
    # print the size of the master data frame
    print("after concatenation, the master data frame has a row count of: {}\n".format(master_data_frame.shape[0]))

the query for the phenotype: BMI is: http://public.type2diabeteskb.org/dccservices/testcalls/magma/gene/object?phenotype=BMI


after concatenation, the master data frame has a row count of: 18180

the query for the phenotype: T2D is: http://public.type2diabeteskb.org/dccservices/testcalls/magma/gene/object?phenotype=T2D


after concatenation, the master data frame has a row count of: 36729

the query for the phenotype: CHOL is: http://public.type2diabeteskb.org/dccservices/testcalls/magma/gene/object?phenotype=CHOL


after concatenation, the master data frame has a row count of: 55032



In [4]:
# show the top 10 rows of the combined data frame
print("the first 10 rows of the combined data frame")
print(master_data_frame.head(10))


the first 10 rows of the combined data frame
  ancestry      gene gene_ensemble_id  nsnps phenotype        pvalue    zstat
0    Mixed      A1BG  ENSG00000121410     33       BMI  7.101500e-05  3.80460
1    Mixed      NAT2  ENSG00000156006    176       BMI  3.351600e-02  1.83150
2    Mixed       ADA  ENSG00000196839     75       BMI  8.539200e-06  4.30000
3    Mixed      CDH2  ENSG00000170558    481       BMI  1.727800e-03  2.92400
4    Mixed      AKT3  ENSG00000117020    196       BMI  3.395200e-12  6.86200
5    Mixed      MED6  ENSG00000133997     97       BMI  3.142700e-02  1.86020
6    Mixed     NR2E3             None     67       BMI  7.232900e-07  4.81850
7    Mixed   NAALAD2  ENSG00000077616    148       BMI  1.570300e-05  4.16300
8    Mixed      DDTL  ENSG00000099974     28       BMI  1.737400e-01  0.93949
9    Mixed  NAALADL1  ENSG00000168060     56       BMI  5.975400e-04  3.24010


In [5]:
# trim the data frame to the p_value, std_err, beta, num_samples, phenotype and variant
filtered_data_frame = master_data_frame.filter(['gene', 'pvalue', 'zstat', 'phenotype'], axis=1)

# display the new data frame's first 15 rows
print(filtered_data_frame.head(15))


        gene        pvalue    zstat phenotype
0       A1BG  7.101500e-05  3.80460       BMI
1       NAT2  3.351600e-02  1.83150       BMI
2        ADA  8.539200e-06  4.30000       BMI
3       CDH2  1.727800e-03  2.92400       BMI
4       AKT3  3.395200e-12  6.86200       BMI
5       MED6  3.142700e-02  1.86020       BMI
6      NR2E3  7.232900e-07  4.81850       BMI
7    NAALAD2  1.570300e-05  4.16300       BMI
8       DDTL  1.737400e-01  0.93949       BMI
9   NAALADL1  5.975400e-04  3.24010       BMI
10  SIGLEC14  5.876800e-02  1.56520       BMI
11     ACOT8  1.455100e-02  2.18210       BMI
12      ABI1  9.514700e-02  1.30970       BMI
13    GNPDA1  6.398100e-01 -0.35794       BMI
14     KCNE3  1.940600e-02  2.06620       BMI


In [7]:
# sort the new data frame by ascending p_value (more significant first)
# this will not group the rows by gene
filtered_data_frame.sort_values('pvalue', inplace=True)

# show the first 20 rows of the newly sorted data frame
print(filtered_data_frame.head(20))

          gene         pvalue   zstat phenotype
14311   TCF7L2  2.333000e-209  30.856       T2D
12574    PVRL2  7.352100e-103  21.503      CHOL
7497      APOB   2.616400e-88  19.887      CHOL
820     TOMM40   3.630600e-85  19.521      CHOL
11101   CDKAL1   1.437800e-83  19.332       T2D
14789      FTO   2.729100e-78  18.695       BMI
1051   IGF2BP2   2.220900e-75  18.334       T2D
8702      LDLR   1.794900e-73  18.093      CHOL
16964   SEC16B   1.019600e-69  17.611       BMI
7827      APOE   9.171400e-69  17.486      CHOL
7644     APOC1   9.424200e-69  17.484      CHOL
14828     WFS1   2.756300e-65  17.023       T2D
5285      CBLC   1.119200e-64  16.941      CHOL
7468     APOA4   1.349000e-64  16.930      CHOL
16434    PSRC1   1.895700e-61  16.498      CHOL
3929    CELSR2   3.436700e-60  16.322      CHOL
12924     BDNF   4.382200e-60  16.307       BMI
7764     APOC3   3.885300e-59  16.173      CHOL
13278    THADA   2.840200e-58  16.050       T2D
13815  SMARCA4   1.303300e-57  15.956   

In [8]:
# reset the index
filtered_data_frame.reindex

# show the first 20 rows of the newly sorted data frame
print(filtered_data_frame.head(20))

          gene         pvalue   zstat phenotype
14311   TCF7L2  2.333000e-209  30.856       T2D
12574    PVRL2  7.352100e-103  21.503      CHOL
7497      APOB   2.616400e-88  19.887      CHOL
820     TOMM40   3.630600e-85  19.521      CHOL
11101   CDKAL1   1.437800e-83  19.332       T2D
14789      FTO   2.729100e-78  18.695       BMI
1051   IGF2BP2   2.220900e-75  18.334       T2D
8702      LDLR   1.794900e-73  18.093      CHOL
16964   SEC16B   1.019600e-69  17.611       BMI
7827      APOE   9.171400e-69  17.486      CHOL
7644     APOC1   9.424200e-69  17.484      CHOL
14828     WFS1   2.756300e-65  17.023       T2D
5285      CBLC   1.119200e-64  16.941      CHOL
7468     APOA4   1.349000e-64  16.930      CHOL
16434    PSRC1   1.895700e-61  16.498      CHOL
3929    CELSR2   3.436700e-60  16.322      CHOL
12924     BDNF   4.382200e-60  16.307       BMI
7764     APOC3   3.885300e-59  16.173      CHOL
13278    THADA   2.840200e-58  16.050       T2D
13815  SMARCA4   1.303300e-57  15.956   