In [None]:
# Run this cell if you are running this on google collab:
%cd /content/
!git clone https://github.com/mennowitteveen/pgsbenchmark.git
#!git -C ./pgsbenchmark/ checkout dev
%cd pgsbenchmark/nbs
!ls
!pip install pysnptools -q -q

In [1]:
# Run this cell if you are running this via the "binder" github link:
!pip install pysnptools 
!pip install matplotlib

[31mERROR: Invalid requirement: 'pysnptools,'[0m


# Imports

In [2]:
%config Completer.use_jedi = False
########################################################
## Base Imports:

# Sys Imports:
import time, sys, os

# Standard Imports:
import numpy as np
import scipy as sp
import pandas as pd
# import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats, linalg

#########################################################
## Experiment Specific Imports

# Basic Imports:
import inspect, glob, re, contextlib, pickle, functools #,submitit #pyreadr
from tqdm.auto import tqdm
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
# from mjwt.tools import jobinfo, corr, implot, sizegb, psrc, beep, Timer, Struct as mStruct
from scipy.stats import pearsonr, spearmanr

# Genomics Imports:
import pysnptools as pst
from pysnptools.snpreader import Bed, Pheno, SnpHdf5, SnpData
from pysnptools.pstreader import PstData, PstHdf5, PstReader

# PGSBenchmark Code:
%run ../pgsbenchmark/loaders.py
%run ../pgsbenchmark/scores.py
%run ../pgsbenchmark/tools.py

########################################################
## Configuration & Initialisation

# Display Configuration:
from IPython.display import display #, HTML, Audio, Javascript
try:
    from IPython.display import set_matplotlib_formats
except ImportError:
    # set_matplotlib_formats has been deprecated in IPython.display since IPython 7.23 in favour of
    # matplotlib_inline; fall back to that location when IPython no longer exports the name.
    from matplotlib_inline.backend_inline import set_matplotlib_formats
plt.rcParams['figure.figsize'] = [10, 5]
# Use the full option name: abbreviated keys rely on pandas' pattern matching of option names.
pd.set_option('display.max_colwidth', 200)
# pd.set_option('display.max_colwidth', None) # No pd truncation (radical)
# display(HTML("<style>.container { width:75% !important; }</style>"))
# pd.reset_option('all')

# Initializations & Extensions:
# Timer is presumably provided by the `%run ../pgsbenchmark/tools.py` line above -- TODO confirm.
timer = Timer()
toc = timer.toc
tic = timer.tic
tic('')  # first tic call, with an empty label
log = np.log10  # NOTE: `log` is base-10 throughout this notebook
# True when no `__file__` name is defined in this namespace.
notebook = '__file__' not in locals()
# Non-essential extension for development: skip it quietly when it cannot be loaded.
# Only Exception is suppressed, so KeyboardInterrupt / SystemExit still propagate.
with contextlib.suppress(Exception):
    get_ipython().run_line_magic('load_ext', 'line_profiler')

# Experiment Setup

In [80]:
# Experiment inputs: genotype data, phenotypes, true effects and region definitions.
dtype = 'float64'
# data_dn = '/home/jovyan/proj/data/'
data_dn = '../pgsbenchmark/data/'
bed_fn = data_dn + 'mini/mini.bed'  # plink .bed file shared by the SNP reader and the bim/fam loader

# SNP reader (srd) plus pandas DataFrames of the .bim & .fam files (= plink files):
srd = Bed(bed_fn, count_A1=True)
bim_df, fam_df = load_bimfam(bed_fn)

# Phenotypes, indexed by (FID, IID):
pheno_df = pd.read_csv(data_dn + 'mini/pheno_df.csv').set_index(['FID', 'IID'])
# Genetic effects (aka. the true weights), indexed by snp id:
effects_df = pd.read_csv(data_dn + 'mini/effects_df.csv').set_index(['snp'])
# Region definitions (tab-separated):
regdef_df = pd.read_csv(data_dn + 'defs/regdef/regions_1blk_shift=0.regdef.tsv', sep='\t')

# Basic preprocessing into forms that are accepted by the code below.
# Phenotype object: the (FID, IID) index is turned into a string array for the iid field.
iid_arr = pheno_df.index.to_frame().astype(str).values.astype(dtype=str)
prd = Pheno({'header': pheno_df.columns, 'vals': pheno_df.values, 'iid': iid_arr})

# PGS weight object:
snpids = effects_df.index.values.astype(str)  # snp ids, which are rs ids in this case
pgsname = effects_df.columns                  # name of each effect
pgsweights = effects_df.values.T              # in this experiment the pgs weights equal the true genetic weights
brd = PstData(row=pgsname, col=snpids, val=pgsweights)
# Author's note: wrapping SNPs, phenotypes and PGS weights in objects (srd, prd, brd) is useful because all
# these types are implicit. When data is small this does not add much, but when data is large this becomes
# very beneficial.

In [None]:
# %run ../pgsbenchmark/loaders.py
# %run ../pgsbenchmark/scores.py
# %run ../pgsbenchmark/tools.py

# Basic Functionality

In [73]:
## Individual-level (= normal) approach for performance estimation:

# Build the multi-PGS computer and make predictions. Per the original note, supplying
# prd (= phenotypes) makes predict also compute the summary statistics.
mpc = MultiPGSComputer(brd=brd, dtype=dtype, verbose=False)
multi_dt = mpc.predict(srd=srd, prd=prd)

# Performance: correlate each phenotype with its prediction, square it, and lay the
# result out as a single-row DataFrame (row label 0).
pheno_pred_corr = pheno_df.corrwith(multi_dt['Yhat'])
indr2_df = pheno_pred_corr.pow(2).to_frame().T


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




In [74]:
## Privacy Preserving Benchmark (PPB) approach:

# Init LinkageData.
# Mind! _cross_chrom_ld=True lets this object calculate "LD" between chromosomes.
# This allows the full D matrix to be computed (D=X'*X/N, see manuscript for more info).
linkdata = LinkageData(
    sst_df=bim_df,
    srd=srd,
    regdef_df=regdef_df,
    shift=10,
    dtype=dtype,
    _cross_chrom_ld=True,
)
# linkdata.retrieve_linkage_allregions() # Retrieve (= compute or load, depending on what's available) all LD for all regions.

ppmc = PrivacyPreservingMetricsComputer(
    linkdata=linkdata,     # <-- The LD information
    brd=brd,               # <-- The PGS prediction weights
    Bm=multi_dt['Bm'],     # <-- GWAS summary statistics, taken from the mpc.predict output
    clear_linkage=False,
    dtype=dtype,
)
ppbres_dt = ppmc.evaluate()  # Actually compute the privacy-preserving performance metrics
fullppbr2_df = ppbres_dt['ppbr2_df']
# fullppbr2_df contains R^2 of all PGS vs all traits (from GWAS sumstats);
# only the diagonal entries are kept, as a single-row DataFrame:
ppb_diag = fullppbr2_df.values.diagonal()
ppbr2_df = pd.DataFrame([ppb_diag], columns=fullppbr2_df.columns)

# Display results:
# Check that the PPB and individual-level R^2 are practically identical.
assert np.allclose(indr2_df, ppbr2_df)
print('The results (R^2) for the individual-level and privacy-preserving approach match up perfectly.')
# Put both results together into one pandas DataFrame, one row per approach.
r2_rows = {'indr2': indr2_df.loc[0], 'ppbr2': ppbr2_df.loc[0]}
r2_df = pd.DataFrame.from_dict(r2_rows, orient='index')
display(r2_df)



HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))


The results (R^2) for the individual-level and privacy-preserving approach match up perfectly.


Unnamed: 0,spike_slab,laplace,ridge
indr2,0.384197,0.401929,0.412574
ppbr2,0.384197,0.401929,0.412574


In [18]:
# The results matching up exactly is what one expects from the PPB math, and is thus reassuring.
# This will only be the case when the full covariance matrix D (= X'*X/n) is used.

# Small Experiment with LD window

In [121]:
## Privacy Preserving Benchmark (PPB) approach with an LD window:

# Init LinkageData, this time with cm=6.0 and without _cross_chrom_ld
# (presumably a 6 centimorgan LD window -- TODO confirm against LinkageData).
linkdata = LinkageData(
    sst_df=bim_df,
    srd=srd,
    regdef_df=regdef_df,
    cm=6.0,
    dtype=dtype,
)
ppmc = PrivacyPreservingMetricsComputer(
    linkdata=linkdata,     # <-- The LD information
    brd=brd,               # <-- The PGS prediction weights
    Bm=multi_dt['Bm'],     # <-- GWAS summary statistics, taken from the mpc.predict output
    clear_linkage=False,
    dtype=dtype,
)
ppbres_dt = ppmc.evaluate()
fullppbr2_df = ppbres_dt['ppbr2_df']
# Keep only the diagonal of the PGS-vs-trait R^2 table, as a single-row DataFrame:
ppb_diag = fullppbr2_df.values.diagonal()
ppbr2_df = pd.DataFrame([ppb_diag], columns=fullppbr2_df.columns)

# Display results:
print('The results (R^2) for the individual-level and privacy-preserving approach with a % difference. The difference is expected for this small experiment (n=503)')
# Put both results together into one pandas DataFrame, one row per approach.
r2_rows = {'indr2': indr2_df.loc[0], 'ppbr2': ppbr2_df.loc[0]}
r2_df = pd.DataFrame.from_dict(r2_rows, orient='index')
# Extra row: % difference of the PPB R^2 relative to the individual-level R^2.
ppb_over_ind = r2_df.loc['ppbr2'] / r2_df.loc['indr2']
r2_df.loc['%'] = (ppb_over_ind - 1) * 100
display(r2_df)



Retrieving Standard Dev. var 's'


HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))


The results (R^2) for the individual-level and privacy-preserving approach with a % difference. The difference is expected for this small experiment (n=503)


Unnamed: 0,spike_slab,laplace,ridge
indr2,0.384197,0.401929,0.412574
ppbr2,0.396791,0.393395,0.403902
%,3.278133,-2.123201,-2.101988


# Appendix

A code scratchpad

In [None]:
# p = linkdata.n_snps_total
# M = np.random.randn(p,p)

In [49]:
# # !pip freeze > /home/jovyan/proj/docker/jupyter-base-ds/requirements.txt
# !pip freeze | grep -v 'mennowitteveen' > ~/proj/docker/jupyter-base-ds/requirements.txt
# !git config --global user.email "" 
# !git config --global user.name  "Menno Witteveen"
# !git -C /home/jovyan/proj/docker/jupyter-base-ds/ add requirements.txt
# !git -C /home/jovyan/proj/docker/jupyter-base-ds/ status
# !git -C /home/jovyan/proj/docker/jupyter-base-ds/ commit -m "update req.txt"

# cd ~/proj/docker/jupyter-base-ds
# docker build --rm -t mennowitteveen/my-datascience-notebook-1 .


In [5]:
!pip freeze | grep -v mennowitteveen > ../requirements.txt