## Notebook to do any additional setup on the instance and pull input data for analysis

In [1]:
# print the current date/time so the saved output records when this run happened
!date

Tue Apr 27 21:45:19 UTC 2021


In [1]:
import os

In [33]:
# parameter variables
# `cohort` is the combined cohort name, `cohorts` are the individual cohorts;
# both are used below to build the per-cohort local and bucket paths
cohort = 'amppd'
cohorts = ['biofind', 'pdbp', 'ppmi']
amp_version = 'amppdv1'

# directories for initial setup
home_dir = '/home/jupyter'
nbs_dir = f'{home_dir}/notebooks'
gwas_dir = f'{home_dir}/gwas'

# input data (all bucket paths hang off the same eqtl prefix)
eqtl_bucket_path = 'gs://ppmi-wb-analysis/eqtl'
nbs_bucket_path = f'{eqtl_bucket_path}/notebooks'
gwas_bucket_path = f'{eqtl_bucket_path}/gwas'
quants_bucket_path = f'{eqtl_bucket_path}/amppd/expression/matrix.genes.tsv'
gencode_bucket_path = f'{eqtl_bucket_path}/amppd/expression/gencode.v29.primary_assembly.annotation.gtf.gz'
tissue_genes_bckt_path = f'{eqtl_bucket_path}/amppd/expression/blood.elevated.features_targets.csv'

# local copies of the gencode annotation: the downloaded gtf and the pickle
# that the gtf2csv conversion cell writes
lcl_gencode_gtf = f'{home_dir}/{cohort}/expression/gencode.v29.primary_assembly.annotation.gtf.gz'
lcl_gencode_pkl = f'{home_dir}/{cohort}/expression/gencode.v29.primary_assembly.annotation.pkl'

# constants
# number of CPUs this process may use (the affinity count that `nproc` reports
# by default), computed in Python rather than by shelling out and parsing output
try:
    max_procs = len(os.sched_getaffinity(0))
except AttributeError:
    # platforms without sched_getaffinity: fall back to the total CPU count
    max_procs = os.cpu_count() or 1

In [35]:
# build the local directory tree: one folder per cohort, each holding the
# sample_info, expression and genotypes sub-folders that later cells copy into
cohort_subdirs = ('sample_info', 'expression', 'genotypes')
for this_cohort in [cohort, *cohorts]:
    for subdir in cohort_subdirs:
        # makedirs also creates the parent cohort folder when it is missing
        os.makedirs(f'{home_dir}/{this_cohort}/{subdir}', exist_ok=True)

# shared folder for the gwas inputs
os.makedirs(gwas_dir, exist_ok=True)

#### pull down the rest of the analysis notebooks

In [8]:
this_cmd = f'gsutil -mq cp -n -P -r {nbs_bucket_path}/* {nbs_dir}/'
print(this_cmd)
!{this_cmd}
!ls -lh {nbs_dir}

gsutil -mq cp -n -P -r gs://ppmi-wb-analysis/eqtl/notebooks/* /home/jupyter/notebooks/
total 436K
-rw-r--r-- 1 jupyter jupyter 7.4K Apr 27 20:01 Readme.md
drwxr-xr-x 2 jupyter jupyter 4.0K Apr 27 21:47 amppd
-rw-r--r-- 1 jupyter jupyter 384K Apr 27 19:57 compare_day_indep_results.ipynb
-rw-r--r-- 1 jupyter jupyter 5.4K Apr 27 19:57 finish_and_push_back.ipynb
drwxr-xr-x 2 jupyter jupyter 4.0K Apr 27 21:47 foundin
-rw-r--r-- 1 jupyter jupyter  25K Apr 27 21:47 setup_and_data_pull.ipynb


#### mirror down input data from the input buckets

In [10]:
# pull genotypes to local disk
for this_cohort in cohorts:
    genos_bucket_path = f'{eqtl_bucket_path}/{this_cohort}/genotypes'
    geno_dir = f'{home_dir}/{this_cohort}/genotypes'
    this_cmd = f'gsutil -mq cp -P {genos_bucket_path}/* {geno_dir}/'
    print(this_cmd)
    !{this_cmd}
#     !ls -lh {geno_dir}

gsutil -mq cp -P gs://ppmi-wb-analysis/eqtl/biofind/genotypes/* /home/jupyter/biofind/genotypes/
gsutil -mq cp -P gs://ppmi-wb-analysis/eqtl/pdbp/genotypes/* /home/jupyter/pdbp/genotypes/
gsutil -mq cp -P gs://ppmi-wb-analysis/eqtl/ppmi/genotypes/* /home/jupyter/ppmi/genotypes/


#### pull expression to local disk

In [11]:
this_cmd = f'gsutil -mq cp -P {quants_bucket_path} {home_dir}/{cohort}/expression/'
print(this_cmd)
!{this_cmd}

gsutil -mq cp -P gs://ppmi-wb-analysis/eqtl/amppd/expression/matrix.genes.tsv /home/jupyter/amppd/expression/


#### pull gencode annotation file and the blood elevated genes file

In [32]:
this_cmd = f'gsutil -mq cp -P {gencode_bucket_path} {home_dir}/{cohort}/expression/'
print(this_cmd)
!{this_cmd}

this_cmd = f'gsutil -mq cp -P {tissue_genes_bckt_path} {home_dir}/{cohort}/expression/'
print(this_cmd)
!{this_cmd}

!ls -lh {home_dir}/{cohort}/expression/

gsutil -mq cp -P gs://ppmi-wb-analysis/eqtl/amppd/expression/gencode.v29.primary_assembly.annotation.gtf.gz /home/jupyter/amppd/expression/
gsutil -mq cp -P gs://ppmi-wb-analysis/eqtl/amppd/expression/blood.elevated.features_targets.csv /home/jupyter/amppd/expression/
total 20G
-rw-r--r-- 1 jupyter jupyter  95M Apr 28 03:40 BF.SVM0_5T1.genes.csv
-rw-r--r-- 1 jupyter jupyter  95M Apr 28 03:44 BF.all_visits.genes.hdf5
-rw-r--r-- 1 jupyter jupyter 637M Apr 28 03:40 PD.BLM0T1.genes.csv
-rw-r--r-- 1 jupyter jupyter 268M Apr 28 03:40 PD.SVM12T1.genes.csv
-rw-r--r-- 1 jupyter jupyter 222M Apr 28 03:40 PD.SVM18T1.genes.csv
-rw-r--r-- 1 jupyter jupyter 207M Apr 28 03:40 PD.SVM24T1.genes.csv
-rw-r--r-- 1 jupyter jupyter 252M Apr 28 03:40 PD.SVM6T1.genes.csv
-rw-r--r-- 1 jupyter jupyter 1.6G Apr 28 03:44 PD.all_visits.genes.hdf5
-rw-r--r-- 1 jupyter jupyter 675M Apr 28 03:40 PP.BLM0T1.genes.csv
-rw-r--r-- 1 jupyter jupyter 391M Apr 28 03:40 PP.SVM12T1.genes.csv
-rw-r--r-- 1 jupyter jupyter 372M A

#### pull down the sample info

In [16]:
for this_cohort in [cohort] + cohorts:
    info_bucket_path = f'{eqtl_bucket_path}/{this_cohort}/sample_info'
    info_dir = f'{home_dir}/{this_cohort}/sample_info'
    this_cmd = f'gsutil -mq cp -P {info_bucket_path}/* {info_dir}/'
    print(this_cmd)
    !{this_cmd}
    !ls -lh {info_dir}

gsutil -mq cp -P gs://ppmi-wb-analysis/eqtl/amppd/sample_info/* /home/jupyter/amppd/sample_info/
total 2.5M
-rw-r--r-- 1 jupyter jupyter 896K Apr 27 22:12 amppd.wb.pred_cell_fracs.csv
-rw-r--r-- 1 jupyter jupyter 856K Apr 27 22:12 amppd_demographicsPlus_2019_v1release_1015.csv
-rw-r--r-- 1 jupyter jupyter 789K Apr 27 22:12 rna_quality_metrics.csv
gsutil -mq cp -P gs://ppmi-wb-analysis/eqtl/biofind/sample_info/* /home/jupyter/biofind/sample_info/
total 48K
-rw-r--r-- 1 jupyter jupyter  160 Apr 27 22:12 biofind.freeze9.pca.eigenval
-rw-r--r-- 1 jupyter jupyter  38K Apr 27 22:12 biofind.freeze9.pca.eigenvec
-rw-r--r-- 1 jupyter jupyter 1.2K Apr 27 22:12 biofind.freeze9.pca.log
gsutil -mq cp -P gs://ppmi-wb-analysis/eqtl/pdbp/sample_info/* /home/jupyter/pdbp/sample_info/
total 368K
-rw-r--r-- 1 jupyter jupyter  159 Apr 27 22:13 pdbp.freeze9.pca.eigenval
-rw-r--r-- 1 jupyter jupyter 358K Apr 27 22:13 pdbp.freeze9.pca.eigenvec
-rw-r--r-- 1 jupyter jupyter 1.2K Apr 27 22:13 pdbp.freeze9.pca.l

#### pull down the gwas data

In [36]:
this_cmd = f'gsutil -mq cp -P {gwas_bucket_path}/* {gwas_dir}/'
print(this_cmd)
!{this_cmd}
!ls -lh {gwas_dir}

gsutil -mq cp -P gs://ppmi-wb-analysis/eqtl/gwas/* /home/jupyter/gwas/
total 825M
-rw-r--r-- 1 jupyter jupyter 6.9K Apr 29 02:30 pd.table_s2.clean.txt
-rw-r--r-- 1 jupyter jupyter  15K Apr 29 02:30 pd_meta5v2_cojo_results.jma.cojo.csv
-rw-r--r-- 1 jupyter jupyter 825M Apr 29 02:30 pdmeta_sumstats_hg38.h5


#### add plink2

In [17]:
!wget http://s3.amazonaws.com/plink2-assets/alpha2/plink2_linux_x86_64.zip -O /tmp/plink_linux_x86_64.zip
!unzip /tmp/plink_linux_x86_64.zip -d /tmp/plink
!rm -f /tmp/plink_linux_x86_64.zip
!sudo mv /tmp/plink/plink2 /usr/local/bin/
!rm -rf /tmp/plink

--2021-04-27 22:13:06--  http://s3.amazonaws.com/plink2-assets/alpha2/plink2_linux_x86_64.zip
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.146.221
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.146.221|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8879151 (8.5M) [application/zip]
Saving to: ‘/tmp/plink_linux_x86_64.zip’


2021-04-27 22:13:06 (28.4 MB/s) - ‘/tmp/plink_linux_x86_64.zip’ saved [8879151/8879151]

Archive:  /tmp/plink_linux_x86_64.zip
  inflating: /tmp/plink/plink2       


In [18]:
# sanity check: running plink2 with no arguments prints its version banner and usage
!plink2

PLINK v2.00a2.3LM 64-bit Intel (24 Jan 2020)   www.cog-genomics.org/plink/2.0/
(C) 2005-2020 Shaun Purcell, Christopher Chang   GNU General Public License v3

  plink2 <input flag(s)...> [command flag(s)...] [other flag(s)...]
  plink2 --help [flag name(s)...]

Commands include --rm-dup list, --make-bpgen, --export, --freq, --geno-counts,
--sample-counts, --missing, --hardy, --indep-pairwise, --ld, --sample-diff,
--make-king, --king-cutoff, --write-samples, --write-snplist, --make-grm-list,
--pca, --glm, --adjust-file, --score, --variant-score, --genotyping-rate,
--pgen-info, --validate, and --zst-decompress.

"plink2 --help | more" describes all functions.


#### add plink1.9

In [19]:
!wget http://s3.amazonaws.com/plink1-assets/plink_linux_x86_64_20201019.zip -O /tmp/plink_linux_x86_64_20201019.zip
!unzip /tmp/plink_linux_x86_64_20201019.zip -d /tmp/plink
!rm -f /tmp/plink_linux_x86_64_20201019.zip
!sudo mv /tmp/plink/plink /usr/local/bin/
!rm -rf /tmp/plink

--2021-04-27 22:13:14--  http://s3.amazonaws.com/plink1-assets/plink_linux_x86_64_20201019.zip
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.17.91
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.17.91|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8916695 (8.5M) [application/zip]
Saving to: ‘/tmp/plink_linux_x86_64_20201019.zip’


2021-04-27 22:13:14 (30.0 MB/s) - ‘/tmp/plink_linux_x86_64_20201019.zip’ saved [8916695/8916695]

Archive:  /tmp/plink_linux_x86_64_20201019.zip
  inflating: /tmp/plink/plink        
  inflating: /tmp/plink/LICENSE      
  inflating: /tmp/plink/toy.ped      
  inflating: /tmp/plink/toy.map      
  inflating: /tmp/plink/prettify     


In [20]:
# sanity check: running plink with no arguments prints its version banner and usage
!plink

PLINK v1.90b6.21 64-bit (19 Oct 2020)          www.cog-genomics.org/plink/1.9/
(C) 2005-2020 Shaun Purcell, Christopher Chang   GNU General Public License v3

  plink <input flag(s)...> [command flag(s)...] [other flag(s)...]
  plink --help [flag name(s)...]

Commands include --make-bed, --recode, --flip-scan, --merge-list,
--write-snplist, --list-duplicate-vars, --freqx, --missing, --test-mishap,
--hardy, --mendel, --ibc, --impute-sex, --indep-pairphase, --r2, --show-tags,
--blocks, --distance, --genome, --homozyg, --make-rel, --make-grm-gz,
--rel-cutoff, --cluster, --pca, --neighbour, --ibs-test, --regress-distance,
--model, --bd, --gxe, --logistic, --dosage, --lasso, --test-missing,
--make-perm-pheno, --tdt, --qfam, --annotate, --clump, --gene-report,
--meta-analysis, --epistasis, --fast-epistasis, and --score.



#### install umap, ppscore and the other Python packages (dask, tables, plotly-express)

In [3]:
!pip install -U --quiet umap-learn ppscore dask[complete] tables plotly-express

In [4]:
import umap
import ppscore
import dask.dataframe as dd

#### install tensorQTL
there is an older version on PyPI, but a couple of the bug fixes are needed, so install from GitHub

In [5]:
# required by tensorQTL
!pip install -U --quiet pandas-plink

In [25]:
# installing from github works, but need to pull the code to comment out some of the logging
# so it doesn't spill 1000s of lines of output into the notebook
# pip install -U --quiet git+https://github.com/broadinstitute/tensorqtl.git

Note: you may need to restart the kernel to use updated packages.


In [None]:
### NOTE: had some permission problems, used jupyterlab to clone the git

# !git clone --quiet git@github.com:broadinstitute/tensorqtl.git
# os.chdir(f'{nbs_dir}/tensorqtl')
# !pip install -r install/requirements.txt

#### if you want to use the Storey q-value call with tensorQTL, R needs to be added

In [None]:
# sudo apt-get --quiet install -y r-base build-essential libcurl4-gnutls-dev \
# libxml2-dev libssl-dev

In [None]:
#sudo -i R
# install.packages("devtools")
# library("devtools")
# install_github("jdstorey/qvalue")

In [26]:
import tensorqtl



#### convert the gencode gtf to easier to read format
the [gtf2csv](https://github.com/zyxue/gtf2csv) tool needs to be installed

In [27]:
!pip install -U --quiet git+https://github.com/zyxue/gtf2csv.git#egg=gtf2csv

#### convert gene annots gtf to pickle

In [30]:
this_cmd = f'gtf2csv --gtf {lcl_gencode_gtf} -m pkl -t {max_procs} -o {lcl_gencode_pkl}'

print(this_cmd)
!{this_cmd}

gtf2csv --gtf /home/jupyter/amppd/expression/gencode.v29.primary_assembly.annotation.gtf.gz -m pkl -t 16 -o /home/jupyter/amppd/expression/gencode.v29.primary_assembly.annotation.pkl
2021-04-28 03:30:52,794|INFO|will use 16 CPUs for parallel processing
2021-04-28 03:30:52,794|INFO|reading /home/jupyter/amppd/expression/gencode.v29.primary_assembly.annotation.gtf.gz ...
2021-04-28 03:31:05,268|INFO|time spent on read_gtf: 12.47s
2021-04-28 03:31:05,268|INFO|1st pass of gtf to obtain multiplicity tags ...
2021-04-28 03:31:10,620|INFO|multiplicity tags found: {'tag', 'ont'}
2021-04-28 03:31:10,748|INFO|time spent on get_multiplicity_tags: 5.48s
2021-04-28 03:31:10,748|INFO|2nd pass of gtf to classify multiplicity tags into low- and high-cardinality tags ...
2021-04-28 03:31:24,751|INFO|2 low-cardinality tags found: [('tag', 63), ('ont', 6)]; 0 high-cardinality tags found: []
2021-04-28 03:31:24,752|INFO|time spent on classify_multiplicity_tags: 14.00s
2021-04-28 03:31:24,752|INFO|3rd pass

#### install bcftools

In [41]:
!sudo apt-get --quiet install -y libbz2-dev liblzma-dev libtool-bin libncurses-dev

samtools_version = '1.12'
# htslib
!wget --quiet https://github.com/samtools/htslib/releases/download/{samtools_version}/htslib-{samtools_version}.tar.bz2 -O {home_dir}/htslib-{samtools_version}.tar.bz2
!tar -xf {home_dir}/htslib-{samtools_version}.tar.bz2 -C {home_dir}
# !cd {tools_dir}/htslib-{samtools_version}
os.chdir(f'{home_dir}/htslib-{samtools_version}')
!./configure --quiet --prefix=/usr/local
!make --quiet
!sudo make --quiet install
!rm {home_dir}/htslib-{samtools_version}.tar.bz2
# also need bcftools for subsetting vcf
!wget --quiet https://github.com/samtools/bcftools/releases/download/{samtools_version}/bcftools-{samtools_version}.tar.bz2 -O {home_dir}/bcftools-{samtools_version}.tar.bz2
!tar -xf /{home_dir}/bcftools-{samtools_version}.tar.bz2 -C {home_dir}
# !cd {tools_dir}/bcftools-{samtools_version}
os.chdir(f'{home_dir}/bcftools-{samtools_version}')
!./configure --quiet --prefix=/usr/local
!make --quiet {home_dir}/
!sudo make --quiet install
!rm {home_dir}/bcftools-{samtools_version}.tar.bz2

Reading package lists...
Building dependency tree...
Reading state information...
The following additional packages will be installed:
  automake autotools-dev bzip2-doc libltdl-dev libsigsegv2 libtool m4
Suggested packages:
  autoconf-archive gnu-standards autoconf-doc gettext libtool-doc liblzma-doc
  ncurses-doc gfortran | fortran95-compiler gcj-jdk m4-doc
The following NEW packages will be installed:
  autoconf automake autotools-dev bzip2-doc libbz2-dev libltdl-dev liblzma-dev
  libncurses-dev libsigsegv2 libtool libtool-bin m4
0 upgraded, 12 newly installed, 0 to remove and 7 not upgraded.
Need to get 3632 kB of archives.
After this operation, 10.9 MB of additional disk space will be used.
Get:1 http://deb.debian.org/debian buster/main amd64 libsigsegv2 amd64 2.12-2 [32.8 kB]
Get:2 http://deb.debian.org/debian buster/main amd64 m4 amd64 1.4.18-2 [203 kB]
Get:3 http://deb.debian.org/debian buster/main amd64 autoconf all 2.69-11 [341 kB]
Get:4 http://deb.debian.org/debian buster/ma