In [94]:
from __future__ import print_function

import multiprocessing
import os
import sys
import subprocess

import itertools
import functools

import pandas as pd
import numpy as np
import argparse




In [95]:
# use the below if running script within anaconda:
# os.environ["PATH"] += os.pathsep + "/tools/bedtools/2.27.1/bin/"
# os.environ["PATH"] += os.pathsep + "/tools/htslib/1.6/bin/"

from pybedtools import BedTool 
# ^ some/all of pybedtools requires 'bedtools' to be available on your PATH /tools/bedtools/2.27.1/bin/bedtools
# BedTool(..).sort(): "sortBed" does not appear to be installed or on the path, so this method is disabled. Please install a more recent version of BEDTools and re-import to use this method.

In [96]:
import sys

# Put the project's ldsc source directory first on the module search path so that
# the local module imported below is found.
sys.path.insert(0, "/projects/timshel/sc-genetics/sc-genetics/src/ldsc")
# NOTE(review): the star import hides which names later cells depend on.
# read_multi_gene_set_file, multi_gene_sets_to_dict_of_beds and
# make_annot_file_per_chromosome are presumably provided by this module — confirm,
# and prefer importing them by name.
from make_annot_from_geneset_all_chr import *

In [97]:
# Enable IPython's autoreload extension so edits to imported project modules are
# picked up without restarting the kernel.
# NOTE(review): re-running this cell prints "The autoreload extension is already
# loaded" (see saved output); `%reload_ext autoreload` would avoid that message.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [98]:
import shlex

In [99]:
# Command-line interface used to drive the annotation pipeline from this notebook.
# The six path/size options below carry no default value, so they are marked
# required: a missing option now fails immediately with a usage error instead of
# propagating None into later cells (e.g. os.path.exists(None)).
parser = argparse.ArgumentParser()

# --- inputs ---
parser.add_argument('--file_multi_gene_set', type=str, required=True, help='a file of gene names, one line per gene.')
# NOTE(review): the help text mentions a default file, but no default is set here.
parser.add_argument('--file_gene_coord', type=str, required=True, help='a file with columns GENE, CHR, START, and END, where START and END are base pair coordinates of TSS and TES. This file can contain more genes than are in the gene set. We provide ENSG_coord.txt as a default.')
parser.add_argument('--windowsize', type=int, required=True, help='how many base pairs to add around the transcribed region to make the annotation? Finucane uses 100 kb.')
parser.add_argument('--bimfile_basename', type=str, required=True, help='plink bim BASENAME for the dataset you will use to compute LD scores. If argument is "1000G.EUR.QC", then the files "1000G.EUR.QC.1.bim", "1000G.EUR.QC.2.bim", ..., "1000G.EUR.QC.22.bim" will be loaded')

# --- outputs ---
parser.add_argument('--out_dir', type=str, required=True, help='output directory to write annot files. Relative or absolute. Dir will be created if it does not exist. ')
parser.add_argument('--out_prefix', type=str, required=True, help='Prefix for output files. Outputfiles will be <out_dir>/<out_prefix>.<name_annotation>.<chromosome>.annot.gz')

# --- behaviour flags (all default to False) ---
parser.add_argument('--flag_annot_file_per_geneset', action='store_true', help='set flag to write one annot file per gene set per chromosome. Default is to write a combined annot file per chromosome containing all gene sets. NB: the annotation files are always split per chromosome.')
parser.add_argument('--flag_encode_as_binary_annotation', action='store_true', help='set flag if LDSC annotations should be encoded as binary annotations {0,1}. The default is to use the continuous annotations, which require an appropriate field in the file_multi_gene_set')
parser.add_argument('--flag_wgcna', action='store_true', help='set flag if file_multi_gene_set input is from WGCNA pipeline')
parser.add_argument('--flag_mouse', action='store_true', help='set flag if file_multi_gene_set input has mouse genes (instead of human)')

# --- parallelism ---
parser.add_argument('--n_parallel_jobs', type=int, default=22, help='Number of processes. Default 22 (the number of chromosomes)')


_StoreAction(option_strings=['--n_parallel_jobs'], dest='n_parallel_jobs', nargs=None, const=None, default=22, type=<class 'int'>, choices=None, help='Number of processes. Default 22 (the number of chromosomes)', metavar=None)

In [100]:
# /projects/timshel/sc-genetics/sc-genetics/data/gene_lists/mludwig_thesis_hypothalamus_wgcna_modules.csv
### Test invocation WITH --flag_wgcna.
# The command line is assembled from one piece per option and joined with single
# spaces, giving a single-line argument string.
argString = " ".join([
    "--file_multi_gene_set /projects/timshel/sc-genetics/sc-genetics/src/ldsc/multi_geneset_files/test_file_multi_gene_set_wgcna200.csv",
    "--file_gene_coord /projects/timshel/sc-genetics/ldsc/data/gene_coords/gene_annotation.hsapiens_all_genes.GRCh37.ens_v91.LDSC_fmt.txt",
    "--windowsize 100000",
    "--bimfile_basename /projects/timshel/sc-genetics/ldsc/data/1000G_EUR_Phase3_plink/1000G.EUR.QC",
    "--out_dir /scratch/sc-ldsc/TMP_TESTING_CONTINUOUS",
    "--out_prefix hypothalamus_mette_thesis",
    "--flag_wgcna",
    "--flag_mouse",
])

### Alternative invocation WITHOUT --flag_wgcna (kept for reference)
# argString = "--file_multi_gene_set /projects/timshel/sc-genetics/sc-genetics/src/GE-mousebrain/multi_geneset.mousebrain_all.mean.txt --file_gene_coord /projects/timshel/sc-genetics/ldsc/data/gene_coords/gene_annotation.hsapiens_all_genes.GRCh37.ens_v91.LDSC_fmt.txt --windowsize 100000 --bimfile_basename /projects/timshel/sc-genetics/ldsc/data/1000G_EUR_Phase3_plink/1000G.EUR.QC --out_dir /scratch/sc-ldsc/celltypes.mousebrain.all.mean --out_prefix celltypes.mousebrain.all.mean --n_parallel_jobs 5"

# Split the string shell-style into tokens and parse it as if it were sys.argv[1:].
args = parser.parse_args(shlex.split(argString))

In [101]:
### Output location and file prefix, taken from the parsed arguments
out_dir, out_prefix = args.out_dir, args.out_prefix

### Create the output directory (including parents) when nothing exists at that path
if not os.path.exists(out_dir):
    print("Making output dir {}".format(out_dir))
    os.makedirs(out_dir)

In [102]:
### read coord file
# The coordinate table is whitespace-delimited. sep=r"\s+" is the documented
# equivalent of delim_whitespace=True, which is deprecated since pandas 2.2.
# The saved output shows the columns GENE, CHR, START, END and gene_biotype.
df_gene_coord = pd.read_csv(args.file_gene_coord, sep=r"\s+")
df_gene_coord.head()

Unnamed: 0,GENE,CHR,START,END,gene_biotype
0,ENSG00000261657,HG991_PATCH,66119285,66465398,protein_coding
1,ENSG00000223116,13,23551994,23552136,miRNA
2,ENSG00000233440,13,23708313,23708703,pseudogene
3,ENSG00000207157,13,23726725,23726825,misc_RNA
4,ENSG00000229483,13,23743974,23744736,lincRNA


In [112]:
### read gene list file (a list of gene list files)
# Earlier call form, kept for reference (individual options passed explicitly):
# df_multi_gene_set = read_multi_gene_set_file(args.file_multi_gene_set, flag_wgcna=args.flag_wgcna, flag_mouse=args.flag_mouse)
# Current call form: the helper receives the whole parsed-argument namespace.
# NOTE(review): the saved output of this cell appears to end with
# "ValueError: df_multi_gene_set contains new annotation-gene combinations" —
# confirm whether that is expected when a log file from a previous run exists.
df_multi_gene_set = read_multi_gene_set_file(args)
df_multi_gene_set.head()

Read file_multi_gene_set. Header of the parsed/processed file:
             cell_cluster      annotation          gene_input           hgcn  \
0  Aorta_endothelial cell  antiquewhite32  ENSMUSG00000017639      Rab11fip4   
1  Aorta_endothelial cell   antiquewhite3  ENSMUSG00000004233          Wars2   
2  Aorta_endothelial cell   antiquewhite3  ENSMUSG00000031487           Brf2   
3  Aorta_endothelial cell   antiquewhite3  ENSMUSG00000032997           Chpf   
4  Aorta_endothelial cell   antiquewhite3  ENSMUSG00000022311          Csmd3   
5  Aorta_endothelial cell   antiquewhite3  ENSMUSG00000042208  0610010F05Rik   
6  Aorta_endothelial cell   antiquewhite3  ENSMUSG00000073176         Zfp449   
7  Aorta_endothelial cell   antiquewhite3  ENSMUSG00000054320         Lrrc36   
8  Aorta_endothelial cell   antiquewhite3  ENSMUSG00000022325           Pop1   
9  Aorta_endothelial cell   antiquewhite3  ENSMUSG00000089872        Rps6kc1   

   annotation_value  
0          0.888954  
1          0

is deprecated and will be removed in a future version
  'pct_genes_not_mapped': lambda x: "{:.2f}".format(sum(pd.isnull(x))/float(len(x))*100)})


ValueError: df_multi_gene_set contains new annotation-gene combinations

In [30]:
# Preview the first rows of the gene-set table.
# NOTE(review): the saved execution counts around this cell are not sequential, so
# the displayed output may come from a different run than the cell above it.
df_multi_gene_set.head()

Unnamed: 0,annotation,gene,annotation_value,cell_cluster,gene_input,hgcn
0,antiquewhite3,ENSG00000131242,0.888954,Aorta_endothelial cell,ENSMUSG00000017639,Rab11fip4
1,antiquewhite3,ENSG00000116874,0.860975,Aorta_endothelial cell,ENSMUSG00000004233,Wars2
2,antiquewhite3,ENSG00000104221,0.85592,Aorta_endothelial cell,ENSMUSG00000031487,Brf2
3,antiquewhite3,ENSG00000123989,0.824265,Aorta_endothelial cell,ENSMUSG00000032997,Chpf
4,antiquewhite3,ENSG00000164796,0.820682,Aorta_endothelial cell,ENSMUSG00000022311,Csmd3


file_out_multi_geneset=/scratch/sc-ldsc/TMP_TESTING_CONTINUOUS/log.hypothalamus_mette_thesis.multi_geneset.txt exists. Will merge df_multi_gene_set with its content.


Added n=111 new annotation-gene entries to existing file_out_multi_geneset
Unique new annotation names are:
blue1
chocolate


In [80]:
# NOTE(review): `df_existing` is not assigned in any visible cell of this notebook;
# presumably it is left over from an interactive debugging session — define it
# explicitly or remove this cell so the notebook survives Restart & Run All.
df_existing.head()

Unnamed: 0,annotation,gene,annotation_value,cell_cluster,gene_input,hgcn
0,antiquewhite31,ENSG00000131242,0.288954,Aorta_endothelial cell,ENSMUSG00000017639,Rab11fip4
1,antiquewhite3,ENSG00000116874,0.860975,Aorta_endothelial cell,ENSMUSG00000004233,Wars2
2,antiquewhite3,ENSG00000104221,0.85592,Aorta_endothelial cell,ENSMUSG00000031487,Brf2
3,antiquewhite3,ENSG00000123989,0.824265,Aorta_endothelial cell,ENSMUSG00000032997,Chpf
4,antiquewhite3,ENSG00000164796,0.820682,Aorta_endothelial cell,ENSMUSG00000022311,Csmd3


Unnamed: 0_level_0,Unnamed: 1_level_0,gene,annotation_value,cell_cluster,hgcn
annotation,gene_input,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
antiquewhite3,ENSMUSG00000001288,ENSG00000172819,0.37347,Aorta_endothelial cell,Rarg
antiquewhite3,ENSMUSG00000001467,ENSG00000001630,0.678483,Aorta_endothelial cell,Cyp51
antiquewhite3,ENSMUSG00000001663,ENSG00000184674,0.512886,Aorta_endothelial cell,Gstt1
antiquewhite3,ENSMUSG00000001786,ENSG00000100225,0.455188,Aorta_endothelial cell,Fbxo7
antiquewhite3,ENSMUSG00000002043,ENSG00000007255,0.422013,Aorta_endothelial cell,Trappc6a


In [30]:
### make beds
# Build `dict_of_beds` from the gene-set table, the gene coordinate table and the
# requested window size (args.windowsize).
# NOTE(review): presumably a dict keyed by annotation name (a later cell indexes it
# with an annotation name) — confirm against multi_gene_sets_to_dict_of_beds.
dict_of_beds = multi_gene_sets_to_dict_of_beds(df_multi_gene_set, df_gene_coord, args.windowsize)

making gene set bed files
Merging input multi gene set with gene coordinates for annotation = mousebrain_all.ABC.sem_mean
Merging input multi gene set with gene coordinates for annotation = mousebrain_all.ACBG.sem_mean
Merging input multi gene set with gene coordinates for annotation = mousebrain_all.ACMB.sem_mean
Merging input multi gene set with gene coordinates for annotation = mousebrain_all.ACNT1.sem_mean
Merging input multi gene set with gene coordinates for annotation = mousebrain_all.ACNT2.sem_mean
Merging input multi gene set with gene coordinates for annotation = mousebrain_all.ACOB.sem_mean
Merging input multi gene set with gene coordinates for annotation = mousebrain_all.ACTE1.sem_mean
Merging input multi gene set with gene coordinates for annotation = mousebrain_all.ACTE2.sem_mean
Merging input multi gene set with gene coordinates for annotation = mousebrain_all.CBGRC.sem_mean
Merging input multi gene set with gene coordinates for annotation = mousebrain_all.CBINH1.sem_mea

Merging input multi gene set with gene coordinates for annotation = mousebrain_all.HBSER3.sem_mean
Merging input multi gene set with gene coordinates for annotation = mousebrain_all.HBSER4.sem_mean
Merging input multi gene set with gene coordinates for annotation = mousebrain_all.HBSER5.sem_mean
Merging input multi gene set with gene coordinates for annotation = mousebrain_all.HYPEN.sem_mean
Merging input multi gene set with gene coordinates for annotation = mousebrain_all.HYPEP1.sem_mean
Merging input multi gene set with gene coordinates for annotation = mousebrain_all.HYPEP2.sem_mean
Merging input multi gene set with gene coordinates for annotation = mousebrain_all.HYPEP3.sem_mean
Merging input multi gene set with gene coordinates for annotation = mousebrain_all.HYPEP4.sem_mean
Merging input multi gene set with gene coordinates for annotation = mousebrain_all.HYPEP5.sem_mean
Merging input multi gene set with gene coordinates for annotation = mousebrain_all.HYPEP6.sem_mean
Merging inp

Merging input multi gene set with gene coordinates for annotation = mousebrain_all.PSPEP2.sem_mean
Merging input multi gene set with gene coordinates for annotation = mousebrain_all.PSPEP3.sem_mean
Merging input multi gene set with gene coordinates for annotation = mousebrain_all.PSPEP4.sem_mean
Merging input multi gene set with gene coordinates for annotation = mousebrain_all.PSPEP5.sem_mean
Merging input multi gene set with gene coordinates for annotation = mousebrain_all.PSPEP6.sem_mean
Merging input multi gene set with gene coordinates for annotation = mousebrain_all.PSPEP7.sem_mean
Merging input multi gene set with gene coordinates for annotation = mousebrain_all.PSPEP8.sem_mean
Merging input multi gene set with gene coordinates for annotation = mousebrain_all.PVM1.sem_mean
Merging input multi gene set with gene coordinates for annotation = mousebrain_all.PVM2.sem_mean
Merging input multi gene set with gene coordinates for annotation = mousebrain_all.RGDG.sem_mean
Merging input mu

Merging input multi gene set with gene coordinates for annotation = mousebrain_all.TEINH3.sem_mean
Merging input multi gene set with gene coordinates for annotation = mousebrain_all.TEINH4.sem_mean
Merging input multi gene set with gene coordinates for annotation = mousebrain_all.TEINH5.sem_mean
Merging input multi gene set with gene coordinates for annotation = mousebrain_all.TEINH6.sem_mean
Merging input multi gene set with gene coordinates for annotation = mousebrain_all.TEINH7.sem_mean
Merging input multi gene set with gene coordinates for annotation = mousebrain_all.TEINH8.sem_mean
Merging input multi gene set with gene coordinates for annotation = mousebrain_all.TEINH9.sem_mean
Merging input multi gene set with gene coordinates for annotation = mousebrain_all.VECA.sem_mean
Merging input multi gene set with gene coordinates for annotation = mousebrain_all.VECC.sem_mean
Merging input multi gene set with gene coordinates for annotation = mousebrain_all.VECV.sem_mean
Merging input mu

In [36]:
# Preview the first intervals of one annotation's bed object.
# NOTE(review): this key matches the annotation names logged for the mousebrain
# input; with the WGCNA test arguments the logged annotation names look like
# 'antiquewhite3', so this lookup presumably fails for that run — confirm.
dict_of_beds['mousebrain_all.ABC.sem_mean'].head()

chr1	848803	1049920	ENSG00000187608	0.5835819011
 chr1	1052288	1397157	ENSG00000078808,ENSG00000162576,ENSG00000184163	0.7594902919
 chr1	1467474	1811896	ENSG00000008130,ENSG00000189409	0.1613366923
 chr1	2223267	2436883	ENSG00000157916	0.05271495264
 chr1	3589352	3792546	ENSG00000235169	0.153393824
 chr1	6141329	6369449	ENSG00000116251	0.06024032193
 chr1	6421211	6774667	ENSG00000162413,ENSG00000215788	0.3214305606
 chr1	7731329	7941492	ENSG00000049245	0.03899277518
 chr1	10358649	10580201	ENSG00000142657	0.4768783188
 chr1	11149398	11356038	ENSG00000171819	0.8701969784
 

In [42]:
# Run the annotation step for chromosome 22 only, in the notebook process.
# NOTE(review): the saved output ends with "TypeError: 'NoneType' object is not
# iterable" after "Writing annotations..." — unresolved; investigate before relying
# on this cell.
make_annot_file_per_chromosome(chromosome=22, dict_of_beds=dict_of_beds, args=args)

making annot files for chromosome 22
CHR=22 | annotation=antiquewhite3, #1/#3
CHR=22 | annotation=blue1, #2/#3
CHR=22 | annotation=chocolate, #3/#3
CHR=22 | Concatenating annotations...
CHR=22 | Writing annotations...


TypeError: 'NoneType' object is not iterable

In [27]:
# Inspect the column dtypes of the annotation table.
# NOTE(review): `df_annot` is not assigned in any visible cell of this notebook;
# presumably it was created during interactive debugging — define it explicitly or
# remove this cell so the notebook survives Restart & Run All.
df_annot.dtypes

CHR                int64
SNP               object
CM               float64
BP                 int64
antiquewhite3    float64
blue1            float64
chocolate        float64
dtype: object

In [None]:
### TESTING pool.map(functools.partial()
# The toy worker has its own private name so it does not replace the real
# make_annot_file_per_chromosome(chromosome=..., dict_of_beds=..., args=...) that
# other cells call.
def _toy_add_offset(X, A, B):
    """Toy worker for the pool.map/functools.partial experiment.

    Prints X and returns X + B. A is accepted only to show that several keyword
    arguments can be bound with functools.partial; it is not used.
    """
    print(X)
    return X + B

A = 1
B = 2
X = range(1, 10)
pool = multiprocessing.Pool(processes=10)
try:
    # partial() binds the constant keyword arguments; map() supplies each element
    # of X as the first positional argument and returns the results in input order.
    X_maped = pool.map(functools.partial(_toy_add_offset, A=A, B=B), X)
finally:
    # Always release the worker processes, even when a worker raises.
    pool.close()
    pool.join()

### Testing combine_first()

In [42]:
# Two small frames with partly overlapping columns and complementary missing
# values, used to see how combine_first() fills gaps.
df1 = pd.DataFrame(
    {
        'id': [1, 2, 3, 4, 5],
        'first': [np.nan, np.nan, 1, 0, np.nan],
        'second': [1, np.nan, np.nan, np.nan, 0],
    }
)
# df2 has one extra row (id 6) and a 'third' column instead of 'second'.
df2 = pd.DataFrame(
    {
        'id': [1, 2, 3, 4, 5, 6],
        'first': [np.nan, 1, np.nan, np.nan, 0, 1],
        'third': [1, 0, np.nan, 1, 1, 0],
    }
)

In [43]:
# Show df1 (columns id, first, second) before combining.
df1

Unnamed: 0,id,first,second
0,1,,1.0
1,2,,
2,3,1.0,
3,4,0.0,
4,5,,0.0


In [44]:
# Show df2 (columns id, first, third; one row more than df1) before combining.
df2

Unnamed: 0,id,first,third
0,1,,1.0
1,2,1.0,0.0
2,3,,
3,4,,1.0
4,5,0.0,1.0
5,6,1.0,0.0


In [45]:
# Combine the two frames, preferring df1's values and falling back to df2 where
# df1 has a missing value. In the saved output the result holds the union of the
# columns (first, id, second, third), gains the extra sixth row from df2, and shows
# 'id' as float (1.0 ... 6.0).
df1.combine_first(df2)

Unnamed: 0,first,id,second,third
0,,1.0,1.0,1.0
1,1.0,2.0,,0.0
2,1.0,3.0,,
3,0.0,4.0,,1.0
4,0.0,5.0,0.0,1.0
5,1.0,6.0,,0.0


In [125]:
# `re` is imported explicitly here: no cell in this notebook imports it by name, so
# the call below would otherwise depend on whatever the star import happens to leak.
import re

# Sample string mixing spaces, single and double underscores, slashes and a tab.
x = "blasf _ sdf xx_xx /s * sd/sd __ \t"
# Collect every '/' or whitespace character, plus every double underscore;
# single underscores are not matched.
list_matches = re.findall(r"[/\s]|__", x)
list_matches

[' ', ' ', ' ', ' ', '/', ' ', ' ', '/', ' ', '__', ' ', '\t']