In [13]:
from __future__ import division
import os
import sys
import subprocess as sb
import glob
import shutil
import multiprocessing

try:
    import cPickle as cp
except:
    import pickle as cp

import pandas as pd
import matplotlib as mpl

mpl.use('Agg')
import pylab as pl
import xml.etree.cElementTree as ET
from pybedtools import BedTool
#from bioutilities import Genome_2bit
import pysam
import re
import numpy as np
from scipy.stats import zscore
import urllib2

HAYSTACK_VERSION = "0.4.0"

import logging

# Root logger configuration: DEBUG level, written to example.log, which is
# truncated on every run because of filemode="w".
# `stream` is intentionally not passed together with `filename`: Python 2
# ignores `stream` when both are given and Python 3 raises ValueError, so the
# effective destination has always been the file.
logging.basicConfig(level=logging.DEBUG,
                    format='%(levelname)-5s @ %(asctime)s:\n\t %(message)s \n',
                    datefmt='%a, %d %b %Y %H:%M:%S',
                    filemode="w", filename='example.log'
                    )

# Short aliases used throughout the notebook instead of logging.<level>.
error = logging.critical
warn = logging.warning
debug = logging.debug
info = logging.info
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
logging.debug("start")

In [14]:
# common functions for haystack hotspots
def check_library(library_name):
    """Import and return the module called ``library_name``.

    If the module cannot be imported, a critical message is logged and the
    interpreter exits with status 1.

    Only ImportError is treated as "module not installed". Any other exception
    raised while importing (and KeyboardInterrupt/SystemExit) propagates
    unchanged instead of being reported as a missing package.
    """
    try:
        return __import__(library_name)
    except ImportError:
        error('You need to install %s module to use haystack!' % library_name)
        sys.exit(1)

def which(program):
    """Locate an executable, similar to the shell ``which`` command.

    If ``program`` contains a directory component it is checked as given;
    otherwise every entry of the PATH environment variable is searched in
    order. Returns the path of the first regular file that is executable, or
    None when nothing matches.
    """
    def _runnable(candidate):
        return os.path.isfile(candidate) and os.access(candidate, os.X_OK)

    directory, _ = os.path.split(program)
    if directory:
        # An explicit location was given: PATH is not consulted.
        return program if _runnable(program) else None

    for entry in os.environ["PATH"].split(os.pathsep):
        # PATH entries may be wrapped in double quotes.
        candidate = os.path.join(entry.strip('"'), program)
        if _runnable(candidate):
            return candidate
    return None

def check_program(binary_name, download_url=None):
    """Exit with status 1 unless ``binary_name`` can be found by ``which``.

    A critical message naming the missing command is logged first; when
    ``download_url`` is given, a second message pointing to it is logged too.
    """
    if not which(binary_name):
        # The message names haystack (it previously named a different tool),
        # matching the wording used by check_library.
        error(
            'You need to install and have the command #####%s##### in your PATH variable to use haystack!\n Please read the documentation!' % binary_name)
        if download_url:
            error('You can download it from here:%s' % download_url)
        sys.exit(1)

def check_file(filename):
    """Verify that ``filename`` can be opened for reading.

    Returns None on success. Raises a plain Exception carrying the filename
    when opening fails with IOError.
    """
    try:
        handle = open(filename)
    except IOError:
        raise Exception('I cannot open the file: ' + filename)
    handle.close()

def quantile_normalization(A):
    """Quantile-normalize the columns of a 2-D array.

    Every column is ranked independently; the value at rank r in each column
    is replaced by the mean, across columns, of the rank-r values. All columns
    of the result therefore share the same set of values.

    Parameters
    ----------
    A : 2-D array-like, rows are observations and columns are samples.

    Returns
    -------
    numpy.ndarray with the same shape as ``A``. Floating-point (or complex)
    input keeps its dtype; any other dtype is returned as float64 so that the
    rank means are not truncated to integers.

    Ties inside a column are not averaged: tied entries receive whichever
    ranks ``np.argsort`` assigns them.
    """
    A = np.asarray(A)
    # np.zeros_like(A) would inherit an integer dtype and silently floor the means.
    out_dtype = A.dtype if np.issubdtype(A.dtype, np.inexact) else np.float64
    AA = np.zeros(A.shape, dtype=out_dtype)

    I = np.argsort(A, axis=0)          # I[r, c] = row holding the rank-r value of column c
    cols = np.arange(A.shape[1])
    # A[I, cols] is A with each column sorted; its row means are the rank means,
    # which are scattered back to the original row positions.
    AA[I, cols] = np.mean(A[I, cols], axis=1)[:, np.newaxis]

    return AA

def smooth(x, window_len=200):
    """Smooth a 1-D signal with a normalized Hanning window.

    The signal is extended at both ends with mirrored copies of its own
    samples before convolving, and the result is trimmed by half a window on
    each side.
    """
    mirrored_head = x[window_len - 1:0:-1]
    mirrored_tail = x[-1:-window_len:-1]
    padded = np.r_[mirrored_head, x, mirrored_tail]

    kernel = np.hanning(window_len)
    filtered = np.convolve(kernel / kernel.sum(), padded, mode='valid')

    half = int(window_len / 2)
    return filtered[half:-half + 1]

# write the IGV session file
def rem_base_path(path, base_path):
    """Return ``path`` with ``base_path`` (plus a trailing separator) removed.

    The removal uses ``str.replace``, so every occurrence of the prefix text
    is dropped, not only a leading one.
    """
    # Joining with '' appends the path separator to base_path when it is missing.
    prefix = os.path.join(base_path, '')
    return path.replace(prefix, '')

def find_th_rpm(df_chip, th_rpm_param):
    """Return the smallest per-column percentile of ``df_chip``.

    ``th_rpm_param`` is the percentile (0-100) passed to ``np.percentile`` for
    every column; the minimum of those column values is returned.
    """
    per_column = df_chip.apply(lambda column: np.percentile(column, th_rpm_param))
    return np.min(per_column)

def log2_transform(x):
    """Return log2(x + 1), element-wise for array input."""
    shifted = x + 1  # the +1 offset maps zero to zero
    return np.log2(shifted)

def angle_transform(x):
    """Return arcsin(sqrt(x) / 1e6), element-wise for array input."""
    # NOTE(review): the division by 1e6 is applied after the square root, not
    # to x itself. If x is a reads-per-million value meant to be converted to
    # a proportion first, arcsin(sqrt(x / 1e6)) would be expected instead --
    # confirm the intended formula.
    scaled_root = np.sqrt(x) / 1000000.0
    return np.arcsin(scaled_root)

def download_genome(name, output_directory=None):
    """Fetch a UCSC ``.2bit`` genome and derive its auxiliary files.

    Three files are produced inside ``output_directory``, each only if it does
    not exist yet:

    * ``<name>.2bit``            - downloaded from hgdownload.cse.ucsc.edu
    * ``<name>_chr_lengths.txt`` - written by ``Genome_2bit.write_chr_len``
    * ``<name>_meme_bg``         - written by ``Genome_2bit.write_meme_background``

    Parameters
    ----------
    name : genome assembly name as used in the UCSC goldenPath URL.
    output_directory : destination folder; ``None`` means the current working
        directory.
    """
    # NOTE(review): Genome_2bit is not imported in this file (the
    # `from bioutilities import Genome_2bit` line at the top is commented out),
    # so it must be brought into scope before this function can run.
    if output_directory is None:
        # os.path.join(None, ...) fails; an empty prefix resolves to the cwd.
        output_directory = ''

    urlpath = "http://hgdownload.cse.ucsc.edu/goldenPath/%s/bigZips/%s.2bit" % (name, name)
    genome_filename = os.path.join(output_directory, "%s.2bit" % name)

    if os.path.exists(genome_filename):
        print('File %s exists, skipping download' % genome_filename)
    else:
        print('Downloading %s in %s...' % (urlpath, genome_filename))

        # Download to a temporary name and rename at the end, so an interrupted
        # transfer is never mistaken for a complete genome on the next run.
        partial_filename = genome_filename + '.part'

        # The connection is opened only when a download is needed and is
        # always closed afterwards.
        genome_url_origin = urllib2.urlopen(urlpath)
        try:
            with open(partial_filename, 'wb') as genome_file_destination:
                shutil.copyfileobj(genome_url_origin, genome_file_destination)
        finally:
            genome_url_origin.close()
        os.rename(partial_filename, genome_filename)

        print('Downloded %s in %s:' % (urlpath, genome_filename))

    g = Genome_2bit(genome_filename, verbose=True)

    chr_len_filename = os.path.join(output_directory, "%s_chr_lengths.txt" % name)
    if not os.path.exists(chr_len_filename):
        print('Extracting chromosome lengths')
        g.write_chr_len(chr_len_filename)
        print('Done!')
    else:
        print('File %s exists, skipping generation' % chr_len_filename)

    meme_bg_filename = os.path.join(output_directory, "%s_meme_bg" % name)
    if not os.path.exists(meme_bg_filename):
        print('Calculating nucleotide frequencies....')
        g.write_meme_background(meme_bg_filename)
        print('Done!')
    else:
        print('File %s exists, skipping generation' % meme_bg_filename)

def determine_path(folder):
    """Return ``folder`` joined onto the current working directory."""
    return os.path.join(os.getcwd(), folder)

Determine pipeline folder paths

In [15]:
# Locations used by the pipeline. determine_path() joins each entry onto the
# current working directory, so the notebook must be started from the folder
# that contains these sub-directories.
samples_filename = determine_path('test_data/samples_names.txt')
output_directory = determine_path('haystack_analysis/output')
motif_directory = determine_path('motif_databases')
annotation_directory = determine_path('gene_annotations')
# genome_directory must hold <genome>_chr_lengths.txt and blacklist.bed
# (both are checked in the "Create filepaths" cell).
genome_directory = determine_path('genomes')

In [16]:
# Run parameters for the pipeline.
# Either a samples table or a folder of .bam/.bw files; the "read data" cell
# accepts both.
samples_filename_or_bam_folder = samples_filename
# Genome name; used to build the chromosome-lengths and bins filenames.
genome_name = 'hg19'
# optional
# Suffix appended to the result directory names when non-empty.
name = ''
# NOTE(review): self-assignment, has no effect.
output_directory = output_directory
# Bin width in bp; appears in the bins filename and the step-2 log message.
bin_size = 200
# When True, the tiled genome is rebuilt even if the bins file already exists.
recompute_all = True
# When True, the "Initialize" cell negates both z-score thresholds.
depleted = True
# Selects the input extension to look for: '.bw' when True, '.bam' otherwise.
input_is_bigwig = False
disable_quantile_normalization = False
# NOTE(review): presumably selects angle_transform / log2_transform; not read
# by any visible cell -- confirm in the pipeline functions.
transformation = 'angle'
z_score_high = 1.5
z_score_low = 0.25
# NOTE(review): presumably the percentile passed to find_th_rpm -- confirm.
th_rpm = 99
# Optional files; validated with check_file() only when non-empty.
meme_motifs_filename = ''  # os.path.join(MOTIF_DIR, 'JASPAR_CORE_2016_vertebrates.meme')
motif_mapping_filename = ''  # os.path.join(MOTIF_DIR, 'JASPAR_CORE_2016_vertebrates_mapped_to_gene_human_mouse.txt')
plot_all = True
use_X_Y = True
max_regions_percentage=0.1
n_processes = multiprocessing.cpu_count()
temp_directory = ''  # os.path.join(_ROOT, 'tmp')
version = 'Version %s' % HAYSTACK_VERSION
# add two flags
chrom_exclude = ''  # '_|chrM|chrX|chrY'
ext = 200 # read extension in bps

Initialize

In [17]:
# Derive the effective settings from the parameters cell.
# NOTE(review): this cell is not idempotent. Running it twice flips the z-score
# signs back and nests output_directory one level deeper; re-run the
# parameters cell first.
if depleted:
    # Depleted regions are selected with mirrored (negative) thresholds.
    z_score_high, z_score_low = -z_score_high, -z_score_low

# The motif files are optional; validate only the ones that were provided.
if meme_motifs_filename:
    check_file(meme_motifs_filename)
if motif_mapping_filename:
    check_file(motif_mapping_filename)

# The existence check for temp_directory is currently disabled:
# if not os.path.exists(temp_directory):
#    error('The folder specified with --temp_directory: %s does not exist!' % temp_directory)
#    sys.exit(1)

extension_to_check = '.bw' if input_is_bigwig else '.bam'
if input_is_bigwig:
    info('Input is set BigWig (.bw)')
else:
    info('Input is set compressed SAM (.bam)')

directory_name = ('HAYSTACK_PIPELINE_RESULTS_on_%s' % name) if name else 'HAYSTACK_PIPELINE_RESULTS'

output_directory = os.path.join(output_directory, directory_name) if output_directory else directory_name

Read data

In [18]:
# Resolve the input: either a tab-separated samples file or a folder of
# .bam/.bw files. Defines sample_names, bam_filenames,
# gene_expression_filenames (samples-file input only), BAM_FOLDER and
# USE_GENE_EXPRESSION, then writes the per-step configuration files into
# output_directory.

USE_GENE_EXPRESSION = True

if os.path.isfile(samples_filename_or_bam_folder):
    BAM_FOLDER = False
    bam_filenames = []
    gene_expression_filenames = []
    sample_names = []

    # Paths inside the samples file are resolved relative to the file itself.
    dir_path = os.path.dirname(os.path.realpath(samples_filename_or_bam_folder))

    with open(samples_filename_or_bam_folder) as infile:
        for line in infile:

            if not line.strip():
                continue

            if line.startswith('#'):  # skip optional header/comment lines
                info('Skipping header/comment line:%s' % line)
                continue

            fields = line.strip().split("\t")
            n_fields = len(fields)

            if n_fields == 2:
                # name <TAB> track: a single line without an expression file
                # disables the gene expression analysis for the whole run.
                USE_GENE_EXPRESSION = False

                sample_names.append(fields[0])
                bam_filenames.append(fields[1])

            elif n_fields == 3:
                # name <TAB> track <TAB> expression file; stays enabled only
                # if no earlier line disabled it.
                USE_GENE_EXPRESSION = USE_GENE_EXPRESSION and True

                sample_names.append(fields[0])
                bam_filenames.append(fields[1])
                gene_expression_filenames.append(fields[2])
            else:
                # Stop here, like every other input error in this cell;
                # continuing would silently drop the malformed sample.
                error('The samples file format is wrong!')
                sys.exit(1)

    bam_filenames = [os.path.join(dir_path, filename) for filename in bam_filenames]
    gene_expression_filenames = [os.path.join(dir_path, filename) for filename in gene_expression_filenames]

else:
    if os.path.exists(samples_filename_or_bam_folder):
        BAM_FOLDER = True
        USE_GENE_EXPRESSION = False
        bam_filenames = glob.glob(os.path.join(samples_filename_or_bam_folder, '*' + extension_to_check))

        if not bam_filenames:
            error('No bam/bigwig  files to analyze in %s. Exiting.' % samples_filename_or_bam_folder)
            sys.exit(1)

        # Sample names are the file names without the extension.
        sample_names = [os.path.basename(bam_filename).replace(extension_to_check, '') for bam_filename in
                        bam_filenames]
    else:
        error("The file or folder %s doesn't exist. Exiting." % samples_filename_or_bam_folder)
        sys.exit(1)

# check all the files before starting
info('Checking samples files location...')
for bam_filename in bam_filenames:
    check_file(bam_filename)

if USE_GENE_EXPRESSION:
    for gene_expression_filename in gene_expression_filenames:
        check_file(gene_expression_filename)

if not os.path.exists(output_directory):
    os.makedirs(output_directory)

# copy back the file used
if not BAM_FOLDER:
    shutil.copy2(samples_filename_or_bam_folder, output_directory)

# write hotspots conf files
sample_names_hotspots_filename = os.path.join(output_directory, 'sample_names_hotspots.txt')

with open(sample_names_hotspots_filename, 'w+') as outfile:
    for sample_name, bam_filename in zip(sample_names, bam_filenames):
        outfile.write('%s\t%s\n' % (sample_name, bam_filename))

# write tf activity  conf files
if USE_GENE_EXPRESSION:
    sample_names_tf_activity_filename = os.path.join(output_directory, 'sample_names_tf_activity.txt')

    with open(sample_names_tf_activity_filename, 'w+') as outfile:
        for sample_name, gene_expression_filename in zip(sample_names, gene_expression_filenames):
            outfile.write('%s\t%s\n' % (sample_name, gene_expression_filename))

    # Only defined when gene expression data is available.
    tf_activity_directory = os.path.join(output_directory, 'HAYSTACK_TFs_ACTIVITY_PLANES')

Create HAYSTACK_HOTSPOTS directories

In [19]:
# Build the HAYSTACK_HOTSPOTS folder tree below the pipeline results folder.
# NOTE(review): output_directory is extended in place, so re-running this cell
# nests another HAYSTACK_HOTSPOTS level.
directory_name = ('HAYSTACK_HOTSPOTS_on_%s' % name) if name else 'HAYSTACK_HOTSPOTS'

output_directory = os.path.join(output_directory, directory_name) if output_directory else directory_name

tracks_directory = os.path.join(output_directory, 'TRACKS')
intermediate_directory = os.path.join(output_directory, 'INTERMEDIATE')
specific_regions_directory = os.path.join(output_directory, 'SPECIFIC_REGIONS')

# Create whatever is missing, parent folder first.
for required_directory in (output_directory,
                           tracks_directory,
                           intermediate_directory,
                           specific_regions_directory):
    if not os.path.exists(required_directory):
        os.makedirs(required_directory)

Create filepaths to required files

In [20]:
# input files (must already exist in genome_directory)
chr_len_filename = os.path.join(genome_directory, "%s_chr_lengths.txt" % genome_name)
blacklist = os.path.join(genome_directory, 'blacklist.bed')
check_file(chr_len_filename)
check_file(blacklist)

# output files
genome_sorted_filtered_bins_file = os.path.join(
    output_directory,
    '%s.%dbp.bins.sorted.filterd.bed' % (os.path.basename(genome_name), bin_size),
)
bedgraph_iod_track_filename = os.path.join(tracks_directory, 'VARIABILITY.bedgraph')
bw_iod_track_filename = os.path.join(tracks_directory, 'VARIABILITY.bw')
bedgraph_hpr_filename = os.path.join(tracks_directory, 'SELECTED_VARIABILITY_HOTSPOT.bedgraph')
# The variable name keeps its original spelling ("fileaname").
bed_hpr_fileaname = os.path.join(output_directory, 'SELECTED_VARIABILITY_HOTSPOT.bed')

Required functions for the pipeline

Create filepaths to required files

Pipeline

In [None]:
# NOTE(review): intialize_genome (sic), create_tiled_genome and to_rpm_tracks
# are not defined in the part of the notebook shown here; they must be defined
# before this cell can run on a fresh kernel.

# step 1
info('Initializing Genome:%s' % genome_name)
intialize_genome(genome_name)

# step 2
# The bins file is rebuilt when it is missing or when recompute_all is set;
# the log line is emitted in either case.
info('Creating bins of %dbp in %s' % (bin_size, genome_sorted_filtered_bins_file))
if not os.path.exists(genome_sorted_filtered_bins_file) or recompute_all:
    create_tiled_genome(genome_sorted_filtered_bins_file)

# step 3
info('Convert files to genome-wide rpm tracks')
to_rpm_tracks(sample_names, bam_filenames, skipfilter=False)

In [22]:
# step 4
info('Normalize rpm tracks')
# glob.glob returns matches in arbitrary, filesystem-dependent order; sorting
# makes the list handed to load_rpm_tracks the same on every run and machine.
binned_rpm_filenames = sorted(glob.glob(os.path.join(intermediate_directory, '*.rpm')))
df_chip, coordinates_bin = load_rpm_tracks(binned_rpm_filenames)

In [25]:
# Preview of the signal table returned by load_rpm_tracks, before normalization.
df_chip.head(10)

Unnamed: 0,GM12878.200bp,H1hesc.200bp,HEPG2.200bp,HSMM.200bp,K562.200bp,NHLF.200bp
0,0.0,0.038836,0.0,0.0,0.0,0.069769
1,0.0,0.077672,0.0,0.0,0.068074,0.069769
2,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
# Preview of the bin coordinates returned by load_rpm_tracks.
coordinates_bin.head(10)

Unnamed: 0,chr_id,bpstart,bpend
0,chr1,10000,10200
1,chr1,10200,10400
2,chr1,11600,11800
3,chr1,11800,12000
4,chr1,12000,12200
5,chr1,12200,12400
6,chr1,12400,12600
7,chr1,12600,12800
8,chr1,12800,13000
9,chr1,13000,13200


In [None]:

# Export the binned signal as bigwig tracks, quantile-normalizing it first
# unless that was disabled in the parameters cell.
# NOTE: in the normalizing branch df_chip is rebound to the normalized table.
if not disable_quantile_normalization:
    info('Normalizing the data...')
    df_chip = pd.DataFrame(
        quantile_normalization(df_chip.values),
        index=df_chip.index,
        columns=df_chip.columns,
    )
    df_chip_to_bigwig(df_chip, coordinates_bin, normalized=True)

else:
    info('Skipping quantile normalization...')
    df_chip_to_bigwig(df_chip, coordinates_bin)

In [29]:
# Same preview as before, now showing df_chip after the normalization cell.
df_chip.head(10)

Unnamed: 0,GM12878.200bp,H1hesc.200bp,HEPG2.200bp,HSMM.200bp,K562.200bp,NHLF.200bp
0,0.0,0.006473,0.0,0.0,0.0,0.071288
1,0.040222,0.012945,0.05185,0.012945,0.085606,0.101537
2,0.040222,0.0,0.05185,0.012945,0.028876,0.012945
3,0.040222,0.0,0.05185,0.012945,0.028876,0.012945
4,0.040222,0.0,0.05185,0.012945,0.028876,0.012945
5,0.040222,0.0,0.05185,0.012945,0.028876,0.012945
6,0.040222,0.0,0.05185,0.012945,0.028876,0.012945
7,0.040222,0.0,0.05185,0.012945,0.028876,0.012945
8,0.040222,0.0,0.05185,0.012945,0.028876,0.012945
9,0.040222,0.0,0.05185,0.012945,0.028876,0.012945


In [30]:
# step 5
# NOTE(review): find_hpr_coordinates, hpr_to_bigwig, hpr_to_bedgraph,
# write_specific_regions and create_igv_track_file are not defined in the part
# of the notebook shown here.
info('Determine HP regions')
# coordinates_bin is rebound here to the value returned by find_hpr_coordinates.
hpr_idxs, coordinates_bin, df_chip_hpr_zscore, hpr_iod_scores = find_hpr_coordinates(df_chip, coordinates_bin)
info('Save files')
hpr_to_bigwig(coordinates_bin)
hpr_to_bedgraph(hpr_idxs, coordinates_bin)
write_specific_regions(coordinates_bin, df_chip_hpr_zscore)
create_igv_track_file(hpr_iod_scores)

In [33]:
# Preview of the hpr_iod_scores returned by find_hpr_coordinates.
hpr_iod_scores.head(10)

4370    0.000002
4371    0.000001
4446    0.000001
4447    0.000002
5147    0.000001
5148    0.000001
5875    0.000002
8516    0.000001
9474    0.000001
9475    0.000002
dtype: float64

In [31]:
# coordinates_bin as returned by find_hpr_coordinates in step 5; it replaces
# the table previewed after loading the rpm tracks.
coordinates_bin.head(10)

Unnamed: 0,chr_id,bpstart,bpend,iod
2729,chr1,713600,713800,1.522046e-07
2730,chr1,713800,714000,1.360054e-07
3540,chr1,879200,879400,5.883203e-07
3541,chr1,879400,879600,5.085509e-07
3614,chr1,894000,894200,2.181267e-07
3615,chr1,894200,894400,1.283248e-07
3616,chr1,894400,894600,9.571502e-08
3652,chr1,901600,901800,5.096526e-07
3653,chr1,901800,902000,6.060991e-07
3654,chr1,902000,902200,5.704145e-07


In [32]:
# Preview of the df_chip_hpr_zscore table returned by find_hpr_coordinates.
df_chip_hpr_zscore.head(10)

Unnamed: 0,GM12878.200bp,H1hesc.200bp,HEPG2.200bp,HSMM.200bp,K562.200bp,NHLF.200bp
4370,-0.488069,-0.311384,-0.387298,-0.458339,2.228175,-0.583085
4371,-0.459942,-0.478323,-0.27201,-0.57477,2.227086,-0.44204
4446,-0.536726,-0.43827,2.234039,-0.410387,-0.43827,-0.410387
4447,-0.547223,-0.426057,2.23367,-0.426057,-0.408275,-0.426057
5147,-0.99477,-0.151963,-0.8715,1.220835,-0.70118,1.498577
5148,-0.879217,-0.274687,-0.807228,1.081352,-0.773283,1.653064
5875,-0.402657,2.224369,-0.448244,-0.289137,-0.448244,-0.636087
8516,-0.44797,-0.332391,2.22665,-0.556949,-0.332391,-0.556949
9474,-0.673379,-0.512369,2.208026,-0.430424,-0.451251,-0.140602
9475,-0.604159,-0.382655,2.180051,-0.578032,-0.633627,0.018422


In [None]:
# Final log entry marking the end of the pipeline run.
info('All done! Ciao!')