**Sections:**<a name="contents"></a>

[Control data download](#control_data_download)

[Create symbolic link copies for all experimental fastq files](#symb_link_input)

[Quality control](#QC_analysis)

In [1]:
from __future__ import (absolute_import, division,
                        print_function, unicode_literals)

import warnings
warnings.simplefilter('ignore')

# general purpose packages
import pandas as pd
import numpy as np
import os
import json
import time
import re
import csv
import subprocess
import sys

import scipy.stats as stats
import statsmodels.stats as smstats
import statsmodels.api as sm
from statsmodels.stats.multitest import multipletests

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

import umap
import rpy2

from multiprocessing import Process, Manager, Pool
import multiprocessing
from functools import partial

from collections import Counter

import seaborn as sns; sns.set()

import matplotlib
matplotlib.style.use('seaborn')
matplotlib.use('Agg')
import matplotlib.pyplot as plt
matplotlib.rcParams['backend'] = "Qt5Agg"
import matplotlib.ticker as ticker
from matplotlib.ticker import FuncFormatter

from IPython.display import display, Image

from adjustText import adjust_text
import builtins
%matplotlib inline

# for normalization
from sklearn.linear_model import QuantileRegressor

# for working with yaml files
import ruamel.yaml

# for working with genomic intervals
import pyranges as pr

import itertools

In [2]:
def get_pvalue_star(pval, thr=0.05):
    """Convert a p-value into a significance label for plot annotations.

    Parameters
    ----------
    pval : float
        P-value to annotate.
    thr : float, default 0.05
        Significance threshold. Any threshold is accepted; 0.05 and 0.1 give
        the same labels as before.

    Returns
    -------
    str
        "ns" when ``pval`` is not below ``thr`` (this includes NaN); otherwise
        "***" for pval < 0.001, "**" for pval < 0.01 and "*" for the rest.
    """
    # `not pval < thr` (instead of `pval >= thr`) also sends NaN to "ns".
    if not pval < thr:
        return "ns"
    if pval < 0.001:
        return "***"
    if pval < 0.01:
        return "**"
    return "*"

In [3]:
# Paths to project directories. Every value ends with '/' because the rest of
# the notebook builds file paths by plain string concatenation.
subdirs = {}

subdirs['main_project_dir'] = '/scicore/home/zavolan/GROUP/RBP_perturbational_networks/'
subdirs['wf_dir'] = '/scicore/home/zavolan/mirono0000/Projects/RBP_perturbational_networks/WF/'

subdirs['human_annotation_dir'] = '/scicore/home/zavolan/GROUP/Genomes/homo_sapiens/'

# shared project folder and the directories derived from it
subdirs['shared_project_dir'] = subdirs['main_project_dir']
subdirs['temp_dir'] = subdirs['shared_project_dir']+'temp_dir/'
subdirs['slurm_dir'] = subdirs['temp_dir']+'slurm/'
subdirs['scripts_dir'] = subdirs['shared_project_dir']+'scripts/'
subdirs['figures_dir'] = subdirs['shared_project_dir']+'figures/'
subdirs['fastq_dir'] = subdirs['shared_project_dir']+'input_fastq/'
subdirs['metadata_dir'] = subdirs['shared_project_dir']+'metadata/'

subdirs['wf_runs_dir'] = subdirs['shared_project_dir']+'wf_runs/'

# Paths to individual files.
file_paths = {}
### genome annotation files
file_paths['human_genome_file'] = subdirs['human_annotation_dir']+'GRCh38.primary_assembly.genome.fa'
file_paths['human_annotation_file'] = subdirs['human_annotation_dir']+'hg38_v42/gencode.v42.annotation.gtf'
file_paths['human_RNAcentral_annotation_file'] = subdirs['human_annotation_dir']+'hg38_v42/homo_sapiens.GRCh38.gff3.gz'
file_paths['human_enriched_annotation_file'] = subdirs['human_annotation_dir']+'hg38_v42/enriched.gencode.v42.annotation.gtf'

# Create all directories. os.makedirs raises if a directory cannot be created;
# the exit status of the former shell `mkdir -p` call was never checked.
for subdir in subdirs.values():
    os.makedirs(subdir, exist_ok=True)

0

# Make enriched gtf file for mouse and human

In [9]:
organisms = ['human']

# The annotation tables are read without a header, so the nine standard
# GTF/GFF3 columns are labelled 0-8; 'gene_id' and 'order' are helper columns
# used only to sort the RNAcentral records before they are written out.
gtf_output_columns = list(range(0,9))+['gene_id','order']


def _transcript_structures(exons):
    """Summarise every transcript of an exon table as one structure string.

    `exons` needs the columns 0 (chromosome), 3 (start), 4 (end), 6 (strand)
    and 'transcript_id'. The result has one row per (chromosome, strand,
    transcript_id) and a 'transcript_alt_id' column of the form
    '<chromosome>_<strand>_<start>_<end>,<start>_<end>,...'.

    NOTE(review): exon coordinates are concatenated in the row order of the
    input, so two sources only produce equal strings when they list the exons
    of a transcript in the same order - confirm this holds for minus-strand
    transcripts.
    """
    exon_coords = exons[3].astype('str')+'_'+exons[4].astype('str')+','
    structures = exons.assign(exon_coords=exon_coords).groupby([0,6,'transcript_id']).agg({'exon_coords':'sum'}).reset_index()
    structures['transcript_alt_id'] = structures[0].astype('str')+'_'+structures[6]+'_'+structures['exon_coords']
    return structures


for organism in organisms:
    # index the genome; the .fai file is read further down
    subprocess.check_output(['samtools','faidx',file_paths[organism+'_genome_file']])

    # skiprows drops the leading header lines of the GTF (5) and GFF3 (1) files
    gtf_df = pd.read_csv(file_paths[organism+'_annotation_file'],delimiter="\t",index_col=None,header=None,skiprows=5)

    rna_central = pd.read_csv(file_paths[organism+'_RNAcentral_annotation_file'],delimiter="\t",index_col=None,header=None,skiprows=1,compression='gzip')
    rna_central[2] = rna_central[2].str.replace('noncoding_exon','exon',regex=False)
    rna_central['gene_biotype'] = rna_central[8].str.split(';type=|;',expand=True)[1]
    rna_central['gene_source'] = 'RNA_central'
    rna_central['gene_id'] = rna_central[8].str.split('ID=|;|:',expand=True)[4]
    # every RNAcentral record is treated as a gene with exactly one transcript
    rna_central['transcript_id'] = rna_central['gene_id']+'.transcript'

    # ---- exon records -------------------------------------------------------
    rna_central_exons = rna_central.loc[rna_central[2]=='exon'].copy().reset_index(drop=True)
    exon_id_fields = rna_central_exons[8].str.split('ID=|;|:',expand=True)
    rna_central_exons['exon_id'] = rna_central_exons['transcript_id']+'.'+exon_id_fields[5]
    rna_central_exons['exon_number'] = exon_id_fields[5].str.split('exon',expand=True).iloc[:, -1]
    # All attribute values come from rna_central_exons itself. The tag used to
    # be read from rna_central, which only worked through index alignment.
    rna_central_exons[8] = (
        'gene_id "'+rna_central_exons['gene_id']
        +'"; transcript_id "'+rna_central_exons['transcript_id']
        +'"; exon_number "'+rna_central_exons['exon_number']
        +'"; gene_source "'+rna_central_exons['gene_source']
        +'"; gene_biotype "'+rna_central_exons['gene_biotype']
        +'"; transcript_source "'+rna_central_exons['gene_source']
        +'"; transcript_biotype "'+rna_central_exons['gene_biotype']
        +'"; exon_id "'+rna_central_exons['exon_id']
        +'"; tag "'+rna_central_exons['gene_source']+'";'
    )
    rna_central_exons['order']=3

    # ---- drop RNAcentral transcripts that the main annotation already has ----
    rna_central_gr_transcripts = _transcript_structures(rna_central_exons)

    gtf_df_exons = gtf_df.loc[gtf_df[2]=='exon'].reset_index(drop=True)
    gtf_df_exons['transcript_id'] = gtf_df_exons[8].str.split('transcript_id "',expand=True)[1].str.split('"',expand=True)[0]
    gtf_df_gr_transcripts = _transcript_structures(gtf_df_exons)
    ensembl_transcripts = list(gtf_df_gr_transcripts['transcript_alt_id'].unique())

    # when a transcript is present in both sources, the main annotation wins
    is_new_structure = ~rna_central_gr_transcripts['transcript_alt_id'].isin(ensembl_transcripts)
    preserve_transcripts_list = list(rna_central_gr_transcripts.loc[is_new_structure]['transcript_id'].unique())
    rna_central_exons = rna_central_exons.loc[rna_central_exons['transcript_id'].isin(preserve_transcripts_list)].reset_index(drop=True)
    rna_central = rna_central.loc[rna_central['transcript_id'].isin(preserve_transcripts_list)].reset_index(drop=True)

    rna_central_exons = rna_central_exons[gtf_output_columns]

    # ---- transcript records -------------------------------------------------
    rna_central_transcripts = rna_central.loc[rna_central[2]=='transcript'].copy().reset_index(drop=True)
    rna_central_transcripts[8] = (
        'gene_id "'+rna_central_transcripts['gene_id']
        +'"; transcript_id "'+rna_central_transcripts['transcript_id']
        +'"; gene_source "'+rna_central_transcripts['gene_source']
        +'"; gene_biotype "'+rna_central_transcripts['gene_biotype']
        +'"; transcript_source "'+rna_central_transcripts['gene_source']
        +'"; transcript_biotype "'+rna_central_transcripts['gene_biotype']
        +'"; tag "'+rna_central_transcripts['gene_source']+'";'
    )
    rna_central_transcripts['order']=2
    rna_central_transcripts = rna_central_transcripts[gtf_output_columns]

    # ---- gene records: a copy of each transcript record relabelled as 'gene' -
    rna_central_genes = rna_central.loc[rna_central[2]=='transcript'].copy().reset_index(drop=True)
    rna_central_genes[2] = 'gene'
    rna_central_genes[8] = 'gene_id "'+rna_central_genes['gene_id']+'"; gene_source "'+rna_central_genes['gene_source']+'"; gene_biotype "'+rna_central_genes['gene_biotype']+'";'
    rna_central_genes['order']=1
    rna_central_genes = rna_central_genes[gtf_output_columns]

    # gene -> transcript -> exon order within each gene; `columns=` replaces the
    # positional axis argument that pandas 2.0 no longer accepts
    rna_central_gtf = pd.concat([rna_central_genes,rna_central_transcripts,rna_central_exons]).sort_values(['gene_id','order']).reset_index(drop=True).drop(columns=['gene_id','order'])

    # save standard annotation enriched with RNA central; the inner merge keeps
    # only records on sequences that are listed in the genome index
    enriched_gtf = pd.concat([gtf_df,rna_central_gtf]).reset_index(drop=True)
    genome_fai = pd.read_csv(file_paths[organism+'_genome_file']+'.fai',delimiter="\t",index_col=None,header=None)
    enriched_gtf = pd.merge(genome_fai[[0]],enriched_gtf,how='inner',on=0)

    # remove non-canonical chromosomes, they are not of interest
    enriched_gtf = enriched_gtf.loc[enriched_gtf[0].str.startswith('chr')].reset_index(drop=True)
    enriched_gtf.to_csv(file_paths[organism+'_enriched_annotation_file'], sep=str('\t'),header=False,index=None,quoting=csv.QUOTE_NONE)

# Enrich ENCODE metadata using json files from ENCODE portal

In [4]:
ENCODE_metadata = pd.read_csv(subdirs['metadata_dir']+'ENCODE_metadata.tsv',delimiter="\t",
                                   index_col=None,header=0)

# ---- 1. which control experiment belongs to each experiment -----------------
# 'File dataset' has the form '/experiments/<accession>/' (it is rebuilt in
# exactly that form for the merge below).
ENCODE_metadata['json_url'] = """https://www.encodeproject.org"""+ENCODE_metadata['File dataset']+"""?format=json"""
ENCODE_metadata[['json_url']].drop_duplicates().to_csv(subdirs['metadata_dir']+'ENCODE_json_urls.txt', sep=str('\t'),header=False,index=None,quoting=csv.QUOTE_NONE)

experiment_json_dir = subdirs['metadata_dir']+'ENCODE_json_files/'
os.makedirs(experiment_json_dir, exist_ok=True)

# The download command is only printed, not executed: the JSON files have to be
# present in experiment_json_dir before the rest of this cell is run.
command = """wget -i """+subdirs['metadata_dir']+'ENCODE_json_urls.txt -P '+experiment_json_dir
print(command)

os.system("""find """+experiment_json_dir+""" -name '*=json*' > """+subdirs['temp_dir']+"""json_file_paths.tsv""")
json_file_paths = pd.read_csv(subdirs['temp_dir']+'json_file_paths.tsv',delimiter="\t",
                                   index_col=None,header=None)

# Parse with the json module. The loop variable used to be called `json`,
# which replaced the imported module for the rest of the session.
control_rows = []
for json_file_path in list(json_file_paths[0]):
    with open(json_file_path, encoding='utf-8') as json_handle:
        experiment_record = json.load(json_handle)
    possible_controls = experiment_record['possible_controls']
    # only the first listed control is used; '' marks "no control"
    control_accession = possible_controls[0]['accession'] if len(possible_controls)>0 else ''
    control_rows.append([experiment_record['accession'],control_accession])
controls = pd.DataFrame(control_rows,columns= ['File dataset','File dataset controls'])
controls['File dataset'] = '/experiments/'+controls['File dataset']+'/'

ENCODE_metadata = pd.merge(ENCODE_metadata,controls,how='left',on=['File dataset'])

# ---- 2. which file is read 1 and which is read 2 ----------------------------
ENCODE_metadata['file_json_url'] = """https://www.encodeproject.org/files/"""+ENCODE_metadata['File accession']+"""/?format=json"""
ENCODE_metadata[['file_json_url']].drop_duplicates().to_csv(subdirs['metadata_dir']+'ENCODE_fastq_json_urls.txt', sep=str('\t'),header=False,index=None,quoting=csv.QUOTE_NONE)

fastq_json_dir = subdirs['metadata_dir']+'ENCODE_fastq_json_files/'
os.makedirs(fastq_json_dir, exist_ok=True)

# printed only, see above
command = """wget -i """+subdirs['metadata_dir']+'ENCODE_fastq_json_urls.txt -P '+fastq_json_dir
print(command)

os.system("""find """+fastq_json_dir+""" -name '*=json*' > """+subdirs['temp_dir']+"""json_fastq_file_paths.tsv""")
json_fastq_file_paths = pd.read_csv(subdirs['temp_dir']+'json_fastq_file_paths.tsv',delimiter="\t",
                                   index_col=None,header=None)

file_json_keys = ['accession','paired_with','paired_end','biological_replicates_formatted','read_length']
file_rows = []
for json_file_path in list(json_fastq_file_paths[0]):
    with open(json_file_path, encoding='utf-8') as json_handle:
        file_record = json.load(json_handle)
    # a missing key raises KeyError, as the previous .loc[...] look-up did
    file_rows.append([file_record[key] for key in file_json_keys])
file_metadata = pd.DataFrame(file_rows,columns= ['File accession','paired_with','paired_end','bioreplicate','read_length'])
# '/files/<accession>/' -> '<accession>'
file_metadata['paired_with'] = file_metadata['paired_with'].str.replace('/files/','',regex=False).str.replace('/','',regex=False)

ENCODE_metadata = pd.merge(ENCODE_metadata,file_metadata,how='left',on=['File accession'])

def get_sample(x):
    """Return the sample name of a file row: both mate accessions, sorted and joined by '_'."""
    return '_'.join(sorted([x['File accession'],x['paired_with']]))
ENCODE_metadata['sample'] = ENCODE_metadata.apply(get_sample,axis=1)

ENCODE_metadata['File dataset'] = ENCODE_metadata['File dataset'].str.replace('/experiments/','',regex=False).str.replace('/','',regex=False)

# ---- 3. pair experiment samples with their control samples ------------------
# rows with a 'File target' are treated as perturbation experiments, rows
# without one as controls
EXPS = ENCODE_metadata.loc[~ENCODE_metadata['File target'].isna()].reset_index(drop=True)
EXPS['experiment_id'] = EXPS['File dataset']+'_'+EXPS['File dataset controls']+';'+EXPS['Biosample term name']+';'+EXPS['File target']

CONTROLS = ENCODE_metadata.loc[ENCODE_metadata['File target'].isna()].reset_index(drop=True)
CONTROLS['File dataset controls'] = CONTROLS['File dataset']
# 'sample_x' is the experiment sample, 'sample_y' a sample of its control dataset
tmp = pd.merge(EXPS[['sample','experiment_id','Biosample term name','File target','File dataset controls']],CONTROLS[['File dataset controls','sample']],how='inner',on='File dataset controls')

# long format: one row per (sample, experiment_id), for experiment and control samples alike
df = pd.melt(tmp,id_vars=['experiment_id','Biosample term name','File target'],value_vars=['sample_x','sample_y'],value_name='sample')[['sample','experiment_id','Biosample term name','File target']].drop_duplicates()

# ---- 4. start samples table --------------------------------------------------
# prepare the start samples table in which rows correspond to samples (like in SRA metadata) and experiment ids are provided
read1_files = ENCODE_metadata.loc[ENCODE_metadata['paired_end']=='1'][['sample','read_length','File target','bioreplicate','Assay term name','File download URL']].rename(columns={'File download URL':'fq1','File target':'EXP_CTL'})
read2_files = ENCODE_metadata.loc[ENCODE_metadata['paired_end']=='2'][['sample','File download URL']].rename(columns={'File download URL':'fq2'})
encode_start_samples = pd.merge(df,read1_files,how='left',on='sample')
encode_start_samples = pd.merge(encode_start_samples,read2_files,how='left',on='sample')

encode_start_samples = encode_start_samples[['sample','fq1','fq2','read_length','bioreplicate','EXP_CTL','experiment_id','Biosample term name','File target','Assay term name']]
# 'True' / 'False' strings (or 'nan' when the assay name is missing, which is left untouched)
is_shrna = encode_start_samples['Assay term name'].str.contains('shRNA').astype('str')
encode_start_samples['experiment_id'] = encode_start_samples['experiment_id']+';'+is_shrna.replace({'True':'KD','False':'KO'})
encode_start_samples['assay'] = is_shrna.replace({'True':'shRNA','False':'CRISPR'})
# EXP_CTL holds the File target of the sample's own files: missing -> control sample
encode_start_samples['EXP_CTL'] = encode_start_samples['EXP_CTL'].isna().map({True:'CTL',False:'EXP'})
encode_start_samples = encode_start_samples.sort_values(['experiment_id','EXP_CTL']).reset_index(drop=True)
encode_start_samples = encode_start_samples.rename(columns = {'File target':'targeted_gene','Biosample term name':'cell_line'})

encode_start_samples['genome_file'] = file_paths['human_genome_file']
encode_start_samples['gtf_file'] = file_paths['human_enriched_annotation_file']
encode_start_samples['organism'] = 'human'
encode_start_samples = encode_start_samples[['sample','fq1','fq2','read_length','organism','genome_file','gtf_file','experiment_id','targeted_gene','bioreplicate','EXP_CTL','cell_line','assay']]

encode_start_samples.to_csv(subdirs['metadata_dir']+'encode_start_samples.tsv', sep=str('\t'),header=True,index=None,quoting=csv.QUOTE_NONE)

wget -i /scicore/home/zavolan/GROUP/RBP_perturbational_networks/metadata/ENCODE_json_urls.txt -P /scicore/home/zavolan/GROUP/RBP_perturbational_networks/metadata/ENCODE_json_files/
wget -i /scicore/home/zavolan/GROUP/RBP_perturbational_networks/metadata/ENCODE_fastq_json_urls.txt -P /scicore/home/zavolan/GROUP/RBP_perturbational_networks/metadata/ENCODE_fastq_json_files/


In [21]:
# number of distinct sample names (a missing name counts as one value)
encode_start_samples['sample'].nunique(dropna=False)

1060

# Prepare .yaml config file

In [155]:
# Load the default workflow config, point it at this run's inputs and outputs,
# and save it as the config of workflow run WF_version.

WF_version = 'v1'
wf_run_dir = subdirs['wf_runs_dir']+WF_version+'/'
wf_run_output_dir = wf_run_dir+'output/'

yaml = ruamel.yaml.YAML()
yaml.preserve_quotes = True
with open(subdirs['wf_dir']+'config.yaml') as f_read:
    data = yaml.load(f_read)

data['samples_file'] = subdirs['metadata_dir']+'encode_start_samples.tsv'
data['output_dir'] = wf_run_output_dir
data['local_log'] = wf_run_output_dir+'local_log/'
data['cluster_log'] = wf_run_output_dir+'cluster_log/'

# also creates wf_run_dir, which has to exist before the config is written
os.makedirs(wf_run_output_dir, exist_ok=True)

with open(wf_run_dir+'config.yaml','w') as f_write:
    yaml.dump(data, f_write)

In [161]:
# display the cluster-log directory of workflow run WF_version
subdirs['wf_runs_dir']+WF_version+'/output/cluster_log/'

'/scicore/home/zavolan/GROUP/RBP_perturbational_networks/wf_runs/v1/output/cluster_log/'

# Run main WF

In [32]:
# display the configured 'wf_dir' path
subdirs['wf_dir']

'/scicore/home/zavolan/mirono0000/Projects/RBP_perturbational_networks/WF/'

In [167]:
WF_version = 'v1'

# Job submission template handed to snakemake's --cluster option; the
# {cluster.*} fields are left as literal text in the command string.
sbatch_template = ' '.join([
    'sbatch',
    '--cpus-per-task={cluster.threads}',
    '--mem={cluster.mem}',
    '--qos={cluster.queue}',
    '--partition={cluster.partition}',
    '--time={cluster.time}',
    '--output={cluster.out}',
])

# Directories bound into the singularity containers.
singularity_binds = subdirs['main_project_dir']+','+subdirs['human_annotation_dir']

# The command is only assembled and displayed here, it is not executed.
command = ' '.join([
    'snakemake',
    '--snakefile Snakefile',
    '--configfile '+subdirs['wf_runs_dir']+WF_version+'/config.yaml',
    '--printshellcmds',
    '--use-conda --conda-frontend conda',
    '--use-singularity',
    '--singularity-args "--bind '+singularity_binds+'"',
    '--cluster-config cluster.json',
    '--cores 500',
    '--local-cores 10',
    '--jobs 100',
    '--latency-wait 60',
    '--cluster "'+sbatch_template+'"',
    '--nolock',
    '-np',
])
command

'snakemake --snakefile Snakefile --configfile /scicore/home/zavolan/GROUP/RBP_perturbational_networks/wf_runs/v1/config.yaml --printshellcmds --use-conda --conda-frontend conda --use-singularity --singularity-args "--bind /scicore/home/zavolan/GROUP/RBP_perturbational_networks/,/scicore/home/zavolan/GROUP/Genomes/homo_sapiens/" --cluster-config cluster.json --cores 500 --local-cores 10 --jobs 100 --latency-wait 60 --cluster "sbatch --cpus-per-task={cluster.threads} --mem={cluster.mem} --qos={cluster.queue} --partition={cluster.partition} --time={cluster.time} --output={cluster.out}" --nolock -np'

In [None]:
# Shell scratch (not Python): s = length of line 2 of the gzipped FASTQ,
# i.e. of the first read's sequence line.
s=$(zcat /scicore/home/zavolan/GROUP/StefanieCLIP/aleksei/output/samples/INTS11C_replicate10_exp5/map_genome/read_categories/2_um/INTS11C_replicate10_exp5.2_um.fastq.gz | head -n 2 | tail -n 1 | awk '{print length}')
# l = min(s, 150)
l=$((s<150 ? s : 150))

In [4]:
# Scratch: a picard FilterSamReads command (FILTER=includeReadList with a
# read-name list file) as a single-line string; it is displayed, not executed.
"""picard FilterSamReads I=/scicore/home/zavolan/GROUP/StefanieCLIP/aleksei/output/samples/SRR15070640/map_genome/SRR15070640.dedup.sorted.indexed.bam \
O=/scicore/home/zavolan/GROUP/StefanieCLIP/aleksei/output/samples/SRR15070640/map_genome/read_categories/5_um/picard_test.sam \
READ_LIST_FILE=/scicore/home/zavolan/GROUP/StefanieCLIP/aleksei/output/samples/SRR15070640/map_genome/read_categories/5_um/SRR15070640.5_um.read_names.txt \
FILTER=includeReadList"""

'picard FilterSamReads I=/scicore/home/zavolan/GROUP/StefanieCLIP/aleksei/output/samples/SRR15070640/map_genome/SRR15070640.dedup.sorted.indexed.bam O=/scicore/home/zavolan/GROUP/StefanieCLIP/aleksei/output/samples/SRR15070640/map_genome/read_categories/5_um/picard_test.sam READ_LIST_FILE=/scicore/home/zavolan/GROUP/StefanieCLIP/aleksei/output/samples/SRR15070640/map_genome/read_categories/5_um/SRR15070640.5_um.read_names.txt FILTER=includeReadList'

In [7]:
# Command line for get_genome_segmentations.py on the mouse annotation.
# It is only assembled and displayed here, not executed.
clip_output_dir = '/scicore/home/zavolan/GROUP/StefanieCLIP/aleksei/output/'
mouse_transcriptome_dir = clip_output_dir+'transcriptome/mouse/'

segmentation_args = [
    ('--input_gtf', '/scicore/home/zavolan/GROUP/Genomes/mus_musculus/enriched.gencode.vM32.annotation.gtf'),
    ('--input_genome_fai', clip_output_dir+'genome_indices/mouse/genome.fai'),
    ('--output_exon_intron_gs', mouse_transcriptome_dir+'exon_intron_genome_segmentation.bed'),
    ('--output_binned_gs', mouse_transcriptome_dir+'binned_genome_segmentation.bed'),
    ('--bin_size', '10'),
    ('--temp_dir', mouse_transcriptome_dir+'temp'),
    ('--output_modified_gtf', mouse_transcriptome_dir+'modified_annotation.gtf'),
    ('--gene_flank', '1000'),
]

command = ' '.join(
    ['python /scicore/home/zavolan/mirono0000/Projects/bCLIP/bclip_workflow/scripts/get_genome_segmentations.py']
    + [flag+' '+value for flag, value in segmentation_args]
)
command

'python /scicore/home/zavolan/mirono0000/Projects/bCLIP/bclip_workflow/scripts/get_genome_segmentations.py --input_gtf /scicore/home/zavolan/GROUP/Genomes/mus_musculus/enriched.gencode.vM32.annotation.gtf --input_genome_fai /scicore/home/zavolan/GROUP/StefanieCLIP/aleksei/output/genome_indices/mouse/genome.fai --output_exon_intron_gs /scicore/home/zavolan/GROUP/StefanieCLIP/aleksei/output/transcriptome/mouse/exon_intron_genome_segmentation.bed --output_binned_gs /scicore/home/zavolan/GROUP/StefanieCLIP/aleksei/output/transcriptome/mouse/binned_genome_segmentation.bed --bin_size 10 --temp_dir /scicore/home/zavolan/GROUP/StefanieCLIP/aleksei/output/transcriptome/mouse/temp --output_modified_gtf /scicore/home/zavolan/GROUP/StefanieCLIP/aleksei/output/transcriptome/mouse/modified_annotation.gtf --gene_flank 1000'

In [None]:
# Shell scratch (not Python): convert the alignments of one sample to BED records.
bedtools bamtobed -split -cigar -i /scicore/home/zavolan/GROUP/StefanieCLIP/aleksei/output/samples/NRDE2_replicate3_exp10/map_genome/NRDE2_replicate3_exp10.Aligned.sortedByCoord.out.bam

In [None]:
# Shell scratch (not Python); {input.bam} and {output.cigar_frequencies} are
# unfilled template fields - presumably from a Snakemake rule, confirm.
# Steps: collect the distinct values of column 7 per value of column 4, drop
# rows whose collected value contains ',' or 'N', then count rows per value.
set +o pipefail; bedtools bamtobed -cigar -i {input.bam} | bedtools groupby -g 4 -c 7 -o distinct | awk '$2 !~ /,/' | awk '$2 !~ /N/' | bedtools groupby -g 2 -c 2 -o count > {output.cigar_frequencies}

In [22]:
# Split the reads of one sample into categories (d_cat x 'um'/'mm') and write,
# per category, a file of read names and a file of read weights.
grouped_bed_file_path = '/scicore/home/zavolan/GROUP/StefanieCLIP/aleksei/output/samples/RBC_replicate1/map_genome/RBC_replicate1.dedup.sorted.indexed.grouped.bed'
bed_file_path = '/scicore/home/zavolan/GROUP/StefanieCLIP/aleksei/output/samples/RBC_replicate1/map_genome/RBC_replicate1.dedup.sorted.indexed.bed'

outdir = os.path.dirname(grouped_bed_file_path)+'/read_categories/'
sample_id = os.path.basename(grouped_bed_file_path).replace('.dedup.sorted.indexed.grouped.bed','')

d_cats = [0,1,2,3,4,5]
mm_modes = ['um','mm']

read_categories = list(itertools.product(d_cats,mm_modes)) # cartesian product

# NOTE(review): the bedtools groupby call below originally carried the
# unresolved angle-bracket placeholders QNAME and w instead of column numbers.
# The values here assume that bed_file_path has the read name in its 4th
# column: `bedtools intersect -wo` prints the 6 columns of -a first, so the
# weight of -a is column 5 and the read name of -b is column 6+4 = 10.
# Confirm against the real file before trusting the output.
# Also note that bedtools groupby only merges consecutive lines with equal keys.
qname_col = 10
weight_col = 5

grouped_bed_file = pd.read_csv(grouped_bed_file_path,delimiter="\t",index_col=None,header=None)
grouped_bed_file.columns = [0,1,2,'name','w',5,'d','wd','d_cat']

for read_category in read_categories:
    d_cat, mm_mode = read_category
    cat_name = str(d_cat)+'_'+mm_mode
    out_subdir = outdir+cat_name+'/'
    os.makedirs(out_subdir, exist_ok=True)

    output_read_names_path = out_subdir+sample_id+'.'+cat_name+'.read_names.txt'
    output_read_weights_path = out_subdir+sample_id+'.'+cat_name+'.read_weights.txt'

    # 'um' selects weight == 1, 'mm' weight < 1. The selection runs on
    # grouped_bed_file, the frame loaded above that has 'd_cat' and 'w';
    # the previously used name `bed_file` was never defined.
    in_d_cat = grouped_bed_file['d_cat']==d_cat
    if mm_mode=='um':
        cur_reads = grouped_bed_file.loc[in_d_cat&(grouped_bed_file['w']==1)]
    else:
        cur_reads = grouped_bed_file.loc[in_d_cat&(grouped_bed_file['w']<1)]
    if len(cur_reads)==0:
        # placeholder files for an empty category
        with open(output_read_names_path,'w') as names_handle:
            names_handle.write('empty\n')
        with open(output_read_weights_path,'w') as weights_handle:
            weights_handle.write('0\n')
        continue
    cur_reads = cur_reads.sort_values([0,1,2]).reset_index(drop=True)

    temp1_path = out_subdir+'temp1.bed'
    temp2_path = out_subdir+'temp2.bed'
    # no header row: the column labels are not a BED record
    cur_reads[[0,1,2,'name','w',5]].to_csv(temp1_path, sep=str('\t'),header=False,index=None)

    # reads of bed_file_path with exactly the same interval and strand as a
    # selected record (-f 1.0 -r -s), reduced to (read name, weight) pairs
    command = 'bedtools intersect -a '+temp1_path+' -b '+bed_file_path+' -wo -f 1.0 -r -s | bedtools groupby -g '+str(qname_col)+' -c '+str(weight_col)+' -o first > '+temp2_path
    subprocess.check_output(command, shell=True)

    subprocess.check_output('cut -f1 '+temp2_path+' > '+output_read_names_path, shell=True)
    subprocess.check_output('cut -f2 '+temp2_path+' > '+output_read_weights_path, shell=True)

    os.remove(temp2_path)
    os.remove(temp1_path)

'/scicore/home/zavolan/GROUP/StefanieCLIP/aleksei/output/samples/RBC_replicate1/map_genome/read_categories/'

In [23]:
# display the (d_cat, mode) combinations
read_categories

[(0, 'um'),
 (0, 'mm'),
 (1, 'um'),
 (1, 'mm'),
 (2, 'um'),
 (2, 'mm'),
 (3, 'um'),
 (3, 'mm'),
 (4, 'um'),
 (4, 'mm'),
 (5, 'um'),
 (5, 'mm')]

In [18]:
# building bracken dbs for all lengths present
# NOTE(review): subdirs['wf_output_dir'] is not set in the path-configuration
# cell shown at the top of this notebook - confirm where it is defined.
kraken_db = '/scicore/home/zavolan/GROUP/Genomes/KRAKEN_DB/standard/'
start_samples = pd.read_csv(subdirs['metadata_dir']+'start_samples.tsv',delimiter="\t",index_col=None,header=0)

# one row per distinct read length; 'id' is the SLURM array task that handles it
read_lengths_file_name = subdirs['temp_dir']+'read_lengths.tsv'
read_lengths = pd.read_csv(subdirs['wf_output_dir']+'misc/max_read_lengths.tsv',delimiter="\t",index_col=None,header=0)
read_lengths = read_lengths[['max_read_length_estimate']].drop_duplicates().rename(columns = {'max_read_length_estimate':'index_size'}).reset_index(drop=True)

read_lengths['id'] = read_lengths.index+1
read_lengths[['id','index_size']].to_csv(read_lengths_file_name, sep=str('\t'),header=False,index=None)

max_node_mem = 512
mem = 400
cpus = min(15,int(128/((max_node_mem*0.8)/mem)))
print(str(cpus))

# The script text is unchanged from the previous version of this cell (it has
# no trailing newline); it is assembled line by line and written inside a
# `with` block so the file is closed even if the write fails.
sbatch_lines = [
    '#!/bin/bash',
    '  ',
    '#SBATCH --job-name=bracken_build',
    '#SBATCH --time=5:00:00',
    '#SBATCH --qos=6hours',
    '#SBATCH --output='+subdirs['slurm_dir']+'%A_%a.out',
    '#SBATCH --error='+subdirs['slurm_dir']+'%A_%a.err',
    '#SBATCH --cpus-per-task='+str(cpus),
    '#SBATCH --mem='+str(mem)+'G',
    '#SBATCH --array=1-'+str(len(read_lengths))+'%200',
    '',
    'source ~/.bashrc',
    '',
    # each array task picks the read length whose id equals its task id
    "read_length=$(awk -v i=$SLURM_ARRAY_TASK_ID '{ if ($1 == i) { print $2} }' "+read_lengths_file_name+")",
    'echo $read_length',
    'bracken-build -d '+kraken_db+' -t '+str(cpus)+' -k 35 -l $read_length',
]

with open(subdirs['scripts_dir']+'bracken_build.sbatch', "w") as f:
    f.write('\n'.join(sbatch_lines))

# the script is submitted by hand; only the command is printed
print('sbatch '+subdirs['scripts_dir']+'bracken_build.sbatch')

15
sbatch /scicore/home/zavolan/GROUP/StefanieCLIP/aleksei/scripts/bracken_build.sbatch


In [4]:
# do kraken and bracken for unmapped reads
# NOTE(review): subdirs['wf_output_dir'] is not set in the path-configuration
# cell shown at the top of this notebook - confirm where it is defined.
kraken_db = '/scicore/home/zavolan/GROUP/Genomes/KRAKEN_DB/standard/'

start_samples = pd.read_csv(subdirs['metadata_dir']+'start_samples.tsv',delimiter="\t",index_col=None,header=0)
# get length estimates for samples (first value of each sample's max_read_length file)
start_samples['index_size'] = [
    pd.read_csv(subdirs['wf_output_dir']+'samples/'+sample+'/read_length/'+sample+'.max_read_length.txt',header=None).loc[0,0]
    for sample in list(start_samples['name'])
]
start_samples = start_samples.rename(columns={'name':'sample'})

# This file first receives the raw `find` output and is then overwritten with
# the per-task table that the sbatch script reads.
sample_file_name = subdirs['temp_dir']+'unmapped_fastq_file_paths.tsv'
fastq_suffix = '.Unmapped.out.fastq.gz'

os.system("""find """+subdirs['wf_output_dir']+""" -name '*"""+fastq_suffix+"""' > """+sample_file_name)
fastq_file_paths = pd.read_csv(sample_file_name,delimiter="\t",
                                   index_col=None,header=None)
fastq_file_paths.columns = ['path']
path_parts = fastq_file_paths['path'].str.split('map_genome/',expand=True)
fastq_file_paths['output_dir'] = path_parts[0]+'kraken/'
# regex=False: the suffix is literal text, its dots must not act as wildcards
fastq_file_paths['sample'] = path_parts[1].str.replace(fastq_suffix,'',regex=False)
fastq_file_paths = pd.merge(fastq_file_paths,start_samples[['sample','index_size']].rename(columns={'index_size':'expected_read_length'}),how='inner',on='sample')
output_prefix = fastq_file_paths['output_dir']+fastq_file_paths['sample']
fastq_file_paths['kraken_output_file'] = output_prefix+'.unmapped.kraken'
fastq_file_paths['kreport_output_file'] = output_prefix+'.unmapped.kreport'
fastq_file_paths['bracken_output_file'] = output_prefix+'.unmapped.bracken'
fastq_file_paths['id'] = fastq_file_paths.index+1

# Column order of the table; column k (1-based) is read in the sbatch script as awk field $k.
task_table_columns = ['id','path','output_dir','kraken_output_file','kreport_output_file','bracken_output_file','expected_read_length']
fastq_file_paths[task_table_columns].to_csv(sample_file_name, sep=str('\t'),header=False,index=None)

max_node_mem = 512
mem = 100
cpus = int(128/((max_node_mem*0.8)/mem))
print(str(cpus))

# shell variable -> awk field of the table row whose id equals the array task id
task_variables = [('fastq_file_path',2),('output_dir',3),('kraken_output_file',4),('kreport_output_file',5),('bracken_output_file',6),('expected_read_length',7)]
lookup_lines = [
    variable+"=$(awk -v i=$SLURM_ARRAY_TASK_ID '{ if ($1 == i) { print $"+str(field)+"} }' "+sample_file_name+")"
    for variable, field in task_variables
]

kraken_command = 'kraken2 --threads '+str(cpus)+' --db '+kraken_db+' --report $kreport_output_file $fastq_file_path > $kraken_output_file'
bracken_command = 'bracken -d '+kraken_db+' -i $kreport_output_file -o $bracken_output_file -r $expected_read_length -l S -t 1'

# The script text is unchanged from the previous version of this cell.
sbatch_lines = [
    '#!/bin/bash',
    '  ',
    '#SBATCH --job-name=kraken_unmapped_reads',
    '#SBATCH --time=1:00:00',
    '#SBATCH --qos=6hours',
    '#SBATCH --output='+subdirs['slurm_dir']+'%A_%a.out',
    '#SBATCH --error='+subdirs['slurm_dir']+'%A_%a.err',
    '#SBATCH --cpus-per-task='+str(cpus),
    '#SBATCH --mem='+str(mem)+'G',
    '#SBATCH --array=1-'+str(len(fastq_file_paths))+'%200',
    '',
    'source ~/.bashrc',
    '',
] + lookup_lines + [
    'echo $fastq_file_path',
    'echo $output_dir',
    'mkdir -p $output_dir',
    '',
    # bracken only runs when kraken2 succeeded
    kraken_command+' && '+bracken_command,
    '',
]

# `with` closes the file even if the write fails
with open(subdirs['scripts_dir']+'kraken_unmapped_reads.sbatch', "w") as f:
    f.write('\n'.join(sbatch_lines))

# the script is submitted by hand; only the command is printed
print('sbatch '+subdirs['scripts_dir']+'kraken_unmapped_reads.sbatch')

31
sbatch /scicore/home/zavolan/GROUP/StefanieCLIP/aleksei/scripts/kraken_unmapped_reads.sbatch


In [5]:
# do kraken and bracken for mapped reads
# NOTE(review): subdirs['wf_output_dir'] is not set in the path-configuration
# cell shown at the top of this notebook - confirm where it is defined.
kraken_db = '/scicore/home/zavolan/GROUP/Genomes/KRAKEN_DB/standard/'

start_samples = pd.read_csv(subdirs['metadata_dir']+'start_samples.tsv',delimiter="\t",index_col=None,header=0)
# get length estimates for samples (first value of each sample's max_read_length file)
start_samples['index_size'] = [
    pd.read_csv(subdirs['wf_output_dir']+'samples/'+sample+'/read_length/'+sample+'.max_read_length.txt',header=None).loc[0,0]
    for sample in list(start_samples['name'])
]
start_samples = start_samples.rename(columns={'name':'sample'})

# This file first receives the raw `find` output and is then overwritten with
# the per-task table that the sbatch script reads.
sample_file_name = subdirs['temp_dir']+'mapped_fastq_file_paths.tsv'
fastq_suffix = '.Aligned.sortedByCoord.out.fastq.gz'

os.system("""find """+subdirs['wf_output_dir']+""" -name '*"""+fastq_suffix+"""' > """+sample_file_name)
fastq_file_paths = pd.read_csv(sample_file_name,delimiter="\t",
                                   index_col=None,header=None)
fastq_file_paths.columns = ['path']
path_parts = fastq_file_paths['path'].str.split('map_genome/',expand=True)
fastq_file_paths['output_dir'] = path_parts[0]+'kraken/'
# regex=False: the suffix is literal text, its dots must not act as wildcards
fastq_file_paths['sample'] = path_parts[1].str.replace(fastq_suffix,'',regex=False)
fastq_file_paths = pd.merge(fastq_file_paths,start_samples[['sample','index_size']].rename(columns={'index_size':'expected_read_length'}),how='inner',on='sample')
output_prefix = fastq_file_paths['output_dir']+fastq_file_paths['sample']
fastq_file_paths['kraken_output_file'] = output_prefix+'.mapped.kraken'
fastq_file_paths['kreport_output_file'] = output_prefix+'.mapped.kreport'
fastq_file_paths['bracken_output_file'] = output_prefix+'.mapped.bracken'
fastq_file_paths['id'] = fastq_file_paths.index+1

# Column order of the table; column k (1-based) is read in the sbatch script as awk field $k.
task_table_columns = ['id','path','output_dir','kraken_output_file','kreport_output_file','bracken_output_file','expected_read_length']
fastq_file_paths[task_table_columns].to_csv(sample_file_name, sep=str('\t'),header=False,index=None)

max_node_mem = 512
mem = 100
cpus = int(128/((max_node_mem*0.8)/mem))
print(str(cpus))

# shell variable -> awk field of the table row whose id equals the array task id
task_variables = [('fastq_file_path',2),('output_dir',3),('kraken_output_file',4),('kreport_output_file',5),('bracken_output_file',6),('expected_read_length',7)]
lookup_lines = [
    variable+"=$(awk -v i=$SLURM_ARRAY_TASK_ID '{ if ($1 == i) { print $"+str(field)+"} }' "+sample_file_name+")"
    for variable, field in task_variables
]

kraken_command = 'kraken2 --threads '+str(cpus)+' --db '+kraken_db+' --report $kreport_output_file $fastq_file_path > $kraken_output_file'
bracken_command = 'bracken -d '+kraken_db+' -i $kreport_output_file -o $bracken_output_file -r $expected_read_length -l S -t 1'

# The script text is unchanged from the previous version of this cell.
sbatch_lines = [
    '#!/bin/bash',
    '  ',
    '#SBATCH --job-name=kraken_mapped_reads',
    '#SBATCH --time=1:00:00',
    '#SBATCH --qos=6hours',
    '#SBATCH --output='+subdirs['slurm_dir']+'%A_%a.out',
    '#SBATCH --error='+subdirs['slurm_dir']+'%A_%a.err',
    '#SBATCH --cpus-per-task='+str(cpus),
    '#SBATCH --mem='+str(mem)+'G',
    '#SBATCH --array=1-'+str(len(fastq_file_paths))+'%200',
    '',
    'source ~/.bashrc',
    '',
] + lookup_lines + [
    'echo $fastq_file_path',
    'echo $output_dir',
    'mkdir -p $output_dir',
    '',
    # bracken only runs when kraken2 succeeded
    kraken_command+' && '+bracken_command,
    '',
]

# `with` closes the file even if the write fails
with open(subdirs['scripts_dir']+'kraken_mapped_reads.sbatch', "w") as f:
    f.write('\n'.join(sbatch_lines))

# the script is submitted by hand; only the command is printed
print('sbatch '+subdirs['scripts_dir']+'kraken_mapped_reads.sbatch')

31
sbatch /scicore/home/zavolan/GROUP/StefanieCLIP/aleksei/scripts/kraken_mapped_reads.sbatch
