**Sections:**<a name="contents"></a>

[Control data download](#control_data_download)

[Create symbolic links for all experimental fastq files](#symb_link_input)

[Quality control](#QC_analysis)

In [2]:
from __future__ import (absolute_import, division,
                        print_function, unicode_literals)

import warnings
warnings.simplefilter('ignore')

# general purpose packages
import pandas as pd
import numpy as np
import os
import json
import time
import re
import csv
import subprocess
import sys

import scipy.stats as stats
import statsmodels.stats as smstats
import statsmodels.api as sm
from statsmodels.stats.multitest import multipletests

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

import umap
import rpy2

from multiprocessing import Process, Manager, Pool
import multiprocessing
from functools import partial

from collections import Counter

import seaborn as sns; sns.set()

import matplotlib
matplotlib.style.use('seaborn')
matplotlib.use('Agg')
import matplotlib.pyplot as plt
matplotlib.rcParams['backend'] = "Qt5Agg"
import matplotlib.ticker as ticker
from matplotlib.ticker import FuncFormatter

from IPython.display import display, Image

from adjustText import adjust_text
import builtins
%matplotlib inline

# for normalization
from sklearn.linear_model import QuantileRegressor

# for working with yaml files
import ruamel.yaml

# for working with genomic intervals
import pyranges as pr

In [3]:
def get_pvalue_star(pval, thr=0.05):
    """Convert a p-value into a significance-star label.

    Parameters
    ----------
    pval : float
        P-value to annotate.
    thr : float, optional
        Significance threshold (default 0.05). Any positive threshold is
        accepted, not only 0.05 and 0.1.

    Returns
    -------
    str
        ``"ns"`` when ``pval`` is not strictly below ``thr``; otherwise
        ``"***"`` for ``pval < 0.001``, ``"**"`` for ``pval < 0.01`` and
        ``"*"`` for the remaining significant values.
    """
    # `not pval < thr` rather than `pval >= thr`, so that NaN is labelled "ns"
    if not pval < thr:
        return "ns"
    if pval < 0.001:
        return "***"
    if pval < 0.01:
        return "**"
    return "*"

In [4]:
# Directory layout of the project. Every value of `subdirs` is a directory
# path with a trailing '/', so file names are appended by concatenation.
subdirs = {
    'main_project_dir': '/scicore/home/zavolan/GROUP/StefanieCLIP/',
    'wf_dir': '/scicore/home/zavolan/mirono0000/Projects/bCLIP/bclip_workflow/',
    'zarp_dir': '/scicore/home/zavolan/mirono0000/libs/zarp/',
    'htsinfer_dir': '/scicore/home/zavolan/mirono0000/libs/htsinfer/',
}
subdirs['zarp_config_dir'] = subdirs['zarp_dir']+'config/'
subdirs['mouse_annotation_dir'] = '/scicore/home/zavolan/GROUP/Genomes/mus_musculus/'
subdirs['human_annotation_dir'] = '/scicore/home/zavolan/GROUP/Genomes/homo_sapiens/'

# shared project folder and the working directories below it
subdirs['shared_project_dir'] = subdirs['main_project_dir']+'aleksei/'
subdirs['temp_dir'] = subdirs['shared_project_dir']+'temp_dir/'
subdirs['slurm_dir'] = subdirs['temp_dir']+'slurm/'
subdirs.update({
    dir_key: subdirs['shared_project_dir']+folder
    for dir_key, folder in [
        ('scripts_dir', 'scripts/'),
        ('figures_dir', 'figures/'),
        ('fastq_dir', 'input_fastq/'),
        ('metadata_dir', 'metadata/'),
        ('wf_output_dir', 'output/'),
    ]
})

# Paths to individual files: (key, directory key in `subdirs`, file name
# relative to that directory).
file_paths = {
    path_key: subdirs[dir_key]+file_name
    for path_key, dir_key, file_name in [
        ### genome annotation files
        ('mouse_genome_file', 'mouse_annotation_dir', 'GRCm39.primary_assembly.genome.fa'),
        ('mouse_annotation_file', 'mouse_annotation_dir', 'gencode.vM32.annotation.gtf'),
        ('mouse_RNAcentral_annotation_file', 'mouse_annotation_dir', 'mus_musculus.GRCm39.gff3.gz'),
        ('mouse_prot_coding_gtf', 'mouse_annotation_dir', 'coding.gencode.vM32.annotation.gtf'),
        ('mouse_collapsed_prot_coding_gtf', 'mouse_annotation_dir', 'collapsed.coding.gencode.vM32.annotation.gtf'),
        # added RNA species from RNA central
        ('mouse_enriched_annotation_file', 'mouse_annotation_dir', 'enriched.gencode.vM32.annotation.gtf'),
        ('mouse_collapsed_enriched_annotation_file', 'mouse_annotation_dir', 'collapsed.enriched.gencode.vM32.annotation.gtf'),
        ('human_genome_file', 'human_annotation_dir', 'GRCh38.primary_assembly.genome.fa'),
        ('human_annotation_file', 'human_annotation_dir', 'hg38_v42/gencode.v42.annotation.gtf'),
        ('human_RNAcentral_annotation_file', 'human_annotation_dir', 'hg38_v42/homo_sapiens.GRCh38.gff3.gz'),
        ('human_enriched_annotation_file', 'human_annotation_dir', 'hg38_v42/enriched.gencode.v42.annotation.gtf'),
    ]
}
file_paths['htsinfer_transcripts_file'] = '/scicore/home/zavolan/GROUP/Genomes/htsinfer_deduplicated_transcripts.fasta'

### control data
file_paths['control_metadata_file'] = subdirs['metadata_dir']+'SRA_CONTROL_DATA.tsv'

# create all subdirs in one `mkdir -p` call; the cell displays its exit status
os.system('mkdir -p {}'.format(' '.join(subdirs.values())))

0

# Make enriched gtf file for mouse and human

In [43]:
organisms = ['mouse','human']

for organism in organisms:
    # Index the genome FASTA with samtools; the resulting '<genome>.fai' is
    # read at the end of the loop. An argument list (no shell) keeps paths
    # with spaces or shell metacharacters intact.
    out = subprocess.check_output(['samtools', 'faidx', file_paths[organism+'_genome_file']])

    # Reference annotation: tab-separated, no header row, first 5 lines skipped.
    gtf_df = pd.read_csv(file_paths[organism+'_annotation_file'],delimiter="\t",index_col=None,header=None,skiprows=5)

    # RNAcentral annotation (gzip-compressed, first line skipped).
    rna_central = pd.read_csv(file_paths[organism+'_RNAcentral_annotation_file'],delimiter="\t",index_col=None,header=None,skiprows=1,compression='gzip')
    rna_central[2] = rna_central[2].str.replace('noncoding_exon','exon')
    # NOTE(review): the positional indices [1] and [4] below assume a fixed
    # layout of the attribute column (column 8) — verify against the input file.
    rna_central['gene_biotype'] = rna_central[8].str.split(';type=|;',expand=True)[1]
    rna_central['gene_source'] = 'RNA_central'
    rna_central['gene_id'] = rna_central[8].str.split('ID=|;|:',expand=True)[4]

    # ---- exon records, rewritten with GTF-style attributes ----
    rna_central_exons = rna_central.loc[rna_central[2]=='exon'].copy().reset_index(drop=True)
    # 6th token of the attribute string; computed once and reused for exon_id and exon_number
    exon_label = rna_central_exons[8].str.split('ID=|;|:',expand=True)[5]
    rna_central_exons['exon_id'] = rna_central_exons['gene_id']+'.transcript'+'.'+exon_label
    rna_central_exons['exon_number'] = exon_label.str.split('exon',expand=True).iloc[:, -1]
    # Every term comes from rna_central_exons itself. The original took the
    # final `tag` value from the unfiltered rna_central frame, which was only
    # correct because of index alignment on a constant column.
    rna_central_exons[8] = (
        'gene_id "'+rna_central_exons['gene_id']
        +'"; transcript_id "'+rna_central_exons['gene_id']+'.transcript'
        +'"; exon_number "'+rna_central_exons['exon_number']
        +'"; gene_source "'+rna_central_exons['gene_source']
        +'"; gene_biotype "'+rna_central_exons['gene_biotype']
        +'"; transcript_source "'+rna_central_exons['gene_source']
        +'"; transcript_biotype "'+rna_central_exons['gene_biotype']
        +'"; exon_id "'+rna_central_exons['exon_id']
        +'"; tag "'+rna_central_exons['gene_source']+'";'
    )
    rna_central_exons['order']=3

    # ---- structural transcript key: chromosome, strand and concatenated exon coordinates ----
    # 'sum' on a string column concatenates the per-exon 'start_end,' pieces
    # in row order; the string alias replaces the builtin `sum` callable.
    rna_central_exons['transcript_id'] = rna_central_exons['gene_id']+'.transcript'
    rna_central_exons['exon_coords'] = rna_central_exons[3].astype('str')+'_'+rna_central_exons[4].astype('str')+','
    rna_central_gr_transcripts = rna_central_exons.groupby([0,6,'transcript_id']).agg({'exon_coords':'sum'}).reset_index()
    rna_central_gr_transcripts['transcript_alt_id'] = rna_central_gr_transcripts[0].astype('str')+'_'+rna_central_gr_transcripts[6]+'_'+rna_central_gr_transcripts['exon_coords']

    # same structural key for the transcripts of the reference annotation
    gtf_df_exons = gtf_df.loc[gtf_df[2]=='exon'].reset_index(drop=True)
    gtf_df_exons['transcript_id'] = gtf_df_exons[8].str.split('transcript_id "',expand=True)[1].str.split('"',expand=True)[0]
    gtf_df_exons['exon_coords'] = gtf_df_exons[3].astype('str')+'_'+gtf_df_exons[4].astype('str')+','
    gtf_df_gr_transcripts = gtf_df_exons.groupby([0,6,'transcript_id']).agg({'exon_coords':'sum'}).reset_index()
    gtf_df_gr_transcripts['transcript_alt_id'] = gtf_df_gr_transcripts[0].astype('str')+'_'+gtf_df_gr_transcripts[6]+'_'+gtf_df_gr_transcripts['exon_coords']
    ensembl_transcripts = list(gtf_df_gr_transcripts['transcript_alt_id'].unique())

    # when a transcript is present in ensemble and RNA central, prioritize ensemble:
    # keep only RNAcentral transcripts whose structural key is absent from the reference
    preserve_transcripts_list = list(rna_central_gr_transcripts.loc[~rna_central_gr_transcripts['transcript_alt_id'].isin(ensembl_transcripts)]['transcript_id'].unique())
    rna_central_exons = rna_central_exons.loc[rna_central_exons['transcript_id'].isin(preserve_transcripts_list)].reset_index(drop=True)
    rna_central['transcript_id'] = rna_central['gene_id']+'.transcript'
    rna_central = rna_central.loc[rna_central['transcript_id'].isin(preserve_transcripts_list)].reset_index(drop=True)

    rna_central_exons = rna_central_exons[list(range(0,9))+['gene_id','order']]

    # ---- transcript records ----
    rna_central_transcripts = rna_central.loc[rna_central[2]=='transcript'].copy().reset_index(drop=True)
    rna_central_transcripts[8] = 'gene_id "'+rna_central_transcripts['gene_id']+'"; transcript_id "'+rna_central_transcripts['gene_id']+'.transcript'+'"; gene_source "'+rna_central_transcripts['gene_source']+'"; gene_biotype "'+rna_central_transcripts['gene_biotype']+'"; transcript_source "'+rna_central_transcripts['gene_source']+'"; transcript_biotype "'+rna_central_transcripts['gene_biotype']+'"; tag "'+rna_central_transcripts['gene_source']+'";'
    rna_central_transcripts['order']=2
    rna_central_transcripts = rna_central_transcripts[list(range(0,9))+['gene_id','order']]

    # ---- gene records: one per transcript record, relabelled as 'gene' ----
    rna_central_genes = rna_central.loc[rna_central[2]=='transcript'].copy().reset_index(drop=True)
    rna_central_genes[2] = 'gene'
    rna_central_genes[8] = 'gene_id "'+rna_central_genes['gene_id']+'"; gene_source "'+rna_central_genes['gene_source']+'"; gene_biotype "'+rna_central_genes['gene_biotype']+'";'
    rna_central_genes['order']=1
    rna_central_genes = rna_central_genes[list(range(0,9))+['gene_id','order']]

    # Sort so that each gene is followed by its transcript and then its exons
    # (order 1, 2, 3), then drop the helper columns. `columns=` replaces the
    # positional axis argument, which pandas 2.0 no longer accepts.
    rna_central_gtf = pd.concat([rna_central_genes,rna_central_transcripts,rna_central_exons]).sort_values(['gene_id','order']).reset_index(drop=True).drop(columns=['gene_id','order'])

    # save standard annotation enriched with RNA central
    enriched_gtf = pd.concat([gtf_df,rna_central_gtf]).reset_index(drop=True)
    genome_fai = pd.read_csv(file_paths[organism+'_genome_file']+'.fai',delimiter="\t",index_col=None,header=None)
    # inner merge on the sequence name keeps only records on sequences listed in the .fai index
    enriched_gtf = pd.merge(genome_fai[[0]],enriched_gtf,how='inner',on=0)
    enriched_gtf.to_csv(file_paths[organism+'_enriched_annotation_file'], sep=str('\t'),header=False,index=None,quoting=csv.QUOTE_NONE)

# Download control data<a name="control_data_download"></a>

In [6]:
# download CONTROL samples from SRA using zarp:
# write a bash script that activates the `zarp` conda environment and runs
# the zarp SRA-download snakemake rules for the control metadata table.

script_path = subdirs['scripts_dir']+'zarp_download_control_samples.sh'

preambula = \
"""#!/bin/bash
source ~/.bashrc
conda activate zarp
"""

# single-line snakemake call; every path is wrapped in double quotes
command = ' '.join([
    'snakemake',
    '--snakefile="'+subdirs['zarp_dir']+'workflow/rules/sra_download.smk"',
    '--profile="'+subdirs['zarp_dir']+'profiles/local-conda"',
    '--config',
    'samples="'+file_paths['control_metadata_file']+'"',
    'outdir="'+subdirs['fastq_dir']+'"',
    'samples_out="'+subdirs['metadata_dir']+'CONTROL_processed_metadata.tsv'+'"',
    'log_dir="'+subdirs['slurm_dir']+'zarp_download_logs/'+'"',
    'cluster_log_dir="'+subdirs['slurm_dir']+'"',
])

# optional variant kept for reference (disabled):
# command = command.replace(' --snakefile','--list-input-changes --snakefile')+' --forcerun $('+command+')'

# `with` guarantees the script is flushed and closed even if a write fails
with open(script_path, "w") as f:
    f.write(preambula)
    f.write(command)
print(script_path)

/scicore/home/zavolan/GROUP/StefanieCLIP/aleksei/scripts/zarp_download_control_samples.sh


In [10]:
# delete unnecessary .sra files
# `find` is run without a shell and without a return-code check, so the
# search stays best-effort as before. Its listing is still written to
# temp_dir/sra_file_paths.tsv.
sra_listing = subprocess.run(['find', subdirs['fastq_dir'], '-name', '*.sra'],
                             stdout=subprocess.PIPE, universal_newlines=True).stdout
with open(subdirs['temp_dir']+"""sra_file_paths.tsv""", 'w') as listing_file:
    listing_file.write(sra_listing)

sra_file_paths = pd.DataFrame({'path': pd.Series(sra_listing.splitlines(), dtype=object)})

# Nothing found is a valid outcome (e.g. the cell is run twice): skip `rm`
# instead of failing on an empty listing.
if len(sra_file_paths) > 0:
    # argument list: paths containing spaces or shell metacharacters are passed verbatim
    out = subprocess.check_output(['rm', '--']+list(sra_file_paths['path']))

# Create symbolic links for all experimental fastq files<a name="symb_link_input"></a>

In [15]:
# Load the tab-separated description of the experimental samples
# (first line of the file is the header).
experimental_samples = pd.read_csv(
    subdirs['metadata_dir']+'experimental_samples.tsv',
    sep="\t",
    index_col=None,
    header=0,
)

In [16]:
def _find_by_name(root_dir, name_pattern, listing_path):
    """Return the paths reported by ``find root_dir -name name_pattern``.

    The raw listing (one path per line) is also written to ``listing_path``.
    ``find`` is run without a shell, so the arguments need no quoting, and
    its exit status is not checked (best-effort search).
    An empty result yields an empty list.
    """
    listing = subprocess.run(['find', root_dir, '-name', name_pattern],
                             stdout=subprocess.PIPE, universal_newlines=True).stdout
    with open(listing_path, 'w') as listing_file:
        listing_file.write(listing)
    return listing.splitlines()


# unique (lane file, barcode file) pairs referenced by the sample sheet
exp_file_paths = experimental_samples[['lane_file','barcode_file']].drop_duplicates().reset_index(drop=True)

# locate the fastq files below <main_project_dir>/input/ and match them by file name
fastq_file_paths = pd.DataFrame({'lane_file_path': pd.Series(
    _find_by_name(subdirs['main_project_dir']+'input/', '*.fastq.gz',
                  subdirs['temp_dir']+'exper_fastq_file_paths.tsv'),
    dtype=object)})
# basename per path: splitting on '/' with expand=True returned None for
# files that sit at a shallower directory depth than the deepest one
fastq_file_paths['lane_file'] = fastq_file_paths['lane_file_path'].map(os.path.basename)
exp_file_paths = pd.merge(fastq_file_paths,exp_file_paths,how='inner',on=['lane_file'])

# same for the barcode fasta files
barcode_fasta_files = pd.DataFrame({'barcode_file_path': pd.Series(
    _find_by_name(subdirs['main_project_dir']+'input/', '*.fasta',
                  subdirs['temp_dir']+'barcode_fasta_files.tsv'),
    dtype=object)})
barcode_fasta_files['barcode_file'] = barcode_fasta_files['barcode_file_path'].map(os.path.basename)
exp_file_paths = pd.merge(barcode_fasta_files,exp_file_paths,how='inner',on=['barcode_file'])

# Link targets: <fastq_dir>/<lane file without '.fastq.gz'>/<file name>.
# regex=False makes this a literal replacement on every pandas version,
# matching the plain str.replace used in the loop below.
exp_file_paths['wfstart_lane_file_path'] = subdirs['fastq_dir']+exp_file_paths['lane_file'].str.replace('.fastq.gz','',regex=False)+'/'+exp_file_paths['lane_file']
exp_file_paths['wfstart_barcode_file_path'] = subdirs['fastq_dir']+exp_file_paths['lane_file'].str.replace('.fastq.gz','',regex=False)+'/'+exp_file_paths['barcode_file']

for index, row in exp_file_paths.iterrows():
    sample_name = row['lane_file'].replace('.fastq.gz','')
    os.makedirs(subdirs['fastq_dir']+sample_name+'/', exist_ok=True)
    # `ln -f -s` replaces an existing link; argument lists avoid shell quoting issues
    subprocess.check_call(['ln', '-f', '-s', row['lane_file_path'], row['wfstart_lane_file_path']])
    subprocess.check_call(['ln', '-f', '-s', row['barcode_file_path'], row['wfstart_barcode_file_path']])

In [17]:
# Attach the symlink paths to every sample row, matching on the original
# lane/barcode file names; samples without a located file pair are dropped
# by the inner join.
experimental_samples = experimental_samples.merge(
    exp_file_paths.loc[:, ['lane_file','barcode_file','wfstart_lane_file_path','wfstart_barcode_file_path']],
    how='inner',
    on=['lane_file','barcode_file'],
)

In [18]:
# Replace the original file-name columns with the symlink paths.
# `columns=` is used because pandas 2.0 no longer accepts the axis as a
# positional argument of DataFrame.drop.
experimental_samples = (
    experimental_samples
    .drop(columns=['lane_file','barcode_file'])
    .rename(columns={'wfstart_lane_file_path':'lane_file','wfstart_barcode_file_path':'barcode_file'})
)

In [19]:
# All experimental samples use the mouse genome and the enriched mouse
# annotation; afterwards keep only the listed columns, in this order.
experimental_samples = experimental_samples.assign(
    genome_file=file_paths['mouse_genome_file'],
    gtf_file=file_paths['mouse_enriched_annotation_file'],
).loc[:, ['name','lane_name','lane_file','kmer','barcode_file','exp_ctl','batch','condition_name','genome_file','gtf_file']]

In [20]:
# description file for public samples
public_samples = pd.read_csv(
    subdirs['metadata_dir']+'public_samples.tsv',
    sep="\t",
    index_col=None,
    header=0,
)

# batches whose name ends with '_mESC' are mouse, all others human
batch_is_mouse = public_samples['batch'].map(lambda batch: batch.endswith('_mESC'))
public_samples['genome_file'] = np.where(batch_is_mouse, file_paths['mouse_genome_file'], file_paths['human_genome_file'])
public_samples['gtf_file'] = np.where(batch_is_mouse, file_paths['mouse_enriched_annotation_file'], file_paths['human_enriched_annotation_file'])
public_samples['lane_name'] = 'f'+public_samples['name']
public_samples['organism'] = np.where(batch_is_mouse, 'mouse', 'human')

In [21]:
# lane name = 'f' + the part of the barcode file path after the last
# 'barcodes_', without the '.fasta' extension.
# rsplit(...).str[-1] takes the last piece per row, and regex=False makes
# the extension removal a literal replacement on every pandas version.
experimental_samples['lane_name'] = 'f'+(
    experimental_samples['barcode_file']
    .str.rsplit('barcodes_', n=1).str[-1]
    .str.replace('.fasta','',regex=False)
)
experimental_samples['organism'] = 'mouse'

In [22]:
# stack public and experimental samples into one table with a fresh 0..n-1 index
start_samples = pd.concat([public_samples, experimental_samples], ignore_index=True)

In [23]:
# samples without a kmer value get the placeholder 'XXXXXXX'
start_samples['kmer'] = start_samples['kmer'].where(start_samples['kmer'].notna(), 'XXXXXXX')

In [24]:
# write the merged sample table as a tab-separated file with a header row,
# without the index and without quoting
start_samples.to_csv(
    subdirs['metadata_dir']+'start_samples.tsv',
    sep=str('\t'),
    header=True,
    index=False,
    quoting=csv.QUOTE_NONE,
)

In [46]:
# display the full path of the merged sample table
subdirs['metadata_dir']+'start_samples.tsv'

'/scicore/home/zavolan/GROUP/StefanieCLIP/aleksei/metadata/start_samples.tsv'

In [47]:
# display the workflow output directory
subdirs['wf_output_dir']

'/scicore/home/zavolan/GROUP/StefanieCLIP/aleksei/output/'

In [48]:
# display the slurm directory
subdirs['slurm_dir']

'/scicore/home/zavolan/GROUP/StefanieCLIP/aleksei/temp_dir/slurm/'

# Run main WF

In [87]:
# DESTRUCTIVE: recursively removes every directory under the workflow output
# directory that contains a '*.Aligned.sortedByCoord.out.bam' file.
sample_file_name = subdirs['temp_dir']+'bam_file_paths.tsv'

# subdirs['wf_output_dir'] replaces the hard-coded copy of the same path.
# `find` runs without a shell and without a return-code check (best-effort).
bam_listing = subprocess.run(['find', subdirs['wf_output_dir'], '-name', '*.Aligned.sortedByCoord.out.bam'],
                             stdout=subprocess.PIPE, universal_newlines=True).stdout
with open(sample_file_name, 'w') as listing_file:
    listing_file.write(bam_listing)

bam_file_paths = pd.DataFrame({0: pd.Series(bam_listing.splitlines(), dtype=object)})
bam_file_paths['dir'] = bam_file_paths[0].map(os.path.dirname)

# Each directory is passed once: with several BAM files in one directory
# the repeated argument made `rm -r` fail on the already-deleted path.
bam_dirs = list(bam_file_paths['dir'].unique())
if bam_dirs:  # nothing found: nothing to delete
    out = subprocess.check_output(['rm', '-r', '--']+bam_dirs)

In [7]:
# Command line for the main workflow, assembled from its individual options
# and displayed (not executed) by this cell.

# sbatch submission template; the {cluster.*} placeholders are left in the
# string as-is
sbatch_template = ' '.join([
    'sbatch',
    '--cpus-per-task={cluster.threads}',
    '--mem={cluster.mem}',
    '--qos={cluster.queue}',
    '--time={cluster.time}',
    '--output={cluster.out}',
])

command = ' '.join([
    'snakemake',
    '--snakefile Snakefile',
    '--configfile config.yaml',
    '--printshellcmds',
    '--use-singularity',
    '--singularity-args "--bind /scicore/home/zavolan/GROUP/StefanieCLIP/,/scicore/home/zavolan/GROUP/Genomes/"',
    '--cluster-config cluster.json',
    '--cores 500',
    '--local-cores 10',
    '--jobs 100',
    '--cluster "'+sbatch_template+'"',
    '--nolock',
    '-np',
])
command

'snakemake --snakefile Snakefile --configfile config.yaml --printshellcmds --use-singularity --singularity-args "--bind /scicore/home/zavolan/GROUP/StefanieCLIP/,/scicore/home/zavolan/GROUP/Genomes/" --cluster-config cluster.json --cores 500 --local-cores 10 --jobs 100 --cluster "sbatch --cpus-per-task={cluster.threads} --mem={cluster.mem} --qos={cluster.queue} --time={cluster.time} --output={cluster.out}" --nolock -np'

# Quality control analysis<a name="QC_analysis"></a>

In [5]:
# reload the merged sample table from disk (first line is the header)
sample_metadata = pd.read_csv(
    subdirs['metadata_dir']+'start_samples.tsv',
    sep="\t",
    index_col=None,
    header=0,
)

In [6]:
# preview the first rows of the sample table
sample_metadata.head()

Unnamed: 0,name,lane_name,lane_file,kmer,barcode_file,exp_ctl,batch,condition_name,genome_file,gtf_file,organism
0,SRR15070630,fSRR15070630,/scicore/home/zavolan/GROUP/StefanieCLIP/aleks...,XXXXXXX,,exp,public_bCLIP_mESC,NRDE2,/scicore/home/zavolan/GROUP/Genomes/mus_muscul...,/scicore/home/zavolan/GROUP/Genomes/mus_muscul...,mouse
1,SRR15070631,fSRR15070631,/scicore/home/zavolan/GROUP/StefanieCLIP/aleks...,XXXXXXX,,exp,public_bCLIP_mESC,NRDE2,/scicore/home/zavolan/GROUP/Genomes/mus_muscul...,/scicore/home/zavolan/GROUP/Genomes/mus_muscul...,mouse
2,SRR15070632,fSRR15070632,/scicore/home/zavolan/GROUP/StefanieCLIP/aleks...,XXXXXXX,,exp,public_bCLIP_mESC,NRDE2_200AANenddel,/scicore/home/zavolan/GROUP/Genomes/mus_muscul...,/scicore/home/zavolan/GROUP/Genomes/mus_muscul...,mouse
3,SRR15070633,fSRR15070633,/scicore/home/zavolan/GROUP/StefanieCLIP/aleks...,XXXXXXX,,exp,public_bCLIP_mESC,NRDE2_200AANenddel,/scicore/home/zavolan/GROUP/Genomes/mus_muscul...,/scicore/home/zavolan/GROUP/Genomes/mus_muscul...,mouse
4,SRR15070635,fSRR15070635,/scicore/home/zavolan/GROUP/StefanieCLIP/aleks...,XXXXXXX,,exp,public_bCLIP_mESC,Nrde2_D174Rmut,/scicore/home/zavolan/GROUP/Genomes/mus_muscul...,/scicore/home/zavolan/GROUP/Genomes/mus_muscul...,mouse
