In [None]:
from __future__ import print_function
import os.path
import dalmatian as dm
import pandas as pd
import sys
sys.path.insert(0, '../ccle_processing/')
from CCLE_postp_function import *
sys.path.insert(0, '../JKBio/')
import datanalytics as da 
import TerraFunction as terra
%load_ext autoreload
%autoreload 2
%load_ext rpy2.ipython
from taigapy import TaigaClient
tc = TaigaClient()
from wand.image import Image as WImage
import numpy as np
from sklearn.manifold import TSNE
from sklearn.neighbors import KNeighborsClassifier
from bokeh.plotting import *
from bokeh.models import HoverTool
output_notebook()
from collections import OrderedDict

In [None]:
# Namespace / name of the workspace handed to dm.WorkspaceManager.
refnamespace = "broad-firecloud-ccle"
refworkspace = "DepMap_hg38_RNAseq"

# Sample set used for every submission; it doubles as the release label.
samplesetname = "MAX_AMLproject"
release = samplesetname
source = "DFCI"

# Folder and prefix handed to terra.UploadFromFolder.
gcpfolder = 'jkobject'
prefix = 'JK_RNA_AMLproject/'

In [None]:
# Workspace manager built from refnamespace / refworkspace; `refwm` is the
# handle used by the config, submission and sample-set calls below.
refwm = dm.WorkspaceManager(refnamespace, refworkspace)

# Generate sample set from new samples

In [None]:
# Hand the folder/prefix and the workspace to the upload helper, naming the
# sample set `samplesetname`.
# NOTE(review): the exact behaviour lives in terra.UploadFromFolder — confirm there.
terra.UploadFromFolder(gcpfolder, prefix, refwm, samplesetname=samplesetname)

# run the pipeline

### if submitted as bams

In [None]:
# Fetch the samtofastq method configuration and display it for inspection.
samtofastq = refwm.get_config("samtofastq_v1-0_BETA_cfg")
samtofastq

In [None]:
# Point the workflow's BAM/CRAM input at each sample's `WES_bam` attribute.
# The input is set under the config's 'inputs' mapping (the same place the
# fusion cell edits its config); assigning it as a top-level key of the
# config would not change any workflow input.
# NOTE(review): confirm the input name matches the one listed in samtofastq['inputs'].
samtofastq['inputs']['samtofastq_workflow.samtofastq.input_bam_cram'] = 'this.WES_bam'
refwm.update_config(samtofastq)
submission_id = refwm.create_submission(
    samtofastq['name'], samplesetname, 'sample_set', expression='this.samples')


In [None]:
# Wait for the samtofastq submission created in the previous cell. That cell
# stores its id in `submission_id`; `submission_id1` is only assigned later
# (RSEM), so waiting on it here fails on a fresh kernel.
wait_for_submission(refwm, submission_id)

### else as fastqs

In [None]:
# Fetch the STAR method configuration and display it for inspection.
star = refwm.get_config("star_v1-0_BETA_cfg")
star

In [None]:
# Push the STAR configuration back, then submit it for the sample set,
# expanding the set with the 'this.samples' expression.
refwm.update_config(star)
submission_id = refwm.create_submission(
    star['name'],
    samplesetname,
    'sample_set',
    expression='this.samples',
)

In [None]:
# Wait on the STAR submission whose id was stored in `submission_id`.
terra.wait_for_submission(refwm, submission_id)

In [None]:
# Fetch the RSEM method configuration and display it for inspection.
rsem = refwm.get_config("rsem_v1-0_BETA_cfg")
rsem

In [None]:
# Push the RSEM configuration back, then submit it for the sample set,
# expanding the set with the 'this.samples' expression.
refwm.update_config(rsem)
submission_id1 = refwm.create_submission(
    rsem['name'],
    samplesetname,
    'sample_set',
    expression='this.samples',
)

In [None]:
# Wait on the RSEM submission whose id was stored in `submission_id1`.
wait_for_submission(refwm, submission_id1)

In [None]:
# Fetch the STAR-fusion method configuration and display it for inspection.
fusion = refwm.get_config("hg38_STAR_fusion")
fusion

In [None]:
# Feed each sample's `WES_bam` attribute to the fusion workflow, push the
# edited configuration, then submit it for the sample set.
fusion_inputs = fusion['inputs']
fusion_inputs['trinity_cleaned.samtofastq.input_bam'] = 'this.WES_bam'
refwm.update_config(fusion)
submission_id2 = refwm.create_submission(
    fusion['name'],
    samplesetname,
    'sample_set',
    expression='this.samples',
)

In [None]:
# NOTE(review): this waits on `submission_id1` (the RSEM submission), not on
# the fusion submission `submission_id2` created just above — confirm that is
# intended before the RSEM aggregation runs.
wait_for_submission(refwm, submission_id1)

In [None]:
# Fetch the RSEM aggregation method configuration and display it.
aggregate = refwm.get_config("rsem_aggregate_results_v1-0_BETA_cfg")
aggregate

In [None]:
# Submit the aggregation configuration for `samplesetname`; the returned id
# replaces the RSEM one held in `submission_id1`.
submission_id1 = refwm.create_submission(
    aggregate['name'],
    samplesetname,
)

## Expression post processing

In [None]:
# Wait on the submission whose id is currently held in `submission_id1`.
terra.wait_for_submission(refwm, submission_id1)

In [None]:
%%R
# Release label used in the R cells to build output file names.
# NOTE(review): a later R cell reassigns `release` to "MAX_AMLproject" —
# confirm which value is meant to be used.
release <- '19Q3interim'

In [None]:
%%R
# Run the shared script that loads the R libraries and annotations.
source('../JKBio/gkugener/RScripts/load_libraries_and_annotations.R')

In [None]:
%%R
# Load the post-processing helpers, then set the release label that the
# write.table cells paste into their output file names.
source('CCLE_postp_function.R')
release <- "MAX_AMLproject"

In [None]:
res = refwm.get_sample_sets().loc[samplesetname]
rsem_genes_expected_count = res['rsem_genes_expected_count']
rsem_genes_tpm = res['rsem_genes_tpm']
rsem_transcripts_tpm = res['rsem_transcripts_tpm']
! gsutil cp $rsem_genes_expected_count "temp/expression/expectedcount.txt.gz" & gsutil cp $rsem_genes_tpm "temp/expression/genestpm.txt.gz" & gsutil cp $rsem_transcripts_tpm "temp/expression/transcripttpm.txt.gz"

In [None]:
%%R
# Local copies of the files downloaded from FireCloud/Terra, keyed by content.
download_paths <- list(
  counts_genes = 'temp/expression/expectedcount.txt.gz',
  tpm_genes = 'temp/expression/genestpm.txt.gz',
  tpm_transcripts = 'temp/expression/transcripttpm.txt.gz'
)

In [None]:
%%R
# Parse each downloaded table with its dedicated reader
# (presumably defined in CCLE_postp_function.R — confirm).
tpm_transcripts <- readTranscripts(download_paths[['tpm_transcripts']])
counts_genes <- readCounts(download_paths[['counts_genes']])
tpm_genes <- readTPM(download_paths[['tpm_genes']])

In [None]:
%%R
# Display the gene-level counts table that was just read.
counts_genes

### data exploration and QC

In [None]:
%%R
# Quick QC: look for samples with a worrying amount of zeros.
zero_threshold <- 39000

# Count, for every column from the third one on, the entries equal to 0.
count_zeros <- function(x) length(x[x == 0])
sample_columns <- tpm_genes[, 3:ncol(tpm_genes)]
number_zeros <- apply(sample_columns, 2, FUN = count_zeros)

# One row per column name (CL) with its number of zeros (nz).
nzdf <- data.frame(
  CL = names(number_zeros),
  nz = number_zeros,
  stringsAsFactors = FALSE
)

In [None]:
%%R
# Histogram of the zero counts, with the threshold drawn as a dashed line and
# a label on every sample above it; the figure is saved as a PDF.
failing_samples <- nzdf %>% filter(nz > zero_threshold)

plot <- ggplot(nzdf, aes(nz)) +
  geom_histogram(bins = 100, color = 'black', fill = 'white') +
  geom_vline(xintercept = zero_threshold, linetype = 2) +
  geom_label_repel(
    data = failing_samples,
    aes(x = nz, y = 0, label = CL),
    size = 5,
    fill = rgb(1, 1, 1, 0.5)
  )

ggsave(plot, filename = 'temp/ggplot.pdf', width = 20, height = 20)

In [None]:
# Load the PDF written by ggsave and show it as the cell output.
img = WImage(filename='temp/ggplot.pdf')
img

In [None]:
%%R
# Samples strictly below the zero threshold pass, ordered by decreasing
# number of zeros.
sorted_zeros <- number_zeros[order(-number_zeros)]
pass <- names(sorted_zeros[sorted_zeros < zero_threshold])

# These samples failed: every remaining column that is not an identifier.
not_passing <- setdiff(colnames(tpm_genes), pass)
failed <- not_passing[!(not_passing %in% c('gene_id', 'transcript_id(s)'))]

# Keep only the identifier columns and the passing samples in each table.
counts_genes <- counts_genes %>% dplyr::select(c("gene_id","transcript_id(s)", pass))
tpm_genes <- tpm_genes %>% dplyr::select(c("gene_id","transcript_id(s)", pass))
tpm_transcripts <- tpm_transcripts %>% dplyr::select("transcript_id", "gene_id", pass)

In [None]:
%%R
# Display the gene-level TPM table after the failed samples were dropped.
tpm_genes

In [None]:
%%R
# Display the names of the samples that failed the zero-count QC.
failed

## Save files for taiga

In [None]:
%%R
# Write one tab-separated file per table under temp/, with the release label
# in the file name (no row names, no quoting).
save_expression <- function(table, suffix) {
  write.table(
    table,
    file = paste0('temp/expression.', release, suffix),
    sep = '\t', row.names = FALSE, quote = FALSE)
}

save_expression(counts_genes, '.counts.tsv')
save_expression(tpm_genes, '.tpm.tsv')
save_expression(tpm_transcripts, '.transcripts.tsv')

# Validation

In [None]:
# Load the three expression tables back into Python for validation.
# They are read from temp/, which is where the R "Save files for taiga" cell
# writes them (temp/expression.<release>.*.tsv); no cell in this notebook
# writes these files to ../AMLproject/data/.
expression_prefix = 'temp/expression.' + release
counts_genes = pd.read_csv(expression_prefix + '.counts.tsv', sep='\t')
tpm_genes = pd.read_csv(expression_prefix + '.tpm.tsv', sep='\t')
tpm_transcripts = pd.read_csv(expression_prefix + '.transcripts.tsv', sep='\t')

In [None]:
# Similarity across time: show the (rows, columns) shape of the counts table.
counts_genes.shape

## Investigating the number of genes set to zero

### verifying the data

In [None]:
# Total number of non-zero entries in the columns after the first two.
np.count_nonzero(tpm_genes[tpm_genes.columns[2:]].values)

In [None]:
# Number of zero entries in the columns after the first two, divided by the
# number of those columns.
tpm_values = tpm_genes[tpm_genes.columns[2:]]
n_rows, n_cols = tpm_values.shape
(n_cols * n_rows - np.count_nonzero(tpm_values.values)) / len(tpm_genes.columns[2:])

In [None]:
# Raw array of the counts table without its first two columns; reused by the
# summary and overlap cells below.
val = counts_genes[counts_genes.columns[2:]].values

In [None]:
# Mean, over columns, of each column's total.
val.sum(0).mean()

In [None]:
# Mean, over columns, of each column's mean.
val.mean(0).mean()

In [None]:
# Per column: total divided by the number of non-zero entries; then the mean
# of those ratios over columns.
(val.sum(0)/(val!=0).sum(0)).mean()

In [None]:
import math

# Square root of the mean per-column variance.
mean_column_variance = val.var(0).mean()
math.sqrt(mean_column_variance)

In [None]:
# Column-name sets of the three tables; prints how many distinct columns the
# counts table has and how many names are shared by all three tables.
new1, new2, new3 = [
    set(frame.columns.values.tolist())
    for frame in (counts_genes, tpm_genes, tpm_transcripts)
]
shared_columns = new1 & new2 & new3
print(len(new1), len(shared_columns))

In [None]:
# Non-zero count along axis 0, i.e. one value per column of tpm_genes
# (all columns of the table are included, not only those after the first two).
np.count_nonzero(tpm_genes.values,0)

## looking at previous information on cell line

- compare to CCLE rna data (MV411?)
- overlap
- spot check a few genes (are they all weird?)
- if not sure, do GSEA on non overlapping set for each

In [None]:
# Fetch file 'DM19Q2.tpm' (version 7 of dataset 'depmap-expression-87f8')
# through the Taiga client for comparison with the new data.
prev = tc.get(name='depmap-expression-87f8', version=7, file='DM19Q2.tpm')

In [None]:
# Number of non-zero values in the previous release, divided by 1288.
# Only the numeric columns are counted: the former `prev.values[2:]` sliced
# ROWS (dropping the first two rows) and kept the identifier columns, whose
# non-empty strings np.count_nonzero counts as non-zero.
# NOTE(review): 1288 is presumably the number of samples in DM19Q2.tpm — confirm.
np.count_nonzero(prev.select_dtypes(include=[np.number]).values) / 1288

In [None]:
# Number of non-zero entries in the "ACH-000045" column of the previous release.
np.count_nonzero(prev["ACH-000045"].values)

In [None]:
# Genes that are expressed (non-zero) in the CCLE version of MV411, i.e. the
# "ACH-000045" column of the previous release.
is_expressed = prev["ACH-000045"].values != 0
CCLEgenes = set(prev["gene_id"].values[is_expressed])

In [None]:
# Row positions of `val` whose count is non-zero in every column.
nonzero_in_all_columns = (val != 0).all(axis=1)
overlap = set(np.flatnonzero(nonzero_in_all_columns))

In [None]:
# Number of rows that are non-zero in every column.
len(overlap)

In [None]:
# Genes that are expressed in all versions of the AML RNA-seq data: the
# gene ids found at the row positions collected in `overlap`.
overlap_rows = list(overlap)
AMLgenes = set(counts_genes["gene_id"].values[overlap_rows])

In [None]:
# Number of genes present in both AMLgenes and CCLEgenes.
len(AMLgenes&CCLEgenes)

In [None]:
# Genes in CCLEgenes that are absent from AMLgenes.
nonoverlapping = CCLEgenes - AMLgenes

In [None]:
# Display the set of non-overlapping genes.
nonoverlapping

### We now suspect that they are only the protein-coding ones

In [None]:
# Repeat the comparison with the protein-coding TPM table: take the columns
# that are non-zero in its "ACH-000045" row and remove the AML genes.
ProteinCoding = tc.get(
    name='depmap-rnaseq-expression-data-363a',
    file='CCLE_depMap_19Q2_TPM_ProteinCoding',
)
nonzero_columns = ProteinCoding.loc["ACH-000045"].values != 0
CCLEgenes = set(ProteinCoding.columns.values[nonzero_columns])
nonoverlapping = CCLEgenes - AMLgenes

In [None]:
# Shape (rows, columns) of the protein-coding table.
ProteinCoding.shape

In [None]:
# Display the protein-coding table.
ProteinCoding

In [None]:
# Fetch file 'hgnc_complete_set-2018q3' of dataset 'hgnc-87ab' through the
# Taiga client; its "ensembl_gene_id" and "locus_group" columns are used below.
gene_mapping = tc.get(name='hgnc-87ab', file='hgnc_complete_set-2018q3')

In [None]:
# Display the gene mapping table.
gene_mapping

In [None]:
# Spot check: rows of the mapping whose Ensembl gene id is ENSG00000271743.
gene_mapping[gene_mapping["ensembl_gene_id"]=='ENSG00000271743']

In [None]:
# Shape (rows, columns) of the gene mapping table.
gene_mapping.shape

In [None]:
# Classify every non-overlapping gene with the gene mapping table:
#   i = genes whose first matching row has locus_group "protein-coding gene"
#   b = genes with no matching row at all
# The loop variable is named `gene_id` rather than `val`, so the counts matrix
# stored in `val` is no longer overwritten (which broke re-running the overlap
# cells after this one).
i = 0
b = 0
for gene_id in nonoverlapping:
    # Keep the part before the first '.' (presumably drops the Ensembl
    # version suffix — confirm against the id format in `nonoverlapping`).
    ensembl_id = gene_id.split(".")[0]
    locus_groups = gene_mapping["locus_group"][gene_mapping["ensembl_gene_id"] == ensembl_id].values
    if len(locus_groups) > 0:
        if locus_groups[0] == "protein-coding gene":
            i += 1
    else:
        b += 1

In [None]:
# Number of non-overlapping genes mapped to a protein-coding gene.
i

In [None]:
# Number of non-overlapping genes with no row in the gene mapping.
b

In [None]:
# Total number of non-overlapping genes, for comparison with i and b.
len(nonoverlapping)

In [None]:
# Move everything inside the temp/expression/ folder to AMLproject/data.
# NOTE(review): the destination is relative to the notebook's working
# directory, whereas sibling projects are referenced as '../<name>/' elsewhere
# in this notebook (e.g. '../JKBio/') — confirm the intended location.
! mv temp/expression/* AMLproject/data

## Fusion post processing

In [None]:
# Wait on the submission whose id is held in `submission_id2`. The workspace
# manager is passed first, as in the other terra.wait_for_submission calls
# of this notebook; it was missing here.
terra.wait_for_submission(refwm, submission_id2)

In [None]:
# Fetch the fusion aggregation method configuration and display it.
aggregate = refwm.get_config('Aggregate_Fusion_Calls')
aggregate

In [None]:
# Push the aggregation configuration back, then submit it for the
# 'All_samples' entity; the returned id replaces the one in `submission_id2`.
refwm.update_config(aggregate)
submission_id2 = refwm.create_submission(
    aggregate['name'],
    'All_samples',
)

In [None]:
# Wait on the submission whose id is held in `submission_id2`. The workspace
# manager is passed first, as in the other terra.wait_for_submission calls
# of this notebook; it was missing here.
terra.wait_for_submission(refwm, submission_id2)

In [None]:
# Value of the 'fusions_star' attribute on the 'All_samples' sample set.
# It is stored in `aggregated` because the gsutil cell that follows
# interpolates `$aggregated`, which was never assigned before.
aggregated = refwm.get_sample_sets().loc['All_samples']['fusions_star']
aggregated

In [None]:
# Copy the file referenced by the Python variable `aggregated` to the local
# path read by the R fusion cells.
! gsutil cp $aggregated "temp/fusion.Fusions.aggregated.tsv"

### Overview

This document contains the code used to generate the unfiltered and filtered versions of the fusion datasets for the release. The bottom of the document also contains some comparisons between the release fusion dataset, CCLE2 fusion calls, and the translocation data from CCLE2.

In [None]:
%%R
# Load the post-processing helpers and set the local path of the aggregated
# fusion table.
source("CCLE_postp_function.R")
filepath <- 'temp/fusion.Fusions.aggregated.tsv'

## Generate filtered fusion table

Release: the value of the `release` variable set in the R cells above.

We want to apply filters to the fusion table to reduce the number of artifacts in the dataset. Specifically, we filter the following:

* Remove fusions involving mitochondrial chromosomes, or HLA genes, or immunoglobulin genes
* Remove red herring fusions (from STAR-Fusion annotations column)
* Remove fusions that are recurrent in CCLE (>= 25 samples)
* Remove fusions with (SpliceType="INCL_NON_REF_SPLICE" and LargeAnchorSupport="No" and FFPM < 0.1)
* Remove fusions with FFPM < 0.05 (STAR-Fusion suggests using 0.1, but looking at the translocation data, this looks like it might be too aggressive)

In [None]:
%%R
# Read the aggregated fusion table, then derive the filtered table from it.
# NOTE(review): the filters described above are presumably implemented in
# filter_fusions (CCLE_postp_function.R) — confirm there.
unfiltered_fusions <- read_fusions(filepath)
filtered_fusions <- filter_fusions(unfiltered_fusions)

In [None]:
%%R
# Save the files (to be uploaded to taiga): one tab-separated file per table
# under temp/, with the release label in the file name.
save_fusions <- function(fusions, suffix) {
  write.table(
    fusions,
    file = paste0('temp/fusions.', release, suffix),
    sep = '\t', quote = FALSE, row.names = FALSE
  )
}

save_fusions(unfiltered_fusions, '.unfiltered.tsv')
save_fusions(filtered_fusions, '.filtered.tsv')

# If you want to merge here instead of on Terra:

In [None]:
res = refwm.get_samples().loc['ibm_ACH-001616']
genes_fusion = res['fusion_predictions_abridged']
rsem_genes_transcripts = res['rsem_isoforms']
rsem_genes_expected_count = res['rsem_genes']

! gsutil cp $rsem_genes_expected_count "temp/expression.genes.results" && gsutil cp $rsem_genes_transcripts "temp/expression.transcripts.results" && gsutil cp $genes_fusion "temp/expression.fusion.tsv"

In [None]:
%%R
# Load taigr plus the post-processing helpers and the shared
# libraries/annotations script before merging locally.
library("taigr")
source('CCLE_postp_function.R')
source('../JKBio/gkugener/RScripts/load_libraries_and_annotations.R')

In [None]:
%%R
# Previous release tables loaded from taiga, keyed by content type.
dataset <- list(
  genes_count = load.from.taiga(
    data.name='depmap-rnaseq-expression-data-363a',
    data.file='CCLE_depMap_19Q2_RNAseq_reads'),
  trancripts = load.from.taiga(
    data.name='depmap-rnaseq-expression-data-363a',
    data.file='CCLE_depMap_19Q2_TPM_transcripts'),
  genes_tpm = load.from.taiga(
    data.name='depmap-rnaseq-expression-data-363a',
    data.file='CCLE_depMap_19Q2_TPM')
)

download.raw.from.taiga(data.name='gene-fusions-8b7a', data.file='temp/fusions.prevunfiltered.tsv')

# Local files of the new sample, using the same keys as `dataset` where a
# matching table exists.
listOfSamples <- list(
  genes_count = c("temp/expression.genes.results"),
  trancripts = c("temp/expression.transcripts.results"),
  fusions = c("temp/expression.fusion.tsv")
)

In [None]:
%%R
# Preview the first rows of the previous gene-count table.
head(dataset$genes_count)

In [None]:
%%R
# NOTE(review): `f` is not assigned in any visible cell, so this call fails
# on a fresh kernel — confirm which file was meant or drop the cell.
read_tsv(f)

In [None]:
%%R
# Pass the loaded tables and the new sample's files to AddSamplesTo and keep
# its result as the new `dataset`.
dataset <- AddSamplesTo(dataset, listOfSamples)

In [None]:
%%R
# Reload the helpers, then call fusionFusions with the new sample's fusion
# file and the path 'temp/fusions.unfiltered.tsv'.
# NOTE(review): the role of the second argument (input vs output) is defined
# in CCLE_postp_function.R — confirm there.
source('CCLE_postp_function.R')

fusionFusions(c('temp/expression.fusion.tsv'),'temp/fusions.unfiltered.tsv')

In [None]:
%%R 
