# Slamseq

In [31]:
from __future__ import print_function
import os.path
import pandas as pd
import sys
sys.path.insert(0, '../../')
import seaborn as sns
import numpy as np
from functools import reduce
import glob

from natsort import natsorted, ns

from genepy.utils import helper as h
from genepy.utils import plot as genepyPlot
from genepy.rna import pyDESeq2
import genepy.rna as rna

from bokeh.plotting import *
from bokeh.models import HoverTool
from bokeh.io import output_notebook

import matplotlib.pyplot as plt

from sklearn.neighbors import KNeighborsClassifier
from sklearn.manifold import MDS, TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
#from umap import UMAP

output_notebook()
%load_ext autoreload
%matplotlib inline
%autoreload 2
%load_ext rpy2.ipython

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


## Get files

Expermients
1. IRF2BP2 degraded v1
    - 3h
    - 6h
2. MYB degraded v1
    - 1h
3. 4 degrons v1
    - RUNX1
    - RUNX2
    - SPI1
    - GFI1
4. MYB degraded time course v1
5. inhibitors v1
6. mebendazole v1

DESeq output files
- Tccounts ERCC
- Readcounts ERCC
- Tccounts (unscaled)
- Readcounts (unscaled)

In [1]:
res_dir = "../results"

In [2]:
project1 = "IRF2BP2_degraded_v1"
project2 = "MYB_degraded_v1"
project3 = "4_degrons_v1"
project4 = "MYB_degraded_timecourse_v1"
project5 = "inhibitors_v1"
project6 = "mebendazole_v1"

In [67]:
def get_file_info(project, file, conditions):
    
    project_row = [project]
    #name = os.path.basename(file).split(".")[0]
    file_row = [os.path.basename(file)]
    scaling_row = ["ERCCsamplewise" if "ERCCsamplewise" in file else None]
    if len(conditions) > 1:
        condition_row = [cond for cond in conditions if cond in file.split("_")]
    else:
        condition_row = conditions
    count_row = ["readcounts" if "readcounts" in file else "tccounts"]
    
    return [project_row, file_row, scaling_row, condition_row, count_row]

In [4]:
def get_deseq_data(file, low_memory=True):
    ''' input deseq file & returns df '''
    ''' add file name to headers except gene columns '''
    df = pd.read_csv(file, header=[0], index_col=None, squeeze=True, low_memory=low_memory)
    df.columns = ["{}_{}".format(col_name, os.path.basename(file).split(".")[0]) if "gene" not in col_name
                  else col_name for col_name in df.columns.tolist()]
    return(df)

# MYB_degraded_timecourse_v1 (no alt)

In [191]:
project = "MYB_degraded_timecourse_v1"
alt_loci = False
location = "../data/slamseq_"+project+"/"
fastq_folder = "../../data/"+"slamseq_"+project+"/fastqs"
fastqs = ! ls $fastq_folder/*fastq.gz

## processing slamseq

### run trim galore

In [44]:
fastqs[:2]

['../../data/slamseq_MYB_degraded_timecourse_v1/fastqs/mr439-MV411-MYB-dTAG-VHL_2h-r3_R1_001.fastq.gz',
 '../../data/slamseq_MYB_degraded_timecourse_v1/fastqs/mr439-MV411-MYB-dTAG-VHL_2h-r3_R2_001.fastq.gz']

In [45]:
for val1, val2 in h.grouped(fastqs[:2], 2):
    print(val1)
    print(val2)
    print("\n")

../../data/slamseq_MYB_degraded_timecourse_v1/fastqs/mr439-MV411-MYB-dTAG-VHL_2h-r3_R1_001.fastq.gz
../../data/slamseq_MYB_degraded_timecourse_v1/fastqs/mr439-MV411-MYB-dTAG-VHL_2h-r3_R2_001.fastq.gz




In [46]:
%%time
h.parrun(['trim_galore --paired --cores 8 --retain_unpaired -stringency 3\
 --illumina '+val1+' '+val2+' -o '+fastq_folder for val1, val2 in h.grouped(fastqs[:2], 2)], 2)

CPU times: user 228 ms, sys: 48 ms, total: 276 ms
Wall time: 1h 54min 31s


['pigz 2.6\n']

In [315]:
%%time
h.parrun(['trim_galore --paired --cores 8 --retain_unpaired -stringency 3\
 --illumina '+val1+' '+val2+' -o '+fastq_folder for val1, val2 in h.grouped(fastqs[18:20], 2)], 2)

CPU times: user 124 ms, sys: 68 ms, total: 192 ms
Wall time: 1h 27min 36s


['pigz 2.6\n']

### run slamdunk

In [8]:
# using an ERCC ref genome (you can just append ERCC fasta to the hg38 fasta)
refgenome_noAlt="../../ref/Homo_sapiens_assembly38_ERCC92_noAlt.fasta"

In [9]:
# gene intervals bed (use for all projects)
gene_intervals = "../../ref/GSE100708_hg38_refseq_062016_ensemblv84_3UTR.bed"

res = slamdunk all w/ (-fb, -m, -n 1, -rl 152, -r Homo_sapiens_assembly38_ERCC92_noAlt.fasta)

Read length note!
<br>Difference between min and max read length is > 100bp
<br>Average read length 127 (min: 8, max: 152)

In [10]:
fastq_folder = "../../data/"+"slamseq_"+project+"/fastqs"
fastqs = ! ls $fastq_folder
fastqs = [fastq for fastq in fastqs if "val" in fastq]
fastq_full = [fastq_folder+"/"+f for f in fastqs]

In [11]:
len(fastq_full)/2

4.0

In [12]:
res_dir = "../../data/"+"slamseq_"+project+"/res"
res_dir

'../../data/slamseq_MYB_degraded_timecourse_v1/res'

In [13]:
%%time

print(['slamdunk all'+
       ' -o '+res_dir+
       ' -r '+refgenome_noAlt+
       ' -b '+gene_intervals+
       ' -t 14'+
       ' -c 2'+
       ' -rl 152'+ # max-read-length = 152
       # new parameters
       ' -5 12'+
       ' -n 1'+ # reads map to multiple locations at once
       ' -m'+ # multimapping
       ' -N '+val1.split('/')[-1].split('_R')[0]+
       ' '+val1+" "+val2
       for val1, val2 in h.grouped(fastq_full[:2], 2)])

['slamdunk all -o ../../data/slamseq_MYB_degraded_timecourse_v1/res -r ../../ref/Homo_sapiens_assembly38_ERCC92_noAlt.fasta -b ../../ref/GSE100708_hg38_refseq_062016_ensemblv84_3UTR.bed -t 14 -c 2 -rl 152 -5 12 -n 1 -m -N mr488-MV411-MYB-dTAG-VHL_48h-r2 ../../data/slamseq_MYB_degraded_timecourse_v1/fastqs/mr488-MV411-MYB-dTAG-VHL_48h-r2_R1_001_val_1.fq.gz ../../data/slamseq_MYB_degraded_timecourse_v1/fastqs/mr488-MV411-MYB-dTAG-VHL_48h-r2_R2_001_val_2.fq.gz']
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 173 µs


In [15]:
for idx, vals in enumerate(h.grouped(fastq_full[:], 2)): #[26:36]
    print(idx)
    print(vals[0])
    print(vals[1])

0
../../data/slamseq_MYB_degraded_timecourse_v1/fastqs/mr488-MV411-MYB-dTAG-VHL_48h-r2_R1_001_val_1.fq.gz
../../data/slamseq_MYB_degraded_timecourse_v1/fastqs/mr488-MV411-MYB-dTAG-VHL_48h-r2_R2_001_val_2.fq.gz
1
../../data/slamseq_MYB_degraded_timecourse_v1/fastqs/mr489-MV411-MYB-dTAG-VHL_48h-r3_R1_001_val_1.fq.gz
../../data/slamseq_MYB_degraded_timecourse_v1/fastqs/mr489-MV411-MYB-dTAG-VHL_48h-r3_R2_001_val_2.fq.gz
2
../../data/slamseq_MYB_degraded_timecourse_v1/fastqs/mr490-MV411-MYB-dTAG-VHL_48h-r4_R1_001_val_1.fq.gz
../../data/slamseq_MYB_degraded_timecourse_v1/fastqs/mr490-MV411-MYB-dTAG-VHL_48h-r4_R2_001_val_2.fq.gz
3
../../data/slamseq_MYB_degraded_timecourse_v1/fastqs/mr491-MV411-MYB-dTAG-VHL_48h-r5_R1_001_val_1.fq.gz
../../data/slamseq_MYB_degraded_timecourse_v1/fastqs/mr491-MV411-MYB-dTAG-VHL_48h-r5_R2_001_val_2.fq.gz


In [None]:
%%time

# we are doing it paired end
h.parrun(['slamdunk all'+
          ' -o '+res_dir+
          ' -r '+refgenome_noAlt+
          ' -b '+gene_intervals+
          ' -t 14'+
          ' -c 2'+
          ' -rl 152'+ # max-read-length = 152
          # new parameters
          ' -5 12'+
          ' -n 1'+ # reads map to multiple locations at once
          ' -m True'+ # multimapping
          ' -N '+val1.split('/')[-1].split('_R')[0]+
          ' '+val1+" "+val2
          for val1, val2 in h.grouped(fastq_full[:2], 2)], 2)

In [52]:
%%time

# we are doing it paired end
h.parrun(['slamdunk all'+
          ' -o '+res_dir+
          ' -r '+refgenome_noAlt+
          ' -b '+gene_intervals+
          ' -t 14'+
          ' -c 2'+
          ' -rl 152'+ # max-read-length = 152
          # new parameters
          ' -5 12'+
          ' -n 1'+ # reads map to multiple locations at once
          ' -m True'+ # multimapping
          ' -N '+val1.split('/')[-1].split('_R')[0]+
          ' '+val1+" "+val2
          for val1, val2 in h.grouped(fastq_full[12:], 2)], 2)

CPU times: user 908 ms, sys: 472 ms, total: 1.38 s
Wall time: 10h 2min 40s


['parsing jkobject\ndoing all\nRunning slamDunk map for 2 files (14 threads)\ndoing paired end mapping!\n -1 ../../data/slamseq_MYB_degraded_timecourse_v1/fastqs/mr488-MV411-MYB-dTAG-VHL_48h-r2_R1_001_val_1.fq.gz -2 ../../data/slamseq_MYB_degraded_timecourse_v1/fastqs/mr488-MV411-MYB-dTAG-VHL_48h-r2_R2_001_val_2.fq.gz\nRunning: "ngm -r ../../ref/Homo_sapiens_assembly38_ERCC92_noAlt.fasta -1 ../../data/slamseq_MYB_degraded_timecourse_v1/fastqs/mr488-MV411-MYB-dTAG-VHL_48h-r2_R1_001_val_1.fq.gz -2 ../../data/slamseq_MYB_degraded_timecourse_v1/fastqs/mr488-MV411-MYB-dTAG-VHL_48h-r2_R2_001_val_2.fq.gz -t 14 --no-progress --slam-seq 2 -5 12 --max-polya 4 -l  --rg-id 0 --rg-sm sample_0:NA:-1 -o ../../data/slamseq_MYB_degraded_timecourse_v1/res/map/mr488-MV411-MYB-dTAG-VHL_48h-r2_slamdunk_mapped.sam"\nparsing jkobject\ndoing all\nRunning slamDunk map for 2 files (14 threads)\ndoing paired end mapping!\n -1 ../../data/slamseq_MYB_degraded_timecourse_v1/fastqs/mr489-MV411-MYB-dTAG-VHL_48h-r3_R1

### remove processed fastq files

In [17]:
res_folder = "../../data/"+"slamseq_"+project+"/res"
count_files = ! ls $res_folder/count/*tsv
#count_files

In [18]:
processed_samples = [f.split("/")[-1].split("-")[0] for f in count_files]
#processed_samples

In [19]:
fastq_files = ! ls $fastq_folder
fastq_files = [fastq_folder+"/"+f for f in fastq_files]
fastqs_rm = [fastq for fastq in fastq_files if any(sample in fastq for sample in processed_samples)]
print(len(fastqs_rm))
fastqs_rm

32


['../../data/slamseq_MYB_degraded_timecourse_v1/fastqs/mr488-MV411-MYB-dTAG-VHL_48h-r2_R1_001.fastq.gz',
 '../../data/slamseq_MYB_degraded_timecourse_v1/fastqs/mr488-MV411-MYB-dTAG-VHL_48h-r2_R1_001.fastq.gz_trimming_report.txt',
 '../../data/slamseq_MYB_degraded_timecourse_v1/fastqs/mr488-MV411-MYB-dTAG-VHL_48h-r2_R1_001_unpaired_1.fq.gz',
 '../../data/slamseq_MYB_degraded_timecourse_v1/fastqs/mr488-MV411-MYB-dTAG-VHL_48h-r2_R1_001_val_1.fq.gz',
 '../../data/slamseq_MYB_degraded_timecourse_v1/fastqs/mr488-MV411-MYB-dTAG-VHL_48h-r2_R2_001.fastq.gz',
 '../../data/slamseq_MYB_degraded_timecourse_v1/fastqs/mr488-MV411-MYB-dTAG-VHL_48h-r2_R2_001.fastq.gz_trimming_report.txt',
 '../../data/slamseq_MYB_degraded_timecourse_v1/fastqs/mr488-MV411-MYB-dTAG-VHL_48h-r2_R2_001_unpaired_2.fq.gz',
 '../../data/slamseq_MYB_degraded_timecourse_v1/fastqs/mr488-MV411-MYB-dTAG-VHL_48h-r2_R2_001_val_2.fq.gz',
 '../../data/slamseq_MYB_degraded_timecourse_v1/fastqs/mr489-MV411-MYB-dTAG-VHL_48h-r3_R1_001.fast

In [20]:
for idx, f in enumerate(fastqs_rm):
    if idx == 0:
        rm_sample = f.split("/")[-1].split("-")[0]
        print("Removing fastqs: {}".format(rm_sample))
    if rm_sample != f.split("/")[-1].split("-")[0]:
        rm_sample = f.split("/")[-1].split("-")[0]
        print("Removing fastqs: {}".format(rm_sample))
    
    os.remove(f)

Removing fastqs: mr488
Removing fastqs: mr489
Removing fastqs: mr490
Removing fastqs: mr491


In [30]:
! ls $fastq_folder/*fastq.gz

../../data/slamseq_MYB_degraded_timecourse_v1/fastqs/mr439-MV411-MYB-dTAG-VHL_2h-r3_R1_001.fastq.gz
../../data/slamseq_MYB_degraded_timecourse_v1/fastqs/mr439-MV411-MYB-dTAG-VHL_2h-r3_R2_001.fastq.gz
../../data/slamseq_MYB_degraded_timecourse_v1/fastqs/mr445-MV411-MYB-dTAG-DMSO_4h-r4_R1_001.fastq.gz
../../data/slamseq_MYB_degraded_timecourse_v1/fastqs/mr445-MV411-MYB-dTAG-DMSO_4h-r4_R2_001.fastq.gz
../../data/slamseq_MYB_degraded_timecourse_v1/fastqs/mr446-MV411-MYB-dTAG-DMSO_4h-r5_R1_001.fastq.gz
../../data/slamseq_MYB_degraded_timecourse_v1/fastqs/mr446-MV411-MYB-dTAG-DMSO_4h-r5_R2_001.fastq.gz
../../data/slamseq_MYB_degraded_timecourse_v1/fastqs/mr447-MV411-MYB-dTAG-VHL_4h-r1_R1_001.fastq.gz
../../data/slamseq_MYB_degraded_timecourse_v1/fastqs/mr447-MV411-MYB-dTAG-VHL_4h-r1_R2_001.fastq.gz
../../data/slamseq_MYB_degraded_timecourse_v1/fastqs/mr448-MV411-MYB-dTAG-VHL_4h-r2_R1_001.fastq.gz
../../data/slamseq_MYB_degraded_timecourse_v1/fastqs/mr448-MV411-MYB-dTAG-VHL_4h-r2_R2_

### Saving the data in our bucket

In [22]:
print(project)

MYB_degraded_timecourse_v1


* save fastqs to bucket
* save processed res to bucket
* remove processed fastqs from local

In [23]:
! gsutil ls gs://amlproject/RNA/slamseq/$project/res/count/*tsv

gs://amlproject/RNA/slamseq/MYB_degraded_timecourse_v1/res/count/mr422-MV411-MYB-dTAG-DMSO_1h-r1_tcount.tsv
gs://amlproject/RNA/slamseq/MYB_degraded_timecourse_v1/res/count/mr423-MV411-MYB-dTAG-DMSO_1h-r2_tcount.tsv
gs://amlproject/RNA/slamseq/MYB_degraded_timecourse_v1/res/count/mr424-MV411-MYB-dTAG-DMSO_1h-r3_tcount.tsv
gs://amlproject/RNA/slamseq/MYB_degraded_timecourse_v1/res/count/mr425-MV411-MYB-dTAG-DMSO_1h-r4_tcount.tsv
gs://amlproject/RNA/slamseq/MYB_degraded_timecourse_v1/res/count/mr426-MV411-MYB-dTAG-DMSO_1h-r5_tcount.tsv
gs://amlproject/RNA/slamseq/MYB_degraded_timecourse_v1/res/count/mr427-MV411-MYB-dTAG-VHL_1h-r1_tcount.tsv
gs://amlproject/RNA/slamseq/MYB_degraded_timecourse_v1/res/count/mr428-MV411-MYB-dTAG-VHL_1h-r2_tcount.tsv
gs://amlproject/RNA/slamseq/MYB_degraded_timecourse_v1/res/count/mr429-MV411-MYB-dTAG-VHL_1h-r3_tcount.tsv
gs://amlproject/RNA/slamseq/MYB_degraded_timecourse_v1/res/count/mr430-MV411-MYB-dTAG-VHL_1h-r4_tcount.tsv
gs://amlproject/RNA/sla

In [24]:
! ls ../../data/slamseq_$project/res/count/mr48*

../../data/slamseq_MYB_degraded_timecourse_v1/res/count/mr480-MV411-MYB-dTAG-VHL_24h-r4_tcount.log
../../data/slamseq_MYB_degraded_timecourse_v1/res/count/mr480-MV411-MYB-dTAG-VHL_24h-r4_tcount_mins.bedgraph
../../data/slamseq_MYB_degraded_timecourse_v1/res/count/mr480-MV411-MYB-dTAG-VHL_24h-r4_tcount_plus.bedgraph
../../data/slamseq_MYB_degraded_timecourse_v1/res/count/mr480-MV411-MYB-dTAG-VHL_24h-r4_tcount.tsv
../../data/slamseq_MYB_degraded_timecourse_v1/res/count/mr481-MV411-MYB-dTAG-VHL_24h-r5_tcount.log
../../data/slamseq_MYB_degraded_timecourse_v1/res/count/mr481-MV411-MYB-dTAG-VHL_24h-r5_tcount_mins.bedgraph
../../data/slamseq_MYB_degraded_timecourse_v1/res/count/mr481-MV411-MYB-dTAG-VHL_24h-r5_tcount_plus.bedgraph
../../data/slamseq_MYB_degraded_timecourse_v1/res/count/mr481-MV411-MYB-dTAG-VHL_24h-r5_tcount.tsv
../../data/slamseq_MYB_degraded_timecourse_v1/res/count/mr482-MV411-MYB-dTAG-DMSO_48h-r1_tcount.log
../../data/slamseq_MYB_degraded_timecourse_v1/res/count/mr4

In [25]:
! gsutil -m cp ../../data/slamseq_$project/res/map/mr48* gs://amlproject/RNA/slamseq/$project/res/map/
#! gsutil -m cp ../../data/slamseq_$project/res/filter/mr47* gs://amlproject/RNA/slamseq/$project/res/filter/
! gsutil -m cp ../../data/slamseq_$project/res/snp/mr48* gs://amlproject/RNA/slamseq/$project/res/snp/ 
! gsutil -m cp ../../data/slamseq_$project/res/count/mr48* gs://amlproject/RNA/slamseq/$project/res/count/

Copying file://../../data/slamseq_MYB_degraded_timecourse_v1/res/map/mr480-MV411-MYB-dTAG-VHL_24h-r4_slamdunk_mapped.bam [Content-Type=application/octet-stream]...
==> NOTE: You are uploading one or more large file(s), which would run          
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

Copying file://../../data/slamseq_MYB_degraded_timecourse_v1/res/map/mr480-MV411-MYB-dTAG-VHL_24h-r4_slamdunk_mapped.log [Content-Type=application/oct

Copying file://../../data/slamseq_MYB_degraded_timecourse_v1/res/count/mr487-MV411-MYB-dTAG-VHL_48h-r1_tcount.log [Content-Type=application/octet-stream]...
Copying file://../../data/slamseq_MYB_degraded_timecourse_v1/res/count/mr487-MV411-MYB-dTAG-VHL_48h-r1_tcount_plus.bedgraph [Content-Type=application/octet-stream]...
Copying file://../../data/slamseq_MYB_degraded_timecourse_v1/res/count/mr487-MV411-MYB-dTAG-VHL_48h-r1_tcount_mins.bedgraph [Content-Type=application/octet-stream]...
Copying file://../../data/slamseq_MYB_degraded_timecourse_v1/res/count/mr486-MV411-MYB-dTAG-DMSO_48h-r5_tcount.tsv [Content-Type=text/tab-separated-values]...
Copying file://../../data/slamseq_MYB_degraded_timecourse_v1/res/count/mr487-MV411-MYB-dTAG-VHL_48h-r1_tcount.tsv [Content-Type=text/tab-separated-values]...
Copying file://../../data/slamseq_MYB_degraded_timecourse_v1/res/count/mr488-MV411-MYB-dTAG-VHL_48h-r2_tcount.tsv [Content-Type=text/tab-separated-values]...
Copying file://../../data/slamseq_

In [27]:
! gsutil -m cp ../../data/slamseq_$project/res/filter/* gs://amlproject/RNA/slamseq/$project/res/filter/

Copying file://../../data/slamseq_MYB_degraded_timecourse_v1/res/filter/ERCC_mr422-MV411-MYB-dTAG-DMSO_1h-r1_filtered.bam [Content-Type=application/octet-stream]...
Copying file://../../data/slamseq_MYB_degraded_timecourse_v1/res/filter/ERCC_mr422-MV411-MYB-dTAG-DMSO_1h-r1_filtered.bam.bed [Content-Type=application/octet-stream]...
Copying file://../../data/slamseq_MYB_degraded_timecourse_v1/res/filter/ERCC_mr423-MV411-MYB-dTAG-DMSO_1h-r2_filtered.bam [Content-Type=application/octet-stream]...
Copying file://../../data/slamseq_MYB_degraded_timecourse_v1/res/filter/ERCC_mr423-MV411-MYB-dTAG-DMSO_1h-r2_filtered.bam.bed [Content-Type=application/octet-stream]...
Copying file://../../data/slamseq_MYB_degraded_timecourse_v1/res/filter/ERCC_mr424-MV411-MYB-dTAG-DMSO_1h-r3_filtered.bam [Content-Type=application/octet-stream]...
Copying file://../../data/slamseq_MYB_degraded_timecourse_v1/res/filter/ERCC_mr424-MV411-MYB-dTAG-DMSO_1h-r3_filtered.bam.bed [Content-Type=application/octet-stream]..

Copying file://../../data/slamseq_MYB_degraded_timecourse_v1/res/filter/ERCC_mr447-MV411-MYB-dTAG-VHL_4h-r1_filtered.bam [Content-Type=application/octet-stream]...
Copying file://../../data/slamseq_MYB_degraded_timecourse_v1/res/filter/ERCC_mr447-MV411-MYB-dTAG-VHL_4h-r1_filtered.bam.bed [Content-Type=application/octet-stream]...
Copying file://../../data/slamseq_MYB_degraded_timecourse_v1/res/filter/ERCC_mr448-MV411-MYB-dTAG-VHL_4h-r2_filtered.bam [Content-Type=application/octet-stream]...
Copying file://../../data/slamseq_MYB_degraded_timecourse_v1/res/filter/ERCC_mr448-MV411-MYB-dTAG-VHL_4h-r2_filtered.bam.bed [Content-Type=application/octet-stream]...
Copying file://../../data/slamseq_MYB_degraded_timecourse_v1/res/filter/ERCC_mr449-MV411-MYB-dTAG-VHL_4h-r3_filtered.bam [Content-Type=application/octet-stream]...
Copying file://../../data/slamseq_MYB_degraded_timecourse_v1/res/filter/ERCC_mr449-MV411-MYB-dTAG-VHL_4h-r3_filtered.bam.bed [Content-Type=application/octet-stream]...
Copy

Copying file://../../data/slamseq_MYB_degraded_timecourse_v1/res/filter/mr423-MV411-MYB-dTAG-DMSO_1h-r2_filtered.log [Content-Type=application/octet-stream]...
Copying file://../../data/slamseq_MYB_degraded_timecourse_v1/res/filter/mr424-MV411-MYB-dTAG-DMSO_1h-r3_filtered.bam [Content-Type=application/octet-stream]...
Copying file://../../data/slamseq_MYB_degraded_timecourse_v1/res/filter/mr424-MV411-MYB-dTAG-DMSO_1h-r3_filtered.bam.bai [Content-Type=application/octet-stream]...
Copying file://../../data/slamseq_MYB_degraded_timecourse_v1/res/filter/mr424-MV411-MYB-dTAG-DMSO_1h-r3_filtered.log [Content-Type=application/octet-stream]...
Copying file://../../data/slamseq_MYB_degraded_timecourse_v1/res/filter/mr425-MV411-MYB-dTAG-DMSO_1h-r4_filtered.bam [Content-Type=application/octet-stream]...
Copying file://../../data/slamseq_MYB_degraded_timecourse_v1/res/filter/mr425-MV411-MYB-dTAG-DMSO_1h-r4_filtered.bam.bai [Content-Type=application/octet-stream]...
Copying file://../../data/slamse

Copying file://../../data/slamseq_MYB_degraded_timecourse_v1/res/filter/mr440-MV411-MYB-dTAG-VHL_2h-r4_filtered.log [Content-Type=application/octet-stream]...
Copying file://../../data/slamseq_MYB_degraded_timecourse_v1/res/filter/mr441-MV411-MYB-dTAG-VHL_2h-r5_filtered.bam [Content-Type=application/octet-stream]...
Copying file://../../data/slamseq_MYB_degraded_timecourse_v1/res/filter/mr441-MV411-MYB-dTAG-VHL_2h-r5_filtered.bam.bai [Content-Type=application/octet-stream]...
Copying file://../../data/slamseq_MYB_degraded_timecourse_v1/res/filter/mr441-MV411-MYB-dTAG-VHL_2h-r5_filtered.log [Content-Type=application/octet-stream]...
Copying file://../../data/slamseq_MYB_degraded_timecourse_v1/res/filter/mr442-MV411-MYB-dTAG-DMSO_4h-r1_filtered.bam [Content-Type=application/octet-stream]...
Copying file://../../data/slamseq_MYB_degraded_timecourse_v1/res/filter/mr442-MV411-MYB-dTAG-DMSO_4h-r1_filtered.bam.bai [Content-Type=application/octet-stream]...
Copying file://../../data/slamseq_MY

Copying file://../../data/slamseq_MYB_degraded_timecourse_v1/res/filter/mr457-MV411-MYB-dTAG-VHL_8h-r1_filtered.log [Content-Type=application/octet-stream]...
Copying file://../../data/slamseq_MYB_degraded_timecourse_v1/res/filter/mr458-MV411-MYB-dTAG-VHL_8h-r2_filtered.bam [Content-Type=application/octet-stream]...
Copying file://../../data/slamseq_MYB_degraded_timecourse_v1/res/filter/mr458-MV411-MYB-dTAG-VHL_8h-r2_filtered.bam.bai [Content-Type=application/octet-stream]...
Copying file://../../data/slamseq_MYB_degraded_timecourse_v1/res/filter/mr458-MV411-MYB-dTAG-VHL_8h-r2_filtered.log [Content-Type=application/octet-stream]...
Copying file://../../data/slamseq_MYB_degraded_timecourse_v1/res/filter/mr459-MV411-MYB-dTAG-VHL_8h-r3_filtered.bam [Content-Type=application/octet-stream]...
Copying file://../../data/slamseq_MYB_degraded_timecourse_v1/res/filter/mr459-MV411-MYB-dTAG-VHL_8h-r3_filtered.bam.bai [Content-Type=application/octet-stream]...
Copying file://../../data/slamseq_MYB_

Copying file://../../data/slamseq_MYB_degraded_timecourse_v1/res/filter/mr474-MV411-MYB-dTAG-DMSO_24h-r3_filtered.log [Content-Type=application/octet-stream]...
Copying file://../../data/slamseq_MYB_degraded_timecourse_v1/res/filter/mr475-MV411-MYB-dTAG-DMSO_24h-r4_filtered.bam [Content-Type=application/octet-stream]...
Copying file://../../data/slamseq_MYB_degraded_timecourse_v1/res/filter/mr475-MV411-MYB-dTAG-DMSO_24h-r4_filtered.bam.bai [Content-Type=application/octet-stream]...
Copying file://../../data/slamseq_MYB_degraded_timecourse_v1/res/filter/mr475-MV411-MYB-dTAG-DMSO_24h-r4_filtered.log [Content-Type=application/octet-stream]...
Copying file://../../data/slamseq_MYB_degraded_timecourse_v1/res/filter/mr476-MV411-MYB-dTAG-DMSO_24h-r5_filtered.bam [Content-Type=application/octet-stream]...
Copying file://../../data/slamseq_MYB_degraded_timecourse_v1/res/filter/mr476-MV411-MYB-dTAG-DMSO_24h-r5_filtered.bam.bai [Content-Type=application/octet-stream]...
Copying file://../../data/

Copying file://../../data/slamseq_MYB_degraded_timecourse_v1/res/filter/mr491-MV411-MYB-dTAG-VHL_48h-r5_filtered.log [Content-Type=application/octet-stream]...
\ [350/350 files][384.4 GiB/384.4 GiB] 100% Done  88.3 MiB/s ETA 00:00:00       
Operation completed over 350 objects/384.4 GiB.                                  


## merging DESeq outputs

In [172]:
project = "MYB_degraded_timecourse_v1"
res_v = "res"
project_dir = "slamseq_{}".format(project)

In [173]:
files = ! ls $res_dir/$project_dir/*deseq*.csv
# nested list per time point
files = natsorted([files[i:i + 4] for i in range(0, len(files), 4)])
# ERCC (read, tc) | nonERCC (read, tc)
files = [lst[-2:]+lst[:-2] for lst in files]
files = [i for sublist in files for i in sublist]

In [174]:
# project 1
conditions = [i.split("_")[-6] for i in files[::4]]
print(conditions)
scaling = ["ERCCsamplewise", None]
count_type = ["readcounts", "tccounts"]

['1h', '2h', '4h', '8h', '12h', '24h', '48h']


### make info rows

In [175]:
# df with file info
df_info = pd.DataFrame(list(zip(["project", "file", "scale", "condition", "type"],
                                ["project", "file", "scale", "condition", "type"])))

In [176]:
df_info = pd.concat([df_info] + 
                    [pd.DataFrame([x*6 for x in get_file_info(project, file, conditions)]) for file in files], 
                    axis=1)
df_info.shape

(5, 170)

### get dfs and merge

In [177]:
# Get list of deseq dfs
deseq_dfs = [get_deseq_data(file) for file in files]

In [178]:
# main df
df_merged = pd.DataFrame()
df_merged = df_merged.append(deseq_dfs[0][["genes", "gene_id"]]) # add gene cols to main df

In [179]:
# join df columns on gene & gene_id columns
df_merged = reduce(lambda left, right: pd.merge(left, right, on=['genes', "gene_id"], how="outer"), 
                   [df_merged] + deseq_dfs)
df_merged.shape

(18305, 170)

In [180]:
df_merged.iloc[:5]

Unnamed: 0,genes,gene_id,baseMean_MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewise_0_5_deseq_readcounts,log2FoldChange_MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewise_0_5_deseq_readcounts,lfcSE_MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewise_0_5_deseq_readcounts,stat_MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewise_0_5_deseq_readcounts,pvalue_MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewise_0_5_deseq_readcounts,padj_MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewise_0_5_deseq_readcounts,baseMean_MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewise_0_5_deseq_tccounts,log2FoldChange_MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewise_0_5_deseq_tccounts,...,lfcSE_MYB_degraded_timecourse_v1_MYB_48h_0_5_deseq_readcounts,stat_MYB_degraded_timecourse_v1_MYB_48h_0_5_deseq_readcounts,pvalue_MYB_degraded_timecourse_v1_MYB_48h_0_5_deseq_readcounts,padj_MYB_degraded_timecourse_v1_MYB_48h_0_5_deseq_readcounts,baseMean_MYB_degraded_timecourse_v1_MYB_48h_0_5_deseq_tccounts,log2FoldChange_MYB_degraded_timecourse_v1_MYB_48h_0_5_deseq_tccounts,lfcSE_MYB_degraded_timecourse_v1_MYB_48h_0_5_deseq_tccounts,stat_MYB_degraded_timecourse_v1_MYB_48h_0_5_deseq_tccounts,pvalue_MYB_degraded_timecourse_v1_MYB_48h_0_5_deseq_tccounts,padj_MYB_degraded_timecourse_v1_MYB_48h_0_5_deseq_tccounts
0,A1BG,A1BG,1.50902,1.35526,1.458802,-0.929023,0.352877,,0.0,-0.0,...,1.170742,-1.301873,0.19296,0.334765,0.0,-0.0,,,0.0,
1,A2MP1,A2MP1,3.901096,0.267785,0.972756,-0.275284,0.783098,0.978553,0.0,-0.0,...,1.364342,-0.811901,0.416849,0.575441,0.0,-0.0,,,0.0,
2,NAT1,NAT1,209.773084,0.467232,0.285163,-1.638471,0.101324,0.686926,19.677031,0.880069,...,0.205305,1.14792,0.251002,0.404705,31.858555,0.339874,0.298272,-1.139476,0.254505,0.46744
3,AAMP,AAMP,991.520767,-0.135566,0.199995,0.677847,0.497869,0.919651,31.279046,-0.060397,...,0.18322,0.78666,0.431481,0.588881,29.033709,-0.604376,0.377976,1.598981,0.109825,0.266266
4,AARS1,AARS1,604.134304,0.072954,0.226546,-0.322026,0.747433,0.976487,35.111806,0.242548,...,0.178529,4.229819,2.3e-05,0.00016,66.075502,-1.083337,0.256242,4.227784,2.4e-05,0.000277


In [181]:
# change column names
df_merged.columns = [col_name.split("_")[0] if "gene" not in col_name else 
                     col_name for col_name in df_merged.columns.tolist()]
df_merged = df_merged.T.reset_index().T.reset_index(drop=True)
df_merged.shape

(18306, 170)

### add info columns

In [183]:
print(df_info.shape)
print(df_merged.shape)

(5, 170)
(18306, 170)


In [184]:
# reset column index
df_info = df_info.T.reset_index(drop=True).T

In [185]:
%%time

# combine info and data
df_save = df_info.append(df_merged, ignore_index=True)
df_save.shape

CPU times: user 20 ms, sys: 0 ns, total: 20 ms
Wall time: 19.8 ms


(18311, 170)

### save merged file

In [207]:
# save file
res_dir = "../results"
deseq_res_file = "slamseq_{}_0_5_deseq_{}output.csv".format(project, "" if alt_loci == True else "noAlt_")

df_save.to_csv("{}/{}/{}".format(res_dir, project_dir, deseq_res_file), header=False, index=False)

print("Saved DESeq results to: {}/{}".format(project_dir, deseq_res_file))

Saved DESeq results to: slamseq_MYB_degraded_timecourse_v1/slamseq_MYB_degraded_timecourse_v1_0_5_deseq_noAlt_output.csv


## merge w/ MYB_degraded_v7

In [283]:
project_dirs = ["slamseq_MYB_degraded_timecourse_v1",
                "slamseq_MYB_degraded_v1_res_v7"]

deseq_res_dirs = ["{}/{}".format(res_dir, project_dir) for project_dir in project_dirs]

In [336]:
def mergeDESeqOutputs(deseq_files):
    """
    combines merged DESeq outputs from each project
    
    Args:
      deseq_files: list of deseq files
    """
    # list of deseq dfs w/o header
    dfs = [pd.read_csv(file, index_col=False, header=None, squeeze=True, low_memory=False) for file in deseq_files]
    
    # merge info data
    df_info_merged = deseq_dfs[0].iloc[:5]
    print("df1 cols: {}".format(df_info_merged.shape[1]))
    # merge info columns
    for i, df in enumerate(dfs[1:]):
        print("df{} cols: {}".format(str(i+2), df.shape[1]))
        df_info_merged = merge_info_cols(df_info_merged, df.iloc[:5])
        print("Merged rows: {}\nMerged columns: {}".format(df_info_merged.shape[0], df_info_merged.shape[1]))
    
    # merge data
    # make first row unique col names and remove info rows
    df_data = [rename_deseq_data(df) for df in dfs]
    df_data = [df.iloc[4:] for df in df_data]
    
    df_merged = pd.DataFrame()
    # add gene cols to main df
    df_merged = df_merged.append(df_data[0][["project_0", "project_1"]])
    # join df columns on gene & gene_id columns
    df_merged = reduce(lambda left, right: pd.merge(left, right, on=["project_0", "project_1"], how="outer"), 
                       [df_merged] + df_data)
    print("Total unique genes: {}".format(str(df_merged.shape[0]-1)))
    
    # reset column index
    df_merged = df_merged.T.reset_index(drop=True).T
    df_info_merged = df_info_merged.T.reset_index(drop=True).T
    
    # combine info and data
    df_merged = df_info_merged.append(df_merged, ignore_index=True)
    
    return(df_merged)

In [322]:
print(df_save.shape)
df_save.iloc[:10]

(18675, 194)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,184,185,186,187,188,189,190,191,192,193
0,project,project,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,...,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1
1,file,file,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,...,MYB_degraded_v1_0_5_1h_deseq_readcounts.csv,MYB_degraded_v1_0_5_1h_deseq_readcounts.csv,MYB_degraded_v1_0_5_1h_deseq_readcounts.csv,MYB_degraded_v1_0_5_1h_deseq_readcounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv
2,scale,scale,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,...,,,,,,,,,,
3,condition,condition,1h,1h,1h,1h,1h,1h,1h,1h,...,1h,1h,1h,1h,1h,1h,1h,1h,1h,1h
4,type,type,readcounts,readcounts,readcounts,readcounts,readcounts,readcounts,tccounts,tccounts,...,readcounts,readcounts,readcounts,readcounts,tccounts,tccounts,tccounts,tccounts,tccounts,tccounts
5,genes,gene_id,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,baseMean,log2FoldChange,...,lfcSE,stat,pvalue,padj,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
6,A1BG,A1BG,1.509020285391674,1.3552603563897296,1.458802271013707,-0.929022653253736,0.3528773465741438,,0.0,-0.0,...,0.9846438463419186,-0.1150979000590839,0.9083675243535372,,0.0586605454997294,-0.5006131361318857,3.1165396500854228,0.1606310820137212,0.8723839728699718,
7,A2MP1,A2MP1,3.901096120730295,0.2677846766039957,0.9727561997080278,-0.2752844717765572,0.7830976934992575,0.978553274562618,0.0,-0.0,...,0.4803906922299277,0.3695105397497671,0.7117472185967626,,0.0,-0.0,,,0.0,
8,NAT1,NAT1,209.7730839698947,0.4672315284418883,0.2851632168827915,-1.6384705346970854,0.1013235757696181,0.6869262732838352,19.67703120627283,0.8800687912156389,...,0.1563761040955219,-0.730186631009811,0.4652761132688894,0.9999151129996772,5.269074099365515,0.6666255413136394,0.5672453670506757,-1.175197859754552,0.2399155663058868,0.7345162758223772
9,AAMP,AAMP,991.5207667751332,-0.1355659279477709,0.1999948765709417,0.6778470042440479,0.4978687046349399,0.9196506180982232,31.279046224279405,-0.0603969776660648,...,0.0753972581115594,0.3452268366600209,0.7299238448286257,0.9999151129996772,9.858397411395757,0.2946151482964133,0.3381536029740998,-0.8712465155043128,0.3836195667847473,0.8365778574054543


In [323]:
deseq_files

['../results/slamseq_MYB_degraded_timecourse_v1/slamseq_MYB_degraded_timecourse_v1_0_5_deseq_noAlt_output.csv',
 '../results/slamseq_MYB_degraded_v1_res_v7/slamseq_MYB_degraded_v1_res_v7_0_5_deseq_output.csv']

In [337]:
df_merged_test = mergeDESeqOutputs(deseq_files)

df1 cols: 170
df2 cols: 26
Merged rows: 5
Merged columns: 194
Total unique genes: 18669


In [339]:
print(df_merged_test.shape)
df_merged_test.iloc[:10]

(18675, 194)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,184,185,186,187,188,189,190,191,192,193
0,project,project,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,...,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1
1,file,file,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,...,MYB_degraded_v1_0_5_1h_deseq_readcounts.csv,MYB_degraded_v1_0_5_1h_deseq_readcounts.csv,MYB_degraded_v1_0_5_1h_deseq_readcounts.csv,MYB_degraded_v1_0_5_1h_deseq_readcounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv
2,scale,scale,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,...,,,,,,,,,,
3,condition,condition,1h,1h,1h,1h,1h,1h,1h,1h,...,1h,1h,1h,1h,1h,1h,1h,1h,1h,1h
4,type,type,readcounts,readcounts,readcounts,readcounts,readcounts,readcounts,tccounts,tccounts,...,readcounts,readcounts,readcounts,readcounts,tccounts,tccounts,tccounts,tccounts,tccounts,tccounts
5,genes,gene_id,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,baseMean,log2FoldChange,...,lfcSE,stat,pvalue,padj,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
6,A1BG,A1BG,1.509020285391674,1.3552603563897296,1.458802271013707,-0.929022653253736,0.3528773465741438,,0.0,-0.0,...,0.9846438463419186,-0.1150979000590839,0.9083675243535372,,0.0586605454997294,-0.5006131361318857,3.1165396500854228,0.1606310820137212,0.8723839728699718,
7,A2MP1,A2MP1,3.901096120730295,0.2677846766039957,0.9727561997080278,-0.2752844717765572,0.7830976934992575,0.978553274562618,0.0,-0.0,...,0.4803906922299277,0.3695105397497671,0.7117472185967626,,0.0,-0.0,,,0.0,
8,NAT1,NAT1,209.7730839698947,0.4672315284418883,0.2851632168827915,-1.6384705346970854,0.1013235757696181,0.6869262732838352,19.67703120627283,0.8800687912156389,...,0.1563761040955219,-0.730186631009811,0.4652761132688894,0.9999151129996772,5.269074099365515,0.6666255413136394,0.5672453670506757,-1.175197859754552,0.2399155663058868,0.7345162758223772
9,AAMP,AAMP,991.5207667751332,-0.1355659279477709,0.1999948765709417,0.6778470042440479,0.4978687046349399,0.9196506180982232,31.279046224279405,-0.0603969776660648,...,0.0753972581115594,0.3452268366600209,0.7299238448286257,0.9999151129996772,9.858397411395757,0.2946151482964133,0.3381536029740998,-0.8712465155043128,0.3836195667847473,0.8365778574054543


In [329]:
deseq_files = [glob.glob(deseq_res_dir+"/*output.csv") for deseq_res_dir in deseq_res_dirs]
deseq_files = [item for sublist in deseq_files for item in sublist]
deseq_files

['../results/slamseq_MYB_degraded_timecourse_v1/slamseq_MYB_degraded_timecourse_v1_0_5_deseq_noAlt_output.csv',
 '../results/slamseq_MYB_degraded_v1_res_v7/slamseq_MYB_degraded_v1_res_v7_0_5_deseq_output.csv']

In [330]:
# list of deseq dfs
#deseq_dfs = [get_deseq_data(deseq_file, low_memory=False) for deseq_file in deseq_files]

# list of deseq dfs (no header)
deseq_dfs = [pd.read_csv(deseq_file, index_col=False, header=None, squeeze=True, low_memory=False)
             for deseq_file in deseq_files]

In [331]:
deseq_dfs[0].head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,160,161,162,163,164,165,166,167,168,169
0,project,project,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,...,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1
1,file,file,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,...,MYB_degraded_timecourse_v1_MYB_48h_0_5_deseq_r...,MYB_degraded_timecourse_v1_MYB_48h_0_5_deseq_r...,MYB_degraded_timecourse_v1_MYB_48h_0_5_deseq_r...,MYB_degraded_timecourse_v1_MYB_48h_0_5_deseq_r...,MYB_degraded_timecourse_v1_MYB_48h_0_5_deseq_t...,MYB_degraded_timecourse_v1_MYB_48h_0_5_deseq_t...,MYB_degraded_timecourse_v1_MYB_48h_0_5_deseq_t...,MYB_degraded_timecourse_v1_MYB_48h_0_5_deseq_t...,MYB_degraded_timecourse_v1_MYB_48h_0_5_deseq_t...,MYB_degraded_timecourse_v1_MYB_48h_0_5_deseq_t...
2,scale,scale,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,...,,,,,,,,,,
3,condition,condition,1h,1h,1h,1h,1h,1h,1h,1h,...,48h,48h,48h,48h,48h,48h,48h,48h,48h,48h
4,type,type,readcounts,readcounts,readcounts,readcounts,readcounts,readcounts,tccounts,tccounts,...,readcounts,readcounts,readcounts,readcounts,tccounts,tccounts,tccounts,tccounts,tccounts,tccounts


In [332]:
for df in deseq_dfs:
    print(df.shape)

(18311, 170)
(17217, 26)


In [333]:
deseq_dfs[1].head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,project,project,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,...,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1
1,file,file,MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_re...,MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_re...,MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_re...,MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_re...,MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_re...,MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_re...,MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_tc...,MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_tc...,...,MYB_degraded_v1_0_5_1h_deseq_readcounts.csv,MYB_degraded_v1_0_5_1h_deseq_readcounts.csv,MYB_degraded_v1_0_5_1h_deseq_readcounts.csv,MYB_degraded_v1_0_5_1h_deseq_readcounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv
2,scale,scale,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,...,,,,,,,,,,
3,condition,condition,1h,1h,1h,1h,1h,1h,1h,1h,...,1h,1h,1h,1h,1h,1h,1h,1h,1h,1h
4,type,type,readcounts,readcounts,readcounts,readcounts,readcounts,readcounts,tccounts,tccounts,...,readcounts,readcounts,readcounts,readcounts,tccounts,tccounts,tccounts,tccounts,tccounts,tccounts


### merge info data

In [334]:
def merge_info_cols(df1, df2):
    ''' merged info columns from 2 info dfs '''
    df_merged = pd.merge(df1, df2, left_on=[0,1], right_on=[0,1])
    df_merged = df_merged.T.reset_index(drop=True).T
    if (df1.shape[1]+df2.shape[1]-2) != df_merged.shape[1]:
        raise ValueError("Incorrect number of columns.\
                          df1: {}\
                          df2: {}".format(df1.shape, df2.shape))
    return(df_merged)

In [328]:
deseq_dfs[0].head()

Unnamed: 0,project_0,project_1,MYB_degraded_timecourse_v1_2,MYB_degraded_timecourse_v1_3,MYB_degraded_timecourse_v1_4,MYB_degraded_timecourse_v1_5,MYB_degraded_timecourse_v1_6,MYB_degraded_timecourse_v1_7,MYB_degraded_timecourse_v1_8,MYB_degraded_timecourse_v1_9,...,MYB_degraded_timecourse_v1_160,MYB_degraded_timecourse_v1_161,MYB_degraded_timecourse_v1_162,MYB_degraded_timecourse_v1_163,MYB_degraded_timecourse_v1_164,MYB_degraded_timecourse_v1_165,MYB_degraded_timecourse_v1_166,MYB_degraded_timecourse_v1_167,MYB_degraded_timecourse_v1_168,MYB_degraded_timecourse_v1_169
0,project,project,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,...,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1
1,file,file,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,...,MYB_degraded_timecourse_v1_MYB_48h_0_5_deseq_r...,MYB_degraded_timecourse_v1_MYB_48h_0_5_deseq_r...,MYB_degraded_timecourse_v1_MYB_48h_0_5_deseq_r...,MYB_degraded_timecourse_v1_MYB_48h_0_5_deseq_r...,MYB_degraded_timecourse_v1_MYB_48h_0_5_deseq_t...,MYB_degraded_timecourse_v1_MYB_48h_0_5_deseq_t...,MYB_degraded_timecourse_v1_MYB_48h_0_5_deseq_t...,MYB_degraded_timecourse_v1_MYB_48h_0_5_deseq_t...,MYB_degraded_timecourse_v1_MYB_48h_0_5_deseq_t...,MYB_degraded_timecourse_v1_MYB_48h_0_5_deseq_t...
2,scale,scale,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,...,,,,,,,,,,
3,condition,condition,1h,1h,1h,1h,1h,1h,1h,1h,...,48h,48h,48h,48h,48h,48h,48h,48h,48h,48h
4,type,type,readcounts,readcounts,readcounts,readcounts,readcounts,readcounts,tccounts,tccounts,...,readcounts,readcounts,readcounts,readcounts,tccounts,tccounts,tccounts,tccounts,tccounts,tccounts


In [335]:
# from first df
df_info_merged = deseq_dfs[0].iloc[:5]

# merge info columns
for df_res in deseq_dfs[1:]:
    print("df1 cols: {} | df2 cols: {}".format(df_info_merged.shape[1], df_res.shape[1]))
    df_info = df_res.iloc[:5]
    df_info_merged = merge_info_cols(df_info_merged, df_info)
    print(df_info_merged.shape[1])

df1 cols: 170 | df2 cols: 26
194


In [291]:
df_info_merged

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,184,185,186,187,188,189,190,191,192,193
0,project,project,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,...,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1
1,file,file,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,...,MYB_degraded_v1_0_5_1h_deseq_readcounts.csv,MYB_degraded_v1_0_5_1h_deseq_readcounts.csv,MYB_degraded_v1_0_5_1h_deseq_readcounts.csv,MYB_degraded_v1_0_5_1h_deseq_readcounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv
2,scale,scale,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,...,,,,,,,,,,
3,condition,condition,1h,1h,1h,1h,1h,1h,1h,1h,...,1h,1h,1h,1h,1h,1h,1h,1h,1h,1h
4,type,type,readcounts,readcounts,readcounts,readcounts,readcounts,readcounts,tccounts,tccounts,...,readcounts,readcounts,readcounts,readcounts,tccounts,tccounts,tccounts,tccounts,tccounts,tccounts


### merge gene data

In [292]:
def get_deseq_data(file, low_memory=True):
    ''' input deseq file & returns df '''
    ''' add file name to headers except gene columns '''
    df = pd.read_csv(file, header=[0], index_col=None, squeeze=True, low_memory=low_memory)
    df.columns = ["{}_{}".format(col_name, os.path.basename(file).split(".")[0]) if "gene" not in col_name
                  else col_name for col_name in df.columns.tolist()]
    return(df)

In [293]:
def rename_deseq_data(df):
    ''' renames columns uniquely to deseq output header '''
    df.columns = ["{}_{}".format(col_name, str(idx)) if "gene" not in col_name
                  else col_name for idx, col_name in enumerate(df.iloc[0].tolist())]
    df = df.iloc[1:]
    return(df)

In [295]:
# make first row unique col names
deseq_df_data = [rename_deseq_data(df) for df in deseq_dfs]

# get df data without info
deseq_df_data = [df.iloc[4:] for df in deseq_df_data]

In [317]:
deseq_df_data[0].head()

Unnamed: 0,project_0,project_1,MYB_degraded_timecourse_v1_2,MYB_degraded_timecourse_v1_3,MYB_degraded_timecourse_v1_4,MYB_degraded_timecourse_v1_5,MYB_degraded_timecourse_v1_6,MYB_degraded_timecourse_v1_7,MYB_degraded_timecourse_v1_8,MYB_degraded_timecourse_v1_9,...,MYB_degraded_timecourse_v1_160,MYB_degraded_timecourse_v1_161,MYB_degraded_timecourse_v1_162,MYB_degraded_timecourse_v1_163,MYB_degraded_timecourse_v1_164,MYB_degraded_timecourse_v1_165,MYB_degraded_timecourse_v1_166,MYB_degraded_timecourse_v1_167,MYB_degraded_timecourse_v1_168,MYB_degraded_timecourse_v1_169
5,genes,gene_id,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,baseMean,log2FoldChange,...,lfcSE,stat,pvalue,padj,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
6,A1BG,A1BG,1.509020285391674,1.3552603563897296,1.458802271013707,-0.929022653253736,0.3528773465741438,,0.0,-0.0,...,1.170742283627402,-1.3018730488038075,0.1929597870180214,0.334764736706353,0.0,-0.0,,,0.0,
7,A2MP1,A2MP1,3.901096120730295,0.2677846766039957,0.9727561997080278,-0.2752844717765572,0.7830976934992575,0.978553274562618,0.0,-0.0,...,1.3643418380377714,-0.8119007173071736,0.4168486028266804,0.5754412325696503,0.0,-0.0,,,0.0,
8,NAT1,NAT1,209.7730839698947,0.4672315284418883,0.2851632168827915,-1.6384705346970854,0.1013235757696181,0.6869262732838352,19.67703120627283,0.8800687912156389,...,0.2053050081215886,1.1479200077699958,0.2510015877499436,0.4047054828838333,31.85855453188856,0.3398739341848437,0.2982720343502219,-1.1394763673545545,0.2545045197593326,0.4674400955483486
9,AAMP,AAMP,991.5207667751332,-0.1355659279477709,0.1999948765709417,0.6778470042440479,0.4978687046349399,0.9196506180982232,31.279046224279405,-0.0603969776660648,...,0.1832196404660721,0.7866595463401398,0.4314811912520493,0.5888808447117486,29.033709272402703,-0.6043760170701675,0.3779756424934378,1.5989813869571246,0.109824738419389,0.2662655758541947


In [297]:
for df_data in deseq_df_data:
    print(df_data.shape)

(18306, 170)
(17212, 26)


In [298]:
# main df
df_data_merged = pd.DataFrame()
# add gene cols to main df
df_data_merged = df_data_merged.append(deseq_df_data[0][["project_0", "project_1"]])

In [299]:
# join df columns on gene & gene_id columns
df_data_merged = reduce(lambda left, right: pd.merge(left, right, on=["project_0", "project_1"], how="outer"), 
                        [df_data_merged] + deseq_df_data)
df_data_merged.shape

(18670, 194)

In [300]:
df_data_merged.head()

Unnamed: 0,project_0,project_1,MYB_degraded_timecourse_v1_2,MYB_degraded_timecourse_v1_3,MYB_degraded_timecourse_v1_4,MYB_degraded_timecourse_v1_5,MYB_degraded_timecourse_v1_6,MYB_degraded_timecourse_v1_7,MYB_degraded_timecourse_v1_8,MYB_degraded_timecourse_v1_9,...,MYB_degraded_v1_16,MYB_degraded_v1_17,MYB_degraded_v1_18,MYB_degraded_v1_19,MYB_degraded_v1_20,MYB_degraded_v1_21,MYB_degraded_v1_22,MYB_degraded_v1_23,MYB_degraded_v1_24,MYB_degraded_v1_25
0,genes,gene_id,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,baseMean,log2FoldChange,...,lfcSE,stat,pvalue,padj,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
1,A1BG,A1BG,1.509020285391674,1.3552603563897296,1.458802271013707,-0.929022653253736,0.3528773465741438,,0.0,-0.0,...,0.9846438463419186,-0.1150979000590839,0.9083675243535372,,0.0586605454997294,-0.5006131361318857,3.1165396500854228,0.1606310820137212,0.8723839728699718,
2,A2MP1,A2MP1,3.901096120730295,0.2677846766039957,0.9727561997080278,-0.2752844717765572,0.7830976934992575,0.978553274562618,0.0,-0.0,...,0.4803906922299277,0.3695105397497671,0.7117472185967626,,0.0,-0.0,,,0.0,
3,NAT1,NAT1,209.7730839698947,0.4672315284418883,0.2851632168827915,-1.6384705346970854,0.1013235757696181,0.6869262732838352,19.67703120627283,0.8800687912156389,...,0.1563761040955219,-0.730186631009811,0.4652761132688894,0.9999151129996772,5.269074099365515,0.6666255413136394,0.5672453670506757,-1.175197859754552,0.2399155663058868,0.7345162758223772
4,AAMP,AAMP,991.5207667751332,-0.1355659279477709,0.1999948765709417,0.6778470042440479,0.4978687046349399,0.9196506180982232,31.279046224279405,-0.0603969776660648,...,0.0753972581115594,0.3452268366600209,0.7299238448286257,0.9999151129996772,9.858397411395757,0.2946151482964133,0.3381536029740998,-0.8712465155043128,0.3836195667847473,0.8365778574054543


In [301]:
# total unique genes
unique_gene_list = []
for df_data in deseq_df_data:
    unique_gene_list.extend(df_data.iloc[1:,0].tolist())
print("Total unique genes: {}".format(len(list(set(unique_gene_list)))))

Total unique genes: 18669


In [302]:
df_merged = df_data_merged

In [303]:
df_merged.head()

Unnamed: 0,project_0,project_1,MYB_degraded_timecourse_v1_2,MYB_degraded_timecourse_v1_3,MYB_degraded_timecourse_v1_4,MYB_degraded_timecourse_v1_5,MYB_degraded_timecourse_v1_6,MYB_degraded_timecourse_v1_7,MYB_degraded_timecourse_v1_8,MYB_degraded_timecourse_v1_9,...,MYB_degraded_v1_16,MYB_degraded_v1_17,MYB_degraded_v1_18,MYB_degraded_v1_19,MYB_degraded_v1_20,MYB_degraded_v1_21,MYB_degraded_v1_22,MYB_degraded_v1_23,MYB_degraded_v1_24,MYB_degraded_v1_25
0,genes,gene_id,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,baseMean,log2FoldChange,...,lfcSE,stat,pvalue,padj,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
1,A1BG,A1BG,1.509020285391674,1.3552603563897296,1.458802271013707,-0.929022653253736,0.3528773465741438,,0.0,-0.0,...,0.9846438463419186,-0.1150979000590839,0.9083675243535372,,0.0586605454997294,-0.5006131361318857,3.1165396500854228,0.1606310820137212,0.8723839728699718,
2,A2MP1,A2MP1,3.901096120730295,0.2677846766039957,0.9727561997080278,-0.2752844717765572,0.7830976934992575,0.978553274562618,0.0,-0.0,...,0.4803906922299277,0.3695105397497671,0.7117472185967626,,0.0,-0.0,,,0.0,
3,NAT1,NAT1,209.7730839698947,0.4672315284418883,0.2851632168827915,-1.6384705346970854,0.1013235757696181,0.6869262732838352,19.67703120627283,0.8800687912156389,...,0.1563761040955219,-0.730186631009811,0.4652761132688894,0.9999151129996772,5.269074099365515,0.6666255413136394,0.5672453670506757,-1.175197859754552,0.2399155663058868,0.7345162758223772
4,AAMP,AAMP,991.5207667751332,-0.1355659279477709,0.1999948765709417,0.6778470042440479,0.4978687046349399,0.9196506180982232,31.279046224279405,-0.0603969776660648,...,0.0753972581115594,0.3452268366600209,0.7299238448286257,0.9999151129996772,9.858397411395757,0.2946151482964133,0.3381536029740998,-0.8712465155043128,0.3836195667847473,0.8365778574054543


In [304]:
df_merged = df_merged.T.reset_index(drop=True).T

In [305]:
df_merged.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,184,185,186,187,188,189,190,191,192,193
0,genes,gene_id,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,baseMean,log2FoldChange,...,lfcSE,stat,pvalue,padj,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
1,A1BG,A1BG,1.509020285391674,1.3552603563897296,1.458802271013707,-0.929022653253736,0.3528773465741438,,0.0,-0.0,...,0.9846438463419186,-0.1150979000590839,0.9083675243535372,,0.0586605454997294,-0.5006131361318857,3.1165396500854228,0.1606310820137212,0.8723839728699718,
2,A2MP1,A2MP1,3.901096120730295,0.2677846766039957,0.9727561997080278,-0.2752844717765572,0.7830976934992575,0.978553274562618,0.0,-0.0,...,0.4803906922299277,0.3695105397497671,0.7117472185967626,,0.0,-0.0,,,0.0,
3,NAT1,NAT1,209.7730839698947,0.4672315284418883,0.2851632168827915,-1.6384705346970854,0.1013235757696181,0.6869262732838352,19.67703120627283,0.8800687912156389,...,0.1563761040955219,-0.730186631009811,0.4652761132688894,0.9999151129996772,5.269074099365515,0.6666255413136394,0.5672453670506757,-1.175197859754552,0.2399155663058868,0.7345162758223772
4,AAMP,AAMP,991.5207667751332,-0.1355659279477709,0.1999948765709417,0.6778470042440479,0.4978687046349399,0.9196506180982232,31.279046224279405,-0.0603969776660648,...,0.0753972581115594,0.3452268366600209,0.7299238448286257,0.9999151129996772,9.858397411395757,0.2946151482964133,0.3381536029740998,-0.8712465155043128,0.3836195667847473,0.8365778574054543


__add info columns__

In [306]:
# reset column index
df_info_merged = df_info_merged.T.reset_index(drop=True).T

In [307]:
df_info_merged.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,184,185,186,187,188,189,190,191,192,193
0,project,project,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,...,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1
1,file,file,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,...,MYB_degraded_v1_0_5_1h_deseq_readcounts.csv,MYB_degraded_v1_0_5_1h_deseq_readcounts.csv,MYB_degraded_v1_0_5_1h_deseq_readcounts.csv,MYB_degraded_v1_0_5_1h_deseq_readcounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv
2,scale,scale,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,...,,,,,,,,,,
3,condition,condition,1h,1h,1h,1h,1h,1h,1h,1h,...,1h,1h,1h,1h,1h,1h,1h,1h,1h,1h
4,type,type,readcounts,readcounts,readcounts,readcounts,readcounts,readcounts,tccounts,tccounts,...,readcounts,readcounts,readcounts,readcounts,tccounts,tccounts,tccounts,tccounts,tccounts,tccounts


In [308]:
%%time

# combine info and data
df_save = df_info_merged.append(df_merged, ignore_index=True)
df_save.shape

CPU times: user 32 ms, sys: 0 ns, total: 32 ms
Wall time: 31.7 ms


(18675, 194)

In [313]:
df_save.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,184,185,186,187,188,189,190,191,192,193
0,project,project,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,...,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1
1,file,file,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,...,MYB_degraded_v1_0_5_1h_deseq_readcounts.csv,MYB_degraded_v1_0_5_1h_deseq_readcounts.csv,MYB_degraded_v1_0_5_1h_deseq_readcounts.csv,MYB_degraded_v1_0_5_1h_deseq_readcounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv
2,scale,scale,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,...,,,,,,,,,,
3,condition,condition,1h,1h,1h,1h,1h,1h,1h,1h,...,1h,1h,1h,1h,1h,1h,1h,1h,1h,1h
4,type,type,readcounts,readcounts,readcounts,readcounts,readcounts,readcounts,tccounts,tccounts,...,readcounts,readcounts,readcounts,readcounts,tccounts,tccounts,tccounts,tccounts,tccounts,tccounts


__save composite merged file__

In [316]:
# save file
res_dir = "../results"
proj_group = "MYB_degraded"
deseq_res_file = "slamseq_{}merged_deseq_output{}.csv".format(proj_group+"_", "" if alt_loci == True else "_noAlt")

df_save.to_csv("{}/{}".format(res_dir, deseq_res_file), header=False, index=False)

print("Saved DESeq results to: {}/{}".format(res_dir, deseq_res_file))

Saved DESeq results to: ../results/slamseq_MYB_degraded_merged_deseq_output_noAlt.csv


# IRF2BP2_degraded_v1 (project 1)

__res_v4 no alt__

In [183]:
project = "IRF2BP2_degraded_v1"
res_v = "res_v4"
project_dir = "slamseq_{}_{}".format(project, res_v)
#proj_dir = "slamseq_{}".format(project1)

In [184]:
files = ! ls $res_dir/$project_dir/*.csv
files = [file for file in files if "deseq" in file if "drop" not in file]
# ERCC files first
files = [file for file in files if "ERCCsamplewise" in file] + \
[file for file in files if "ERCCsamplewise" not in file]
print(len(files))

9


In [185]:
# remove existing deseq merged output
files = files[:-1]

In [186]:
# project 1
conditions = ["3h", "6h"]
scaling = ["ERCCsamplewise", None]
count_type = ["readcounts", "tccounts"]

### make info rows

In [187]:
# df with file info
df_info = pd.DataFrame(list(zip(["project", "file", "scale", "condition", "type"],
                                ["project", "file", "scale", "condition", "type"])))

In [188]:
df_info = pd.concat([df_info] + [pd.DataFrame([x*6 for x in get_file_info(project, file, conditions)]) \
                                 for file in files], axis=1)
df_info.shape

(5, 50)

In [189]:
df_info.head()

Unnamed: 0,0,1,0.1,1.1,2,3,4,5,0.2,1.2,...,2.1,3.1,4.1,5.1,0.3,1.3,2.2,3.2,4.2,5.2
0,project,project,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,...,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1
1,file,file,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,...,IRF2BP2_degraded_v1_0_5_6h_deseq_readcounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_readcounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_readcounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_readcounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_tccounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_tccounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_tccounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_tccounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_tccounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_tccounts.csv
2,scale,scale,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,...,,,,,,,,,,
3,condition,condition,3h,3h,3h,3h,3h,3h,3h,3h,...,6h,6h,6h,6h,6h,6h,6h,6h,6h,6h
4,type,type,readcounts,readcounts,readcounts,readcounts,readcounts,readcounts,tccounts,tccounts,...,readcounts,readcounts,readcounts,readcounts,tccounts,tccounts,tccounts,tccounts,tccounts,tccounts


### get dfs and merge

In [190]:
# Get list of deseq dfs
deseq_dfs = [get_deseq_data(file) for file in files]

In [30]:
# main df
df_merged = pd.DataFrame()
df_merged = df_merged.append(deseq_dfs[0][["genes", "gene_id"]]) # add gene cols to main df

In [31]:
# join df columns on gene & gene_id columns
df_merged = reduce(lambda left, right: pd.merge(left, right, on=['genes', "gene_id"], how="outer"), 
                   [df_merged] + deseq_dfs)
df_merged.shape

(15632, 50)

In [32]:
df_merged.iloc[:5]

Unnamed: 0,genes,gene_id,baseMean_IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_deseq_readcounts,log2FoldChange_IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_deseq_readcounts,lfcSE_IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_deseq_readcounts,stat_IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_deseq_readcounts,pvalue_IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_deseq_readcounts,padj_IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_deseq_readcounts,baseMean_IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_deseq_tccounts,log2FoldChange_IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_deseq_tccounts,...,lfcSE_IRF2BP2_degraded_v1_0_5_6h_deseq_readcounts,stat_IRF2BP2_degraded_v1_0_5_6h_deseq_readcounts,pvalue_IRF2BP2_degraded_v1_0_5_6h_deseq_readcounts,padj_IRF2BP2_degraded_v1_0_5_6h_deseq_readcounts,baseMean_IRF2BP2_degraded_v1_0_5_6h_deseq_tccounts,log2FoldChange_IRF2BP2_degraded_v1_0_5_6h_deseq_tccounts,lfcSE_IRF2BP2_degraded_v1_0_5_6h_deseq_tccounts,stat_IRF2BP2_degraded_v1_0_5_6h_deseq_tccounts,pvalue_IRF2BP2_degraded_v1_0_5_6h_deseq_tccounts,padj_IRF2BP2_degraded_v1_0_5_6h_deseq_tccounts
0,NAT1,NAT1,95.870596,-0.461812,0.183899,2.511222,0.012031,0.176482,5.033275,-0.560236,...,0.249984,1.345021,0.178618,0.918252,0.75263,2.004205,1.827815,-1.096503,0.272859,
1,AAMP,AAMP,842.371621,-0.147606,0.118079,1.250062,0.211277,0.621102,14.553492,0.310173,...,0.143612,-1.174215,0.240309,0.95945,7.433181,0.284259,0.600667,-0.473238,0.636043,
2,AARS1,AARS1,1374.176487,-0.025248,0.109373,0.230845,0.817435,0.957162,16.200285,-0.137228,...,0.167173,-0.658551,0.510184,0.996674,11.302474,0.073024,0.490502,-0.148877,0.881651,
3,ABAT,ABAT,55.922405,-0.358522,0.2117,1.693536,0.090354,0.438694,1.693745,-0.949985,...,0.268125,0.426446,0.669783,0.996674,0.634574,-1.007275,2.080292,0.484199,0.628245,
4,ABCA1,ABCA1,194.049987,0.07271,0.160794,-0.45219,0.651132,0.90241,6.337421,-0.445437,...,0.274569,4.61696,4e-06,0.000465,1.389801,-1.065289,1.377289,0.773468,0.439245,


In [33]:
# change column names
df_merged.columns = [col_name.split("_")[0] if "gene" not in col_name else 
                     col_name for col_name in df_merged.columns.tolist()]
df_merged = df_merged.T.reset_index().T.reset_index(drop=True)
df_merged.shape

(15633, 50)

### add info columns

In [35]:
print(df_info.shape)
print(df_merged.shape)

(5, 50)
(15633, 50)


In [36]:
# reset column index
df_info = df_info.T.reset_index(drop=True).T

In [37]:
%%time

# combine info and data
df_save = df_info.append(df_merged, ignore_index=True)
df_save.shape

CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 5.72 ms


(15638, 50)

### save merged file

In [39]:
# save file
res_dir = "../results"
deseq_res_file = "slamseq_{}_{}_0_5_deseq_output.csv".format(project, res_v)

df_save.to_csv("{}/{}/{}".format(res_dir, project_dir, deseq_res_file), header=False, index=False)

print("Saved DESeq results to: {}/{}".format(project_dir, deseq_res_file))

Saved DESeq results to: slamseq_IRF2BP2_degraded_v1_res_v4/slamseq_IRF2BP2_degraded_v1_res_v4_0_5_deseq_output.csv


# MYB_degraded_v1 (project 2)

In [34]:
# read in merged deseq file
res_dir = "../results"
deseq_res_file = "slamseq_merged_deseq_output.csv"

df_save = pd.read_csv("{}/{}".format(res_dir, deseq_res_file), index_col=False, header=None, squeeze=True)
print(df_save.shape)

(17275, 74)


  exec(code_obj, self.user_global_ns, self.user_ns)


In [35]:
project = project2
project_dir = "slamseq_{}".format(project)

In [36]:
files = ! ls $res_dir/$project_dir/*.csv
files = [file for file in files if "deseq" in file if "drop" not in file]
# ERCC files first
files = [file for file in files if "ERCCsamplewise" in file] + \
[file for file in files if "ERCCsamplewise" not in file]
print(len(files))

4


In [37]:
# project 2
conditions = ["1h"]
scaling = ["ERCCsamplewise", None]
count_type = ["readcounts", "tccounts"]

### make info rows

In [164]:
# df with file info
df_info = pd.DataFrame(list(zip(["project", "file", "scale", "condition", "type"],
                                ["project", "file", "scale", "condition", "type"])))

In [165]:
df_info = pd.concat([df_info] + [pd.DataFrame([x*6 for x in get_file_info(project2, file, conditions)]) \
                                 for file in files], axis=1)
df_info.shape

(5, 26)

In [168]:
df_info.iloc[:,[0,1,20,21,22,23,24]]

Unnamed: 0,0,1,0.1,1.1,2,3,4
0,project,project,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1
1,file,file,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv
2,scale,scale,,,,,
3,condition,condition,1h,1h,1h,1h,1h
4,type,type,tccounts,tccounts,tccounts,tccounts,tccounts


### merge dfs

In [10]:
# Get list of deseq dfs
deseq_dfs = [get_deseq_data(file) for file in files]
deseq_dfs[0].iloc[:1]

Unnamed: 0,baseMean_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,log2FoldChange_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,lfcSE_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,stat_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,pvalue_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,padj_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,genes,gene_id
0,1.877541,-0.045789,1.010641,0.045307,0.963863,,A1BG,A1BG


In [11]:
# main df
df_merged = pd.DataFrame()
df_merged = df_merged.append(deseq_dfs[0][["genes", "gene_id"]]) # add gene cols to main df

In [12]:
# join df columns on gene & gene_id columns
df_merged = reduce(lambda left, right: pd.merge(left, right, on=['genes', "gene_id"], how="outer"), 
                   [df_merged] + deseq_dfs)
df_merged.shape

(16558, 26)

In [146]:
# change column names
df_merged.columns = [col_name.split("_")[0] if "gene" not in col_name else 
                     col_name for col_name in df_merged.columns.tolist()]
df_merged = df_merged.T.reset_index().T.reset_index(drop=True)

### merge info cols

In [147]:
df_save_info = df_save.iloc[:5,]

In [171]:
print(df_save_info.shape)
print(df_info.shape)

(5, 50)
(5, 26)


In [170]:
df_save_info

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,project,project,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,...,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1
1,file,file,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,...,IRF2BP2_degraded_v1_0_5_6h_deseq_readcounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_readcounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_readcounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_readcounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_tccounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_tccounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_tccounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_tccounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_tccounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_tccounts.csv
2,scale,scale,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,...,,,,,,,,,,
3,condition,condition,3h,3h,3h,3h,3h,3h,3h,3h,...,6h,6h,6h,6h,6h,6h,6h,6h,6h,6h
4,type,type,readcounts,readcounts,readcounts,readcounts,readcounts,readcounts,tccounts,tccounts,...,readcounts,readcounts,readcounts,readcounts,tccounts,tccounts,tccounts,tccounts,tccounts,tccounts


In [173]:
# reset column index
df_info = df_info.T.reset_index(drop=True).T

In [177]:
# merge info columns
df_info_merged = pd.merge(df_save_info, df_info, left_on=[0,1], right_on=[0,1])
df_info_merged = df_info_merged.T.reset_index(drop=True).T
df_info_merged.shape

(5, 74)

### merge gene data

In [None]:
# main df
df_merged = pd.DataFrame()
df_merged = df_merged.append(deseq_dfs[0][["genes", "gene_id"]]) # add gene cols to main df

In [179]:
print(df_merged.shape)
df_merged.iloc[:3]

(16559, 26)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,genes,gene_id,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,baseMean,log2FoldChange,...,lfcSE,stat,pvalue,padj,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
1,A1BG,A1BG,1.877541,-0.045789,1.010641,0.045307,0.963863,,0.05833,-0.614498,...,0.980153,-0.116824,0.907,,0.05833,-0.499954,3.11654,0.16042,0.87255,
2,A2MP1,A2MP1,5.229468,-0.26904,0.500213,0.537851,0.59068,,0.0,-0.0,...,0.478116,0.372684,0.709383,,0.0,-0.0,,,0.0,


In [182]:
# merged data w/o info
df_saved_output = df_save.iloc[5:]
df_saved_output.shape

(15113, 50)

In [190]:
saved_genes = df_saved_output.iloc[1:,0].tolist()
print(len(saved_genes))
new_genes = df_merged.iloc[1:,0].tolist()
print(len(new_genes))
print("Total unique genes combined: {}".format(len(set(saved_genes+new_genes))))

15112
16558
Total unique genes combined: 17269


In [194]:
df_saved_output.iloc[:3]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
5,genes,gene_id,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,baseMean,log2FoldChange,...,lfcSE,stat,pvalue,padj,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
6,NAT1,NAT1,98.73474046943294,-0.3588695183526639,0.1795865924500918,1.998309080074538,0.0456831617026497,0.5573132426854484,5.021740199533236,-0.5547845714960977,...,0.2491509542089338,1.3371348848527798,0.181178611933578,0.8915384574240992,0.7530778794721894,2.014292355431807,1.8320814934075496,-1.0994556534083848,0.2715693670830352,
7,AAMP,AAMP,872.5000270269832,-0.0418951057520827,0.1166181865869584,0.3592501905424756,0.7194079326130601,0.9684493014304824,14.549760596196997,0.3164736874507167,...,0.1441953888640895,-1.1898593830650157,0.2341016648908468,0.9320326022466292,7.440114778273713,0.290637206923639,0.6010687521535898,-0.483534048113972,0.6287165903309597,


In [201]:
df_saved_output.columns = df_saved_output.iloc[0]
df_saved_output = df_saved_output[1:]

In [203]:
df_saved_output[:2]

5,genes,gene_id,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,baseMean.1,log2FoldChange.1,...,lfcSE.1,stat.1,pvalue.1,padj.1,baseMean.2,log2FoldChange.2,lfcSE.2,stat.2,pvalue.2,padj.2
6,NAT1,NAT1,98.73474046943294,-0.3588695183526639,0.1795865924500918,1.998309080074538,0.0456831617026497,0.5573132426854484,5.021740199533236,-0.5547845714960977,...,0.2491509542089338,1.3371348848527798,0.181178611933578,0.8915384574240992,0.7530778794721894,2.014292355431807,1.8320814934075496,-1.0994556534083848,0.2715693670830352,
7,AAMP,AAMP,872.5000270269832,-0.0418951057520827,0.1166181865869584,0.3592501905424756,0.7194079326130601,0.9684493014304824,14.549760596196997,0.3164736874507167,...,0.1441953888640895,-1.1898593830650157,0.2341016648908468,0.9320326022466292,7.440114778273713,0.290637206923639,0.6010687521535898,-0.483534048113972,0.6287165903309597,


In [216]:
df_saved_output.shape

(15112, 50)

In [223]:
# change number values to float
num_cols = list(range(2,df_saved_output.shape[1]))
df_saved_output.iloc[:,num_cols] = df_saved_output.iloc[:,num_cols].astype(float)

__Make df with all genes__

In [241]:
print("Saved df genes: {}".format(len(df_saved_output['genes'])))
print("Merged df genes: {}".format(len(df_merged['genes'])))
merged_genes = list(set(df_saved_output['genes'].tolist() + df_merged['genes'].tolist()))
merged_genes.sort()
merged_genes = sorted(merged_genes, key=lambda x: (x.isnumeric(),int(x) if x.isnumeric() else x))
print("Total unique genes: {}".format(len(merged_genes)))

Saved df genes: 15112
Merged df genes: 16558
Total unique genes: 17269


In [243]:
df = pd.DataFrame(list(zip(merged_genes, merged_genes)),
                  columns=["genes", "gene_id"])

In [245]:
df = reduce(lambda left, right: pd.merge(left, right, on=['genes', "gene_id"], how="outer"), 
                   [df, df_saved_output, df_merged])

In [246]:
print(df.shape)
df.head()

(17269, 74)


Unnamed: 0,genes,gene_id,baseMean_x,log2FoldChange_x,lfcSE_x,stat_x,pvalue_x,padj_x,baseMean_x.1,log2FoldChange_x.1,...,lfcSE_y,stat_y,pvalue_y,padj_y,baseMean_y,log2FoldChange_y,lfcSE_y.1,stat_y.1,pvalue_y.1,padj_y.1
0,A1BG,A1BG,,,,,,,,,...,0.980153,-0.116824,0.907,,0.05833,-0.499954,3.11654,0.16042,0.87255,
1,A1BG-AS1,A1BG-AS1,25.689167,0.01394,0.311866,-0.044699,0.964347,0.997077,0.209183,-0.545181,...,0.179837,-1.153464,0.24872,0.999857,0.0,-0.0,,,0.0,
2,A1CF,A1CF,,,,,,,,,...,0.577384,0.735683,0.461924,,0.0,-0.0,,,0.0,
3,A2M-AS1,A2M-AS1,,,,,,,,,...,0.894369,-0.055995,0.955345,,0.0,-0.0,,,0.0,
4,A2ML1,A2ML1,,,,,,,,,...,0.706911,0.342414,0.732039,,0.0,-0.0,,,0.0,


In [247]:
# change column names
df.columns = [col_name.split("_")[0] if "gene" not in col_name else 
              col_name for col_name in df.columns.tolist()]
df = df.T.reset_index().T.reset_index(drop=True)

### add info cols

In [248]:
print(df.shape)
print(df_info_merged.shape)

(17270, 74)
(5, 74)


In [250]:
# reset column index
df_info_merged = df_info_merged.T.reset_index(drop=True).T

In [251]:
%%time

# combine info and data
df_save = df_info_merged.append(df, ignore_index=True)
df_save.shape

CPU times: user 12 ms, sys: 8 ms, total: 20 ms
Wall time: 22.7 ms


(17275, 74)

### save merged df

In [252]:
# save file
res_dir = "../results"
deseq_res_file = "slamseq_merged_deseq_output.csv"

df_save.to_csv("{}/{}".format(res_dir, deseq_res_file), header=False, index=False)

print("Saved DESeq results to: {}/{}".format(res_dir, deseq_res_file))

Saved DESeq results to: ../results/slamseq_merged_deseq_output.csv


# 4_degrons_v1 (RUNX1)

In [68]:
project = "4_degrons_v1"
project_dir = "slamseq_{}".format(project)
gene = "RUNX1"

In [69]:
files = ! ls $res_dir/$project_dir/*.csv
files = [file for file in files if "deseq" in file and gene in file]
# ERCC files first
files = [file for file in files if "ERCCsamplewise" in file] + \
[file for file in files if "ERCCsamplewise" not in file]
print(len(files))
files

5


['../results/slamseq_4_degrons_v1/4_degrons_v1_RUNX1_1h_ERCCsamplewise_0_5_deseq_readcounts.csv',
 '../results/slamseq_4_degrons_v1/4_degrons_v1_RUNX1_1h_ERCCsamplewise_0_5_deseq_tccounts.csv',
 '../results/slamseq_4_degrons_v1/4_degrons_v1_RUNX1_1h_0_5_deseq_readcounts.csv',
 '../results/slamseq_4_degrons_v1/4_degrons_v1_RUNX1_1h_0_5_deseq_tccounts.csv',
 '../results/slamseq_4_degrons_v1/slamseq_4_degrons_v1_RUNX1_0_5_deseq_output.csv']

In [70]:
files = files[:-1]

In [71]:
# project 3
conditions = ["1h"]
scaling = ["ERCCsamplewise", None]
count_type = ["readcounts", "tccounts"]

### make info rows

In [72]:
# df with file info
df_info = pd.DataFrame(list(zip(["project", "file", "scale", "condition", "type"],
                                ["project", "file", "scale", "condition", "type"])))

In [73]:
df_info = pd.concat([df_info] + [pd.DataFrame([x*6 for x in get_file_info(project, file, conditions)]) \
                                 for file in files], axis=1)
df_info.shape

(5, 26)

### get dfs and merge

In [75]:
# Get list of deseq dfs
deseq_dfs = [get_deseq_data(file) for file in files]

In [76]:
# main df
df_merged = pd.DataFrame()
df_merged = df_merged.append(deseq_dfs[0][["genes", "gene_id"]]) # add gene cols to main df

In [78]:
# join df columns on gene & gene_id columns
df_merged = reduce(lambda left, right: pd.merge(left, right, on=['genes', "gene_id"], how="outer"), 
                   [df_merged] + deseq_dfs)
df_merged.shape

(16744, 26)

In [79]:
df_merged.iloc[:2]

Unnamed: 0,genes,gene_id,baseMean_4_degrons_v1_RUNX1_1h_ERCCsamplewise_0_5_deseq_readcounts,log2FoldChange_4_degrons_v1_RUNX1_1h_ERCCsamplewise_0_5_deseq_readcounts,lfcSE_4_degrons_v1_RUNX1_1h_ERCCsamplewise_0_5_deseq_readcounts,stat_4_degrons_v1_RUNX1_1h_ERCCsamplewise_0_5_deseq_readcounts,pvalue_4_degrons_v1_RUNX1_1h_ERCCsamplewise_0_5_deseq_readcounts,padj_4_degrons_v1_RUNX1_1h_ERCCsamplewise_0_5_deseq_readcounts,baseMean_4_degrons_v1_RUNX1_1h_ERCCsamplewise_0_5_deseq_tccounts,log2FoldChange_4_degrons_v1_RUNX1_1h_ERCCsamplewise_0_5_deseq_tccounts,...,lfcSE_4_degrons_v1_RUNX1_1h_0_5_deseq_readcounts,stat_4_degrons_v1_RUNX1_1h_0_5_deseq_readcounts,pvalue_4_degrons_v1_RUNX1_1h_0_5_deseq_readcounts,padj_4_degrons_v1_RUNX1_1h_0_5_deseq_readcounts,baseMean_4_degrons_v1_RUNX1_1h_0_5_deseq_tccounts,log2FoldChange_4_degrons_v1_RUNX1_1h_0_5_deseq_tccounts,lfcSE_4_degrons_v1_RUNX1_1h_0_5_deseq_tccounts,stat_4_degrons_v1_RUNX1_1h_0_5_deseq_tccounts,pvalue_4_degrons_v1_RUNX1_1h_0_5_deseq_tccounts,padj_4_degrons_v1_RUNX1_1h_0_5_deseq_tccounts
0,A2M,A2M,2.269069,-0.155731,1.023965,0.152086,0.879119,,0.0,-0.0,...,1.025011,0.157375,0.87495,,0.0,-0.0,,,0.0,
1,A2MP1,A2MP1,2.0503,3.698355,1.401673,-2.638529,0.008327,,0.0,-0.0,...,1.402745,-2.657264,0.007878,,0.0,-0.0,,,0.0,


In [86]:
# change column names
df_merged.columns = [col_name.split("_")[0] if "gene" not in col_name else 
                     col_name for col_name in df_merged.columns.tolist()]
df_merged = df_merged.T.reset_index().T.reset_index(drop=True)
df_merged.shape

(16745, 26)

### add info columns

In [89]:
print(df_info.shape)
print(df_merged.shape)

(5, 26)
(16745, 26)


In [81]:
# reset column index
df_info = df_info.T.reset_index(drop=True).T

In [82]:
print(df_info.shape)
print(df_merged.shape)

(5, 26)
(16744, 26)


In [90]:
df_info.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,project,project,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,...,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1
1,file,file,4_degrons_v1_RUNX1_1h_ERCCsamplewise_0_5_deseq...,4_degrons_v1_RUNX1_1h_ERCCsamplewise_0_5_deseq...,4_degrons_v1_RUNX1_1h_ERCCsamplewise_0_5_deseq...,4_degrons_v1_RUNX1_1h_ERCCsamplewise_0_5_deseq...,4_degrons_v1_RUNX1_1h_ERCCsamplewise_0_5_deseq...,4_degrons_v1_RUNX1_1h_ERCCsamplewise_0_5_deseq...,4_degrons_v1_RUNX1_1h_ERCCsamplewise_0_5_deseq...,4_degrons_v1_RUNX1_1h_ERCCsamplewise_0_5_deseq...,...,4_degrons_v1_RUNX1_1h_0_5_deseq_readcounts.csv,4_degrons_v1_RUNX1_1h_0_5_deseq_readcounts.csv,4_degrons_v1_RUNX1_1h_0_5_deseq_readcounts.csv,4_degrons_v1_RUNX1_1h_0_5_deseq_readcounts.csv,4_degrons_v1_RUNX1_1h_0_5_deseq_tccounts.csv,4_degrons_v1_RUNX1_1h_0_5_deseq_tccounts.csv,4_degrons_v1_RUNX1_1h_0_5_deseq_tccounts.csv,4_degrons_v1_RUNX1_1h_0_5_deseq_tccounts.csv,4_degrons_v1_RUNX1_1h_0_5_deseq_tccounts.csv,4_degrons_v1_RUNX1_1h_0_5_deseq_tccounts.csv
2,scale,scale,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,...,,,,,,,,,,
3,condition,condition,1h,1h,1h,1h,1h,1h,1h,1h,...,1h,1h,1h,1h,1h,1h,1h,1h,1h,1h
4,type,type,readcounts,readcounts,readcounts,readcounts,readcounts,readcounts,tccounts,tccounts,...,readcounts,readcounts,readcounts,readcounts,tccounts,tccounts,tccounts,tccounts,tccounts,tccounts


In [91]:
df_merged.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,genes,gene_id,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,baseMean,log2FoldChange,...,lfcSE,stat,pvalue,padj,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
1,A2M,A2M,2.269069,-0.155731,1.023965,0.152086,0.879119,,0.0,-0.0,...,1.025011,0.157375,0.87495,,0.0,-0.0,,,0.0,
2,A2MP1,A2MP1,2.0503,3.698355,1.401673,-2.638529,0.008327,,0.0,-0.0,...,1.402745,-2.657264,0.007878,,0.0,-0.0,,,0.0,
3,NAT1,NAT1,93.83574,-0.131532,0.177942,0.739184,0.459795,0.954039,7.47324,0.280315,...,0.179349,0.669795,0.502989,,7.476669,0.284898,0.540413,-0.527186,0.598064,
4,AAMP,AAMP,668.981822,-0.004677,0.085659,0.054603,0.956455,0.996275,16.217859,-0.184215,...,0.086942,-0.063919,0.949035,0.993941,16.281909,-0.173311,0.362675,0.477867,0.632745,


In [92]:
%%time

# combine info and data
df_save = df_info.append(df_merged, ignore_index=True)
df_save.shape

CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 5.29 ms


(16750, 26)

### save merged file

In [94]:
# save file
res_dir = "../results"
deseq_res_file = "slamseq_{}_{}_0_5_deseq_output.csv".format(project, gene)

df_save.to_csv("{}/{}/{}".format(res_dir, project_dir, deseq_res_file), header=False, index=False)

print("Saved DESeq results to: {}/{}".format(project_dir, deseq_res_file))

Saved DESeq results to: slamseq_4_degrons_v1/slamseq_4_degrons_v1_RUNX1_0_5_deseq_output.csv


# 4_degrons_v1 (RUNX2)

In [95]:
project = "4_degrons_v1"
project_dir = "slamseq_{}".format(project)
gene = "RUNX2"

In [96]:
files = ! ls $res_dir/$project_dir/*.csv
files = [file for file in files if "deseq" in file and gene in file]
# ERCC files first
files = [file for file in files if "ERCCsamplewise" in file] + \
[file for file in files if "ERCCsamplewise" not in file]
print(len(files))

5


In [97]:
files = files[:-1]

In [98]:
# project 3
conditions = ["1h"]
scaling = ["ERCCsamplewise", None]
count_type = ["readcounts", "tccounts"]

### make info rows

In [99]:
# df with file info
df_info = pd.DataFrame(list(zip(["project", "file", "scale", "condition", "type"],
                                ["project", "file", "scale", "condition", "type"])))

In [100]:
df_info = pd.concat([df_info] + [pd.DataFrame([x*6 for x in get_file_info(project, file, conditions)]) \
                                 for file in files], axis=1)
df_info.shape

(5, 26)

In [101]:
df_info.head()

Unnamed: 0,0,1,0.1,1.1,2,3,4,5,0.2,1.2,...,2.1,3.1,4.1,5.1,0.3,1.3,2.2,3.2,4.2,5.2
0,project,project,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,...,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1
1,file,file,4_degrons_v1_RUNX2_1h_ERCCsamplewise_0_5_deseq...,4_degrons_v1_RUNX2_1h_ERCCsamplewise_0_5_deseq...,4_degrons_v1_RUNX2_1h_ERCCsamplewise_0_5_deseq...,4_degrons_v1_RUNX2_1h_ERCCsamplewise_0_5_deseq...,4_degrons_v1_RUNX2_1h_ERCCsamplewise_0_5_deseq...,4_degrons_v1_RUNX2_1h_ERCCsamplewise_0_5_deseq...,4_degrons_v1_RUNX2_1h_ERCCsamplewise_0_5_deseq...,4_degrons_v1_RUNX2_1h_ERCCsamplewise_0_5_deseq...,...,4_degrons_v1_RUNX2_1h_0_5_deseq_readcounts.csv,4_degrons_v1_RUNX2_1h_0_5_deseq_readcounts.csv,4_degrons_v1_RUNX2_1h_0_5_deseq_readcounts.csv,4_degrons_v1_RUNX2_1h_0_5_deseq_readcounts.csv,4_degrons_v1_RUNX2_1h_0_5_deseq_tccounts.csv,4_degrons_v1_RUNX2_1h_0_5_deseq_tccounts.csv,4_degrons_v1_RUNX2_1h_0_5_deseq_tccounts.csv,4_degrons_v1_RUNX2_1h_0_5_deseq_tccounts.csv,4_degrons_v1_RUNX2_1h_0_5_deseq_tccounts.csv,4_degrons_v1_RUNX2_1h_0_5_deseq_tccounts.csv
2,scale,scale,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,...,,,,,,,,,,
3,condition,condition,1h,1h,1h,1h,1h,1h,1h,1h,...,1h,1h,1h,1h,1h,1h,1h,1h,1h,1h
4,type,type,readcounts,readcounts,readcounts,readcounts,readcounts,readcounts,tccounts,tccounts,...,readcounts,readcounts,readcounts,readcounts,tccounts,tccounts,tccounts,tccounts,tccounts,tccounts


### get dfs and merge

In [102]:
# Get list of deseq dfs
deseq_dfs = [get_deseq_data(file) for file in files]

In [103]:
# main df
df_merged = pd.DataFrame()
df_merged = df_merged.append(deseq_dfs[0][["genes", "gene_id"]]) # add gene cols to main df

In [104]:
# join df columns on gene & gene_id columns
df_merged = reduce(lambda left, right: pd.merge(left, right, on=['genes', "gene_id"], how="outer"), 
                   [df_merged] + deseq_dfs)
df_merged.shape

(16744, 26)

In [105]:
# change column names
df_merged.columns = [col_name.split("_")[0] if "gene" not in col_name else 
                     col_name for col_name in df_merged.columns.tolist()]
df_merged = df_merged.T.reset_index().T.reset_index(drop=True)
df_merged.shape

(16745, 26)

### add info columns

In [107]:
print(df_info.shape)
print(df_merged.shape)

(5, 26)
(16745, 26)


In [108]:
# reset column index
df_info = df_info.T.reset_index(drop=True).T

In [109]:
%%time

# combine info and data
df_save = df_info.append(df_merged, ignore_index=True)
df_save.shape

CPU times: user 8 ms, sys: 0 ns, total: 8 ms
Wall time: 7.75 ms


(16750, 26)

### save merged file

In [110]:
# save file
res_dir = "../results"
deseq_res_file = "slamseq_{}_{}_0_5_deseq_output.csv".format(project, gene)

df_save.to_csv("{}/{}/{}".format(res_dir, project_dir, deseq_res_file), header=False, index=False)

print("Saved DESeq results to: {}/{}".format(project_dir, deseq_res_file))

Saved DESeq results to: slamseq_4_degrons_v1/slamseq_4_degrons_v1_RUNX2_0_5_deseq_output.csv


# 4_degrons_v1 (SPI1)

In [111]:
project = "4_degrons_v1"
project_dir = "slamseq_{}".format(project)
gene = "SPI1"

In [112]:
files = ! ls $res_dir/$project_dir/*.csv
files = [file for file in files if "deseq" in file and gene in file]
# ERCC files first
files = [file for file in files if "ERCCsamplewise" in file] + \
[file for file in files if "ERCCsamplewise" not in file]
print(len(files))

5


In [114]:
# project 3
conditions = ["1h"]
scaling = ["ERCCsamplewise", None]
count_type = ["readcounts", "tccounts"]

### make info rows

In [115]:
# df with file info
df_info = pd.DataFrame(list(zip(["project", "file", "scale", "condition", "type"],
                                ["project", "file", "scale", "condition", "type"])))

In [116]:
df_info = pd.concat([df_info] + [pd.DataFrame([x*6 for x in get_file_info(project, file, conditions)]) \
                                 for file in files], axis=1)
df_info.shape

(5, 26)

### get dfs and merge

In [117]:
# Get list of deseq dfs
deseq_dfs = [get_deseq_data(file) for file in files]

In [118]:
# main df
df_merged = pd.DataFrame()
df_merged = df_merged.append(deseq_dfs[0][["genes", "gene_id"]]) # add gene cols to main df

In [119]:
# join df columns on gene & gene_id columns
df_merged = reduce(lambda left, right: pd.merge(left, right, on=['genes', "gene_id"], how="outer"), 
                   [df_merged] + deseq_dfs)
df_merged.shape

(16744, 26)

In [120]:
# change column names
df_merged.columns = [col_name.split("_")[0] if "gene" not in col_name else 
                     col_name for col_name in df_merged.columns.tolist()]
df_merged = df_merged.T.reset_index().T.reset_index(drop=True)
df_merged.shape

(16745, 26)

### add info columns

In [121]:
print(df_info.shape)
print(df_merged.shape)

(5, 26)
(16745, 26)


In [122]:
# reset column index
df_info = df_info.T.reset_index(drop=True).T

In [123]:
%%time

# combine info and data
df_save = df_info.append(df_merged, ignore_index=True)
df_save.shape

CPU times: user 4 ms, sys: 4 ms, total: 8 ms
Wall time: 10 ms


(16750, 26)

### save merged file

In [124]:
# save file
res_dir = "../results"
deseq_res_file = "slamseq_{}_{}_0_5_deseq_output.csv".format(project, gene)

df_save.to_csv("{}/{}/{}".format(res_dir, project_dir, deseq_res_file), header=False, index=False)

print("Saved DESeq results to: {}/{}".format(project_dir, deseq_res_file))

Saved DESeq results to: slamseq_4_degrons_v1/slamseq_4_degrons_v1_SPI1_0_5_deseq_output.csv


# 4_degrons_v1 (GFI1)

In [125]:
project = "4_degrons_v1"
project_dir = "slamseq_{}".format(project)
gene = "GFI1"

In [126]:
files = ! ls $res_dir/$project_dir/*.csv
files = [file for file in files if "deseq" in file and gene in file]
# ERCC files first
files = [file for file in files if "ERCCsamplewise" in file] + \
[file for file in files if "ERCCsamplewise" not in file]
print(len(files))

5


In [127]:
files = files[:-1]

In [128]:
# project 3
conditions = ["2h"]
scaling = ["ERCCsamplewise", None]
count_type = ["readcounts", "tccounts"]

### make info rows

In [129]:
# df with file info
df_info = pd.DataFrame(list(zip(["project", "file", "scale", "condition", "type"],
                                ["project", "file", "scale", "condition", "type"])))

In [130]:
df_info = pd.concat([df_info] + [pd.DataFrame([x*6 for x in get_file_info(project, file, conditions)]) \
                                 for file in files], axis=1)
df_info.shape

(5, 26)

### get dfs and merge

In [131]:
# Get list of deseq dfs
deseq_dfs = [get_deseq_data(file) for file in files]

In [132]:
# main df
df_merged = pd.DataFrame()
df_merged = df_merged.append(deseq_dfs[0][["genes", "gene_id"]]) # add gene cols to main df

In [133]:
# join df columns on gene & gene_id columns
df_merged = reduce(lambda left, right: pd.merge(left, right, on=['genes', "gene_id"], how="outer"), 
                   [df_merged] + deseq_dfs)
df_merged.shape

(16744, 26)

In [134]:
# change column names
df_merged.columns = [col_name.split("_")[0] if "gene" not in col_name else 
                     col_name for col_name in df_merged.columns.tolist()]
df_merged = df_merged.T.reset_index().T.reset_index(drop=True)
df_merged.shape

(16745, 26)

### add info columns

In [135]:
print(df_info.shape)
print(df_merged.shape)

(5, 26)
(16745, 26)


In [136]:
# reset column index
df_info = df_info.T.reset_index(drop=True).T

In [137]:
%%time

# combine info and data
df_save = df_info.append(df_merged, ignore_index=True)
df_save.shape

CPU times: user 8 ms, sys: 0 ns, total: 8 ms
Wall time: 7.52 ms


(16750, 26)

### save merged file

In [138]:
# save file
res_dir = "../results"
deseq_res_file = "slamseq_{}_{}_0_5_deseq_output.csv".format(project, gene)

df_save.to_csv("{}/{}/{}".format(res_dir, project_dir, deseq_res_file), header=False, index=False)

print("Saved DESeq results to: {}/{}".format(project_dir, deseq_res_file))

Saved DESeq results to: slamseq_4_degrons_v1/slamseq_4_degrons_v1_GFI1_0_5_deseq_output.csv


# Merge all no alt deseq outputs

In [192]:
project_dirs = ["slamseq_IRF2BP2_degraded_v1_res_v4",
                "slamseq_MYB_degraded_v1_res_v7",
                "slamseq_4_degrons_v1"]

deseq_res_dirs = ["{}/{}".format(res_dir, project_dir) for project_dir in project_dirs]

In [193]:
deseq_files = [glob.glob(deseq_res_dir+"/*output.csv") for deseq_res_dir in deseq_res_dirs]
deseq_files = [item for sublist in deseq_files for item in sublist]
deseq_files

['../results/slamseq_IRF2BP2_degraded_v1_res_v4/slamseq_IRF2BP2_degraded_v1_res_v4_0_5_deseq_output.csv',
 '../results/slamseq_MYB_degraded_v1_res_v7/slamseq_MYB_degraded_v1_res_v7_0_5_deseq_output.csv',
 '../results/slamseq_4_degrons_v1/slamseq_4_degrons_v1_SPI1_0_5_deseq_output.csv',
 '../results/slamseq_4_degrons_v1/slamseq_4_degrons_v1_GFI1_0_5_deseq_output.csv',
 '../results/slamseq_4_degrons_v1/slamseq_4_degrons_v1_RUNX1_0_5_deseq_output.csv',
 '../results/slamseq_4_degrons_v1/slamseq_4_degrons_v1_RUNX2_0_5_deseq_output.csv']

In [194]:
df_res = get_deseq_data(deseq_files[0])
df_res.shape

(15637, 50)

In [195]:
# list of deseq dfs
#deseq_dfs = [get_deseq_data(deseq_file, low_memory=False) for deseq_file in deseq_files]

# list of deseq dfs (no header)
deseq_dfs = [pd.read_csv(deseq_file, index_col=False, header=None, squeeze=True) for deseq_file in deseq_files]

In [197]:
for df in deseq_dfs:
    print(df.shape)

(15638, 50)
(17217, 26)
(16750, 26)
(16750, 26)
(16750, 26)
(16750, 26)


In [198]:
# make empty df
df_save_data = pd.DataFrame()
df_save_info = pd.DataFrame()

## Merge info data

In [199]:
def merge_info_cols(df1, df2):
    ''' merged info columns from 2 info dfs '''
    df_merged = pd.merge(df1, df2, left_on=[0,1], right_on=[0,1])
    df_merged = df_merged.T.reset_index(drop=True).T
    if (df1.shape[1]+df2.shape[1]-2) != df_merged.shape[1]:
        raise ValueError("Incorrect number of columns.\
                          df1: {}\
                          df2: {}".format(df1.shape, df2.shape))
    return(df_merged)

In [200]:
df_info_merged = deseq_dfs[0].iloc[:5]

In [201]:
df_res = deseq_dfs[1]

In [202]:
deseq_dfs[0].shape

(15638, 50)

In [203]:
df_info_merged = pd.merge(df_info_merged, df_res.iloc[:5], left_on=[0,1], right_on=[0,1])
df_info_merged.shape

(5, 74)

In [204]:
# IRF2BP2
df_info_merged = deseq_dfs[0].iloc[:5]

# merge info columns
for df_res in deseq_dfs[1:]:
    print("df1 cols: {} | df2 cols: {}".format(df_info_merged.shape[1], df_res.shape[1]))
    df_info = df_res.iloc[:5]
    df_info_merged = merge_info_cols(df_info_merged, df_info)
    print(df_info_merged.shape[1])

df1 cols: 50 | df2 cols: 26
74
df1 cols: 74 | df2 cols: 26
98
df1 cols: 98 | df2 cols: 26
122
df1 cols: 122 | df2 cols: 26
146
df1 cols: 146 | df2 cols: 26
170


## Merge gene data

In [None]:
def get_deseq_data(file, low_memory=True):
    ''' input deseq file & returns df '''
    ''' add file name to headers except gene columns '''
    df = pd.read_csv(file, header=[0], index_col=None, squeeze=True, low_memory=low_memory)
    df.columns = ["{}_{}".format(col_name, os.path.basename(file).split(".")[0]) if "gene" not in col_name
                  else col_name for col_name in df.columns.tolist()]
    return(df)

In [226]:
def rename_deseq_data(df):
    ''' renames columns uniquely to deseq output header '''
    df.columns = ["{}_{}".format(col_name, str(idx)) if "gene" not in col_name
                  else col_name for idx, col_name in enumerate(df.iloc[0].tolist())]
    df = df.iloc[1:]
    return(df)

In [233]:
# get df data without info
deseq_df_data = [df_res.iloc[5:] for df_res in deseq_dfs]

# make first row col names & unique
deseq_df_data = [rename_deseq_data(df_data) for df_data in deseq_df_data]

In [247]:
for df_data in deseq_df_data:
    print(df_data.shape)

(15632, 50)
(17211, 26)
(16744, 26)
(16744, 26)
(16744, 26)
(16744, 26)


In [249]:
# get df data without info
deseq_df_data = [df_res.iloc[5:] for df_res in deseq_dfs]
deseq_df_data[0].iloc[:5]

Unnamed: 0,project_0,project_1,IRF2BP2_degraded_v1_2,IRF2BP2_degraded_v1_3,IRF2BP2_degraded_v1_4,IRF2BP2_degraded_v1_5,IRF2BP2_degraded_v1_6,IRF2BP2_degraded_v1_7,IRF2BP2_degraded_v1_8,IRF2BP2_degraded_v1_9,...,IRF2BP2_degraded_v1_40,IRF2BP2_degraded_v1_41,IRF2BP2_degraded_v1_42,IRF2BP2_degraded_v1_43,IRF2BP2_degraded_v1_44,IRF2BP2_degraded_v1_45,IRF2BP2_degraded_v1_46,IRF2BP2_degraded_v1_47,IRF2BP2_degraded_v1_48,IRF2BP2_degraded_v1_49
5,genes,gene_id,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,baseMean,log2FoldChange,...,lfcSE,stat,pvalue,padj,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
6,NAT1,NAT1,95.87059628671004,-0.4618119842755553,0.1838993165621769,2.5112218626403395,0.0120314054862605,0.1764824098229201,5.033275352515851,-0.5602362002955328,...,0.2499835106626506,1.345021316023153,0.1786183509749246,0.9182518409151246,0.7526302401777346,2.0042049937800543,1.8278146338544723,-1.0965034181576785,0.2728585282994329,
7,AAMP,AAMP,842.3716209229535,-0.1476064480241523,0.1180792991839148,1.2500620264882107,0.2112768900493981,0.621101974228759,14.553492083441196,0.3101726260809158,...,0.1436118349988333,-1.174214545359719,0.2403091013937329,0.9594495599032664,7.433181480720183,0.2842585712308931,0.6006672968377171,-0.4732379683851699,0.6360434123857338,
8,AARS1,AARS1,1374.1764868232967,-0.0252481964182913,0.1093729310308428,0.2308450196984433,0.8174352049833612,0.957161656507632,16.200285486653765,-0.1372280711065793,...,0.1671727054204942,-0.6585513747031996,0.5101838964077127,0.9966744113218344,11.302473594467742,0.0730244615436611,0.4905015559836879,-0.1488771251646949,0.8816505915117299,
9,ABAT,ABAT,55.9224048891824,-0.3585220629160938,0.2117003248291207,1.6935357241680382,0.0903535409020921,0.4386939785389467,1.693745222699288,-0.9499849425240666,...,0.2681252539633148,0.426446279789382,0.6697826846194971,0.9966744113218344,0.6345738591917904,-1.0072747380631193,2.0802920535413554,0.4841987144778059,0.6282448491442183,


In [251]:
# main df
df_data_merged = pd.DataFrame()
df_data_merged = df_data_merged.append(deseq_df_data[0][["project_0", "project_1"]]) # add gene cols to main df

In [254]:
# join df columns on gene & gene_id columns
df_data_merged = reduce(lambda left, right: pd.merge(left, right, on=["project_0", "project_1"], how="outer"), 
                        [df_data_merged] + deseq_df_data)
df_data_merged.shape

  df_data_merged = reduce(lambda left, right: pd.merge(left, right, on=["project_0", "project_1"], how="outer"),


(18323, 170)

In [256]:
df_data_merged.head()

Unnamed: 0,project_0,project_1,IRF2BP2_degraded_v1_2,IRF2BP2_degraded_v1_3,IRF2BP2_degraded_v1_4,IRF2BP2_degraded_v1_5,IRF2BP2_degraded_v1_6,IRF2BP2_degraded_v1_7,IRF2BP2_degraded_v1_8,IRF2BP2_degraded_v1_9,...,4_degrons_v1_16_y,4_degrons_v1_17_y,4_degrons_v1_18_y,4_degrons_v1_19_y,4_degrons_v1_20_y,4_degrons_v1_21_y,4_degrons_v1_22_y,4_degrons_v1_23_y,4_degrons_v1_24_y,4_degrons_v1_25_y
0,genes,gene_id,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,baseMean,log2FoldChange,...,lfcSE,stat,pvalue,padj,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
1,NAT1,NAT1,95.87059628671004,-0.4618119842755553,0.1838993165621769,2.5112218626403395,0.0120314054862605,0.1764824098229201,5.033275352515851,-0.5602362002955328,...,0.1910164963313597,1.195738593227763,0.2317985857833831,0.5815447582945407,6.951305773266107,-0.303572495248922,0.6056459757244428,0.5012375338345212,0.6162039617032911,
2,AAMP,AAMP,842.3716209229535,-0.1476064480241523,0.1180792991839148,1.2500620264882107,0.2112768900493981,0.621101974228759,14.553492083441196,0.3101726260809158,...,0.0936551287545897,-0.7816068093858641,0.4344456838431942,0.7234391226069221,31.25961097658043,0.3155357818127773,0.2868775385203951,-1.0998971318569954,0.2713769445263918,0.5759060637707974
3,AARS1,AARS1,1374.1764868232967,-0.0252481964182913,0.1093729310308428,0.2308450196984433,0.8174352049833612,0.957161656507632,16.200285486653765,-0.1372280711065793,...,0.0853883738066852,-0.0376794020496949,0.9699432991159118,0.9854767365932272,22.7732265306714,0.3114110726026999,0.3184951940658864,-0.9777575247753314,0.3281942632603161,
4,ABAT,ABAT,55.9224048891824,-0.3585220629160938,0.2117003248291207,1.6935357241680382,0.0903535409020921,0.4386939785389467,1.693745222699288,-0.9499849425240666,...,0.1522534347786068,0.3937569945555876,0.6937604579667991,0.8679025465000523,6.911045666718635,-0.1534162228659588,0.5532445763431577,0.2773027146149558,0.7815476753088093,


In [264]:
# total unique genes
unique_gene_list = []
for df_data in deseq_df_data:
    unique_gene_list.extend(df_data.iloc[1:,0].tolist())
print("Total unique genes: {}".format(len(list(set(unique_gene_list)))))

Total unique genes: 18322


In [280]:
df_merged = df_data_merged

In [281]:
df_merged.head()

Unnamed: 0,project_0,project_1,IRF2BP2_degraded_v1_2,IRF2BP2_degraded_v1_3,IRF2BP2_degraded_v1_4,IRF2BP2_degraded_v1_5,IRF2BP2_degraded_v1_6,IRF2BP2_degraded_v1_7,IRF2BP2_degraded_v1_8,IRF2BP2_degraded_v1_9,...,4_degrons_v1_16_y,4_degrons_v1_17_y,4_degrons_v1_18_y,4_degrons_v1_19_y,4_degrons_v1_20_y,4_degrons_v1_21_y,4_degrons_v1_22_y,4_degrons_v1_23_y,4_degrons_v1_24_y,4_degrons_v1_25_y
0,genes,gene_id,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,baseMean,log2FoldChange,...,lfcSE,stat,pvalue,padj,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
1,NAT1,NAT1,95.87059628671004,-0.4618119842755553,0.1838993165621769,2.5112218626403395,0.0120314054862605,0.1764824098229201,5.033275352515851,-0.5602362002955328,...,0.1910164963313597,1.195738593227763,0.2317985857833831,0.5815447582945407,6.951305773266107,-0.303572495248922,0.6056459757244428,0.5012375338345212,0.6162039617032911,
2,AAMP,AAMP,842.3716209229535,-0.1476064480241523,0.1180792991839148,1.2500620264882107,0.2112768900493981,0.621101974228759,14.553492083441196,0.3101726260809158,...,0.0936551287545897,-0.7816068093858641,0.4344456838431942,0.7234391226069221,31.25961097658043,0.3155357818127773,0.2868775385203951,-1.0998971318569954,0.2713769445263918,0.5759060637707974
3,AARS1,AARS1,1374.1764868232967,-0.0252481964182913,0.1093729310308428,0.2308450196984433,0.8174352049833612,0.957161656507632,16.200285486653765,-0.1372280711065793,...,0.0853883738066852,-0.0376794020496949,0.9699432991159118,0.9854767365932272,22.7732265306714,0.3114110726026999,0.3184951940658864,-0.9777575247753314,0.3281942632603161,
4,ABAT,ABAT,55.9224048891824,-0.3585220629160938,0.2117003248291207,1.6935357241680382,0.0903535409020921,0.4386939785389467,1.693745222699288,-0.9499849425240666,...,0.1522534347786068,0.3937569945555876,0.6937604579667991,0.8679025465000523,6.911045666718635,-0.1534162228659588,0.5532445763431577,0.2773027146149558,0.7815476753088093,


In [283]:
df_merged = df_merged.T.reset_index(drop=True).T

In [284]:
df_merged

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,160,161,162,163,164,165,166,167,168,169
0,genes,gene_id,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,baseMean,log2FoldChange,...,lfcSE,stat,pvalue,padj,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
1,NAT1,NAT1,95.87059628671004,-0.4618119842755553,0.1838993165621769,2.5112218626403395,0.0120314054862605,0.1764824098229201,5.033275352515851,-0.5602362002955328,...,0.1910164963313597,1.195738593227763,0.2317985857833831,0.5815447582945407,6.951305773266107,-0.303572495248922,0.6056459757244428,0.5012375338345212,0.6162039617032911,
2,AAMP,AAMP,842.3716209229535,-0.1476064480241523,0.1180792991839148,1.2500620264882107,0.2112768900493981,0.621101974228759,14.553492083441196,0.3101726260809158,...,0.0936551287545897,-0.7816068093858641,0.4344456838431942,0.7234391226069221,31.25961097658043,0.3155357818127773,0.2868775385203951,-1.0998971318569954,0.2713769445263918,0.5759060637707974
3,AARS1,AARS1,1374.1764868232967,-0.0252481964182913,0.1093729310308428,0.2308450196984433,0.8174352049833612,0.957161656507632,16.200285486653765,-0.1372280711065793,...,0.0853883738066852,-0.0376794020496949,0.9699432991159118,0.9854767365932272,22.7732265306714,0.3114110726026999,0.3184951940658864,-0.9777575247753314,0.3281942632603161,
4,ABAT,ABAT,55.9224048891824,-0.3585220629160938,0.2117003248291207,1.6935357241680382,0.0903535409020921,0.4386939785389467,1.693745222699288,-0.9499849425240666,...,0.1522534347786068,0.3937569945555876,0.6937604579667991,0.8679025465000523,6.911045666718635,-0.1534162228659588,0.5532445763431577,0.2773027146149558,0.7815476753088093,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18318,AC113208.3,AC113208.3,,,,,,,,,...,0.8855085832558085,1.1782373841042888,0.2387019826835846,,0.0,-0.0,,,0.0,
18319,105377586,105377586,,,,,,,,,...,0.9973246137985076,0.3124464112381569,0.7547012834221719,,0.5719749369309547,-1.908720323682767,1.9634370487950688,0.9721321724341097,0.3309847985506584,
18320,105378663,105378663,,,,,,,,,...,0.859766767514565,-1.265830008396275,0.2055739490192727,,0.783802604256384,1.878134129592003,1.7378394499003909,-1.0807293675486844,0.2798175156218232,
18321,FRGCA,FRGCA,,,,,,,,,...,,,0.0,,0.0,-0.0,,,0.0,


__add info columns__

In [None]:
# reset column index
df_info_merged = df_info_merged.T.reset_index(drop=True).T

In [285]:
df_info_merged.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,160,161,162,163,164,165,166,167,168,169
0,project,project,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,...,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1
1,file,file,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,...,4_degrons_v1_RUNX2_1h_0_5_deseq_readcounts.csv,4_degrons_v1_RUNX2_1h_0_5_deseq_readcounts.csv,4_degrons_v1_RUNX2_1h_0_5_deseq_readcounts.csv,4_degrons_v1_RUNX2_1h_0_5_deseq_readcounts.csv,4_degrons_v1_RUNX2_1h_0_5_deseq_tccounts.csv,4_degrons_v1_RUNX2_1h_0_5_deseq_tccounts.csv,4_degrons_v1_RUNX2_1h_0_5_deseq_tccounts.csv,4_degrons_v1_RUNX2_1h_0_5_deseq_tccounts.csv,4_degrons_v1_RUNX2_1h_0_5_deseq_tccounts.csv,4_degrons_v1_RUNX2_1h_0_5_deseq_tccounts.csv
2,scale,scale,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,...,,,,,,,,,,
3,condition,condition,3h,3h,3h,3h,3h,3h,3h,3h,...,1h,1h,1h,1h,1h,1h,1h,1h,1h,1h
4,type,type,readcounts,readcounts,readcounts,readcounts,readcounts,readcounts,tccounts,tccounts,...,readcounts,readcounts,readcounts,readcounts,tccounts,tccounts,tccounts,tccounts,tccounts,tccounts


In [286]:
%%time

# combine info and data
df_save = df_info_merged.append(df_merged, ignore_index=True)
df_save.shape

CPU times: user 28 ms, sys: 0 ns, total: 28 ms
Wall time: 28.9 ms


(18328, 170)

In [287]:
df_save

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,160,161,162,163,164,165,166,167,168,169
0,project,project,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,...,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1
1,file,file,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,...,4_degrons_v1_RUNX2_1h_0_5_deseq_readcounts.csv,4_degrons_v1_RUNX2_1h_0_5_deseq_readcounts.csv,4_degrons_v1_RUNX2_1h_0_5_deseq_readcounts.csv,4_degrons_v1_RUNX2_1h_0_5_deseq_readcounts.csv,4_degrons_v1_RUNX2_1h_0_5_deseq_tccounts.csv,4_degrons_v1_RUNX2_1h_0_5_deseq_tccounts.csv,4_degrons_v1_RUNX2_1h_0_5_deseq_tccounts.csv,4_degrons_v1_RUNX2_1h_0_5_deseq_tccounts.csv,4_degrons_v1_RUNX2_1h_0_5_deseq_tccounts.csv,4_degrons_v1_RUNX2_1h_0_5_deseq_tccounts.csv
2,scale,scale,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,...,,,,,,,,,,
3,condition,condition,3h,3h,3h,3h,3h,3h,3h,3h,...,1h,1h,1h,1h,1h,1h,1h,1h,1h,1h
4,type,type,readcounts,readcounts,readcounts,readcounts,readcounts,readcounts,tccounts,tccounts,...,readcounts,readcounts,readcounts,readcounts,tccounts,tccounts,tccounts,tccounts,tccounts,tccounts
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18323,AC113208.3,AC113208.3,,,,,,,,,...,0.8855085832558085,1.1782373841042888,0.2387019826835846,,0.0,-0.0,,,0.0,
18324,105377586,105377586,,,,,,,,,...,0.9973246137985076,0.3124464112381569,0.7547012834221719,,0.5719749369309547,-1.908720323682767,1.9634370487950688,0.9721321724341097,0.3309847985506584,
18325,105378663,105378663,,,,,,,,,...,0.859766767514565,-1.265830008396275,0.2055739490192727,,0.783802604256384,1.878134129592003,1.7378394499003909,-1.0807293675486844,0.2798175156218232,
18326,FRGCA,FRGCA,,,,,,,,,...,,,0.0,,0.0,-0.0,,,0.0,


__save composite merged file__

In [288]:
# save file
res_dir = "../results"
deseq_res_file = "slamseq_merged_deseq_output_noAlt.csv"

df_save.to_csv("{}/{}".format(res_dir, deseq_res_file), header=False, index=False)

print("Saved DESeq results to: {}/{}".format(res_dir, deseq_res_file))

Saved DESeq results to: ../results/slamseq_merged_deseq_output_noAlt.csv


# Looking into data

In [258]:
#df_MYB = 
df_save.loc[:,df_save list(df_save.iloc[0])

True

In [260]:
list(df_save.iloc[0])

['project',
 'project',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degrad

In [256]:
project2

'MYB_degraded_v1'

## Testing

In [16]:
# read in deseq table
print(files[0])
df1 = pd.read_csv(files[0], header=[0], index_col=None, squeeze=True)
df1.iloc[:2]

../results/slamseq_IRF2BP2_degraded_v1/IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_deseq_readcounts.csv


Unnamed: 0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,genes,gene_id
0,98.73474,-0.35887,0.179587,1.998309,0.045683,0.557313,NAT1,NAT1
1,872.500027,-0.041895,0.116618,0.35925,0.719408,0.968449,AAMP,AAMP


In [17]:
df1.shape

(15112, 8)

In [18]:
print(len(df1.genes.unique()))
print(len(df1.genes.drop_duplicates(keep=False)))
print(df1.shape[0] - len(df1.genes.drop_duplicates(keep=False)))
print(len(df1.genes.unique()) - len(df1.genes.drop_duplicates(keep=False)))

15112
15112
0
0


In [21]:
["{}_{}".format(col_name, os.path.basename(files[0]).split(".")[0]) if "gene" not in col_name \
 else col_name for col_name in df1.columns.tolist()]

['baseMean_IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_deseq_readcounts',
 'log2FoldChange_IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_deseq_readcounts',
 'lfcSE_IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_deseq_readcounts',
 'stat_IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_deseq_readcounts',
 'pvalue_IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_deseq_readcounts',
 'padj_IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_deseq_readcounts',
 'genes',
 'gene_id']

In [22]:
# read in deseq table
print(files[1])
df2 = pd.read_csv(files[1], header=[0], index_col=None, squeeze=True)
df2.iloc[:2]

../results/slamseq_IRF2BP2_degraded_v1/IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_deseq_tccounts.csv


Unnamed: 0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,genes,gene_id
0,5.02174,-0.554785,0.739133,0.750588,0.4529,,NAT1,NAT1
1,14.549761,0.316474,0.385427,-0.821099,0.41159,0.642625,AAMP,AAMP


In [24]:
#deseq_dfs = map(files, get_deseq_data)
deseq_dfs = [get_deseq_data(file) for file in files[:3]]
deseq_dfs[0].iloc[:1]

Unnamed: 0,baseMean_IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_deseq_readcounts,log2FoldChange_IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_deseq_readcounts,lfcSE_IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_deseq_readcounts,stat_IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_deseq_readcounts,pvalue_IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_deseq_readcounts,padj_IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_deseq_readcounts,genes,gene_id
0,98.73474,-0.35887,0.179587,1.998309,0.045683,0.557313,NAT1,NAT1


In [25]:
# main df
df_merged = pd.DataFrame()
df_merged = df_merged.append(deseq_dfs[0][["genes", "gene_id"]]) # add gene cols to main df
df_merged.head()

Unnamed: 0,genes,gene_id
0,NAT1,NAT1
1,AAMP,AAMP
2,AARS1,AARS1
3,ABAT,ABAT
4,ABCA1,ABCA1


In [26]:
# join df columns on gene & gene_id columns
df_merged = reduce(lambda left, right: pd.merge(left, right, on=['genes', "gene_id"], how="outer"), 
                   [df_merged] + deseq_dfs)
df_merged.shape

(15112, 20)

In [27]:
# change column names
df_merged.columns = [col_name.split("_")[0] if "gene" not in col_name else 
                     col_name for col_name in df_merged.columns.tolist()]
df_merged = df_merged.T.reset_index().T.reset_index(drop=True)

In [28]:
df_merged.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,genes,gene_id,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
1,NAT1,NAT1,98.73474,-0.35887,0.179587,1.998309,0.045683,0.557313,5.02174,-0.554785,0.739133,0.750588,0.4529,,105.179117,-0.435531,0.261972,1.662513,0.09641,0.616449
2,AAMP,AAMP,872.500027,-0.041895,0.116618,0.35925,0.719408,0.968449,14.549761,0.316474,0.385427,-0.821099,0.41159,0.642625,971.817384,0.058489,0.136273,-0.429207,0.667772,0.989724
3,AARS1,AARS1,1426.956869,0.083048,0.107111,-0.775339,0.438139,0.906009,16.042542,-0.109203,0.395866,0.275859,0.782656,0.886222,890.019221,-0.00236,0.161838,0.014582,0.988365,0.999179
4,ABAT,ABAT,57.515186,-0.258913,0.20369,1.271113,0.203688,0.808742,1.695255,-0.946442,1.260847,0.75064,0.452869,,42.89278,-0.2227,0.265193,0.839767,0.401039,0.944698


In [19]:
df_merged.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,genes,gene_id,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
1,NAT1,NAT1,95.678631,-0.463807,0.183351,2.529615,0.011419,0.162526,5.023216,-0.554238,0.738925,0.75006,0.453218,,105.239049,-0.433213,0.261849,1.65444,0.098038,0.646734
2,AAMP,AAMP,841.211188,-0.150545,0.117138,1.285188,0.198727,0.58154,14.561153,0.317738,0.385701,-0.823793,0.410057,0.647203,972.517055,0.060776,0.136377,-0.44565,0.65585,0.995046
3,AARS1,AARS1,1373.801586,-0.025304,0.10876,0.232656,0.816029,0.954353,16.055735,-0.107774,0.395963,0.272182,0.785482,0.888308,890.639308,-0.00008,0.161873,0.000495,0.999605,0.99988
4,ABAT,ABAT,55.753788,-0.362282,0.21001,1.725072,0.084514,0.404263,1.69655,-0.944984,1.260997,0.749394,0.45362,,42.920747,-0.220415,0.265135,0.831329,0.405788,0.961633


which genes are duplicated

why are the values so different for some unaffected genes

In [42]:
test_counts = "../results/slamseq_IRF2BP2_degraded_v1_dup/IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_deseq_readcounts.csv"
df_test = pd.read_csv(test_counts, header=[0], index_col=None, squeeze=True)

In [43]:
duplicate_genes = list(set(df_test.genes.unique()) - set(df_test.genes.drop_duplicates(keep=False)))

In [45]:
print(len(duplicate_genes))
df_dup = df_test.loc[df_test['genes'].isin(duplicate_genes)]
print(df_dup.shape)

27
(54, 8)


In [49]:
df_dedup = df1.loc[df1['genes'].isin(duplicate_genes)]

In [53]:
df_dup.loc[df_dup['genes'] == "CHML"]

Unnamed: 0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,genes,gene_id
534,8319.176123,-0.093807,0.119186,0.787069,0.431242,0.77892,CHML,CHML
6018,3965.963276,0.065072,0.13196,-0.493116,0.62193,0.878732,CHML,CHML


In [54]:
df_dedup.loc[df_dedup['genes'] == "CHML"]

Unnamed: 0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,genes,gene_id
534,12746.248337,0.063397,0.114081,-0.555717,0.578405,0.941269,CHML,CHML


In [59]:
df_test.loc[df_test['genes'].isin(["MYC", "MYB", "IRF2BP2", "MEIS1", "MYB"])]

Unnamed: 0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,genes,gene_id
1790,727.339448,-0.114417,0.113961,1.004001,0.3153782,0.6937536,MEIS1,MEIS1
1902,3919.460999,-0.100624,0.127026,0.792155,0.4282701,0.7769807,MYB,MYB
1906,1757.343901,-0.127836,0.121284,1.054029,0.2918698,0.6784911,MYC,MYC
12650,15280.599449,0.90623,0.094478,-9.591931,8.645148e-22,1.317088e-18,IRF2BP2,IRF2BP2


In [60]:
df1.loc[df1['genes'].isin(["MYC", "MYB", "IRF2BP2", "MEIS1", "MYB"])]

Unnamed: 0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,genes,gene_id
1790,753.976244,-0.008948,0.106854,0.083742,0.9332619,0.9950914,MEIS1,MEIS1
1902,4062.431874,0.004507,0.120156,-0.037512,0.9700771,0.9970766,MYB,MYB
1906,1822.255867,-0.02071,0.11789,0.175672,0.8605519,0.9866435,MYC,MYC
12647,16046.862685,1.013079,0.087115,-11.629256,2.926355e-31,5.4530540000000005e-28,IRF2BP2,IRF2BP2


In [20]:
print(df_info.shape)
print(df_merged.shape)

(5, 20)
(15518, 20)


In [21]:
df_info.shape[0] + df_merged.shape[0]

15523

In [22]:
df_info = df_info.T.reset_index(drop=True).T # reset column index

In [23]:
df_info

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,project,project,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1
1,file,file,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_6h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_6h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_6h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_6h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_6h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_6h_dese...
2,scale,scale,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise
3,condition,condition,3h,3h,3h,3h,3h,3h,3h,3h,3h,3h,3h,3h,6h,6h,6h,6h,6h,6h
4,type,type,readcounts,readcounts,readcounts,readcounts,readcounts,readcounts,tccounts,tccounts,tccounts,tccounts,tccounts,tccounts,readcounts,readcounts,readcounts,readcounts,readcounts,readcounts


In [24]:
df_merged.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,genes,gene_id,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
1,NAT1,NAT1,95.678631,-0.463807,0.183351,2.529615,0.011419,0.162526,5.023216,-0.554238,0.738925,0.75006,0.453218,,105.239049,-0.433213,0.261849,1.65444,0.098038,0.646734
2,AAMP,AAMP,841.211188,-0.150545,0.117138,1.285188,0.198727,0.58154,14.561153,0.317738,0.385701,-0.823793,0.410057,0.647203,972.517055,0.060776,0.136377,-0.44565,0.65585,0.995046
3,AARS1,AARS1,1373.801586,-0.025304,0.10876,0.232656,0.816029,0.954353,16.055735,-0.107774,0.395963,0.272182,0.785482,0.888308,890.639308,-0.00008,0.161873,0.000495,0.999605,0.99988
4,ABAT,ABAT,55.753788,-0.362282,0.21001,1.725072,0.084514,0.404263,1.69655,-0.944984,1.260997,0.749394,0.45362,,42.920747,-0.220415,0.265135,0.831329,0.405788,0.961633


In [35]:
%%time

# combine info and data
df_save = df_info.append(df_merged, ignore_index=True)

CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 4.03 ms


In [36]:
df_save.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,project,project,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1
1,file,file,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_6h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_6h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_6h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_6h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_6h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_6h_dese...
2,scale,scale,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise
3,condition,condition,3h,3h,3h,3h,3h,3h,3h,3h,3h,3h,3h,3h,6h,6h,6h,6h,6h,6h
4,type,type,readcounts,readcounts,readcounts,readcounts,readcounts,readcounts,tccounts,tccounts,tccounts,tccounts,tccounts,tccounts,readcounts,readcounts,readcounts,readcounts,readcounts,readcounts


## Save grouped data

In [37]:
# save file
res_dir = "../results"
deseq_res_file = "slamseq_merged_deseq.csv"

df_save.to_csv("{}/{}".format(res_dir, deseq_res_file), header=False, index=False)

print("Saved DESeq results to: {}/{}".format(res_dir, deseq_res_file))

Saved DESeq results to: ../results/slamseq_merged_deseq.csv


## MYB_degraded_v1 (project 2)

In [167]:
# read in merged deseq file
res_dir = "../results"
deseq_res_file = "slamseq_merged_deseq.csv"

df_save = pd.read_csv("{}/{}".format(res_dir, deseq_res_file), index_col=False, header=None, squeeze=True)
print(df_save.shape)
df_save.head()

(15523, 20)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,project,project,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1
1,file,file,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_6h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_6h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_6h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_6h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_6h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_6h_dese...
2,scale,scale,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise
3,condition,condition,3h,3h,3h,3h,3h,3h,3h,3h,3h,3h,3h,3h,6h,6h,6h,6h,6h,6h
4,type,type,readcounts,readcounts,readcounts,readcounts,readcounts,readcounts,tccounts,tccounts,tccounts,tccounts,tccounts,tccounts,readcounts,readcounts,readcounts,readcounts,readcounts,readcounts


In [255]:
project = project2
project_dir = "slamseq_{}".format(project)

In [256]:
files = ! ls $res_dir/$project_dir/*.csv
files = [file for file in files if "deseq" in file if "drop" not in file]
# ERCC files first
files = [file for file in files if "ERCCsamplewise" in file] + \
[file for file in files if "ERCCsamplewise" not in file]

In [257]:
files

['../results/slamseq_MYB_degraded_v1/MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts.csv',
 '../results/slamseq_MYB_degraded_v1/MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_tccounts.csv',
 '../results/slamseq_MYB_degraded_v1/MYB_degraded_v1_0_5_1h_deseq_readcounts.csv',
 '../results/slamseq_MYB_degraded_v1/MYB_degraded_v1_0_5_1h_deseq_tccounts.csv']

In [258]:
# project 2
conditions = ["1h"]
scaling = ["ERCCsamplewise", None]
count_type = ["readcounts", "tccounts"]

Get row info

In [260]:
# df with file info
df_info = pd.DataFrame(list(zip(["project", "file", "scale", "condition", "type"],
                                ["project", "file", "scale", "condition", "type"])))

In [261]:
df_info = pd.concat([df_info] + [pd.DataFrame([x*6 for x in get_file_info(project, file, conditions)]) \
                                 for file in files], axis=1)
df_info = df_info.T.reset_index(drop=True).T # reset column index
df_info.shape

(5, 26)

In [262]:
df_info

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,project,project,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,...,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1
1,file,file,MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_re...,MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_re...,MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_re...,MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_re...,MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_re...,MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_re...,MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_tc...,MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_tc...,...,MYB_degraded_v1_0_5_1h_deseq_readcounts.csv,MYB_degraded_v1_0_5_1h_deseq_readcounts.csv,MYB_degraded_v1_0_5_1h_deseq_readcounts.csv,MYB_degraded_v1_0_5_1h_deseq_readcounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv
2,scale,scale,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,...,,,,,,,,,,
3,condition,condition,1h,1h,1h,1h,1h,1h,1h,1h,...,1h,1h,1h,1h,1h,1h,1h,1h,1h,1h
4,type,type,readcounts,readcounts,readcounts,readcounts,readcounts,readcounts,tccounts,tccounts,...,readcounts,readcounts,readcounts,readcounts,tccounts,tccounts,tccounts,tccounts,tccounts,tccounts


Get deseq data

In [263]:
def get_deseq_data(file):
    ''' input deseq file & returns df '''
    ''' add file name to headers except gene columns '''
    df = pd.read_csv(file, header=[0], index_col=None, squeeze=True)
    df.columns = ["{}_{}".format(col_name, os.path.basename(file).split(".")[0]) if "gene" not in col_name
                  else col_name for col_name in df.columns.tolist()]
    return(df)

In [266]:
deseq_dfs = [get_deseq_data(file) for file in files]
print(deseq_dfs[0].shape)
deseq_dfs[0].iloc[:2]

(16558, 8)


Unnamed: 0,baseMean_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,log2FoldChange_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,lfcSE_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,stat_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,pvalue_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,padj_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,genes,gene_id
0,1.877541,-0.045789,1.010641,0.045307,0.963863,,A1BG,A1BG
1,5.229468,-0.26904,0.500213,0.537851,0.59068,,A2MP1,A2MP1


In [267]:
for df in deseq_dfs:
    print(df.shape)

(16558, 8)
(16558, 8)
(16558, 8)
(16558, 8)


In [268]:
[os.path.basename(x) for x in files]

['MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts.csv',
 'MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_tccounts.csv',
 'MYB_degraded_v1_0_5_1h_deseq_readcounts.csv',
 'MYB_degraded_v1_0_5_1h_deseq_tccounts.csv']

In [269]:
df1 = deseq_dfs[0]
df2 = deseq_dfs[1]
df3 = deseq_dfs[2]
df4 = deseq_dfs[3]

In [270]:
genes1 = list(df1["genes"])
genes2 = list(df2["genes"])
genes3 = list(df3["genes"])
genes4 = list(df4["genes"])

In [271]:
np.logical_and(np.array_equal(df1["genes"].unique(), df2["genes"].unique()), 
               np.array_equal(df2["genes"].unique(), df3["genes"].unique()))

True

In [272]:
len(list(df1["genes"])) == len(set(df1["genes"].unique()))

True

In [275]:
merge_test = reduce(lambda x, y: pd.merge(x, y, on = ["genes", "gene_id"], how="outer"), [df1, df2])

In [277]:
print(merge_test.shape)
merge_test.head()

(16558, 14)


Unnamed: 0,baseMean_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,log2FoldChange_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,lfcSE_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,stat_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,pvalue_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,padj_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,genes,gene_id,baseMean_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_tccounts,log2FoldChange_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_tccounts,lfcSE_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_tccounts,stat_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_tccounts,pvalue_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_tccounts,padj_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_tccounts
0,1.877541,-0.045789,1.010641,0.045307,0.963863,,A1BG,A1BG,0.05833,-0.614498,3.11654,0.197173,0.843692,
1,5.229468,-0.26904,0.500213,0.537851,0.59068,,A2MP1,A2MP1,0.0,-0.0,,,0.0,
2,58.517384,0.013887,0.186521,-0.074452,0.940651,0.987822,NAT1,NAT1,4.852925,0.551781,0.560071,-0.985199,0.324526,0.837918
3,2.316485,-0.735553,0.776528,0.947233,0.34352,,AADAC,AADAC,0.0,-0.0,,,0.0,
4,340.671808,-0.130751,0.092915,1.407202,0.159367,0.743707,AAMP,AAMP,9.364142,0.216094,0.34192,-0.632001,0.527386,0.915144


In [196]:
reduce(lambda x, y: pd.merge(x, y, on = "genes"), [df1.iloc[:10000], df2.iloc[:10000]])["genes"].tolist()

['A1BG',
 'A2MP1',
 'NAT1',
 'AADAC',
 'AAMP',
 'AARS1',
 'ABAT',
 'ABCA1',
 'ABCA2',
 'ABCB7',
 'ABL1',
 'AOC1',
 'ABL2',
 'ABO',
 'ABR',
 'ACAA1',
 'ACACB',
 'ACADM',
 'ACADS',
 'ACADSB',
 'ACADVL',
 'ACAT1',
 'ACAT2',
 'ASIC2',
 'ASIC1',
 'ACLY',
 'ACO1',
 'ACO2',
 'ACOX1',
 'ACP1',
 'ACP2',
 'ACP3',
 'ACRV1',
 'ACTB',
 'ACTG1',
 'ACTN4',
 'ACTL6A',
 'ACTN1',
 'ACVR1',
 'ACVR1B',
 'ACVR2A',
 'ACVR2B',
 'ACVRL1',
 'ACY1',
 'ACYP1',
 'ACYP2',
 'ADA',
 'ADAM8',
 'ADAM10',
 'ADAR',
 'ADARB1',
 'ADCY1',
 'ADCY2',
 'ADCY3',
 'ADCY5',
 'ADCY6',
 'ADCY7',
 'ADCY9',
 'ADCYAP1R1',
 'ADD1',
 'ADD2',
 'ADD3',
 'PLIN2',
 'ADH5',
 'ADH6',
 'ADK',
 'ADORA1',
 'ADORA2A',
 'ADORA2B',
 'ADORA3',
 'ADPRH',
 'PARP1',
 'PARP4',
 'ADRA1A',
 'ADRA2B',
 'ADRA2C',
 'ADRB1',
 'ADRB2',
 'GRK2',
 'GRK3',
 'ADSL',
 'ADSS2',
 'AP2A1',
 'AP2A2',
 'AP1B1',
 'AP2B1',
 'AP1G1',
 'TLE5',
 'CRISP1',
 'AFG3L1P',
 'AGA',
 'AGL',
 'JAG1',
 'AGT',
 'AHCY',
 'AHR',
 'CRYBG1',
 'AK1',
 'AK2',
 'AK4',
 'AKT1',
 'AKT2',
 'ALA

In [197]:
merge_test = reduce(lambda x, y: pd.merge(x, y, on = ["genes", "gene_id"], how="outer"), [df1, df2])

In [198]:
test_genes = merge_test["genes"].tolist()

In [199]:
len(test_genes)

16648

In [200]:
len(genes1)

16588

In [201]:
print(len(list([x for x in genes1 if genes1.count(x) > 1])))
print(len(set([x for x in genes1 if genes1.count(x) > 1])))

60
30


In [152]:
len(set(test_genes))

16558

In [153]:
set([x for x in test_genes if test_genes.count(x) > 1])

{'ACBD6',
 'ASB3',
 'CCDC26',
 'CHML',
 'FEZ1',
 'FNTB',
 'GNMT',
 'GPRASP2',
 'KLHL23',
 'KLRK1',
 'LINC00511',
 'LINC00598',
 'LINC01725',
 'LIPE-AS1',
 'MCL1',
 'MEF2B',
 'MICAL2',
 'MTPN',
 'NBL1',
 'PAK6',
 'PIGY',
 'S1PR3',
 'SCARNA9',
 'SCNM1',
 'SMIM8',
 'SPSB2',
 'STAG3L4',
 'TBC1D7',
 'TTN-AS1',
 'USP9Y'}

In [154]:
merge_test.loc[merge_test["genes"].isin(set([x for x in test_genes if test_genes.count(x) > 1]))]

Unnamed: 0,baseMean_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,log2FoldChange_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,lfcSE_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,stat_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,pvalue_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,padj_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,genes,gene_id,baseMean_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_tccounts,log2FoldChange_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_tccounts,lfcSE_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_tccounts,stat_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_tccounts,pvalue_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_tccounts,padj_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_tccounts
587,1949.022676,0.177454,0.137146,-1.293907,0.195698,0.756115,CHML,CHML,122.062052,0.472268,0.179008,-2.638255,8.333385e-03,1.432467e-01
588,1949.022676,0.177454,0.137146,-1.293907,0.195698,0.756115,CHML,CHML,57.521996,1.500506,0.181365,-8.273385,1.302081e-16,3.976554e-14
589,1221.280092,0.152483,0.090387,-1.687005,0.091602,0.700413,CHML,CHML,122.062052,0.472268,0.179008,-2.638255,8.333385e-03,1.432467e-01
590,1221.280092,0.152483,0.090387,-1.687005,0.091602,0.700413,CHML,CHML,57.521996,1.500506,0.181365,-8.273385,1.302081e-16,3.976554e-14
959,209.879610,-0.413183,0.124792,3.310983,0.000930,0.058866,S1PR3,S1PR3,35.707272,-0.391967,0.194360,2.016711,4.372564e-02,3.978295e-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15759,35.798328,0.089758,0.216731,-0.414143,0.678769,0.918913,LIPE-AS1,LIPE-AS1,1.000995,-0.538208,1.091397,0.493137,6.219158e-01,
15916,1.609619,-0.520566,1.020085,0.510316,0.609830,,LINC01725,LINC01725,0.000000,-0.000000,,,0.000000e+00,
15917,1.609619,-0.520566,1.020085,0.510316,0.609830,,LINC01725,LINC01725,0.000000,-0.000000,,,0.000000e+00,
15918,8.320226,0.301544,0.436010,-0.691598,0.489190,,LINC01725,LINC01725,0.000000,-0.000000,,,0.000000e+00,


In [114]:
reduce(lambda x, y: pd.merge(x, y, on = ["genes", "gene_id"], how="outer"), deseq_dfs)

Unnamed: 0,baseMean_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,log2FoldChange_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,lfcSE_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,stat_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,pvalue_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,padj_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,genes,gene_id,baseMean_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_tccounts,log2FoldChange_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_tccounts,...,lfcSE_MYB_degraded_v1_0_5_1h_deseq_readcounts,stat_MYB_degraded_v1_0_5_1h_deseq_readcounts,pvalue_MYB_degraded_v1_0_5_1h_deseq_readcounts,padj_MYB_degraded_v1_0_5_1h_deseq_readcounts,baseMean_MYB_degraded_v1_0_5_1h_deseq_tccounts,log2FoldChange_MYB_degraded_v1_0_5_1h_deseq_tccounts,lfcSE_MYB_degraded_v1_0_5_1h_deseq_tccounts,stat_MYB_degraded_v1_0_5_1h_deseq_tccounts,pvalue_MYB_degraded_v1_0_5_1h_deseq_tccounts,padj_MYB_degraded_v1_0_5_1h_deseq_tccounts
0,1.877539,-0.045684,1.010547,0.045207,0.963942,,A1BG,A1BG,0.058385,-0.615170,...,0.980114,-0.116922,0.906922,,0.058385,-0.500626,3.116540,0.160635,0.872381,
1,5.229387,-0.269041,0.500087,0.537989,0.590585,,A2MP1,A2MP1,0.000000,-0.000000,...,0.477972,0.372795,0.709301,,0.000000,-0.000000,,,0.000000,
2,58.516396,0.013917,0.186478,-0.074632,0.940507,0.987830,NAT1,NAT1,4.852990,0.551270,...,0.156367,-0.758833,0.447953,0.999815,5.267543,0.666107,0.567477,-1.173804,0.240473,0.774211
3,2.316444,-0.735572,0.776358,0.947465,0.343402,,AADAC,AADAC,0.000000,-0.000000,...,0.748054,0.906933,0.364442,,0.000000,-0.000000,,,0.000000,
4,340.667352,-0.130732,0.092874,1.407626,0.159242,0.743147,AAMP,AAMP,9.364263,0.215343,...,0.074923,0.335587,0.737182,0.999815,9.852376,0.296100,0.338771,-0.874041,0.382096,0.864729
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17003,82.429753,0.008600,0.168839,-0.050936,0.959376,0.990346,106736475,106736475,6.090411,0.107138,...,0.142142,-0.825469,0.409105,0.999815,6.362049,0.189380,0.438964,-0.431426,0.666159,0.953370
17004,13.209258,-0.102026,0.308889,0.330301,0.741173,0.934870,GET1-SH3BGR,GET1-SH3BGR,0.144147,-1.102433,...,0.294363,-0.007729,0.993833,,0.144147,-0.987976,2.583608,0.382402,0.702163,
17005,3.067087,-0.096211,0.639843,0.150367,0.880475,,BOLA2-SMG1P6,BOLA2-SMG1P6,0.140468,-0.134282,...,0.619041,-0.022964,0.981679,,0.146822,-0.019737,2.568065,0.007686,0.993868,
17006,153.879523,0.023166,0.124540,-0.186016,0.852432,0.968322,107303344,107303344,3.557956,0.031424,...,0.109675,-1.192979,0.232878,0.999815,3.741420,0.118501,0.541308,-0.218916,0.826715,0.974970


In [90]:
# main df
df_merged = pd.DataFrame()
df_merged = df_merged.append(deseq_dfs[0][["genes", "gene_id"]]) # add gene cols to main df
print(df_merged.shape)
df_merged.head()

(16588, 2)


Unnamed: 0,genes,gene_id
0,A1BG,A1BG
1,A2MP1,A2MP1
2,NAT1,NAT1
3,AADAC,AADAC
4,AAMP,AAMP


In [91]:
# join df columns on gene & gene_id columns
df_merged = reduce(lambda left, right: pd.merge(left, right, on=['genes', "gene_id"], how="outer"), 
                   [df_merged] + deseq_dfs)
print(df_merged.shape)

(17488, 26)


In [79]:
# change column names
df_merged.columns = [col_name.split("_")[0] if "gene" not in col_name else 
                     col_name for col_name in df_merged.columns.tolist()]
df_merged = df_merged.T.reset_index().T.reset_index(drop=True)

In [80]:
df_merged.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,genes,gene_id,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,baseMean,log2FoldChange,...,lfcSE,stat,pvalue,padj,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
1,A1BG,A1BG,1.877539,-0.045684,1.010547,0.045207,0.963942,,0.058385,-0.61517,...,0.980114,-0.116922,0.906922,,0.058385,-0.500626,3.11654,0.160635,0.872381,
2,A2MP1,A2MP1,5.229387,-0.269041,0.500087,0.537989,0.590585,,0.0,-0.0,...,0.477972,0.372795,0.709301,,0.0,-0.0,,,0.0,
3,NAT1,NAT1,58.516396,0.013917,0.186478,-0.074632,0.940507,0.98783,4.85299,0.55127,...,0.156367,-0.758833,0.447953,0.999815,5.267543,0.666107,0.567477,-1.173804,0.240473,0.774211
4,AADAC,AADAC,2.316444,-0.735572,0.776358,0.947465,0.343402,,0.0,-0.0,...,0.748054,0.906933,0.364442,,0.0,-0.0,,,0.0,


In [81]:
print(df_save.shape)
print(df_merged.shape)

(15523, 20)
(17489, 26)


# MYB_degraded_v7 (res_v7)

In [40]:
project = project2
res_version = "res_v7"
project_dir = "slamseq_{}_{}".format(project, res_version)

In [51]:
files = ! ls $res_dir/$project_dir/*.csv
files = [file for file in files if "deseq" in file if "drop" not in file]
# ERCC files first
files = [file for file in files if "ERCCsamplewise" in file] + \
[file for file in files if "ERCCsamplewise" not in file]
print(len(files))
files

5


['../results/slamseq_MYB_degraded_v1_res_v7/MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts.csv',
 '../results/slamseq_MYB_degraded_v1_res_v7/MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_tccounts.csv',
 '../results/slamseq_MYB_degraded_v1_res_v7/MYB_degraded_v1_0_5_1h_deseq_readcounts.csv',
 '../results/slamseq_MYB_degraded_v1_res_v7/MYB_degraded_v1_0_5_1h_deseq_tccounts.csv',
 '../results/slamseq_MYB_degraded_v1_res_v7/slamseq_MYB_degraded_v1_res_v7_0_5_deseq_output.csv']

In [52]:
files = files[:-1]

In [53]:
# project 2
conditions = ["1h"]
scaling = ["ERCCsamplewise", None]
count_type = ["readcounts", "tccounts"]

## make info rows

In [54]:
# df with file info
df_info = pd.DataFrame(list(zip(["project", "file", "scale", "condition", "type"],
                                ["project", "file", "scale", "condition", "type"])))

In [55]:
df_info = pd.concat([df_info] + [pd.DataFrame([x*6 for x in get_file_info(project, file, conditions)]) \
                                 for file in files], axis=1)
df_info.shape

(5, 26)

## merge dfs

In [56]:
# Get list of deseq dfs
deseq_dfs = [get_deseq_data(file) for file in files]
deseq_dfs[0].iloc[:1]

Unnamed: 0,baseMean_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,log2FoldChange_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,lfcSE_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,stat_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,pvalue_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,padj_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,genes,gene_id
0,1.881134,-0.045298,1.016923,0.044544,0.964471,,A1BG,A1BG


In [57]:
# main df
df_merged = pd.DataFrame()
df_merged = df_merged.append(deseq_dfs[0][["genes", "gene_id"]]) # add gene cols to main df

In [58]:
# join df columns on gene & gene_id columns
df_merged = reduce(lambda left, right: pd.merge(left, right, on=['genes', "gene_id"], how="outer"), 
                   [df_merged] + deseq_dfs)
df_merged.shape

(17211, 26)

In [59]:
# change column names
df_merged.columns = [col_name.split("_")[0] if "gene" not in col_name else 
                     col_name for col_name in df_merged.columns.tolist()]
df_merged = df_merged.T.reset_index().T.reset_index(drop=True)

## add info columns

In [60]:
# reset column index
df_info = df_info.T.reset_index(drop=True).T

In [61]:
%%time

# combine info and data
df_save = df_info.append(df_merged, ignore_index=True)
df_save.shape

CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 3.44 ms


(17217, 26)

## save merged file

In [62]:
# save file
res_dir = "../results"
deseq_res_file = "slamseq_merged_deseq_noAlt.csv"

df_save.to_csv("{}/{}".format(res_dir, deseq_res_file), header=False, index=False)

print("Saved DESeq results to: {}/{}".format(res_dir, deseq_res_file))

Saved DESeq results to: ../results/slamseq_merged_deseq_noAlt.csv


# inhibitors (no alt)

## merging DESeq outputs

In [11]:
project = "inhibitors_v1"
res_v = "res"
project_dir = "slamseq_{}".format(project)

In [12]:
files = ! ls $res_dir/$project_dir/*deseq*.csv
# nested list per time point
files = natsorted([files[i:i + 4] for i in range(0, len(files), 4)])
# ERCC (read, tc) | nonERCC (read, tc)
files = [lst[-2:]+lst[:-2] for lst in files]
files = [i for sublist in files for i in sublist]

In [28]:
# project 1
conditions = ["Naphthol", "Plumbagin", "Celastrol", "TM1-001", "Tretinoin"]#[i.split("_")[-6] for i in files[::4]]
print(conditions)
scaling = ["ERCCsamplewise", None]
count_type = ["readcounts", "tccounts"]

['Naphthol', 'Plumbagin', 'Celastrol', 'TM1-001', 'Tretinoin']


In [33]:
# change files order to conditions
files = [x for cond in conditions for x in files if cond in x]

In [34]:
[os.path.basename(file) for file in files]

['inhibitors_v1_DMSO_Naphthol_ERCCsamplewise_0_5_deseq_readcounts.csv',
 'inhibitors_v1_DMSO_Naphthol_ERCCsamplewise_0_5_deseq_tccounts.csv',
 'inhibitors_v1_DMSO_Naphthol_0_5_deseq_readcounts.csv',
 'inhibitors_v1_DMSO_Naphthol_0_5_deseq_tccounts.csv',
 'inhibitors_v1_DMSO_Plumbagin_ERCCsamplewise_0_5_deseq_readcounts.csv',
 'inhibitors_v1_DMSO_Plumbagin_ERCCsamplewise_0_5_deseq_tccounts.csv',
 'inhibitors_v1_DMSO_Plumbagin_0_5_deseq_readcounts.csv',
 'inhibitors_v1_DMSO_Plumbagin_0_5_deseq_tccounts.csv',
 'inhibitors_v1_DMSO_Celastrol_ERCCsamplewise_0_5_deseq_readcounts.csv',
 'inhibitors_v1_DMSO_Celastrol_ERCCsamplewise_0_5_deseq_tccounts.csv',
 'inhibitors_v1_DMSO_Celastrol_0_5_deseq_readcounts.csv',
 'inhibitors_v1_DMSO_Celastrol_0_5_deseq_tccounts.csv',
 'inhibitors_v1_DMSO_TM1-001_ERCCsamplewise_0_5_deseq_readcounts.csv',
 'inhibitors_v1_DMSO_TM1-001_ERCCsamplewise_0_5_deseq_tccounts.csv',
 'inhibitors_v1_DMSO_TM1-001_0_5_deseq_readcounts.csv',
 'inhibitors_v1_DMSO_TM1-001_0_5_d

### make info rows

In [35]:
# df with file info
df_info = pd.DataFrame(list(zip(["project", "file", "scale", "condition", "type"],
                                ["project", "file", "scale", "condition", "type"])))

In [36]:
df_info = pd.concat([df_info] + 
                    [pd.DataFrame([x*6 for x in get_file_info(project, file, conditions)]) for file in files], 
                    axis=1)
df_info.shape

(5, 122)

### get dfs and merge

In [37]:
# Get list of deseq dfs
deseq_dfs = [get_deseq_data(file) for file in files]

In [38]:
# main df
df_merged = pd.DataFrame()
df_merged = df_merged.append(deseq_dfs[0][["genes", "gene_id"]]) # add gene cols to main df

In [39]:
# join df columns on gene & gene_id columns
df_merged = reduce(lambda left, right: pd.merge(left, right, on=['genes', "gene_id"], how="outer"), 
                   [df_merged] + deseq_dfs)
df_merged.shape

(16113, 122)

In [40]:
df_merged.iloc[:5]

Unnamed: 0,genes,gene_id,baseMean_inhibitors_v1_DMSO_Naphthol_ERCCsamplewise_0_5_deseq_readcounts,log2FoldChange_inhibitors_v1_DMSO_Naphthol_ERCCsamplewise_0_5_deseq_readcounts,lfcSE_inhibitors_v1_DMSO_Naphthol_ERCCsamplewise_0_5_deseq_readcounts,stat_inhibitors_v1_DMSO_Naphthol_ERCCsamplewise_0_5_deseq_readcounts,pvalue_inhibitors_v1_DMSO_Naphthol_ERCCsamplewise_0_5_deseq_readcounts,padj_inhibitors_v1_DMSO_Naphthol_ERCCsamplewise_0_5_deseq_readcounts,baseMean_inhibitors_v1_DMSO_Naphthol_ERCCsamplewise_0_5_deseq_tccounts,log2FoldChange_inhibitors_v1_DMSO_Naphthol_ERCCsamplewise_0_5_deseq_tccounts,...,lfcSE_inhibitors_v1_DMSO_Tretinoin_0_5_deseq_readcounts,stat_inhibitors_v1_DMSO_Tretinoin_0_5_deseq_readcounts,pvalue_inhibitors_v1_DMSO_Tretinoin_0_5_deseq_readcounts,padj_inhibitors_v1_DMSO_Tretinoin_0_5_deseq_readcounts,baseMean_inhibitors_v1_DMSO_Tretinoin_0_5_deseq_tccounts,log2FoldChange_inhibitors_v1_DMSO_Tretinoin_0_5_deseq_tccounts,lfcSE_inhibitors_v1_DMSO_Tretinoin_0_5_deseq_tccounts,stat_inhibitors_v1_DMSO_Tretinoin_0_5_deseq_tccounts,pvalue_inhibitors_v1_DMSO_Tretinoin_0_5_deseq_tccounts,padj_inhibitors_v1_DMSO_Tretinoin_0_5_deseq_tccounts
0,A2M,A2M,0.162862,-0.70404,2.811327,0.25043,0.802255,,0.0,-0.0,...,1.63774,-0.303366,0.761611,,0.0,-0.0,,,0.0,
1,NAT1,NAT1,28.62739,-0.246713,0.255792,0.964506,0.334792,,3.47396,-0.02185,...,0.242288,1.859788,0.062915,0.585229,6.524734,-0.295706,0.570508,0.518321,0.604235,0.848824
2,AAMP,AAMP,93.218501,-0.158698,0.151452,1.047845,0.29471,0.874683,2.560923,-0.558682,...,0.146354,0.520657,0.602606,0.972426,6.435022,-0.088202,0.607806,0.145115,0.88462,0.967578
3,AARS1,AARS1,117.885607,-0.036773,0.154016,0.238759,0.811293,0.972527,4.964917,0.525032,...,0.147296,-0.418068,0.675897,0.977016,12.644458,0.905353,0.481334,-1.880924,0.059982,0.286805
4,ABAT,ABAT,15.663349,0.397794,0.356652,-1.115358,0.264697,,0.571845,2.130662,...,0.274097,-2.00804,0.044639,0.505688,1.244083,2.115603,1.286812,-1.644065,0.100163,


In [41]:
# change column names
df_merged.columns = [col_name.split("_")[0] if "gene" not in col_name else 
                     col_name for col_name in df_merged.columns.tolist()]
df_merged = df_merged.T.reset_index().T.reset_index(drop=True)
df_merged.shape

(16114, 122)

### add info columns

In [42]:
print(df_info.shape)
print(df_merged.shape)

(5, 122)
(16114, 122)


In [43]:
# reset column index
df_info = df_info.T.reset_index(drop=True).T

In [44]:
%%time

# combine info and data
df_save = df_info.append(df_merged, ignore_index=True)
df_save.shape

CPU times: user 4 ms, sys: 20 ms, total: 24 ms
Wall time: 23.3 ms


(16119, 122)

### save merged file

In [45]:
# save file
res_dir = "../results"
deseq_res_file = "slamseq_{}_0_5_deseq_{}output.csv".format(project, "" if alt_loci == True else "noAlt_")

df_save.to_csv("{}/{}/{}".format(res_dir, project_dir, deseq_res_file), header=False, index=False)

print("Saved DESeq results to: {}/{}".format(project_dir, deseq_res_file))

Saved DESeq results to: slamseq_inhibitors_v1/slamseq_inhibitors_v1_0_5_deseq_noAlt_output.csv


## merge with MYB outputs

In [74]:
def mergeDESeqOutputs(deseq_files):
    """
    combines merged DESeq outputs from each project
    
    Args:
      deseq_files: list of deseq files
    """
    # list of deseq dfs w/o header
    dfs = [pd.read_csv(file, index_col=False, header=None, squeeze=True, low_memory=False) for file in deseq_files]
    
    # merge info data
    df_info_merged = dfs[0].iloc[:5]
    print("df1 cols: {}".format(df_info_merged.shape[1]))
    # merge info columns
    for i, df in enumerate(dfs[1:]):
        print("df{} cols: {}".format(str(i+2), df.shape[1]))
        df_info_merged = merge_info_cols(df_info_merged, df.iloc[:5])
        print("Merged rows: {}\nMerged columns: {}".format(df_info_merged.shape[0], df_info_merged.shape[1]))
    
    # merge data
    # make first row unique col names and remove info rows
    df_data = [rename_deseq_data(df) for df in dfs]
    df_data = [df.iloc[4:] for df in df_data]
    
    df_merged = pd.DataFrame()
    # add gene cols to main df
    df_merged = df_merged.append(df_data[0][["project_0", "project_1"]])
    # join df columns on gene & gene_id columns
    df_merged = reduce(lambda left, right: pd.merge(left, right, on=["project_0", "project_1"], how="outer"), 
                       [df_merged] + df_data)
    print("Total unique genes: {}".format(str(df_merged.shape[0]-1)))
    
    # reset column index
    df_merged = df_merged.T.reset_index(drop=True).T
    df_info_merged = df_info_merged.T.reset_index(drop=True).T
    
    # combine info and data
    df_merged = df_info_merged.append(df_merged, ignore_index=True)
    
    return(df_merged)

In [54]:
def merge_info_cols(df1, df2):
    ''' merged info columns from 2 info dfs '''
    df_merged = pd.merge(df1, df2, left_on=[0,1], right_on=[0,1])
    df_merged = df_merged.T.reset_index(drop=True).T
    if (df1.shape[1]+df2.shape[1]-2) != df_merged.shape[1]:
        raise ValueError("Incorrect number of columns.\
                          df1: {}\
                          df2: {}".format(df1.shape, df2.shape))
    return(df_merged)

In [55]:
def get_deseq_data(file, low_memory=True):
    ''' input deseq file & returns df '''
    ''' add file name to headers except gene columns '''
    df = pd.read_csv(file, header=[0], index_col=None, squeeze=True, low_memory=low_memory)
    df.columns = ["{}_{}".format(col_name, os.path.basename(file).split(".")[0]) if "gene" not in col_name
                  else col_name for col_name in df.columns.tolist()]
    return(df)

In [56]:
def rename_deseq_data(df):
    ''' renames columns uniquely to deseq output header '''
    df.columns = ["{}_{}".format(col_name, str(idx)) if "gene" not in col_name
                  else col_name for idx, col_name in enumerate(df.iloc[0].tolist())]
    df = df.iloc[1:]
    return(df)

In [46]:
project_dirs = ["slamseq_MYB_degraded_timecourse_v1",
                "slamseq_MYB_degraded_v1_res_v7",
                "slamseq_inhibitors_v1"]

deseq_res_dirs = ["{}/{}".format(res_dir, project_dir) for project_dir in project_dirs]

In [50]:
deseq_files = [glob.glob(deseq_res_dir+"/*output.csv") for deseq_res_dir in deseq_res_dirs]
deseq_files = [item for sublist in deseq_files for item in sublist]
deseq_files

['../results/slamseq_MYB_degraded_timecourse_v1/slamseq_MYB_degraded_timecourse_v1_0_5_deseq_noAlt_output.csv',
 '../results/slamseq_MYB_degraded_v1_res_v7/slamseq_MYB_degraded_v1_res_v7_0_5_deseq_output.csv',
 '../results/slamseq_inhibitors_v1/slamseq_inhibitors_v1_0_5_deseq_noAlt_output.csv']

In [66]:
deseq_files

['../results/slamseq_MYB_degraded_timecourse_v1/slamseq_MYB_degraded_timecourse_v1_0_5_deseq_noAlt_output.csv',
 '../results/slamseq_MYB_degraded_v1_res_v7/slamseq_MYB_degraded_v1_res_v7_0_5_deseq_output.csv',
 '../results/slamseq_inhibitors_v1/slamseq_inhibitors_v1_0_5_deseq_noAlt_output.csv']

In [75]:
df_merged_test = mergeDESeqOutputs(deseq_files)

df1 cols: 170
df2 cols: 26
Merged rows: 5
Merged columns: 194
df3 cols: 122
Merged rows: 5
Merged columns: 314
Total unique genes: 18913


In [76]:
print(df_merged_test.shape)

(18919, 314)


In [99]:
df_save.equals(df_merged_test)

True

__save composite merged file__

In [107]:
"_".join([project.split("_v")[0].split("slamseq_")[1] for project in project_dirs])

'MYB_degraded_timecourse_MYB_degraded_inhibitors'

In [120]:
# save file
res_dir = "../results"
proj_group = "_".join([project.split("_v")[0].split("slamseq_")[1] for project in project_dirs])
deseq_res_file = "slamseq_{}merged_deseq_output{}.csv".format(proj_group+"_", "" if alt_loci == True else "_noAlt")
deseq_res_file

'slamseq_MYB_degraded_timecourse_MYB_degraded_inhibitors_merged_deseq_output_noAlt.csv'

In [121]:
# open output file
f = open("{}/{}".format(res_dir, deseq_res_file), "a")

# add project names as comments
f.write("# Merged DESeq project outputs\n")
for project in project_dirs:
    f.write("# {}\n".format(project))
f.write("\n")
df_save.to_csv(f, header=False, index=False)
f.close()

In [122]:
print("Saved DESeq results to: {}/{}".format(res_dir, deseq_res_file))

Saved DESeq results to: ../results/slamseq_MYB_degraded_timecourse_MYB_degraded_inhibitors_merged_deseq_output_noAlt.csv


In [111]:
# add project names as comments
f.write("# Merged DESeq outputs\n")
for project in project_dirs:
    f.write("# {}\n".format(project))
f.write("\n")

1

In [112]:
df_save.to_csv(f, header=False, index=False)
f.close()

In [109]:
df_save.to_csv("{}/{}".format(res_dir, deseq_res_file), header=False, index=False)

print("Saved DESeq results to: {}/{}".format(res_dir, deseq_res_file))

Saved DESeq results to: ../results/slamseq_MYB_degraded_timecourse_MYB_degraded_inhibitors_merged_deseq_output_noAlt.csv


In [115]:
df_test = pd.DataFrame({'a':[1,2,3], 'b':[1,2,3]})

In [119]:
with open(res_dir+"/merge_output_test.csv", "a") as file:
    file.write("# Merged DESeq project outputs\n")
    df_test.to_csv(file, header=True, index=False)

In [316]:
# save file
res_dir = "../results"
proj_group = "MYB_degraded"
deseq_res_file = "slamseq_{}merged_deseq_output{}.csv".format(proj_group+"_", "" if alt_loci == True else "_noAlt")

df_save.to_csv("{}/{}".format(res_dir, deseq_res_file), header=False, index=False)

print("Saved DESeq results to: {}/{}".format(res_dir, deseq_res_file))

Saved DESeq results to: ../results/slamseq_MYB_degraded_merged_deseq_output_noAlt.csv


In [329]:
deseq_files = [glob.glob(deseq_res_dir+"/*output.csv") for deseq_res_dir in deseq_res_dirs]
deseq_files = [item for sublist in deseq_files for item in sublist]
deseq_files

['../results/slamseq_MYB_degraded_timecourse_v1/slamseq_MYB_degraded_timecourse_v1_0_5_deseq_noAlt_output.csv',
 '../results/slamseq_MYB_degraded_v1_res_v7/slamseq_MYB_degraded_v1_res_v7_0_5_deseq_output.csv']

In [330]:
# list of deseq dfs
#deseq_dfs = [get_deseq_data(deseq_file, low_memory=False) for deseq_file in deseq_files]

# list of deseq dfs (no header)
deseq_dfs = [pd.read_csv(deseq_file, index_col=False, header=None, squeeze=True, low_memory=False)
             for deseq_file in deseq_files]

In [331]:
deseq_dfs[0].head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,160,161,162,163,164,165,166,167,168,169
0,project,project,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,...,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1,MYB_degraded_timecourse_v1
1,file,file,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,MYB_degraded_timecourse_v1_MYB_1h_ERCCsamplewi...,...,MYB_degraded_timecourse_v1_MYB_48h_0_5_deseq_r...,MYB_degraded_timecourse_v1_MYB_48h_0_5_deseq_r...,MYB_degraded_timecourse_v1_MYB_48h_0_5_deseq_r...,MYB_degraded_timecourse_v1_MYB_48h_0_5_deseq_r...,MYB_degraded_timecourse_v1_MYB_48h_0_5_deseq_t...,MYB_degraded_timecourse_v1_MYB_48h_0_5_deseq_t...,MYB_degraded_timecourse_v1_MYB_48h_0_5_deseq_t...,MYB_degraded_timecourse_v1_MYB_48h_0_5_deseq_t...,MYB_degraded_timecourse_v1_MYB_48h_0_5_deseq_t...,MYB_degraded_timecourse_v1_MYB_48h_0_5_deseq_t...
2,scale,scale,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,...,,,,,,,,,,
3,condition,condition,1h,1h,1h,1h,1h,1h,1h,1h,...,48h,48h,48h,48h,48h,48h,48h,48h,48h,48h
4,type,type,readcounts,readcounts,readcounts,readcounts,readcounts,readcounts,tccounts,tccounts,...,readcounts,readcounts,readcounts,readcounts,tccounts,tccounts,tccounts,tccounts,tccounts,tccounts


In [332]:
for df in deseq_dfs:
    print(df.shape)

(18311, 170)
(17217, 26)


In [333]:
deseq_dfs[1].head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,project,project,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,...,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1
1,file,file,MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_re...,MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_re...,MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_re...,MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_re...,MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_re...,MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_re...,MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_tc...,MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_tc...,...,MYB_degraded_v1_0_5_1h_deseq_readcounts.csv,MYB_degraded_v1_0_5_1h_deseq_readcounts.csv,MYB_degraded_v1_0_5_1h_deseq_readcounts.csv,MYB_degraded_v1_0_5_1h_deseq_readcounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv
2,scale,scale,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,...,,,,,,,,,,
3,condition,condition,1h,1h,1h,1h,1h,1h,1h,1h,...,1h,1h,1h,1h,1h,1h,1h,1h,1h,1h
4,type,type,readcounts,readcounts,readcounts,readcounts,readcounts,readcounts,tccounts,tccounts,...,readcounts,readcounts,readcounts,readcounts,tccounts,tccounts,tccounts,tccounts,tccounts,tccounts


# mebendazole_v1 (no alt)

In [32]:
project = project6
project_dir = "slamseq_{}".format(project)
res_dir = "../results"

In [35]:
files = ! ls $res_dir/$project_dir/*.csv
files = [file for file in files if "deseq" in file if "drop" not in file]
# ERCC files first
files = [file for file in files if "ERCCsamplewise" in file] + \
[file for file in files if "ERCCsamplewise" not in file]
print(len(files))
files

4


['../results/slamseq_mebendazole_v1/mebendazole_v1_DMSO_Mebendazole_ERCCsamplewise_0_5_deseq_readcounts.csv',
 '../results/slamseq_mebendazole_v1/mebendazole_v1_DMSO_Mebendazole_ERCCsamplewise_0_5_deseq_tccounts.csv',
 '../results/slamseq_mebendazole_v1/mebendazole_v1_DMSO_Mebendazole_0_5_deseq_readcounts.csv',
 '../results/slamseq_mebendazole_v1/mebendazole_v1_DMSO_Mebendazole_0_5_deseq_tccounts.csv']

In [37]:
# project 6
conditions = ["1h"]
scaling = ["ERCCsamplewise", None]
count_type = ["readcounts", "tccounts"]
alt_loci = False

__make info rows__

In [68]:
# df with file info
df_info = pd.DataFrame(list(zip(["project", "file", "scale", "condition", "type"],
                                ["project", "file", "scale", "condition", "type"])))

In [69]:
df_info = pd.concat([df_info] + [pd.DataFrame([x*6 for x in get_file_info(project2, file, conditions)]) \
                                 for file in files], axis=1)
df_info.shape

(5, 26)

__get dfs and merge__

In [70]:
# Get list of deseq dfs
deseq_dfs = [get_deseq_data(file) for file in files]

In [71]:
# main df
df_merged = pd.DataFrame()
# add gene cols to main df
df_merged = df_merged.append(deseq_dfs[0][["genes", "gene_id"]])

In [72]:
# join df columns on gene & gene_id columns
df_merged = reduce(lambda left, right: pd.merge(left, right, on=['genes', "gene_id"], how="outer"), 
                   [df_merged] + deseq_dfs)
df_merged.shape

(14292, 26)

In [43]:
df_merged.iloc[:5]

Unnamed: 0,genes,gene_id,baseMean_mebendazole_v1_DMSO_Mebendazole_ERCCsamplewise_0_5_deseq_readcounts,log2FoldChange_mebendazole_v1_DMSO_Mebendazole_ERCCsamplewise_0_5_deseq_readcounts,lfcSE_mebendazole_v1_DMSO_Mebendazole_ERCCsamplewise_0_5_deseq_readcounts,stat_mebendazole_v1_DMSO_Mebendazole_ERCCsamplewise_0_5_deseq_readcounts,pvalue_mebendazole_v1_DMSO_Mebendazole_ERCCsamplewise_0_5_deseq_readcounts,padj_mebendazole_v1_DMSO_Mebendazole_ERCCsamplewise_0_5_deseq_readcounts,baseMean_mebendazole_v1_DMSO_Mebendazole_ERCCsamplewise_0_5_deseq_tccounts,log2FoldChange_mebendazole_v1_DMSO_Mebendazole_ERCCsamplewise_0_5_deseq_tccounts,...,lfcSE_mebendazole_v1_DMSO_Mebendazole_0_5_deseq_readcounts,stat_mebendazole_v1_DMSO_Mebendazole_0_5_deseq_readcounts,pvalue_mebendazole_v1_DMSO_Mebendazole_0_5_deseq_readcounts,padj_mebendazole_v1_DMSO_Mebendazole_0_5_deseq_readcounts,baseMean_mebendazole_v1_DMSO_Mebendazole_0_5_deseq_tccounts,log2FoldChange_mebendazole_v1_DMSO_Mebendazole_0_5_deseq_tccounts,lfcSE_mebendazole_v1_DMSO_Mebendazole_0_5_deseq_tccounts,stat_mebendazole_v1_DMSO_Mebendazole_0_5_deseq_tccounts,pvalue_mebendazole_v1_DMSO_Mebendazole_0_5_deseq_tccounts,padj_mebendazole_v1_DMSO_Mebendazole_0_5_deseq_tccounts
0,A1BG,A1BG,0.980389,-0.444769,1.330763,0.334221,0.738213,,0.0,-0.0,...,1.333388,0.361329,0.717854,,0.0,-0.0,,,0.0,
1,NAT1,NAT1,138.921088,-0.133044,0.171409,0.776179,0.437643,0.798069,12.356874,-0.335894,...,0.168504,0.906322,0.364765,0.705445,12.286265,-0.348474,0.37778,0.922427,0.356306,
2,AAMP,AAMP,352.541875,0.114062,0.10594,-1.076667,0.281629,0.709864,7.98088,0.134371,...,0.087859,-1.036432,0.300001,0.656839,7.883212,0.11409,0.448307,-0.254491,0.799116,
3,AARS1,AARS1,245.079899,-0.252932,0.136319,1.855436,0.063534,0.427695,13.306179,-0.129994,...,0.122509,2.265178,0.023502,0.21196,13.205238,-0.14642,0.376884,0.388502,0.697645,
4,ABAT,ABAT,70.946169,-0.198707,0.17729,1.120802,0.262372,0.698348,2.541981,-0.29173,...,0.16631,1.337286,0.181129,0.539089,2.517329,-0.310585,0.872674,0.355901,0.721915,


In [44]:
[os.path.basename(x) for x in files]

['mebendazole_v1_DMSO_Mebendazole_ERCCsamplewise_0_5_deseq_readcounts.csv',
 'mebendazole_v1_DMSO_Mebendazole_ERCCsamplewise_0_5_deseq_tccounts.csv',
 'mebendazole_v1_DMSO_Mebendazole_0_5_deseq_readcounts.csv',
 'mebendazole_v1_DMSO_Mebendazole_0_5_deseq_tccounts.csv']

In [73]:
# change column names
df_merged.columns = [col_name.split("_")[0] if "gene" not in col_name else 
                     col_name for col_name in df_merged.columns.tolist()]
df_merged = df_merged.T.reset_index().T.reset_index(drop=True)
df_merged.shape

(14293, 26)

__add info cols__

In [74]:
print(df_info.shape)
print(df_merged.shape)

(5, 26)
(14293, 26)


In [75]:
# reset column index
df_info = df_info.T.reset_index(drop=True).T

In [76]:
%%time

# combine info and data
df_save = df_info.append(df_merged, ignore_index=True)
df_save.shape

CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 2.84 ms


(14298, 26)

__save merged file__

In [77]:
# save file
res_dir = "../results"
deseq_res_file = "slamseq_{}_0_5_deseq_{}output.csv".format(project, "" if alt_loci == True else "noAlt_")

df_save.to_csv("{}/{}/{}".format(res_dir, project_dir, deseq_res_file), header=False, index=False)

print("Saved DESeq results to: {}/{}".format(project_dir, deseq_res_file))

Saved DESeq results to: slamseq_mebendazole_v1/slamseq_mebendazole_v1_0_5_deseq_noAlt_output.csv


In [82]:
deseq_dfs[0].shape

(14292, 8)

In [86]:
df_save.columns.tolist()

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25]

In [113]:
print(list(range(0,8)))
print(list(range(8,14)))
print(list(range(14,20)))
print(list(range(20,26)))

[0, 1, 2, 3, 4, 5, 6, 7]
[8, 9, 10, 11, 12, 13]
[14, 15, 16, 17, 18, 19]
[20, 21, 22, 23, 24, 25]


In [118]:
df_save.iloc[:10,[0, 1, 20, 21, 22, 23, 24, 25]]

Unnamed: 0,0,1,20,21,22,23,24,25
0,project,project,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1
1,file,file,mebendazole_v1_DMSO_Mebendazole_0_5_deseq_tcco...,mebendazole_v1_DMSO_Mebendazole_0_5_deseq_tcco...,mebendazole_v1_DMSO_Mebendazole_0_5_deseq_tcco...,mebendazole_v1_DMSO_Mebendazole_0_5_deseq_tcco...,mebendazole_v1_DMSO_Mebendazole_0_5_deseq_tcco...,mebendazole_v1_DMSO_Mebendazole_0_5_deseq_tcco...
2,scale,scale,,,,,,
3,condition,condition,1h,1h,1h,1h,1h,1h
4,type,type,tccounts,tccounts,tccounts,tccounts,tccounts,tccounts
5,genes,gene_id,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
6,A1BG,A1BG,0.0,-0.0,,,0.0,
7,NAT1,NAT1,12.286265,-0.348474,0.37778,0.922427,0.356306,
8,AAMP,AAMP,7.883212,0.11409,0.448307,-0.254491,0.799116,
9,AARS1,AARS1,13.205238,-0.14642,0.376884,0.388502,0.697645,


In [117]:
deseq_dfs[3].iloc[:4,[7,6,0,1,2,3,4,5]]

Unnamed: 0,gene_id,genes,baseMean_mebendazole_v1_DMSO_Mebendazole_0_5_deseq_tccounts,log2FoldChange_mebendazole_v1_DMSO_Mebendazole_0_5_deseq_tccounts,lfcSE_mebendazole_v1_DMSO_Mebendazole_0_5_deseq_tccounts,stat_mebendazole_v1_DMSO_Mebendazole_0_5_deseq_tccounts,pvalue_mebendazole_v1_DMSO_Mebendazole_0_5_deseq_tccounts,padj_mebendazole_v1_DMSO_Mebendazole_0_5_deseq_tccounts
0,A1BG,A1BG,0.0,-0.0,,,0.0,
1,NAT1,NAT1,12.286265,-0.348474,0.37778,0.922427,0.356306,
2,AAMP,AAMP,7.883212,0.11409,0.448307,-0.254491,0.799116,
3,AARS1,AARS1,13.205238,-0.14642,0.376884,0.388502,0.697645,
