# Slamseq

In [2]:
from __future__ import print_function
import os.path
import pandas as pd
import sys
sys.path.insert(0, '../../')
import seaborn as sns
import numpy as np
from functools import reduce
import glob

from genepy.utils import helper as h
from genepy.utils import plot as genepyPlot
from genepy.rna import pyDESeq2
import genepy.rna as rna

from bokeh.plotting import *
from bokeh.models import HoverTool
from bokeh.io import output_notebook

import matplotlib.pyplot as plt

from sklearn.neighbors import KNeighborsClassifier
from sklearn.manifold import MDS, TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
#from umap import UMAP

output_notebook()
%load_ext autoreload
%matplotlib inline
%autoreload 2
%load_ext rpy2.ipython

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


## Get files

Expermients
- IRF2BP2 degraded v1
    - 3h
    - 6h
- MYB degraded v1
    - 1h
- 4 degrons v1
    - RUNX1
    - RUNX2
    - SPI1
    - GFI1

DESeq output files
- Tccounts ERCC
- Readcounts ERCC
- Tccounts (unscaled)
- Readcounts (unscaled)

In [4]:
res_dir = "../results"

In [7]:
project1 = "IRF2BP2_degraded_v1"
project2 = "MYB_degraded_v1"
project3 = "4_degrons_v1"

In [191]:
def get_file_info(project, file, conditions):
    
    project_row = [project]
    #name = os.path.basename(file).split(".")[0]
    file_row = [os.path.basename(file)]
    scaling_row = ["ERCCsamplewise" if "ERCCsamplewise" in file else None]
    condition_row = [cond for cond in conditions if cond in file]
    count_row = ["readcounts" if "readcounts" in file else "tccounts"]
    
    return [project_row, file_row, scaling_row, condition_row, count_row]

In [192]:
def get_deseq_data(file):
    ''' input deseq file & returns df '''
    ''' add file name to headers except gene columns '''
    df = pd.read_csv(file, header=[0], index_col=None, squeeze=True)
    df.columns = ["{}_{}".format(col_name, os.path.basename(file).split(".")[0]) if "gene" not in col_name
                  else col_name for col_name in df.columns.tolist()]
    return(df)

# IRF2BP2_degraded_v1 (project 1)

__res_v4 no alt__

In [14]:
project = "IRF2BP2_degraded_v1"
res_v = "res_v4"
project_dir = "slamseq_{}_{}".format(project, res_v)
#proj_dir = "slamseq_{}".format(project1)

In [15]:
files = ! ls $res_dir/$project_dir/*.csv
files = [file for file in files if "deseq" in file if "drop" not in file]
# ERCC files first
files = [file for file in files if "ERCCsamplewise" in file] + \
[file for file in files if "ERCCsamplewise" not in file]
print(len(files))

8


In [16]:
# project 1
conditions = ["3h", "6h"]
scaling = ["ERCCsamplewise", None]
count_type = ["readcounts", "tccounts"]

### make info rows

In [17]:
# df with file info
df_info = pd.DataFrame(list(zip(["project", "file", "scale", "condition", "type"],
                                ["project", "file", "scale", "condition", "type"])))

In [19]:
df_info = pd.concat([df_info] + [pd.DataFrame([x*6 for x in get_file_info(project, file, conditions)]) \
                                 for file in files], axis=1)
df_info.shape

(5, 50)

In [20]:
df_info.head()

Unnamed: 0,0,1,0.1,1.1,2,3,4,5,0.2,1.2,...,2.1,3.1,4.1,5.1,0.3,1.3,2.2,3.2,4.2,5.2
0,project,project,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,...,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1
1,file,file,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,...,IRF2BP2_degraded_v1_0_5_6h_deseq_readcounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_readcounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_readcounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_readcounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_tccounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_tccounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_tccounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_tccounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_tccounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_tccounts.csv
2,scale,scale,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,...,,,,,,,,,,
3,condition,condition,3h,3h,3h,3h,3h,3h,3h,3h,...,6h,6h,6h,6h,6h,6h,6h,6h,6h,6h
4,type,type,readcounts,readcounts,readcounts,readcounts,readcounts,readcounts,tccounts,tccounts,...,readcounts,readcounts,readcounts,readcounts,tccounts,tccounts,tccounts,tccounts,tccounts,tccounts


### get dfs and merge

In [21]:
# Get list of deseq dfs
deseq_dfs = [get_deseq_data(file) for file in files]

In [23]:
# main df
df_merged = pd.DataFrame()
df_merged = df_merged.append(deseq_dfs[0][["genes", "gene_id"]]) # add gene cols to main df

In [25]:
# join df columns on gene & gene_id columns
df_merged = reduce(lambda left, right: pd.merge(left, right, on=['genes', "gene_id"], how="outer"), 
                   [df_merged] + deseq_dfs)
df_merged.shape

(15632, 50)

In [26]:
df_merged.iloc[:2]

Unnamed: 0,genes,gene_id,baseMean_IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_deseq_readcounts,log2FoldChange_IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_deseq_readcounts,lfcSE_IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_deseq_readcounts,stat_IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_deseq_readcounts,pvalue_IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_deseq_readcounts,padj_IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_deseq_readcounts,baseMean_IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_deseq_tccounts,log2FoldChange_IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_deseq_tccounts,...,lfcSE_IRF2BP2_degraded_v1_0_5_6h_deseq_readcounts,stat_IRF2BP2_degraded_v1_0_5_6h_deseq_readcounts,pvalue_IRF2BP2_degraded_v1_0_5_6h_deseq_readcounts,padj_IRF2BP2_degraded_v1_0_5_6h_deseq_readcounts,baseMean_IRF2BP2_degraded_v1_0_5_6h_deseq_tccounts,log2FoldChange_IRF2BP2_degraded_v1_0_5_6h_deseq_tccounts,lfcSE_IRF2BP2_degraded_v1_0_5_6h_deseq_tccounts,stat_IRF2BP2_degraded_v1_0_5_6h_deseq_tccounts,pvalue_IRF2BP2_degraded_v1_0_5_6h_deseq_tccounts,padj_IRF2BP2_degraded_v1_0_5_6h_deseq_tccounts
0,NAT1,NAT1,95.870596,-0.461812,0.183899,2.511222,0.012031,0.176482,5.033275,-0.560236,...,0.249984,1.345021,0.178618,0.918252,0.75263,2.004205,1.827815,-1.096503,0.272859,
1,AAMP,AAMP,842.371621,-0.147606,0.118079,1.250062,0.211277,0.621102,14.553492,0.310173,...,0.143612,-1.174215,0.240309,0.95945,7.433181,0.284259,0.600667,-0.473238,0.636043,


In [27]:
# change column names
df_merged.columns = [col_name.split("_")[0] if "gene" not in col_name else 
                     col_name for col_name in df_merged.columns.tolist()]
df_merged = df_merged.T.reset_index().T.reset_index(drop=True)
df_merged.shape

(15633, 50)

In [28]:
df_merged.iloc[:2]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,genes,gene_id,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,baseMean,log2FoldChange,...,lfcSE,stat,pvalue,padj,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
1,NAT1,NAT1,95.870596,-0.461812,0.183899,2.511222,0.012031,0.176482,5.033275,-0.560236,...,0.249984,1.345021,0.178618,0.918252,0.75263,2.004205,1.827815,-1.096503,0.272859,


### add info columns

In [29]:
print(df_info.shape)
print(df_merged.shape)

(5, 50)
(15633, 50)


In [30]:
# reset column index
df_info = df_info.T.reset_index(drop=True).T

In [31]:
%%time

# combine info and data
df_save = df_info.append(df_merged, ignore_index=True)
df_save.shape

CPU times: user 12 ms, sys: 0 ns, total: 12 ms
Wall time: 10.4 ms


(15638, 50)

### save merged file

In [33]:
# save file
res_dir = "../results"
deseq_res_file = "slamseq_{}_{}_0_5_deseq_output.csv".format(project, res_v)

df_save.to_csv("{}/{}/{}".format(res_dir, project_dir, deseq_res_file), header=False, index=False)

print("Saved DESeq results to: {}/{}".format(project_dir, deseq_res_file))

Saved DESeq results to: slamseq_IRF2BP2_degraded_v1_res_v4/slamseq_IRF2BP2_degraded_v1_res_v4_0_5_deseq_output.csv


# MYB_degraded_v1 (project 2)

In [34]:
# read in merged deseq file
res_dir = "../results"
deseq_res_file = "slamseq_merged_deseq_output.csv"

df_save = pd.read_csv("{}/{}".format(res_dir, deseq_res_file), index_col=False, header=None, squeeze=True)
print(df_save.shape)

(17275, 74)


  exec(code_obj, self.user_global_ns, self.user_ns)


In [35]:
project = project2
project_dir = "slamseq_{}".format(project)

In [36]:
files = ! ls $res_dir/$project_dir/*.csv
files = [file for file in files if "deseq" in file if "drop" not in file]
# ERCC files first
files = [file for file in files if "ERCCsamplewise" in file] + \
[file for file in files if "ERCCsamplewise" not in file]
print(len(files))

4


In [37]:
# project 2
conditions = ["1h"]
scaling = ["ERCCsamplewise", None]
count_type = ["readcounts", "tccounts"]

### make info rows

In [164]:
# df with file info
df_info = pd.DataFrame(list(zip(["project", "file", "scale", "condition", "type"],
                                ["project", "file", "scale", "condition", "type"])))

In [165]:
df_info = pd.concat([df_info] + [pd.DataFrame([x*6 for x in get_file_info(project2, file, conditions)]) \
                                 for file in files], axis=1)
df_info.shape

(5, 26)

In [168]:
df_info.iloc[:,[0,1,20,21,22,23,24]]

Unnamed: 0,0,1,0.1,1.1,2,3,4
0,project,project,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1
1,file,file,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv
2,scale,scale,,,,,
3,condition,condition,1h,1h,1h,1h,1h
4,type,type,tccounts,tccounts,tccounts,tccounts,tccounts


### merge dfs

In [10]:
# Get list of deseq dfs
deseq_dfs = [get_deseq_data(file) for file in files]
deseq_dfs[0].iloc[:1]

Unnamed: 0,baseMean_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,log2FoldChange_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,lfcSE_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,stat_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,pvalue_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,padj_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,genes,gene_id
0,1.877541,-0.045789,1.010641,0.045307,0.963863,,A1BG,A1BG


In [11]:
# main df
df_merged = pd.DataFrame()
df_merged = df_merged.append(deseq_dfs[0][["genes", "gene_id"]]) # add gene cols to main df

In [12]:
# join df columns on gene & gene_id columns
df_merged = reduce(lambda left, right: pd.merge(left, right, on=['genes', "gene_id"], how="outer"), 
                   [df_merged] + deseq_dfs)
df_merged.shape

(16558, 26)

In [146]:
# change column names
df_merged.columns = [col_name.split("_")[0] if "gene" not in col_name else 
                     col_name for col_name in df_merged.columns.tolist()]
df_merged = df_merged.T.reset_index().T.reset_index(drop=True)

### merge info cols

In [147]:
df_save_info = df_save.iloc[:5,]

In [171]:
print(df_save_info.shape)
print(df_info.shape)

(5, 50)
(5, 26)


In [170]:
df_save_info

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,project,project,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,...,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1
1,file,file,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,...,IRF2BP2_degraded_v1_0_5_6h_deseq_readcounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_readcounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_readcounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_readcounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_tccounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_tccounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_tccounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_tccounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_tccounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_tccounts.csv
2,scale,scale,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,...,,,,,,,,,,
3,condition,condition,3h,3h,3h,3h,3h,3h,3h,3h,...,6h,6h,6h,6h,6h,6h,6h,6h,6h,6h
4,type,type,readcounts,readcounts,readcounts,readcounts,readcounts,readcounts,tccounts,tccounts,...,readcounts,readcounts,readcounts,readcounts,tccounts,tccounts,tccounts,tccounts,tccounts,tccounts


In [173]:
# reset column index
df_info = df_info.T.reset_index(drop=True).T

In [177]:
# merge info columns
df_info_merged = pd.merge(df_save_info, df_info, left_on=[0,1], right_on=[0,1])
df_info_merged = df_info_merged.T.reset_index(drop=True).T
df_info_merged.shape

(5, 74)

### merge gene data

In [None]:
# main df
df_merged = pd.DataFrame()
df_merged = df_merged.append(deseq_dfs[0][["genes", "gene_id"]]) # add gene cols to main df

In [179]:
print(df_merged.shape)
df_merged.iloc[:3]

(16559, 26)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,genes,gene_id,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,baseMean,log2FoldChange,...,lfcSE,stat,pvalue,padj,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
1,A1BG,A1BG,1.877541,-0.045789,1.010641,0.045307,0.963863,,0.05833,-0.614498,...,0.980153,-0.116824,0.907,,0.05833,-0.499954,3.11654,0.16042,0.87255,
2,A2MP1,A2MP1,5.229468,-0.26904,0.500213,0.537851,0.59068,,0.0,-0.0,...,0.478116,0.372684,0.709383,,0.0,-0.0,,,0.0,


In [182]:
# merged data w/o info
df_saved_output = df_save.iloc[5:]
df_saved_output.shape

(15113, 50)

In [190]:
saved_genes = df_saved_output.iloc[1:,0].tolist()
print(len(saved_genes))
new_genes = df_merged.iloc[1:,0].tolist()
print(len(new_genes))
print("Total unique genes combined: {}".format(len(set(saved_genes+new_genes))))

15112
16558
Total unique genes combined: 17269


In [194]:
df_saved_output.iloc[:3]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
5,genes,gene_id,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,baseMean,log2FoldChange,...,lfcSE,stat,pvalue,padj,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
6,NAT1,NAT1,98.73474046943294,-0.3588695183526639,0.1795865924500918,1.998309080074538,0.0456831617026497,0.5573132426854484,5.021740199533236,-0.5547845714960977,...,0.2491509542089338,1.3371348848527798,0.181178611933578,0.8915384574240992,0.7530778794721894,2.014292355431807,1.8320814934075496,-1.0994556534083848,0.2715693670830352,
7,AAMP,AAMP,872.5000270269832,-0.0418951057520827,0.1166181865869584,0.3592501905424756,0.7194079326130601,0.9684493014304824,14.549760596196997,0.3164736874507167,...,0.1441953888640895,-1.1898593830650157,0.2341016648908468,0.9320326022466292,7.440114778273713,0.290637206923639,0.6010687521535898,-0.483534048113972,0.6287165903309597,


In [201]:
df_saved_output.columns = df_saved_output.iloc[0]
df_saved_output = df_saved_output[1:]

In [203]:
df_saved_output[:2]

5,genes,gene_id,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,baseMean.1,log2FoldChange.1,...,lfcSE.1,stat.1,pvalue.1,padj.1,baseMean.2,log2FoldChange.2,lfcSE.2,stat.2,pvalue.2,padj.2
6,NAT1,NAT1,98.73474046943294,-0.3588695183526639,0.1795865924500918,1.998309080074538,0.0456831617026497,0.5573132426854484,5.021740199533236,-0.5547845714960977,...,0.2491509542089338,1.3371348848527798,0.181178611933578,0.8915384574240992,0.7530778794721894,2.014292355431807,1.8320814934075496,-1.0994556534083848,0.2715693670830352,
7,AAMP,AAMP,872.5000270269832,-0.0418951057520827,0.1166181865869584,0.3592501905424756,0.7194079326130601,0.9684493014304824,14.549760596196997,0.3164736874507167,...,0.1441953888640895,-1.1898593830650157,0.2341016648908468,0.9320326022466292,7.440114778273713,0.290637206923639,0.6010687521535898,-0.483534048113972,0.6287165903309597,


In [216]:
df_saved_output.shape

(15112, 50)

In [223]:
# change number values to float
num_cols = list(range(2,df_saved_output.shape[1]))
df_saved_output.iloc[:,num_cols] = df_saved_output.iloc[:,num_cols].astype(float)

__Make df with all genes__

In [241]:
print("Saved df genes: {}".format(len(df_saved_output['genes'])))
print("Merged df genes: {}".format(len(df_merged['genes'])))
merged_genes = list(set(df_saved_output['genes'].tolist() + df_merged['genes'].tolist()))
merged_genes.sort()
merged_genes = sorted(merged_genes, key=lambda x: (x.isnumeric(),int(x) if x.isnumeric() else x))
print("Total unique genes: {}".format(len(merged_genes)))

Saved df genes: 15112
Merged df genes: 16558
Total unique genes: 17269


In [243]:
df = pd.DataFrame(list(zip(merged_genes, merged_genes)),
                  columns=["genes", "gene_id"])

In [245]:
df = reduce(lambda left, right: pd.merge(left, right, on=['genes', "gene_id"], how="outer"), 
                   [df, df_saved_output, df_merged])

In [246]:
print(df.shape)
df.head()

(17269, 74)


Unnamed: 0,genes,gene_id,baseMean_x,log2FoldChange_x,lfcSE_x,stat_x,pvalue_x,padj_x,baseMean_x.1,log2FoldChange_x.1,...,lfcSE_y,stat_y,pvalue_y,padj_y,baseMean_y,log2FoldChange_y,lfcSE_y.1,stat_y.1,pvalue_y.1,padj_y.1
0,A1BG,A1BG,,,,,,,,,...,0.980153,-0.116824,0.907,,0.05833,-0.499954,3.11654,0.16042,0.87255,
1,A1BG-AS1,A1BG-AS1,25.689167,0.01394,0.311866,-0.044699,0.964347,0.997077,0.209183,-0.545181,...,0.179837,-1.153464,0.24872,0.999857,0.0,-0.0,,,0.0,
2,A1CF,A1CF,,,,,,,,,...,0.577384,0.735683,0.461924,,0.0,-0.0,,,0.0,
3,A2M-AS1,A2M-AS1,,,,,,,,,...,0.894369,-0.055995,0.955345,,0.0,-0.0,,,0.0,
4,A2ML1,A2ML1,,,,,,,,,...,0.706911,0.342414,0.732039,,0.0,-0.0,,,0.0,


In [247]:
# change column names
df.columns = [col_name.split("_")[0] if "gene" not in col_name else 
              col_name for col_name in df.columns.tolist()]
df = df.T.reset_index().T.reset_index(drop=True)

### add info cols

In [248]:
print(df.shape)
print(df_info_merged.shape)

(17270, 74)
(5, 74)


In [250]:
# reset column index
df_info_merged = df_info_merged.T.reset_index(drop=True).T

In [251]:
%%time

# combine info and data
df_save = df_info_merged.append(df, ignore_index=True)
df_save.shape

CPU times: user 12 ms, sys: 8 ms, total: 20 ms
Wall time: 22.7 ms


(17275, 74)

### save merged df

In [252]:
# save file
res_dir = "../results"
deseq_res_file = "slamseq_merged_deseq_output.csv"

df_save.to_csv("{}/{}".format(res_dir, deseq_res_file), header=False, index=False)

print("Saved DESeq results to: {}/{}".format(res_dir, deseq_res_file))

Saved DESeq results to: ../results/slamseq_merged_deseq_output.csv


# 4_degrons_v1 (RUNX1)

In [16]:
project = "4_degrons_v1"
project_dir = "slamseq_{}".format(project)
gene = "RUNX1"

In [17]:
files = ! ls $res_dir/$project_dir/*.csv
files = [file for file in files if "deseq" in file and gene in file]
# ERCC files first
files = [file for file in files if "ERCCsamplewise" in file] + \
[file for file in files if "ERCCsamplewise" not in file]
print(len(files))

4


In [18]:
files

['../results/slamseq_4_degrons_v1/4_degrons_v1_RUNX1_1h_ERCCsamplewise_0_5_deseq_readcounts.csv',
 '../results/slamseq_4_degrons_v1/4_degrons_v1_RUNX1_1h_ERCCsamplewise_0_5_deseq_tccounts.csv',
 '../results/slamseq_4_degrons_v1/4_degrons_v1_RUNX1_1h_0_5_deseq_readcounts.csv',
 '../results/slamseq_4_degrons_v1/4_degrons_v1_RUNX1_1h_0_5_deseq_tccounts.csv']

In [19]:
# project 3
conditions = ["1h"]
scaling = ["ERCCsamplewise", None]
count_type = ["readcounts", "tccounts"]

### make info rows

In [22]:
# df with file info
df_info = pd.DataFrame(list(zip(["project", "file", "scale", "condition", "type"],
                                ["project", "file", "scale", "condition", "type"])))

In [23]:
df_info = pd.concat([df_info] + [pd.DataFrame([x*6 for x in get_file_info(project, file, conditions)]) \
                                 for file in files], axis=1)
df_info.shape

(5, 26)

In [24]:
df_info.head()

Unnamed: 0,0,1,0.1,1.1,2,3,4,5,0.2,1.2,...,2.1,3.1,4.1,5.1,0.3,1.3,2.2,3.2,4.2,5.2
0,project,project,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,...,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1
1,file,file,4_degrons_v1_RUNX1_1h_ERCCsamplewise_0_5_deseq...,4_degrons_v1_RUNX1_1h_ERCCsamplewise_0_5_deseq...,4_degrons_v1_RUNX1_1h_ERCCsamplewise_0_5_deseq...,4_degrons_v1_RUNX1_1h_ERCCsamplewise_0_5_deseq...,4_degrons_v1_RUNX1_1h_ERCCsamplewise_0_5_deseq...,4_degrons_v1_RUNX1_1h_ERCCsamplewise_0_5_deseq...,4_degrons_v1_RUNX1_1h_ERCCsamplewise_0_5_deseq...,4_degrons_v1_RUNX1_1h_ERCCsamplewise_0_5_deseq...,...,4_degrons_v1_RUNX1_1h_0_5_deseq_readcounts.csv,4_degrons_v1_RUNX1_1h_0_5_deseq_readcounts.csv,4_degrons_v1_RUNX1_1h_0_5_deseq_readcounts.csv,4_degrons_v1_RUNX1_1h_0_5_deseq_readcounts.csv,4_degrons_v1_RUNX1_1h_0_5_deseq_tccounts.csv,4_degrons_v1_RUNX1_1h_0_5_deseq_tccounts.csv,4_degrons_v1_RUNX1_1h_0_5_deseq_tccounts.csv,4_degrons_v1_RUNX1_1h_0_5_deseq_tccounts.csv,4_degrons_v1_RUNX1_1h_0_5_deseq_tccounts.csv,4_degrons_v1_RUNX1_1h_0_5_deseq_tccounts.csv
2,scale,scale,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,...,,,,,,,,,,
3,condition,condition,1h,1h,1h,1h,1h,1h,1h,1h,...,1h,1h,1h,1h,1h,1h,1h,1h,1h,1h
4,type,type,readcounts,readcounts,readcounts,readcounts,readcounts,readcounts,tccounts,tccounts,...,readcounts,readcounts,readcounts,readcounts,tccounts,tccounts,tccounts,tccounts,tccounts,tccounts


### get dfs and merge

In [25]:
# Get list of deseq dfs
deseq_dfs = [get_deseq_data(file) for file in files]

In [26]:
# main df
df_merged = pd.DataFrame()
df_merged = df_merged.append(deseq_dfs[0][["genes", "gene_id"]]) # add gene cols to main df

In [27]:
# join df columns on gene & gene_id columns
df_merged = reduce(lambda left, right: pd.merge(left, right, on=['genes', "gene_id"], how="outer"), 
                   [df_merged] + deseq_dfs)
df_merged.shape

(16744, 26)

In [28]:
df_merged.iloc[:2]

Unnamed: 0,genes,gene_id,baseMean_4_degrons_v1_RUNX1_1h_ERCCsamplewise_0_5_deseq_readcounts,log2FoldChange_4_degrons_v1_RUNX1_1h_ERCCsamplewise_0_5_deseq_readcounts,lfcSE_4_degrons_v1_RUNX1_1h_ERCCsamplewise_0_5_deseq_readcounts,stat_4_degrons_v1_RUNX1_1h_ERCCsamplewise_0_5_deseq_readcounts,pvalue_4_degrons_v1_RUNX1_1h_ERCCsamplewise_0_5_deseq_readcounts,padj_4_degrons_v1_RUNX1_1h_ERCCsamplewise_0_5_deseq_readcounts,baseMean_4_degrons_v1_RUNX1_1h_ERCCsamplewise_0_5_deseq_tccounts,log2FoldChange_4_degrons_v1_RUNX1_1h_ERCCsamplewise_0_5_deseq_tccounts,...,lfcSE_4_degrons_v1_RUNX1_1h_0_5_deseq_readcounts,stat_4_degrons_v1_RUNX1_1h_0_5_deseq_readcounts,pvalue_4_degrons_v1_RUNX1_1h_0_5_deseq_readcounts,padj_4_degrons_v1_RUNX1_1h_0_5_deseq_readcounts,baseMean_4_degrons_v1_RUNX1_1h_0_5_deseq_tccounts,log2FoldChange_4_degrons_v1_RUNX1_1h_0_5_deseq_tccounts,lfcSE_4_degrons_v1_RUNX1_1h_0_5_deseq_tccounts,stat_4_degrons_v1_RUNX1_1h_0_5_deseq_tccounts,pvalue_4_degrons_v1_RUNX1_1h_0_5_deseq_tccounts,padj_4_degrons_v1_RUNX1_1h_0_5_deseq_tccounts
0,A2M,A2M,2.269069,-0.155731,1.023965,0.152086,0.879119,,0.0,-0.0,...,1.025011,0.157375,0.87495,,0.0,-0.0,,,0.0,
1,A2MP1,A2MP1,2.0503,3.698355,1.401673,-2.638529,0.008327,,0.0,-0.0,...,1.402745,-2.657264,0.007878,,0.0,-0.0,,,0.0,


### add info columns

In [29]:
print(df_info.shape)
print(df_merged.shape)

(5, 26)
(16744, 26)


In [30]:
# reset column index
df_info = df_info.T.reset_index(drop=True).T

In [31]:
%%time

# combine info and data
df_save = df_info.append(df_merged, ignore_index=True)
df_save.shape

CPU times: user 12 ms, sys: 0 ns, total: 12 ms
Wall time: 13.1 ms


(16749, 52)

### save merged file

In [33]:
# save file
res_dir = "../results"
deseq_res_file = "slamseq_{}_{}_0_5_deseq_output.csv".format(project, gene)

df_save.to_csv("{}/{}/{}".format(res_dir, project_dir, deseq_res_file), header=False, index=False)

print("Saved DESeq results to: {}/{}".format(project_dir, deseq_res_file))

Saved DESeq results to: slamseq_4_degrons_v1/slamseq_4_degrons_v1_RUNX1_0_5_deseq_output.csv


# 4_degrons_v1 (RUNX2)

In [34]:
project = "4_degrons_v1"
project_dir = "slamseq_{}".format(project)
gene = "RUNX2"

In [35]:
files = ! ls $res_dir/$project_dir/*.csv
files = [file for file in files if "deseq" in file and gene in file]
# ERCC files first
files = [file for file in files if "ERCCsamplewise" in file] + \
[file for file in files if "ERCCsamplewise" not in file]
print(len(files))

4


In [36]:
files

['../results/slamseq_4_degrons_v1/4_degrons_v1_RUNX2_1h_ERCCsamplewise_0_5_deseq_readcounts.csv',
 '../results/slamseq_4_degrons_v1/4_degrons_v1_RUNX2_1h_ERCCsamplewise_0_5_deseq_tccounts.csv',
 '../results/slamseq_4_degrons_v1/4_degrons_v1_RUNX2_1h_0_5_deseq_readcounts.csv',
 '../results/slamseq_4_degrons_v1/4_degrons_v1_RUNX2_1h_0_5_deseq_tccounts.csv']

In [37]:
# project 3
conditions = ["1h"]
scaling = ["ERCCsamplewise", None]
count_type = ["readcounts", "tccounts"]

### make info rows

In [38]:
# df with file info
df_info = pd.DataFrame(list(zip(["project", "file", "scale", "condition", "type"],
                                ["project", "file", "scale", "condition", "type"])))

In [39]:
df_info = pd.concat([df_info] + [pd.DataFrame([x*6 for x in get_file_info(project, file, conditions)]) \
                                 for file in files], axis=1)
df_info.shape

(5, 26)

In [40]:
df_info.head()

Unnamed: 0,0,1,0.1,1.1,2,3,4,5,0.2,1.2,...,2.1,3.1,4.1,5.1,0.3,1.3,2.2,3.2,4.2,5.2
0,project,project,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,...,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1
1,file,file,4_degrons_v1_RUNX2_1h_ERCCsamplewise_0_5_deseq...,4_degrons_v1_RUNX2_1h_ERCCsamplewise_0_5_deseq...,4_degrons_v1_RUNX2_1h_ERCCsamplewise_0_5_deseq...,4_degrons_v1_RUNX2_1h_ERCCsamplewise_0_5_deseq...,4_degrons_v1_RUNX2_1h_ERCCsamplewise_0_5_deseq...,4_degrons_v1_RUNX2_1h_ERCCsamplewise_0_5_deseq...,4_degrons_v1_RUNX2_1h_ERCCsamplewise_0_5_deseq...,4_degrons_v1_RUNX2_1h_ERCCsamplewise_0_5_deseq...,...,4_degrons_v1_RUNX2_1h_0_5_deseq_readcounts.csv,4_degrons_v1_RUNX2_1h_0_5_deseq_readcounts.csv,4_degrons_v1_RUNX2_1h_0_5_deseq_readcounts.csv,4_degrons_v1_RUNX2_1h_0_5_deseq_readcounts.csv,4_degrons_v1_RUNX2_1h_0_5_deseq_tccounts.csv,4_degrons_v1_RUNX2_1h_0_5_deseq_tccounts.csv,4_degrons_v1_RUNX2_1h_0_5_deseq_tccounts.csv,4_degrons_v1_RUNX2_1h_0_5_deseq_tccounts.csv,4_degrons_v1_RUNX2_1h_0_5_deseq_tccounts.csv,4_degrons_v1_RUNX2_1h_0_5_deseq_tccounts.csv
2,scale,scale,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,...,,,,,,,,,,
3,condition,condition,1h,1h,1h,1h,1h,1h,1h,1h,...,1h,1h,1h,1h,1h,1h,1h,1h,1h,1h
4,type,type,readcounts,readcounts,readcounts,readcounts,readcounts,readcounts,tccounts,tccounts,...,readcounts,readcounts,readcounts,readcounts,tccounts,tccounts,tccounts,tccounts,tccounts,tccounts


### get dfs and merge

In [41]:
# Get list of deseq dfs
deseq_dfs = [get_deseq_data(file) for file in files]

In [42]:
# main df
df_merged = pd.DataFrame()
df_merged = df_merged.append(deseq_dfs[0][["genes", "gene_id"]]) # add gene cols to main df

In [43]:
# join df columns on gene & gene_id columns
df_merged = reduce(lambda left, right: pd.merge(left, right, on=['genes', "gene_id"], how="outer"), 
                   [df_merged] + deseq_dfs)
df_merged.shape

(16744, 26)

In [44]:
df_merged.iloc[:2]

Unnamed: 0,genes,gene_id,baseMean_4_degrons_v1_RUNX2_1h_ERCCsamplewise_0_5_deseq_readcounts,log2FoldChange_4_degrons_v1_RUNX2_1h_ERCCsamplewise_0_5_deseq_readcounts,lfcSE_4_degrons_v1_RUNX2_1h_ERCCsamplewise_0_5_deseq_readcounts,stat_4_degrons_v1_RUNX2_1h_ERCCsamplewise_0_5_deseq_readcounts,pvalue_4_degrons_v1_RUNX2_1h_ERCCsamplewise_0_5_deseq_readcounts,padj_4_degrons_v1_RUNX2_1h_ERCCsamplewise_0_5_deseq_readcounts,baseMean_4_degrons_v1_RUNX2_1h_ERCCsamplewise_0_5_deseq_tccounts,log2FoldChange_4_degrons_v1_RUNX2_1h_ERCCsamplewise_0_5_deseq_tccounts,...,lfcSE_4_degrons_v1_RUNX2_1h_0_5_deseq_readcounts,stat_4_degrons_v1_RUNX2_1h_0_5_deseq_readcounts,pvalue_4_degrons_v1_RUNX2_1h_0_5_deseq_readcounts,padj_4_degrons_v1_RUNX2_1h_0_5_deseq_readcounts,baseMean_4_degrons_v1_RUNX2_1h_0_5_deseq_tccounts,log2FoldChange_4_degrons_v1_RUNX2_1h_0_5_deseq_tccounts,lfcSE_4_degrons_v1_RUNX2_1h_0_5_deseq_tccounts,stat_4_degrons_v1_RUNX2_1h_0_5_deseq_tccounts,pvalue_4_degrons_v1_RUNX2_1h_0_5_deseq_tccounts,padj_4_degrons_v1_RUNX2_1h_0_5_deseq_tccounts
0,A2M,A2M,2.168326,0.11384,0.946798,-0.120237,0.904295,,0.0,-0.0,...,0.94386,-0.168996,0.8658,,0.0,-0.0,,,0.0,
1,A2MP1,A2MP1,0.986161,2.369922,1.588698,-1.491739,0.135768,,0.0,-0.0,...,1.582256,-1.52934,0.12618,,0.0,-0.0,,,0.0,


### add info columns

In [45]:
print(df_info.shape)
print(df_merged.shape)

(5, 26)
(16744, 26)


In [46]:
# reset column index
df_info = df_info.T.reset_index(drop=True).T

In [47]:
%%time

# combine info and data
df_save = df_info.append(df_merged, ignore_index=True)
df_save.shape

CPU times: user 8 ms, sys: 4 ms, total: 12 ms
Wall time: 13.4 ms


(16749, 52)

### save merged file

In [48]:
# save file
res_dir = "../results"
deseq_res_file = "slamseq_{}_{}_0_5_deseq_output.csv".format(project, gene)

df_save.to_csv("{}/{}/{}".format(res_dir, project_dir, deseq_res_file), header=False, index=False)

print("Saved DESeq results to: {}/{}".format(project_dir, deseq_res_file))

Saved DESeq results to: slamseq_4_degrons_v1/slamseq_4_degrons_v1_RUNX2_0_5_deseq_output.csv


# 4_degrons_v1 (SPI1)

In [65]:
project = "4_degrons_v1"
project_dir = "slamseq_{}".format(project)
gene = "SPI1"

In [66]:
files = ! ls $res_dir/$project_dir/*.csv
files = [file for file in files if "deseq" in file and gene in file]
# ERCC files first
files = [file for file in files if "ERCCsamplewise" in file] + \
[file for file in files if "ERCCsamplewise" not in file]
print(len(files))

4


In [67]:
files

['../results/slamseq_4_degrons_v1/4_degrons_v1_SPI1_1h_ERCCsamplewise_0_5_deseq_readcounts.csv',
 '../results/slamseq_4_degrons_v1/4_degrons_v1_SPI1_1h_ERCCsamplewise_0_5_deseq_tccounts.csv',
 '../results/slamseq_4_degrons_v1/4_degrons_v1_SPI1_1h_0_5_deseq_readcounts.csv',
 '../results/slamseq_4_degrons_v1/4_degrons_v1_SPI1_1h_0_5_deseq_tccounts.csv']

In [68]:
# project 3
conditions = ["1h"]
scaling = ["ERCCsamplewise", None]
count_type = ["readcounts", "tccounts"]

### make info rows

In [69]:
# df with file info
df_info = pd.DataFrame(list(zip(["project", "file", "scale", "condition", "type"],
                                ["project", "file", "scale", "condition", "type"])))

In [70]:
df_info = pd.concat([df_info] + [pd.DataFrame([x*6 for x in get_file_info(project, file, conditions)]) \
                                 for file in files], axis=1)
df_info.shape

(5, 26)

In [71]:
df_info.head()

Unnamed: 0,0,1,0.1,1.1,2,3,4,5,0.2,1.2,...,2.1,3.1,4.1,5.1,0.3,1.3,2.2,3.2,4.2,5.2
0,project,project,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,...,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1
1,file,file,4_degrons_v1_SPI1_1h_ERCCsamplewise_0_5_deseq_...,4_degrons_v1_SPI1_1h_ERCCsamplewise_0_5_deseq_...,4_degrons_v1_SPI1_1h_ERCCsamplewise_0_5_deseq_...,4_degrons_v1_SPI1_1h_ERCCsamplewise_0_5_deseq_...,4_degrons_v1_SPI1_1h_ERCCsamplewise_0_5_deseq_...,4_degrons_v1_SPI1_1h_ERCCsamplewise_0_5_deseq_...,4_degrons_v1_SPI1_1h_ERCCsamplewise_0_5_deseq_...,4_degrons_v1_SPI1_1h_ERCCsamplewise_0_5_deseq_...,...,4_degrons_v1_SPI1_1h_0_5_deseq_readcounts.csv,4_degrons_v1_SPI1_1h_0_5_deseq_readcounts.csv,4_degrons_v1_SPI1_1h_0_5_deseq_readcounts.csv,4_degrons_v1_SPI1_1h_0_5_deseq_readcounts.csv,4_degrons_v1_SPI1_1h_0_5_deseq_tccounts.csv,4_degrons_v1_SPI1_1h_0_5_deseq_tccounts.csv,4_degrons_v1_SPI1_1h_0_5_deseq_tccounts.csv,4_degrons_v1_SPI1_1h_0_5_deseq_tccounts.csv,4_degrons_v1_SPI1_1h_0_5_deseq_tccounts.csv,4_degrons_v1_SPI1_1h_0_5_deseq_tccounts.csv
2,scale,scale,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,...,,,,,,,,,,
3,condition,condition,1h,1h,1h,1h,1h,1h,1h,1h,...,1h,1h,1h,1h,1h,1h,1h,1h,1h,1h
4,type,type,readcounts,readcounts,readcounts,readcounts,readcounts,readcounts,tccounts,tccounts,...,readcounts,readcounts,readcounts,readcounts,tccounts,tccounts,tccounts,tccounts,tccounts,tccounts


### get dfs and merge

In [72]:
# Get list of deseq dfs
deseq_dfs = [get_deseq_data(file) for file in files]

In [73]:
# main df
df_merged = pd.DataFrame()
df_merged = df_merged.append(deseq_dfs[0][["genes", "gene_id"]]) # add gene cols to main df

In [74]:
# join df columns on gene & gene_id columns
df_merged = reduce(lambda left, right: pd.merge(left, right, on=['genes', "gene_id"], how="outer"), 
                   [df_merged] + deseq_dfs)
df_merged.shape

(16744, 26)

In [75]:
df_merged.iloc[:2]

Unnamed: 0,genes,gene_id,baseMean_4_degrons_v1_SPI1_1h_ERCCsamplewise_0_5_deseq_readcounts,log2FoldChange_4_degrons_v1_SPI1_1h_ERCCsamplewise_0_5_deseq_readcounts,lfcSE_4_degrons_v1_SPI1_1h_ERCCsamplewise_0_5_deseq_readcounts,stat_4_degrons_v1_SPI1_1h_ERCCsamplewise_0_5_deseq_readcounts,pvalue_4_degrons_v1_SPI1_1h_ERCCsamplewise_0_5_deseq_readcounts,padj_4_degrons_v1_SPI1_1h_ERCCsamplewise_0_5_deseq_readcounts,baseMean_4_degrons_v1_SPI1_1h_ERCCsamplewise_0_5_deseq_tccounts,log2FoldChange_4_degrons_v1_SPI1_1h_ERCCsamplewise_0_5_deseq_tccounts,...,lfcSE_4_degrons_v1_SPI1_1h_0_5_deseq_readcounts,stat_4_degrons_v1_SPI1_1h_0_5_deseq_readcounts,pvalue_4_degrons_v1_SPI1_1h_0_5_deseq_readcounts,padj_4_degrons_v1_SPI1_1h_0_5_deseq_readcounts,baseMean_4_degrons_v1_SPI1_1h_0_5_deseq_tccounts,log2FoldChange_4_degrons_v1_SPI1_1h_0_5_deseq_tccounts,lfcSE_4_degrons_v1_SPI1_1h_0_5_deseq_tccounts,stat_4_degrons_v1_SPI1_1h_0_5_deseq_tccounts,pvalue_4_degrons_v1_SPI1_1h_0_5_deseq_tccounts,padj_4_degrons_v1_SPI1_1h_0_5_deseq_tccounts
0,A2M,A2M,0.0,-0.0,,,0.0,,0.0,-0.0,...,,,0.0,,0.0,-0.0,,,0.0,
1,A2MP1,A2MP1,0.203748,-0.450371,2.92638,0.1539,0.877688,,0.0,-0.0,...,2.952092,0.004913,0.99608,,0.0,-0.0,,,0.0,


### add info columns

In [76]:
print(df_info.shape)
print(df_merged.shape)

(5, 26)
(16744, 26)


In [77]:
# reset column index
df_info = df_info.T.reset_index(drop=True).T

In [78]:
%%time

# combine info and data
df_save = df_info.append(df_merged, ignore_index=True)
df_save.shape

CPU times: user 12 ms, sys: 0 ns, total: 12 ms
Wall time: 11.7 ms


(16749, 52)

### save merged file

In [79]:
# save file
res_dir = "../results"
deseq_res_file = "slamseq_{}_{}_0_5_deseq_output.csv".format(project, gene)

df_save.to_csv("{}/{}/{}".format(res_dir, project_dir, deseq_res_file), header=False, index=False)

print("Saved DESeq results to: {}/{}".format(project_dir, deseq_res_file))

Saved DESeq results to: slamseq_4_degrons_v1/slamseq_4_degrons_v1_SPI1_0_5_deseq_output.csv


# 4_degrons_v1 (GFI1)

In [80]:
project = "4_degrons_v1"
project_dir = "slamseq_{}".format(project)
gene = "GFI1"

In [81]:
files = ! ls $res_dir/$project_dir/*.csv
files = [file for file in files if "deseq" in file and gene in file]
# ERCC files first
files = [file for file in files if "ERCCsamplewise" in file] + \
[file for file in files if "ERCCsamplewise" not in file]
print(len(files))

4


In [82]:
files

['../results/slamseq_4_degrons_v1/4_degrons_v1_GFI1_2h_ERCCsamplewise_0_5_deseq_readcounts.csv',
 '../results/slamseq_4_degrons_v1/4_degrons_v1_GFI1_2h_ERCCsamplewise_0_5_deseq_tccounts.csv',
 '../results/slamseq_4_degrons_v1/4_degrons_v1_GFI1_2h_0_5_deseq_readcounts.csv',
 '../results/slamseq_4_degrons_v1/4_degrons_v1_GFI1_2h_0_5_deseq_tccounts.csv']

In [83]:
# project 3
conditions = ["2h"]
scaling = ["ERCCsamplewise", None]
count_type = ["readcounts", "tccounts"]

### make info rows

In [84]:
# df with file info
df_info = pd.DataFrame(list(zip(["project", "file", "scale", "condition", "type"],
                                ["project", "file", "scale", "condition", "type"])))

In [85]:
df_info = pd.concat([df_info] + [pd.DataFrame([x*6 for x in get_file_info(project, file, conditions)]) \
                                 for file in files], axis=1)
df_info.shape

(5, 26)

In [86]:
df_info.head()

Unnamed: 0,0,1,0.1,1.1,2,3,4,5,0.2,1.2,...,2.1,3.1,4.1,5.1,0.3,1.3,2.2,3.2,4.2,5.2
0,project,project,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,...,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1,4_degrons_v1
1,file,file,4_degrons_v1_GFI1_2h_ERCCsamplewise_0_5_deseq_...,4_degrons_v1_GFI1_2h_ERCCsamplewise_0_5_deseq_...,4_degrons_v1_GFI1_2h_ERCCsamplewise_0_5_deseq_...,4_degrons_v1_GFI1_2h_ERCCsamplewise_0_5_deseq_...,4_degrons_v1_GFI1_2h_ERCCsamplewise_0_5_deseq_...,4_degrons_v1_GFI1_2h_ERCCsamplewise_0_5_deseq_...,4_degrons_v1_GFI1_2h_ERCCsamplewise_0_5_deseq_...,4_degrons_v1_GFI1_2h_ERCCsamplewise_0_5_deseq_...,...,4_degrons_v1_GFI1_2h_0_5_deseq_readcounts.csv,4_degrons_v1_GFI1_2h_0_5_deseq_readcounts.csv,4_degrons_v1_GFI1_2h_0_5_deseq_readcounts.csv,4_degrons_v1_GFI1_2h_0_5_deseq_readcounts.csv,4_degrons_v1_GFI1_2h_0_5_deseq_tccounts.csv,4_degrons_v1_GFI1_2h_0_5_deseq_tccounts.csv,4_degrons_v1_GFI1_2h_0_5_deseq_tccounts.csv,4_degrons_v1_GFI1_2h_0_5_deseq_tccounts.csv,4_degrons_v1_GFI1_2h_0_5_deseq_tccounts.csv,4_degrons_v1_GFI1_2h_0_5_deseq_tccounts.csv
2,scale,scale,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,...,,,,,,,,,,
3,condition,condition,2h,2h,2h,2h,2h,2h,2h,2h,...,2h,2h,2h,2h,2h,2h,2h,2h,2h,2h
4,type,type,readcounts,readcounts,readcounts,readcounts,readcounts,readcounts,tccounts,tccounts,...,readcounts,readcounts,readcounts,readcounts,tccounts,tccounts,tccounts,tccounts,tccounts,tccounts


### get dfs and merge

In [87]:
# Get list of deseq dfs
deseq_dfs = [get_deseq_data(file) for file in files]

In [88]:
# main df
df_merged = pd.DataFrame()
df_merged = df_merged.append(deseq_dfs[0][["genes", "gene_id"]]) # add gene cols to main df

In [89]:
# join df columns on gene & gene_id columns
df_merged = reduce(lambda left, right: pd.merge(left, right, on=['genes', "gene_id"], how="outer"), 
                   [df_merged] + deseq_dfs)
df_merged.shape

(16744, 26)

In [90]:
df_merged.iloc[:2]

Unnamed: 0,genes,gene_id,baseMean_4_degrons_v1_GFI1_2h_ERCCsamplewise_0_5_deseq_readcounts,log2FoldChange_4_degrons_v1_GFI1_2h_ERCCsamplewise_0_5_deseq_readcounts,lfcSE_4_degrons_v1_GFI1_2h_ERCCsamplewise_0_5_deseq_readcounts,stat_4_degrons_v1_GFI1_2h_ERCCsamplewise_0_5_deseq_readcounts,pvalue_4_degrons_v1_GFI1_2h_ERCCsamplewise_0_5_deseq_readcounts,padj_4_degrons_v1_GFI1_2h_ERCCsamplewise_0_5_deseq_readcounts,baseMean_4_degrons_v1_GFI1_2h_ERCCsamplewise_0_5_deseq_tccounts,log2FoldChange_4_degrons_v1_GFI1_2h_ERCCsamplewise_0_5_deseq_tccounts,...,lfcSE_4_degrons_v1_GFI1_2h_0_5_deseq_readcounts,stat_4_degrons_v1_GFI1_2h_0_5_deseq_readcounts,pvalue_4_degrons_v1_GFI1_2h_0_5_deseq_readcounts,padj_4_degrons_v1_GFI1_2h_0_5_deseq_readcounts,baseMean_4_degrons_v1_GFI1_2h_0_5_deseq_tccounts,log2FoldChange_4_degrons_v1_GFI1_2h_0_5_deseq_tccounts,lfcSE_4_degrons_v1_GFI1_2h_0_5_deseq_tccounts,stat_4_degrons_v1_GFI1_2h_0_5_deseq_tccounts,pvalue_4_degrons_v1_GFI1_2h_0_5_deseq_tccounts,padj_4_degrons_v1_GFI1_2h_0_5_deseq_tccounts
0,A2M,A2M,1.139918,-0.772628,1.415723,0.545748,0.585239,,0.090347,0.743037,...,1.396028,0.540517,0.588841,,0.091919,0.690072,3.352471,-0.20584,0.836916,
1,A2MP1,A2MP1,0.891307,0.244643,1.717638,-0.14243,0.88674,,0.0,-0.0,...,1.77853,-0.131773,0.895163,,0.0,-0.0,,,0.0,


### add info columns

In [91]:
print(df_info.shape)
print(df_merged.shape)

(5, 26)
(16744, 26)


In [92]:
# reset column index
df_info = df_info.T.reset_index(drop=True).T

In [93]:
%%time

# combine info and data
df_save = df_info.append(df_merged, ignore_index=True)
df_save.shape

CPU times: user 8 ms, sys: 4 ms, total: 12 ms
Wall time: 13.1 ms


(16749, 52)

### save merged file

In [94]:
# save file
res_dir = "../results"
deseq_res_file = "slamseq_{}_{}_0_5_deseq_output.csv".format(project, gene)

df_save.to_csv("{}/{}/{}".format(res_dir, project_dir, deseq_res_file), header=False, index=False)

print("Saved DESeq results to: {}/{}".format(project_dir, deseq_res_file))

Saved DESeq results to: slamseq_4_degrons_v1/slamseq_4_degrons_v1_GFI1_0_5_deseq_output.csv


# Merge all no alt deseq outputs

In [119]:
project_dirs = ["slamseq_IRF2BP2_degraded_v1_res_v4",
                "slamseq_MYB_degraded_v1_res_v7",
                "slamseq_4_degrons_v1"]

deseq_res_dirs = ["{}/{}".format(res_dir, project_dir) for project_dir in project_dirs]

In [163]:
deseq_files = [glob.glob(deseq_res_dir+"/*output.csv") for deseq_res_dir in deseq_res_dirs]
deseq_files = [item for sublist in deseq_files for item in sublist]
deseq_files

['../results/slamseq_IRF2BP2_degraded_v1_res_v4/slamseq_IRF2BP2_degraded_v1_res_v4_0_5_deseq_output.csv',
 '../results/slamseq_MYB_degraded_v1_res_v7/slamseq_MYB_degraded_v1_res_v7_0_5_deseq_output.csv',
 '../results/slamseq_4_degrons_v1/slamseq_4_degrons_v1_SPI1_0_5_deseq_output.csv',
 '../results/slamseq_4_degrons_v1/slamseq_4_degrons_v1_GFI1_0_5_deseq_output.csv',
 '../results/slamseq_4_degrons_v1/slamseq_4_degrons_v1_RUNX1_0_5_deseq_output.csv',
 '../results/slamseq_4_degrons_v1/slamseq_4_degrons_v1_RUNX2_0_5_deseq_output.csv']

In [164]:
deseq_dfs = [pd.read_csv(deseq_file, index_col=False, header=None, squeeze=True) for deseq_file in deseq_files]

  deseq_dfs = [pd.read_csv(deseq_file, index_col=False, header=None, squeeze=True) for deseq_file in deseq_files]


In [168]:
deseq_dfs[0].iloc[5:10]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
5,genes,gene_id,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,baseMean,log2FoldChange,...,lfcSE,stat,pvalue,padj,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
6,NAT1,NAT1,95.87059628671004,-0.4618119842755553,0.1838993165621769,2.5112218626403395,0.0120314054862605,0.1764824098229201,5.033275352515851,-0.5602362002955328,...,0.2499835106626506,1.345021316023153,0.1786183509749246,0.9182518409151246,0.7526302401777346,2.0042049937800543,1.8278146338544723,-1.0965034181576785,0.2728585282994329,
7,AAMP,AAMP,842.3716209229535,-0.1476064480241523,0.1180792991839148,1.2500620264882107,0.2112768900493981,0.621101974228759,14.553492083441196,0.3101726260809158,...,0.1436118349988333,-1.174214545359719,0.2403091013937329,0.9594495599032664,7.433181480720183,0.2842585712308931,0.6006672968377171,-0.4732379683851699,0.6360434123857338,
8,AARS1,AARS1,1374.1764868232967,-0.0252481964182913,0.1093729310308428,0.2308450196984433,0.8174352049833612,0.957161656507632,16.200285486653765,-0.1372280711065793,...,0.1671727054204942,-0.6585513747031996,0.5101838964077127,0.9966744113218344,11.302473594467742,0.0730244615436611,0.4905015559836879,-0.1488771251646949,0.8816505915117299,
9,ABAT,ABAT,55.9224048891824,-0.3585220629160938,0.2117003248291207,1.6935357241680382,0.0903535409020921,0.4386939785389467,1.693745222699288,-0.9499849425240666,...,0.2681252539633148,0.426446279789382,0.6697826846194971,0.9966744113218344,0.6345738591917904,-1.0072747380631193,2.0802920535413554,0.4841987144778059,0.6282448491442183,


In [125]:
# make empty df
df_save_data = pd.DataFrame()
df_save_info = pd.DataFrame()

In [132]:
# seperate info and data
df_res_info = df_res.iloc[:5]
df_res_data = df_res.iloc[5:]

## Merge info data

In [181]:
def merge_info_cols(df1, df2):
    ''' merged info columns from 2 info dfs '''
    df_merged = pd.merge(df1, df2, left_on=[0,1], right_on=[0,1])
    df_merged = df_merged.T.reset_index(drop=True).T
    if (df1.shape[1]+df2.shape[1]-2) != df_merged.shape[1]:
        raise ValueError("Incorrect number of columns.\
                          df1: {}\
                          df2: {}".format(df1.shape, df2.shape))
    return(df_merged)

In [184]:
# IRF2BP2
df_info_merged = deseq_dfs[0].iloc[:5]

# merge info columns
for df_res in deseq_dfs[1:]:
    print("df1 cols: {} | df2 cols: {}".format(df_info_merged.shape[1], df_res.shape[1]))
    df_info = df_res.iloc[:5]
    df_info_merged = merge_info_cols(df_info_merged, df_info)
    print(df_info_merged.shape[1])

df1 cols: 50 | df2 cols: 26
74
df1 cols: 74 | df2 cols: 52
124
df1 cols: 124 | df2 cols: 52
174
df1 cols: 174 | df2 cols: 52
224
df1 cols: 224 | df2 cols: 52
274


## Merge gene data

In [193]:
# Get list of deseq dfs
deseq_dfs = [get_deseq_data(file) for file in deseq_files]
print(len(deseq_dfs))
deseq_dfs[0].iloc[:1]

  deseq_dfs = [get_deseq_data(file) for file in deseq_files]


6


Unnamed: 0,project_slamseq_IRF2BP2_degraded_v1_res_v4_0_5_deseq_output,project.1_slamseq_IRF2BP2_degraded_v1_res_v4_0_5_deseq_output,IRF2BP2_degraded_v1_slamseq_IRF2BP2_degraded_v1_res_v4_0_5_deseq_output,IRF2BP2_degraded_v1.1_slamseq_IRF2BP2_degraded_v1_res_v4_0_5_deseq_output,IRF2BP2_degraded_v1.2_slamseq_IRF2BP2_degraded_v1_res_v4_0_5_deseq_output,IRF2BP2_degraded_v1.3_slamseq_IRF2BP2_degraded_v1_res_v4_0_5_deseq_output,IRF2BP2_degraded_v1.4_slamseq_IRF2BP2_degraded_v1_res_v4_0_5_deseq_output,IRF2BP2_degraded_v1.5_slamseq_IRF2BP2_degraded_v1_res_v4_0_5_deseq_output,IRF2BP2_degraded_v1.6_slamseq_IRF2BP2_degraded_v1_res_v4_0_5_deseq_output,IRF2BP2_degraded_v1.7_slamseq_IRF2BP2_degraded_v1_res_v4_0_5_deseq_output,...,IRF2BP2_degraded_v1.38_slamseq_IRF2BP2_degraded_v1_res_v4_0_5_deseq_output,IRF2BP2_degraded_v1.39_slamseq_IRF2BP2_degraded_v1_res_v4_0_5_deseq_output,IRF2BP2_degraded_v1.40_slamseq_IRF2BP2_degraded_v1_res_v4_0_5_deseq_output,IRF2BP2_degraded_v1.41_slamseq_IRF2BP2_degraded_v1_res_v4_0_5_deseq_output,IRF2BP2_degraded_v1.42_slamseq_IRF2BP2_degraded_v1_res_v4_0_5_deseq_output,IRF2BP2_degraded_v1.43_slamseq_IRF2BP2_degraded_v1_res_v4_0_5_deseq_output,IRF2BP2_degraded_v1.44_slamseq_IRF2BP2_degraded_v1_res_v4_0_5_deseq_output,IRF2BP2_degraded_v1.45_slamseq_IRF2BP2_degraded_v1_res_v4_0_5_deseq_output,IRF2BP2_degraded_v1.46_slamseq_IRF2BP2_degraded_v1_res_v4_0_5_deseq_output,IRF2BP2_degraded_v1.47_slamseq_IRF2BP2_degraded_v1_res_v4_0_5_deseq_output
0,file,file,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,...,IRF2BP2_degraded_v1_0_5_6h_deseq_readcounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_readcounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_readcounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_readcounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_tccounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_tccounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_tccounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_tccounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_tccounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_tccounts.csv


In [190]:
deseq_dfs[0].columns.tolist()

['project_slamseq_IRF2BP2_degraded_v1_res_v4_0_5_deseq_output',
 'project.1_slamseq_IRF2BP2_degraded_v1_res_v4_0_5_deseq_output',
 'IRF2BP2_degraded_v1_slamseq_IRF2BP2_degraded_v1_res_v4_0_5_deseq_output',
 'IRF2BP2_degraded_v1.1_slamseq_IRF2BP2_degraded_v1_res_v4_0_5_deseq_output',
 'IRF2BP2_degraded_v1.2_slamseq_IRF2BP2_degraded_v1_res_v4_0_5_deseq_output',
 'IRF2BP2_degraded_v1.3_slamseq_IRF2BP2_degraded_v1_res_v4_0_5_deseq_output',
 'IRF2BP2_degraded_v1.4_slamseq_IRF2BP2_degraded_v1_res_v4_0_5_deseq_output',
 'IRF2BP2_degraded_v1.5_slamseq_IRF2BP2_degraded_v1_res_v4_0_5_deseq_output',
 'IRF2BP2_degraded_v1.6_slamseq_IRF2BP2_degraded_v1_res_v4_0_5_deseq_output',
 'IRF2BP2_degraded_v1.7_slamseq_IRF2BP2_degraded_v1_res_v4_0_5_deseq_output',
 'IRF2BP2_degraded_v1.8_slamseq_IRF2BP2_degraded_v1_res_v4_0_5_deseq_output',
 'IRF2BP2_degraded_v1.9_slamseq_IRF2BP2_degraded_v1_res_v4_0_5_deseq_output',
 'IRF2BP2_degraded_v1.10_slamseq_IRF2BP2_degraded_v1_res_v4_0_5_deseq_output',
 'IRF2BP2_deg

In [188]:
# main df
df_data_merged = pd.DataFrame()
df_data_merged = df_data_merged.append(deseq_dfs[0][["genes", "gene_id"]]) # add gene cols to main df

KeyError: "None of [Index(['genes', 'gene_id'], dtype='object')] are in the [columns]"

In [112]:
deseq_res_file = ! ls $deseq_res_dir/*output.csv
deseq_res_file = deseq_res_file[0]
deseq_res_file

'../results/slamseq_IRF2BP2_degraded_v1_res_v4/slamseq_IRF2BP2_degraded_v1_res_v4_0_5_deseq_output.csv'

In [114]:
df_save.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,project,project,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,...,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1
1,file,file,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,...,IRF2BP2_degraded_v1_0_5_6h_deseq_readcounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_readcounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_readcounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_readcounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_tccounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_tccounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_tccounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_tccounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_tccounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_tccounts.csv
2,scale,scale,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,...,,,,,,,,,,
3,condition,condition,3h,3h,3h,3h,3h,3h,3h,3h,...,6h,6h,6h,6h,6h,6h,6h,6h,6h,6h
4,type,type,readcounts,readcounts,readcounts,readcounts,readcounts,readcounts,tccounts,tccounts,...,readcounts,readcounts,readcounts,readcounts,tccounts,tccounts,tccounts,tccounts,tccounts,tccounts


In [115]:
df_save_info = df_save.iloc[:5,]
df_save_info

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,project,project,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,...,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1
1,file,file,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,...,IRF2BP2_degraded_v1_0_5_6h_deseq_readcounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_readcounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_readcounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_readcounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_tccounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_tccounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_tccounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_tccounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_tccounts.csv,IRF2BP2_degraded_v1_0_5_6h_deseq_tccounts.csv
2,scale,scale,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,...,,,,,,,,,,
3,condition,condition,3h,3h,3h,3h,3h,3h,3h,3h,...,6h,6h,6h,6h,6h,6h,6h,6h,6h,6h
4,type,type,readcounts,readcounts,readcounts,readcounts,readcounts,readcounts,tccounts,tccounts,...,readcounts,readcounts,readcounts,readcounts,tccounts,tccounts,tccounts,tccounts,tccounts,tccounts


In [135]:
df_res_data = df_res.iloc[5:]

# Looking into data

In [258]:
#df_MYB = 
df_save.loc[:,df_save list(df_save.iloc[0])

True

In [260]:
list(df_save.iloc[0])

['project',
 'project',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degraded_v1',
 'IRF2BP2_degrad

In [256]:
project2

'MYB_degraded_v1'

## Testing

In [16]:
# read in deseq table
print(files[0])
df1 = pd.read_csv(files[0], header=[0], index_col=None, squeeze=True)
df1.iloc[:2]

../results/slamseq_IRF2BP2_degraded_v1/IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_deseq_readcounts.csv


Unnamed: 0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,genes,gene_id
0,98.73474,-0.35887,0.179587,1.998309,0.045683,0.557313,NAT1,NAT1
1,872.500027,-0.041895,0.116618,0.35925,0.719408,0.968449,AAMP,AAMP


In [17]:
df1.shape

(15112, 8)

In [18]:
print(len(df1.genes.unique()))
print(len(df1.genes.drop_duplicates(keep=False)))
print(df1.shape[0] - len(df1.genes.drop_duplicates(keep=False)))
print(len(df1.genes.unique()) - len(df1.genes.drop_duplicates(keep=False)))

15112
15112
0
0


In [21]:
["{}_{}".format(col_name, os.path.basename(files[0]).split(".")[0]) if "gene" not in col_name \
 else col_name for col_name in df1.columns.tolist()]

['baseMean_IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_deseq_readcounts',
 'log2FoldChange_IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_deseq_readcounts',
 'lfcSE_IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_deseq_readcounts',
 'stat_IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_deseq_readcounts',
 'pvalue_IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_deseq_readcounts',
 'padj_IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_deseq_readcounts',
 'genes',
 'gene_id']

In [22]:
# read in deseq table
print(files[1])
df2 = pd.read_csv(files[1], header=[0], index_col=None, squeeze=True)
df2.iloc[:2]

../results/slamseq_IRF2BP2_degraded_v1/IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_deseq_tccounts.csv


Unnamed: 0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,genes,gene_id
0,5.02174,-0.554785,0.739133,0.750588,0.4529,,NAT1,NAT1
1,14.549761,0.316474,0.385427,-0.821099,0.41159,0.642625,AAMP,AAMP


In [24]:
#deseq_dfs = map(files, get_deseq_data)
deseq_dfs = [get_deseq_data(file) for file in files[:3]]
deseq_dfs[0].iloc[:1]

Unnamed: 0,baseMean_IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_deseq_readcounts,log2FoldChange_IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_deseq_readcounts,lfcSE_IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_deseq_readcounts,stat_IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_deseq_readcounts,pvalue_IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_deseq_readcounts,padj_IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_deseq_readcounts,genes,gene_id
0,98.73474,-0.35887,0.179587,1.998309,0.045683,0.557313,NAT1,NAT1


In [25]:
# main df
df_merged = pd.DataFrame()
df_merged = df_merged.append(deseq_dfs[0][["genes", "gene_id"]]) # add gene cols to main df
df_merged.head()

Unnamed: 0,genes,gene_id
0,NAT1,NAT1
1,AAMP,AAMP
2,AARS1,AARS1
3,ABAT,ABAT
4,ABCA1,ABCA1


In [26]:
# join df columns on gene & gene_id columns
df_merged = reduce(lambda left, right: pd.merge(left, right, on=['genes', "gene_id"], how="outer"), 
                   [df_merged] + deseq_dfs)
df_merged.shape

(15112, 20)

In [27]:
# change column names
df_merged.columns = [col_name.split("_")[0] if "gene" not in col_name else 
                     col_name for col_name in df_merged.columns.tolist()]
df_merged = df_merged.T.reset_index().T.reset_index(drop=True)

In [28]:
df_merged.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,genes,gene_id,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
1,NAT1,NAT1,98.73474,-0.35887,0.179587,1.998309,0.045683,0.557313,5.02174,-0.554785,0.739133,0.750588,0.4529,,105.179117,-0.435531,0.261972,1.662513,0.09641,0.616449
2,AAMP,AAMP,872.500027,-0.041895,0.116618,0.35925,0.719408,0.968449,14.549761,0.316474,0.385427,-0.821099,0.41159,0.642625,971.817384,0.058489,0.136273,-0.429207,0.667772,0.989724
3,AARS1,AARS1,1426.956869,0.083048,0.107111,-0.775339,0.438139,0.906009,16.042542,-0.109203,0.395866,0.275859,0.782656,0.886222,890.019221,-0.00236,0.161838,0.014582,0.988365,0.999179
4,ABAT,ABAT,57.515186,-0.258913,0.20369,1.271113,0.203688,0.808742,1.695255,-0.946442,1.260847,0.75064,0.452869,,42.89278,-0.2227,0.265193,0.839767,0.401039,0.944698


In [19]:
df_merged.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,genes,gene_id,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
1,NAT1,NAT1,95.678631,-0.463807,0.183351,2.529615,0.011419,0.162526,5.023216,-0.554238,0.738925,0.75006,0.453218,,105.239049,-0.433213,0.261849,1.65444,0.098038,0.646734
2,AAMP,AAMP,841.211188,-0.150545,0.117138,1.285188,0.198727,0.58154,14.561153,0.317738,0.385701,-0.823793,0.410057,0.647203,972.517055,0.060776,0.136377,-0.44565,0.65585,0.995046
3,AARS1,AARS1,1373.801586,-0.025304,0.10876,0.232656,0.816029,0.954353,16.055735,-0.107774,0.395963,0.272182,0.785482,0.888308,890.639308,-0.00008,0.161873,0.000495,0.999605,0.99988
4,ABAT,ABAT,55.753788,-0.362282,0.21001,1.725072,0.084514,0.404263,1.69655,-0.944984,1.260997,0.749394,0.45362,,42.920747,-0.220415,0.265135,0.831329,0.405788,0.961633


which genes are duplicated

why are the values so different for some unaffected genes

In [42]:
test_counts = "../results/slamseq_IRF2BP2_degraded_v1_dup/IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_deseq_readcounts.csv"
df_test = pd.read_csv(test_counts, header=[0], index_col=None, squeeze=True)

In [43]:
duplicate_genes = list(set(df_test.genes.unique()) - set(df_test.genes.drop_duplicates(keep=False)))

In [45]:
print(len(duplicate_genes))
df_dup = df_test.loc[df_test['genes'].isin(duplicate_genes)]
print(df_dup.shape)

27
(54, 8)


In [49]:
df_dedup = df1.loc[df1['genes'].isin(duplicate_genes)]

In [53]:
df_dup.loc[df_dup['genes'] == "CHML"]

Unnamed: 0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,genes,gene_id
534,8319.176123,-0.093807,0.119186,0.787069,0.431242,0.77892,CHML,CHML
6018,3965.963276,0.065072,0.13196,-0.493116,0.62193,0.878732,CHML,CHML


In [54]:
df_dedup.loc[df_dedup['genes'] == "CHML"]

Unnamed: 0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,genes,gene_id
534,12746.248337,0.063397,0.114081,-0.555717,0.578405,0.941269,CHML,CHML


In [59]:
df_test.loc[df_test['genes'].isin(["MYC", "MYB", "IRF2BP2", "MEIS1", "MYB"])]

Unnamed: 0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,genes,gene_id
1790,727.339448,-0.114417,0.113961,1.004001,0.3153782,0.6937536,MEIS1,MEIS1
1902,3919.460999,-0.100624,0.127026,0.792155,0.4282701,0.7769807,MYB,MYB
1906,1757.343901,-0.127836,0.121284,1.054029,0.2918698,0.6784911,MYC,MYC
12650,15280.599449,0.90623,0.094478,-9.591931,8.645148e-22,1.317088e-18,IRF2BP2,IRF2BP2


In [60]:
df1.loc[df1['genes'].isin(["MYC", "MYB", "IRF2BP2", "MEIS1", "MYB"])]

Unnamed: 0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,genes,gene_id
1790,753.976244,-0.008948,0.106854,0.083742,0.9332619,0.9950914,MEIS1,MEIS1
1902,4062.431874,0.004507,0.120156,-0.037512,0.9700771,0.9970766,MYB,MYB
1906,1822.255867,-0.02071,0.11789,0.175672,0.8605519,0.9866435,MYC,MYC
12647,16046.862685,1.013079,0.087115,-11.629256,2.926355e-31,5.4530540000000005e-28,IRF2BP2,IRF2BP2


In [20]:
print(df_info.shape)
print(df_merged.shape)

(5, 20)
(15518, 20)


In [21]:
df_info.shape[0] + df_merged.shape[0]

15523

In [22]:
df_info = df_info.T.reset_index(drop=True).T # reset column index

In [23]:
df_info

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,project,project,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1
1,file,file,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_6h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_6h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_6h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_6h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_6h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_6h_dese...
2,scale,scale,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise
3,condition,condition,3h,3h,3h,3h,3h,3h,3h,3h,3h,3h,3h,3h,6h,6h,6h,6h,6h,6h
4,type,type,readcounts,readcounts,readcounts,readcounts,readcounts,readcounts,tccounts,tccounts,tccounts,tccounts,tccounts,tccounts,readcounts,readcounts,readcounts,readcounts,readcounts,readcounts


In [24]:
df_merged.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,genes,gene_id,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
1,NAT1,NAT1,95.678631,-0.463807,0.183351,2.529615,0.011419,0.162526,5.023216,-0.554238,0.738925,0.75006,0.453218,,105.239049,-0.433213,0.261849,1.65444,0.098038,0.646734
2,AAMP,AAMP,841.211188,-0.150545,0.117138,1.285188,0.198727,0.58154,14.561153,0.317738,0.385701,-0.823793,0.410057,0.647203,972.517055,0.060776,0.136377,-0.44565,0.65585,0.995046
3,AARS1,AARS1,1373.801586,-0.025304,0.10876,0.232656,0.816029,0.954353,16.055735,-0.107774,0.395963,0.272182,0.785482,0.888308,890.639308,-0.00008,0.161873,0.000495,0.999605,0.99988
4,ABAT,ABAT,55.753788,-0.362282,0.21001,1.725072,0.084514,0.404263,1.69655,-0.944984,1.260997,0.749394,0.45362,,42.920747,-0.220415,0.265135,0.831329,0.405788,0.961633


In [35]:
%%time

# combine info and data
df_save = df_info.append(df_merged, ignore_index=True)

CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 4.03 ms


In [36]:
df_save.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,project,project,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1
1,file,file,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_6h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_6h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_6h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_6h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_6h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_6h_dese...
2,scale,scale,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise
3,condition,condition,3h,3h,3h,3h,3h,3h,3h,3h,3h,3h,3h,3h,6h,6h,6h,6h,6h,6h
4,type,type,readcounts,readcounts,readcounts,readcounts,readcounts,readcounts,tccounts,tccounts,tccounts,tccounts,tccounts,tccounts,readcounts,readcounts,readcounts,readcounts,readcounts,readcounts


## Save grouped data

In [37]:
# save file
res_dir = "../results"
deseq_res_file = "slamseq_merged_deseq.csv"

df_save.to_csv("{}/{}".format(res_dir, deseq_res_file), header=False, index=False)

print("Saved DESeq results to: {}/{}".format(res_dir, deseq_res_file))

Saved DESeq results to: ../results/slamseq_merged_deseq.csv


## MYB_degraded_v1 (project 2)

In [167]:
# read in merged deseq file
res_dir = "../results"
deseq_res_file = "slamseq_merged_deseq.csv"

df_save = pd.read_csv("{}/{}".format(res_dir, deseq_res_file), index_col=False, header=None, squeeze=True)
print(df_save.shape)
df_save.head()

(15523, 20)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,project,project,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1,IRF2BP2_degraded_v1
1,file,file,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_3h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_6h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_6h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_6h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_6h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_6h_dese...,IRF2BP2_degraded_v1_ERCCsamplewise_0_5_6h_dese...
2,scale,scale,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise
3,condition,condition,3h,3h,3h,3h,3h,3h,3h,3h,3h,3h,3h,3h,6h,6h,6h,6h,6h,6h
4,type,type,readcounts,readcounts,readcounts,readcounts,readcounts,readcounts,tccounts,tccounts,tccounts,tccounts,tccounts,tccounts,readcounts,readcounts,readcounts,readcounts,readcounts,readcounts


In [255]:
project = project2
project_dir = "slamseq_{}".format(project)

In [256]:
files = ! ls $res_dir/$project_dir/*.csv
files = [file for file in files if "deseq" in file if "drop" not in file]
# ERCC files first
files = [file for file in files if "ERCCsamplewise" in file] + \
[file for file in files if "ERCCsamplewise" not in file]

In [257]:
files

['../results/slamseq_MYB_degraded_v1/MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts.csv',
 '../results/slamseq_MYB_degraded_v1/MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_tccounts.csv',
 '../results/slamseq_MYB_degraded_v1/MYB_degraded_v1_0_5_1h_deseq_readcounts.csv',
 '../results/slamseq_MYB_degraded_v1/MYB_degraded_v1_0_5_1h_deseq_tccounts.csv']

In [258]:
# project 2
conditions = ["1h"]
scaling = ["ERCCsamplewise", None]
count_type = ["readcounts", "tccounts"]

Get row info

In [260]:
# df with file info
df_info = pd.DataFrame(list(zip(["project", "file", "scale", "condition", "type"],
                                ["project", "file", "scale", "condition", "type"])))

In [261]:
df_info = pd.concat([df_info] + [pd.DataFrame([x*6 for x in get_file_info(project, file, conditions)]) \
                                 for file in files], axis=1)
df_info = df_info.T.reset_index(drop=True).T # reset column index
df_info.shape

(5, 26)

In [262]:
df_info

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,project,project,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,...,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1
1,file,file,MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_re...,MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_re...,MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_re...,MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_re...,MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_re...,MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_re...,MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_tc...,MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_tc...,...,MYB_degraded_v1_0_5_1h_deseq_readcounts.csv,MYB_degraded_v1_0_5_1h_deseq_readcounts.csv,MYB_degraded_v1_0_5_1h_deseq_readcounts.csv,MYB_degraded_v1_0_5_1h_deseq_readcounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv
2,scale,scale,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,ERCCsamplewise,...,,,,,,,,,,
3,condition,condition,1h,1h,1h,1h,1h,1h,1h,1h,...,1h,1h,1h,1h,1h,1h,1h,1h,1h,1h
4,type,type,readcounts,readcounts,readcounts,readcounts,readcounts,readcounts,tccounts,tccounts,...,readcounts,readcounts,readcounts,readcounts,tccounts,tccounts,tccounts,tccounts,tccounts,tccounts


Get deseq data

In [263]:
def get_deseq_data(file):
    ''' input deseq file & returns df '''
    ''' add file name to headers except gene columns '''
    df = pd.read_csv(file, header=[0], index_col=None, squeeze=True)
    df.columns = ["{}_{}".format(col_name, os.path.basename(file).split(".")[0]) if "gene" not in col_name
                  else col_name for col_name in df.columns.tolist()]
    return(df)

In [266]:
deseq_dfs = [get_deseq_data(file) for file in files]
print(deseq_dfs[0].shape)
deseq_dfs[0].iloc[:2]

(16558, 8)


Unnamed: 0,baseMean_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,log2FoldChange_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,lfcSE_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,stat_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,pvalue_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,padj_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,genes,gene_id
0,1.877541,-0.045789,1.010641,0.045307,0.963863,,A1BG,A1BG
1,5.229468,-0.26904,0.500213,0.537851,0.59068,,A2MP1,A2MP1


In [267]:
for df in deseq_dfs:
    print(df.shape)

(16558, 8)
(16558, 8)
(16558, 8)
(16558, 8)


In [268]:
[os.path.basename(x) for x in files]

['MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts.csv',
 'MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_tccounts.csv',
 'MYB_degraded_v1_0_5_1h_deseq_readcounts.csv',
 'MYB_degraded_v1_0_5_1h_deseq_tccounts.csv']

In [269]:
df1 = deseq_dfs[0]
df2 = deseq_dfs[1]
df3 = deseq_dfs[2]
df4 = deseq_dfs[3]

In [270]:
genes1 = list(df1["genes"])
genes2 = list(df2["genes"])
genes3 = list(df3["genes"])
genes4 = list(df4["genes"])

In [271]:
np.logical_and(np.array_equal(df1["genes"].unique(), df2["genes"].unique()), 
               np.array_equal(df2["genes"].unique(), df3["genes"].unique()))

True

In [272]:
len(list(df1["genes"])) == len(set(df1["genes"].unique()))

True

In [275]:
merge_test = reduce(lambda x, y: pd.merge(x, y, on = ["genes", "gene_id"], how="outer"), [df1, df2])

In [277]:
print(merge_test.shape)
merge_test.head()

(16558, 14)


Unnamed: 0,baseMean_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,log2FoldChange_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,lfcSE_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,stat_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,pvalue_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,padj_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,genes,gene_id,baseMean_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_tccounts,log2FoldChange_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_tccounts,lfcSE_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_tccounts,stat_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_tccounts,pvalue_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_tccounts,padj_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_tccounts
0,1.877541,-0.045789,1.010641,0.045307,0.963863,,A1BG,A1BG,0.05833,-0.614498,3.11654,0.197173,0.843692,
1,5.229468,-0.26904,0.500213,0.537851,0.59068,,A2MP1,A2MP1,0.0,-0.0,,,0.0,
2,58.517384,0.013887,0.186521,-0.074452,0.940651,0.987822,NAT1,NAT1,4.852925,0.551781,0.560071,-0.985199,0.324526,0.837918
3,2.316485,-0.735553,0.776528,0.947233,0.34352,,AADAC,AADAC,0.0,-0.0,,,0.0,
4,340.671808,-0.130751,0.092915,1.407202,0.159367,0.743707,AAMP,AAMP,9.364142,0.216094,0.34192,-0.632001,0.527386,0.915144


In [196]:
reduce(lambda x, y: pd.merge(x, y, on = "genes"), [df1.iloc[:10000], df2.iloc[:10000]])["genes"].tolist()

['A1BG',
 'A2MP1',
 'NAT1',
 'AADAC',
 'AAMP',
 'AARS1',
 'ABAT',
 'ABCA1',
 'ABCA2',
 'ABCB7',
 'ABL1',
 'AOC1',
 'ABL2',
 'ABO',
 'ABR',
 'ACAA1',
 'ACACB',
 'ACADM',
 'ACADS',
 'ACADSB',
 'ACADVL',
 'ACAT1',
 'ACAT2',
 'ASIC2',
 'ASIC1',
 'ACLY',
 'ACO1',
 'ACO2',
 'ACOX1',
 'ACP1',
 'ACP2',
 'ACP3',
 'ACRV1',
 'ACTB',
 'ACTG1',
 'ACTN4',
 'ACTL6A',
 'ACTN1',
 'ACVR1',
 'ACVR1B',
 'ACVR2A',
 'ACVR2B',
 'ACVRL1',
 'ACY1',
 'ACYP1',
 'ACYP2',
 'ADA',
 'ADAM8',
 'ADAM10',
 'ADAR',
 'ADARB1',
 'ADCY1',
 'ADCY2',
 'ADCY3',
 'ADCY5',
 'ADCY6',
 'ADCY7',
 'ADCY9',
 'ADCYAP1R1',
 'ADD1',
 'ADD2',
 'ADD3',
 'PLIN2',
 'ADH5',
 'ADH6',
 'ADK',
 'ADORA1',
 'ADORA2A',
 'ADORA2B',
 'ADORA3',
 'ADPRH',
 'PARP1',
 'PARP4',
 'ADRA1A',
 'ADRA2B',
 'ADRA2C',
 'ADRB1',
 'ADRB2',
 'GRK2',
 'GRK3',
 'ADSL',
 'ADSS2',
 'AP2A1',
 'AP2A2',
 'AP1B1',
 'AP2B1',
 'AP1G1',
 'TLE5',
 'CRISP1',
 'AFG3L1P',
 'AGA',
 'AGL',
 'JAG1',
 'AGT',
 'AHCY',
 'AHR',
 'CRYBG1',
 'AK1',
 'AK2',
 'AK4',
 'AKT1',
 'AKT2',
 'ALA

In [197]:
merge_test = reduce(lambda x, y: pd.merge(x, y, on = ["genes", "gene_id"], how="outer"), [df1, df2])

In [198]:
test_genes = merge_test["genes"].tolist()

In [199]:
len(test_genes)

16648

In [200]:
len(genes1)

16588

In [201]:
print(len(list([x for x in genes1 if genes1.count(x) > 1])))
print(len(set([x for x in genes1 if genes1.count(x) > 1])))

60
30


In [152]:
len(set(test_genes))

16558

In [153]:
set([x for x in test_genes if test_genes.count(x) > 1])

{'ACBD6',
 'ASB3',
 'CCDC26',
 'CHML',
 'FEZ1',
 'FNTB',
 'GNMT',
 'GPRASP2',
 'KLHL23',
 'KLRK1',
 'LINC00511',
 'LINC00598',
 'LINC01725',
 'LIPE-AS1',
 'MCL1',
 'MEF2B',
 'MICAL2',
 'MTPN',
 'NBL1',
 'PAK6',
 'PIGY',
 'S1PR3',
 'SCARNA9',
 'SCNM1',
 'SMIM8',
 'SPSB2',
 'STAG3L4',
 'TBC1D7',
 'TTN-AS1',
 'USP9Y'}

In [154]:
merge_test.loc[merge_test["genes"].isin(set([x for x in test_genes if test_genes.count(x) > 1]))]

Unnamed: 0,baseMean_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,log2FoldChange_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,lfcSE_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,stat_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,pvalue_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,padj_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,genes,gene_id,baseMean_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_tccounts,log2FoldChange_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_tccounts,lfcSE_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_tccounts,stat_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_tccounts,pvalue_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_tccounts,padj_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_tccounts
587,1949.022676,0.177454,0.137146,-1.293907,0.195698,0.756115,CHML,CHML,122.062052,0.472268,0.179008,-2.638255,8.333385e-03,1.432467e-01
588,1949.022676,0.177454,0.137146,-1.293907,0.195698,0.756115,CHML,CHML,57.521996,1.500506,0.181365,-8.273385,1.302081e-16,3.976554e-14
589,1221.280092,0.152483,0.090387,-1.687005,0.091602,0.700413,CHML,CHML,122.062052,0.472268,0.179008,-2.638255,8.333385e-03,1.432467e-01
590,1221.280092,0.152483,0.090387,-1.687005,0.091602,0.700413,CHML,CHML,57.521996,1.500506,0.181365,-8.273385,1.302081e-16,3.976554e-14
959,209.879610,-0.413183,0.124792,3.310983,0.000930,0.058866,S1PR3,S1PR3,35.707272,-0.391967,0.194360,2.016711,4.372564e-02,3.978295e-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15759,35.798328,0.089758,0.216731,-0.414143,0.678769,0.918913,LIPE-AS1,LIPE-AS1,1.000995,-0.538208,1.091397,0.493137,6.219158e-01,
15916,1.609619,-0.520566,1.020085,0.510316,0.609830,,LINC01725,LINC01725,0.000000,-0.000000,,,0.000000e+00,
15917,1.609619,-0.520566,1.020085,0.510316,0.609830,,LINC01725,LINC01725,0.000000,-0.000000,,,0.000000e+00,
15918,8.320226,0.301544,0.436010,-0.691598,0.489190,,LINC01725,LINC01725,0.000000,-0.000000,,,0.000000e+00,


In [114]:
reduce(lambda x, y: pd.merge(x, y, on = ["genes", "gene_id"], how="outer"), deseq_dfs)

Unnamed: 0,baseMean_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,log2FoldChange_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,lfcSE_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,stat_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,pvalue_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,padj_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,genes,gene_id,baseMean_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_tccounts,log2FoldChange_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_tccounts,...,lfcSE_MYB_degraded_v1_0_5_1h_deseq_readcounts,stat_MYB_degraded_v1_0_5_1h_deseq_readcounts,pvalue_MYB_degraded_v1_0_5_1h_deseq_readcounts,padj_MYB_degraded_v1_0_5_1h_deseq_readcounts,baseMean_MYB_degraded_v1_0_5_1h_deseq_tccounts,log2FoldChange_MYB_degraded_v1_0_5_1h_deseq_tccounts,lfcSE_MYB_degraded_v1_0_5_1h_deseq_tccounts,stat_MYB_degraded_v1_0_5_1h_deseq_tccounts,pvalue_MYB_degraded_v1_0_5_1h_deseq_tccounts,padj_MYB_degraded_v1_0_5_1h_deseq_tccounts
0,1.877539,-0.045684,1.010547,0.045207,0.963942,,A1BG,A1BG,0.058385,-0.615170,...,0.980114,-0.116922,0.906922,,0.058385,-0.500626,3.116540,0.160635,0.872381,
1,5.229387,-0.269041,0.500087,0.537989,0.590585,,A2MP1,A2MP1,0.000000,-0.000000,...,0.477972,0.372795,0.709301,,0.000000,-0.000000,,,0.000000,
2,58.516396,0.013917,0.186478,-0.074632,0.940507,0.987830,NAT1,NAT1,4.852990,0.551270,...,0.156367,-0.758833,0.447953,0.999815,5.267543,0.666107,0.567477,-1.173804,0.240473,0.774211
3,2.316444,-0.735572,0.776358,0.947465,0.343402,,AADAC,AADAC,0.000000,-0.000000,...,0.748054,0.906933,0.364442,,0.000000,-0.000000,,,0.000000,
4,340.667352,-0.130732,0.092874,1.407626,0.159242,0.743147,AAMP,AAMP,9.364263,0.215343,...,0.074923,0.335587,0.737182,0.999815,9.852376,0.296100,0.338771,-0.874041,0.382096,0.864729
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17003,82.429753,0.008600,0.168839,-0.050936,0.959376,0.990346,106736475,106736475,6.090411,0.107138,...,0.142142,-0.825469,0.409105,0.999815,6.362049,0.189380,0.438964,-0.431426,0.666159,0.953370
17004,13.209258,-0.102026,0.308889,0.330301,0.741173,0.934870,GET1-SH3BGR,GET1-SH3BGR,0.144147,-1.102433,...,0.294363,-0.007729,0.993833,,0.144147,-0.987976,2.583608,0.382402,0.702163,
17005,3.067087,-0.096211,0.639843,0.150367,0.880475,,BOLA2-SMG1P6,BOLA2-SMG1P6,0.140468,-0.134282,...,0.619041,-0.022964,0.981679,,0.146822,-0.019737,2.568065,0.007686,0.993868,
17006,153.879523,0.023166,0.124540,-0.186016,0.852432,0.968322,107303344,107303344,3.557956,0.031424,...,0.109675,-1.192979,0.232878,0.999815,3.741420,0.118501,0.541308,-0.218916,0.826715,0.974970


In [90]:
# main df
df_merged = pd.DataFrame()
df_merged = df_merged.append(deseq_dfs[0][["genes", "gene_id"]]) # add gene cols to main df
print(df_merged.shape)
df_merged.head()

(16588, 2)


Unnamed: 0,genes,gene_id
0,A1BG,A1BG
1,A2MP1,A2MP1
2,NAT1,NAT1
3,AADAC,AADAC
4,AAMP,AAMP


In [91]:
# join df columns on gene & gene_id columns
df_merged = reduce(lambda left, right: pd.merge(left, right, on=['genes', "gene_id"], how="outer"), 
                   [df_merged] + deseq_dfs)
print(df_merged.shape)

(17488, 26)


In [79]:
# change column names
df_merged.columns = [col_name.split("_")[0] if "gene" not in col_name else 
                     col_name for col_name in df_merged.columns.tolist()]
df_merged = df_merged.T.reset_index().T.reset_index(drop=True)

In [80]:
df_merged.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,genes,gene_id,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,baseMean,log2FoldChange,...,lfcSE,stat,pvalue,padj,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
1,A1BG,A1BG,1.877539,-0.045684,1.010547,0.045207,0.963942,,0.058385,-0.61517,...,0.980114,-0.116922,0.906922,,0.058385,-0.500626,3.11654,0.160635,0.872381,
2,A2MP1,A2MP1,5.229387,-0.269041,0.500087,0.537989,0.590585,,0.0,-0.0,...,0.477972,0.372795,0.709301,,0.0,-0.0,,,0.0,
3,NAT1,NAT1,58.516396,0.013917,0.186478,-0.074632,0.940507,0.98783,4.85299,0.55127,...,0.156367,-0.758833,0.447953,0.999815,5.267543,0.666107,0.567477,-1.173804,0.240473,0.774211
4,AADAC,AADAC,2.316444,-0.735572,0.776358,0.947465,0.343402,,0.0,-0.0,...,0.748054,0.906933,0.364442,,0.0,-0.0,,,0.0,


In [81]:
print(df_save.shape)
print(df_merged.shape)

(15523, 20)
(17489, 26)


# MYB_degraded_v7 (res_v7)

In [38]:
project = project2
res_version = "res_v7"
project_dir = "slamseq_{}_{}".format(project, res_version)

In [39]:
files = ! ls $res_dir/$project_dir/*.csv
files = [file for file in files if "deseq" in file if "drop" not in file]
# ERCC files first
files = [file for file in files if "ERCCsamplewise" in file] + \
[file for file in files if "ERCCsamplewise" not in file]
print(len(files))

4


In [40]:
files

['../results/slamseq_MYB_degraded_v1_res_v7/MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts.csv',
 '../results/slamseq_MYB_degraded_v1_res_v7/MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_tccounts.csv',
 '../results/slamseq_MYB_degraded_v1_res_v7/MYB_degraded_v1_0_5_1h_deseq_readcounts.csv',
 '../results/slamseq_MYB_degraded_v1_res_v7/MYB_degraded_v1_0_5_1h_deseq_tccounts.csv']

In [41]:
# project 2
conditions = ["1h"]
scaling = ["ERCCsamplewise", None]
count_type = ["readcounts", "tccounts"]

## make info rows

In [42]:
# df with file info
df_info = pd.DataFrame(list(zip(["project", "file", "scale", "condition", "type"],
                                ["project", "file", "scale", "condition", "type"])))

In [43]:
df_info = pd.concat([df_info] + [pd.DataFrame([x*6 for x in get_file_info(project2, file, conditions)]) \
                                 for file in files], axis=1)
df_info.shape

(5, 26)

In [44]:
df_info.iloc[:,[0,1,20,21,22,23,24]]

Unnamed: 0,0,1,0.1,1.1,2,3,4
0,project,project,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1,MYB_degraded_v1
1,file,file,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv,MYB_degraded_v1_0_5_1h_deseq_tccounts.csv
2,scale,scale,,,,,
3,condition,condition,1h,1h,1h,1h,1h
4,type,type,tccounts,tccounts,tccounts,tccounts,tccounts


## merge dfs

In [45]:
# Get list of deseq dfs
deseq_dfs = [get_deseq_data(file) for file in files]
deseq_dfs[0].iloc[:1]

Unnamed: 0,baseMean_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,log2FoldChange_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,lfcSE_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,stat_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,pvalue_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,padj_MYB_degraded_v1_ERCCsamplewise_0_5_1h_deseq_readcounts,genes,gene_id
0,1.881134,-0.045298,1.016923,0.044544,0.964471,,A1BG,A1BG


In [46]:
# main df
df_merged = pd.DataFrame()
df_merged = df_merged.append(deseq_dfs[0][["genes", "gene_id"]]) # add gene cols to main df

In [47]:
# join df columns on gene & gene_id columns
df_merged = reduce(lambda left, right: pd.merge(left, right, on=['genes', "gene_id"], how="outer"), 
                   [df_merged] + deseq_dfs)
df_merged.shape

(17211, 26)

In [48]:
# change column names
df_merged.columns = [col_name.split("_")[0] if "gene" not in col_name else 
                     col_name for col_name in df_merged.columns.tolist()]
df_merged = df_merged.T.reset_index().T.reset_index(drop=True)

## add info columns

In [49]:
# reset column index
df_info = df_info.T.reset_index(drop=True).T

In [50]:
%%time

# combine info and data
df_save = df_info.append(df_merged, ignore_index=True)
df_save.shape

CPU times: user 20 ms, sys: 8 ms, total: 28 ms
Wall time: 27.1 ms


(17217, 26)

## save merged file

In [51]:
# save file
res_dir = "../results"
deseq_res_file = "slamseq_merged_deseq_noAlt.csv"

df_save.to_csv("{}/{}".format(res_dir, deseq_res_file), header=False, index=False)

print("Saved DESeq results to: {}/{}".format(res_dir, deseq_res_file))

Saved DESeq results to: ../results/slamseq_merged_deseq_noAlt.csv
