# 4sU-seq data analysis using DESeq2 results output (Dex vs no Dex)
This Jupyter notebook contains scripts used to analyze 4sU-seq differential expression from DESeq2 results. Specifically, it uses the results of comparing each dexamethasone time point to the zero-dexamethasone condition, as in Lammer et al., 2023.

# Table of contents
1. [Load packages and files](#load-packages-and-files)
2. [Pairwise scatter plots](#pairwise-scatter-plots)
3. [Outliers](#outliers)
4. [Save gene lists](#save-gene-lists)
5. [UpSet plots for DE genes in each set](#upset-plots-for-de-genes-in-each-set)

## Load packages and files
Load the required packages and results files. Additionally, format the results into one dataframe with zero-baseMean genes removed (dfNoZero).

In [None]:
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
import scipy as sp
from scipy import stats
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
#input p-value cutoff and data paths here
pCutoff=0.05
#forward slashes are accepted on Windows as well as on Linux/macOS, so the results files load on any platform
resDir = "../data"
#outDir keeps its Windows-style separator because the saving cells below append "\\"-prefixed file names to it
outDir = "..\\analyses"
times = ["1","2","3"]
cellnames = ["wt","sof","ctrl"]
#one sample per cell line and time point, ordered cell line first and then time ("wt1h","wt2h",...)
samples=[cell+time+"h" for cell in cellnames for time in times]
#results file stems in the same order as samples ("res_wt_1h","res_wt_2h",...)
samplefiles=["res_"+cell+"_"+time+"h" for cell in cellnames for time in times]
resFiles=[resDir+"/"+stem+".csv" for stem in samplefiles]

In [None]:
#format data into one dataframe, remove rows with zero baseMean
def formatResFiles(resFiles,sampleNames=None):
	"""Combine per-sample results csv files into one wide dataframe.

	The identifier columns ("ens" and "name") are read from the first file
	only. Every file contributes its "baseMean", "log2FoldChange", "lfcSE"
	and "padj" columns, renamed to "<sample>_<column>".

	Parameters
	----------
	resFiles : list of str
		Paths of the results csv files, one per sample.
	sampleNames : list of str, optional
		Column prefix for each file, in the same order as resFiles.
		Defaults to the notebook-level ``samples`` list.

	Returns
	-------
	pandas.DataFrame
		Rows that have a gene name and a baseMean above zero in every
		sample. NaN padj values are left in place.

	Raises
	------
	ValueError
		If resFiles is empty or its length differs from sampleNames.
	"""
	if sampleNames is None:
		sampleNames=samples
	if len(resFiles)==0:
		raise ValueError("resFiles must contain at least one results file")
	if len(resFiles)!=len(sampleNames):
		raise ValueError("expected one sample name per results file, got "
			+str(len(sampleNames))+" names for "+str(len(resFiles))+" files")
	idCols=["ens","name"]
	lessCols=["baseMean","log2FoldChange","lfcSE","padj"]
	#only the first file has to provide the identifier columns
	first=pd.read_csv(resFiles[0], usecols=idCols+lessCols)
	tables=[first]+[pd.read_csv(file, usecols=lessCols) for file in resFiles[1:]]
	#NOTE(review): tables are combined by row position, which assumes every file lists the same genes in the same order - confirm
	df=first[idCols].copy()
	for sample,table in zip(sampleNames,tables):
		for col in lessCols:
			df[sample+"_"+col]=table[col]
	#remove genes with NA gene names
	df=df[df["name"].notna()]
	#remove rows with zero base mean to remove nan fold change rows but maintain nan padjs
	baseMeanCols=[sample+"_baseMean" for sample in sampleNames]
	return df[df[baseMeanCols].min(axis=1) > 0]
#combined results table (genes without a name or with a zero baseMean removed) used by the cells below
dfNoZero=formatResFiles(resFiles=resFiles)

## Pairwise scatter plots
Plot scatter plots of gene fold changes between two samples at the same time point. "scatterLFCall" plots a 3x3 grid of all times and sample pairs; "scatterLFCOneTime" plots the three sample pairs at a single time point.

In [None]:
#scatter plot of fold changes (subplots)
#time variable will correspond to column
#sample pairs will correspond to rows
def scatterLFCall(df,times,pCutoff):
	"""Plot fold change scatter plots for every cell-line pair and time point.

	One panel is drawn per (pair, time) combination: rows are the pairs
	sof/ctrl, sof/wt and wt/ctrl, columns are the entries of times. Each
	panel shows the genes with padj below pCutoff in at least one of the two
	samples, coloured by the sample(s) in which they are significant, with a
	regression line, the y=x diagonal, the gene count and the Spearman
	correlation.

	Parameters
	----------
	df : pandas.DataFrame
		Needs "<cell><time>h_log2FoldChange" and "<cell><time>h_padj"
		columns for every plotted cell line and time.
	times : list
		Time points used to build the column names, one subplot column each.
	pCutoff : float
		Adjusted p-value threshold.
	"""
	fontsize=8
	mm=0.0393701#millimetres to inches
	figwidth=180
	figheight=180
	plt.rcParams["font.family"]="Arial"
	plt.rcParams["font.size"]=fontsize
	cellPairs=[["sof","ctrl"],["sof","wt"],["wt","ctrl"]]
	#size the grid from the pairs and times that are actually plotted; squeeze=False keeps a 2D array for any shape
	fig,ax=plt.subplots(len(cellPairs),len(times),dpi=300,figsize=(figwidth*mm,figheight*mm),layout="constrained",squeeze=False)
	ax=ax.flatten()
	#row-major order: every time for the first pair, then the next pair
	panels=[(pair,str(time)) for pair in cellPairs for time in times]
	for panel,(pairCurrent,timeCurrent) in zip(ax,panels):
		sampleslfc=[cell+timeCurrent+"h_log2FoldChange" for cell in pairCurrent]
		samplespadj=[cell+timeCurrent+"h_padj" for cell in pairCurrent]
		dfMin=df[sampleslfc+samplespadj]
		#keep genes significant in at least one sample (min skips NaN padj); copy so the new column is not written to a slice of df
		dfMinSig=dfMin[dfMin[samplespadj].min(axis=1)<pCutoff].copy()
		#a comparison with a NaN padj is False, so ~sig covers both "padj >= cutoff" and "padj is NaN"
		sigX=dfMinSig[samplespadj[0]]<pCutoff
		sigY=dfMinSig[samplespadj[1]]<pCutoff
		conditions=[sigX & sigY, sigX & ~sigY, ~sigX & sigY]#categorize genes based on p-adjusted
		values=["Both",pairCurrent[0],pairCurrent[1]]
		hue_order=["Both",pairCurrent[1],pairCurrent[0]]
		#explicit string default: newer NumPy refuses to combine string choices with the implicit default of 0
		#(the default is never used, every kept gene matches one condition)
		dfMinSig["Sample"]=np.select(conditions,values,default="Neither")
		rho,_=stats.spearmanr(dfMinSig[sampleslfc[0]], dfMinSig[sampleslfc[1]])
		nGenes=len(dfMinSig.index)
		sns.scatterplot(data=dfMinSig,x=sampleslfc[0],y=sampleslfc[1],
			ax=panel,hue="Sample",hue_order=hue_order,alpha=0.8,s=figheight/3/6.67)
		sns.regplot(data=dfMinSig,x=sampleslfc[0],y=sampleslfc[1],
			ax=panel,scatter=False,color="gray", line_kws={'linewidth':figheight/3/40})
		panel.set_aspect('equal')
		panel.set_xlim(-10,10)
		panel.set_ylim(-10,10)
		panel.axline((0, 0), slope=1, color="black",linestyle=":")
		panel.set_xlabel(pairCurrent[0]+" log2FC")
		panel.set_ylabel(pairCurrent[1]+" log2FC")
		panel.text(0.97, 0.13, "p-adj < "+str(pCutoff), horizontalalignment='right',
			size='medium', color='black', weight='semibold', transform=panel.transAxes)
		panel.text(0.97, 0.08, "n = "+str(nGenes), horizontalalignment='right',
			size='medium', color='black', weight='semibold', transform=panel.transAxes)
		panel.text(0.97, 0.03, "Spearman = "+str(round(rho,2)), horizontalalignment='right',
			size='medium', color='black', weight='semibold', transform=panel.transAxes)
		panel.legend(loc='best', fontsize=fontsize-1)
def scatterLFCOneTime(df,time,pCutoff):
	"""Plot fold change scatter plots for the three cell-line pairs at one time point.

	Draws one row of panels for the pairs sof/ctrl, sof/wt and wt/ctrl. Each
	panel shows the genes with padj below pCutoff in at least one of the two
	samples, coloured by the sample(s) in which they are significant, with a
	regression line, the y=x diagonal, the gene count and the Spearman
	correlation.

	Parameters
	----------
	df : pandas.DataFrame
		Needs "<cell><time>h_log2FoldChange" and "<cell><time>h_padj"
		columns for wt, sof and ctrl.
	time : int or str
		Time point used to build the column names.
	pCutoff : float
		Adjusted p-value threshold.
	"""
	fontsize=8
	mm=0.0393701#millimetres to inches
	figwidth=180
	figheight=60
	plt.rcParams["font.family"]="Arial"
	plt.rcParams["font.size"]=fontsize
	cellPairs=[["sof","ctrl"],["sof","wt"],["wt","ctrl"]]
	fig,ax=plt.subplots(1,len(cellPairs),dpi=300,figsize=(figwidth*mm,figheight*mm),layout="constrained")
	ax=ax.flatten()
	for panel,pair in zip(ax,cellPairs):
		sampleslfc=[cell+str(time)+"h_log2FoldChange" for cell in pair]
		samplespadj=[cell+str(time)+"h_padj" for cell in pair]
		dfMin=df[sampleslfc+samplespadj]
		#keep genes significant in at least one sample (min skips NaN padj); copy so the new column is not written to a slice of df
		dfMinSig=dfMin[dfMin[samplespadj].min(axis=1)<pCutoff].copy()
		#a comparison with a NaN padj is False, so ~sig covers both "padj >= cutoff" and "padj is NaN"
		sigX=dfMinSig[samplespadj[0]]<pCutoff
		sigY=dfMinSig[samplespadj[1]]<pCutoff
		conditions=[sigX & sigY, sigX & ~sigY, ~sigX & sigY]#categorize genes based on p-adjusted
		values=["Both",pair[0],pair[1]]
		hue_order=["Both",pair[1],pair[0]]
		#explicit string default: newer NumPy refuses to combine string choices with the implicit default of 0
		#(the default is never used, every kept gene matches one condition)
		dfMinSig["Sample"]=np.select(conditions,values,default="Neither")
		rho,_=stats.spearmanr(dfMinSig[sampleslfc[0]], dfMinSig[sampleslfc[1]])
		nGenes=len(dfMinSig.index)
		sns.scatterplot(data=dfMinSig,x=sampleslfc[0],y=sampleslfc[1],
			ax=panel,hue="Sample",hue_order=hue_order,alpha=0.8,s=figheight/6.67)
		sns.regplot(data=dfMinSig,x=sampleslfc[0],y=sampleslfc[1],
			ax=panel,scatter=False,color="gray", line_kws={'linewidth':figheight/40})
		panel.set_aspect('equal')
		panel.set_xlim(-10,10)
		panel.set_ylim(-10,10)
		panel.axline((0, 0), slope=1, color="black",linestyle=":")
		panel.set_xlabel(pair[0]+" log2 fold change (Dex vs. No Dex)")
		panel.set_ylabel(pair[1]+" log2 fold change (Dex vs. No Dex)")
		panel.text(0.97, 0.13, "p-adj < "+str(pCutoff), horizontalalignment='right',
			size='medium', color='black', weight='semibold', transform=panel.transAxes)
		panel.text(0.97, 0.08, "n = "+str(nGenes), horizontalalignment='right',
			size='medium', color='black', weight='semibold', transform=panel.transAxes)
		panel.text(0.97, 0.03, "Spearman = "+str(round(rho,2)), horizontalalignment='right',
			size='medium', color='black', weight='semibold', transform=panel.transAxes)

#grid of every sample pair and time point, followed by the three pairs at 3h only
scatterLFCall(df=dfNoZero,times=times,pCutoff=pCutoff)
scatterLFCOneTime(df=dfNoZero,time=3,pCutoff=pCutoff)

## Outliers
Use the minimum covariance determinant to calculate bivariate outliers between a pair of samples. Then, visualize the outliers on a fold change scatter plot, with dots marked as statistically significant in both samples, in one sample, or as not an outlier. Afterwards, use the linear regression values to select outliers below the regression line, giving a reproducible set of genes with maximum discrepancy between samples.

In [None]:
#outlier statistics (code from https://towardsdatascience.com/detecting-and-treating-outliers-in-python-part-2-3a3319ec2c33)
from __future__ import with_statement
from scipy.stats import chi2
from sklearn.covariance import MinCovDet
import copy

def robust_mahalanobis_method(df, pCutoff):
	"""Flag outlying rows of df by their Mahalanobis distance to a Minimum Covariance Determinant fit.

	Parameters
	----------
	df : pandas.DataFrame
		One row per observation, one numeric column per variable.
	pCutoff : float
		Upper-tail probability of the chi-square distribution that sets the
		distance cutoff.

	Returns
	-------
	outlier : list of int
		Row positions in df whose distance exceeds the cutoff.
	md : numpy.ndarray
		Distance of every row of df.
	"""
	#Minimum covariance determinant
	#NOTE(review): the estimator is fitted to 506 points drawn from a normal distribution with the
	#classical mean and covariance of df, not to df itself; left unchanged so results stay the same - confirm this is intended
	rng = np.random.RandomState(0)
	real_cov = np.cov(df.values.T)
	X = rng.multivariate_normal(mean=np.mean(df, axis=0), cov=real_cov, size=506)
	cov = MinCovDet(random_state=0).fit(X)
	mcd = cov.covariance_ #robust covariance metric
	robust_mean = cov.location_  #robust mean
	inv_covmat = sp.linalg.inv(mcd) #inverse covariance metric
	#Robust M-Distance
	x_minus_mu = np.asarray(df - robust_mean)
	left_term = np.dot(x_minus_mu, inv_covmat)
	#row-wise quadratic form: the diagonal of left_term . x_minus_mu^T without building the n x n matrix
	md = np.sqrt(np.einsum("ij,ij->i", left_term, x_minus_mu))
	#Flag as outlier
	C = np.sqrt(chi2.ppf((1-pCutoff), df=df.shape[1]))#degrees of freedom = number of variables
	outlier = np.flatnonzero(md > C).tolist()
	return outlier, md
def outliersBi(df,biSamples,pCutoff,padjFilter=0.01):
	"""Return the genes whose fold changes in two samples are bivariate outliers.

	Parameters
	----------
	df : pandas.DataFrame
		Needs "ens", "name", "<sample>_log2FoldChange" and "<sample>_padj"
		for both samples, plus "<first sample>_baseMean".
	biSamples : list of str
		The two sample names, e.g. ['sof3h','ctrl3h'].
	pCutoff : float
		Passed to robust_mahalanobis_method as the chi-square tail probability.
	padjFilter : float, optional
		Only genes with padj below this value in at least one of the two
		samples are tested. This threshold is separate from pCutoff; the
		default is the value that was previously fixed in the code.

	Returns
	-------
	pandas.DataFrame
		The rows of the pre-filtered table that were flagged as outliers.
	"""
	lfcCols=[x+"_log2FoldChange" for x in biSamples]
	padjCols=[x+"_padj" for x in biSamples]
	dfSamples=df[["ens","name",biSamples[0]+"_baseMean"]+lfcCols+padjCols]
	dfSamples=dfSamples[dfSamples[padjCols].min(axis=1)<padjFilter]
	outlierRows,_=robust_mahalanobis_method(dfSamples[lfcCols],pCutoff)
	#outlierRows holds row positions, hence iloc rather than loc
	return dfSamples.iloc[outlierRows]
#bivariate outliers for each pair of cell lines at 3h
dfSoFCtrl,dfSoFWt,dfCtrlWt=(
	outliersBi(dfNoZero,pair,pCutoff)
	for pair in (['sof3h','ctrl3h'],['sof3h','wt3h'],['ctrl3h','wt3h'])
)

In [None]:
#scatter plot of fold changes and outlier plot (single pair)
def scatterOutliers(df,dfOutliers,samples,time,pCutoff):
	"""Plot one sample pair twice: coloured by significance and coloured by outlier status.

	Parameters
	----------
	df : pandas.DataFrame
		Needs "ens", "name" and the "<sample><time>h_log2FoldChange" /
		"<sample><time>h_padj" columns of both samples.
	dfOutliers : pandas.DataFrame
		Table whose "ens" column lists the outlier genes.
	samples : list of str
		The two cell-line names; the first is plotted on x, the second on y.
	time : int or str
		Time point used to build the column names.
	pCutoff : float
		Adjusted p-value threshold; genes below it in at least one sample are plotted.

	Returns
	-------
	tuple
		(dfPlotSig, slope, intercept): the plotted genes with the added
		"Sample" and "Outlier" columns, and the slope and intercept of the
		regression line drawn in the right-hand panel.
	"""
	fontsize=8
	mm=0.0393701#millimetres to inches
	figwidth=120
	figheight=60
	plt.rcParams["font.family"]="Arial"
	plt.rcParams["font.size"]=fontsize
	fig,ax=plt.subplots(1,2,dpi=300,figsize=(figwidth*mm,figheight*mm),layout="constrained")
	ax=ax.flatten()
	sampleTimes=[sample+str(time)+"h" for sample in samples]
	sampleslfc=[sample+"_log2FoldChange" for sample in sampleTimes]
	samplespadj=[sample+"_padj" for sample in sampleTimes]
	dfPlot=df[["ens","name"]+sampleslfc+samplespadj]
	#keep genes significant in at least one sample (min skips NaN padj); copy so the new columns are not written to a slice of df
	dfPlotSig=dfPlot[dfPlot[samplespadj].min(axis=1)<pCutoff].copy()
	#a comparison with a NaN padj is False, so ~sig covers both "padj >= cutoff" and "padj is NaN"
	sigX=dfPlotSig[samplespadj[0]]<pCutoff
	sigY=dfPlotSig[samplespadj[1]]<pCutoff
	conditions=[sigX & sigY, sigX & ~sigY, ~sigX & sigY]#categorize genes based on p-adjusted
	values=["Both",samples[0],samples[1]]
	#explicit string default: newer NumPy refuses to combine string choices with the implicit default of 0
	#(the default is never used, every kept gene matches one condition)
	dfPlotSig["Sample"]=np.select(conditions,values,default="Neither")
	isOutlier=dfPlotSig["ens"].isin(dfOutliers["ens"])
	outlierCond=[#categorize genes by outlier status
		~isOutlier,
		isOutlier & (dfPlotSig["Sample"]=="Both"),
		isOutlier & (dfPlotSig["Sample"]==samples[0]),
		isOutlier & (dfPlotSig["Sample"]==samples[1])
	]
	outlierValues=["Neither","Both",samples[0],samples[1]]
	dfPlotSig["Outlier"]=np.select(outlierCond,outlierValues,default="Neither")
	hue_order=["Both",samples[1],samples[0]]
	outlier_order=["Neither",samples[1],samples[0],"Both"]
	rho,_=stats.spearmanr(dfPlotSig[sampleslfc[0]], dfPlotSig[sampleslfc[1]])
	nGenes=len(dfPlotSig.index)
	sns.scatterplot(data=dfPlotSig,x=sampleslfc[0],y=sampleslfc[1],
		ax=ax[0],hue="Sample",hue_order=hue_order,alpha=0.8,s=figheight/6.67)
	sns.regplot(data=dfPlotSig,x=sampleslfc[0],y=sampleslfc[1],
		ax=ax[0],scatter=False,color="gray",line_kws={'linewidth':figheight/40})
	sns.scatterplot(data=dfPlotSig,x=sampleslfc[0],y=sampleslfc[1],
		ax=ax[1],hue="Outlier",hue_order=outlier_order,alpha=0.8,s=figheight/6.67)
	reg=sns.regplot(data=dfPlotSig,x=sampleslfc[0],y=sampleslfc[1],
		ax=ax[1],scatter=False,color="gray",line_kws={'linewidth':figheight/40})
	for panel in ax[:2]:
		panel.set_aspect('equal')
		panel.set_xlim(-10,10)
		panel.set_ylim(-10,10)
		panel.axline((0, 0), slope=1, color="black", linestyle=":")
		panel.set_xlabel(samples[0]+" log2 fold change (Dex vs. No Dex)")
		panel.set_ylabel(samples[1]+" log2 fold change (Dex vs. No Dex)")
		panel.text(0.97, 0.13, "p-adj < "+str(pCutoff), horizontalalignment='right',
			size='medium', color='black', weight='semibold', transform=panel.transAxes)
		panel.text(0.97, 0.08, "n = "+str(nGenes), horizontalalignment='right',
			size='medium', color='black', weight='semibold', transform=panel.transAxes)
		panel.text(0.97, 0.03, "Spearman = "+str(round(rho,2)), horizontalalignment='right',
			size='medium', color='black', weight='semibold', transform=panel.transAxes)
		panel.legend(loc='best', fontsize=fontsize-1)
	#get regression line equation for selecting outliers
	#the fitted regplot line is taken to be the first line on ax[1]; the diagonal is only added after it
	regLine=reg.get_lines()[0]
	slope, intercept, _, _, _ = stats.linregress(x=regLine.get_xdata(),y=regLine.get_ydata())
	return(dfPlotSig,slope,intercept)
def pickOutliers(df,sign,sampleX,sampleY,time,slope,intercept):
	"""Select outlier genes on one side of the regression line.

	Only rows whose "Outlier" value is sampleX or "Both" are considered.

	Parameters
	----------
	df : pandas.DataFrame
		Needs an "Outlier" column and the "<sample><time>h_log2FoldChange"
		columns of sampleX and sampleY.
	sign : str
		"+" keeps genes with a positive sampleX fold change that lie below
		the line; "-" keeps genes with a negative sampleX fold change that
		lie above the line.
	sampleX, sampleY : str
		Cell-line names of the x and y samples.
	time : int or str
		Time point used to build the column names.
	slope, intercept : float
		Regression line y = slope*x + intercept.

	Returns
	-------
	pandas.DataFrame
		The selected rows of df.

	Raises
	------
	ValueError
		If sign is neither "+" nor "-".
	"""
	if sign not in ("+","-"):
		raise ValueError('sign must be "+" or "-", got '+repr(sign))
	xCol=sampleX+str(time)+"h_log2FoldChange"
	yCol=sampleY+str(time)+"h_log2FoldChange"
	dfOutliers=df[(df["Outlier"]==sampleX) | (df["Outlier"]=="Both")]
	#y value of the regression line at each gene's x fold change
	lineY=(dfOutliers[xCol]*slope)+intercept
	if sign=="+":
		keep=(dfOutliers[xCol]>0) & (lineY>dfOutliers[yCol])
	else:
		keep=(dfOutliers[xCol]<0) & (lineY<dfOutliers[yCol])
	return dfOutliers[keep]
timeScatter=3
#sof vs ctrl: keep the annotated table and the regression line for the selection below
dfSoFoverCtrl,slope,intercept=scatterOutliers(
	df=dfNoZero,dfOutliers=dfSoFCtrl,samples=["sof","ctrl"],time=timeScatter,pCutoff=pCutoff)
#the two remaining pairs are only plotted
_,_,_=scatterOutliers(
	df=dfNoZero,dfOutliers=dfSoFWt,samples=["sof","wt"],time=timeScatter,pCutoff=pCutoff)
_,_,_=scatterOutliers(
	df=dfNoZero,dfOutliers=dfCtrlWt,samples=["ctrl","wt"],time=timeScatter,pCutoff=pCutoff)
#select SoF Dex-dep. outliers
dfSoFoverCtrlPos=pickOutliers(
	df=dfSoFoverCtrl,sign="+",sampleX="sof",sampleY="ctrl",time=timeScatter,slope=slope,intercept=intercept)
#subset results dataframe with selected outliers only
dfSoF3h=dfNoZero[dfNoZero["ens"].isin(dfSoFoverCtrlPos["ens"])]

In [None]:
#outlier and chosen outlier plots (single pair)
def scatterChosenOutliers(df,dfOutliers,dfChosen,samples,time,pCutoff,chosenLabel="SoF 3h"):
	"""Plot one sample pair twice: coloured by outlier status and by membership of a chosen gene set.

	Parameters
	----------
	df : pandas.DataFrame
		Needs "ens", "name" and the "<sample><time>h_log2FoldChange" /
		"<sample><time>h_padj" columns of both samples.
	dfOutliers : pandas.DataFrame
		Table whose "ens" column lists the outlier genes.
	dfChosen : pandas.DataFrame
		Table whose "ens" column lists the chosen gene set.
	samples : list of str
		The two cell-line names; the first is plotted on x, the second on y.
	time : int or str
		Time point used to build the column names.
	pCutoff : float
		Adjusted p-value threshold; genes below it in at least one sample are plotted.
	chosenLabel : str, optional
		Legend label for the genes in dfChosen; all other genes are labelled "Other".
	"""
	fontsize=8
	mm=0.0393701#millimetres to inches
	figwidth=120
	figheight=60
	plt.rcParams["font.family"]="Arial"
	plt.rcParams["font.size"]=fontsize
	fig,ax=plt.subplots(1,2,dpi=300,figsize=(figwidth*mm,figheight*mm),layout="constrained")
	ax=ax.flatten()
	sampleTimes=[sample+str(time)+"h" for sample in samples]
	sampleslfc=[sample+"_log2FoldChange" for sample in sampleTimes]
	samplespadj=[sample+"_padj" for sample in sampleTimes]
	dfPlot=df[["ens","name"]+sampleslfc+samplespadj]
	#keep genes significant in at least one sample (min skips NaN padj); copy so the new columns are not written to a slice of df
	dfPlotSig=dfPlot[dfPlot[samplespadj].min(axis=1)<pCutoff].copy()
	#a comparison with a NaN padj is False, so ~sig covers both "padj >= cutoff" and "padj is NaN"
	sigX=dfPlotSig[samplespadj[0]]<pCutoff
	sigY=dfPlotSig[samplespadj[1]]<pCutoff
	conditions=[sigX & sigY, sigX & ~sigY, ~sigX & sigY]#categorize genes based on p-adjusted
	values=["Both",samples[0],samples[1]]
	#explicit string default: newer NumPy refuses to combine string choices with the implicit default of 0
	#(the default is never used, every kept gene matches one condition)
	dfPlotSig["Sample"]=np.select(conditions,values,default="Neither")
	isOutlier=dfPlotSig["ens"].isin(dfOutliers["ens"])
	outlierCond=[#categorize by outlier status
		~isOutlier,
		isOutlier & (dfPlotSig["Sample"]=="Both"),
		isOutlier & (dfPlotSig["Sample"]==samples[0]),
		isOutlier & (dfPlotSig["Sample"]==samples[1])
	]
	outlierValues=["Neither","Both",samples[0],samples[1]]
	dfPlotSig["Outlier"]=np.select(outlierCond,outlierValues,default="Neither")
	dfPlotSig["Gene Set"]=np.where(dfPlotSig["ens"].isin(dfChosen["ens"]), chosenLabel, "Other")
	outlier_order=["Neither",samples[1],samples[0],"Both"]
	rho,_=stats.spearmanr(dfPlotSig[sampleslfc[0]], dfPlotSig[sampleslfc[1]])
	nGenes=len(dfPlotSig.index)
	sns.scatterplot(data=dfPlotSig,x=sampleslfc[0],y=sampleslfc[1],
		ax=ax[0],hue="Outlier",hue_order=outlier_order,alpha=0.8,s=figheight/6.67)
	sns.regplot(data=dfPlotSig,x=sampleslfc[0],y=sampleslfc[1],
		ax=ax[0],scatter=False,color="gray",line_kws={'linewidth':figheight/40})
	sns.scatterplot(data=dfPlotSig,x=sampleslfc[0],y=sampleslfc[1],
		ax=ax[1],hue="Gene Set",alpha=0.8,s=figheight/6.67)
	sns.regplot(data=dfPlotSig,x=sampleslfc[0],y=sampleslfc[1],
		ax=ax[1],scatter=False,color="gray",line_kws={'linewidth':figheight/40})
	for panel in ax[:2]:
		panel.set_aspect('equal')
		panel.set_xlim(-10,10)
		panel.set_ylim(-10,10)
		panel.axline((0, 0), slope=1, color="black", linestyle=":")
		panel.set_xlabel(samples[0]+" log2 fold change (Dex vs. No Dex)")
		panel.set_ylabel(samples[1]+" log2 fold change (Dex vs. No Dex)")
		panel.text(0.97, 0.13, "p-adj < "+str(pCutoff), horizontalalignment='right',
			size='medium', color='black', weight='semibold', transform=panel.transAxes)
		panel.text(0.97, 0.08, "n = "+str(nGenes), horizontalalignment='right',
			size='medium', color='black', weight='semibold', transform=panel.transAxes)
		panel.text(0.97, 0.03, "Spearman = "+str(round(rho,2)), horizontalalignment='right',
			size='medium', color='black', weight='semibold', transform=panel.transAxes)
		panel.legend(loc='best', fontsize=fontsize-1)
timeScatter=3
scatterChosenOutliers(
	df=dfNoZero,dfOutliers=dfSoFCtrl,dfChosen=dfSoFoverCtrlPos,samples=["sof","ctrl"],time=timeScatter,pCutoff=pCutoff)
#SoF 3h Rep. genes: outliers labelled "Both" or "ctrl" with a positive ctrl fold change that is larger than the sof fold change
dfSoF3hRep=dfSoFoverCtrl[
	dfSoFoverCtrl["Outlier"].isin(["Both","ctrl"])
	& (dfSoFoverCtrl["ctrl3h_log2FoldChange"]>0)
	& (dfSoFoverCtrl["ctrl3h_log2FoldChange"]>dfSoFoverCtrl["sof3h_log2FoldChange"])
]
scatterChosenOutliers(
	df=dfNoZero,dfOutliers=dfSoFCtrl,dfChosen=dfSoF3hRep,samples=["sof","ctrl"],time=3,pCutoff=pCutoff)


## Save gene lists
Save lists of genes from the selected outliers and genes activated at each time point in the wild-type sample.

In [None]:
#save the SoF Dex-dep. gene set, then the SoF 3h Rep. gene set
for geneSet,filename in ((dfSoF3h,"\\SoF3h_0.05"),(dfSoF3hRep,"\\SoF3hRep_0.05")):
	#ensembl ids without version numbers (text before the first ".")
	geneSet["nover"]=[x.split(".")[0] for x in geneSet["ens"]]
	#one single-column file each for gene symbols, versioned ids and version-less ids
	for suffix,column in (("_sym.txt","name"),("_ens.txt","ens"),("_ens_nover.txt","nover")):
		geneSet.to_csv(outDir+filename+suffix,columns=[column],header=False,index=False)

In [None]:
#save lists of uniquely activated genes at each time point (in wt)
def saveActList(df,sample,time,pCutoff,name):
	"""Write the genes first activated at a given time point to three text files.

	A gene is kept when its fold change at `time` is positive with padj
	below pCutoff, and its padj at every earlier time point (1 .. time-1) is
	above pCutoff. Three single-column files are written under outDir: gene
	symbols ("_sym.txt"), Ensembl ids ("_ens.txt") and Ensembl ids without
	version number ("_ens_nover.txt").

	Parameters
	----------
	df : pandas.DataFrame
		Needs "ens", "name" and the "<sample><t>h_log2FoldChange" /
		"<sample><t>h_padj" columns that are read.
	sample : str
		Cell-line name used to build the column names.
	time : int
		Time point; earlier time points are taken to be 1 .. time-1.
	pCutoff : float
		Adjusted p-value threshold.
	name : str
		File name stem.
	"""
	current=sample+str(time)+"h"
	dfAct=df[(df[current+"_log2FoldChange"]>0) & (df[current+"_padj"]<pCutoff)]
	#every earlier time point must be non-significant; a NaN padj fails ">" and drops the gene, as before
	for earlier in range(1,int(time)):
		dfAct=dfAct[dfAct[sample+str(earlier)+"h_padj"]>pCutoff]
	#copy so the new column is not written to a slice of df
	dfAct=dfAct.copy()
	dfAct["nover"]=[x.split(".")[0] for x in dfAct["ens"]]#ensembl ids without version number
	stem=outDir+"\\"+name+"_p"+str(pCutoff)
	for suffix,column in (("_sym.txt","name"),("_ens.txt","ens"),("_ens_nover.txt","nover")):
		dfAct.to_csv(stem+suffix,columns=[column],header=False,index=False)
#write the wt activated-gene lists for the 1h, 2h and 3h time points
for hour in (1,2,3):
	saveActList(dfNoZero,"wt",hour,pCutoff,"act"+str(hour))

In [None]:
#save list of top 100 DE genes in wt at 3h
def top100(df,sample,topnum):
	"""Return the topnum rows of df with the smallest "<sample>_padj" values, sorted ascending."""
	return df.sort_values(by=sample+"_padj", ascending=True).head(topnum)
dfTop100=top100(df=dfNoZero,sample="wt3h",topnum=100)
#gene symbols and Ensembl ids go to separate single-column files
for suffix,column in (("sym","name"),("ens","ens")):
	dfTop100.to_csv(outDir+"\\top100_wt3h_"+suffix+".txt",columns=[column],header=False,index=False)

## UpSet plots for DE genes in each set
Generate UpSet plots showing the overlap of differentially expressed genes between samples at a dexamethasone time point.

In [None]:
#upset plot to compare DE genes
from upsetplot import plot
from upsetplot import UpSet

def upsetDE(df,time,pCutoff,split):#split should be activated, repressed, or all
	"""Draw an UpSet plot of the genes that are DE (padj below pCutoff) in each cell line at one time point.

	Parameters
	----------
	df : pandas.DataFrame
		Needs "ens", "name" and the "<cell><time>h_log2FoldChange" /
		"<cell><time>h_padj" columns of every cell line in cellnames.
	time : int or str
		Time point used to build the column names.
	pCutoff : float
		Adjusted p-value threshold.
	split : str
		"activated", "repressed" or "all"; selects the genes by fold change
		sign before counting.

	Returns
	-------
	pandas.DataFrame
		One boolean column per sample, True where padj is below pCutoff.

	Raises
	------
	ValueError
		If split is not one of the three accepted values.
	"""
	titles={
		"activated":"Activated DE genes at "+str(time)+"h Dex",
		"repressed":"Repressed DE genes at "+str(time)+"h Dex",
		"all":"DE genes at "+str(time)+"h Dex",
	}
	#fail before any figure is created instead of after plotting
	if split not in titles:
		raise ValueError('split must be "activated", "repressed" or "all", got '+repr(split))
	fontsize=10
	mm=0.0393701#millimetres to inches
	figwidth=80
	figheight=60
	plt.rcParams["font.family"]="Arial"
	plt.rcParams["font.size"]=fontsize
	fig=plt.figure(dpi=300,figsize=(figwidth*mm,figheight*mm))
	sample_cols=[cell+str(time)+"h_padj" for cell in cellnames]
	lfc_cols=[cell+str(time)+"h_log2FoldChange" for cell in cellnames]
	bool_cols=[cell+str(time)+"h" for cell in cellnames]
	dfPadj=df[["ens","name"]+lfc_cols+sample_cols]
	if split=="activated":
		#fold change above zero in every sample
		dfPadj=dfPadj[dfPadj[lfc_cols].min(axis=1)>0]
	elif split=="repressed":
		#NOTE(review): this keeps genes with a negative fold change in at least one sample,
		#whereas "activated" requires a positive fold change in all samples - confirm the asymmetry is intended
		dfPadj=dfPadj[dfPadj[lfc_cols].min(axis=1)<0]
	#copy so the boolean columns are not written to a slice of df
	dfPadj=dfPadj.copy()
	for boolCol,padjCol in zip(bool_cols,sample_cols):
		#a NaN padj compares False, i.e. the gene counts as not DE in that sample
		dfPadj[boolCol]=dfPadj[padjCol] < pCutoff
	dfBool=dfPadj[bool_cols]
	dfCount=dfBool.groupby(bool_cols).size()
	plot(dfCount,fig=fig,element_size=20,min_degree=1,show_counts=True)
	plt.title(titles[split])
	plt.show()
	return dfBool
timeUpset=3
#one UpSet plot per gene subset: all DE genes first, then activated, then repressed
for splitKind in ("all","activated","repressed"):
	_=upsetDE(dfNoZero,timeUpset,pCutoff,splitKind)