# Plot Enrichr results as bar graphs for SoF Rep. gene sets
This Jupyter notebook contains the scripts used to plot the top Enrichr results as bar graphs, specifically for the SoF GR-repressed gene sets.

# Table of contents
1. [Load packages and files](#initialize)
2. [Top 3 bar graph](#top3)

## Load packages and files <a name="initialize"></a>
Load required packages and results files. Additionally, format results into one dataframe per Enrichr category.

In [None]:
#plot enrichr table data by adjusted p-value and size of the set
import os

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
#file paths here
#paths are built with os.path.join so they resolve on any OS, not only where
#"\\" is the path separator (on Windows the resulting strings are unchanged)
goDir = os.path.join("..","data","enrichr")
outDir = os.path.join("..","analyses")
cheaFiles = [
	"chea_sof3hdown.txt",
	"chea_sofdown.txt"
	]
pathwayFiles = [
	"pathway_sof3hdown.txt",
	"pathway_sofdown.txt"
	]
chrlocFiles = [
	"chrloc_sof3hdown.txt",
	"chrloc_sofdown.txt"
	]
cheaPaths = [os.path.join(goDir,x) for x in cheaFiles]
pathwayPaths = [os.path.join(goDir,x) for x in pathwayFiles]
chrlocPaths = [os.path.join(goDir,x) for x in chrlocFiles]
#display names of the gene sets, listed in the same order as the files above
names=["SoF 3h Rep.", "SoF Const. Rep."]

#assemble data into dataframes
def dfize(paths):
	"""Read every tab-separated file in ``paths`` into a dataframe.

	Returns a list of dataframes, one per path, in the same order as ``paths``.
	"""
	return [pd.read_csv(tsv,sep='\t') for tsv in paths]

#one list of result dataframes per Enrichr category (ChEA, pathway, chromosome location)
cheaTable,pathwayTable,chrlocTable=(
	dfize(category_paths) for category_paths in (cheaPaths,pathwayPaths,chrlocPaths)
)

## Top 3 bar graph <a name="top3"></a>
Assemble the top 3 results by adjusted p-value from each gene set into one dataframe per category. Then plot either all categories in one figure (`bar_all`) or pairs of categories (`bar_two`).

In [None]:
#get top 3 terms from each set by adjusted p-value
def top3(dftable,names):
	"""Collect the (up to) three lowest adjusted-p-value terms of each table.

	Parameters:
		dftable: sequence of dataframes, each with at least the columns
			"Term" and "Adjusted P-value".
		names: sequence of gene-set labels; names[i] labels dftable[i].

	Returns:
		One dataframe with the columns "Term", "Adjusted P-value",
		"-log10(padj)", "Gene Set", "SampleID" and "Rank". "SampleID" is the
		position of the source table in ``dftable`` and "Rank" is 1 for the
		smallest adjusted p-value of that table. Row indices are those of the
		source tables, so they can repeat across gene sets. A table with fewer
		than three rows contributes only the rows it has.

	Raises:
		IndexError: if ``names`` is shorter than ``dftable``.
	"""
	cols=["Term","Adjusted P-value","-log10(padj)","Gene Set","SampleID","Rank"]
	frames=[]
	for i,df in enumerate(dftable):
		#stable sort: terms with tied p-values keep their order from the input table
		ranked=df.sort_values(by="Adjusted P-value", ascending=True, kind="mergesort")
		#copy so the new columns are written to an independent frame, not a view
		top=ranked[["Term","Adjusted P-value"]].head(3).copy()
		top["-log10(padj)"]=-(np.log10(top["Adjusted P-value"]))
		#rank by actual row count so tables with fewer than 3 terms do not fail
		top["Rank"]=np.arange(1,len(top)+1)
		top["Gene Set"]=names[i]
		top["SampleID"]=i
		frames.append(top)
	if not frames:
		return pd.DataFrame(columns=cols)
	#concatenate once; avoids seeding with an empty frame, which affects result dtypes
	return pd.concat(frames)[cols]
#one top-3 summary dataframe per Enrichr category
cheaDf,pathwayDf,chrlocDf=(
	top3(category_tables,names) for category_tables in (cheaTable,pathwayTable,chrlocTable)
)

In [None]:
#bar plots, top 3 terms from each metric, plot -log10(padj) with name of enrichment set
def bar_all(dfs,titles):
	"""Draw one horizontal bar-plot panel per dataframe, side by side.

	Each panel plots "-log10(padj)" against "Gene Set" with one bar per "Rank"
	(hue levels 1-3), draws a dashed line at -log10(0.05), and writes the
	(truncated) term name on top of each bar.

	Parameters:
		dfs: sequence of dataframes with the columns "Term", "-log10(padj)",
			"Gene Set", "SampleID" and "Rank".
		titles: sequence of panel titles; titles[i] is used for dfs[i].

	Returns:
		None. The figure is created through pyplot and is not returned.

	Notes:
		- Sets the global matplotlib rcParams "font.family" and "font.size".
		- The y-axis order comes from the module-level ``names`` list.
		- Labels are matched to bars by position: terms sorted by
		  (SampleID, Rank) are paired with bars sorted top to bottom, so
		  SampleID order must agree with the order of ``names``.
	"""
	fontsize=10
	mm=0.0393701  #inches per millimetre; figure size below is given in mm
	figwidth=180
	figheight=60
	maxletters=30  #terms longer than this are cut and suffixed with "..."
	plt.rcParams["font.family"]="Arial"
	plt.rcParams["font.size"]=fontsize
	#one column per dataframe instead of a fixed 3; squeeze=False keeps the
	#axes in an array even when there is only a single panel
	ncols=max(len(dfs),1)
	fig,ax=plt.subplots(1,ncols,dpi=300,figsize=(figwidth*mm,figheight*mm),layout="constrained",sharey=True,squeeze=False)
	ax=ax.flatten()
	hue_order=[1,2,3]
	def autolabel(df,ax,rects):
		#bars plotted by rank so need to reorder the terms
		dfranksort=df.sort_values(by=["SampleID","Rank"],ascending=True)
		terms_short=[
			x if len(x) <= maxletters else x[0:maxletters]+"..."
			for x in dfranksort["Term"]
		]
		#NOTE(review): depending on the seaborn version, ax.patches can apparently
		#also hold zero-size legend proxies or NaN-width placeholders for missing
		#hue levels; those are skipped so labels stay aligned - confirm against
		#the installed seaborn version
		bars=[
			rect for rect in rects
			if rect.get_height() > 0 and not np.isnan(rect.get_width())
		]
		#top-to-bottom bar order matches the (SampleID, Rank) order of the terms
		bars.sort(key=lambda rect: rect.get_y())
		for rect,term in zip(bars,terms_short):
			ax.text(rect.get_x()+0.1, rect.get_y() + rect.get_height() / 2.,term, ha='left', va='center', color='black',fontsize=fontsize-3)
	for i,df in enumerate(dfs):
		sns.barplot(data=df,x="-log10(padj)",y="Gene Set",hue="Rank",
			ax=ax[i],palette="muted",order=names,hue_order=hue_order)
		#p-value 0.05 line
		ax[i].axvline(-(np.log10(0.05)),0,1, color="gray", alpha=0.6, linestyle="dashed")
		#the rank legend is not wanted; guard in case no legend was created
		legend=ax[i].get_legend()
		if legend is not None:
			legend.remove()
		ax[i].set_title(titles[i])
		autolabel(df,ax[i],ax[i].patches)
		ax[i].set_xlabel('-log10(Adjusted P-value)')
		#sharey=True: only the first panel keeps its y-axis label
		if i > 0:
			ax[i].set_ylabel('')
#draw the three Enrichr categories side by side in one figure
bar_all(
	dfs=[cheaDf,pathwayDf,chrlocDf],
	titles=["ChEA TFs","Pathways","Chromosome"],
)