# RIP-seq data analysis using DESeq2 results output (3 hour dex treatment)
This Jupyter notebook contains scripts used to analyze RIP-seq enrichment from DESeq2 results.

# Table of contents
1. [Load packages and files](#load-packages-and-files)
2. [Plot volcano plot for GR RIP enrichment](#plot-volcano-plot-for-gr-rip-enrichment)

## Load packages and files
Load the required packages and results files. Additionally, merge the results into one dataframe with zero-baseMean genes removed (`dfNoZero`).

In [None]:
#notebook for processing rip-seq
import os

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from adjustText import adjust_text

pd.options.mode.chained_assignment = None
#p-value cutoff and file paths here
pCutoff = 0.05
#build paths with os.path.join so the separator matches the host OS
#(on Windows this yields the same strings as the old hard-coded "..\\data" form)
resDir = os.path.join("..", "data")
outDir = os.path.join("..", "analyses")
times = ["0","1","2","3"]
#sample labels ("0h", "1h", ...) are used as column prefixes in the merged table
samples = [t+"h" for t in times]
#file stems of the per-time-point results tables, in the same order as `samples`
samplefiles = ["rip_res_"+t for t in times]
resFiles = [os.path.join(resDir, stem+".csv") for stem in samplefiles]

In [None]:
#format data into one dataframe, remove rows with zero baseMean
def formatResFiles(resFiles, sampleNames=None):
	"""Merge per-sample results CSVs into one wide table and drop unusable rows.

	The first file supplies the "ens" and "name" columns. Every file supplies
	"baseMean", "log2FoldChange", "lfcSE" and "padj", which are stored as
	"<label>_<column>" using the label at the same position in sampleNames.

	Rows are matched across files by row position (the default integer index
	from read_csv), so every file is assumed to list the same genes in the
	same order as the first file.

	Rows are then removed when "name" is missing, or when the smallest
	non-missing baseMean across the samples is not greater than zero. Missing
	padj values are kept.

	Args:
		resFiles: paths of the results CSV files, one per sample.
		sampleNames: labels used as column prefixes, one per file. When
			omitted, the module-level `samples` list is used.

	Returns:
		The filtered pandas DataFrame.

	Raises:
		ValueError: if resFiles is empty or its length differs from the
			number of sample labels.
	"""
	if sampleNames is None:
		#backward-compatible default: the labels defined alongside resFiles
		sampleNames = samples
	resFiles = list(resFiles)
	sampleNames = list(sampleNames)
	if not resFiles:
		raise ValueError("resFiles is empty; at least one results file is required")
	if len(resFiles) != len(sampleNames):
		raise ValueError(
			"got "+str(len(resFiles))+" results files but "
			+str(len(sampleNames))+" sample labels")

	idCols = ["ens","name"]
	statCols = ["baseMean","log2FoldChange","lfcSE","padj"]
	df = None
	for label,path in zip(sampleNames,resFiles):
		if df is None:
			#only the first file needs to carry the gene identifiers
			table = pd.read_csv(path, usecols=idCols+statCols)
			df = table[idCols].copy()
		else:
			table = pd.read_csv(path, usecols=statCols)
		for col in statCols:
			#assignment aligns on the index, i.e. on row position in the file
			df[label+"_"+col] = table[col]

	#remove genes with NA gene names
	df = df[df["name"].notna()]
	#remove rows with zero base mean to remove nan fold change rows but maintain nan padjs
	baseMeanCols = [label+"_baseMean" for label in sampleNames]
	return df[df[baseMeanCols].min(axis=1) > 0]
#build the merged, filtered results table that the plotting code below consumes
dfNoZero=formatResFiles(resFiles)

## Plot volcano plot for GR RIP enrichment

In [None]:
#volcano plot
def volcano(df,time,pCutoff):
	"""Draw a volcano plot (log2 fold change vs. -log10 adjusted p) for one sample.

	Reads the columns "name", "<time>h_log2FoldChange" and "<time>h_padj"
	from df. Genes with padj < pCutoff are coloured "Enriched" (positive fold
	change) or "Depleted" (negative fold change); all others are "Neither".
	The ten positive-fold-change genes with the largest -log10(padj) are
	labelled by name, and the total and enriched gene counts are written in
	the lower-right corner of the axes.

	Side effects: sets the global matplotlib font family to Arial and creates
	a new figure. df itself is not modified.

	Args:
		df: table holding the columns listed above.
		time: time-point string used to build the column names (e.g. "3").
		pCutoff: adjusted p-value threshold for calling a gene significant.

	Returns:
		None.
	"""
	plt.rcParams["font.family"]="Arial"
	fig,ax=plt.subplots(dpi=400, figsize=(3.5,4))
	lfcCol=time+"h_log2FoldChange"
	padjCol=time+"h_padj"
	hueCol="p < "+str(pCutoff)
	yCol="-log10(p-adjusted)"
	#explicit copy: the helper columns added below must not depend on the
	#globally silenced chained-assignment warning or touch the caller's frame
	dfPlot=df[["name",lfcCol,padjCol]].copy()
	#categorize genes based on p-adjusted
	significant=dfPlot[padjCol]<pCutoff
	conditions=[
			significant & (dfPlot[lfcCol]>0),
			significant & (dfPlot[lfcCol]<0),
		]
	#a string default is required: np.select's implicit default is the int 0,
	#which cannot share a dtype with the string labels on recent NumPy.
	#Rows with NaN padj fall through to the default; their y value is NaN,
	#so they are not drawn either way.
	dfPlot[hueCol]=np.select(conditions,["Enriched","Depleted"],default="Neither")
	dfPlot[yCol]=-(np.log10(dfPlot[padjCol]))
	sns.scatterplot(data=dfPlot,x=lfcCol,y=yCol,
			ax=ax,hue=hueCol,hue_order=["Neither","Depleted","Enriched"],
			alpha=1,palette=["#858585","#6389FF","#FF5C5C"],s=15)
	#label the 10 positive-fold-change genes with the smallest adjusted p-value
	topHits=dfPlot[dfPlot[lfcCol]>0].sort_values(by=[yCol],ascending=False).head(10)
	texts=[ax.text(x,y,label)
			for label,x,y in zip(topHits["name"],topHits[lfcCol],topHits[yCol])]
	adjust_text(texts, force_points=0.1, force_text=0.1,
			expand_points=(1, 1), expand_text=(1, 1),
			arrowprops=dict(arrowstyle="-", color='black', lw=0.7))
	#annotate with number of genes and enriched genes
	ngenes=len(dfPlot.index)
	nEnr=int((dfPlot[hueCol]=="Enriched").sum())
	ax.text(0.99, 0.005, "total = "+str(ngenes)+", enriched = "+str(nEnr), horizontalalignment='right',
			size='medium', color='black', weight='normal', transform=ax.transAxes)
	ax.set_xlabel("log2 Fold Change (IP vs. input)")
	ax.set_ylabel("-log10(adjusted p-value)")
	ax.legend(loc="best")
#volcano plot for the "3h" columns of the merged table, using the global cutoff
volcano(dfNoZero,"3",pCutoff)