# **KEGG PATHWAY ENRICHMENT VISUALIZATION** (METABOLOME)

### Metabolome: CNSA, Inspiration4, MHU-3(L+18), MHU-3(R+2)
#### For these plots, the following rules were applied:
#### 1. All pathways with a p-value < 0.05 that were observed in at least two missions are presented.
#### 2. The more missions the pathway is involved in, the higher the position in the plot.
#### 3. Among pathways found in the same number of missions, the one with the larger maximum -log(p) (i.e., the more significant p-value) is placed higher in the plot.

In [None]:
## load packages
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the four enrichment tables, one CSV per dataset.
# The paths are relative, so the files must be in the working directory.
df_CNSA, df_I4, df_MHU3_L18, df_MHU3_R2 = (
    pd.read_csv(csv_path)
    for csv_path in ("CNSA.csv", "Inspiration4.csv", "MHU3_L18.csv", "MHU3_R2.csv")
)

In [None]:
# Gather the four tables in one list so they can be handled together.
df_list = [
    df_CNSA,
    df_I4,
    df_MHU3_L18,
    df_MHU3_R2,
]

In [None]:
# Tag every table with the name of the dataset it came from.
# The frames are modified in place, so the objects held in df_list gain the
# 'Sample' column as well.
for df, sample_name in (
    (df_CNSA, 'CNSA'),
    (df_I4, 'Inspiration4'),
    (df_MHU3_L18, 'MHU3(L+18)'),
    (df_MHU3_R2, 'MHU3(R+2)'),
):
    df['Sample'] = sample_name

In [None]:
# Stack the labelled tables into one long table.
# ignore_index=True gives the result a fresh 0..n-1 index. Without it each
# source table contributes its own 0-based labels, so the combined index
# contains duplicates, which can break label-based selection and alignment
# later on.
df_all = pd.concat(df_list, ignore_index=True)

In [None]:
# Keep only the columns used downstream.
# NOTE(review): '#NAME?' looks like a spreadsheet-mangled header for the
# -log(p) column (it is renamed to '-LogP' afterwards) -- confirm against the
# CSV files.
# .copy() makes kegg an independent frame, so the later column rename and the
# 'Gene Ratio' assignment do not operate on a slice of df_all (this avoids
# pandas' SettingWithCopyWarning).
kegg = df_all[['Sample', 'Description', '#NAME?', 'Hits', 'Background']].copy()

In [None]:
# Give the '#NAME?' column its intended name, '-LogP'.
# Renaming by label (instead of overwriting the whole column list by position)
# means a change in column order or count cannot silently mislabel a column.
# rename() returns a new frame, which is bound back to kegg.
kegg = kegg.rename(columns={'#NAME?': '-LogP'})

In [None]:
# Add the ratio column: Hits divided by Background, element-wise per row,
# stored under the name 'Gene Ratio'.
kegg['Gene Ratio'] = kegg['Hits'].div(kegg['Background'])

In [None]:
# Narrow the table to the four columns needed for plotting, in this order;
# the Hits and Background columns are left behind.
plot_columns = ['Sample', 'Description', '-LogP', 'Gene Ratio']
new_kegg = kegg.loc[:, plot_columns]

In [None]:
# Keep only rows whose -LogP is above 1.3.
# NOTE(review): assuming -LogP is -log10(p), this corresponds to p < ~0.05.
# .copy() detaches the filtered rows from kegg, so the columns added to
# new_kegg afterwards are written to a real frame rather than to a slice
# (this avoids pandas' SettingWithCopyWarning).
new_kegg = new_kegg[new_kegg['-LogP'] > 1.3].copy()

In [None]:
# For every pathway (Description), take its largest -LogP over all samples and
# write that value onto each of the pathway's rows.
logp_per_pathway = new_kegg.groupby('Description')['-LogP']
new_kegg['Max -LogP'] = logp_per_pathway.transform('max')

In [None]:
# Number of distinct samples in which each pathway (Description) appears,
# written onto each of the pathway's rows.
# Counting unique 'Sample' values rather than rows means a pathway listed more
# than once within a single table is not mistaken for one seen in several
# samples.
new_kegg['Count'] = new_kegg.groupby('Description')['Sample'].transform('nunique')

In [None]:
# Retain only the KEGG pathways whose Count exceeds 1, i.e. those detected at
# least twice.
seen_more_than_once = new_kegg['Count'] > 1
new_kegg = new_kegg.loc[seen_more_than_once]

In [None]:
# Bind the filtered table to the name `df`, which the sorting and plotting
# cells use. This is an alias, not a copy: df and new_kegg refer to the same
# object at this point.
df = new_kegg

In [None]:
# Order the rows by Count, then Max -LogP, then Gene Ratio, each from largest
# to smallest.
sort_keys = ['Count', 'Max -LogP', 'Gene Ratio']
df = df.sort_values(by=sort_keys, ascending=False)

In [None]:
# Convert 'Sample' to an ordered categorical with a fixed category order.
sample_order = ['CNSA', 'MHU3(L+18)', 'MHU3(R+2)', 'Inspiration4']
sample_dtype = pd.CategoricalDtype(categories=sample_order, ordered=True)
df['Sample'] = df['Sample'].astype(sample_dtype)

## Metabolome KEGG Pathway Enrichment: Pathways with -Log10 p-value > 1.3 Detected at Least Twice

In [None]:
# Use seaborn's "paper" plotting context for the figure below.
sns.set_context(context="paper")

In [None]:
# Bubble plot of the filtered pathways: 'Sample' on x, 'Description' on y,
# marker size mapped to -LogP and marker colour mapped to Gene Ratio.
fig, ax = plt.subplots(figsize=(6, 5))
sns.scatterplot(data=df, x='Sample', y='Description', size='-LogP', color="black", edgecolor="w",
                hue='Gene Ratio', palette='flare', sizes=(100, 300), legend="brief", ax=ax)

# Build a legend that shows only the -LogP (size) entries.
# The auto-generated entries contain both the hue and the size groups; the
# size group is located by its '-LogP' title entry rather than by fixed
# positions, so the slice stays right if the number of hue entries changes.
# If no such title entry exists, all entries are kept.
legend_handles, legend_labels = ax.get_legend_handles_labels()
first_size_entry = legend_labels.index('-LogP') + 1 if '-LogP' in legend_labels else 0
ax.legend(legend_handles[first_size_entry:], legend_labels[first_size_entry:],
          title='-LogP', bbox_to_anchor=(1.2, 0.8), loc=2, fontsize=14,
          borderaxespad=0., title_fontsize=14)

# Create a custom colorbar for Gene Ratio, using the same colormap name as the
# scatter plot and the data's own min/max as the colour range.
sm = plt.cm.ScalarMappable(cmap='flare', norm=plt.Normalize(vmin=df['Gene Ratio'].min(), vmax=df['Gene Ratio'].max()))
sm.set_array([])
cbar = fig.colorbar(sm, ax=ax, shrink=0.25, pad=0.35)
cbar.ax.tick_params(labelsize=13)
cbar.set_label(label='Enrichment', size=14, labelpad=10)

# Axis labels, limits and ticks.
ax.set_ylabel(None)
ax.set_xlabel(None)
ax.set_xmargin(0.06)
# The y limits span every pathway row (categories sit at 0..n-1) and are
# inverted so the first row of the sorted table is drawn at the top. The upper
# bound is derived from the data instead of being hard-coded for 12 pathways.
n_pathways = df['Description'].nunique()
ax.set_ylim(bottom=n_pathways - 0.5, top=-0.5)
# Tick styling goes through the Axes object, so it does not depend on which
# axes pyplot currently considers active.
ax.tick_params(axis='y', labelsize=13)
ax.xaxis.tick_bottom()
ax.xaxis.set_label_position('top')
ax.tick_params(axis='x', labelsize=13, labelrotation=90)

plt.show()