# **Gene Ontology/Pathway Enrichment Analysis**

In [33]:
pip install gseapy

Collecting gseapy
  Downloading gseapy-1.1.8-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading gseapy-1.1.8-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (590 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m590.8/590.8 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gseapy
Successfully installed gseapy-1.1.8


In [1]:
# Mount Google Drive at /content/drive so the files stored under "My Drive"
# (the genes.csv read in the next cell) are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd

# Location of the CNV gene table on the mounted Google Drive.
file_path = '/content/drive/My Drive/Colab Notebooks/Computational Stem Cells/genes.csv'

# Only the 'region' and 'genes' columns are needed; every other column in the
# file is skipped at read time.
df = pd.read_csv(file_path, usecols=['region', 'genes'])

# Each 'genes' cell is one comma-separated string; turn it into a list of IDs.
df = df.assign(genes=df['genes'].str.split(','))

# Preview the parsed table.
df.head()


Unnamed: 0,region,genes
0,chr1:356138-7827342 (CN 4),"[ENSG00000236601, ENSG00000237094, ENSG0000023..."
1,chr1:1659027-13516270 (CN 0),"[ENSG00000272004, ENSG00000269737, ENSG0000023..."


In [5]:
# Trim surrounding whitespace from every gene ID. Rows whose 'genes' value is
# not a list (e.g. a missing value left untouched by the split) pass through as-is.
d = df["genes"].map(
    lambda ids: [gene_id.strip() for gene_id in ids] if isinstance(ids, list) else ids
)

In [47]:
# Gene-ID lists for the first two CNV regions (index labels 0 and 1 of `d`).
cnv1, cnv2 = d[0], d[1]

# Note the order: the second region is printed before the first.
for gene_ids in (cnv2, cnv1):
    print(gene_ids)

['ENSG00000272004', 'ENSG00000269737', 'ENSG00000233542', 'ENSG00000271806', 'ENSG00000234396', 'ENSG00000224387', 'ENSG00000229393', 'ENSG00000272449', 'ENSG00000228037', 'ENSG00000233234', 'ENSG00000231630', 'ENSG00000226286', 'ENSG00000272235', 'ENSG00000272088', 'ENSG00000238260', 'ENSG00000272153', 'ENSG00000227169', 'ENSG00000260972', 'ENSG00000271746', 'ENSG00000231868...']
['ENSG00000236601', 'ENSG00000237094', 'ENSG00000230021', 'ENSG00000235146', 'ENSG00000229905', 'ENSG00000272438', 'ENSG00000272512', 'ENSG00000224969', 'ENSG00000273443', 'ENSG00000272141', 'ENSG00000260179', 'ENSG00000272106', 'ENSG00000272004', 'ENSG00000269737', 'ENSG00000233542', 'ENSG00000271806', 'ENSG00000234396', 'ENSG00000224387', 'ENSG00000229393', 'ENSG00000272449...']


Looking at these Ensembl gene IDs, we found that they are variants of the pseudogene RNU6 and of the non-coding RNA genes RP11 and RP3.

In [45]:
# Gene names submitted for enrichment, keyed by CNV label:
# one name for cnv1, two for cnv2.
cnv_dict = dict(
    cnv1=["RNU6"],
    cnv2=["RP11", "RP3"],
)

In [46]:
import gseapy as gp
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np


# Pathway enrichment for the CNV gene lists in `cnv_dict`:
#   1. query Enrichr (KEGG + Reactome) once per CNV,
#   2. keep pathways with Adjusted P-value < 0.05,
#   3. draw a pathway x CNV heatmap of -log10(Adjusted P-value).
#
# NOTE(review): the loop variable and the 'Cell Type' column/axis label actually
# hold the CNV key from `cnv_dict` ('cnv1', 'cnv2'), not a cell type. The names
# are kept unchanged here in case later cells rely on that column.
# NOTE(review): `cnv_dict` appears to contain gene-family prefixes rather than
# individual gene symbols -- confirm Enrichr can match them; TODO verify.
pathway_results = []

for cell_type, genes in cnv_dict.items():
    print(f"Running pathway enrichment analysis for {cell_type}...")

    try:
        # Perform pathway enrichment analysis (KEGG, Reactome)
        pathway_enrichment = gp.enrichr(
            gene_list=genes,
            gene_sets=['KEGG_2021_Human', 'Reactome_2022'],
            organism='Human',  # Replace with 'Mouse' if working with mouse data
            outdir=None,
        )

        # Tag every result row with the CNV it came from
        pathway_df = pathway_enrichment.results
        pathway_df['Cell Type'] = cell_type
        pathway_results.append(pathway_df)

    except Exception as e:
        # Best-effort: report the failure for this CNV and keep going so one
        # failed request does not discard the other CNVs' results.
        print(f"Error processing {cell_type}: {e}")

# Combine results for all CNVs. pd.concat raises ValueError on an empty list,
# which happens when every request above failed; fall back to an empty frame
# with the columns used below so the cell ends in the "no results" branch
# instead of crashing.
if pathway_results:
    combined_results = pd.concat(pathway_results)
else:
    combined_results = pd.DataFrame({
        'Term': pd.Series(dtype='object'),
        'Adjusted P-value': pd.Series(dtype='float64'),
        'Cell Type': pd.Series(dtype='object'),
    })

# Filter for significant pathways (e.g., Adjusted P-value < 0.05)
significant_results = combined_results[combined_results['Adjusted P-value'] < 0.05]

# Pivot the data for visualization
pivot_table = significant_results.pivot_table(
    index='Term',  # Pathway terms
    columns='Cell Type',  # CNV labels (see NOTE above)
    values='Adjusted P-value',  # Use Adjusted P-value for significance
    aggfunc='min'  # Use the minimum Adjusted P-value for each pathway
)

# A pathway missing for a CNV gets p = 1, which maps to -log10(1) = 0 in the heatmap
pivot_table = pivot_table.fillna(1)

# Create a heatmap
if not pivot_table.empty:
    plt.figure(figsize=(10, 8))
    sns.heatmap(
        -np.log10(pivot_table),
        cmap='viridis',
        linewidths=0.5,
        annot=True,
        fmt=".2f",
        cbar_kws={'label': '-log10(Adjusted P-value)'}
    )
    plt.title("Pathway Enrichment Analysis for CNVs")
    plt.xlabel("Cell Type")
    plt.ylabel("Pathway")
    plt.tight_layout()
    plt.show()
else:
    print("No significant enrichment results to display.")


Running pathway enrichment analysis for cnv1...
Running pathway enrichment analysis for cnv2...
Error processing cnv2: Error sending gene list, try again later
No significant enrichment results to display.
