# Imports and functions

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
import os

# Location of the per-pathway results table. The default is the original
# absolute path, which is machine-specific (the recorded run of this cell
# failed with FileNotFoundError for it). Set the RESULTS_SUMMARY_CSV
# environment variable to point at a copy of the file elsewhere.
RESULTS_SUMMARY_CSV = os.environ.get(
    'RESULTS_SUMMARY_CSV',
    '/projects/synsight/repos/phenospace/normalisation/publication/pathways/data/results_summary.csv',
)
df = pd.read_csv(RESULTS_SUMMARY_CSV)

FileNotFoundError: [Errno 2] No such file or directory: '/projects/synsight/repos/phenospace/normalisation/publication/pathways/data/results_summary.csv'

In [None]:
# Drop the NOTCH_SIGNALING row(s). The explicit copy detaches the result from
# `df`, so adding columns to it later (analyze_results adds a bin column)
# does not raise pandas' SettingWithCopyWarning.
results_df = df[df["Pathway"] != 'NOTCH_SIGNALING'].copy()


In [None]:
# Preview the first two rows of the table after the NOTCH_SIGNALING filter.
results_df.head(2)

In [None]:
def plot_results(results_df):
    """Show two grouped bar charts comparing per-pathway metrics.

    The rows are ordered by the "Pathway" column and two figures are
    displayed one after the other:

    1. "Random mAP" next to "mAP".
    2. "mEF" next to "Max EF".

    Args:
        results_df: DataFrame with the columns "Pathway", "Random mAP",
            "mAP", "mEF" and "Max EF". It is not modified; sorting is done
            on a new frame.
    """
    ordered = results_df.sort_values("Pathway")

    # One bar slot per row; each pair of bars straddles its slot centre.
    positions = np.arange(len(ordered["Pathway"]))
    bar_width = 0.35
    half_width = bar_width / 2

    # Each panel: title, y-axis label, and (column, extra bar kwargs) for the
    # left and right bar of every pair. The first panel keeps matplotlib's
    # default colours; the second sets them explicitly.
    panels = (
        (
            "Comparison of Random mAP vs mAP by Pathway",
            "mAP",
            (("Random mAP", {}), ("mAP", {})),
        ),
        (
            "Comparison of EF vs Max EF by Pathway",
            "EF",
            (("mEF", {"color": 'orange'}), ("Max EF", {"color": 'green'})),
        ),
    )

    for title, y_label, bar_specs in panels:
        fig, ax = plt.subplots(figsize=(12, 6))
        for shift, (column, extra) in zip((-half_width, half_width), bar_specs):
            ax.bar(positions + shift, ordered[column], bar_width,
                   label=column, alpha=0.7, **extra)

        ax.set_title(title, fontsize=14)
        ax.set_xlabel("Pathways", fontsize=12)
        ax.set_ylabel(y_label, fontsize=12)
        ax.set_xticks(positions)
        ax.set_xticklabels(ordered["Pathway"], rotation=45, ha="right")
        ax.legend()
        ax.grid(axis="y", linestyle="--", alpha=0.7)
        plt.tight_layout()
        plt.show()




def _scatter_map_against(results_df, x_col, title):
    """Scatter "mAP" and "Random mAP" against ``x_col`` in one labelled figure."""
    plt.figure(figsize=(10, 6))
    # Explicit labels give each overlaid series its own legend entry;
    # without them the two point clouds cannot be told apart.
    sns.scatterplot(x=x_col, y='mAP', data=results_df, label='mAP')
    sns.scatterplot(x=x_col, y='Random mAP', data=results_df, label='Random mAP')
    plt.title(title)
    plt.xlabel(x_col)
    plt.ylabel("mAP")
    plt.legend()
    plt.show()


def analyze_results(results_df):
    """Show a series of exploratory plots for the per-pathway results table.

    Figures are displayed in this order:

    1. Histogram (with KDE) of "Number of Genes".
    2. "mAP" and "Random mAP" against "Number of Genes".
    3. "mAP" and "Random mAP" against "Number of Molecules".
    4. "mAP" and "Random mAP" against "Number of Impacted Genes".
    5. Box plots of "mAP" and "Random mAP" per quartile bin of
       "Number of Impacted Genes", drawn side by side.
    6. "Number of Impacted Genes" against "Number of Molecules".

    Args:
        results_df: DataFrame with the columns "Number of Genes",
            "Number of Molecules", "Number of Impacted Genes", "mAP" and
            "Random mAP". It is left untouched: the bin column needed for
            the box plot is added to a private copy, so the caller's frame
            gains no extra column and slices of another frame do not
            trigger SettingWithCopyWarning.
    """
    # 1. Distribution of Number of Genes in Pathways
    plt.figure(figsize=(10, 6))
    sns.histplot(results_df['Number of Genes'], bins=20, kde=True, color='green')
    plt.title("Distribution of Number of Genes in Pathways")
    plt.xlabel("Number of Genes")
    plt.ylabel("Frequency")
    plt.show()

    # 2-4. mAP and Random mAP against each size-related column
    _scatter_map_against(results_df, 'Number of Genes', "mAP vs Number of Genes")
    _scatter_map_against(results_df, 'Number of Molecules', "mAP vs Number of Molecules")
    _scatter_map_against(
        results_df, 'Number of Impacted Genes', "Number of Impacted Genes vs mAP"
    )

    # 5. Boxplot of mAP grouped by Number of Impacted Genes (binned)
    binned = results_df.copy()
    # duplicates='drop' merges bins whose quantile edges coincide, which
    # would otherwise make qcut raise ValueError on tied values.
    binned['Impacted Genes Bin'] = pd.qcut(
        binned['Number of Impacted Genes'], q=4, duplicates='drop'
    )
    # Long form lets seaborn place the two metrics next to each other via
    # `hue` instead of painting one set of boxes over the other.
    long_form = binned.melt(
        id_vars='Impacted Genes Bin',
        value_vars=['Random mAP', 'mAP'],
        var_name='Metric',
        value_name='Score',
    )
    plt.figure(figsize=(10, 6))
    sns.boxplot(x='Impacted Genes Bin', y='Score', hue='Metric', data=long_form)
    plt.title("mAP by Impacted Genes (Binned)")
    plt.xlabel("Impacted Genes Bin")
    plt.ylabel("mAP")
    plt.show()

    # 6. Scatter plot of Number of Molecules vs Impacted Genes
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x='Number of Molecules', y='Number of Impacted Genes', data=results_df)
    plt.title("Number of Molecules vs Impacted Genes")
    plt.xlabel("Number of Molecules")
    plt.ylabel("Number of Impacted Genes")
    plt.show()


# Results

## Raw (all pathways except NOTCH_SIGNALING)

In [None]:

# Exploratory plots (histogram, scatter plots, binned box plot) for every
# row of results_df, i.e. all pathways except NOTCH_SIGNALING.
analyze_results(results_df)


In [None]:
# Grouped bar charts (Random mAP vs mAP, mEF vs Max EF) for the same rows.
plot_results(results_df)

## Filtered (Number of Impacted Genes >= 10)

In [None]:
# Keep only rows with at least 10 impacted genes. The explicit copy detaches
# the result from results_df, so adding columns to it later (analyze_results
# adds a bin column) does not raise pandas' SettingWithCopyWarning.
df_filtered = results_df[results_df['Number of Impacted Genes'] >= 10].copy()

In [None]:
# Number of rows that remain after the impacted-genes filter.
len(df_filtered)

In [None]:
# Same exploratory plots, restricted to rows with at least 10 impacted genes.
analyze_results(df_filtered)

In [None]:
# Same grouped bar charts, restricted to rows with at least 10 impacted genes.
plot_results(df_filtered)