In [None]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import json

from copy import deepcopy

from tqdm.notebook import tqdm
import os

In [None]:
# Read the supplemental pSNV table; skiprows=1 drops the first spreadsheet row
# (presumably a caption above the real header - verify against the file).
data = pd.read_excel(
    "../supplement/SupplementalTable7_top1000_pSNVs.xlsx",
    skiprows=1,
)
# Display (n_rows, n_columns) as the cell output.
data.shape

In [None]:
# Count how many TERT rows fall on each position
# (the RALY and CDC20 counts follow in the next two cells).
data.loc[data["GENE"].eq("TERT")].value_counts(subset=["POS"])

In [None]:
# Count how many RALY rows fall on each position.
data.loc[data["GENE"].eq("RALY")].value_counts(subset=["POS"])

In [None]:
# Count how many CDC20 rows fall on each position.
data.loc[data["GENE"].eq("CDC20")].value_counts(subset=["POS"])

In [None]:
# Original function to add REMIND-Cancer scores based on the scoring weights.
def _add_remind_cancer_score(
    dataframe: pd.DataFrame,
    remind_cancer_scoring_weights: dict,
    ge_column: str
):
    """Add the REMIND-Cancer score to the DataFrame.

    Args:
        dataframe (pd.DataFrame): Pandas DataFrame containing the data to score.
        remind_cancer_scoring_weights (dict): Dictionary containing the REMIND-Cancer scoring weights.
            Example can be found in REMIND-Cancer/examples/results/configuration_file.json, particularly the last key `REMIND-Cancer_scoring_weights`.
        ge_column (str): Name of the column containing gene expression data.

    Returns:
        pd.DataFrame: DataFrame with the REMIND-Cancer score added.
    """
    # Iterate through each row of the DataFrame and calculate the REMIND-Cancer score.
    for idx, row in dataframe.iterrows():
        # (1) Genomic.
        # Transcription Factors
        created_tf_weight = np.min([
            float(row["num_created_tfs_passing_tf_expression_threshold"]) *
            remind_cancer_scoring_weights["genomic"]["tfbs"]["creation_weight_per_tfbs"],
            remind_cancer_scoring_weights["genomic"]["tfbs"]["creation_weight_maximum"]
        ])

        destroyed_tf_weight = np.min([
            float(row["num_destroyed_tfs_passing_tf_expression_threshold"]) *
            remind_cancer_scoring_weights["genomic"]["tfbs"]["destruction_weight_per_tfbs"],
            remind_cancer_scoring_weights["genomic"]["tfbs"]["destruction_weight_maximum"]
        ])

        # Recurrence
        recurrence_weight = np.min([
            float(row["num_recurrent_mutations"]) *
            remind_cancer_scoring_weights["genomic"]["recurrence"]["weight_per_recurrent_mutation"],
            remind_cancer_scoring_weights["genomic"]["recurrence"]["weight_maximum"]
        ])

        # Purity
        try:
            purity_weight = remind_cancer_scoring_weights["genomic"]["purity"]["purity_weight"] if float(
                row["ICGC_Estimated_Purity"]) >= remind_cancer_scoring_weights["genomic"]["purity"]["purity_threshold_for_weight"] else 0
        except:
            # print(
            #     f"Purity of {row['ICGC_Estimated_Purity']} does not work. Setting score to 0.")
            purity_weight = 0

        # Allele Frequency
        try:
            af_weight = remind_cancer_scoring_weights["genomic"]["allele_frequency"]["af_weight"] if float(
                row["allele_frequency"]) >= remind_cancer_scoring_weights["genomic"]["allele_frequency"]["af_threshold_for_weight"] else 0
        except:
            # print(
            #     f"Allele frequency of {row['allele_frequency']} does not work. Setting score to 0.")
            af_weight = 0

        # (2) Transcriptomic
        # Gene Expression
        gene_expression_weight = np.min([
            float(row[ge_column]) *
            remind_cancer_scoring_weights["transcriptomic"]["gene_expression"]["weight_per_unit_of_expression"],
            remind_cancer_scoring_weights["transcriptomic"]["gene_expression"]["weight_maximum"]
        ])

        # (3) Annotations
        # CGC
        cgc_weight = bool(row["within_cgc_list"]) * \
            remind_cancer_scoring_weights["annotations"]["cgc"]["weight"]

        # Open Chromatin
        open_chromatin_weight = bool(
            row["open_chromatin"]) * remind_cancer_scoring_weights["annotations"]["open_chromatin"]["weight"]

        # Create the final score by summing all the weights.
        # Round the score to 2 decimal places.
        # Assign the score to the "score_v2" column in the DataFrame.
        dataframe.loc[idx, "score_v2"] = np.round(
            np.sum([
                created_tf_weight,
                destroyed_tf_weight,
                recurrence_weight,
                purity_weight,
                af_weight,
                gene_expression_weight,
                cgc_weight,
                open_chromatin_weight
            ]), 2)

    # Sort the DataFrame by the "score_v2" column in descending order.
    dataframe = dataframe.sort_values("score_v2", ascending=False)

    return dataframe

# Load the scoring weights from the example configuration file.
# JSON is read explicitly as UTF-8 so the result does not depend on the
# platform's default locale encoding.
with open("../../examples/results/configuration_file.json", "r", encoding="utf-8") as f:
    scoring_weights = json.load(f)
    # Only the scoring-weights section of the configuration is used below.
    scoring_weights = scoring_weights["REMIND-Cancer_scoring_weights"]

# The median, mean, min, and max ranks for each gene and position are computed in the weight-iteration cell further below.



In [None]:
# Weight mapping: one entry per scoring weight that is varied in the analysis.
# Each entry is keyed by a display name (e.g. "Recurrence Weight") and holds:
#   - `attribute`: the key path into `scoring_weights` that selects the weight,
#        Ex: ["genomic", "recurrence", "weight_per_recurrent_mutation"] refers to
#            scoring_weights["genomic"]["recurrence"]["weight_per_recurrent_mutation"]
#   - `range`: the values the weight is swept over,
#        Ex: np.linspace(1, 100, 100) gives 100 evenly spaced points from 1 to 100
#   - `original_weight`: the value currently stored in `scoring_weights` at that path.
#
# Every sweep starts at 1 and uses 100 points; only the upper bound differs,
# so each row below is (display name, key path, upper bound of the sweep).
weight_mapping = {
    label: {
        "attribute": list(key_path),
        "range": np.linspace(1, upper_bound, 100),
        "original_weight": scoring_weights[key_path[0]][key_path[1]][key_path[2]],
    }
    for label, key_path, upper_bound in (
        ("Recurrence Weight",
         ("genomic", "recurrence", "weight_per_recurrent_mutation"), 100),
        ("Gene Expression Weight",
         ("transcriptomic", "gene_expression", "weight_per_unit_of_expression"), 100),
        ("Purity Weight",
         ("genomic", "purity", "purity_weight"), 100),
        ("Transcription Factor Creation Weight",
         ("genomic", "tfbs", "creation_weight_per_tfbs"), 20),
        ("Transcription Factor Destruction Weight",
         ("genomic", "tfbs", "destruction_weight_per_tfbs"), 20),
        ("Allele Frequency Weight",
         ("genomic", "allele_frequency", "af_weight"), 50),
        ("CGC Weight",
         ("annotations", "cgc", "weight"), 50),
        ("Open Chromatin Weight",
         ("annotations", "open_chromatin", "weight"), 50),
    )
}


In [None]:
# For each gene and position, plot the four graphs in a plotly subplot.
import plotly.subplots as sp

def create_subplots(
    medians: dict,
    means: dict,
    mins: dict,
    maxes: dict,
    name: str,
    gene: str,
    position: int,
    output_dir="./plots"
):
    """Create a 2x2 figure of rank statistics for one gene/position and save it as HTML.

    The four panels show the mean, median, min and max rank as a function of
    the varied weight. Each panel gets one dashed vertical line marking the
    weight's original value.

    Besides its arguments, the function reads two module-level objects:
    ``weight_mapping`` (x-axis values and original weight for ``name``) and
    ``data`` (to report the number of matching rows in the title).

    Args:
        medians (dict): A dictionary containing the median ranks for each gene and position.
        means (dict): A dictionary containing the mean ranks for each gene and position.
        mins (dict): A dictionary containing the minimum ranks for each gene and position.
        maxes (dict): A dictionary containing the maximum ranks for each gene and position.
        name (str): The name of the weight being varied (a key of ``weight_mapping``).
        gene (str): The gene being analyzed.
        position (int): The position of the gene being analyzed.
        output_dir (str, optional): The directory to save the plots. Defaults to "./plots".

    Returns:
        None. The figure is written to
        ``<output_dir>/<name with spaces as underscores>_<gene>_<position>_score_iteration.html``.
    """
    # Ensure the output directory exists.
    if not os.path.exists(output_dir):
        os.makedirs(output_dir, exist_ok=True)
        print(f"Created output directory: {os.path.abspath(output_dir)}")

    weight_values = weight_mapping[name]["range"]
    original_weight = weight_mapping[name]["original_weight"]

    # One entry per panel: (statistic label, rank dictionary, colour, row, col).
    panels = [
        ("Mean", means, "blue", 1, 1),
        ("Median", medians, "orange", 1, 2),
        ("Min", mins, "green", 2, 1),
        ("Max", maxes, "red", 2, 2),
    ]

    # Define the 2x2 grid for the subplots.
    fig = sp.make_subplots(
        rows=2, cols=2,
        subplot_titles=[f"{label} Rank" for label, _, _, _, _ in panels],
    )

    for label, ranks, color, row, col in panels:
        # Plot the statistic in its own panel.
        fig.add_trace(
            go.Scatter(
                x=weight_values,
                y=ranks[gene][position],
                mode="lines",
                name=f"{gene} {position} {label} Rank",
                marker=dict(size=10, color=color)
            ),
            row=row, col=col
        )
        # Also, add a vertical line for the original weight.
        # row/col are passed explicitly: without them add_vline targets every
        # non-empty subplot, which stacked several identical lines per panel.
        fig.add_vline(
            x=original_weight,
            line=dict(color="black", width=2, dash="dash"),
            row=row, col=col
        )

    # n = number of rows in `data` matching this gene and position.
    fig.update_layout(
        title=f"Iterating through <b>{name}</b> Scoring for {gene} at position {position} (n={data[(data['GENE'] == gene) & (data['POS'] == position)].shape[0]})",
        showlegend=True
    )
    # Add a common x-axis label (bottom row only).
    fig.update_xaxes(title_text=name, row=2, col=1)
    fig.update_xaxes(title_text=name, row=2, col=2)
    # Add a common y-axis label (left column only).
    fig.update_yaxes(title_text="Rank", row=1, col=1)
    fig.update_yaxes(title_text="Rank", row=2, col=1)

    fig.write_html(os.path.join(output_dir, f"{name.replace(' ', '_')}_{gene}_{position}_score_iteration.html"))


In [None]:
# Sensitivity analysis: vary one scoring weight at a time (every entry of
# `weight_mapping`) and record how the ranks of the mutations of interest change.

# Load in the original scoring weights. These will be changed for each iteration.
with open("../../examples/results/configuration_file.json", "r", encoding="utf-8") as f:
    scoring_weights = json.load(f)
    scoring_weights = scoring_weights["REMIND-Cancer_scoring_weights"]

# Genes and positions of interest.
targets = [
    ("TERT", 1295228),
    ("TERT", 1295250),
    ("RALY", 32580927),
    ("CDC20", 43824529),
]

# Iterate through the weight mapping and create subplots for each weight.
for name in weight_mapping:
    # Rank statistics per weight value, stored as stats[gene][position] -> list.
    means, medians, mins, maxes = {}, {}, {}, {}
    for gene, position in targets:
        for stats in (means, medians, mins, maxes):
            stats.setdefault(gene, {})[position] = []

    # Key path of the weight being varied.
    # Ex: ["genomic", "recurrence", "weight_per_recurrent_mutation"]
    attribute_path = weight_mapping[name]["attribute"]

    # Iterate through the weight range. The re-scoring does not depend on the
    # gene/position, so it is done once per weight value and shared by all targets.
    for x in tqdm(weight_mapping[name]["range"], desc=name):
        # Create a temporary copy of the scoring weights to modify.
        scoring_weights_temp = deepcopy(scoring_weights)

        # Update the scoring weights with the current x value.
        scoring_weights_temp[attribute_path[0]][attribute_path[1]][attribute_path[2]] = x

        # Calculate the scores using the modified scoring weights.
        data_temp = _add_remind_cancer_score(
            dataframe=data.copy(),
            remind_cancer_scoring_weights=scoring_weights_temp,
            ge_column="FPKM_Z_score"
        )

        # Sort by the new `score_v2`. The scoring function already sorts; this
        # second sort is kept so tie ordering (and hence ranks) matches earlier runs.
        data_temp = data_temp.sort_values("score_v2", ascending=False, inplace=False)

        # Reset the index so that it holds the (0-based) ranks.
        data_temp = data_temp.reset_index(drop=True)

        for gene, position in targets:
            # The index of the matching rows is their rank.
            is_target = (data_temp["GENE"] == gene) & (data_temp["POS"] == position)
            ranks = data_temp[is_target].index.to_numpy()
            if ranks.size == 0:
                raise ValueError(
                    f"No rows found for {gene} at position {position}; cannot compute rank statistics."
                )

            # Names such as `min`/`max` are deliberately avoided here: assigning
            # them at notebook top level would shadow the Python builtins.
            means[gene][position].append(np.mean(ranks))
            medians[gene][position].append(np.median(ranks))
            mins[gene][position].append(np.min(ranks))
            maxes[gene][position].append(np.max(ranks))

    # Create the subplots for each gene and position.
    # These should be saved within the `plots` directory.
    for gene, position in targets:
        create_subplots(medians, means, mins, maxes, name, gene, position, output_dir="./plots")
