In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import json

from copy import deepcopy

from tqdm.notebook import tqdm
import os

In [2]:
# NOTE(review): `uv add` is invoked here without any package name, so the command
# exits with a usage error (see this cell's output). Supply the intended package(s)
# or remove the cell; as written it adds nothing to the environment.
!uv add 

[1m[31merror:[0m the following required arguments were not provided:
  [32m<PACKAGES|--requirements <REQUIREMENTS>>[0m

[1m[32mUsage:[0m [1m[36muv add[0m [36m<PACKAGES|--requirements <REQUIREMENTS>>[0m

For more information, try '[1m[36m--help[0m'.


In [3]:
# Read the supplemental table into `data`; the first row of the sheet is skipped
# so that the second row is used as the header.
data = pd.read_excel(
    "../supplement/SupplementalTable7_top1000_pSNVs.xlsx",
    skiprows=1,
)
data.shape

(1000, 29)

In [4]:
# Count how many rows of `data` fall on each TERT position
# (the same check is repeated for RALY and CDC20 in the next cells).
data.loc[data["GENE"].eq("TERT")].value_counts(subset=["POS"])

POS    
1295228    28
1295250     7
1294883     1
1295149     1
Name: count, dtype: int64

In [5]:
# Count how many rows of `data` fall on each RALY position.
data.loc[data["GENE"].eq("RALY")].value_counts(subset=["POS"])

POS     
32580927    4
32580916    1
Name: count, dtype: int64

In [6]:
# Count how many rows of `data` fall on each CDC20 position.
data.loc[data["GENE"].eq("CDC20")].value_counts(subset=["POS"])

POS     
43824528    1
43824529    1
Name: count, dtype: int64

In [7]:
# Original function to add REMIND-Cancer scores based on the scoring weights.
def _add_remind_cancer_score(
    dataframe: pd.DataFrame,
    remind_cancer_scoring_weights: dict,
    ge_column: str
):
    """Add the REMIND-Cancer score to the DataFrame.

    Args:
        dataframe (pd.DataFrame): Pandas DataFrame containing the data to score.
        remind_cancer_scoring_weights (dict): Dictionary containing the REMIND-Cancer scoring weights.
            Example can be found in REMIND-Cancer/examples/results/configuration_file.json, particularly the last key `REMIND-Cancer_scoring_weights`.
        ge_column (str): Name of the column containing gene expression data.

    Returns:
        pd.DataFrame: DataFrame with the REMIND-Cancer score added.
    """
    # Iterate through each row of the DataFrame and calculate the REMIND-Cancer score.
    for idx, row in dataframe.iterrows():
        # (1) Genomic.
        # Transcription Factors
        created_tf_weight = np.min([
            float(row["num_created_tfs_passing_tf_expression_threshold"]) *
            remind_cancer_scoring_weights["genomic"]["tfbs"]["creation_weight_per_tfbs"],
            remind_cancer_scoring_weights["genomic"]["tfbs"]["creation_weight_maximum"]
        ])

        destroyed_tf_weight = np.min([
            float(row["num_destroyed_tfs_passing_tf_expression_threshold"]) *
            remind_cancer_scoring_weights["genomic"]["tfbs"]["destruction_weight_per_tfbs"],
            remind_cancer_scoring_weights["genomic"]["tfbs"]["destruction_weight_maximum"]
        ])

        # Recurrence
        recurrence_weight = np.min([
            float(row["num_recurrent_mutations"]) *
            remind_cancer_scoring_weights["genomic"]["recurrence"]["weight_per_recurrent_mutation"],
            remind_cancer_scoring_weights["genomic"]["recurrence"]["weight_maximum"]
        ])

        # Purity
        try:
            purity_weight = remind_cancer_scoring_weights["genomic"]["purity"]["purity_weight"] if float(
                row["ICGC_Estimated_Purity"]) >= remind_cancer_scoring_weights["genomic"]["purity"]["purity_threshold_for_weight"] else 0
        except:
            # print(
            #     f"Purity of {row['ICGC_Estimated_Purity']} does not work. Setting score to 0.")
            purity_weight = 0

        # Allele Frequency
        try:
            af_weight = remind_cancer_scoring_weights["genomic"]["allele_frequency"]["af_weight"] if float(
                row["allele_frequency"]) >= remind_cancer_scoring_weights["genomic"]["allele_frequency"]["af_threshold_for_weight"] else 0
        except:
            # print(
            #     f"Allele frequency of {row['allele_frequency']} does not work. Setting score to 0.")
            af_weight = 0

        # (2) Transcriptomic
        # Gene Expression
        gene_expression_weight = np.min([
            float(row[ge_column]) *
            remind_cancer_scoring_weights["transcriptomic"]["gene_expression"]["weight_per_unit_of_expression"],
            remind_cancer_scoring_weights["transcriptomic"]["gene_expression"]["weight_maximum"]
        ])

        # (3) Annotations
        # CGC
        cgc_weight = bool(row["within_cgc_list"]) * \
            remind_cancer_scoring_weights["annotations"]["cgc"]["weight"]

        # Open Chromatin
        open_chromatin_weight = bool(
            row["open_chromatin"]) * remind_cancer_scoring_weights["annotations"]["open_chromatin"]["weight"]

        # Create the final score by summing all the weights.
        # Round the score to 2 decimal places.
        # Assign the score to the "score_v2" column in the DataFrame.
        dataframe.loc[idx, "score_v2"] = np.round(
            np.sum([
                created_tf_weight,
                destroyed_tf_weight,
                recurrence_weight,
                purity_weight,
                af_weight,
                gene_expression_weight,
                cgc_weight,
                open_chromatin_weight
            ]), 2)

    # Sort the DataFrame by the "score_v2" column in descending order.
    dataframe = dataframe.sort_values("score_v2", ascending=False)

    return dataframe

# Read the example configuration and keep only its REMIND-Cancer scoring weights.
with open("../../examples/results/configuration_file.json", "r") as f:
    scoring_weights = json.load(f)["REMIND-Cancer_scoring_weights"]

# Get the median, mean, min, and max ranks for each gene and position



In [8]:
# Sweep configuration for every tunable scoring weight.
# Each entry of `weight_mapping` is keyed by a display name and holds:
#   - `attribute`: the three nested keys locating the weight inside `scoring_weights`,
#        e.g. ["genomic", "recurrence", "weight_per_recurrent_mutation"] addresses
#        scoring_weights["genomic"]["recurrence"]["weight_per_recurrent_mutation"]
#   - `range`: the candidate values to sweep (100 evenly spaced points from 1 to the
#        listed upper bound, via np.linspace)
#   - `original_weight`: the value currently stored in `scoring_weights` at that path

# (display name, key path into `scoring_weights`, upper bound of the sweep)
_weight_sweeps = [
    ("Recurrence Weight", ["genomic", "recurrence", "weight_per_recurrent_mutation"], 100),
    ("Gene Expression Weight", ["transcriptomic", "gene_expression", "weight_per_unit_of_expression"], 100),
    ("Purity Weight", ["genomic", "purity", "purity_weight"], 100),
    ("Transcription Factor Creation Weight", ["genomic", "tfbs", "creation_weight_per_tfbs"], 20),
    ("Transcription Factor Destruction Weight", ["genomic", "tfbs", "destruction_weight_per_tfbs"], 20),
    ("Allele Frequency Weight", ["genomic", "allele_frequency", "af_weight"], 50),
    ("CGC Weight", ["annotations", "cgc", "weight"], 50),
    ("Open Chromatin Weight", ["annotations", "open_chromatin", "weight"], 50),
]

weight_mapping = {
    label: {
        "attribute": key_path,
        "range": np.linspace(1, sweep_max, 100),
        "original_weight": scoring_weights[key_path[0]][key_path[1]][key_path[2]],
    }
    for label, key_path, sweep_max in _weight_sweeps
}


In [9]:
from plotly.subplots import make_subplots

In [10]:
import math
def round_up_nice(x, base=50):
    """Round ``x`` up to the nearest multiple of ``base``.

    Args:
        x: Number to round.
        base: Step size of the multiples (defaults to 50).

    Returns:
        The smallest multiple of ``base`` that is greater than or equal to ``x``.
    """
    multiples_needed = math.ceil(x / base)
    return multiples_needed * base

def _create_min_and_median_plots(
    medians,
    mins,
    name,
    colors,
    weight_mapping,
):
    """Plot the min-rank and median-rank curves for one scoring weight side by side.

    The left panel ("Min Rank") shows ``mins`` and the right panel ("Median Rank")
    shows ``medians``; each gene/position gets one line per panel, drawn against the
    swept weight values. A dashed vertical line marks the original weight in both
    panels, and both panels share the same y-range.

    Args:
        medians (dict): ``{gene: {position: sequence}}`` with one median rank per value
            of ``weight_mapping[name]["range"]``. Its keys determine which lines are drawn.
        mins (dict): Same layout as ``medians`` holding the minimum ranks; must contain
            every gene/position present in ``medians``.
        name (str): Key into ``weight_mapping``; also used for the figure title and
            the x-axis label.
        colors (dict): ``{gene: {position: color}}`` line colour per gene/position.
        weight_mapping (dict): Mapping whose ``[name]`` entry provides ``"range"``
            (x values) and ``"original_weight"`` (position of the vertical marker).

    Returns:
        The plotly figure containing the two subplots.
    """
    # Panel order defines both the subplot titles and which data goes in which column,
    # so a title can never end up above the other panel's data.
    panels = (("Min Rank", mins), ("Median Rank", medians))

    fig = make_subplots(
        rows=1, cols=2,
        subplot_titles=[title for title, _ in panels],
    )

    weight_values = weight_mapping[name]["range"]

    for gene_name in medians:
        for position in medians[gene_name]:
            for col, (_, series) in enumerate(panels, start=1):
                fig.add_trace(
                    go.Scatter(
                        x=weight_values,
                        y=series[gene_name][position],
                        mode="lines",
                        name=f"{gene_name} {position}",
                        # Traces are lines only, so the colour is set on the line.
                        line=dict(color=colors[gene_name][position]),
                        # One legend entry per gene/position, not one per panel.
                        showlegend=(col == 1),
                    ),
                    row=1, col=col
                )

    # Also, add a vertical line for the original weight in both panels.
    for col in (1, 2):
        fig.add_vline(
            x=weight_mapping[name]["original_weight"],
            line=dict(color="black", width=2, dash="dash"),
            row=1, col=col
        )

    # Largest median over all lines. NumPy reductions are used instead of the
    # builtins, whose names notebook-level code can rebind.
    median_peaks = [
        np.max(medians[gene_name][position])
        for gene_name in medians
        for position in medians[gene_name]
        if len(medians[gene_name][position]) > 0
    ]

    # Round up to the nearest 50; fall back to 50 when there is nothing to plot or the
    # peak is 0, so the axis range is never degenerate.
    peak_median = np.max(median_peaks) if median_peaks else 0
    max_yaxis = round_up_nice(peak_median, 50) or 50

    # Without row/col these apply to every subplot, i.e. to both panels.
    fig.update_yaxes(range=[0, max_yaxis], title_text="Rank")
    fig.update_xaxes(title_text=name)

    fig.update_layout(
        title = f"{name}: Gene Rank Distributions"
    )

    return fig

In [None]:
from plotly.subplots import make_subplots

In [11]:
# Sensitivity sweep: for every scoring weight in `weight_mapping`, vary that single
# weight over its range, re-score and re-rank all rows of `data`, and record the
# rank statistics of the mutations of interest.

# Line colour per (gene, position) of interest. This mapping is also the single
# source of truth for which mutations are tracked below.
colors = {
    "TERT": {
        1295228: "red",
        1295250: "blue"
    },
    "RALY": {
        32580927: "green"
    },
    "CDC20": {
        43824529: "yellow"
    }
}
targets = [
    (gene, position)
    for gene, positions in colors.items()
    for position in positions
]

# Load in the original scoring weights. They are never modified; each iteration
# works on a deep copy.
with open("../../examples/results/configuration_file.json", "r") as f:
    scoring_weights = json.load(f)["REMIND-Cancer_scoring_weights"]

# Iterate through the weight mapping and create the subplots for each weight.
figs = []
for name in weight_mapping:
    # Rank statistics per gene and position, one value per swept weight value.
    means = {gene: {position: [] for position in positions} for gene, positions in colors.items()}
    medians = {gene: {position: [] for position in positions} for gene, positions in colors.items()}
    mins = {gene: {position: [] for position in positions} for gene, positions in colors.items()}
    maxes = {gene: {position: [] for position in positions} for gene, positions in colors.items()}

    # Nested keys locating the weight being varied,
    # e.g. ("genomic", "recurrence", "weight_per_recurrent_mutation").
    section, group, key = weight_mapping[name]["attribute"]

    # Re-scoring does not depend on the gene/position, so it is done once per weight
    # value and shared by all tracked mutations.
    for x in tqdm(weight_mapping[name]["range"], desc=name):
        # Create a temporary copy of the scoring weights and set the current value.
        scoring_weights_temp = deepcopy(scoring_weights)
        scoring_weights_temp[section][group][key] = x

        # Calculate the scores using the modified scoring weights.
        data_temp = _add_remind_cancer_score(
            dataframe=data.copy(),
            remind_cancer_scoring_weights=scoring_weights_temp,
            ge_column="FPKM_Z_score"
        )

        # Order by the new `score_v2` and reset the index so that the row position
        # (0 = highest score) is the rank.
        data_temp = (
            data_temp
            .sort_values("score_v2", ascending=False)
            .reset_index(drop=True)
        )

        for gene, position in targets:
            # Row positions of this mutation in the ranked table, i.e. its ranks.
            is_target = (data_temp["GENE"] == gene) & (data_temp["POS"] == position)
            ranks = np.flatnonzero(is_target.to_numpy())
            if ranks.size == 0:
                raise ValueError(f"No rows found for {gene} at position {position}.")

            # Named *_rank so the builtins `min` and `max` are not shadowed for the
            # rest of the notebook.
            mean_rank = np.mean(ranks)
            median_rank = np.median(ranks)
            min_rank = np.min(ranks)
            max_rank = np.max(ranks)

            means[gene][position].append(mean_rank)
            medians[gene][position].append(median_rank)
            mins[gene][position].append(min_rank)
            maxes[gene][position].append(max_rank)

    ##### PLOTTING #######
    # Build the 1x2 min/median figure for this weight.
    fig = _create_min_and_median_plots(
        medians=medians,
        mins=mins,
        name=name,
        colors=colors,
        weight_mapping=weight_mapping
    )
    fig.show()
    figs.append(fig)

Recurrence Weight: TERT 1295228:   0%|          | 0/100 [00:00<?, ?it/s]

Recurrence Weight: TERT 1295250:   0%|          | 0/100 [00:00<?, ?it/s]

Recurrence Weight: RALY 32580927:   0%|          | 0/100 [00:00<?, ?it/s]

Recurrence Weight: CDC20 43824529:   0%|          | 0/100 [00:00<?, ?it/s]

Gene Expression Weight: TERT 1295228:   0%|          | 0/100 [00:00<?, ?it/s]

Gene Expression Weight: TERT 1295250:   0%|          | 0/100 [00:00<?, ?it/s]

Gene Expression Weight: RALY 32580927:   0%|          | 0/100 [00:00<?, ?it/s]

Gene Expression Weight: CDC20 43824529:   0%|          | 0/100 [00:00<?, ?it/s]

Purity Weight: TERT 1295228:   0%|          | 0/100 [00:00<?, ?it/s]

Purity Weight: TERT 1295250:   0%|          | 0/100 [00:00<?, ?it/s]

Purity Weight: RALY 32580927:   0%|          | 0/100 [00:00<?, ?it/s]

Purity Weight: CDC20 43824529:   0%|          | 0/100 [00:00<?, ?it/s]

Transcription Factor Creation Weight: TERT 1295228:   0%|          | 0/100 [00:00<?, ?it/s]

Transcription Factor Creation Weight: TERT 1295250:   0%|          | 0/100 [00:00<?, ?it/s]

Transcription Factor Creation Weight: RALY 32580927:   0%|          | 0/100 [00:00<?, ?it/s]

Transcription Factor Creation Weight: CDC20 43824529:   0%|          | 0/100 [00:00<?, ?it/s]

Transcription Factor Destruction Weight: TERT 1295228:   0%|          | 0/100 [00:00<?, ?it/s]

Transcription Factor Destruction Weight: TERT 1295250:   0%|          | 0/100 [00:00<?, ?it/s]

Transcription Factor Destruction Weight: RALY 32580927:   0%|          | 0/100 [00:00<?, ?it/s]

Transcription Factor Destruction Weight: CDC20 43824529:   0%|          | 0/100 [00:00<?, ?it/s]

Allele Frequency Weight: TERT 1295228:   0%|          | 0/100 [00:00<?, ?it/s]

Allele Frequency Weight: TERT 1295250:   0%|          | 0/100 [00:00<?, ?it/s]

Allele Frequency Weight: RALY 32580927:   0%|          | 0/100 [00:00<?, ?it/s]

Allele Frequency Weight: CDC20 43824529:   0%|          | 0/100 [00:00<?, ?it/s]

CGC Weight: TERT 1295228:   0%|          | 0/100 [00:00<?, ?it/s]

CGC Weight: TERT 1295250:   0%|          | 0/100 [00:00<?, ?it/s]

CGC Weight: RALY 32580927:   0%|          | 0/100 [00:00<?, ?it/s]

CGC Weight: CDC20 43824529:   0%|          | 0/100 [00:00<?, ?it/s]

Open Chromatin Weight: TERT 1295228:   0%|          | 0/100 [00:00<?, ?it/s]

Open Chromatin Weight: TERT 1295250:   0%|          | 0/100 [00:00<?, ?it/s]

Open Chromatin Weight: RALY 32580927:   0%|          | 0/100 [00:00<?, ?it/s]

Open Chromatin Weight: CDC20 43824529:   0%|          | 0/100 [00:00<?, ?it/s]

In [13]:
from plotly.subplots import make_subplots    
# Rebuild and show the figure from the values `name`, `medians` and `mins` still
# hold after the sweep loop, i.e. those of the last weight that was processed.
fig = _create_min_and_median_plots(
    medians=medians,
    mins=mins,
    name=name,
    colors=colors,
    weight_mapping=weight_mapping,
)
fig.show()