In [2]:
import pandas as pd
import plotly.graph_objects as go
from plotly.offline import init_notebook_mode

import numpy as np
import ast

init_notebook_mode(connected=True)

In [3]:
data = pd.read_csv("/Users/nicholasabad/Desktop/workspace/REMIND-Cancer-visualization/my_data/top200.vcf", delimiter="\t")
data.head()

Unnamed: 0,score,pid,cohort,#CHROM,POS,GENE,REF,ALT,strand,FPKM,...,num_promoter_mutations,num_recurrent_mutations,expression_traces,tf_summary,ncbi_gene_summary,ncbi_url,in_old_results,cn_score.1,cn_file_of_score,ICGC_Estimated_Purity.1
0,93.45,45a7949d-e63f-4956-866c-df51257032de,BLCA-US,2,71204529,ANKRD53,G,A,+,1.428501,...,284,5.0,{'ANKRD53': {'BLCA-US': {'raw': [0.13674067793...,"{'REL': {'summary': ""This gene encodes a prote...",Involved in mitotic metaphase plate congressio...,https://www.ncbi.nlm.nih.gov/gene/79998,True,3,not_available,0.938
1,93.41,6c884037-49f9-41c3-b0e5-9cbcd545aeb7,SKCM-US,9,91933357,SECISBP2,G,A,+,15.07919,...,879,6.0,{'SECISBP2': {'SKCM-US': {'raw': [5.0687620882...,{'ELK4': {'summary': 'This gene is a member of...,The protein encoded by this gene is one of the...,https://www.ncbi.nlm.nih.gov/gene/79048,True,6,not_available,0.78
2,88.1,49dec0c2-8e75-4f44-a253-82b2ea605890,LUAD-US,2,71204529,ANKRD53,G,A,+,1.764278,...,436,5.0,{'ANKRD53': {'LUAD-US': {'raw': [0.73980555761...,"{'REL': {'summary': ""This gene encodes a prote...",Involved in mitotic metaphase plate congressio...,https://www.ncbi.nlm.nih.gov/gene/79998,True,4,not_available,0.5
3,86.45,d692ecd0-2433-426b-9af2-a30c44a80f7c,THCA-US,5,1295228,TERT,G,A,-,0.380236,...,20,95.0,"{'TERT': {'THCA-US': {'raw': [0.0, 0.006731104...",{'ELK4': {'summary': 'This gene is a member of...,Telomerase is a ribonucleoprotein polymerase t...,https://www.ncbi.nlm.nih.gov/gene/7015,True,3,not_available,0.589
4,86.3,9988eb07-01f6-4f83-8699-bb63e0525f08,HNSC-US,7,145813786,CNTNAP2,C,A,+,21.329213,...,459,0.0,{'CNTNAP2': {'HNSC-US': {'raw': [0.38939713831...,{'ZEB1': {'summary': 'This gene encodes a zinc...,This gene encodes a member of the neurexin fam...,https://www.ncbi.nlm.nih.gov/gene/26047,False,9,not_available,0.489


In [3]:
%run /Users/nicholasabad/Desktop/workspace/phd-thesis/utils.ipynb



    _write_figure_to_pdf(
        fig: object,
        name_of_plot: str,
        notes: str="",
        output_location: str="/Users/nicholasabad/Desktop/workspace/phd-thesis/data/images",
        path_to_image_metadata_file: str="/Users/nicholasabad/Desktop/workspace/phd-thesis/data/images/metadata.json",
        overwrite: bool=False,
    )
    


# b. Transcription factor

In [4]:
def tf_expression_violin_plot(
    row: pd.Series,
    chosen_tf_name: str,
    name_of_expression_traces_column: str="expression_traces",
    name_of_cohort_column: str="cohort",
    name_of_tfbs_column: str="JASPAR2020_CORE_vertebrates_non_redundant(tf_name,binding_affinity,seq1,seq2,raw,zscore,log,tf_sequence_logo)",
    name_of_pid_column: str="pid",
):
    expression_trace = ast.literal_eval(row[name_of_expression_traces_column])

    fig = go.Figure()

    for selection in ["zscore", "raw", "log"]:
        if selection == "zscore":
            legend_title = "Z-Score"
        elif selection == "raw":
            legend_title = "Raw"
        else:
            legend_title = "Log"

        # Add background traces.
        cohort = row[name_of_cohort_column]
        n = len(expression_trace[chosen_tf_name][cohort][selection])

        fig.add_trace(
            go.Violin(
                y=expression_trace[chosen_tf_name][cohort][selection],
                name=f"<b>{cohort}</b><br>(n={n})",
                box_visible=True,
                meanline_visible=True,
                marker_color="lightgrey",
                legendgroup=selection,
                legendgrouptitle_text=legend_title,
                visible=True if selection == "raw" else "legendonly",
                opacity=0.6,
                line_color="black",
                points="all"
            )
        )
        # Add scatter plot.
        if row[name_of_tfbs_column] == ".":
            continue

        for entry in row[name_of_tfbs_column].split(";"):
            tf_name = entry.split(",")[0]
            if tf_name == chosen_tf_name:
                binding_affinity = entry.split(",")[1]
                raw, zscore, log = entry.split(",")[4:7]
        if selection == "zscore":
            score = zscore
        elif selection == "raw":
            score = raw
        else:
            score = log

        fig.add_trace(
            go.Scatter(
                mode="markers",
                x=[f"<b>{cohort}</b><br>(n={n})"],
                y=[score],
                marker_size=13,
                marker_line_width=1,
                marker_color="red",
                text=["Current Gene"],
                legendgroup=selection,
                visible=True if selection == "raw" else "legendonly",
                name=f'{row[name_of_pid_column][:8]}'
            )
        )

    fig.update_layout(
        title={
            "text": f"<b>Gene Expression for {chosen_tf_name}</b> <br>(Binding Affinity {binding_affinity})",
            "y": 0.9,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
        },
        yaxis_title = "Expression"
    )
    
    fig.update_layout(
        plot_bgcolor='white',  # Set plotting area background to white
        paper_bgcolor='white',  # Set overall figure background to white
        xaxis=dict(
            showline=True,  # Show axis lines
            linecolor='black',  # Color of axis lines
            linewidth=1  # Width of axis lines
        ),
        yaxis=dict(
            showline=True,  # Show axis lines
            linecolor='black',  # Color of axis lines
            linewidth=1,  # Width of axis lines
            ticks='outside',  # Display ticks outside the axis
            tickcolor='black',  # Color of ticks
            tickwidth=2  # Width of ticks
        )
    )
    
    return fig


### Gold-standard: TERT and ELK4

In [None]:
tert_row = data.iloc[3]

In [6]:
tf_tert = tf_expression_violin_plot(
    row=tert_row,
    chosen_tf_name="ELK4"
)

_write_figure_to_pdf(
    fig = tf_tert,
    name_of_plot = "tf_gold_standard_tert_elk4.pdf",
    output_location = "/Users/nicholasabad/Desktop/workspace/REMIND-Cancer/paper_figures/grid",
)

Saving image to: /Users/nicholasabad/Desktop/workspace/REMIND-Cancer/paper_figures/grid/tf_gold_standard_tert_elk4.pdf.pdf


### Good: ANKRD53

In [7]:
tf_plot_ankrd = tf_expression_violin_plot(
    row=data.iloc[0],
    chosen_tf_name="RELA"
)

_write_figure_to_pdf(
    fig = tf_plot_ankrd,
    name_of_plot = "tf_good_ankrd53_rela.pdf",
    output_location = "/Users/nicholasabad/Desktop/workspace/REMIND-Cancer/paper_figures/grid",
)

Saving image to: /Users/nicholasabad/Desktop/workspace/REMIND-Cancer/paper_figures/grid/tf_good_ankrd53_rela.pdf.pdf


### Bad: NF2

In [15]:
data.iloc[54]["created_tfs_passing_tf_expression_threshold"]

'IKZF1'

In [5]:
data.iloc[54][['REF', 'ALT']]

REF    G
ALT    A
Name: 54, dtype: object

In [16]:
tf_plot_rpl36al = tf_expression_violin_plot(
    row=data.iloc[54],
    chosen_tf_name="IKZF1"
)

_write_figure_to_pdf(
    fig = tf_plot_rpl36al,
    name_of_plot = "tf_good_rpl36al_IKZF1.pdf",
    output_location = "/Users/nicholasabad/Desktop/workspace/REMIND-Cancer/paper_figures/grid",
)

Saving image to: /Users/nicholasabad/Desktop/workspace/REMIND-Cancer/paper_figures/grid/tf_good_rpl36al_IKZF1.pdf.pdf


# c. TFBS Logo Plot

# d. DeepPileup

In [18]:
def af_greater_than_25_scatterplot(
    path_to_overview_file: str, 
    only_relevant: bool = True
):
    data = pd.read_csv(path_to_overview_file, delimiter="\t")

    gene_name = path_to_overview_file.split("/")[-5]
    position = path_to_overview_file.split("/")[-4]

    cohorts = {}
    for idx, row in data.iterrows():
        cohort_file = row["Cohort_File"]
        af = float(row["SNPs_AF>25_%"])

        cohort = cohort_file.split("_")[2]
        if cohort not in cohorts:
            cohorts[cohort] = {"tumor": -1, "control": -1}
        if "control" in cohort_file:
            cohorts[cohort]["control"] = float(af)
        else:
            cohorts[cohort]["tumor"] = float(af)

    num_original_cohorts = len(cohorts.keys())

    if only_relevant:
        cohorts_to_remove = []
        for cohort in cohorts:
            control_value = cohorts[cohort]["control"]
            tumor_value = cohorts[cohort]["tumor"]
            if control_value == 0 and tumor_value == 0:
                cohorts_to_remove.append(cohort)
        for cohort in cohorts_to_remove:
            cohorts.pop(cohort, None)

    x_axis = list(cohorts.keys())

    num_current_cohorts = len(x_axis)

    if len(x_axis) == 0:
        return af_greater_than_25_scatterplot(path_to_overview_file, False)

    else:
        fig = go.Figure()

        fig.add_trace(
            go.Scatter(
                x=x_axis,
                y=[cohorts[cohort]["tumor"] for cohort in x_axis],
                mode="markers",
                marker_color="red",
                marker={"size": 10},
                name="Tumor",
            )
        )
        fig.add_trace(
            go.Scatter(
                x=x_axis,
                y=[cohorts[cohort]["control"] for cohort in x_axis],
                mode="markers",
                marker_color="green",
                marker={"symbol": "circle-x-open", "size": 10},
                name="Control",
            )
        )

        fig.update_layout(
            title=f"<b>Patients with a minor allele frequency > 25%</b><br><sup>Gene Name: {gene_name} / Position: {position} / Displaying {num_current_cohorts} of {num_original_cohorts} cohorts",
            xaxis_title="Cohorts",
            yaxis_title="Percent of Patients",
            # yaxis_range=[-2, 100],
        )
        
        fig.update_layout(
            plot_bgcolor='white',  # Set plotting area background to white
            paper_bgcolor='white',  # Set overall figure background to white
            xaxis=dict(
                showline=True,  # Show axis lines
                linecolor='black',  # Color of axis lines
                linewidth=1  # Width of axis lines
            ),
            yaxis=dict(
                showline=True,  # Show axis lines
                linecolor='black',  # Color of axis lines
                linewidth=1,  # Width of axis lines
                ticks='outside',  # Display ticks outside the axis
                tickcolor='black',  # Color of ticks
                tickwidth=2  # Width of ticks
            )
        )

        fig.update_xaxes(tickangle=45)

        return fig

### Gold standard: TERT

In [23]:
dp_tert = af_greater_than_25_scatterplot(
    path_to_overview_file="/Users/nicholasabad/Desktop/workspace/data/deep_pileup/pcawg/TERT/chr5:1295228/Overview.tsv"
)

_write_figure_to_pdf(
    fig = dp_tert,
    name_of_plot = "dp_tert.pdf",
    output_location = "/Users/nicholasabad/Desktop/workspace/REMIND-Cancer/paper_figures/grid",
)

Saving image to: /Users/nicholasabad/Desktop/workspace/REMIND-Cancer/paper_figures/grid/dp_tert.pdf.pdf


### Good: ANKRD53

In [24]:
dp_ankrd53 = af_greater_than_25_scatterplot(
    path_to_overview_file="/Users/nicholasabad/Desktop/workspace/data/deep_pileup/pcawg/ANKRD53/chr2:71204529/Overview.tsv"
)

_write_figure_to_pdf(
    fig = dp_ankrd53,
    name_of_plot = "dp_ankrd53.pdf",
    output_location = "/Users/nicholasabad/Desktop/workspace/REMIND-Cancer/paper_figures/grid",
)

Saving image to: /Users/nicholasabad/Desktop/workspace/REMIND-Cancer/paper_figures/grid/dp_ankrd53.pdf.pdf


### Bad: NF2

In [25]:
dp_nf2 = af_greater_than_25_scatterplot(
    path_to_overview_file="/Users/nicholasabad/Desktop/workspace/data/deep_pileup/pcawg/NF2/chr22:29999735/Overview.tsv"
)

_write_figure_to_pdf(
    fig = dp_nf2,
    name_of_plot = "dp_nf2.pdf",
    output_location = "/Users/nicholasabad/Desktop/workspace/REMIND-Cancer/paper_figures/grid",
)

Saving image to: /Users/nicholasabad/Desktop/workspace/REMIND-Cancer/paper_figures/grid/dp_nf2.pdf.pdf


# e. Genome Tornado Plot

In [26]:
import shutil

### Gold standard: TERT

In [27]:
shutil.copy(
    "/Users/nicholasabad/Desktop/workspace/REMIND-Cancer-visualization/example_data/tornado_plots/chr5/chr5_TERT_zoomed.png",
    "/Users/nicholasabad/Desktop/workspace/REMIND-Cancer/paper_figures/grid/gtp_TERT.png"
)

'/Users/nicholasabad/Desktop/workspace/REMIND-Cancer/paper_figures/grid/gtp_TERT.png'

### Good: Ankrd53

In [28]:
shutil.copy(
    "/Users/nicholasabad/Desktop/workspace/REMIND-Cancer-visualization/example_data/tornado_plots/chr2/chr2_ANKRD53_zoomed.png",
    "/Users/nicholasabad/Desktop/workspace/REMIND-Cancer/paper_figures/grid/gtp_ANKRD53.png"
)

'/Users/nicholasabad/Desktop/workspace/REMIND-Cancer/paper_figures/grid/gtp_ANKRD53.png'

### Bad: PRDM2

In [30]:
data.iloc[24]

score                                                       76.92
pid                          66d312fc-809d-428b-98e3-a29d211db35c
cohort                                                    SKCM-US
#CHROM                                                          1
POS                                                      14026387
                                            ...                  
ncbi_url                   https://www.ncbi.nlm.nih.gov/gene/7799
in_old_results                                              False
cn_score.1                                          not_available
cn_file_of_score                                    not_available
ICGC_Estimated_Purity.1                                      0.82
Name: 24, Length: 166, dtype: object

In [31]:
shutil.copy(
    "/Users/nicholasabad/Desktop/workspace/REMIND-Cancer-visualization/example_data/tornado_plots/chr1/chr1_PRDM2_zoomed.png",
    "/Users/nicholasabad/Desktop/workspace/REMIND-Cancer/paper_figures/grid/gtp_PRDM2.png"
)

'/Users/nicholasabad/Desktop/workspace/REMIND-Cancer/paper_figures/grid/gtp_PRDM2.png'