# Instructions

1. **Set file paths and options** in the **Setup** cell:
   - `iptm_file_path`: Path to the IPTM vs. PEAK file (required).
   - `spoc_file_path`: Path to the SPOC score file (optional).
   - `SPOC_analysis`: Set to `True` if you want to do SPOC-based analysis (requires a valid SPOC file), otherwise `False`.
   - `output_dir`: Where to save charts and selected data (defaults to creating an "analysis" folder next to your IPTM file).

2. **Run the notebook cells in order**:
   - The second cell loads the IPTM data and checks whether to proceed with SPOC or basic analysis.
   - If SPOC analysis is enabled and the file is provided, the subsequent cells will merge data and show the SPOC-based chart.
   - Otherwise, you'll see the basic IPTM vs. PEAK chart.

3. **Interact with the charts**:
   - Use **Lasso/Box select** to label points persistently.
   - Use the **Search** widget to highlight points by partial name.
   - **Clear** labels or search highlights as needed.
   - **Save** the plot as HTML/PDF or **export** selected data as a CSV.

4. **Check the output directory** for your saved files.

In [1]:
# === STEP 1: BASIC SETUP ===

import os
import pandas as pd
import plotly.express as px
import ipywidgets as widgets
from IPython.display import display, Markdown
from plotly.graph_objs import FigureWidget

# ---------------- USER INPUTS ----------------
# Required: path to the IPTM vs. PEAK file
iptm_file_path = "/Volumes/plaschka/shared/alphafold/matthias.vorlaender/screens/transcription_complexes/2025-03-05_PolII_subunits_uniprot_vs_RPB3_FLAG_DNase_vs_wt_DNaseI_without_PDB_IDs/IPTM_vs_PTM.txt"


# Optional: path to the SPOC file
#Set None if not available
spoc_file_path = "/Volumes/plaschka/shared/alphafold/matthias.vorlaender/screens/transcription_complexes/2025-03-05_PolII_subunits_uniprot_vs_RPB3_FLAG_DNase_vs_wt_DNaseI_without_PDB_IDs/spoc_dir_SPOC_analysis.csv"

# Boolean flag indicating whether you want to do SPOC analysis
SPOC_analysis = True  # or False

# Output directory (default is a subfolder 'analysis' next to the IPTM file)
# If you want to override, set output_dir = "/your/desired/output"
default_base = os.path.dirname(iptm_file_path)  # Folder of the IPTM file
default_out = os.path.join(default_base, "analysis")
output_dir = default_out


In [2]:


# Make sure the output directory exists
os.makedirs(output_dir, exist_ok=True)

print("IPTM file path   :", iptm_file_path)
print("SPOC file path   :", spoc_file_path)
print("SPOC_analysis    :", SPOC_analysis)
print("Output directory :", output_dir)

# === STEP 2: LOAD IPTM & BRANCH ===

# Load the IPTM vs. PEAK data
df_iptm = pd.read_csv(iptm_file_path, sep="\t")
print("Loaded IPTM DataFrame with shape:", df_iptm.shape)

def extract_max_from_iptm(value):
    try:
        if pd.isna(value):
            return np.nan
        # Convert the value to string and split on colon
        parts = str(value).split(":")
        # Convert each part to a float, ignoring parts that cannot be converted
        nums = []
        for part in parts:
            try:
                nums.append(float(part))
            except:
                pass
        if nums:
            return max(nums)
        else:
            return np.nan
    except Exception as e:
        return np.nan

# Apply the function row-wise to create a new column "IPTM_max"
df_iptm["IPTM_max"] = df_iptm["IPTM"].apply(extract_max_from_iptm)
print("Created 'IPTM_max' column with the maximum IPTM score for each row.")

if SPOC_analysis and spoc_file_path is not None:
    print("SPOC analysis is True, and a SPOC file is provided. We will proceed with SPOC-based code.")
else:
    print("Either SPOC_analysis is False or no SPOC file is provided.")
    print("Proceed with Basic Bubble Chart (equivalent to old cell #2).")

# === STEP 3: SPOC MERGE & HOVER SETUP ===
if SPOC_analysis and spoc_file_path is not None:
    print("Loading SPOC file and merging with IPTM data...")
    df_spoc = pd.read_csv(spoc_file_path)
    print("SPOC DataFrame shape:", df_spoc.shape)
    
    # Merge the two DataFrames
    merged_df = pd.merge(
        df_iptm,
        df_spoc,
        left_on="NAME",
        right_on="complex_name",
        how="left"
    )
    print("Merged DataFrame shape:", merged_df.shape)
    
    # === 1) Create an "opacity" column based on spoc_score ===
    if "spoc_score" in merged_df.columns and merged_df["spoc_score"].notnull().any():
        min_score = merged_df["spoc_score"].min()
        max_score = 1.0  # forcing maximum to 1.0
        def compute_opacity(score):
            if pd.isnull(score):
                return 0.1
            if max_score == min_score:
                return 1.0
            return 0.1 + (score - min_score) / (max_score - min_score) * (1.0 - 0.1)
        merged_df["opacity"] = merged_df["spoc_score"].apply(compute_opacity)
    else:
        merged_df["opacity"] = 1.0

    # === 2) Parse short name from "NAME" and store in new column ===
    def parse_shortname(full_name):
        """
        Given something like:
          "76_sp-Q92610-ZN592_HUMAN_vs_sp-Q13889-TF2H3_HUMAN"
        Extract the short name from the target portion
          -> "TF2H3"
        """
        if pd.isnull(full_name):
            return None
        try:
            left_vs_right = full_name.split("_vs_")
            target_part = left_vs_right[1]  # e.g. "sp-Q13889-TF2H3_HUMAN"
            chunks = target_part.split("-")
            if len(chunks) < 3:
                return target_part
            # e.g. chunks[2] = "TF2H3_HUMAN"
            return chunks[2].split("_")[0]  # "TF2H3"
        except:
            return None

    merged_df["protein_name_hit"] = merged_df["NAME"].apply(parse_shortname)

    # === 3) Build default hover text ===
    default_hover_columns = ["NAME", "IPTM", "PEAK", "spoc_score"]
    # We can also add "protein_name_hit" to the default hover if you want
    # default_hover_columns.append("protein_name_hit")

    for col in default_hover_columns:
        if col not in merged_df.columns:
            default_hover_columns.remove(col)

    merged_df["hover_text"] = merged_df.apply(
        lambda row: "<br>".join([f"{col}: {row[col]}" for col in default_hover_columns]),
        axis=1
    )
    
    # === 4) Build the hover selection widget ===
    # This widget uses 'available_hover_columns', which now includes 'protein_name_hit'
    available_hover_columns = list(merged_df.columns)
    # Pre-select defaults
    if default_hover_columns:
        preselected = tuple(default_hover_columns)
    else:
        preselected = (available_hover_columns[0],)  # fallback

    hover_columns_selector = widgets.SelectMultiple(
        options=available_hover_columns,
        value=preselected,
        description="Hover Columns:",
        disabled=False,
        layout={'width': '400px'}
    )

    update_hover_button = widgets.Button(
        description="Update Hover Info",
        button_style="primary"
    )
    
    def update_hover_info(b):
        selected_columns = list(hover_columns_selector.value)
        if not selected_columns:
            print("Please select at least one column for hover info.")
            return
        merged_df["hover_text"] = merged_df.apply(
            lambda row: "<br>".join([f"{col}: {row[col]}" for col in default_hover_columns]),
            axis=1
        )
        print("Hover info updated using columns:", selected_columns)
    
    update_hover_button.on_click(update_hover_info)

    display(Markdown("### SPOC Hover-Column Selection"))
    display(widgets.HBox([hover_columns_selector, update_hover_button]))

else:
    print("Skipping SPOC merge and hover setup because SPOC_analysis=False or no SPOC file provided.")

IPTM file path   : /Volumes/plaschka/shared/alphafold/matthias.vorlaender/screens/transcription_complexes/2025-03-05_PolII_subunits_uniprot_vs_RPB3_FLAG_DNase_vs_wt_DNaseI_without_PDB_IDs/IPTM_vs_PTM.txt
SPOC file path   : /Volumes/plaschka/shared/alphafold/matthias.vorlaender/screens/transcription_complexes/2025-03-05_PolII_subunits_uniprot_vs_RPB3_FLAG_DNase_vs_wt_DNaseI_without_PDB_IDs/spoc_dir_SPOC_analysis.csv
SPOC_analysis    : True
Output directory : /Volumes/plaschka/shared/alphafold/matthias.vorlaender/screens/transcription_complexes/2025-03-05_PolII_subunits_uniprot_vs_RPB3_FLAG_DNase_vs_wt_DNaseI_without_PDB_IDs/analysis
Loaded IPTM DataFrame with shape: (794, 10)
Created 'IPTM_max' column with the maximum IPTM score for each row.
SPOC analysis is True, and a SPOC file is provided. We will proceed with SPOC-based code.
Loading SPOC file and merging with IPTM data...
SPOC DataFrame shape: (233, 30)
Merged DataFrame shape: (794, 41)


### SPOC Hover-Column Selection

HBox(children=(SelectMultiple(description='Hover Columns:', index=(0, 1, 6, 12), layout=Layout(width='400px'),…

In [None]:
# === STEP 4a: SPOC-BASED BUBBLE CHART ===
import re
from plotly.colors import sample_colorscale

# Global variable to store the current hover column selection.
# Initialize with the default hover columns.
current_hover_columns = default_hover_columns

def update_hover_info(b):
    global current_hover_columns
    selected_columns = list(hover_columns_selector.value)
    if not selected_columns:
        print("Please select at least one column for hover info.")
        return
    current_hover_columns = selected_columns

    merged_df["hover_text"] = merged_df.apply(
        lambda row: "<br>".join([f"{col}: {row[col]}" for col in  selected_columns]),
        axis=1
    )
    print("Hover info updated using columns:", selected_columns)

update_hover_button.on_click(update_hover_info)

def parse_name_field(name_str):
    """
    Given a string of form:
       "76_sp-Q92610-ZN592_HUMAN_vs_sp-Q13889-TF2H3_HUMAN"
    return a dict with:
       {
         'index': '76',
         'protein1': 'sp-Q92610-ZN592_HUMAN',
         'protein2': 'sp-Q13889-TF2H3_HUMAN'
       }
    If parsing fails, returns something fallback with empty strings.
    """
    try:
        # Split around '_vs_'
        parts = name_str.split("_vs_")
        left_part = parts[0]  # e.g. "76_sp-Q92610-ZN592_HUMAN"
        right_part = parts[1] # e.g. "sp-Q13889-TF2H3_HUMAN"

        # Now split the left_part on the first underscore, to separate index from protein1
        left_sub = left_part.split("_", 1)
        idx = left_sub[0]  # "76"
        prot1 = left_sub[1]  # "sp-Q92610-ZN592_HUMAN"

        return {
            "index": idx,
            "protein1": prot1,
            "protein2": right_part
        }
    except Exception:
        # If something goes wrong, return placeholders
        return {
            "index": "",
            "protein1": "",
            "protein2": ""
        }

def parse_color(color_str):
    """Converts a hex or rgb(a) color string to (r, g, b)."""
    # This helper is used if you need numeric r,g,b from a string.
    # If you only need to pass e.g. "red" or "#ff0000" to Plotly,
    # you can skip converting to (r,g,b). Plotly can handle them directly.
    if color_str.startswith("#"):
        hex_color = color_str.lstrip("#")
        r, g, b = tuple(int(hex_color[i:i+2], 16) for i in (0, 2, 4))
        return r, g, b
    elif color_str.startswith("rgb"):
        nums = re.findall(r'\d+', color_str)
        r, g, b = tuple(int(n) for n in nums[:3])
        return r, g, b
    else:
        # Try named CSS color (e.g. "red", "blue") - Plotly accepts those directly
        return color_str

# Before building the figure, let's ensure 'index' is in merged_df
if SPOC_analysis and spoc_file_path is not None:
    if "index" not in merged_df.columns:
        # Parse once for all rows
        parsed_info = merged_df["NAME"].apply(parse_name_field).apply(pd.Series)
        merged_df = pd.concat([merged_df, parsed_info], axis=1)
        # Ensure 'index' column is treated as a string
        merged_df["index"] = merged_df["index"].astype(str)

    # Build the scatter figure
    fig = px.scatter(
        merged_df,
        x="scaled_PEAKavg",
        y="IPTMavg",
        size="IPTM_max",
        color="scaled_PEAKavg",
        color_continuous_scale="viridis_r",
        title="(SPOC) IPTM vs. Scaled PEAKavg",
        labels={"IPTMavg": "IPTMavg", "scaled_PEAKavg": "Scaled PEAKavg"}
    )

    # Build color array with custom opacity
    min_val = merged_df["scaled_PEAKavg"].min()
    max_val = merged_df["scaled_PEAKavg"].max()
    if max_val != min_val:
        norm = (merged_df["scaled_PEAKavg"] - min_val) / (max_val - min_val)
    else:
        norm = merged_df["scaled_PEAKavg"] * 0  # or just 0

    base_colors = px.colors.sequential.Viridis_r
    rgba_colors = []
    for val, opa in zip(norm, merged_df["opacity"]):
        color_str = sample_colorscale(base_colors, val)[0]  # returns a color string
        try:
            r, g, b = parse_color(color_str)
            if isinstance(r, str):
                # color was a named CSS color
                rgba_colors.append(r) 
            else:
                rgba_colors.append(f"rgba({r},{g},{b},{opa})")
        except:
            rgba_colors.append(f"rgba(0,0,0,{opa})")

    fig.update_traces(
        marker=dict(color=rgba_colors),
        customdata=merged_df[["hover_text"]].values,
        hovertemplate="%{customdata[0]}<extra></extra>"
    )
        
    fig.update_layout(
        # Update x and y axes: no grid, with black axis lines.
        xaxis=dict(
            showgrid=False,
            showline=True,
            linewidth=2,
            linecolor='black'
        ),
        yaxis=dict(
            showgrid=False,
            showline=True,
            linewidth=2,
            linecolor='black'
        ),
        # Add a rectangle shape as an outer border.
        shapes=[
            dict(
                type="rect",
                xref="paper", yref="paper",
                x0=0, y0=0, x1=1, y1=1,
                line=dict(color="black", width=2)
            )
        ],
        # Optionally, set the template and margins.
        template="plotly_white",
        margin=dict(l=50, r=50, t=50, b=50)
    )
        
    figw = FigureWidget(fig)

    # --- GLOBAL STORAGE FOR SELECTIONS ---
    global_persisted_indices_spoc = set()

    def handle_selection(trace, points, selector):
        global global_persisted_indices_spoc
        global_persisted_indices_spoc.update(points.point_inds)
        if not global_persisted_indices_spoc:
            print("No points selected.")
            return
        print("Accumulated selected indices:", global_persisted_indices_spoc)
        selected_df = merged_df.iloc[list(global_persisted_indices_spoc)]
        
        # Example label: extract a short uniprot name from 'NAME' or just show the "index"
        def process_name(name_str):
            try:
                parts = name_str.split("_vs_")
                if len(parts) < 2:
                    return name_str
                # E.g. "sp-Q13889-TF2H3_HUMAN"
                hit = parts[1]
                hit_parts = hit.split("-")
                if len(hit_parts) < 3:
                    return hit
                return hit_parts[2].split("_")[0]
            except:
                return name_str
        
        labels = selected_df["NAME"].apply(process_name)
        
        # Check if we already have a "Persistent Labels" trace
        persistent_trace = None
        for t in figw.data:
            if t.name == "Persistent Labels":
                persistent_trace = t
                break
        
        if persistent_trace is None:
            figw.add_scatter(
                x=selected_df["scaled_PEAKavg"],
                y=selected_df["IPTMavg"],
                mode="text",
                text=labels,
                textposition="top center",
                name="Persistent Labels",
                hoverinfo="skip",
                textfont=dict(color="black", size=12)
            )
        else:
            persistent_trace.x = selected_df["scaled_PEAKavg"]
            persistent_trace.y = selected_df["IPTMavg"]
            persistent_trace.text = labels

    # Attach selection callback
    for trace in figw.data:
        trace.on_selection(handle_selection)

    # --- STANDARD SEARCH (by substring) WIDGETS ---
    search_input_spoc = widgets.Text(
        value="",
        placeholder="Enter partial name to search",
        description="Search NAME:",
        style={'description_width': '120px'},
        layout={'width': '400px'}
    )
    search_button_spoc = widgets.Button(
        description="Search",
        tooltip="Search partial matches",
        button_style="primary"
    )
    clear_search_button_spoc = widgets.Button(
        description="Clear Search",
        tooltip="Remove search highlights",
        button_style="warning"
    )

    def on_search_button_click_spoc(b):
        query = search_input_spoc.value.strip()
        if not query:
            print("Please enter a search query.")
            return
        mask = merged_df["NAME"].str.contains(query, case=False, na=False)
        matched = merged_df[mask]
        if matched.empty:
            print("No matches found.")
            return
        
        # Add highlight scatter
        figw.add_scatter(
            x=matched["scaled_PEAKavg"],
            y=matched["IPTMavg"],
            mode="markers+text",
            marker=dict(symbol="circle-open", size=12, line=dict(width=2, color="red")),
            text=[query]*len(matched),
            textposition="top center",
            name="Search Highlight",
            hoverinfo="skip"
        )
        print(f"Found {len(matched)} match(es). Highlights added.")

    def on_clear_search_button_click_spoc(b):
        indices_to_remove = [i for i, t in enumerate(figw.data) if t.name == "Search Highlight"]
        if not indices_to_remove:
            print("No search highlights to clear.")
            return
        for idx in sorted(indices_to_remove, reverse=True):
            figw.data = figw.data[:idx] + figw.data[idx+1:]
        print("Search highlights cleared.")
    
    search_button_spoc.on_click(on_search_button_click_spoc)
    clear_search_button_spoc.on_click(on_clear_search_button_click_spoc)

    # --- NEW MULTI-GROUP INDEX + COLOR HIGHLIGHT ---
    # Example input: (1,5,12,19=green) (2,9,200=red)
    group_highlight_input_spoc = widgets.Text(
        ##TRICK THE COPY PASTE BIUG HERE!!###
        value="(322,245,22,250,743,690,261,233,229,479,464,107,1,203,660,659,648,363,462,474,475,492,271,192,97=green) (33=grey) (591,425,761,771,286,385,233,479,464,203,660,659,648,462,474,492,192,508,181,190,64,579,40,708,364,416,35,151=red) (436,117,630,573,3,60,687=black)(400,411,423=blue)",
        placeholder="(192,97=green) (35,151=red)",
        description="Multi-Groups:",
        style={'description_width': '100px'},
        layout={'width': '600px'}
    )
    group_highlight_button_spoc = widgets.Button(
        description="Highlight Groups",
        tooltip="Highlight multiple index groups, each with a color",
        button_style="info"
    )

    def on_group_highlight_button_click_spoc(b):
        """
        Example input: (743=green) (385,23,151=red) (20,21,423=blue)
        Each group is parsed, and for each group we use the protein_name_hit value for labeling.
        """
        input_str = group_highlight_input_spoc.value.strip()
        if not input_str:
            print("No group spec given. Format: (1,5,12=red) (2,9=green)")
            return

        # Split by closing parenthesis, filtering out empties.
        group_specs = [chunk.strip() for chunk in input_str.split(")") if chunk.strip()]

        for spec in group_specs:
            # Remove any leading "(" if present.
            if spec.startswith("("):
                spec = spec[1:].strip()

            # Split on "=" to separate indices from the color.
            if "=" in spec:
                left_part, color_part = spec.split("=", 1)
                indices_str = left_part.strip()
                color_str = color_part.strip()
            else:
                indices_str = spec
                color_str = "red"  # default

            # Split indices (comma-separated).
            idx_list = [x.strip() for x in indices_str.split(",") if x.strip()]
            if not idx_list:
                print(f"No valid indices found in '{spec}'")
                continue

            # Find all matching rows in merged_df.
            matched = merged_df[merged_df["index"].isin(idx_list)]
            if matched.empty:
                print(f"No match for indices {idx_list}")
                continue

            # Use the protein_name_hit column for labeling.
            group_label = matched["protein_name_hit"]

            # Add one scatter trace for the group.
            figw.add_scatter(
                x=matched["scaled_PEAKavg"],
                y=matched["IPTMavg"],
                mode="markers+text",
                marker=dict(symbol="circle", color=color_str, size=12),
                text=group_label,
                textposition="top center",
                name=f"Highlight",
                hoverinfo="skip"
            )
            print(f"Highlighted indices {idx_list} in color '{color_str}'")

        print("Group highlight done.")

    group_highlight_button_spoc.on_click(on_group_highlight_button_click_spoc)

    # --- CLEAR LABELS & SAVE PLOT ---
    clear_labels_button_spoc = widgets.Button(
        description="Clear Labels",
        button_style="warning"
    )
    def on_clear_labels_click_spoc(b):
        global_persisted_indices_spoc
        global_persisted_indices_spoc.clear()

        # Remove the "Persistent Labels" or highlight traces if needed
        names_to_remove = ["Persistent Labels"]
        # Also remove any highlight traces we might want to clear
        # If you only want to remove the "Persistent Labels", 
        # leave out highlight traces from the list above.
        indices_to_remove = [
            i for i, t in enumerate(figw.data) 
            if t.name in names_to_remove or t.name.startswith("Highlight ")
        ]
        for idx in sorted(indices_to_remove, reverse=True):
            figw.data = figw.data[:idx] + figw.data[idx+1:]
        print("Persistent labels and highlight traces cleared.")

    clear_labels_button_spoc.on_click(on_clear_labels_click_spoc)

    save_plot_button_spoc = widgets.Button(
        description="Save Plot (HTML & PDF)",
        tooltip="Save the current plot",
        button_style="info"
    )


    file_name_widget_spoc = widgets.Text(
        value="selected_data_spoc.csv",
        placeholder="Enter file name",
        description="Save CSV as:",
        disabled=False
    )
    save_data_button_spoc = widgets.Button(
        description="Save Data",
        button_style="success"
    )
    save_data_output_spoc = widgets.Output()
    
    
    import datetime

    custom_suffix_save = widgets.Text(
        value="",
        placeholder="Add file name suffix",
        description="Filename Suffix:",
        layout={'width': '400px'}
    )
    
    def on_save_plot_click_spoc(b):
        try:
            # Retrieve custom suffix from widget and current timestamp.
            suffix = custom_suffix_save.value.strip()
            timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
            if suffix:
                suffix_str = f"_{suffix}_{timestamp}"
            else:
                suffix_str = f"_{timestamp}"
            
            # Build file paths with the custom suffix and timestamp.
            html_filename = f"spoc_bubble_chart{suffix_str}.html"
            pdf_filename = f"spoc_bubble_chart{suffix_str}.pdf"
            html_path = os.path.join(output_dir, html_filename)
            pdf_path = os.path.join(output_dir, pdf_filename)
            
            # Save the figure.
            figw.write_html(html_path)
            figw.write_image(pdf_path, format="pdf")
            print(f"Plot saved:\n  HTML: {html_path}\n  PDF: {pdf_path}")
        except Exception as e:
            print("Error saving plot:", e)

    save_plot_button_spoc.on_click(on_save_plot_click_spoc)

    # --- DISPLAY ---
    instructions_text_spoc = """
    **SPOC-Based Plot Instructions:**
    1. **Use Lasso/Box select to pick points and persist labels**.
    2. (Optional) Search by partial `NAME` using the first box, then clear highlights if needed.
    3. **Highlight by index** (the digits before the underscore) using the second box, 
       e.g. (1,5,12,19=green) (2,9,200=red) (20=blue)
    4. Clear persistent labels and/or highlight traces if needed.
    5. Save the plot (HTML & PDF) or selected data (CSV).
    """
    display(Markdown(instructions_text_spoc))
    display(widgets.HBox([search_input_spoc, search_button_spoc, clear_search_button_spoc]))
    
    # The new multi-index highlight input
    # Now just display the new widgets:
    display(widgets.HBox([group_highlight_input_spoc, group_highlight_button_spoc]))
    display(widgets.HBox([clear_labels_button_spoc]))

    display(widgets.HBox([custom_suffix_save, save_plot_button_spoc]))


    display(figw)
    display(widgets.HBox([file_name_widget_spoc, save_data_button_spoc]))
    display(save_data_output_spoc)

else:
    print("Skipping SPOC-based bubble chart...")

## Note
TO tricky the copy apste button, search the code for "##TRICK THE COPY PASTE BIUG HERE!!###" and enter your hiohglits there!
 


In [3]:
# === STEP 1: MERGE MS DATA ===
ms_file_path = "/Volumes/plaschka/shared/data/mass-spec/MS_analysis/analysis/MV_RPB3_FLAG_pretty/exports/merged_data_20250319_182512_with_nuc_vs_chrom.tsv"
df_ms = pd.read_csv(ms_file_path, sep="\t")
print("Loaded MS data with shape:", df_ms.shape)

def extract_target_uniprot(name_str):
    try:
        parts = name_str.split("_vs_")
        if len(parts) < 2:
            return None
        target = parts[1]  # e.g., "sp-Q9Y3X0-CCDC9_HUMAN"
        target_parts = target.split("-")
        if len(target_parts) < 2:
            return None
        return target_parts[1]  # e.g. "Q9Y3X0"
    except Exception:
        return None

# Create a new column in merged_df with the target uniprot IDs.
merged_df["target_uniprot"] = merged_df["NAME"].apply(extract_target_uniprot)
print("Extracted target_uniprot in merged_df.")

# Merge the MS data with merged_df on "Accession" from MS data and "target_uniprot" in merged_df.
merged_df = pd.merge(merged_df, df_ms, left_on="target_uniprot", right_on="Accession", how="left")
print("Merged DataFrame shape after merging MS data:", merged_df.shape)

Loaded MS data with shape: (1681, 4)
Extracted target_uniprot in merged_df.
Merged DataFrame shape after merging MS data: (794, 49)


In [None]:
# === CELL 1: Column & Transform Selection, Save to JSON Config ===
import os
import json
import ipywidgets as widgets
from IPython.display import display, Markdown

config_file = "config.json"

# We assume you already have a DataFrame called merged_df with numeric columns
numeric_cols = merged_df.select_dtypes(include=[float, int]).columns.tolist()
transform_options = ["None", "log2", "log10"]

# If a config already exists, load it. Otherwise define some defaults.
if os.path.exists(config_file):
    with open(config_file, "r") as f:
        prev_cfg = json.load(f)
    print(f"Loaded existing config from {config_file}: {prev_cfg}")
else:
    prev_cfg = {}
    print("No config file found; using empty defaults.")

# Helper function to get from dict with fallback
def dict_get(d, key, fallback):
    return d[key] if key in d else fallback

# Column selection
color_selector = widgets.Dropdown(
    options=numeric_cols,
    value=dict_get(prev_cfg, "color_column", "scaled_PEAKavg"),
    description="Color Col:",
    layout={'width': '220px'}
)
size_selector = widgets.Dropdown(
    options=numeric_cols,
    value=dict_get(prev_cfg, "size_column", "spoc_score"),
    description="Size Col:",
    layout={'width': '220px'}
)
opacity_selector = widgets.Dropdown(
    options=numeric_cols,
    value=dict_get(prev_cfg, "opacity_column", "spoc_score"),
    description="Opacity Col:",
    layout={'width': '220px'}
)

# Transform selection
color_transform_selector = widgets.Dropdown(
    options=transform_options,
    value=dict_get(prev_cfg, "color_transform", "None"),
    description="Color Xform:",
    layout={'width': '220px'}
)
size_transform_selector = widgets.Dropdown(
    options=transform_options,
    value=dict_get(prev_cfg, "size_transform", "None"),
    description="Size Xform:",
    layout={'width': '220px'}
)
opacity_transform_selector = widgets.Dropdown(
    options=transform_options,
    value=dict_get(prev_cfg, "opacity_transform", "None"),
    description="Opac Xform:",
    layout={'width': '220px'}
)

display(Markdown("### Column & Transform Selection"))
display(widgets.HBox([color_selector, size_selector, opacity_selector]))
display(widgets.HBox([color_transform_selector, size_transform_selector, opacity_transform_selector]))

def on_save_config(b):
    cfg = {
        "color_column": color_selector.value,
        "size_column": size_selector.value,
        "opacity_column": opacity_selector.value,
        "color_transform": color_transform_selector.value,
        "size_transform": size_transform_selector.value,
        "opacity_transform": opacity_transform_selector.value
    }
    with open(config_file, "w") as f:
        json.dump(cfg, f)
    print("[Cell1] Configuration saved to", config_file, ":", cfg)

save_button = widgets.Button(description="Save Config", button_style="success")
save_button.on_click(on_save_config)
display(save_button)

print("Adjust columns/transforms as desired, then click 'Save Config'. Next, run Cell 2.")

Loaded existing config from config.json: {'color_column': 'DNaseI digest chromatin FLAG-mCh_RPB3_normalized_P19387 vs Nucleoplasm FLAG-mCh_RPB3_normalized_P19387', 'size_column': 'spoc_score', 'opacity_column': 'DNaseI digest chromatin FLAG-mCh_RPB3', 'color_transform': 'None', 'size_transform': 'None', 'opacity_transform': 'log2'}


### Column & Transform Selection

HBox(children=(Dropdown(description='Color Col:', index=35, layout=Layout(width='220px'), options=('IPTMavg', …

HBox(children=(Dropdown(description='Color Xform:', layout=Layout(width='220px'), options=('None', 'log2', 'lo…

Button(button_style='success', description='Save Config', style=ButtonStyle())

Adjust columns/transforms as desired, then click 'Save Config'. Next, run Cell 2.


In [None]:
# === CELL 2: Load & Apply Config, then show min/max range ===

import json
import numpy as np
import ipywidgets as widgets
from IPython.display import display, Markdown

load_output = widgets.Output()

import numpy as np

def apply_transform_with_5pct(series, transform_kind="None"):
    """
    1) Fill NaN with 0.
    2) Replace zeros with the 5% quantile of positive values.
    3) If transform='None', keep negatives as is.
       If transform='log2' or 'log10', raise ValueError if still any negative values remain.
    """
    s = series.fillna(0).copy()
    
    # Identify positive (non-zero) values to compute the 5% quantile
    pos_mask = (s > 0)
    if pos_mask.any():
        # 5% quantile among the positive subset
        q5 = np.quantile(s[pos_mask], 0.05)
    else:
        # If we have no positive data at all, let's fallback to something small
        q5 = 1e-6  # or raise an error
    if q5 <= 0:
        # If the 5% quantile is not strictly positive, fallback to small epsilon
        q5 = 1e-6

    # Replace zeros with q5
    zero_mask = (s == 0)
    if zero_mask.any():
        s[zero_mask] = q5

    # If transform=None, we keep negative values as is
    if transform_kind == "None":
        return s

    # If log2 or log10, raise error if data still has negative values
    if (s < 0).any():
        raise ValueError(
            f"Non-positive data found for {transform_kind} transform (some negative). "
            "Check your data or use None transform."
        )
    
    if transform_kind == "log2":
        return np.log2(s)
    elif transform_kind == "log10":
        return np.log10(s)
    else:
        raise ValueError(f"Invalid transform option: {transform_kind}")
    
    
# We'll define float text boxes for the final min/max
color_min_box = widgets.FloatText(description="Color Min:", layout={'width': '200px'})
color_max_box = widgets.FloatText(description="Color Max:", layout={'width': '200px'})
size_min_box  = widgets.FloatText(description="Size Min:",  layout={'width': '200px'})
size_max_box  = widgets.FloatText(description="Size Max:",  layout={'width': '200px'})
opac_min_box  = widgets.FloatText(description="Opac Min:", layout={'width': '200px'})
opac_max_box  = widgets.FloatText(description="Opac Max:", layout={'width': '200px'})

def on_load_apply_config(b):
    with load_output:
        load_output.clear_output()
        try:
            if not os.path.exists("config.json"):
                print("No config.json found. Please run Cell 1 and save config.")
                return

            with open("config.json","r") as f:
                final_cfg = json.load(f)

            # Extract columns & transforms
            ccol = final_cfg.get("color_column","scaled_PEAKavg")
            scol = final_cfg.get("size_column","spoc_score")
            ocol = final_cfg.get("opacity_column","spoc_score")

            cx = final_cfg.get("color_transform","None")
            sx = final_cfg.get("size_transform","None")
            ox = final_cfg.get("opacity_transform","None")

            # Apply transforms
            merged_df["color_processed"] = apply_transform_with_5pct(merged_df[ccol], cx)
            merged_df["size_processed"]  = apply_transform_with_5pct(merged_df[scol], sx)
            merged_df["opacity_processed"] = apply_transform_with_5pct(merged_df[ocol], ox)

            # Show new min/max
            cmin, cmax = merged_df["color_processed"].min(), merged_df["color_processed"].max()
            smin, smax = merged_df["size_processed"].min(), merged_df["size_processed"].max()
            omin, omax = merged_df["opacity_processed"].min(), merged_df["opacity_processed"].max()

            color_min_box.value, color_max_box.value = cmin, cmax
            size_min_box.value, size_max_box.value   = smin, smax
            opac_min_box.value, opac_max_box.value   = omin, omax

            print("[Cell2] Loaded config:", final_cfg)
            print(f" color_processed => [{cmin:.3f}..{cmax:.3f}]")
            print(f" size_processed  => [{smin:.3f}..{smax:.3f}]")
            print(f" opacity_processed => [{omin:.3f}..{omax:.3f}]")

        except ValueError as e:
            print("Error in transform:", e)

load_button = widgets.Button(description="Load & Apply Config", button_style="primary")
load_button.on_click(on_load_apply_config)

display(Markdown("### Load & Apply Config"))
display(load_button)
display(load_output)

display(Markdown("### Min/Max Ranges after transform"))
display(widgets.HBox([color_min_box, color_max_box]))
display(widgets.HBox([size_min_box, size_max_box]))
display(widgets.HBox([opac_min_box, opac_max_box]))

print("Press 'Load & Apply Config' to re-apply transforms, then see updated min/max here.")

### Load & Apply Config

Button(button_style='primary', description='Load & Apply Config', style=ButtonStyle())

Output()

### Min/Max Ranges after transform

HBox(children=(FloatText(value=0.0, description='Color Min:', layout=Layout(width='200px')), FloatText(value=0…

HBox(children=(FloatText(value=0.0, description='Size Min:', layout=Layout(width='200px')), FloatText(value=0.…

HBox(children=(FloatText(value=0.0, description='Opac Min:', layout=Layout(width='200px')), FloatText(value=0.…

Press 'Load & Apply Config' to re-apply transforms, then see updated min/max here.


In [18]:
import numpy as np
import pandas as pd
import plotly.express as px
from plotly.graph_objs import FigureWidget
import ipywidgets as widgets
from IPython.display import display, Markdown
import os, datetime

# Optionally clamp or remap. For demonstration, let's just proceed as is.

###############################################################################
# 4) Build the final scatter. We'll show original columns in hover & legend
###############################################################################
chart_title = "Bubble Chart"

fig = px.scatter(
    merged_df,
    x="scaled_PEAKavg",   # or whichever x-axis
    y="IPTMavg",          # y-axis
    color="color_processed",
    size="size_processed",
    # Show the original columns in hover
    hover_data=[color_col, size_col, opacity_col, "hover_text"], 
    title=chart_title,
    # We can pick a color scale & range if desired
    color_continuous_scale="viridis",
    range_color=[cmin, cmax],  # or override if you like
    labels={
        # Rename color axis to original column name
        "color_processed": color_col,
        "size_processed": size_col
    }
)
fig.update_layout(template="plotly_white")

# We'll create a FigureWidget to attach advanced interactions
figw_dynamic = FigureWidget(fig)

# We want per-point marker opacity from "opacity_processed"
# but must clamp or map them to [0..1] for valid Plotly marker opacity
def remap_to_01(array, old_min, old_max):
    if np.isclose(old_max, old_min):
        return np.full_like(array, 0.5)
    return (array - old_min)/(old_max - old_min)

op_min, op_max = merged_df["opacity_processed"].min(), merged_df["opacity_processed"].max()
op_vals = merged_df["opacity_processed"].values
mapped_opacity = remap_to_01(op_vals, op_min, op_max)
mapped_opacity = np.clip(mapped_opacity, 0, 1)  # final clamp

for trace in figw_dynamic.data:
    trace.marker.opacity = mapped_opacity

###############################################################################
# 5) ADVANCED FEATURES: search, highlight, persistent labels, saving
###############################################################################

# 5a) Global set for persistent selection
global_persisted_indices_dynamic = set()

def handle_selection_dynamic(trace, points, selector):
    global global_persisted_indices_dynamic
    global_persisted_indices_dynamic.update(points.point_inds)
    if not global_persisted_indices_dynamic:
        print("[Dynamic] No points selected.")
        return
    print("[Dynamic] Accumulated selected indices:", global_persisted_indices_dynamic)
    selected_df = merged_df.iloc[list(global_persisted_indices_dynamic)]
    
    # Example label: short uniprot name from 'NAME' or just name
    def short_label(name_str):
        try:
            # Something from SPOC code: parse after _vs_
            parts = name_str.split("_vs_")
            if len(parts) < 2:
                return name_str
            hit = parts[1]
            # parse short
            hit_parts = hit.split("-")
            if len(hit_parts) < 3:
                return hit
            return hit_parts[2].split("_")[0]
        except:
            return name_str
    
    labels = selected_df["NAME"].apply(short_label)
    
    persistent_trace = None
    for t in figw_dynamic.data:
        if t.name == "Persistent Labels (Dynamic)":
            persistent_trace = t
            break
    
    if persistent_trace is None:
        figw_dynamic.add_scatter(
            x=selected_df["scaled_PEAKavg"],
            y=selected_df["IPTMavg"],
            mode="text",
            text=labels,
            textposition="top center",
            name="Persistent Labels (Dynamic)",
            hoverinfo="skip",
            textfont=dict(color="black", size=12)
        )
    else:
        persistent_trace.x = selected_df["scaled_PEAKavg"]
        persistent_trace.y = selected_df["IPTMavg"]
        persistent_trace.text = labels

# Attach selection callback
for trace in figw_dynamic.data:
    trace.on_selection(handle_selection_dynamic)

# 5b) SEARCH: partial name => highlight
search_input_dynamic = widgets.Text(
    value="",
    placeholder="Enter partial NAME to search",
    description="Search NAME:",
    layout={'width': '300px'}
)
search_button_dynamic = widgets.Button(description="Search", button_style="primary")
clear_search_button_dynamic = widgets.Button(description="Clear Search", button_style="warning")

def on_search_button_click_dynamic(b):
    query = search_input_dynamic.value.strip()
    if not query:
        print("[Dynamic] Please enter a search query.")
        return
    mask = merged_df["NAME"].str.contains(query, case=False, na=False)
    matched = merged_df[mask]
    if matched.empty:
        print("[Dynamic] No matches found.")
        return
    figw_dynamic.add_scatter(
        x=matched["scaled_PEAKavg"],
        y=matched["IPTMavg"],
        mode="markers+text",
        marker=dict(symbol="circle-open", size=12, line=dict(width=2, color="red")),
        text=[query]*len(matched),
        textposition="top center",
        name="Search Highlight (Dynamic)",
        hoverinfo="skip"
    )
    print(f"[Dynamic] Found {len(matched)} matches. Highlights added.")

def on_clear_search_button_click_dynamic(b):
    to_remove = [i for i, t in enumerate(figw_dynamic.data) if t.name == "Search Highlight (Dynamic)"]
    if not to_remove:
        print("[Dynamic] No highlights to clear.")
        return
    for idx in sorted(to_remove, reverse=True):
        figw_dynamic.data = figw_dynamic.data[:idx] + figw_dynamic.data[idx+1:]
    print("[Dynamic] Search highlights cleared.")

search_button_dynamic.on_click(on_search_button_click_dynamic)
clear_search_button_dynamic.on_click(on_clear_search_button_click_dynamic)

# 5c) MULTI-GROUP HIGHLIGHT
group_highlight_input_dynamic = widgets.Text(
    value="(1,5=green) (2,9=red)",
    placeholder="(1,5=green) (2,9=red)",
    description="Multi-Groups:",
    layout={'width': '600px'}
)
group_highlight_button_dynamic = widgets.Button(
    description="Highlight Groups",
    tooltip="Highlight multiple index groups, each with a color",
    button_style="info"
)

def on_group_highlight_button_click_dynamic(b):
    user_str = group_highlight_input_dynamic.value.strip()
    if not user_str:
        print("[Dynamic] No group spec. Format: (1,5=red) (2,9=blue)")
        return
    group_specs = [chunk.strip() for chunk in user_str.split(")") if chunk.strip()]
    for spec in group_specs:
        if spec.startswith("("):
            spec = spec[1:].strip()
        if "=" in spec:
            left, c = spec.split("=", 1)
            idx_str = left.strip()
            color_str = c.strip()
        else:
            idx_str = spec
            color_str = "red"
        idx_list = [x.strip() for x in idx_str.split(",") if x.strip()]
        matched = merged_df[merged_df["index"].isin(idx_list)]
        if matched.empty:
            print(f"[Dynamic] No match for indices {idx_list}.")
            continue
        
        # Example label from 'protein_name_hit' or short uniprot name. We'll just use 'NAME' here
        group_label = matched["NAME"] 
        figw_dynamic.add_scatter(
            x=matched["scaled_PEAKavg"],
            y=matched["IPTMavg"],
            mode="markers+text",
            marker=dict(symbol="circle", color=color_str, size=12),
            text=group_label,
            textposition="top center",
            name="Highlight (Dynamic)",
            hoverinfo="skip"
        )
        print(f"[Dynamic] Highlighted indices {idx_list} in '{color_str}'.")
    print("[Dynamic] Group highlight done.")

group_highlight_button_dynamic.on_click(on_group_highlight_button_click_dynamic)

# 5d) CLEAR LABELS & HIGHLIGHTS
clear_labels_button_dynamic = widgets.Button(description="Clear Labels", button_style="warning")
def on_clear_labels_click_dynamic(b):
    global_persisted_indices_dynamic.clear()
    remove_names = ["Persistent Labels (Dynamic)", "Highlight (Dynamic)", "Search Highlight (Dynamic)"]
    idx_remove = [i for i,t in enumerate(figw_dynamic.data) 
                  if t.name in remove_names or t.name.startswith("Highlight ")]
    for idx in sorted(idx_remove, reverse=True):
        figw_dynamic.data = figw_dynamic.data[:idx] + figw_dynamic.data[idx+1:]
    print("[Dynamic] Cleared labels/highlights.")

clear_labels_button_dynamic.on_click(on_clear_labels_click_dynamic)

# 5e) SAVE PLOT
custom_suffix_dynamic = widgets.Text(
    value="",
    placeholder="File name suffix",
    description="File Suffix:",
    layout={'width': '300px'}
)
save_plot_button_dynamic = widgets.Button(description="Save Plot (HTML & PDF)", button_style="info")

def on_save_plot_click_dynamic(b):
    try:
        suffix = custom_suffix_dynamic.value.strip()
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        if suffix:
            suffix_str = f"_{suffix}_{timestamp}"
        else:
            suffix_str = f"_{timestamp}"

        html_name = f"spoc_bubble_chart{suffix_str}.html"
        pdf_name  = f"spoc_bubble_chart{suffix_str}.pdf"
        html_path = os.path.join(output_dir, html_name)
        pdf_path  = os.path.join(output_dir, pdf_name)

        figw_dynamic.write_html(html_path)
        figw_dynamic.write_image(pdf_path, format="pdf")
        print(f"[Dynamic] Plot saved:\n  HTML: {html_path}\n  PDF: {pdf_path}")
    except Exception as e:
        print("[Dynamic] Error saving plot:", e)

save_plot_button_dynamic.on_click(on_save_plot_click_dynamic)

# 5f) SAVE SELECTED DATA
file_name_widget_dynamic = widgets.Text(
    value="selected_data_dynamic.csv",
    placeholder="Enter file name",
    description="Save CSV as:",
    layout={'width': '300px'}
)
save_data_button_dynamic = widgets.Button(description="Save Data", button_style="success")
save_data_output_dynamic = widgets.Output()

def on_save_data_click_dynamic(b):
    with save_data_output_dynamic:
        save_data_output_dynamic.clear_output()
        if not global_persisted_indices_dynamic:
            print("[Dynamic] No points selected!")
            return
        selected_df = merged_df.iloc[list(global_persisted_indices_dynamic)]
        out_file = file_name_widget_dynamic.value
        full_path = os.path.join(output_dir, out_file)
        selected_df.to_csv(full_path, index=False)
        print(f"[Dynamic] Selected data saved to: {full_path}")

save_data_button_dynamic.on_click(on_save_data_click_dynamic)

###############################################################################
# 6) Display final UI
###############################################################################
instructions_text = """
**Instructions**:
1. Lasso/Box select points => persistent labels.
2. Search by partial `NAME` => highlight.
3. Group highlight => e.g. `(1,5=green) (2,9=red)`.
4. Clear labels/highlights as needed.
5. Save the plot or the selected data.
"""
display(Markdown(instructions_text))
# Search & highlight
display(widgets.HBox([search_input_dynamic, search_button_dynamic, clear_search_button_dynamic]))
display(widgets.HBox([group_highlight_input_dynamic, group_highlight_button_dynamic, clear_labels_button_dynamic]))
# Suffix & save
display(widgets.HBox([custom_suffix_dynamic, save_plot_button_dynamic]))
# Show figure
display(figw_dynamic)
# CSV saving
display(Markdown("#### Save Selected Data"))
display(widgets.HBox([file_name_widget_dynamic, save_data_button_dynamic]))
display(save_data_output_dynamic)

print("Done. Negative or zero data are handled by replacing zeros with 5% quantile, transform if valid, etc.")

NameError: name 'color_col' is not defined