In [1]:
## Install/import packages and define key variables and functions
# Run install script
# %chmod +x setup_jupyterlab.sh
# %./setup_jupyterlab.sh

# Import necessary libraries for the script to function.
import pandas as pd
import tempfile, requests,time, csv, json, re, os, shutil, io, base64, time, subprocess, sqlite3, zipfile, base64
from io import StringIO, BytesIO
import numpy as np

# UniProt client
from xml.etree import ElementTree
from utils.uniprot_client import fetch_uniprot_info_batch, fetch_uniprot_info, UniProtClient
#from django.conf import settings
from collections import defaultdict
from datetime import datetime

#import statsmodels.api as sm
#from statsmodels.formula.api import ols
#from statsmodels.stats.multicomp import pairwise_tukeyhsd
import warnings

from scipy.stats import pearsonr, spearmanr

from itertools import combinations
from ipydatagrid import DataGrid

from IPython.display import display, HTML, clear_output
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual

import traitlets
from traitlets import HasTraits, Instance, observe

# Global variable declaration

import _settings as settings
# Species translation table taken from the project settings module.
# find_species() reads each entry as [canonical_name, search_term, ...].
# A module-level assignment is already global, so no `global` statement is required.
spec_translate_list = settings.SPEC_TRANSLATE_LIST
# Set the default font to Calibri
#matplotlib.rcParams['font.family'] = 'Calibri'


def find_species(header, spec_translate_list):
    """Return the canonical species name whose search terms occur in ``header``.

    Args:
        header: Text to scan (a FASTA header line in this notebook).
        spec_translate_list: Iterable of sequences. Element 0 of each entry is
            the canonical name to return; elements 1.. are the terms to look for.

    Returns:
        The canonical name of the first entry with a matching term, or
        ``"unknown"`` when nothing matches. Matching is a case-insensitive
        substring test; the canonical name itself is not used as a search term.
    """
    haystack = header.lower()
    for spec_group in spec_translate_list:
        # Entries are tried in order, so earlier groups win on overlapping terms.
        if any(term.lower() in haystack for term in spec_group[1:]):
            return spec_group[0]
    return "unknown"

def parse_headers():
    """Build a protein lookup table from the headers in ``protein_headers.txt``.

    Only lines starting with ``>`` are considered. A header is used when it has
    at least three ``|``-separated fields and a non-empty second field; that
    second field becomes the dictionary key.

    Returns:
        dict: ``{protein_id: {"name": ..., "species": ...}}`` where ``name`` is
        the third field truncated at the first `` OS=`` and ``species`` comes
        from ``find_species`` applied to the whole header line. A protein ID
        that occurs more than once keeps the values of its last occurrence.
    """
    fasta_dict = {}
    with open("protein_headers.txt", 'r') as file:
        for raw_line in file:
            header = raw_line.strip()
            if not header.startswith('>'):
                continue

            fields = header[1:].split('|')
            if len(fields) <= 2:
                # Not in the ">db|ID|description" layout; nothing to record.
                continue

            accession = fields[1]
            if not accession:
                continue

            fasta_dict[accession] = {
                "name": re.split(r' OS=', fields[2])[0],
                "species": find_species(header, spec_translate_list)
            }
    return fasta_dict


In [2]:
class DataTransformation(HasTraits):
    pd_results = Instance(pd.DataFrame, allow_none=True)
    mbpdb_results = Instance(pd.DataFrame, allow_none=True)
    # pd_results_cleaned = Instance(pd.DataFrame, allow_none=True)
    search_results = Instance(pd.DataFrame, allow_none=True)
    protein_dict = {}  # Add explicit trait for protein_dict

    def __init__(self):
        """Initialise empty result frames, load the default protein table and build the UI.

        The three DataFrame traits start out empty, ``protein_dict`` is filled
        from ``parse_headers()`` and the upload/search widgets are constructed
        by ``setup_data_loading_ui()`` as the final step.
        """
        super().__init__()
        self.pd_results = pd.DataFrame()
        # self.pd_results_cleaned = pd.DataFrame()
        self.mbpdb_results = pd.DataFrame()
        self.search_results = pd.DataFrame()
        # Default protein table; replaced by _on_fasta_upload_change when FASTA files are uploaded.
        self.protein_dict = parse_headers()
        # Checkbox widgets used as boolean flags: set to True by _on_search_click and
        # _on_fasta_upload_change respectively (presumably observed elsewhere -- confirm).
        self.mbpdb_results_from_search_placeholder = widgets.Checkbox(value=False)
        self.fasta_uploader_placeholder = widgets.Checkbox(value=False)
        self.missing_proteins = set()
        # Must run last: it creates the widgets and registers the callbacks on this instance.
        self.setup_data_loading_ui()

    def create_download_link(self, file_path, label):
        """Return an HTML widget that lets the user download ``file_path``.

        The file is read in full and embedded in the link as a base64 ``data:``
        URI, so the widget works without a file server.

        Args:
            file_path: Path of the file to offer for download.
            label: Link text.

        Returns:
            widgets.HTML: the download link, or a red "not found" message when
            ``file_path`` does not exist.
        """
        if not os.path.exists(file_path):
            # Missing file: report it inline instead of raising.
            return widgets.HTML(f"""
                <span style="color: red; margin-left: 20px; font-size: 14px;">
                    File "{file_path}" not found!
                </span>
            """)

        with open(file_path, 'rb') as source:
            raw_bytes = source.read()
        encoded = base64.b64encode(raw_bytes).decode('utf-8')
        download_name = os.path.basename(file_path)

        return widgets.HTML(f"""
                <a download="{download_name}" 
                   href="data:application/octet-stream;base64,{encoded}" 
                   style="color: #0366d6; text-decoration: none; margin-left: 20px; font-size: 14px;">
                    {label}
                </a>
            """)

    """
    def setup_search_ui(self, peptides):
        # Create dropdown for similarity threshold
        self.threshold_dropdown = widgets.Dropdown(
            options=list(range(0, 101, 10)),
            value=80,
            description='Similarity Threshold:',
            style={'description_width': 'initial'},
            layout=widgets.Layout(width='300px')
        )

        # Create search button
        self.search_button = widgets.Button(
            description='Search Peptides',
            button_style='primary',
            layout=widgets.Layout(width='200px')
        )

        # Progress indicator
        self.search_progress = widgets.HTML(
            value="",
            layout=widgets.Layout(margin='10px 0px')
        )

        # Connect button click to handler
        self.search_button.on_click(lambda b: self._on_search_click(b, ))

        # Create layout
        self.search_widget = widgets.VBox([
            widgets.HBox([
                self.threshold_dropdown,
                self.search_button
            ], layout=widgets.Layout(align_items='center')),
            self.search_progress
        ])

        display(self.search_widget)
    """
    
    def _on_search_click(self, b):
        """Run the database search for the uploaded peptides and report progress.

        All messages are rendered into ``self.output_area``. On success
        ``self.mbpdb_results`` holds the formatted matches and the
        ``mbpdb_results_from_search_placeholder`` flag is set to True. When the
        search returns no rows the raw (empty) result is stored. Any exception
        is shown as a red message and ``self.mbpdb_results`` is reset to an
        empty DataFrame.

        Args:
            b: The clicked button (unused).
        """
        def announce(colour, message):
            # Single place that builds the bold, coloured status line.
            display(HTML(f"<b style='color:{colour}'>{message}</b>"))

        with self.output_area:
            clear_output()

            nothing_uploaded = self.pd_results is None or self.pd_results.empty
            if nothing_uploaded:
                announce('red', "Please upload peptidomic data first.")
                return

            try:
                # Extract sequences from peptidomic data
                self.peptides = self._extract_sequences(self.pd_results)
                if not self.peptides:
                    announce('red', "No valid sequences found in peptidomic data.")
                    return

                announce('blue', f"Found {len(self.peptides)} unique peptide sequences. Searching database...")

                hits = self._search_peptides_comprehensive(
                    self.peptides,
                    similarity_threshold=self.threshold_dropdown.value
                )

                if hits.empty:
                    self.mbpdb_results = hits
                    announce('orange', "No matches found in the database.")
                else:
                    self.mbpdb_results = self._format_search_results_with_matches(hits)
                    # Replace the progress message with the final summary.
                    clear_output()
                    announce('green', f"MBPDB Search complete! Found {len(self.mbpdb_results)} matches")
                    self.mbpdb_results_from_search_placeholder.value = True

            except Exception as e:
                announce('red', f"Error: {str(e)}")
                self.mbpdb_results = pd.DataFrame()

    def _search_peptides_comprehensive(self, peptides, similarity_threshold=100):
        """Look up each peptide in the MBPDB SQLite database, exactly or via BLAST.

        A peptide is matched exactly (``p.peptide = ?``) when
        ``similarity_threshold`` is 100 or the peptide is shorter than four
        residues. Otherwise ``blastp -task blastp-short`` is run against a FASTA
        dump of every database peptide and the hits that pass the threshold are
        fetched by ID.

        Args:
            peptides: Iterable of peptide sequences (strings) to search for.
            similarity_threshold: Minimum similarity percentage for BLAST hits.

        Returns:
            pandas.DataFrame: the combined rows as produced by ``_combine_results``.

        Raises:
            Whatever ``sqlite3``, ``subprocess`` (e.g. ``CalledProcessError`` or
            ``FileNotFoundError`` when BLAST+ is not installed) or file I/O raise;
            the database connection is closed in every case.
        """
        # Both paths are relative to the notebook's working directory.
        WORK_DIRECTORY = '../../uploads/temp'
        DATABASE_PATH = '../../db.sqlite3'

        # The search peptide is bound twice: once as an output column, once in WHERE.
        exact_match_sql = """
        SELECT DISTINCT
            ? as search_peptide,
            pi.pid as protein_id,
            p.id as peptide_id,
            p.peptide,
            pi.desc as protein_description,
            pi.species,
            p.intervals,
            f.function,
            r.additional_details,
            r.ic50,
            r.inhibition_type,
            r.inhibited_microorganisms,
            r.ptm,
            r.title,
            r.authors,
            r.abstract,
            r.doi,
            'sequence' as search_type,
            'IDENTITY' as scoring_matrix
        FROM peptide_peptideinfo p
        JOIN peptide_proteininfo pi ON p.protein_id = pi.id
        LEFT JOIN peptide_function f ON f.pep_id = p.id
        LEFT JOIN peptide_reference r ON r.func_id = f.id
        WHERE p.peptide = ?
        """

        results = []
        extra_info = defaultdict(list)
        conn = sqlite3.connect(DATABASE_PATH)
        try:
            work_path = self._create_work_directory(WORK_DIRECTORY)
            fasta_db_path = os.path.join(work_path, "db.fasta")
            query_path = os.path.join(work_path, "query.fasta")
            output_path = os.path.join(work_path, "blastp_short.out")
            blast_db_ready = False

            # Iterate the argument (not self.peptides) so the method honours its signature.
            for peptide in peptides:
                if similarity_threshold == 100 or len(peptide) < 4:
                    results.append(
                        pd.read_sql_query(exact_match_sql, conn, params=[peptide, peptide])
                    )
                    continue

                if not blast_db_ready:
                    # Built on first use so exact-match-only searches do not need BLAST+.
                    db_peptides = pd.read_sql_query(
                        "SELECT p.id, p.peptide FROM peptide_peptideinfo p", conn
                    )
                    with open(fasta_db_path, 'w') as f:
                        for peptide_id, sequence in zip(db_peptides['id'], db_peptides['peptide']):
                            f.write(f">{peptide_id}\n{sequence}\n")
                    self._make_blast_db(fasta_db_path)
                    blast_db_ready = True

                # Run BLASTP search for similarity matching
                with open(query_path, "w") as query_file:
                    query_file.write(f">pep_query\n{peptide}\n")

                blast_args = [
                    "blastp",
                    "-query", query_path,
                    "-db", fasta_db_path,
                    "-outfmt", "6 std ppos qcovs qlen slen positive",
                    "-evalue", "1000",
                    "-word_size", "2",
                    "-matrix", "IDENTITY",
                    "-threshold", "1",
                    "-task", "blastp-short",
                    "-out", output_path
                ]
                subprocess.check_output(blast_args, stderr=subprocess.STDOUT)

                # Process BLAST results
                search_ids = self._process_blast_results(output_path, similarity_threshold, extra_info)
                if search_ids:
                    df = self._fetch_peptide_data(conn, peptide, search_ids)
                    self._add_blast_details(df, extra_info)
                    results.append(df)
        finally:
            # Always release the connection; the directory pruning is best-effort.
            conn.close()
            self._cleanup_work_directory(WORK_DIRECTORY)

        return self._combine_results(results)

    def _create_work_directory(self, base_dir):
        """Create a working directory for BLAST operations"""
        path = os.path.join(base_dir, f'work_{int(round(time.time() * 1000))}')
        os.makedirs(path)
        return path

    def _make_blast_db(self, library_fasta_path):
        """Run ``makeblastdb`` to index ``library_fasta_path`` as a protein database.

        stderr is merged into the captured output; a non-zero exit status
        raises ``subprocess.CalledProcessError``.
        """
        command = ['makeblastdb', '-in', library_fasta_path, '-dbtype', 'prot']
        subprocess.check_output(command, stderr=subprocess.STDOUT)

    def _process_blast_results(self, output_path, similarity_threshold, extra_info):
        """Process BLAST results and collect search IDs"""
        search_ids = []
        csv.register_dialect('blast_dialect', delimiter='\t')

        with open(output_path, "r") as output_file:
            blast_data = csv.DictReader(
                output_file,
                fieldnames=['query', 'subject', 'percid', 'align_len', 'mismatches',
                            'gaps', 'qstart', 'qend', 'sstart', 'send', 'evalue',
                            'bitscore', 'ppos', 'qcov', 'qlen', 'slen', 'numpos'],
                dialect='blast_dialect'
            )

            for row in blast_data:
                tlen = float(row['slen']) if float(row['slen']) > float(row['qlen']) else float(row['qlen'])
                simcalc = 100 * ((float(row['numpos']) - float(row['gaps'])) / tlen)

                if simcalc >= similarity_threshold:
                    search_ids.append(row['subject'])
                    extra_info[row['subject']] = [
                        f"{simcalc:.2f}", row['qstart'], row['qend'], row['sstart'],
                        row['send'], row['evalue'], row['align_len'], row['mismatches'],
                        row['gaps']
                    ]

        return search_ids

    def _fetch_peptide_data(self, conn, peptide, search_ids):
        """Fetch peptide data from database"""
        placeholders = ','.join(['?' for _ in search_ids])
        query = f"""
        SELECT DISTINCT
            ? as search_peptide,
            pi.pid as protein_id,
            p.id as peptide_id,
            p.peptide,
            pi.desc as protein_description,
            pi.species,
            p.intervals,
            f.function,
            r.additional_details,
            r.ic50,
            r.inhibition_type,
            r.inhibited_microorganisms,
            r.ptm,
            r.title,
            r.authors,
            r.abstract,
            r.doi,
            'sequence' as search_type,
            'IDENTITY' as scoring_matrix
        FROM peptide_peptideinfo p
        JOIN peptide_proteininfo pi ON p.protein_id = pi.id
        LEFT JOIN peptide_function f ON f.pep_id = p.id
        LEFT JOIN peptide_reference r ON r.func_id = f.id
        WHERE p.id IN ({placeholders})
        """

        return pd.read_sql_query(query, conn, params=[peptide] + search_ids)

    def _add_blast_details(self, df, extra_info):
        """Add BLAST details to DataFrame"""
        for idx, row in df.iterrows():
            if str(row['peptide_id']) in extra_info:
                blast_details = extra_info[str(row['peptide_id'])]
                df.at[idx, '% Alignment'] = blast_details[0]
                df.at[idx, 'Query start'] = blast_details[1]
                df.at[idx, 'Query end'] = blast_details[2]
                df.at[idx, 'Subject start'] = blast_details[3]
                df.at[idx, 'Subject end'] = blast_details[4]
                df.at[idx, 'e-value'] = blast_details[5]
                df.at[idx, 'Alignment length'] = blast_details[6]
                df.at[idx, 'Mismatches'] = blast_details[7]
                df.at[idx, 'Gap opens'] = blast_details[8]

    def _cleanup_work_directory(self, work_directory):
        """Clean up old work directories"""
        try:
            dirs = [f for f in os.scandir(work_directory) if f.is_dir()]
            dirs.sort(key=lambda x: os.path.getmtime(x.path), reverse=True)

            for dir_entry in dirs[25:]:
                try:
                    shutil.rmtree(dir_entry.path)
                except Exception:
                    pass
        except Exception:
            pass

    def _combine_results(self, results):
        """Combine and format final results"""
        if not results:
            mbpdb_columns = [
                'search_peptide', 'protein_id', 'peptide', 'protein_description',
                'species', 'intervals', 'function', 'additional_details', 'ic50',
                'inhibition_type', 'inhibited_microorganisms', 'ptm', 'title',
                'authors', 'abstract', 'doi', 'search_type', 'scoring_matrix'
            ]
            return pd.DataFrame(columns=mbpdb_columns)

        final_results = pd.concat(results, ignore_index=True)

        if 'peptide_id' in final_results.columns:
            final_results = final_results.drop('peptide_id', axis=1)

        sort_columns = ['search_peptide']
        if '% Alignment' in final_results.columns:
            sort_columns.append('% Alignment')

        return final_results.sort_values(
            sort_columns,
            ascending=[True] + [False] * (len(sort_columns) - 1)
        )

    def _format_search_results_with_matches(self, final_results):
        """Format search results with matches"""
        if '% Alignment' in final_results.columns:
            final_results['% Alignment'] = pd.to_numeric(
                final_results['% Alignment'],
                errors='coerce'
            )

        grouped = final_results.groupby(["search_peptide", "function"], as_index=False)
        aggregated_results = []
        processed_indices = set()

        for _, group in grouped:
            if len(group) > 1:
                aggregated_row = self._aggregate_group_data(group)
                aggregated_results.append(aggregated_row)
                processed_indices.update(group.index)

        remaining_rows = final_results.loc[~final_results.index.isin(processed_indices)]
        aggregated_df = pd.DataFrame(aggregated_results)

        return pd.concat([aggregated_df, remaining_rows], ignore_index=True)

    def _aggregate_group_data(self, group):
        """Aggregate data for a group of results"""

        def enumerate_field(field):
            if field in group.columns and not group[field].dropna().empty:
                valid_values = set(group[field].dropna().astype(str).str.strip())
                valid_values = {val for val in valid_values if val != ''}
                if len(valid_values) > 1:
                    return "; ".join([f"{i + 1}) {val}" for i, val in enumerate(valid_values)])
                elif len(valid_values) == 1:
                    return next(iter(valid_values))
                return ''
            return ''

        return {col: enumerate_field(col) for col in group.columns}

    def setup_data_loading_ui(self):
        """Build the upload and search widgets and register their callbacks.

        Creates the three ``FileUpload`` widgets (MBPDB, peptidomic, FASTA), the
        similarity-threshold dropdown, the search button and the shared output
        area, and arranges them in ``self.first_main_container``. The container
        is only constructed here; this method does not call ``display``.
        """

        def create_help_icon(tooltip_text):
            """Return an HTML widget with a question-mark icon and a hover tooltip."""
            icon_markup = '<i class="fa fa-question-circle" style="color: #007bff;"></i>'
            return widgets.HTML(
                f'<div title="{tooltip_text}" style="display: inline-block;">{icon_markup}</div>'
            )

        def create_labeled_uploader(widget, label, tooltip):
            """Put ``widget`` next to a help icon; ``label`` is accepted but not rendered."""
            return widgets.HBox([
                widget,
                create_help_icon(tooltip)
            ], layout=widgets.Layout(align_items='center'))

        def create_file_uploader(description, accept, multiple):
            """Create a FileUpload button with the layout shared by all three uploaders."""
            return widgets.FileUpload(
                accept=accept,
                multiple=multiple,
                description=description,
                layout=widgets.Layout(width='300px'),
                style={'description_width': 'initial'}
            )

        # Create file upload widgets with the same configurations
        tabular_formats = '.csv,.txt,.tsv,.xlsx'
        self.mbpdb_uploader = create_file_uploader('Upload MBPDB File', tabular_formats, False)
        self.pd_uploader = create_file_uploader('Upload Peptidomic File', tabular_formats, False)
        self.fasta_uploader = create_file_uploader('Upload FASTA Files', '.fasta', True)

        # Create search interface
        self.threshold_dropdown = widgets.Dropdown(
            options=list(range(0, 101, 10)),
            value=80,
            description='Similarity Threshold (%):',
            style={'description_width': 'initial'},
            layout=widgets.Layout(width='225px')
        )

        self.search_button = widgets.Button(
            description='Search Database',
            button_style='primary',
            layout=widgets.Layout(width='150px')
        )

        # Create output areas
        self.output_area = widgets.Output()

        mbpdb_box = widgets.HBox([
            # The style attribute was previously malformed ("<div margin-bottom: 5px;'>").
            widgets.HTML("""
                    <div style='margin-bottom: 5px;'>
                        <b>Option 1: Upload File</b>
                    </div>
                """),
            self.create_download_link(
                "examples/example_MBPDB_search.tsv",
                "Example"
            )
        ])
        # Create MBPDB options section
        mbpdb_options = widgets.HBox([widgets.VBox([
            mbpdb_box,
            create_labeled_uploader(
                self.mbpdb_uploader,
                "MBPDB File",
                "Upload your own MBPDB file (optional)"
            )
        ]),
            widgets.HTML("<div style='margin: 0 20px; line-height: 100px;'><b>OR</b></div>"),
            widgets.VBox([
                widgets.HTML("<div style='font-weight: bold; margin-bottom: 5px;'>Option 2: Search Database</div>"),
                widgets.HBox([
                    self.threshold_dropdown,
                    self.search_button,
                    create_help_icon("Search peptides against the MBPDB (optional)")
                ], layout=widgets.Layout(align_items='center'))
            ])
        ], layout=widgets.Layout(align_items='center', margin='0'))

        # Create peptide file uploader box with example link
        peptide_box = widgets.HBox([
            create_labeled_uploader(
                self.pd_uploader,
                "Peptidomic File",
                "Upload peptide groups data from Proteome Discover export file (required)"
            ),
            self.create_download_link(
                "examples/example_peptide_data.csv",
                "Example"
            )
        ], layout=widgets.Layout(align_items='center'))

        # Create FASTA uploader box with example link
        fasta_box = widgets.HBox([
            create_labeled_uploader(
                self.fasta_uploader,
                "FASTA Files",
                "Upload Protein FASTA file used in Proteome Discoverer Search (optional)"
            ),
            self.create_download_link(
                "examples/example_fasta.fasta",
                "Example"
            )
        ], layout=widgets.Layout(align_items='center'))

        # Create main container
        self.first_main_container = widgets.VBox([
            widgets.HTML("<h3><u>Upload Peptidomic Data Files:</u></h3>"),
            peptide_box,
            widgets.HTML("<h3 style='margin-bottom: 0;'><u>MBPDB Data (Optional):</u></h3>"),
            mbpdb_options,
            widgets.HTML("<h3><u>Upload Protein FASTA Files (Optional):</u></h3>"),
            fasta_box,
            widgets.HTML("<br>"),
            widgets.HTML("<div style='margin-top: 10px;'></div>"),
            self.output_area,
        ])

        # Register observers
        self.pd_uploader.observe(self._on_pd_upload_change, names='value')
        self.mbpdb_uploader.observe(self._on_mbpdb_upload_change, names='value')
        self.fasta_uploader.observe(self._on_fasta_upload_change, names='value')
        self.search_button.on_click(self._on_search_click)

    def _extract_sequences(self, df):
        """Extract sequences from peptidomic data"""
        if 'Sequence' not in df.columns:
            # First create Sequence column with NaN values
            df['Sequence'] = pd.NA
            
            def extract_sequence(annotated_seq):
                if pd.isna(annotated_seq):
                    return pd.NA
                
                # Split by comma if present to handle multiple sequences
                if ',' in annotated_seq:
                    sequences = []
                    for seq in annotated_seq.split(','):
                        seq = seq.strip()
                        # Handle [X].SEQUENCE.[X] format
                        if '.' in seq:
                            parts = seq.split('.')
                            if len(parts) > 1:
                                sequences.append(parts[1])
                        # Handle plain sequence
                        else:
                            sequences.append(seq)
                    return sequences
                
                # Single sequence case
                # Handle [X].SEQUENCE.[X] format
                if '.' in annotated_seq:
                    parts = annotated_seq.split('.')
                    if len(parts) > 1:
                        return parts[1]
                
                # Handle plain sequence
                return annotated_seq
            
            # Apply the extraction function and explode the results
            df['Sequence'] = df['Annotated Sequence'].apply(extract_sequence)
            # Explode sequences if they're in a list (from comma separation)
            df = df.explode('Sequence')
            
        return df['Sequence'].dropna().unique().tolist()

    def _on_pd_upload_change(self, change):
        """Load the uploaded peptidomic file into ``self.pd_results``.

        Reacts only to ``value`` change events. The output area is cleared, the
        first uploaded file is passed to ``_load_data`` (which must find a
        ``Positions in Proteins`` column) and, on success,
        ``_find_missing_proteins`` is run.
        """
        if change['type'] != 'change' or change['name'] != 'value':
            return

        with self.output_area:
            self.output_area.clear_output()

            uploads = change['new']
            if not (uploads and len(uploads) > 0):
                return

            # The uploader is single-file, so only the first entry is relevant.
            self.pd_results, pd_status = self._load_data(
                uploads[0],
                required_columns=['Positions in Proteins'],
                file_type='Peptidomic'
            )
            loaded_ok = pd_status == 'yes' and self.pd_results is not None
            if loaded_ok:
                self._find_missing_proteins()

    def _on_mbpdb_upload_change(self, change):
        """Load the uploaded MBPDB file into ``self.mbpdb_results``.

        Reacts only to ``value`` change events. The first uploaded file is
        passed to ``_load_data``; on success the export's display headers are
        renamed in place to the snake_case names used for database search results.
        """
        if change['type'] != 'change' or change['name'] != 'value':
            return

        # Display header in the MBPDB export -> internal column name.
        header_to_column = {
            'Search peptide': 'search_peptide',
            'Protein ID': 'protein_id',
            'Peptide': 'peptide',
            'Protein description': 'protein_description',
            'Species': 'species',
            'Intervals': 'intervals',
            'Function': 'function',
            'Additional details': 'additional_details',
            'IC50 (μM)': 'ic50',
            'Inhibition type': 'inhibition_type',
            'Inhibited microorganisms': 'inhibited_microorganisms',
            'PTM': 'ptm',
            'Title': 'title',
            'Authors': 'authors',
            'Abstract': 'abstract',
            'DOI': 'doi',
            'Search type': 'search_type',
            'Scoring matrix': 'scoring_matrix',
        }

        with self.output_area:
            self.output_area.clear_output()

            uploads = change['new']
            if not (uploads and len(uploads) > 0):
                return

            self.mbpdb_results, mbpdb_status = self._load_data(
                uploads[0],
                required_columns=['Search peptide', 'Protein ID', 'Peptide'],
                file_type='MBPDB'
            )
            if mbpdb_status == 'yes' and self.mbpdb_results is not None:
                # Renamed in place so the trait keeps pointing at the same frame.
                self.mbpdb_results.rename(columns=header_to_column, inplace=True)

    def _on_fasta_upload_change(self, change):
        """Replace ``self.protein_dict`` with the proteins from the uploaded FASTA files.

        Reacts only to ``value`` change events. Each uploaded file whose name
        ends in ``.fasta`` is parsed with ``_parse_uploaded_fasta`` and merged
        into one dictionary; a file that fails to parse is reported with
        ``print`` and skipped. Afterwards the merged dictionary replaces
        ``self.protein_dict`` (even if it is empty), the
        ``fasta_uploader_placeholder`` flag is set and ``_find_missing_proteins``
        is run.
        """
        if change['type'] != 'change' or change['name'] != 'value':
            return

        new_proteins = {}
        with self.output_area:
            self.output_area.clear_output()

            uploads = change['new']
            if not (uploads and len(uploads) > 0):
                return

            for file_data in uploads:
                try:
                    if not file_data.name.endswith('.fasta'):
                        continue
                    # Remembers the most recently processed file name.
                    self.fasta_filename = file_data.name
                    new_proteins.update(self._parse_uploaded_fasta(file_data))
                except Exception as e:
                    print(f"Error: {str(e)}")

            # Update protein_dict with new data
            self.protein_dict = new_proteins
            self.fasta_uploader_placeholder.value = True
            self._find_missing_proteins()

    def _load_data(self, file_obj, required_columns, file_type):
        """
        Load and validate uploaded data files, cleaning empty rows and validating data.
        
        Args:
            file_obj: Uploaded file object
            required_columns (list): List of required column names (either single names or pairs)
            file_type (str): Type of file being loaded ('MBPDB' or 'Peptidomic')
            
        Returns:
            tuple: (DataFrame or None, status string 'yes'/'no')
        """
        try:
            content = file_obj.content
            filename = file_obj.name
            extension = filename.split('.')[-1].lower()
            
            file_stream = io.BytesIO(content)
            
            # Load data based on file extension with multiple delimiter attempts
            if extension == 'csv':
                # Try different delimiters in order of common usage
                delimiters = [',', ';', '|', '\t']
                df = None
                successful_delimiter = None
                
                for delimiter in delimiters:
                    try:
                        # Reset file stream position
                        file_stream.seek(0)
                        temp_df = pd.read_csv(file_stream, sep=delimiter)
                        
                        # Check if we got more than one column
                        if len(temp_df.columns) > 1:
                            df = temp_df
                            successful_delimiter = delimiter
                            break
                    except:
                        continue
                        
                if df is None:
                    raise ValueError("Could not parse CSV file with any common delimiter (tried: comma, semicolon, pipe, tab)")
                
                # Show which delimiter was used
                #display(HTML(f'<b style="color:blue;">File parsed using delimiter: {successful_delimiter}</b>'))
                
            elif extension in ['txt', 'tsv']:
                # For txt/tsv files, try tab first, then other delimiters
                delimiters = ['\t', ',', ';', '|']
                df = None
                successful_delimiter = None
                
                for delimiter in delimiters:
                    try:
                        file_stream.seek(0)
                        temp_df = pd.read_csv(file_stream, sep=delimiter)
                        if len(temp_df.columns) > 1:
                            df = temp_df
                            successful_delimiter = delimiter
                            break
                    except:
                        continue
                        
                if df is None:
                    raise ValueError("Could not parse TXT/TSV file with any common delimiter")
                    
                #display(HTML(f'<b style="color:blue;">File parsed using delimiter: {successful_delimiter}</b>'))
                
            elif extension == 'xlsx':
                df = pd.read_excel(file_stream)
            else:
                raise ValueError("Unsupported file format. Please upload .csv, .txt, .tsv, or .xlsx files.")
            
            # Clean column names
            df.columns = df.columns.str.strip()
            
            # Drop empty rows
            df = df.dropna(how='all')
            df = df[~(df.astype(str).apply(lambda x: x.str.strip().eq('')).all(axis=1))]
            
            # Handle validation differently based on file type
            if file_type == 'MBPDB':
                self.mbpdb_filename = filename

                # Use column pairs for MBPDB validation
                column_pairs = {
                    'Search peptide': 'search_peptide',
                    'Protein ID': 'protein_id',
                    'Peptide': 'peptide'
                }
                
                # Check for required columns in either format
                missing_pairs = []
                for orig_col, std_col in column_pairs.items():
                    if not (orig_col in df.columns or std_col in df.columns):
                        missing_pairs.append(f"'{orig_col}' or '{std_col}'")
                
                if missing_pairs:
                    display(HTML(f'<b style="color:red;">{file_type} File Error: Missing required columns: {", ".join(missing_pairs)}</b>'))
                    return None, 'no'
                
                # Validate non-empty required columns
                empty_pairs = []
                for orig_col, std_col in column_pairs.items():
                    col_to_check = orig_col if orig_col in df.columns else std_col
                    if df[col_to_check].isna().all() or (df[col_to_check].astype(str).str.strip() == '').all():
                        empty_pairs.append(f"'{orig_col}' or '{std_col}'")
                
                if empty_pairs:
                    display(HTML(f'<b style="color:red;">{file_type} File Error: Required columns are empty: {", ".join(empty_pairs)}</b>'))
                    return None, 'no'
                    
            else:
                self.pd_filename = filename

                # Additional column mapping specifically for peptidomic data
                if file_type == 'Peptidomic':
                    peptidomic_column_mapping = {
                        'Position.in.Proteins': 'Positions in Proteins',
                        'Positions.in.Proteins': 'Positions in Proteins',
                        'Master.Protein.Accessions': 'Master Protein Accessions',
                        'Master.Protein.Accession': 'Master Protein Accessions',
                        'Protein.Accessions': 'Protein Accessions',
                        'Protein.Accession': 'Protein Accessions',
                    }
                    # Apply peptidomic-specific column mapping
                    df.columns = [peptidomic_column_mapping.get(col, col) for col in df.columns]

                # Standard validation for other file types
                if not set(required_columns).issubset(df.columns):
                    missing = set(required_columns) - set(df.columns)
                    display(HTML(f'<b style="color:red;">{file_type} File Error: Missing required columns: {", ".join(missing)}</b>'))
                    return None, 'no'
                
                # Validate non-empty required columns
                empty_required = []
                for col in required_columns:
                    if df[col].isna().all() or (df[col].astype(str).str.strip() == '').all():
                        empty_required.append(col)
                
                if empty_required:
                    display(HTML(f'<b style="color:red;">{file_type} File Error: Required columns are empty: {", ".join(empty_required)}</b>'))
                    return None, 'no'
            
            # Show success message
            #display(HTML(f'<b style="color:green;">{file_type} file loaded successfully with {len(df)} rows after cleaning.</b>'))
            
            return df, 'yes'
            
        except Exception as e:
            display(HTML(f'<b style="color:red;">{file_type} File Error: {str(e)}</b>'))
            return None, 'no'
    
    def _parse_uploaded_fasta(self, file_data):
        """Parse uploaded FASTA file content"""
        fasta_dict = {}
        fasta_text = bytes(file_data.content).decode('utf-8')
        lines = fasta_text.split('\n')

        protein_id = ""
        protein_name = ""
        sequence = ""
        species = ""

        for line in lines:
            line = line.strip()
            if line.startswith('>'):
                if protein_id:
                    fasta_dict[protein_id] = {
                        "name": protein_name,
                        "sequence": sequence,
                        "species": species
                    }
                sequence = ""
                header_parts = line[1:].split('|')
                if len(header_parts) > 2:
                    protein_id = header_parts[1]
                    protein_name_full = re.split(r' OS=', header_parts[2])[0]
                    if ' ' in protein_name_full:
                        protein_name = protein_name_full
                    else:
                        protein_name = protein_name_full
                    species = self._find_species(line)
            else:
                sequence += line

        if protein_id:
            fasta_dict[protein_id] = {
                "name": protein_name,
                "sequence": sequence,
                "species": species
            }

        return fasta_dict

    def _find_species(self, header):
        """
        Map a FASTA header to a species name using the module-level ``spec_translate_list``.

        Each entry of ``spec_translate_list`` is treated as
        ``[species_name, term, term, ...]``; the first entry having any term that
        occurs (case-insensitively) in ``header`` supplies the returned name.
        Returns ``"unknown"`` when no entry matches.
        """
        lowered_header = header.lower()
        return next(
            (
                group[0]
                for group in spec_translate_list
                if any(alias.lower() in lowered_header for alias in group[1:])
            ),
            "unknown",
        )
    
    def _find_missing_proteins(self):
        """
        Find proteins in the data that are not in the protein dictionary.
        
        Returns:
            set: Set of protein accessions not in the dictionary
        """
        # Get the protein dictionary
        #protein_dict = self.protein_dict.copy()
        
        # Get all protein accessions from pd_results
        if self.pd_results is None or self.pd_results.empty:
            return set()
            
        if 'Master Protein Accessions' not in self.pd_results.columns:
            return set()
        self.missing_proteins = set()
        # Extract all protein accessions and split any that contain semicolons
        protein_accessions = self.pd_results['Master Protein Accessions'].dropna().unique()
        unique_protein_list = set()
        
        for protein in protein_accessions:
            if isinstance(protein, str) and ';' in protein:
                # Split the string by semicolon and strip whitespace
                split_proteins = [p.strip() for p in protein.split(';')]
                unique_protein_list.update(split_proteins)
            elif isinstance(protein, str):
                unique_protein_list.add(protein.strip())
            elif protein is not None:
                unique_protein_list.add(str(protein))
        
        # Also check Positions in Proteins column if available
        if 'Positions in Proteins' in self.pd_results.columns:
            pos_proteins = self.pd_results['Positions in Proteins'].dropna().unique()
            
            for position in pos_proteins:
                if isinstance(position, str) and ';' in position:
                    # Split the string by semicolon
                    for pos in position.split(';'):
                        parts = pos.strip().split()
                        if parts and not parts[0].startswith('['):  # Skip bracketed parts
                            unique_protein_list.add(parts[0])
                elif isinstance(position, str):
                    parts = position.strip().split()
                    if parts and not parts[0].startswith('['):
                        unique_protein_list.add(parts[0])
        
        # Remove None, nan, and 'Unknown' values
        clean_protein_list = set()
        for protein in unique_protein_list:
            if protein and protein != 'Unknown' and protein != 'nan':
                clean_protein_list.add(protein)
        
        # Find proteins not in the dictionary
        self.missing_proteins = {p for p in clean_protein_list if p not in self.protein_dict}
        
    def display_widgets(self):
        """Return a VBox that stacks a Font Awesome stylesheet link above ``self.first_main_container``."""
        # The stylesheet <link> travels as an HTML widget in the same VBox as the UI
        font_awesome_link = widgets.HTML("""
            <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.15.4/css/all.min.css">
        """)
        stacked_children = [font_awesome_link, self.first_main_container]
        return widgets.VBox(stacked_children)

In [3]:
class GroupProcessing:
    def __init__(self):
        """
        Create the group-selection state and widgets and wire up their callbacks.

        Every interactive control starts disabled; ``enable_widgets`` switches
        them on. ``notification_widget`` is a hidden IntText that the handlers in
        this class set to 0 (reset), 1 (upload), 2 (add group) or 3 (no groups).
        """
        def make_disabled_button(label, button_style, **layout_options):
            """Build a Button that starts disabled, with the given layout options."""
            return widgets.Button(
                description=label,
                button_style=button_style,
                layout=widgets.Layout(**layout_options),
                disabled=True
            )

        # --- plain state -------------------------------------------------
        self.group_data = {}
        self.jsonfilename = None
        self.group_number = 1
        self.filtered_columns = []

        # --- hidden notification channel ---------------------------------
        self.notification_widget = widgets.IntText(
            value=0,
            description='',
            layout=widgets.Layout(display='none')
        )

        # --- option 1: upload a JSON group definition --------------------
        self.group_uploader = widgets.FileUpload(
            accept='.json',
            multiple=False,
            description='Upload Groups File',
            layout=widgets.Layout(width='200px'),
            style={'description_width': 'initial'},
            disabled=True
        )

        # --- results area ------------------------------------------------
        # NOTE(review): `overflow` is passed straight to Output rather than through
        # a Layout - confirm it has the intended effect.
        self.output = widgets.Output(overflow='hidden')

        # --- column picker -----------------------------------------------
        self.column_dropdown = widgets.SelectMultiple(
            style={'description_width': 'initial'},
            disabled=True,
            layout=widgets.Layout(width='225px', height='300px')
        )
        dropdown_label = widgets.HTML("Absorbance Columns:&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;")
        self.column_dropdown_box = widgets.HBox(
            [dropdown_label, self.column_dropdown],
            layout=widgets.Layout(width='400px', height='310px')
        )

        # --- group name / search text ------------------------------------
        self.grouping_variable_text = widgets.Text(
            layout=widgets.Layout(width='230px'),
            style={'description_width': 'initial'},
            disabled=True,
            placeholder='Search columns or enter new group name'
        )
        self.no_group_checkbox = widgets.Checkbox(value=False)
        text_label = widgets.HTML("Search or Assigned Name:")
        self.text_box = widgets.HBox(
            [text_label, self.grouping_variable_text],
            layout=widgets.Layout(margin='5px', height='auto')
        )

        # --- buttons (all start disabled) --------------------------------
        self.no_group_button = make_disabled_button('No Groups', 'success', margin='5px', height='auto')
        self.search_button = make_disabled_button('Search', 'info', margin='5px', height='auto')
        self.add_group_button = make_disabled_button('Add Group', 'success', margin='5px', height='auto')
        self.reset_file_button = make_disabled_button('Reset Selection', 'warning', margin='10px 0', width='90%')

        # --- callbacks ---------------------------------------------------
        click_handlers = (
            (self.search_button, self._search_columns),
            (self.add_group_button, self._add_group),
            (self.no_group_button, self._no_group),
            (self.reset_file_button, self._reset_selection),
        )
        for button, handler in click_handlers:
            button.on_click(handler)

        # React whenever the uploader's value changes
        self.group_uploader.observe(self._on_group_upload_change, names='value')
    
    def enable_widgets(self, enable=True):
        """Enable or disable group processing widgets based on protein mapping completion"""
        # Enable/disable the main input widgets
        self.column_dropdown.disabled = not enable
        self.grouping_variable_text.disabled = not enable
        self.no_group_button.disabled = not enable
        self.search_button.disabled = not enable
        self.add_group_button.disabled = not enable
        self.reset_file_button.disabled = not enable
        self.group_uploader.disabled = not enable
        
        # If enabling, also update the column options from the current data
        if enable and hasattr(self, 'pd_results_cleaned') and self.pd_results_cleaned is not None:
            self.update_data(self.pd_results_cleaned)    

    def setup_data(self):
        """Initialize data and filters for the analysis"""
        # Define columns to exclude with more flexible matching
        columns_to_exclude = [
            # Space-separated and dot-separated variants included
            'Marked as', 'Marked.as',
            'Number of Missed Cleavages', 'Number.of.Missed.Cleavages',
            'Missed Cleavages', 'Missed.Cleavages',
            'Checked', 'Confidence', 'Annotated Sequence', 'Annotated.Sequence',
            'Unnamed: 3', 'Unnamed:.3',
            'Modifications', 'Modifications.in.Proteins', 'Modifications.in.Master.Proteins',
            'Protein Groups', 'Protein.Groups',
            'Proteins', 'PSMs',
            'Master Protein Accessions', 'Master.Protein.Accessions',
            'Master Protein Descriptions', 'Master.Protein.Descriptions',
            'Description',
            'Positions in Master Proteins', 'Positions.in.Master.Proteins',
            'Positions in Proteins', 'Positions.in.Proteins',
            'Modifications in Master Proteins', 'Modifications.in.Master.Proteins',
            'Modifications in Master Proteins all Sites', 'Modifications.in.Master.Proteins.all.Sites',
            'Theo MHplus in Da', 'Theo.MHplus.in.Da',
            'Quan Info', 'Quan.Info',
            "Theo. MH+ [Da]", "Theo.MH+.[Da]",
            'Confidence by Search Engine', 'Confidence.by.Search.Engine',
            'q-Value by Search Engine', 'q-Value.by.Search.Engine',
            'XCorr by Search Engine', 'XCorr.by.Search.Engine',
            'Percolator PEP by Search Engine', 'Percolator.PEP.by.Search.Engine',
            'Percolator q-Value by Search Engine', 'Percolator.q-Value.by.Search.Engine',
            'Percolator SVMScore by Search Engine', 'Percolator.SVMScore.by.Search.Engine',
            'PEP', 'q-Value', 'RT in min', 'RT.in.min',
            'RT in min by Search Engine', 'RT.in.min.by.Search.Engine',
            'Sequence', 'Sequence Length', 'Sequence.Length',
            'search_peptide', 'Peptide', 'protein_id', 'protein_description',
            'Alignment', 'Species',
            'Intervals', 'function', 'unique ID', 'unique.ID',
            'PEP (by Search Engine): Sequest HT', 'PEP.(by.Search.Engine):.Sequest.HT',
            'SVM Score (by Search Engine): Sequest HT', 'SVM.Score.(by.Search.Engine):.Sequest.HT',
            'SVM_Score',
            'XCorr (by Search Engine): Sequest HT', 'XCorr.(by.Search.Engine):.Sequest.HT',
            'Qvality PEP', 'Qvality.PEP',
            'Qvality q-value', 'Qvality.q-value',
            'Top Apex RT [min]', 'Top.Apex.RT.[min]',
            'Top Apex RT in min', 'Top.Apex.RT.in.min',
            'start', 'stop',
            'Abundance Ratio', 'Abundance.Ratio',
            'Abundance Ratio Adj P-Value', 'Abundance.Ratio.Adj.P-Value',
            'Abundance Ratio log2', 'Abundance.Ratio.log2',
            'Abundance Ratio P-Value', 'Abundance.Ratio.P-Value',
            'Abundances', 'Abundances.Counts', 'Abundances.Grouped', 'Abundances.Grouped.Count',
            'Abundances.Grouped.CV', 'Abundances.Normalized', 'Abundances.Scaled',
            'Charge by Search Engine', 'Charge.by.Search.Engine',
            'Concatenated Rank by Search Engine', 'Concatenated.Rank.by.Search.Engine',
            'Delta Cn by Search Engine', 'Delta.Cn.by.Search.Engine',
            'Delta M in ppm by Search Engine', 'Delta.M.in.ppm.by.Search.Engine',
            'Delta mz in Da by Search Engine', 'Delta.mz.in.Da.by.Search.Engine',
            'Delta Score by Search Engine', 'Delta.Score.by.Search.Engine',
            'Found in Sample Groups', 'Found.in.Sample.Groups',
            'Found in Samples', 'Found.in.Samples',
            'Modifications all possible sites', 'Modifications.all.possible.sites',
            'mz in Da by Search Engine', 'mz.in.Da.by.Search.Engine',
            'Number of Isoforms', 'Number.of.Isoforms',
            'Number of Protein Groups', 'Number.of.Protein.Groups',
            'Number of Proteins', 'Number.of.Proteins',
            'Protein Accessions', 'Protein.Accessions',
            'PSM Ambiguity', 'PSM.Ambiguity',
            'Rank by Search Engine', 'Rank.by.Search.Engine',
            'Search Engine Rank by Search Engine', 'Search.Engine.Rank.by.Search.Engine',
            'Score CHIMERYS Identification (by search engine)', 'Score.CHIMERYS.Identification.(by.search.engine)'
        ]

        exclude_substrings = [
            'Abundances by Bio Rep', 'Abundances.by.Bio.Rep',
            'Count', 'count',
            'Origin', 'origin',
            'Average_Abundance', 'Average.Abundance',
            'Avg_', 'Avg.',
            'PEP by Search Engine', 'PEP.by.Search.Engine',
            'SVM Score by Search Engine', 'SVM.Score.by.Search.Engine',
            'XCorr by Search Engine', 'XCorr.by.Search.Engine',
            'Top Apex RT', 'Top.Apex.RT'
        ]
    
        # Use cleaned data if available, otherwise use original
        df = self.pd_results_cleaned if (hasattr(self, 'pd_results_cleaned') and 
                                       not self.pd_results_cleaned.empty) else self.pd_results
        
        if df is not None and not df.empty:
            # More flexible column filtering
            self.filtered_columns = []
            for col in df.columns:
                # Check if any exclusion pattern matches the column name
                should_exclude = any(excl.lower() in col.lower() for excl in columns_to_exclude)
                # Check if any substring pattern matches
                has_excluded_substring = any(sub.lower() in col.lower() for sub in exclude_substrings)
                
                if not should_exclude and not has_excluded_substring:
                    self.filtered_columns.append(col)
              
            # Update dropdown options
            self.column_dropdown.options = self.filtered_columns
            self._reset_inputs()
        else:
            self.filtered_columns = []
            self.column_dropdown.options = []
            with self.output:
                self.output.clear_output()
                display(widgets.HTML('<b style="color:red;">No valid data available for processing.</b>'))
   
    def update_data(self, pd_results):
        """Update data and refresh filtered columns"""
        self.pd_results = pd_results
        
        # Only update if we have valid data
        if pd_results is not None:
            self.setup_data()
            
            # Update the dropdown with new filtered columns
            with self.output:
                self.output.clear_output()
                #display(widgets.HTML('<b style="color:green;">Data updated successfully. Column selection refreshed.</b>'))
    
    def create_download_link(self, file_path, label):
        """
        Build an HTML widget that offers ``file_path`` as a download.

        If the file exists, its bytes are base64-encoded into a ``data:`` URI on
        an anchor whose text is ``label`` and whose download name is the file's
        basename. Otherwise a red "not found" notice naming the path is returned.
        """
        if not os.path.exists(file_path):
            # Missing file: return an inline error instead of a link
            return widgets.HTML(f"""
                <span style="color: red; margin-left: 20px; font-size: 14px;">
                    File "{file_path}" not found!
                </span>
            """)

        # Read the whole file and embed it in the href as base64 text
        with open(file_path, 'rb') as handle:
            raw_bytes = handle.read()
        encoded = base64.b64encode(raw_bytes).decode('utf-8')

        return widgets.HTML(f"""
                <a download="{os.path.basename(file_path)}" 
                   href="data:application/octet-stream;base64,{encoded}" 
                   style="color: #0366d6; text-decoration: none; margin-left: 20px; font-size: 14px;">
                    {label}
                </a>
            """)

    def display_widgets(self):
        """
        Assemble and return the two-column group-selection UI.

        The left column holds the three input options (upload a group file,
        assign selected columns to a named group, or use the selected columns
        without grouping) plus the reset button; the right column holds the
        results output area. Also stores the intermediate containers on
        ``self.group_box_uploader`` and ``self.group_box``.

        Returns:
            widgets.GridspecLayout: a 1x2 grid containing both columns.
        """
        def centered_row(children, height='auto'):
            """Full-width HBox with its children centred horizontally."""
            return widgets.HBox(
                children,
                layout=widgets.Layout(width='100%', height=height, overflow='hidden', justify_content='center')
            )

        # Option 1: uploader with a link to an example definition file
        uploader_row = widgets.HBox([
            self.group_uploader,
            self.create_download_link("examples/example_group_definition.json", "Example")
        ], layout=widgets.Layout(align_items='center', overflow='hidden'))

        self.group_box_uploader = widgets.VBox([
            widgets.HTML("<u><b>Option 1:</b> Upload Existing Group Dictionary:</u>"),
            uploader_row,
        ])

        # Options 2 and 3 share the column picker.
        # `max_height` was previously misspelled `maxheigh`, which Layout does not
        # recognise, so the cap was never applied.
        self.group_box = widgets.VBox([
            self.column_dropdown_box,
            widgets.HTML("<u><b>Option 2:</b> Assigning Selected Columns to a Group:</u>"),
            self.text_box,
            centered_row([self.search_button, self.add_group_button]),
            centered_row([widgets.HTML("<h4><b>-OR-</b></h4>")], height='50px'),
            widgets.HTML("<u><b>Option 3:</b> Use Selected Columns without Assigning to a Group:</u>"),
            centered_row([self.no_group_button]),
        ], layout=widgets.Layout(width='100%', overflow='hidden', height='auto', max_height='550px'))

        # Main 1x2 grid. `height` was previously misspelled `heigth` and ignored.
        grid = widgets.GridspecLayout(1, 2,  # Number of rows and columns
                                      width='850px',
                                      height='700px',
                                      grid_gap='5px',  # Spacing between grid elements
                                      )

        # Left column: all inputs (overflow is hidden, i.e. no scrollbar)
        input_container = widgets.VBox([
            self.group_box_uploader,
            centered_row([widgets.HTML("<h4><b>-OR-</b></h4>")]),
            widgets.HTML('Select the <u>absorbance columns</u>'),
            self.group_box,
            centered_row([self.reset_file_button])
        ], layout=widgets.Layout(
            width='400px',
            height='100%',
            overflow='hidden'
        ))

        # Right column: results, scrollable once taller than 680px
        output_container = widgets.VBox([
            widgets.HTML("<h3><u>Group Selection Results:</u></h3>"),
            self.output
        ], layout=widgets.Layout(
            width='400px',
            height='680px',
            max_height='680px',
            overflow='auto',
            padding='10px'
        ))

        grid[0, 0] = input_container   # Left column
        grid[0, 1] = output_container  # Right column

        return grid
    
    def _on_gd_submit(self, b, dropdown):
        """
        Load a grouping dictionary from the JSON file path chosen in ``dropdown``.

        The file is expected to map a group key to an object with
        'grouping_variable' and 'abundance_columns'. ``self.group_data`` is
        replaced with those entries and a summary of each group is written to
        the output area. Errors are shown in red instead of being raised.

        Args:
            b: Click event argument (unused).
            dropdown: Widget whose ``value`` is the selected file path.
        """
        chosen_path = dropdown.value

        def show(markup):
            """Render one HTML fragment in the current output context."""
            display(widgets.HTML(markup))

        with self.output:
            clear_output()

            # The placeholder entry is not a real file
            if chosen_path == 'Select an existing grouping dictionary file':
                print("Please select a valid file.")
                return

            try:
                with open(chosen_path, 'r') as handle:
                    loaded = json.load(handle)

                self.group_data = {}
                for group_key, entry in loaded.items():
                    variable = entry.get('grouping_variable')
                    columns = entry.get('abundance_columns')

                    self.group_data[group_key] = {
                        'grouping_variable': variable,
                        'abundance_columns': columns
                    }

                    show(f"<b>Group {group_key}</b> created with <b>{len(columns)} columns assigned</b>.")
                    show(f"<b>Grouping Variable:</b> {variable}")
                    show(f"<b>Selected Columns:</b> {', '.join(columns)}")
                    show("<hr style='border: 1px solid black;'>")

            except Exception as e:
                show(f"<b style='color:red;'>An error occurred while processing the file: {str(e)}</b>")
    
    def _search_columns(self, b):
        """Search for columns based on group name"""
        group_name = self.grouping_variable_text.value
        if group_name:
            matching_columns = [col for col in self.filtered_columns if group_name in col]
            self.column_dropdown.value = matching_columns
        else:
            with self.output:
                clear_output()
                display(widgets.HTML('<b style="color:red;">Please enter a group name to search.</b>'))
    
    def _reset_selection(self, b):
        """Discard all defined groups, clear the output area, and blank the inputs."""
        # Back to an empty dictionary with numbering restarted
        self.group_data, self.group_number = {}, 1

        # 0 is the notification widget's initial value
        self.notification_widget.value = 0

        with self.output:
            clear_output()

        self._reset_inputs()
        self.no_group_checkbox.value = False
    
    def _reset_inputs(self):
        """Reset input fields"""
        self.grouping_variable_text.value = ''
        self.column_dropdown.value = ()
    
    def _add_group(self, b):
        """
        Register the selected columns as a new group named by the text box.

        The group is stored under the next integer key (as a string): one more
        than the largest existing key, or "1" when no groups exist yet. A summary
        is appended to the output area, ``notification_widget`` is set to 2, and
        the inputs are cleared. If the name or the selection is empty, only a red
        prompt is shown.
        """
        name = self.grouping_variable_text.value
        columns = list(self.column_dropdown.value)

        if not name or not columns:
            with self.output:
                display(widgets.HTML('<b style="color:red;">Please enter a group name and select at least one column.</b>'))
            return

        if self.group_data:
            # Continue numbering after the highest existing key
            highest = max(int(key) for key in self.group_data.keys())
            self.group_number = str(highest + 1)
        else:
            self.group_data = {}
            self.group_number = "1"

        self.group_data[self.group_number] = {
            'grouping_variable': name,
            'abundance_columns': columns
        }

        with self.output:
            display(widgets.HTML(f"<b>Group {self.group_number}</b> created with <b>{len(columns)} columns assigned</b>."))
            display(widgets.HTML(f"<b>Grouping Variable:</b> {name}"))
            display(widgets.HTML(f"<b>Selected Columns:</b> {', '.join(columns)}"))
            display(widgets.HTML("<hr style='border: 1px solid black;'>"))

        # 2 marks option 2 (named groups) as completed
        self.notification_widget.value = 2

        self._reset_inputs()
            
    def _on_group_upload_change(self, change):
        """
        Handle a change of the group uploader's value (a JSON group definition).

        The uploaded JSON is read as ``{group_name: [column, ...]}``. If any
        listed column is not in ``self.filtered_columns``, an error listing the
        missing columns is shown and ``self.group_data`` is left untouched.
        Otherwise ``self.group_data`` is rebuilt with keys "1", "2", ... in file
        order, each group is summarised in the output area, and
        ``notification_widget`` is set to 1. Any exception is shown in red.
        """
        if not (change['type'] == 'change' and change['name'] == 'value'):
            return

        with self.output:
            clear_output(wait=True)

            uploads = change['new']
            if not (uploads and len(uploads) > 0):
                return

            upload = uploads[0]
            try:
                group_map = json.loads(bytes(upload.content).decode('utf-8'))
                self.jsonfilename = upload.name

                # Every referenced column must be one of the selectable columns
                known_columns = set(self.filtered_columns)
                absent = [
                    column
                    for _, columns in group_map.items()
                    for column in columns
                    if column not in known_columns
                ]

                if absent:
                    # De-duplicated and sorted for a stable listing
                    bullet_items = "".join(f"<li>{col}</li>" for col in sorted(set(absent)))
                    missing_list_html = (
                        "<ul style='color:red; margin-top: 5px; margin-bottom: 5px;'>"
                        + bullet_items
                        + "</ul>"
                    )

                    display(HTML(f"""
                                <div style='color:red; background-color: #fff3e0; padding: 10px; border-left: 5px solid #ff9800; margin: 10px 0;'>
                                    <p><b>Error:</b> The following columns from your <b>{upload.name}</b> are not present in the current dataset:</p>
                                    {missing_list_html}
                                    <p>Please check your file or adjust your column selection or uploaded peptidomic data.</p>
                                </div>
                            """))
                    return

                # All columns available: convert to the enumerated format
                self.group_data = {}
                for position, (group_name, abundance_cols) in enumerate(group_map.items(), 1):
                    key = str(position)
                    self.group_data[key] = {
                        'grouping_variable': group_name,
                        'abundance_columns': abundance_cols
                    }

                    display(HTML(
                        f"<b>Group {key}</b> created with <b>{len(abundance_cols)} columns assigned</b>."
                    ))
                    display(HTML(f"<b>Grouping Variable:</b> {group_name}"))
                    display(HTML(f"<b>Selected Columns:</b> {', '.join(abundance_cols)}"))
                    display(HTML("<hr style='border: 1px solid black;'>"))

                # 1 marks option 1 (uploaded definition) as completed
                self.notification_widget.value = 1

            except Exception as e:
                display(HTML(f"<b style='color:red;'>An error occurred while processing the file: {str(e)}</b>"))

    def _no_group(self, b):
        """
        Handle the 'No Groups' button: make one single-column group per selected column.

        Existing groups are discarded. Each selected column becomes its own group,
        keyed "1", "2", ... in selection order, with the column name used as the
        grouping variable. ``no_group_checkbox`` is set to True and
        ``notification_widget`` to 3. With nothing selected, a red prompt is shown,
        the checkbox is reset to False, and nothing else changes.
        """
        selected_columns = list(self.column_dropdown.value)

        if not selected_columns:
            with self.output:
                display(HTML('<b style="color:red;">Please select at least one absorbance column before clicking the button.</b>'))
                self.no_group_checkbox.value = False  # Reset checkbox
            return

        # Record that the no-group option is active
        self.no_group_checkbox.value = True

        # Replace any existing groups with one group per selected column
        self.group_data = {
            str(position): {
                'grouping_variable': column,   # Column name doubles as the group name
                'abundance_columns': [column]  # Single column per group
            }
            for position, column in enumerate(selected_columns, 1)
        }

        # The list is emitted as ONE HTML fragment: separate display() calls for
        # <ul>, each <li> and </ul> produce unbalanced markup in separate outputs.
        list_items = "".join(f"<li>{column}</li>" for column in selected_columns)
        with self.output:
            clear_output()
            display(HTML(f"<b>Individual groups created for {len(selected_columns)} columns:</b>"))
            display(HTML(f"<ul>{list_items}</ul>"))
            display(HTML("<hr style='border: 1px solid black;'>"))

        # 3 marks option 3 (no grouping) as completed
        self.notification_widget.value = 3

In [4]:
class ProteinCombinationHandler(HasTraits):
    def __init__(self, data_transformer, workflow=None):
        """Set up the state and widgets used to resolve multi-protein peptides.

        Args:
            data_transformer: Object whose ``pd_results`` attribute is read here and
                which is kept as ``self.data_transformer`` for later lookups.
            workflow: Optional workflow object, stored as ``self.workflow``.
        """
        super().__init__()

        # --- collaborators and data handed in by the caller ---
        self.data_transformer = data_transformer
        self.workflow = workflow
        self.pd_results = data_transformer.pd_results
        self.pd_results_cleaned = None

        # --- bookkeeping for the decision table (populated later) ---
        self.user_decisions = dict()
        self.decision_inputs = list()
        self.multi_position_combinations = list()
        self.submit_button = self.reset_button = self.progress = None

        # --- UniProt lookup state ---
        self.uniprot_client = UniProtClient()
        self.bad_protein = set()
        self.dead_proteins = set()

        # --- output areas ---
        self.protein_output_area = widgets.Output(
            layout=widgets.Layout(width='100%', margin='5px 0')
        )
        self.protein_mapping_output_area = widgets.Output()
        self.uniprot_output_area = widgets.Output()
        self.buttonuniprot_output_area_output = widgets.Output()

        # --- Yes/No selector; starts disabled with nothing selected ---
        mapping_choices = [('Yes', True), ('No (skip)', False)]
        self.protein_mapping_widget = widgets.RadioButtons(
            options=mapping_choices,
            description='Process peptides mapped to multiple proteins?',
            disabled=True,
            style={'description_width': 'initial'},
            value=None,
        )
        # Every change of the selection is routed to process_protein_mapping
        self.protein_mapping_widget.observe(self.process_protein_mapping, names='value')

        # --- UniProt progress bar and processed-proteins counter ---
        progress_bar_style = {'description_width': 'initial', 'bar_color': '#0080ff'}
        self.progress_uniprot = widgets.FloatProgress(
            value=0,
            min=0,
            max=100,
            description='Progress:',
            bar_style='info',
            style=progress_bar_style,
            orientation='horizontal',
            layout=widgets.Layout(width='300px'),
        )
        self.counter_text = widgets.HTML(
            value="0 / 0 proteins processed",
            layout=widgets.Layout(width='200px'),
        )

    @property
    def protein_dict(self):
        """Protein dictionary, always read live from ``self.data_transformer``.

        No copy is made here: the object returned is whatever
        ``data_transformer.protein_dict`` currently refers to.
        """
        transformer = self.data_transformer
        return transformer.protein_dict
            
    def _fetch_missing_proteins(self, callback=None):
        """
        Fetch missing proteins from UniProt and add them to the protein dictionary.
        Only fetch protein name and species (no sequence).
        """
        if not self.data_transformer.missing_proteins:
            if callback:
                callback()
            return
            
        with self.buttonuniprot_output_area_output:
            clear_output(wait=True)
            display(HTML("<u>Protein Information Retrieval from UniProt</u>"))
            display(widgets.VBox([self.progress_uniprot, self.counter_text]))

        # Initialize tracking variables
        protein_ids = [pid for pid in list(self.data_transformer.missing_proteins) 
                      if pid not in self.protein_dict and pid not in self.dead_proteins]
        
        self.progress_uniprot.value = 0
        self.counter_text.value = f"0 / {len(protein_ids)} proteins processed"
        
        batch_size = 10
        success_count = 0
        processed_count = 0
        recent_messages = []
        max_recent_messages = 3
        all_messages = []
        try:
            # Process in batches
            batches = [protein_ids[i:i+batch_size] for i in range(0, len(protein_ids), batch_size)]
            
            for batch_idx, batch in enumerate(batches):
                self.progress_uniprot.value = (batch_idx / len(batches)) * 100
                self.counter_text.value = f"{processed_count} / {len(protein_ids)} proteins processed"
                
                try:
                    # Batch fetch
                    results = self.uniprot_client.fetch_proteins_batch(batch)
                    
                    # Process batch results
                    for protein_id in batch:
                        if protein_id in results:
                            name, uniprot_species = results[protein_id]
                            self.protein_dict[protein_id] = {
                                "name": name if name else protein_id,
                                "species": uniprot_species
                            }
                            success_count += 1
                            message = f'<span style="color:green;">✓ Added {protein_id}: {name} ({uniprot_species})</span>'
                        else:
                            self.bad_protein.add(protein_id)
                            message = f'<span style="color:orange;">× No data found for {protein_id} in batch fetch</span>'
                        
                        recent_messages.append(message)
                        if len(recent_messages) > max_recent_messages:
                            recent_messages.pop(0)
                        
                        with self.uniprot_output_area:
                            clear_output(wait=True)
                            for msg in recent_messages:
                                display(HTML(msg))
                        
                        processed_count += 1
                        self.counter_text.value = f"{processed_count} / {len(protein_ids)} proteins processed"
                
                except Exception as batch_error:
                    with self.uniprot_output_area:
                        display(HTML(f'<span style="color:red;">Batch processing failed: {str(batch_error)}</span>'))
                    # Add failed batch proteins to bad_protein set
                    self.bad_protein.update(batch)
                
                time.sleep(0.5)  # Rate limiting
            
            # After all batches, try individual fetches for bad proteins once
            bad_proteins_to_process = self.bad_protein.copy()  # Create a copy to iterate over
            for protein_id in bad_proteins_to_process:
                try:
                    name, species, _ = fetch_uniprot_info(protein_id)
                    if name is not None and species is not None:
                        self.protein_dict[protein_id] = {
                            "name": name,
                            "species": species
                        }
                        self.bad_protein.remove(protein_id)
                        success_count += 1
                        message = f'<span style="color:green;">✓ Added {protein_id}: {name} ({species}) [Individual fetch]</span>'
                    else:
                        self.dead_proteins.add(protein_id)
                        self.bad_protein.remove(protein_id)
                        self.protein_dict[protein_id] = {
                            "name": protein_id,
                            "species": "Unknown"
                        }
                        message = f'<span style="color:red;">× No data found for {protein_id} [Individual fetch]</span>'
                
                except Exception as individual_error:
                    self.dead_proteins.add(protein_id)
                    self.bad_protein.remove(protein_id)
                    self.protein_dict[protein_id] = {
                        "name": protein_id,
                        "species": "Unknown"
                    }
                    message = f'<span style="color:red;">× Failed individual fetch for {protein_id}: {str(individual_error)}</span>'
                
                recent_messages.append(message)
                all_messages.append(message)
                if len(recent_messages) > max_recent_messages:
                    recent_messages.pop(0)
                
                with self.uniprot_output_area:
                    clear_output(wait=True)
                    for msg in recent_messages:

                        display(HTML(msg))

                
                time.sleep(0.5)  # Rate limiting
            
            # Final status display
            self.progress_uniprot.value = 100
            with self.uniprot_output_area:
                clear_output(wait=True)
                display(HTML(f'<b style="color:green;">Successfully processed {success_count} out of {len(protein_ids)} proteins.</b>'))
                if self.dead_proteins:
                    display(HTML(f'<b style="color:orange;">Unable to fetch data for {len(self.dead_proteins)} proteins: {", ".join(self.dead_proteins)}</b>'))
                display(HTML(f'<b style="color:green;">Protein fetch complete! Moving to protein combination processing...</b>'))
            
        except Exception as e:
            with self.uniprot_output_area:
                display(HTML(f'<b style="color:red;">Error in fetch process: {str(e)}</b>'))
        
        finally:
            with self.protein_mapping_output_area:
                clear_output(wait=True)
                #display(HTML('<b style="color:green; margin-top: 10px;">Proceeding to protein combination processing...</b>'))
                display(HTML("<br><u>Peptide-to-Protein Mapping</u>"))
                self.pd_results_cleaned, main_container = self.process_protein_combinations()
                display(main_container)
                if hasattr(self.data_transformer, 'workflow'):
                    self.workflow = self.data_transformer.workflow
    
    def process_protein_mapping(self, change):
        """Process protein mapping based on user selection.

        Reads the current value of ``self.protein_mapping_widget``:

        * ``True``  - start the UniProt update via ``_on_update_from_uniprot``.
        * ``False`` - skip: ``pd_results_cleaned`` becomes a copy of ``pd_results``
          and, when available, ``data_transformer.workflow._on_submit_complete(True)``
          is called.
        * ``None``  - no selection (initial state or a reset): nothing is done, so
          clearing the widget cannot be mistaken for "No (skip)".

        Args:
            change: Change notification passed by the widget observer; not used,
                the widget value is read directly.

        Returns:
            The current ``self.pd_results_cleaned`` (may be ``None``).
        """
        selection = self.protein_mapping_widget.value

        if selection is None:
            # Nothing chosen: neither fetch nor signal completion
            return self.pd_results_cleaned

        if selection is True:
            # Callback for _on_update_from_uniprot; it only schedules a clear of
            # the mapping output area.
            def on_uniprot_complete():
                with self.protein_mapping_output_area:
                    clear_output(wait=True)

            self._on_update_from_uniprot(callback=on_uniprot_complete)
        else:
            # No (skip) option selected - just process without changes
            self.pd_results_cleaned = self.pd_results.copy()

            # For immediate completion in the "No (skip)" case
            transformer = self.data_transformer
            if hasattr(transformer, 'workflow') and hasattr(transformer.workflow, '_on_submit_complete'):
                transformer.workflow._on_submit_complete(True)

        return self.pd_results_cleaned
    
    def _on_update_from_uniprot(self, callback=None):
        """Handle update button click with improved callback handling"""
    
        with self.uniprot_output_area:
            # Initialize UniProt client if needed
            if self.uniprot_client is None:
                try:
                    self.uniprot_client = UniProtClient()
                    display(HTML('<b style="color:green;">UniProt client initialized successfully.</b>'))
                except Exception as e:
                    display(HTML(f'<b style="color:red;">Error initializing UniProt client: {str(e)}</b>'))
                    if callback:
                        callback()
                    return
            
            # Find missing proteins
            if self.data_transformer.missing_proteins:
                # Show only the count of missing proteins
                display(HTML(
                    f'<b style="color:orange;">Found {len(self.data_transformer.missing_proteins)} proteins missing information.</b><br>' +
                    f'<b>Starting UniProt fetch process...</b>'
                ))            
            else:
                with self.protein_mapping_output_area:
                    clear_output(wait=True)
                    #display(HTML("<b style='color:green;'>All proteins in the data have information in the protein dictionary!</b>"))
                    # Directly call process_protein_combinations to create the UI
                    # Directly call process_protein_combinations to create the U
                    display(HTML("<u>Peptide-to-Protein Mapping</u>"))
                    self.pd_results_cleaned, main_container = self.process_protein_combinations()
                    display(main_container)
                    # Connect the workflow _on_submit_complete method if needed
                    if hasattr(self.data_transformer, 'workflow'): 
                        self.workflow = self.data_transformer.workflow
                    if callback:
                        callback()
                return
            

           
           
            # Start the fetch process
            #def extended_callback():
            #    if callback:
            #        # Add a small delay to ensure UI updates
            #        import time
            #        time.sleep(0.5)
            #        callback()

        # Fetch missing proteins with our extended callback
        self._fetch_missing_proteins()

    def process_protein_combinations(self):
        """Process protein combinations in pd_results with Unknown handling.

        Builds the decision table: one section per combination returned by
        ``get_protein_combinations`` and, inside it, one row per protein with a text
        input for the decision and a status label. The inputs are recorded in
        ``self.decision_inputs`` as ``(combo, protein, widget)`` and the labels in
        ``self.status_displays`` keyed by ``(combo, protein)``.

        Default decisions:
            * ``asis``   - the protein is ``Unknown``, no row matches the
              combination, or the first matching row has no master accession.
            * ``new``    - the protein is listed in the first matching row's
              'Master Protein Accessions'.
            * ``remove`` - otherwise.

        Returns:
            tuple: ``(df, main_container)``. ``df`` is a copy of ``pd_results`` with
            missing 'Positions in Proteins' / 'Master Protein Accessions' filled
            with "Unknown" (also stored as ``self.pd_results_cleaned``);
            ``main_container`` is the widget to display. For missing or empty data a
            pair is returned as well (empty frame plus a notice), because callers
            unpack two values.
        """
        # Local import: the notebook's import cell does not bring in the `html` module.
        from html import escape

        self.decision_inputs = []
        self.status_displays = {}

        if self.pd_results is None or self.pd_results.empty:
            df = pd.DataFrame() if self.pd_results is None else self.pd_results.copy()
            self.pd_results_cleaned = df
            empty_notice = widgets.HTML("<b>No peptide data available for protein combination processing.</b>")
            return df, widgets.VBox([empty_notice], layout=widgets.Layout(width='100%', padding='5px'))

        df = self.pd_results.copy()

        # Fill NaN values with "Unknown"
        df['Positions in Proteins'] = df['Positions in Proteins'].fillna('Unknown')
        df['Master Protein Accessions'] = df['Master Protein Accessions'].fillna('Unknown')

        # Create warning display area
        warning_area = widgets.HTML(
            layout=widgets.Layout(
                margin='10px 0',
                padding='10px',
                border='1px solid #ffeeba',
                background_color='#fff3cd',
                border_radius='4px'
            )
        )

        # Get combinations (named so as not to shadow itertools.combinations)
        combo_keys = self.get_protein_combinations()

        # Update warning area with statistics
        unknown_positions = (df['Positions in Proteins'] == 'Unknown').sum()
        unknown_master_acc = (df['Master Protein Accessions'] == 'Unknown').sum()

        if unknown_positions > 0 or unknown_master_acc > 0:
            warning_html = "<div><b>ℹ️ Notice:</b><ul style='margin: 5px 0'>"
            if unknown_positions > 0:
                warning_html += f"<li>{unknown_positions} rows with missing 'Positions in Proteins' are marked as Unknown</li>"
            if unknown_master_acc > 0:
                warning_html += f"<li>{unknown_master_acc} rows with missing 'Master Protein Accessions' are marked as Unknown</li>"
            warning_html += "</ul>These peptides will be preserved in the output.</div>"
            warning_area.value = warning_html
        else:
            warning_area = widgets.HTML(f"<br>")

        table_header = widgets.HTML("""
                            <div style="display: grid; grid-template-columns: 100px 100px 225px 150px 200px; gap: 2px; margin-bottom: 10px; font-weight: bold; align-items: center;">
                                <div>
                                    Protein ID
                                    <span title="Unique identifier for the protein" style="display: inline-block; margin-left: 4px;">
                                        <i class="fa fa-question-circle" style="color: #007bff; font-size: 14px;"></i>
                                    </span>
                                </div>
                                <div>
                                    Species
                                    <span title="Source organism of the protein" style="display: inline-block; margin-left: 4px;">
                                        <i class="fa fa-question-circle" style="color: #007bff; font-size: 14px;"></i>
                                    </span>
                                </div>
                                <div>
                                    Description
                                    <span title="Full protein name or description" style="display: inline-block; margin-left: 4px;">
                                        <i class="fa fa-question-circle" style="color: #007bff; font-size: 14px;"></i>
                                    </span>
                                </div>
                                <div>
                                    Decision
                                    <span title="Available options:\n
            - 'new' - Create a separate row for this protein\n
            - 'remove' - Remove this protein from combination\n
            - 'asis' - Keep as part of current combination\n
            - 'Custom: (protein ID)': ie. Custom: P02666A1"
            style="display: inline-block; margin-left: 4px;">
                                        <i class="fa fa-question-circle" style="color: #007bff; font-size: 14px;"></i>
                                    </span>
                                </div>
                                <div>
                                    Status
                                    <span title="Color indicators:\n
            - Grey - Default option (not yet submitted)\n
            - Green - Option has been submitted" style="display: inline-block; margin-left: 4px;">
                                        <i class="fa fa-question-circle" style="color: #007bff; font-size: 14px;"></i>
                                    </span>
                                </div>
                            </div>
                            <hr style="margin: 0 0 10px 0;">
                        """)

        # Index the rows ONCE by their set of protein IDs. For each set keep the row
        # count and the first row's master accessions; this replaces a full scan of
        # the frame for every single combination.
        rows_by_protein_set = {}
        for positions, master_acc in zip(df['Positions in Proteins'], df['Master Protein Accessions']):
            try:
                protein_set = frozenset(part.split()[0] for part in positions.split('; '))
            except (AttributeError, IndexError):
                # Non-text or malformed position entry: cannot match any combination
                continue
            entry = rows_by_protein_set.setdefault(protein_set, {'count': 0, 'master': master_acc})
            entry['count'] += 1

        # Initial status label per default decision
        status_text = {
            'new': "Will be created as new row",
            'remove': "Will be removed",
            'asis': "Will keep as is",
            'Custom: (protein ID)': "ie. Custom: P02666A1"
        }

        # Add rows for each combination
        combination_boxes = []
        for combo_idx, combo in enumerate(combo_keys, 1):
            proteins = combo.split('; ')

            matched = rows_by_protein_set.get(frozenset(proteins))
            occurrences = matched['count'] if matched else 0
            first_master = matched['master'] if matched else None
            # After fillna a missing master accession reads 'Unknown'; treat it as
            # "no information" instead of as a real accession list.
            master_known = isinstance(first_master, str) and first_master != 'Unknown'
            master_proteins = [p.strip() for p in first_master.split(';')] if master_known else []

            # Add combination header
            row_header = widgets.HTML(f"""
                    <div style="background-color: #f8f9fa; padding: 2px; margin: 5px 0; border-radius: 5px;">
                        <b>Combination {combo_idx}</b> ({occurrences} occurrences)
                    </div>
                """)

            # Process each protein in the combination
            protein_rows = []
            for protein in proteins:
                if protein == 'Unknown':
                    species = "Unknown"
                    name = "Unknown Protein"
                else:
                    protein_info = self.protein_dict.get(protein, {})
                    species = protein_info.get('species', "Unknown")
                    name = protein_info.get('name', "Unknown")

                # Set default decision based on Master Protein Accessions
                default_decision = 'asis'  # Unknown proteins / unknown master stay as-is
                if protein != 'Unknown' and master_known:
                    default_decision = 'new' if protein in master_proteins else 'remove'

                # Create decision input
                decision_input = widgets.Text(
                    layout=widgets.Layout(width='125px', max_width='125px', overflow='hidden'),
                    value=default_decision
                )
                self.decision_inputs.append((combo, protein, decision_input))

                # Create status display with initial status
                initial_status = status_text.get(default_decision, '')
                status_display = widgets.HTML(f'<span style="color: gray">{initial_status}</span>')
                self.status_displays[(combo, protein)] = status_display

                # Create the row content (values are escaped: names come from external data)
                row_content = widgets.HTML(f"""
                    <div style="display: grid; grid-template-columns: 100px 100px 225px; gap: 2px; align-items: center;">
                            <div style="white-space: nowrap; overflow: hidden; text-overflow: ellipsis;">{escape(str(protein))}</div>
                            <div style="white-space: nowrap; overflow: hidden; text-overflow: ellipsis;">{escape(str(species))}</div>
                            <div style="white-space: nowrap; overflow: hidden; text-overflow: ellipsis;">{escape(str(name))}</div>
                        </div>
                    """)

                # Create container with all elements
                protein_rows.append(widgets.HBox([
                    row_content,
                    widgets.HBox([decision_input], layout=widgets.Layout(width='150px', padding='0')),
                    widgets.HBox([status_display], layout=widgets.Layout(width='200px', padding='0'))
                ], layout=widgets.Layout(
                    margin='2px 0',
                    display='flex',
                    height='30px',
                    min_height='50px',
                    align_items='center',
                    overflow='hidden',
                    width='100%'
                )))

            # Children are passed in one go instead of re-assigning `children` per row
            combo_box = widgets.VBox(
                protein_rows,
                layout=widgets.Layout(
                    width='100%',
                    min_height='50px',
                    margin='10px 0',
                    max_height='500px',  # Set a maximum height
                    overflow='auto'      # Enable scrolling
                ))
            combination_boxes.append(widgets.VBox(
                [row_header, combo_box],
                layout=widgets.Layout(
                    width='100%',
                    padding='0',
                    min_height='250px',
                    max_height='750px',
                    overflow='auto'
                )
            ))

        # Scrollable area holding all combination sections
        input_area = widgets.VBox(
            combination_boxes,
            layout=widgets.Layout(
                width='100%',
                min_height='50px',
                margin='10px 0',
                max_height='500px',  # Set a maximum height
                overflow='auto'      # Enable scrolling
            ))

        # Create buttons
        button_box = self._create_buttons()

        # Add output area
        self.protein_output_area = widgets.Output(
            layout=widgets.Layout(width='100%', margin='0', max_height='50px', max_width='auto', overflow='hidden')
        )

        # Assemble all components
        main_container = widgets.VBox(
            [warning_area, table_header, input_area, button_box, self.protein_output_area],
            layout=widgets.Layout(width='825px', padding='5px'))

        self.pd_results_cleaned = df
        return df, main_container

    def get_protein_combinations(self):
        """Extract unique protein combinations from the dataset with NaN handling.

        A row of ``self.pd_results`` contributes its combination when at least one
        of these holds:

        * 'Master Protein Accessions' contains ';'
        * 'Positions in Proteins' contains ';'
        * its proteins belong to more than one species according to ``protein_dict``
        * one of the collected species is 'Unknown' (this includes rows whose
          'Positions in Proteins' is missing)

        Missing values in either column are read as 'Unknown'; ``self.pd_results``
        itself is not modified. A combination is the '; '-joined, sorted list of the
        first whitespace-separated token of every 'Positions in Proteins' entry.
        Rows that cannot be parsed are reported with ``print`` and skipped.

        Returns:
            list[str]: The unique combinations in sorted order, so their numbering
            in the UI is the same on every run. The list is also stored in
            ``self.multi_position_combinations``. An empty list is returned when
            there is no data.
        """
        if self.pd_results is None or self.pd_results.empty:
            return []

        # fillna returns new Series, so the stored frame stays untouched
        positions_col = self.pd_results['Positions in Proteins'].fillna('Unknown')
        master_col = self.pd_results['Master Protein Accessions'].fillna('Unknown')
        protein_dict = self.protein_dict

        protein_combinations = set()
        for row_label, positions, master_acc in zip(self.pd_results.index, positions_col, master_col):
            try:
                # Handle "Unknown" case specially
                if positions == 'Unknown':
                    position_proteins = ['Unknown']
                else:
                    position_proteins = [p.split()[0] for p in positions.split('; ')]

                # Species of the proteins named in 'Positions in Proteins'
                species_set = set()
                for protein in position_proteins:
                    if protein in protein_dict:
                        species_set.add(protein_dict[protein]['species'])
                    elif protein == 'Unknown':
                        species_set.add('Unknown')

                needs_decision = (
                    ';' in master_acc
                    or ';' in positions
                    or len(species_set) > 1
                    or 'Unknown' in species_set  # Include Unknown combinations
                )
                if needs_decision:
                    protein_combinations.add('; '.join(sorted(position_proteins)))

            except Exception as e:
                print(f"Warning: Error processing row {row_label}: {str(e)}")
                continue

        # Sorted instead of raw set order: set iteration order of strings varies
        # between interpreter runs.
        self.multi_position_combinations = sorted(protein_combinations)
        return self.multi_position_combinations
    
    def _on_submit(self, button, df):
        """Handle submit button click with enhanced position handling"""
        self.submit_button.disabled = True
        self.reset_button.disabled = True
        self.progress.value = 0
        
        with self.protein_output_area:
            try:
                self.protein_output_area.clear_output()
                decisions_by_combo = {}
                rows_to_remove = set()
                new_rows = []
                total_inputs = len(self.decision_inputs)
                
                # First pass: collect all decisions
                for i, (combo, protein, input_widget) in enumerate(self.decision_inputs):
                    try:
                        decision = input_widget.value.strip()
                        if decision:
                            status_display = self.status_displays[(combo, protein)]
                            status_display.value = f'<span style="color: green">Decision: {decision}</span>'
                            
                            if combo not in decisions_by_combo:
                                decisions_by_combo[combo] = {}
                            decisions_by_combo[combo][protein] = decision
                    except Exception as e:
                        display(HTML(f"<b style='color:red;'>Error processing decision for {protein}: {str(e)}</b>"))
                        continue

                    self.progress.value = ((i + 1) / total_inputs * 25)
                
                # Second pass: validate decisions
                validation_errors = []
                
                for combo, protein_decisions in decisions_by_combo.items():
                    # Check if decisions are valid for this combination
                    has_asis = any(decision.upper() == 'ASIS' for decision in protein_decisions.values())
                    has_custom = any(decision.upper().startswith('CUSTOM:') or decision.startswith('Custom:') 
                                    for decision in protein_decisions.values())
                    has_new = any(decision.upper() == 'NEW' for decision in protein_decisions.values())
                    has_remove = any(decision.upper() == 'REMOVE' for decision in protein_decisions.values())
                    
                    # Validation rules
                    if has_asis and (has_custom or has_new or has_remove):
                        validation_errors.append(f"Combination '{combo}': ASIS cannot be used with other decision types")
                    
                    #if has_new and (has_custom or has_asis):
                    #    validation_errors.append(f"Combination '{combo}': NEW cannot be used with CUSTOM or ASIS")
                    
                    # Validate individual decision formats
                    for protein, decision in protein_decisions.items():
                        decision_upper = decision.upper()
                        if (decision_upper not in ['NEW', 'REMOVE', 'ASIS'] and 
                            not decision_upper.startswith('CUSTOM:') and 
                            not decision.startswith('Custom:')):
                            validation_errors.append(f"Protein '{protein}': Invalid decision format '{decision}'")
                        
                        if (decision_upper.startswith('CUSTOM:') or decision.startswith('Custom:')) and len(decision.split(':', 1)[1].strip()) == 0:
                            validation_errors.append(f"Protein '{protein}': CUSTOM decision requires a protein ID after the colon")
                
                # If validation errors, stop processing
                if validation_errors:
                    error_message = "Cannot process due to the following errors:<br>"
                    for error in validation_errors:
                        error_message += f"• {error}<br>"
                    error_message += "<br>Valid combinations:<br>"
                    error_message += "• All proteins can be ASIS (no changes)<br>"
                    error_message += "• CUSTOM, NEW and REMOVE can be used together<br>"
                    error_message += "• ASIS cannot be used with other decision types (CUSTOM, NEW or REMOVE)<br>"
                    
                    display(HTML(f"<b style='color:red;'>{error_message}</b>"))
                    self.progress.value = 0
                    self.submit_button.disabled = False
                    self.reset_button.disabled = False
                    return df
                
                # Third pass: process the dataframe
                if decisions_by_combo:
                    processed_df = df.copy()
                    processed_count = 0
                    total_combinations = len(decisions_by_combo)

                    for combo, protein_decisions in decisions_by_combo.items():
                        try:
                            # Extract protein IDs for pattern matching
                            proteins = []
                            for part in combo.split('; '):
                                if not part.startswith('['):
                                    protein_id = part.split()[0]
                                    proteins.append(protein_id)
                            
                            # Create regex pattern
                            pattern_parts = []
                            for protein in proteins:
                                escaped_protein = re.escape(protein)
                                pattern_parts.append(f'(?=.*{escaped_protein})')
                            pattern = ''.join(pattern_parts)

                            try:
                                # Find matching rows
                                valid_rows = processed_df['Positions in Proteins'].notna()
                                mask = processed_df['Positions in Proteins'].fillna('').str.contains(pattern, regex=True)
                                mask = valid_rows & mask
                                matched_indices = processed_df[mask].index

                                for idx in matched_indices:
                                    row = processed_df.loc[idx]
                                    positions = row['Positions in Proteins'].split('; ')
                                    master_accs = row['Master Protein Accessions'].split('; ') if '; ' in row['Master Protein Accessions'] else [row['Master Protein Accessions']]
                                    
                                    # Extract protein IDs from positions
                                    current_proteins = []
                                    for pos in positions:
                                        parts = pos.split()
                                        if parts and not parts[0].startswith('['):
                                            current_proteins.append(parts[0])
                                    
                                    if set(current_proteins) == set(proteins):
                                        # Check if all decisions are ASIS
                                        all_asis = all(decision.upper() == 'ASIS' for decision in protein_decisions.values())
                                        if all_asis:
                                            continue
                                        
                                        # Process decisions
                                        proteins_to_remove = []
                                        custom_changes = {}
                                        new_proteins = []
                                        
                                        for protein, decision in protein_decisions.items():
                                            decision_upper = decision.upper()
                                            
                                            if decision_upper == 'NEW':
                                                new_proteins.append(protein)
                                                proteins_to_remove.append(protein)
                                                
                                                # Create new row for this protein
                                                matching_position = next((p for p in positions if p.startswith(protein)), None)
                                                if matching_position:
                                                    new_row = row.copy()
                                                    new_row['Positions in Proteins'] = matching_position
                                                    new_row['Master Protein Accessions'] = protein
                                                    new_rows.append(new_row)
                                                    
                                            elif decision_upper == 'REMOVE':
                                                proteins_to_remove.append(protein)
                                                
                                            elif decision_upper.startswith('CUSTOM:') or decision.startswith('Custom:'):
                                                parts = decision.split(':', 1)
                                                if len(parts) > 1:
                                                    new_protein_id = parts[1].strip()
                                                    custom_changes[protein] = new_protein_id
                                        
                                        # First apply custom changes
                                        for protein, new_protein_id in custom_changes.items():
                                            for i, pos in enumerate(positions):
                                                if pos.startswith(protein):
                                                    pos_parts = pos.split(' ', 1)
                                                    if len(pos_parts) > 1:
                                                        pos_range = pos_parts[1]
                                                        new_position = f"{new_protein_id} {pos_range}"
                                                        positions[i] = new_position
                                            
                                            for i, acc in enumerate(master_accs):
                                                if acc == protein:
                                                    master_accs[i] = new_protein_id
                                        
                                        # Then remove proteins
                                        if proteins_to_remove:
                                            positions = [pos for pos in positions if not any(pos.startswith(p) for p in proteins_to_remove)]
                                            master_accs = [acc for acc in master_accs if acc not in proteins_to_remove]
                                        
                                        # Update the original row if there are positions left
                                        if positions:
                                            processed_df.at[idx, 'Positions in Proteins'] = '; '.join(positions)
                                            processed_df.at[idx, 'Master Protein Accessions'] = '; '.join(master_accs) if master_accs else "Unknown"
                                        else:
                                            rows_to_remove.add(idx)

                            except Exception as regex_error:
                                display(HTML(f"<b style='color:orange;'>Warning: Error in pattern matching: {str(regex_error)}</b>"))
                                continue

                        except Exception as combo_error:
                            display(HTML(f"<b style='color:orange;'>Warning: Error processing combination {combo}: {str(combo_error)}</b>"))
                            continue

                        processed_count += 1
                        progress = 30 + (processed_count / total_combinations * 70)
                        self.progress.value = progress

                    # Final processing
                    if rows_to_remove:
                        processed_df = processed_df.drop(index=list(rows_to_remove))
                    if new_rows:
                        processed_df = pd.concat([processed_df, pd.DataFrame(new_rows)], ignore_index=True)

                    self.pd_results_cleaned = processed_df
                    self.progress.value = 100
                    display(HTML("<b style='color:green;'>Processing complete.</b>"))
                    
                    # Notify the workflow that processing completed successfully
                    if hasattr(self, 'workflow') and hasattr(self.workflow, '_on_submit_complete'):
                        self.workflow._on_submit_complete(True)
                    
            except Exception as e:
                display(HTML(f"<b style='color:red;'>Error in submit handler: {str(e)}</b>"))
                self.progress.value = 0
                
                # Notify the workflow that processing failed
                if hasattr(self, 'workflow') and hasattr(self.workflow, '_on_submit_complete'):
                    self.workflow._on_submit_complete(False)

            finally:
                self.submit_button.disabled = False
                self.reset_button.disabled = False

        return self.pd_results_cleaned

    def create_help_icon(self, tooltip_text):
        """Create a help icon widget whose hover tooltip shows ``tooltip_text``.

        Args:
            tooltip_text: Text placed in the ``title`` attribute of the wrapping
                ``<div>``. It is converted to ``str`` and HTML-escaped so that
                quotes, ``&`` or angle brackets in the text cannot terminate the
                attribute or inject markup.

        Returns:
            widgets.HTML: An inline-block ``<div>`` containing an
            ``<i class="fa fa-question-circle">`` icon.
        """
        # Function-scope import: `html` is stdlib and is only needed here.
        from html import escape

        # quote=True also escapes " and ', which would otherwise close title="...".
        safe_tooltip = escape(str(tooltip_text), quote=True)
        return widgets.HTML(
            f'<div title="{safe_tooltip}" style="display: inline-block;">'
            '<i class="fa fa-question-circle" style="color: #007bff;"></i>'
            '</div>'
        )

    def _create_buttons(self):
        """Build the Submit/Reset buttons plus the progress bar and wire the click handlers.

        The widgets are stored on ``self.submit_button``, ``self.reset_button`` and
        ``self.progress``.

        Returns:
            widgets.VBox: The button row stacked on top of the progress bar.
        """
        def make_button(label, style_name):
            # Both buttons start enabled and differ only in label and colour.
            return widgets.Button(description=label, button_style=style_name, disabled=False)

        self.submit_button = make_button("Submit", 'success')
        self.reset_button = make_button("Reset", 'warning')

        progress_options = {
            'value': 0,
            'min': 0,
            'max': 100,
            'description': 'Processing:',
            'bar_style': 'info',
            'style': {'bar_color': '#0080ff'},
            'orientation': 'horizontal',
            'layout': widgets.Layout(width='50%'),
        }
        self.progress = widgets.FloatProgress(**progress_options)

        container_layout = widgets.Layout(
            width='100%',
            margin='5px 0',
            max_height='150px',
            max_width='1000px',
            overflow='hidden',
        )
        button_row = widgets.HBox([self.submit_button, self.reset_button])
        button_box = widgets.VBox([button_row, self.progress], layout=container_layout)

        def handle_submit(button):
            # Hand the submit handler a fresh copy of the loaded results on every click.
            return self._on_submit(button, self.pd_results.copy())

        self.reset_button.on_click(self._on_reset_button_clicked)
        self.submit_button.on_click(handle_submit)

        return button_box
    
    def _on_reset_button_clicked(self, b):
        """Reset every decision input to its default and discard earlier processing results.

        For each ``(combo, protein, input_field)`` in ``self.decision_inputs`` the
        default is taken from the first row of ``self.pd_results`` whose set of
        protein IDs in 'Positions in Proteins' equals the IDs in ``combo``:

        * ``'new'``    - ``protein`` is listed in that row's 'Master Protein Accessions'
        * ``'remove'`` - it is not listed there
        * ``'asis'``   - no row matches, or the row's master accessions are missing

        Afterwards ``self.user_decisions`` is emptied and ``self.pd_results_cleaned``
        becomes a fresh copy of ``self.pd_results``. Any exception is reported in
        ``self.protein_output_area``; both buttons are re-enabled in every case.

        Args:
            b: The clicked button (unused).
        """
        # Disable buttons during reset
        self.submit_button.disabled = True
        self.reset_button.disabled = True

        # Clear output area
        with self.protein_output_area:
            self.protein_output_area.clear_output()
            display(HTML("<b style='color:blue;'>Resetting options to defaults...</b>"))

        # Reset progress bar
        self.progress.value = 0

        # Status line shown next to an input for each possible default decision.
        status_text = {
            'new': "Will be created as new row",
            'remove': "Will be removed",
            'asis': "Will keep as is",
        }

        try:
            df = self.pd_results  # read-only here, so no defensive copy is needed
            total_inputs = len(self.decision_inputs)

            # Index the frame once instead of re-scanning it for every input:
            # protein-ID set -> master accessions of the FIRST row with that set.
            first_master_by_proteins = {}
            if total_inputs:
                row_values = zip(df['Positions in Proteins'], df['Master Protein Accessions'])
                for positions, master_accs in row_values:
                    if pd.isna(positions):
                        continue
                    # Blank segments (e.g. from a trailing '; ') carry no protein ID;
                    # skipping them avoids an IndexError that would abort the reset.
                    row_proteins = frozenset(
                        part.split()[0] for part in positions.split('; ') if part.strip()
                    )
                    first_master_by_proteins.setdefault(row_proteins, master_accs)

            for processed, (combo, protein, input_field) in enumerate(self.decision_inputs, start=1):
                # Determine default decision
                default_decision = 'asis'
                combo_key = frozenset(combo.split('; '))
                if combo_key in first_master_by_proteins:
                    master_accs = first_master_by_proteins[combo_key]
                    if not pd.isna(master_accs):
                        master_proteins = [acc.strip() for acc in master_accs.split(';')]
                        default_decision = 'new' if protein in master_proteins else 'remove'

                # Set input field value
                input_field.value = default_decision

                # Update status display
                status_display = self.status_displays[(combo, protein)]
                status_display.value = f'<span style="color: gray">{status_text[default_decision]}</span>'

                # Update progress
                self.progress.value = (processed / total_inputs) * 100

            # Reset internal state
            self.user_decisions = {}
            self.pd_results_cleaned = self.pd_results.copy()

            with self.protein_output_area:
                self.protein_output_area.clear_output()
                display(HTML("<b style='color:green;'>Reset complete. All options set to defaults.</b>"))

        except Exception as e:
            with self.protein_output_area:
                self.protein_output_area.clear_output()
                display(HTML(f"<b style='color:red;'>Error during reset: {str(e)}</b>"))

        finally:
            # Re-enable buttons
            self.submit_button.disabled = False
            self.reset_button.disabled = False

In [5]:
class ProcessingWorkflow:
    def __init__(self):
        """Create the workflow components and connect them through observers.

        Instantiates ``DataTransformation``, ``ProteinCombinationHandler`` (which
        receives the transformer and this workflow) and ``GroupProcessing``,
        renders the step instructions, then subscribes to changes of the
        transformer's ``pd_results`` and ``protein_dict`` and of the
        protein-mapping widget's ``value``.
        """
        self.data_transformer = DataTransformation()
        # The handler is given a back-reference to this (not yet fully initialised) workflow.
        self.protein_handler = ProteinCombinationHandler(self.data_transformer, self)
        self.group_processor = GroupProcessing()
        
        # Builds the step 1-3 instruction outputs and registers the status-message observers.
        self._initialize_instructions()
        
        # Set up observers
        self.data_transformer.observe(self._handle_data_change, names=['pd_results'])
        self.data_transformer.observe(self._handle_fasta_change, names=['protein_dict'])
        
        # Flag that tracks whether the protein-mapping step has been completed or skipped
        self.protein_processing_complete = False
        
        # Connect the protein handler's widget to enable group processing when complete
        # NOTE(review): _initialize_instructions() has already registered
        # _update_step2_status on this same widget. If observers fire in
        # registration order, the step-2 message is refreshed before
        # _on_protein_mapping_change re-displays it - confirm before reordering.
        self.protein_handler.protein_mapping_widget.observe(self._on_protein_mapping_change, names='value')

    def _handle_data_change(self, change):
        """Handle changes in proteomics data"""
        if change.name == 'pd_results':
            if change.new is not None:
                self.protein_handler.protein_mapping_widget.disabled = False
                #display(HTML("<h3>Multiple Protein Mappings</h3>"))
                self.protein_handler.pd_results = change.new
                #self.protein_handler.process_protein_mapping()

                #self.protein_handler.process_protein_mapping()
            self.group_processor.update_data(change.new)
    
    def _on_protein_mapping_change(self, change):
        """Handle changes to protein mapping widget selection"""
        if change['name'] == 'value':
            # If user selects to skip protein mapping, enable group processing immediately
            if change['new'] is False:  # "No (skip)" option
                self.protein_processing_complete = True
                self.group_processor.enable_widgets(True)
                # Update the message to indicate group processing is now available
                with self.steptwo_status_output:
                    clear_output(wait=True)
                    display(HTML(self.steptwo_output_html_message))

    def _handle_fasta_change(self, change):
        """Handle changes in FASTA data"""
        if change.new != change.old:
            #self.protein_handler.process_protein_mapping()
            pass
            #with protein_mapping_output:
            #    protein_mapping_output.clear_output()
            #    display(HTML("<h3>Multiple Protein Mappings</h3>"))
            #    print(f"Using updated protein dictionary with {len(self.data_transformer.protein_dict)} proteins")
            #    #if self.protein_handler.pd_results is not None:
            #    #    self.protein_handler.process_protein_mapping()
    
    def _initialize_instructions(self):
        def stepone_instructions():
            self.stepone_output_html_message = """
            <div style='padding: 10px; background-color: #f8f9fa; border-left: 5px solid #007bff; margin: 10px 0;'>
                <h3>Step 1: Upload Data</h3>
                <ul style='list-style-type: none;'>
                    <li> <b>Required:</b> Upload a peptidomic file exported from your software of choice</li>
                    <li> Optional: Upload Functional Data or Query the MBPDB for functional data</li>
                    <li> Optional: Upload a FASTA file or by defulat our internal protein database or UniProt will be querried for protein information</li>
                </ul>
                <p>Start by uploading your peptidomic data file. This file should contain peptide sequences, 
                abundance values, and protein mapping information.</p>
            </div>
            """
            self.stepone_status_output = widgets.Output(
                layout=widgets.Layout(
                    max_width='1000px',
                    width='100%'
                )
            )
            with self.stepone_status_output:
                clear_output(wait=True)
                display(HTML(self.stepone_output_html_message))
                
        def steptwo_instructions():
            self.steptwo_output_html_message = """
            <div style='padding: 10px; background-color: #f8f9fa; border-left: 5px solid #007bff; margin: 10px 0;'>
                <h3>Step 2 (Optional): Organize Peptides with Multiple Protein Mappings</h3>
                <p>In peptidomic datasets, peptides may map to multiple proteins due to:</p>
                <ul style='list-style-type: circle;'>
                    <li>Multiple proteins sharing identical sequences</li>
                    <li>Proteins from different species with similar sequences</li>
                    <li>Ambiguous mapping from incomplete sequence information</li>
                </ul>
                <p>Please select whether you want to organize these mappings manually or use the default approach:</p>
                <ul style='list-style-type: none;'>
                    <li><b>Yes</b> - Process manually: Review each protein mapping combination and decide how to handle it</li>
                    <li><b>No (skip)</b> - Default mapping: Returns the dataset unchanged keeping the current protein mapping.</li>
                </ul>
            </div>
            """
            self.steptwo_status_output = widgets.Output(
                layout=widgets.Layout(
                    max_width='1000px',
                    width='100%'
                )
            )
            with self.steptwo_status_output:
                clear_output(wait=True)
                display(HTML(self.steptwo_output_html_message))

        def stepthree_instructions():
            self.stepthree_output_html_message = """
            <div style='padding: 10px; background-color: #f8f9fa; border-left: 5px solid #007bff; margin: 10px 0;'>
                <h3>Step 3 (Optional): Assign Study Variables for Data Grouping</h3>
                <p>This step allows you to define experimental variables by grouping absorbance data columns. Select one of these three options:</p>
                
                <details>
                    <summary style='cursor: pointer; padding: 8px; background-color: #f8f9fa; border-radius: 5px; margin: 5px 0;'>
                        <b>Option 1: Upload Existing Group Dictionary</b>
                    </summary>
                    <div style='padding: 8px; margin-left: 20px;'>
                        <ul>
                            <li>Import a predefined JSON file from previous analyses</li>
                            <li>Useful for reusing experimental designs or standardized groupings</li>
                        </ul>
                    </div>
                </details>
                
                <details>
                    <summary style='cursor: pointer; padding: 8px; background-color: #f8f9fa; border-radius: 5px; margin: 5px 0;'>
                        <b>Option 2: Assign Selected Columns to a Group</b>
                    </summary>
                    <div style='padding: 8px; margin-left: 20px;'>
                        <ul>
                            <li>Select absorbance columns and assign them a group name</li>
                            <li>Use the search function to find columns containing specific text</li>
                            <li>Click "Add Group" to create multiple groups as needed</li>
                        </ul>
                    </div>
                </details>
                
                <details>
                    <summary style='cursor: pointer; padding: 8px; background-color: #f8f9fa; border-radius: 5px; margin: 5px 0;'>
                        <b>Option 3: Use Selected Columns Without Grouping</b>
                    </summary>
                    <div style='padding: 8px; margin-left: 20px;'>
                        <ul>
                            <li>Select absorbance columns and use them directly without assigning a common group name</li>
                            <li>Each column will be treated as its own separate group</li>
                            <li>Use "No Groups" button after selecting the columns</li>
                        </ul>
                    </div>
                </details>
                
                <details>
                    <summary style='cursor: pointer; padding: 8px; background-color: #f8f9fa; border-radius: 5px; margin: 5px 0;'>
                        <b>How Grouping Works</b>
                    </summary>
                    <div style='padding: 8px; margin-left: 20px;'>
                        <ul>
                            <li><b>Purpose:</b> When you create a group, the system will calculate an average absorbance value across all selected columns.</li>
                            <li><b>Example:</b> If you group 3 replicates named "Sample1_Rep1", "Sample1_Rep2", and "Sample1_Rep3" as "Sample1", you'll get a new "Avg_Sample1" column in your processed data.</li>
                            <li><b>Multiple Groups:</b> The same column can be assigned to multiple groups for different analyses (e.g., a column could belong to both "Treatment" and "Day1" groups).</li>
                        </ul>
                    </div>
                </details>
                
                <p style='margin-top: 10px;'>After defining your groups, proceed to Step 4 to generate the processed dataset with averaged values.</p>
            </div>
            """
            
            self.stepthree_status_output = widgets.Output(
                layout=widgets.Layout(
                    max_width='1000px',
                    width='100%'
                )
            )
            
            with self.stepthree_status_output:
                display(HTML(self.stepthree_output_html_message))
                
        stepone_instructions()
        steptwo_instructions()
        stepthree_instructions()       


        # Set up observers for the file uploaders and UniProt search checkbox
        self.data_transformer.pd_uploader.observe(self._update_step1_status, names='value')
        self.data_transformer.mbpdb_uploader.observe(self._update_step1_status, names='value')
        self.data_transformer.fasta_uploader.observe(self._update_step1_status, names='value')
        self.data_transformer.search_button.observe(self._update_step1_status, names='value')
        self.data_transformer.mbpdb_results_from_search_placeholder.observe(self._update_step1_status, names='value')
        self.data_transformer.fasta_uploader_placeholder.observe(self._update_step1_status, names='value')
        #self.data_transformer.uniprot_search.observe(self._update_step1_status, names='value')
        self.data_transformer.pd_uploader.observe(self._update_step2_status_skip, names='value')
        self.protein_handler.protein_mapping_widget.observe(self._update_step2_status, names='value')
        self.group_processor.notification_widget.observe(self._update_step3_message, names='value')

    def _update_step1_status(self, change):
        """Update Step 1 status message based on uploaded data"""
        # Check current status of uploads
        has_peptide_data = hasattr(self.data_transformer, 'pd_results') and not self.data_transformer.pd_results.empty
        has_mbpdb_data = hasattr(self.data_transformer, 'mbpdb_results') and self.data_transformer.mbpdb_results is not None and not self.data_transformer.mbpdb_results.empty
        has_protein_data = hasattr(self.data_transformer, 'protein_dict') and len(self.data_transformer.protein_dict) > 0
        has_fasta_data = self.data_transformer.fasta_uploader_placeholder.value
        has_uniprot_enabled = hasattr(self.data_transformer, 'uniprot_search') and self.data_transformer.uniprot_search.value
        
        if has_peptide_data:
            # Get row counts for informative message
            peptide_count = len(self.data_transformer.pd_results)
            protein_count = len(self.data_transformer.protein_dict) if has_protein_data else 0
            mbpdb_count = len(self.data_transformer.mbpdb_results) if has_mbpdb_data else 0
            fasta_count = len(self.data_transformer.protein_dict) if has_fasta_data else 0
            
            if has_mbpdb_data:
                filename = f'<b>{self.data_transformer.mbpdb_filename}</b>' if self.data_transformer.mbpdb_results_from_search_placeholder.value == False else f'<b>MBPDB Successfully Queried</b>'
            else:
                filename = ''
           
            if has_protein_data:
                protein_fimename = f'<b>{self.data_transformer.fasta_filename}</b>' if has_fasta_data else ''

            # Create updated message showing success with checkmarks
            self.stepone_output_html_message = f"""
            <div style='padding: 10px; background-color: #e8f5e9; border-left: 5px solid #4caf50; margin: 10px 0;'>
                <h3>Step 1: Upload Data</h3>
                <ul style='list-style-type: none;'>
                    <li>✅ <b>{self.data_transformer.pd_filename}</b> Peptide data loaded with {peptide_count} rows of data</li>
                    <li>{('✅ ' if has_mbpdb_data else '➤')} {str(filename) + ' returning ' + str(mbpdb_count) + ' rows of data' if has_mbpdb_data else 'No Functional data (optional)'}</li>
                    <li>{('✅ ' if has_protein_data or has_uniprot_enabled or has_fasta_data else '➤')} Optional: {
                        'Protein data loaded (' + str(protein_count) + ' proteins loaded from <b>Internal Protein Database</b>)' if has_protein_data and not has_fasta_data
                        else 'Protein data loaded (' + str(fasta_count) + ' proteins loaded from ' + str(protein_fimename) + ')' if has_fasta_data
                        else 'UniProt search enabled' if has_uniprot_enabled 
                        else 'No protein data (optional)'
                    }</li>
                </ul>
                <p style='color: green;'><b>✅ All required data is loaded. You can proceed to Step 2.</b></p>
            </div>
            """
        else:
            # Still waiting for peptide data upload
            self.stepone_output_html_message = """
            <div style='padding: 10px; background-color: #fff3e0; border-left: 5px solid #ff9800; margin: 10px 0;'>
                <h3>Step 1: Upload Data</h3>
                <ul style='list-style-type: none;'>
                    <li>➤ <b>Required:</b> Upload a peptidomic data file exported from your software of choice</li>
                    <li>➤ Optional: Upload Functional Data or Query the MBPDB for functional data</li>
                    <li>➤ Optional: Upload a FASTA file or enable UniProt search for protein information</li>
                </ul>
                <p style='color: #ff9800;'><b>⚠️ Please upload your peptide data file to continue.</b></p>
            </div>
            """
        
        # Update the status display
        with self.stepone_status_output:
            clear_output(wait=True)
            display(HTML(self.stepone_output_html_message))
    
    def _update_step2_status_skip(self, change):
        """Update Step 2 status message based on protein mapping choice"""
        combinations = self.protein_handler.get_protein_combinations()
        if len(combinations) == 0:
            # User selected to use default
            self.steptwo_output_html_message = """
            <div style='padding: 10px; background-color: #e8f5e9; border-left: 5px solid #4caf50; margin: 10px 0;'>
                <h3>Step 2 (Optional): Organize Peptides with Multiple Protein Mappings</h3>
                <p style='color: green; margin-top: 10px;'><b>✅ Peptides are not mapped to multiple proteins. No changes will be made to the protein mapping. You can proceed to Step 3.</b></p>
            </div>
            """
            self.protein_handler.protein_mapping_widget.value = False
            self.protein_handler.protein_mapping_widget.disabled = True

        # Update the display
        with self.steptwo_status_output:
            clear_output(wait=True)
            display(HTML(self.steptwo_output_html_message))

    def _update_step2_status(self, change):
        """Refresh the Step 2 banner when the protein-mapping choice changes.

        Nothing happens when ``protein_handler.get_protein_combinations()`` is
        empty, when the change is not for the ``value`` trait, or when the new
        value is ``None``. Otherwise the HTML for the chosen mode is stored in
        ``steptwo_output_html_message`` and rendered into
        ``steptwo_status_output``.

        Args:
            change: Change mapping read through ``change['name']`` and
                ``change['new']``; a truthy ``new`` selects manual processing,
                a falsy one selects the default processing banner.
        """
        protein_combos = self.protein_handler.get_protein_combinations()
        if len(protein_combos) == 0:
            return
        if change['name'] != 'value' or change['new'] is None:
            return

        manual_mode = change['new']  # True or False

        # NOTE: the HTML literals below are kept verbatim (including their
        # leading indentation and the column of the closing quotes) because that
        # whitespace is part of the emitted string.
        if not manual_mode:
            # User selected to use default
            self.steptwo_output_html_message = """
                    <div style='padding: 10px; background-color: #e8f5e9; border-left: 5px solid #4caf50; margin: 10px 0;'>
                        <h3>Step 2 (Optional): Organize Peptides with Multiple Protein Mappings</h3>
                        <p><b>✅ Default processing selected.</b> Peptides will be keep defualt mapping based on Master Protein Accessions.</p>
                        <div style='background-color: #f8f9fa; padding: 10px; border-radius: 5px; margin-top: 10px;'>
                            <p>The system will follow these default rules:</p>
                            <ul>
                                <li>Use the first protein ID in the Master Protein Accessions field</li>
                                <li>For peptides with multiple positions, prefer the master protein position</li>
                                <li>Retain all peptides in the dataset without manual filtering</li>
                            </ul>
                        </div>
                        <p style='color: green; margin-top: 10px;'><b>✅ No changes will be made to the protein mapping. You can proceed to Step 3.</b></p>
                    </div>
                    """
        else:
            # User selected to process manually: build the four worked examples
            # that are embedded in the collapsible section of the banner.
            variant_examples = """
                    <p><b>Example 1:</b> Peptide mapped to Beta-Casein varriant P02666A1, P02666A2</p>
                    <ul>
                        <li><b>Example 1.1:</b> Rename P02666A1/P02666A2 to P02666</li>
                        <ul>
                            <li>Choose <code>Custom: P02666</code> for both P02666A1 to rename one of the proteins in the combination to P02666</li>
                            <li>Choose <code>remove</code> for P02666A2 to remove the protein from the combination leaving only P02666</li>
                        </ul>
                        <li><b>Example 1.2:</b> separate P02666A1/P02666A2 into new respective rows P02666A1 and P02666A2</li>
                        <ul>
                            <li>Choose <code>new</code> for both P02666A1 and P02666A2 to separate and create two new rows</li>
                        </ul>
                        <li><b>Example 1.3:</b> Remove P02666A2 from the combination leaving only P02666A1</li>
                        <ul>
                            <li>Choose <code>remove</code> for P02666A2 to remove the protein from the combination</li>
                            <li>Choose <code>new</code> for P02666A1 to keep the combination returning only P02666A1 from the combination</li>
                        </ul>
                        <li><b>Example 1.4:</b> Return combination default by assign <code>asis</code> to the combination P02666A1/P02666A2</li>
                        <ul>
                            <li>Choose <code>asis</code> for both P02666A1 and P02666A2 to keep the combination as is, returning the current combination unchanged</li>         
                        </ul>
                    </ul>
                    """

            species_examples = """
                    <p><b>Example 2:</b> Peptide with mapping to Bovine (P02666) and Human (P05814) Beta-Casein</p>
                    <ul>
                        <li><b>Example 2.1:</b> Remove Human Beta-casein as Bovine data is the only interests of the study</li>
                        <ul>
                            <li>Choose <code>new</code> for P02666 keeping the protein in the combination</li>
                            <li>Choose <code>remove</code> for P05814 to remove the protein from the combination leaving only P02666</li>
                        </ul>
                        <li><b>Example 2.2:</b> separate P02666/P05814 into new respective rows P02666 and P05814</li>
                        <ul>
                            <li>Choose <code>new</code> for both P02666 and P05814 to separate and create two new rows</li>
                        </ul>
                        <li><b>Example 2.3:</b> Return combination default by assign <code>asis</code> to the combination P02666/P05814</li>
                        <ul>
                            <li>Choose <code>asis</code> for both P02666 and P05814 to keep the combination as is, returning the current combination unchanged</li>         
                        </ul>
                    </ul>
                    """

            minor_protein_examples = """
                    <p><b>Example 3:</b> Peptide with mapping to protein of interest (P02663, "Alpha-S2-casein") and unimportant minor protein (A5D980, "Protein tyrosine phosphatase")</p>
                    <ul>
                        <li><b>Example 3.1:</b> Remove A5D980 as P02663 data is of interests in the study</li>
                        <ul>
                            <li>Choose <code>new</code> for P02663 keeping the protein in the combination</li>
                            <li>Choose <code>remove</code> for A5D980 to remove the protein from the combination leaving only P02663</li>
                        </ul>
                    </ul>
                    """

            fragment_examples = """
                    <p><b>Example 4:</b> Peptide with mapping to primary protein (P02754,"Beta-lactoglobulin") and primary protein frament or the same protein with different name (Q9BDG3, "Beta lactoglobulin D (Fragment)") </p>
                    <ul>
                        <li><b>Example 4.1:</b> Remove Q9BDG3 so all Beta-lactoglobulin peptides will be mapped to P02754 </li>
                        <ul>
                            <li>Choose <code>new</code> for P02754 keeping the protein in the combination</li>
                            <li>Choose <code>remove</code> for Q9BDG3 to remove the protein from the combination leaving only P02754</li>
                        </ul>
                    </ul>
                    """

            self.steptwo_output_html_message = f"""
                    <div style='padding: 10px; background-color: #e8f5e9; border-left: 5px solid #4caf50; margin: 10px 0;'>
                        <h3>Step 2 (Optional): Organize Peptides with Multiple Protein Mappings</h3>
                        <p><b>✅ Manual processing selected.</b> You'll review each protein mapping combination and decide how to handle it.</p>
                        <div style='background-color: #f8f9fa; padding: 10px; border-radius: 5px; margin-top: 10px;'>
                            <h4>Peptides Mapped to Multiple Proteins</h4>
                            <p>For each combination of proteins, you can choose:</p>
                            <ul>
                                <li><b>new</b> - Create a separate row for this protein</li>
                                <li><b>remove</b> - Remove this protein from the combination</li>
                                <li><b>asis</b> - Returns current combination unchanged</li>
                                <li><b>Custom: (protein ID)</b> - e.g., Custom: P02666A1 to assign a custom protein ID</li>
                            </ul>
                            <p>These options help you handle situations where:</p>
                            <ul>
                                <li>Multiple proteins in Master Protein Accessions</li>
                                <li>Multiple proteins in Positions in Proteins</li>
                                <li>Proteins from different species assigned to the same peptide</li>
                                <li>Unknown protein mappings (from missing values)</li>
                            </ul>
                            <details>
                                <summary style='cursor: pointer; font-weight: bold;'>Click to see examples</summary>
                                <div style='margin-top: 10px; padding: 10px; background-color: #f1f8e9; border-radius: 5px;'>
                                {variant_examples}
                                {species_examples}
                                {minor_protein_examples}
                                {fragment_examples}
                                </div>
                            </details>
                        </div>
                        <p style='margin-top: 10px;'>Please review the protein combinations below and click "Submit" when finished.</p>
                    </div>
                    """

        # Replace whatever banner is currently shown with the new one.
        with self.steptwo_status_output:
            clear_output(wait=True)
            display(HTML(self.steptwo_output_html_message))
            
    def _update_step3_message(self, change):
        """Update the Step 3 status banner based on group processing status.

        When ``group_processor.group_data`` is non-empty, a green banner is shown
        that ticks whichever option (1 = dictionary upload, 2 = manual
        assignment, 3 = no grouping) ``change['new']`` reports as completed and
        summarises the first two groups. When it is empty, the neutral blue
        "choose an option" banner is shown instead.

        User-supplied text (group names and the imported JSON file name) is
        HTML-escaped before being interpolated, so names containing ``<`` or
        ``&`` render literally instead of corrupting the banner.

        Args:
            change: Change mapping; only ``change['new']`` is read and it is
                compared against the option numbers 1, 2 and 3.
        """
        # Local import: only needed to sanitise user-supplied labels.
        from html import escape

        # Get the value from the notification widget
        option_completed = change['new']

        # Check current status of group data
        group_data = self.group_processor.group_data
        has_group_data = bool(group_data)
        group_count = len(group_data) if has_group_data else 0

        # Determine which option was completed
        option1_completed = has_group_data and option_completed == 1  # Uploaded from file
        option2_completed = has_group_data and option_completed == 2  # Assigned groups
        option3_completed = has_group_data and option_completed == 3  # No groups option

        if has_group_data:
            # One "name (N columns)" entry per configured group.
            group_summary = [
                f"{escape(str(group_info['grouping_variable']))} ({len(group_info['abundance_columns'])} columns)"
                for group_info in group_data.values()
            ]

            # Only the first two groups are listed; the rest are counted.
            max_listed = 2
            if len(group_summary) > max_listed:
                group_display = ", ".join(group_summary[:max_listed]) + f" and {len(group_summary) - max_listed} more..."
            else:
                group_display = ", ".join(group_summary)

            # Option 1 mentions the imported file only when that option was
            # actually the one completed; otherwise it keeps its plain title
            # instead of a dangling "imported from" with no file name.
            if option1_completed:
                option1_label = (
                    "Existing Group Dictionary imported from "
                    f"<b>{escape(str(self.group_processor.jsonfilename))}</b>"
                )
            else:
                option1_label = "Upload Existing Group Dictionary"

            option1_marker = '✅ ' if option1_completed else '➤'
            option2_marker = '✅ ' if option2_completed else '➤'
            option3_marker = '✅ ' if option3_completed else '➤'
            option2_suffix = ' - Completed' if option2_completed else ''
            option3_suffix = ' - Completed' if option3_completed else ''

            if option1_completed or option3_completed:
                summary_line = (
                    "<p style='color: green; margin: 0;'><b>✅ Group data successfully configured:</b> "
                    f"{group_count} groups defined ({group_display}).</p>"
                )
            else:
                summary_line = ""

            if option2_completed:
                next_step_line = "<p style='color: green; margin: 0;'><b>✅ At least one group has been created.</b> You can create more groups or proceed to Step 4.</p>"
            else:
                next_step_line = "<p style='color: green; margin: 0;'><b>You can now proceed to Step 4.</b></p>"

            # Create updated message showing success with checkmarks
            self.stepthree_output_html_message = f"""
            <div style='padding: 10px; background-color: #e8f5e9; border-left: 5px solid #4caf50; margin: 10px 0;'>
                <h3>Step 3 (Optional): Assign Study Variables for Data Grouping</h3>
                <p>This step allows you to define experimental variables by grouping absorbance data columns:</p>
                
                <ul style='list-style-type: none;'>
                    <li>{option1_marker} <b>Option 1:</b> {option1_label}</li>
                    <li>{option2_marker} <b>Option 2:</b> Assign Selected Columns to a Group {option2_suffix}</li>
                    <li>{option3_marker} <b>Option 3:</b> Use Selected Columns Without Grouping {option3_suffix}</li>
                </ul>
                
                <details>
                    <summary style='cursor: pointer;'><b>How Grouping Works</b> (click to expand)</summary>
                    <ul>
                        <li><b>Purpose:</b> When you create a group, the system will calculate an average absorbance value across all selected columns.</li>
                        <li><b>Example:</b> If you group 3 replicates named "Sample1_Rep1", "Sample1_Rep2", and "Sample1_Rep3" as "Sample1", you'll get a new "Avg_Sample1" column in your processed data.</li>
                        <li><b>Multiple Groups:</b> The same column can be assigned to multiple groups for different analyses (e.g., a column could belong to both "Treatment" and "Day1" groups).</li>
                    </ul>
                </details>
                
                <div style='background-color: #d4edda; padding: 8px; border-radius: 5px; margin-top: 10px;'>
                    {summary_line}
                    {next_step_line}
                </div>
            </div>
            """
        else:
            # Default message when no groups are defined
            self.stepthree_output_html_message = """
            <div style='padding: 10px; background-color: #f8f9fa; border-left: 5px solid #007bff; margin: 10px 0;'>
                <h3>Step 3 (Optional): Assign Study Variables for Data Grouping</h3>
                <p>This step allows you to define experimental variables by grouping absorbance data columns:</p>
                
                <ul style='list-style-type: none;'>
                    <li>➤ <b>Option 1:</b> Upload Existing Group Dictionary - Import a predefined JSON file from previous analyses</li>
                    <li>➤ <b>Option 2:</b> Assign Selected Columns to a Group - Select columns and assign them a group name</li>
                    <li>➤ <b>Option 3:</b> Use Selected Columns Without Grouping - Create individual groups for each column</li>
                </ul>
                
                <details>
                    <summary style='cursor: pointer;'><b>How Grouping Works</b> (click to expand)</summary>
                    <ul>
                        <li><b>Purpose:</b> When you create a group, the system will calculate an average absorbance value across all selected columns.</li>
                        <li><b>Example:</b> If you group 3 replicates named "Sample1_Rep1", "Sample1_Rep2", and "Sample1_Rep3" as "Sample1", you'll get a new "Avg_Sample1" column in your processed data.</li>
                        <li><b>Multiple Groups:</b> The same column can be assigned to multiple groups for different analyses (e.g., a column could belong to both "Treatment" and "Day1" groups).</li>
                    </ul>
                </details>
                
                <p style='color: #007bff; margin-top: 10px;'><b>Choose one of the three options above to configure your data groups.</b></p>
            </div>
            """

        # Update the status display
        with self.stepthree_status_output:
            clear_output(wait=True)
            display(HTML(self.stepthree_output_html_message))

    def _create_option_summary(self, number, title, is_selected):
        """Create the summary text for an option, with checkmark if selected"""
        if is_selected:
            return f"Option {number}: {title} ✅"
        else:
            return f"Option {number}: {title}"
    # Add a method to update status after submit button is clicked (for manual processing)
    def _on_submit_complete(self, success=True):
        """Show the outcome of manual protein-mapping processing in the Step 2 banner.

        Besides swapping the banner, this records the outcome in
        ``protein_processing_complete`` and enables (on success) or disables (on
        failure) the group-processor widgets.

        Args:
            success: Truthy when the submitted mapping decisions were applied,
                falsy when processing failed.
        """
        if success:
            banner_html = """
            <div style='padding: 10px; background-color: #e8f5e9; border-left: 5px solid #4caf50; margin: 10px 0;'>
                <h3>Step 2 (Optional): Organize Peptides with Multiple Protein Mappings</h3>
                <p><b>✅ Manual processing completed successfully.</b> Your protein mapping choices have been applied.</p>
                <div style='background-color: #f8f9fa; padding: 10px; border-radius: 5px; margin-top: 10px;'>
                    <p>The system has processed your selections for each protein combination.</p>
                    <ul>
                        <li>New protein entries have been created where requested</li>
                        <li>Proteins marked for removal have been filtered out</li>
                        <li>Custom protein IDs have been applied where specified</li>
                    </ul>
                </div>
                <p style='color: green; margin-top: 10px;'><b>✅ All protein mappings have been processed. You can proceed to Step 3.</b></p>
            </div>
            """
        else:
            banner_html = """
            <div style='padding: 10px; background-color: #fff3e0; border-left: 5px solid #ff9800; margin: 10px 0;'>
                <h3>Step 2 (Optional): Organize Peptides with Multiple Protein Mappings</h3>
                <p><b>⚠️ An error occurred during protein mapping processing.</b></p>
                <p>Please check your selections and try again. Make sure all entries have valid decision values.</p>
            </div>
            """
        self.steptwo_output_html_message = banner_html

        # The completion flag and the group-widget state both mirror the outcome.
        processing_ok = bool(success)
        self.protein_processing_complete = processing_ok
        self.group_processor.enable_widgets(processing_ok)

        # Update the display
        with self.steptwo_status_output:
            clear_output(wait=True)
            display(HTML(self.steptwo_output_html_message))

    def display(self):
        """Render the workflow: one box per step, status banner above its widgets."""

        def stacked(children):
            # Every step uses the same vertical container and layout settings.
            return widgets.VBox(
                children,
                layout=widgets.Layout(width='auto', padding='5px', height='auto', overflow='hidden'),
            )

        # Step 1: Upload data
        upload_controls = self.data_transformer.display_widgets()
        display(stacked([
            self.stepone_status_output,
            upload_controls,
        ]))

        # Step 2 (Optional): Organize proteins
        handler = self.protein_handler
        display(stacked([
            self.steptwo_status_output,
            handler.protein_mapping_widget,
            handler.buttonuniprot_output_area_output,
            handler.uniprot_output_area,
            handler.protein_mapping_output_area,
        ]))

        # Step 3: Assign study variable grouping
        group_controls = self.group_processor.display_widgets()
        display(stacked([
            self.stepthree_status_output,
            group_controls,
        ]))

In [6]:
class CombineAverageDataframes:
    def __init__(self, data_transformer, group_processor, protein_handler):
        self.data_transformer = data_transformer
        self.group_processor = group_processor
        self.pd_results = data_transformer.pd_results
        self.mbpdb_results = data_transformer.mbpdb_results
        self.pd_results_cleaned = protein_handler.pd_results_cleaned if hasattr(protein_handler, 'pd_results_cleaned') and protein_handler.pd_results_cleaned is not None else pd.DataFrame()
        self._merged_df = None
        # Set up observer for data changes
        self.data_transformer.observe(self._handle_data_change, names=['pd_results', 'mbpdb_results'])
        
    @property  # Make protein_dict a property that always reads from data_transformer
    def protein_dict(self):
        """Return ``data_transformer.protein_dict``, looked up on every access.

        Nothing is cached on this object, so callers always see whatever the
        transformer currently exposes.
        """
        return self.data_transformer.protein_dict
        
    def _handle_data_change(self, change):
        """Mirror an observed change onto this object, then clear the cell output.

        Args:
            change: Change notification exposing ``name`` and ``new``. Only the
                names ``pd_results``, ``mbpdb_results`` and
                ``pd_results_cleaned`` are copied; any other name is ignored.
        """
        mirrored_names = ('pd_results', 'mbpdb_results', 'pd_results_cleaned')
        if change.name in mirrored_names:
            setattr(self, change.name, change.new)

        # Re-run interactive display
        clear_output()
    @property
    def merged_df(self):
        """Return the merged DataFrame stored in ``_merged_df``.

        The constructor initialises ``_merged_df`` to ``None``, so this is
        ``None`` until some other code assigns it.
        """
        return self._merged_df
        
    def add_protein_info(self, df):
        """
        Adds protein species and name information to the dataframe based on Master Protein Accessions,
        inserting them after Master Protein Accessions and before Positions in Proteins.
        
        Args:
            df (pandas.DataFrame): Input dataframe containing 'Master Protein Accessions' column
            
        Returns:
            pandas.DataFrame: DataFrame with added 'protein_species' and 'protein_name' columns
        """
        # First, make a copy to avoid modifying the original
        df = df.copy()
        
        # Create temporary columns
        df['protein_species'] = 'Unknown'
        df['protein_name'] = 'Unknown Protein'
        
        # Process each row
        for idx, row in df.iterrows():
            # Get the protein accessions - handle potential multiple proteins
            proteins = str(row['Master Protein Accessions']).split(';')
            
            # Process first protein in the list (primary protein)
            if proteins and proteins[0] != '' and proteins[0] != 'nan':
                protein = proteins[0].strip()
                df.at[idx, 'protein_species'] = self.protein_dict.get(protein, {}).get('species', "Unknown")
                df.at[idx, 'protein_name'] = self.protein_dict.get(protein, {}).get('name', "Unknown Protein")
        
        # Get all column names
        all_cols = list(df.columns)
        
        # Remove the new columns from their current position
        remaining_cols = [col for col in all_cols if col not in ['protein_species', 'protein_name']]
        
        # Find the position after 'Master Protein Accessions'
        insert_pos = remaining_cols.index('Master Protein Accessions') + 1
        
        # Create the new column order
        new_cols = (
            remaining_cols[:insert_pos] +  # Columns before and including Master Protein Accessions
            ['protein_species', 'protein_name'] +  # New columns
            remaining_cols[insert_pos:]  # Remaining columns
        )
        
        # Reorder the DataFrame with the new column order
        result_df = df.reindex(columns=new_cols)
        
        # Verify column order (optional debug print)
        # print("DEBUG: Column order:", new_cols)
        # print("DEBUG: Position of Master Protein Accessions:", insert_pos)
        
        return result_df
        
    def extract_bioactive_peptides(self):
        """
        Extracts the list of bioactive peptide matches from the imported MBPDB search.
        """
        if not self.mbpdb_results.empty:
            # Drop rows where protein_id is NaN or 'None'
            mbpdb_results_cleaned = self.mbpdb_results.copy()
            mbpdb_results_cleaned.dropna(subset=['search_peptide'], inplace=True)
            mbpdb_results_cleaned = mbpdb_results_cleaned[mbpdb_results_cleaned['protein_id'] != 'None']

            # Dynamically build aggregation dictionary based on available columns
            available_columns = mbpdb_results_cleaned.columns.tolist()
            
            # Base aggregation for required columns
            agg_dict = {}
            
            # Always include these if they exist
            if 'peptide' in available_columns:
                agg_dict['peptide'] = 'first'
            if 'protein_id' in available_columns:
                agg_dict['protein_id'] = 'first'
            
            # Add optional columns if they exist
            optional_columns = {
                'protein_description': 'first',
                '% Alignment': 'first', 
                'species': 'first',
                'intervals': 'first',
                'additional_details': 'first',
                'ic50': 'first',
                'inhibition_type': 'first',
                'inhibited_microorganisms': 'first',
                'ptm': 'first',
                'title': 'first',
                'authors': 'first',
                'abstract': 'first',
                'doi': 'first',
                'search_type': 'first',
                'scoring_matrix': 'first'
            }
            
            for col, agg_func in optional_columns.items():
                if col in available_columns:
                    agg_dict[col] = agg_func
            
            # Special handling for function column (combine unique values)
            if 'function' in available_columns:
                agg_dict['function'] = lambda x: list(x.dropna().unique())
            
            # Ensure we have at least peptide for grouping
            if not agg_dict:
                print("Warning: No expected columns found for aggregation. Using available columns as-is.")
                return mbpdb_results_cleaned, mbpdb_results_cleaned

            # Perform the groupby and aggregation with error handling
            try:
                self.mbpdb_results_grouped = mbpdb_results_cleaned.groupby('search_peptide').agg(agg_dict).reset_index()
                
                # Flatten the 'function' list if it exists
                if 'function' in self.mbpdb_results_grouped.columns:
                    self.mbpdb_results_grouped['function'] = self.mbpdb_results_grouped['function'].apply(
                        lambda x: '; '.join([str(func) for func in x if str(func) != 'nan']) if isinstance(x, list) else str(x)
                    )
                
                return mbpdb_results_cleaned, self.mbpdb_results_grouped
                
            except Exception as e:
                print(f"Error during MBPDB aggregation: {str(e)}")
                print(f"Available columns: {available_columns}")
                print(f"Aggregation dict: {list(agg_dict.keys())}")
                return mbpdb_results_cleaned, mbpdb_results_cleaned
        else:
            return None, None
    
    def create_unique_id(self, row):
        """Creates a unique ID for each peptide row.

        The ID is the sequence, followed by ``_<modifications>`` when the row has
        a non-missing 'Modifications' value. A list-valued sequence is joined
        with commas. Rows without a 'Modifications' field are treated as
        unmodified, and non-string modification values are converted with
        ``str``.

        Args:
            row: Mapping-like row (pandas Series or dict) with a 'Sequence' entry
                and, optionally, a 'Modifications' entry.

        Returns:
            str: The ID, stripped of surrounding whitespace and trailing
            underscores.
        """
        # Handle Sequence - convert list to comma-separated string if needed
        sequence = row['Sequence']
        if isinstance(sequence, list):
            sequence = ','.join(sequence)
        else:
            sequence = str(sequence).strip()

        # .get() tolerates inputs that have no 'Modifications' column at all
        modifications = row.get('Modifications')
        if pd.notna(modifications):
            unique_id = sequence + "_" + str(modifications).strip()
        else:
            unique_id = sequence

        # An empty modification string leaves a dangling '_' that is removed here
        return str(unique_id).strip().rstrip('_')

    def process_pd_results(self, mbpdb_results_grouped):
        pd_results_cleaned = self.pd_results_cleaned
        
        # Process positions and accessions
        #pd_results_cleaned['Positions in Proteins'] = pd_results_cleaned['Positions in Proteins'].str.split(';', expand=False).str[0]
        #pd_results_cleaned['Master Protein Accessions'] = pd_results_cleaned['Master Protein Accessions'].str.split(';', expand=False).str[0]
                    
        # Handle NaN/Unknown values first
        pd_results_cleaned['Master Protein Accessions'] = pd_results_cleaned['Master Protein Accessions'].fillna('Unknown')
        pd_results_cleaned['Positions in Proteins'] = pd_results_cleaned['Positions in Proteins'].fillna('Unknown')
        
        # Create sequence column if needed
        # Create sequence column if needed
        if 'Sequence' not in pd_results_cleaned.columns:
            # First create Sequence column with NaN values
            pd_results_cleaned['Sequence'] = pd.NA
            
            def extract_sequence(annotated_seq):
                if pd.isna(annotated_seq):
                    return pd.NA
                
                # Case 1: [X].SEQUENCE.[X] format
                if '.' in annotated_seq:
                    parts = annotated_seq.split('.')
                    if len(parts) > 1:
                        return parts[1]
                
                # Case 2: Plain sequence like "LLL" or "WE"
                return annotated_seq
            
            # Apply the extraction function to all rows
            pd_results_cleaned['Sequence'] = pd_results_cleaned['Annotated Sequence'].apply(extract_sequence)
        
        # Create unique ID
        pd_results_cleaned['unique ID'] = pd_results_cleaned.apply(self.create_unique_id, axis=1)

        # Extract start and stop positions
        try:
            # Initialize start and stop columns with NaN
            pd_results_cleaned['start'] = pd.NA
            pd_results_cleaned['stop'] = pd.NA
            
            # Create mask for rows without semicolons (single positions) and not Unknown
            valid_position_mask = (~pd_results_cleaned['Positions in Proteins'].str.contains(';', na=False) & 
                                 (pd_results_cleaned['Positions in Proteins'] != 'Unknown'))
            
            # Process rows with single positions
            single_positions = pd_results_cleaned.loc[valid_position_mask, 'Positions in Proteins']
            if not single_positions.empty:
                extracted = single_positions.str.extract(r'\[(\d+)-(\d+)\]')
                
                # Convert to numeric and handle invalid values
                pd_results_cleaned.loc[valid_position_mask, 'start'] = pd.to_numeric(extracted[0], errors='coerce')
                pd_results_cleaned.loc[valid_position_mask, 'stop'] = pd.to_numeric(extracted[1], errors='coerce')
            
            # Convert to Int64 to handle missing values properly
            pd_results_cleaned['start'] = pd_results_cleaned['start'].astype('Int64')
            pd_results_cleaned['stop'] = pd_results_cleaned['stop'].astype('Int64')
        except Exception as e:
            print(f"Error processing positions: {str(e)}")
        
    
        # Reorder columns with unique ID and Sequence first
        remaining_cols = [col for col in pd_results_cleaned.columns 
                         if col not in ['unique ID', 'Sequence', 'Master Protein Accessions', 
                                      'Positions in Proteins', 'start', 'stop']]
        
        columns_order = ['unique ID', 'Sequence', 'Master Protein Accessions', 
                        'Positions in Proteins', 'start', 'stop'] + remaining_cols
        
        pd_results_cleaned = pd_results_cleaned[columns_order]
                
        # Merge with MBPDB results if available
        if self.mbpdb_results_grouped is not None and not self.mbpdb_results_grouped.empty:
            # First do the regular merge
            merged_df = pd.merge(pd_results_cleaned, self.mbpdb_results_grouped, 
                                right_on='search_peptide', left_on='unique ID', how='left')
            
            # Second pass: handle comma-separated unique IDs
            comma_mask = merged_df['unique ID'].str.contains(',', na=False)
            comma_rows = merged_df[comma_mask].copy()
            
            for idx, row in comma_rows.iterrows():
                # Split the unique ID
                unique_ids = row['unique ID'].split(',')
                
                # Check if any part matches with search_peptide
                matches = self.mbpdb_results_grouped[self.mbpdb_results_grouped['search_peptide'].isin(unique_ids)]

                if not matches.empty:
                    # Take the first match and update all MBPDB columns
                    match = matches.iloc[0]
                    for col in self.mbpdb_results_grouped.columns:
                        #if col != 'search_peptide':  # Don't overwrite unique ID
                        merged_df.loc[idx, col] = match[col]
        
            #display(HTML("<b style='color:green;'>The MBPDB was successfully merged with the peptidomic data matching the Search Peptide and Unique ID columns (including comma-separated IDs).</b>"))
        
        else:
            merged_df = pd_results_cleaned.copy()
            merged_df['function'] = np.nan

        
        # Ensure columns are in correct order
        final_column_order = columns_order + [col for col in merged_df.columns if col not in columns_order]
        merged_df = merged_df[final_column_order]
        
        return merged_df
    
    def calculate_group_abundance_averages(self, df, group_data):
        """Add one ``Avg_<grouping_variable>`` column per group to the frame.

        Each average is the row-wise mean (missing values skipped) of the group's
        abundance columns after coercing them to numeric; non-numeric cells
        become NaN.

        If every expected ``Avg_*`` column is already present, a notice is
        displayed and ``df`` is returned unchanged. If only some are present,
        those are replaced by the recalculated values rather than being appended
        a second time, so the result never has duplicate ``Avg_*`` labels.

        Note: the numeric coercion is written back onto the abundance columns of
        the ``df`` passed in.

        Args:
            df (pandas.DataFrame): Frame containing every listed abundance column.
            group_data (dict): Group definitions; each value provides
                'grouping_variable' and 'abundance_columns'.

        Returns:
            pandas.DataFrame: ``df`` itself when nothing needed calculating,
            otherwise a new frame with the average columns appended at the end.
        """
        expected_columns = [f"Avg_{details['grouping_variable']}" for details in group_data.values()]
        if all(name in df.columns for name in expected_columns):
            display(HTML('<b style="color:orange;">All average abundance columns already exist. Returning original DataFrame.</b>'))
            return df

        # Calculate every group's average, keeping them separate from df for now
        average_columns = {}
        for details in group_data.values():
            abundance_columns = details['abundance_columns']

            # Convert abundance columns to numeric
            for col in abundance_columns:
                df[col] = pd.to_numeric(df[col], errors='coerce')

            average_column_name = f"Avg_{details['grouping_variable']}"
            average_columns[average_column_name] = df[abundance_columns].mean(axis=1, skipna=True)

        # Drop averages left over from an earlier run so the concat below does
        # not produce duplicate column labels
        stale_columns = [name for name in average_columns if name in df.columns]
        if stale_columns:
            df = df.drop(columns=stale_columns)

        return pd.concat([df, pd.DataFrame(average_columns)], axis=1)
        
    def create_column_to_groups_mapping(self, group_data):
        """
        Invert ``group_data`` into a column -> grouping variables lookup.

        Returns a dict keyed by abundance column name; each value is the list
        of grouping variables (in ``group_data`` iteration order) whose
        'abundance_columns' contain that column.
        """
        groups_by_column = {}

        for details in group_data.values():
            variable = details['grouping_variable']
            members = details['abundance_columns']

            # Register this group's variable under each of its columns
            for name in members:
                groups_by_column.setdefault(name, []).append(variable)

        return groups_by_column

    def update_column_names_with_groups(self, df, group_data):
        """
        Return a copy of ``df`` whose grouped abundance columns carry a
        ``'Grouped: (...)'`` suffix listing the grouping variables they
        belong to. The input frame is left untouched; a notice is displayed
        when no column qualified for renaming.
        """
        groups_by_column = self.create_column_to_groups_mapping(group_data)

        # Only columns that are actually present in the frame are renamed
        new_names = {
            name: "{} 'Grouped: ({})'".format(name, "; ".join(groups))
            for name, groups in groups_by_column.items()
            if name in df.columns
        }

        renamed = df.copy().rename(columns=new_names)

        if not new_names:
            display(HTML('<b style="color:orange;">No columns were updated with grouping information.</b>'))

        return renamed

    def process_data(self, group_data):
        """
        Run the full processing pipeline and return the final DataFrame.

        Steps: extract the bioactive peptides, merge them with the PD
        results, optionally add group averages and group-annotated column
        names (only when ``group_data`` is truthy), then add protein
        information. The result is also stored on ``self._merged_df``.
        Returns None (after displaying a message) when there is no PD data
        or when any step raises.
        """
        source = getattr(self, 'pd_results', None)
        if source is None or source.empty:
            display(HTML("<b style='color:red;'>No PD results data available for processing.</b>"))
            return None

        try:
            # Only the grouped MBPDB results are kept; the cleaned frame is unused here
            _, self.mbpdb_results_grouped = self.extract_bioactive_peptides()

            if getattr(self, 'pd_results_cleaned', None) is None:
                self.pd_results_cleaned = self.pd_results.copy()

            merged = self.process_pd_results(self.mbpdb_results_grouped)

            if not group_data:
                prepared = merged
                display(HTML("<b style='color:orange;'>No group data provided. Skipping abundance calculations and column renaming.</b>"))
            else:
                # UserWarnings raised while averaging/renaming are silenced
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore", UserWarning)
                    averaged = self.calculate_group_abundance_averages(merged, group_data)
                    prepared = self.update_column_names_with_groups(averaged, group_data)

            # Add protein name and species, then remember the result
            result = self.add_protein_info(prepared)
            self._merged_df = result
            return result

        except Exception as e:
            display(HTML(f"<b style='color:red;'>Error processing data: {str(e)}</b>"))
            return None
    
    def update_data(self, pd_results):
        """
        Store new PD results and refresh the column dropdown.

        With usable data, ``setup_data`` is re-run and the dropdown options
        are replaced by ``self.filtered_columns``. Without data the dropdown
        is emptied and a notice is shown in the output area. The output area
        is cleared in both cases.
        """
        self.pd_results = pd_results
        has_data = pd_results is not None and not pd_results.empty

        if not has_data:
            # Nothing to choose from: empty the dropdown and tell the user
            self.column_dropdown.options = []
            with self.output:
                self.output.clear_output()
                display(widgets.HTML('<b style="color:orange;">No data available for column selection.</b>'))
            return

        self.setup_data()

        # Offer the freshly filtered columns for selection
        self.column_dropdown.options = self.filtered_columns

        with self.output:
            self.output.clear_output()

In [7]:
class ExportManager:
    """Class to manage all export operations with predefined buttons"""
    def __init__(self):
        """
        Build every export widget (initially disabled) and wire its handler.

        Attributes created here:
            status_output: Output area for status messages.
            *_button: One download/export button per export type.
            correlation_type: Dropdown choosing 'Pearson' or 'Spearman'.
            log_transform: Checkbox toggling the log10 transform.
            mbpdb_df, group_data, merged_df: Data references, initially None.

        No container widget is assembled here; the buttons are exposed as
        individual attributes.
        """

        def make_button(description, tooltip=None):
            # All export buttons share icon, style, width and start disabled;
            # a tooltip is only set for the buttons that define one.
            options = dict(
                description=description,
                icon='download',
                button_style='info',
                layout=widgets.Layout(width='300px'),
                disabled=True,
            )
            if tooltip is not None:
                options['tooltip'] = tooltip
            return widgets.Button(**options)

        # Create output area for status messages
        self.status_output = widgets.Output()

        # Download buttons for the core data sets
        self.mbpdb_button = make_button(
            'Download MBPDB Results',
            tooltip='Download the results from searching your peptides against the MBPDB database',
        )
        self.group_data_button = make_button(
            'Download Group Definitions',
            tooltip='Download the categorical variable definitions used for data grouping and analysis',
        )
        self.dataset_button = make_button(
            'Download Merged Dataset',
            tooltip='Download the complete merged dataset containing all processed data',
        )

        # Export buttons for the derived analyses
        self.export_group_correlation_button = make_button('Export Sample-to-Sample Correlations')
        self.export_replicate_correlation_button = make_button('Export Technical Replicate Correlations')
        # This button previously reused the 'Export Summed Peptide Results'
        # label of the button below, making the two indistinguishable even
        # though they trigger different handlers.
        self.export_sequence_list_button = make_button('Export Sequence List')
        self.export_summed_peptide_results_button = make_button('Export Summed Peptide Results')
        self.export_protein_data_button = make_button('Export Protein Analysis Results')
        self.export_summed_function_data_button = make_button('Export Summed Functional Data')

        # Correlation settings
        self.correlation_type = widgets.Dropdown(
            options=['Pearson', 'Spearman'],
            description='Correlation:',
            value='Pearson',
            layout=widgets.Layout(width='300px', height='30px'),
            disabled=True
        )

        self.log_transform = widgets.Checkbox(
            value=True,
            description='Log10 transform data',
            layout=widgets.Layout(width='300px', height='50px'),
            disabled=True
        )

        # Add click handlers
        self.export_group_correlation_button.on_click(self._handle_group_correlation_download)
        self.export_replicate_correlation_button.on_click(self._handle_replicate_correlation_download)
        self.export_summed_peptide_results_button.on_click(self._handle_summed_peptide_download)
        self.export_sequence_list_button.on_click(self._handle_sequence_list_download)
        self.export_protein_data_button.on_click(self._handle_protein_download)
        self.export_summed_function_data_button.on_click(self._handle_summed_function_download)
        self.mbpdb_button.on_click(self._handle_mbpdb_download)
        self.group_data_button.on_click(self._handle_group_download)
        self.dataset_button.on_click(self._handle_dataset_download)

        # Store references to data
        self.mbpdb_df = None
        self.group_data = None
        self.merged_df = None

    def calculate_correlation(self, x, y):
        """Calculate correlation based on selected method"""
        if  self.correlation_type.value == 'Pearson':
            return pearsonr(x, y)[0]
        else:  # Spearman
            return spearmanr(x, y)[0]
        
    def prepare_data(self, data):
        """Prepare data based on log transform setting"""
        if  self.log_transform.value:
            return np.log10(data)
        return data
    
    def _convert_group_data_dict(self):
            # Calculate within-group correlations
            df = self.merged_df.copy()
            group_data = self.group_data
            
            # Find all columns that have the 'Grouped:' pattern
            grouped_columns = [col for col in df.columns if " 'Grouped:" in str(col)]
            
            # Create mapping for column renaming (strip the 'Grouped:' part)
            renamed_columns = {}
            for col in grouped_columns:
                base_col_name = col.split(" 'Grouped:")[0].strip()
                renamed_columns[col] = base_col_name
            
            # Rename the columns in the DataFrame to remove the 'Grouped:' part
            df = df.rename(columns=renamed_columns)
            
            # Update the abundance_columns in group_data to match the renamed columns
            updated_group_data = {}
            for key, value in group_data.items():
                grouping_variable = value['grouping_variable']
                abundance_columns = value['abundance_columns']
                
                # Create a new list of abundance columns that match the renamed columns
                updated_abundance_columns = []
                for col in abundance_columns:
                    # Find the matching column in the renamed DataFrame
                    matching_cols = [c for c in df.columns if c == col]
                    if matching_cols:
                        updated_abundance_columns.append(matching_cols[0])
                
                updated_group_data[key] = {
                    'grouping_variable': grouping_variable,
                    'abundance_columns': updated_abundance_columns
                }
            
            # Use the updated group data for further processing
            group_data = updated_group_data
            return group_data, df

    def _process_bioactive_data(self):
        """Process bioactive peptide data for visualization"""
        if self.merged_df is None:
            return None
            

        df = self.merged_df.copy()
        absorbance_cols = []
        selected_groups = []
        unique_function_absorbance = {}
        # Get Absorbance columns based on selected groups
        for _, value in self.group_data.items():
            grouping_variable = value['grouping_variable']
            abundance_columns = value['abundance_columns']        
            selected_groups.append(grouping_variable)
            if grouping_variable != abundance_columns:
                absorbance_cols.append(f'Avg_{grouping_variable}')
            else:
                absorbance_cols.append(abundance_columns)
            if 'function' not in df.columns:
                return None
            
            for column in absorbance_cols:
                            
                # Filter and process data
                temp_df = df[['unique ID', 'function', column]].copy()
                temp_df = temp_df[
                    (temp_df[column] != 0) & 
                    temp_df[column].notna() &
                    temp_df['function'].notna()
                ]
                
                if temp_df.empty:
                    continue
                
                # Process functions
                temp_df.loc[:, 'function'] = temp_df['function'].fillna('').str.split(';')
                exploded_df = temp_df.explode('function')
                exploded_df.loc[:, 'function'] = exploded_df['function'].str.strip()
                exploded_df = exploded_df[exploded_df['function'] != '']
                
                if not exploded_df.empty:
                    function_grouped = exploded_df.groupby('function')[column].sum()
                    unique_function_absorbance[grouping_variable] = function_grouped.to_dict()

        return unique_function_absorbance, absorbance_cols              
    
    def _process_functional_peptide_export_data(self):
        """Process data for export into Excel format"""
        unique_function_absorbance, absorbance_cols = self._process_bioactive_data()
        if not unique_function_absorbance:
            return None
            
        # Get all groups and functions
        groups = list(unique_function_absorbance.keys())
        all_functions = set()
        for group_data in unique_function_absorbance.values():
            all_functions.update(group_data.keys())
            
        # Calculate function counts
        df = self.merged_df.copy()
        summed_function_count = {}
        unique_function_counts = {}
        unique_function_count_averages = {}
        summed_function_abundance = {}
        
        for group, abundance_column in zip(groups, absorbance_cols):
            if abundance_column not in df.columns:
                continue
                
            # Filter and process data
            temp_df = df[['unique ID', 'function', abundance_column]].copy()
            temp_df = temp_df[
                (temp_df[abundance_column] != 0) & 
                temp_df[abundance_column].notna() &
                temp_df['function'].notna()
            ]
            
            # Drop duplicates and calculate counts
            filtered_df = temp_df.drop_duplicates(subset='unique ID')
            unique_peptide_count = filtered_df['unique ID'].nunique()
            total_sum = filtered_df[abundance_column].sum()
            
            # Store the totals
            summed_function_abundance[group] = total_sum
            summed_function_count[group] = unique_peptide_count
            
            # Process functions
            filtered_df.loc[:, 'function'] = filtered_df['function'].fillna('').str.split(';')
            exploded_df = filtered_df.explode('function')
            exploded_df.loc[:, 'function'] = exploded_df['function'].str.strip()
            exploded_df = exploded_df[exploded_df['function'] != '']
            
            if not exploded_df.empty:
                # Count functions
                function_counts = exploded_df['function'].value_counts().to_dict()
                unique_function_counts[group] = function_counts
                
                # Calculate averages (using 1 since we're using averaged columns)
                function_averages = {func: count for func, count in function_counts.items()}
                unique_function_count_averages[group] = function_averages
        
        # Create DataFrames for export
        peptide_count_df = pd.DataFrame.from_dict(
            summed_function_count,
            orient='index',
            columns=['Counts of peptides']
        )
        
        function_count_df = pd.DataFrame.from_dict(
            unique_function_counts,
            orient='index'
        ).fillna(0).astype(int)
        
        combined_count_df = pd.concat([peptide_count_df, function_count_df], axis=1).T
        
        # Create abundance DataFrames
        peptide_absorbance_df = pd.DataFrame.from_dict(
            summed_function_abundance,
            orient='index',
            columns=['Summed Absorbance']
        )
        
        function_absorbance_df = pd.DataFrame.from_dict(
            unique_function_absorbance,
            orient='index'
        ).fillna(0)
        
        combined_absorbance_df = pd.concat(
            [peptide_absorbance_df, function_absorbance_df],
            axis=1
        ).T
        
        # Create combined DataFrame with formatted values
        combined_df = pd.DataFrame(
            index=combined_absorbance_df.index,
            columns=combined_absorbance_df.columns
        )
        
        for col in combined_absorbance_df.columns:
            for idx in combined_absorbance_df.index:
                abundance = combined_absorbance_df.loc[idx, col]
                count = (combined_count_df.loc['Counts of peptides', col]
                        if idx == 'Summed Absorbance'
                        else combined_count_df.loc[idx, col])
                combined_df.loc[idx, col] = "-" if (abundance == 0 and count == 0) else f"{abundance:.2e} ({round(count)})"
        
        combined_df.rename(index={'Summed Absorbance': 'Total'}, inplace=True)
        
        return combined_df, combined_count_df, combined_absorbance_df

    def _export_summed_peptide_data(self, data):
        """
        Export peptide data to Excel with summary and replicate details.

        Two sheets are written: 'Summary' (one row per group) and
        'Replicate Details' (one row per replicate of each group). Column
        widths are sized to the longest cell value plus padding.

        Args:
            data (dict): Maps a group name to a dict with the keys
                'total_Absorbance', 'abundance_sem', 'unique_peptides',
                'count_sem' and 'replicate_data'; the latter holds
                'abundance_columns', 'replicate_abundances' and
                'replicate_counts', indexed in parallel.

        Returns:
            bytes: Excel file content as bytes, or None when the export
            fails (the error is printed).
        """
        try:
            # Create summary DataFrame
            summary_df = pd.DataFrame([
                {
                    'Group': group,
                    'Total_Absorbance': values['total_Absorbance'],
                    'Abundance_SEM': values['abundance_sem'],
                    'Unique_Peptides': values['unique_peptides'],
                    'Count_SEM': values['count_sem']
                }
                for group, values in data.items()
            ])

            # Create replicate details DataFrame (one row per replicate)
            replicate_rows = []
            for group, values in data.items():
                replicate_info = values['replicate_data']
                for i, replicate_name in enumerate(replicate_info['abundance_columns']):
                    replicate_rows.append({
                        'Group': group,
                        'Replicate': replicate_name,
                        'Total_Absorbance': replicate_info['replicate_abundances'][i],
                        'Unique_Peptides': replicate_info['replicate_counts'][i]
                    })
            replicate_df = pd.DataFrame(replicate_rows)

            # Create Excel file in memory; the buffer is closed even on error
            with io.BytesIO() as output:
                with pd.ExcelWriter(output, engine='openpyxl') as writer:
                    summary_df.to_excel(writer, sheet_name='Summary', index=False)
                    replicate_df.to_excel(writer, sheet_name='Replicate Details', index=False)

                    # Auto-adjust column widths for both sheets
                    for sheet in writer.sheets.values():
                        for column_cells in sheet.columns:
                            if not column_cells:
                                continue
                            # Take the letter before skipping empty cells: a
                            # column with only empty cells (e.g. an empty
                            # sheet) used to raise IndexError here and abort
                            # the whole export.
                            letter = column_cells[0].column_letter
                            max_length = max(
                                (len(str(cell.value))
                                 for cell in column_cells
                                 if cell.value is not None),
                                default=0
                            )
                            sheet.column_dimensions[letter].width = max_length + 2

                # Read the content after the writer has saved the workbook
                return output.getvalue()

        except Exception as e:
            print(f"Error exporting data: {str(e)}")
            return None
            
    def _export_protein_data(self):
        
        if self.merged_df is None:
            return False
        
        # First, update the protein list to plot
        self.pro_list = list(set(self.merged_df['protein_name']))
        
        df = self.merged_df.copy()
        absorbance_cols = []
        selected_groups = []
        # Get Absorbance columns based on selected groups
        for _, value in self.group_data.items():
            grouping_variable = value['grouping_variable']
            abundance_columns = value['abundance_columns']        
            selected_groups.append(grouping_variable)
            if grouping_variable != abundance_columns:
                absorbance_cols.append(f'Avg_{grouping_variable}')
            else:
                absorbance_cols.append(abundance_columns)
        df['Total_Absorbance'] = df[absorbance_cols].sum(axis=1).astype(int)
        
        # Filter out zero Absorbance entries
        result_df = df[['unique ID', 'Total_Absorbance']]
        result_df = result_df[result_df['Total_Absorbance'] == 0]
        all_zero_list = list(result_df['unique ID'])
        peptides_df = df[~df['unique ID'].isin(all_zero_list)]

        # Process protein positions and create proteins DataFrame
        additional_columns = ['Master Protein Accessions', 'unique ID']
        selected_columns = additional_columns + absorbance_cols
        
        peptides_df.loc[:, 'Master Protein Accessions'] = peptides_df['Master Protein Accessions']
        
        temp_df = peptides_df.copy()
        temp_df.loc[:, 'Protein_ID'] = temp_df['Master Protein Accessions']
        
        # Create proteins DataFrame with selected columns
        self.proteins_df = temp_df.groupby('Protein_ID').agg(
            {**{col: 'first' for col in ['Master Protein Accessions']},
            **{col: 'sum' for col in absorbance_cols}}
        ).reset_index()
        
        # Calculate relative Absorbance for selected groups
        for col in absorbance_cols:
            col_sum = self.proteins_df[col].sum()
            if col_sum > 0:  # Avoid division by zero
                self.proteins_df[f'Rel_{col}'] = (self.proteins_df[col] / col_sum) * 100
            else:
                self.proteins_df[f'Rel_{col}'] = 0
                
        # Create sum DataFrame for selected groups
        self.sum_df = pd.DataFrame({
            'Sample': absorbance_cols,
            'Total_Sum': [self.proteins_df[col].sum() for col in absorbance_cols]
        })
        
        # Add protein descriptions
        name_list = []
        for _, row in self.proteins_df.iterrows():
            if ',' in row['Protein_ID']:
                strrow = row['Protein_ID'].split(',')
                named_combo = self._fetch_protein_names('; '.join(strrow))
            else:
                named_combo = self._fetch_protein_names(row['Protein_ID'])
            name_list.append(named_combo)
        
        # Drop the 'Protein_ID' column
        self.proteins_df = self.proteins_df.drop(columns=['Protein_ID'])    
        
        self.proteins_df['Description'] = name_list
        self.proteins_df['Description'] = self.proteins_df['Description'].astype(str).str.replace(r"['\['\]]", "", regex=True)
        
        # Calculate average Absorbance for sorting using only selected groups
       
        # Calculate sum of all selected columns
        total_sum = self.proteins_df[absorbance_cols].sum().sum()
        
        # Calculate row sums
        row_sums = self.proteins_df[absorbance_cols].sum(axis=1)
        
        # Calculate relative percentage contribution
        self.proteins_df['avg_absorbance_all'] = (row_sums / total_sum * 100).round(2)
        
        # Sort proteins by abundance for consistent ordering
        self.proteins_df = self.proteins_df.sort_values('avg_absorbance_all', ascending=False)
                                
        # Create a dictionary to store the actual peptide counts per group
        self.peptide_count_totals = {}
        
        # Dictionary to store unique peptide counts per protein
        self.protein_peptide_counts = {}
        
        # Track which peptides belong to which proteins
        protein_to_peptides = defaultdict(set)
        
        # Track which peptides belong to which proteins in each group
        protein_to_group_peptides = defaultdict(lambda: defaultdict(set))
        
        # Determine counts based on merged_df and add to proteins_df
        if selected_groups and self.proteins_df is not None and df is not None:
            # Add count columns to the proteins_df (initialize with zeros)
            for group in selected_groups:
                count_col = f'Count_{group}'
                rel_count_col = f'Rel_Count_{group}'
                # Initialize with float64 dtype
                self.proteins_df[count_col] = pd.Series(dtype='float64')
                self.proteins_df[rel_count_col] = pd.Series(dtype='float64')
                # Set initial values to 0.0
                self.proteins_df[count_col] = 0.0
                self.proteins_df[rel_count_col] = 0.0
            
            # Create a mapping from accession to protein index in proteins_df
            accession_to_idx = {}
            accession_to_description = {}  # Map accessions to descriptions for counting
            for idx, row in self.proteins_df.iterrows():
                if 'Master Protein Accessions' in row and pd.notna(row['Master Protein Accessions']):
                    accession_to_idx[row['Master Protein Accessions']] = idx
                    accession_to_description[row['Master Protein Accessions']] = row['Description']
                elif 'Accession' in row and pd.notna(row['Accession']):
                    accession_to_idx[row['Accession']] = idx
                    accession_to_description[row['Accession']] = row['Description']
            
            # For each group, count peptides per protein
            for group in selected_groups:
                # Filter peptides that are present in this group
                group_peptides = df[df[f'Avg_{group}'] > 0]
                
                # Store the total number of peptides for this group
                self.peptide_count_totals[group] = len(group_peptides)
                
                # Track which peptides have already been counted
                counted_peptides = set()
                
                # Track warning stats
                peptides_with_no_accession = 0
                peptides_with_no_id = 0
                peptides_already_counted = 0
                peptides_with_multi_accessions = set()
                peptides_with_no_protein_match = 0
                
                # Count peptides for each protein
                for _, peptide in group_peptides.iterrows():
                    if 'Master Protein Accessions' not in peptide or pd.isna(peptide['Master Protein Accessions']):
                        peptides_with_no_accession += 1
                        continue
                        
                    # Get unique peptide ID to track counting
                    peptide_id = peptide.get('unique ID', None)
                    if peptide_id is None or pd.isna(peptide_id):
                        peptides_with_no_id += 1
                        continue  # Skip if no unique ID
                    
                    # Skip if we've already counted this peptide for this group
                    if peptide_id in counted_peptides:
                        peptides_already_counted += 1
                        continue
                    
                    accession = peptide['Master Protein Accessions']
                    found_match = False
                    
                    # Check if this peptide maps to multiple proteins
                    if ';' in accession:
                        peptides_with_multi_accessions.add(peptide_id)
                        accessions = [acc.strip() for acc in accession.split(';') if acc.strip()]
                        
                        # Only count for the first valid protein in the list
                        for acc in accessions:
                            if acc in accession_to_idx:
                                idx = accession_to_idx[acc]
                                count_col = f'Count_{group}'
                                self.proteins_df.at[idx, count_col] += 1
                                
                                # Add this peptide to the protein's set for protein-specific counting
                                protein_desc = accession_to_description.get(acc, acc)
                                protein_to_peptides[protein_desc].add(peptide_id)
                                protein_to_group_peptides[protein_desc][group].add(peptide_id)
                                
                                counted_peptides.add(peptide_id)  # Mark as counted
                                found_match = True
                                break  # Count only once
                    else:
                        # Handle direct match - only single protein
                        if accession in accession_to_idx:
                            idx = accession_to_idx[accession]
                            count_col = f'Count_{group}'
                            self.proteins_df.at[idx, count_col] += 1
                            
                            # Add this peptide to the protein's set for protein-specific counting
                            protein_desc = accession_to_description.get(accession, accession)
                            protein_to_peptides[protein_desc].add(peptide_id)
                            protein_to_group_peptides[protein_desc][group].add(peptide_id)
                            
                            counted_peptides.add(peptide_id)  # Mark as counted
                            found_match = True
                    
                    # Track peptides that didn't match any protein in our list
                    if not found_match:
                        peptides_with_no_protein_match += 1
                        
                # After counting all peptides for this group, calculate relative counts
                count_col = f'Count_{group}'
                rel_count_col = f'Rel_Count_{group}'
                total_value = self.peptide_count_totals[group]
                
                # Calculate relative counts as percentages of total peptides
                # When calculating relative counts
                if total_value > 0:
                    for idx in range(len(self.proteins_df)):
                        protein_count = float(self.proteins_df.at[idx, count_col])  # Ensure float
                        rel_value = (protein_count / total_value) * 100
                        self.proteins_df.at[idx, rel_count_col] = rel_value

                
                # Display warning about peptides mapping to multiple proteins
                warning_html = '<div style="color: orange; margin: 5px 0;"><b>Warning:</b> Peptide counting stats for group {0}:<br>'
                
                if peptides_with_no_accession > 0:
                    warning_html += f'• Skipped {peptides_with_no_accession} peptides with no accession<br>'
                    
                if peptides_with_no_id > 0:
                    warning_html += f'• Skipped {peptides_with_no_id} peptides with no unique ID<br>'
                    
                if peptides_already_counted > 0:
                    warning_html += f'• Skipped {peptides_already_counted} duplicate peptides (already counted)<br>'
                    
                if len(peptides_with_multi_accessions) > 0:
                    warning_html += f'• Found {len(peptides_with_multi_accessions)} peptides mapping to multiple proteins<br>'
                    warning_html += f'  (Each counted only once for the first matching protein)<br>'
                    
                if peptides_with_no_protein_match > 0:
                    warning_html += f'• {peptides_with_no_protein_match} peptides had no matching protein in the protein list<br>'
                    
                total_peptides = len(group_peptides)
                warning_html += f'• Total peptides processed: {total_peptides}, successfully counted: {len(counted_peptides)}'
                warning_html += '</div>'
                
                #display(HTML(warning_html.format(group)))

        # Calculate the number of unique peptides per protein
        for protein, peptides in protein_to_peptides.items():
            self.protein_peptide_counts[protein] = len(peptides)

        # Create a copy of the proteins DataFrame for protein sample distribution calculation
        working_df = self.proteins_df.copy()
        
        # Calculate protein distributions across samples (for both counts and absorbance)
        self.protein_sample_distribution = {}
        
        # Calculate data for major proteins (based on pro_list)
        major_proteins = []
        if hasattr(self, 'pro_list') and self.pro_list:
            major_proteins = self.pro_list
            
        # Add "Minor Proteins" data structures to hold aggregated values
        minor_proteins_data = {
            'counts': {group: 0 for group in selected_groups},
            'count_relative': {group: 0 for group in selected_groups},
            'absorbance': {group: 0 for group in selected_groups},
            'absorbance_relative': {group: 0 for group in selected_groups},
            'unique_peptide_count': 0,
            'total_value': 0,
            'total_absorbance': 0,
            'total_count': 0
        }
        
        # Counts to track minor proteins' peptides
        minor_proteins_peptides = set()
        
        # Process each protein
        for _, row in working_df.iterrows():
            protein_name = row['Description']
            
            # Skip if protein name is empty or NaN
            if pd.isna(protein_name) or not protein_name:
                continue
            
            # Initialize data structure for this protein
            protein_data = {
                'counts': {},
                'count_relative': {},
                'absorbance': {},
                'absorbance_relative': {},
                'unique_peptide_count': 0
            }
            
            # Get count values for each group
            count_values = {}
            absorbance_values = {}
            
            for group in selected_groups:
                # Get count values from proteins_df
                count_col = f'Count_{group}'
                if count_col in row:
                    count_values[group] = row[count_col]
                else:
                    count_values[group] = 0
                
                # Get absorbance values
                absorbance_col = f'Avg_{group}'
                if absorbance_col in row:
                    absorbance_values[group] = row[absorbance_col]
                else:
                    absorbance_values[group] = 0
            
            # Get the actual count of unique peptides for this protein (across all groups)
            if protein_name in protein_to_peptides:
                protein_data['unique_peptide_count'] = len(protein_to_peptides[protein_name])
            
            # Store the count and absorbance values
            protein_data['counts'] = count_values
            protein_data['absorbance'] = absorbance_values
            
            # Calculate totals as sums across groups
            protein_total_count = sum(count_values.values())
            protein_total_absorbance = sum(absorbance_values.values())
            
            protein_data['total_count'] = protein_total_count
            protein_data['total_absorbance'] = protein_total_absorbance
            
            # Calculate relative distributions
            # Count relative distribution - percentage of this protein's total count in each group
            if protein_total_count > 0:
                for group, count in count_values.items():
                    protein_data['count_relative'][group] = (count / protein_total_count) * 100
            else:
                for group in selected_groups:
                    protein_data['count_relative'][group] = 0
            
            # Absorbance relative distribution
            if protein_total_absorbance > 0:
                for group, absorbance in absorbance_values.items():
                    protein_data['absorbance_relative'][group] = (absorbance / protein_total_absorbance) * 100
            else:
                for group in selected_groups:
                    protein_data['absorbance_relative'][group] = 0
            
            # Add backward compatibility
            use_count = hasattr(self, 'abs_or_count') and ('count' in getattr(self, 'abs_or_count').value.lower() 
                                                        if hasattr(getattr(self, 'abs_or_count'), 'value') else True)
            
            if use_count:
                protein_data['total'] = protein_total_count
                protein_data['values'] = count_values
                protein_data['relative'] = protein_data['count_relative']
            else:
                protein_data['total'] = protein_total_absorbance
                protein_data['values'] = absorbance_values
                protein_data['relative'] = protein_data['absorbance_relative']
            
            # Check if this is a major or minor protein
            if major_proteins and protein_name not in major_proteins:
                # This is a minor protein - add its data to the minor proteins aggregated data
                for group in selected_groups:
                    minor_proteins_data['counts'][group] += count_values[group]
                    minor_proteins_data['absorbance'][group] += absorbance_values[group]
                
                # For minor proteins, track both the sum and the unique peptide count
                if protein_name in protein_to_peptides:
                    minor_proteins_peptides.update(protein_to_peptides[protein_name])
                
                minor_proteins_data['total_count'] += protein_total_count
                minor_proteins_data['total_absorbance'] += protein_total_absorbance
            else:
                # This is a major protein - store its individual data
                self.protein_sample_distribution[protein_name] = protein_data
        
        # Set unique peptide count for minor proteins
        minor_proteins_data['unique_peptide_count'] = len(minor_proteins_peptides)
        
        # Calculate relative distributions for minor proteins
        if minor_proteins_data['total_count'] > 0:
            for group in selected_groups:
                minor_proteins_data['count_relative'][group] = (minor_proteins_data['counts'][group] / minor_proteins_data['total_count'] * 100)
        
        if minor_proteins_data['total_absorbance'] > 0:
            for group in selected_groups:
                minor_proteins_data['absorbance_relative'][group] = (minor_proteins_data['absorbance'][group] / minor_proteins_data['total_absorbance'] * 100)
        
        # Add backward compatibility for minor proteins
        if use_count:
            minor_proteins_data['total'] = minor_proteins_data['total_count']
            minor_proteins_data['values'] = minor_proteins_data['counts']
            minor_proteins_data['relative'] = minor_proteins_data['count_relative']
        else:
            minor_proteins_data['total'] = minor_proteins_data['total_absorbance']
            minor_proteins_data['values'] = minor_proteins_data['absorbance']
            minor_proteins_data['relative'] = minor_proteins_data['absorbance_relative']
        
        # Print debug info for key proteins
        debug_proteins = ['Beta-casein']
        for protein in debug_proteins:
            if protein in self.protein_sample_distribution:
                data = self.protein_sample_distribution[protein]
                print(
                    f"\nProtein: {protein}\n"
                    f"Total count across all groups: {data['total_count']}\n"
                    f"Unique peptide count: {data.get('unique_peptide_count', 'N/A')}\n"
                    f"Total absorbance: {data['total_absorbance']:.2e}\n"
                    f"Sample distribution (% of protein's total):\n"
                    f"Count: {', '.join([f'{g}: {v:.1f}%' for g, v in data['count_relative'].items()])}\n"
                    f"Absorbance: {', '.join([f'{g}: {v:.1f}%' for g, v in data['absorbance_relative'].items()])}\n"
                    f"Sum of count percentages: {sum(data['count_relative'].values()):.1f}%"
                )
                
        return True
    
    def _fetch_protein_names(self, accession_str):
        """
        Fetch protein names from the proteins dictionary.
        Returns a list of protein names, using the full protein name.
        """
        names = []
        for acc in accession_str.split('; '):
            if acc in self.protein_dict:
                # Use the full protein name instead of splitting it
                name = self.protein_dict[acc]['name']
                names.append(name)
            else:
                names.append(acc)
        return names
    
    def _export_group_correlation_analysis(self, df, group_data):
        """
        Calculate and export correlation analysis to Excel.
        Returns bytes of Excel file content.
        """
        try:
            # Calculate cross-group correlations
            correlation_results = []
            avg_columns = {
                group_info['grouping_variable']: f"Avg_{group_info['grouping_variable']}"
                for group_info in group_data.values()
                if f"Avg_{group_info['grouping_variable']}" in df.columns
            }
            
            # Create Excel writer buffer
            buffer = io.BytesIO()
            with pd.ExcelWriter(buffer, engine='openpyxl') as writer:
                # Cross-group correlations
                for (group1, col1), (group2, col2) in combinations(avg_columns.items(), 2):
                    mask = (df[col1] > 0) & (df[col2] > 0)
                    if mask.sum() > 1:
                        values1 = self.prepare_data(df.loc[mask, col1])
                        values2 = self.prepare_data(df.loc[mask, col2])
                        correlation = self.calculate_correlation(values1, values2)
                        correlation_results.append({
                            'Group 1': group1,
                            'Group 2': group2,
                            'Correlation': round(correlation, 3),
                            'Number of Peptides': mask.sum()
                        })
                
                # Create and write cross-group correlation sheet
                if correlation_results:
                    cross_group_df = pd.DataFrame(correlation_results)
                    cross_group_df.to_excel(writer, sheet_name='Cross-Group Correlations', index=False)
                
                # Calculate summary statistics
                summary_stats = {
                    'Average Correlation': round(np.mean([r['Correlation'] for r in correlation_results]), 3),
                    'Min Correlation': round(min([r['Correlation'] for r in correlation_results]), 3),
                    'Max Correlation': round(max([r['Correlation'] for r in correlation_results]), 3),
                    'Total Comparisons': len(correlation_results)
                }
                
                # Write summary statistics
                pd.DataFrame([summary_stats]).to_excel(writer, sheet_name='Summary', index=False)
    
            return buffer.getvalue()
            
        except Exception as e:
            raise Exception(f"Error in group correlation analysis: {str(e)}")
    
    def _export_replicate_correlation_analysis(self):
        """
        Calculate and export replicate correlation analysis to Excel.
        Returns bytes of Excel file content.
        """
        try:
            group_data, df = self._convert_group_data_dict()
            within_group_correlations = {}
            for key, value in group_data.items():
                grouping_variable = value['grouping_variable']
                abundance_columns = value['abundance_columns']
                
                data = df[abundance_columns].copy()
                data = data[data.gt(0).all(axis=1)]  # Filter for rows where all values > 0
                
                if len(data) > 1:
                    data = self.prepare_data(data)
                    
                    # Calculate correlation matrix
                    method = 'pearson' if self.correlation_type.value == 'Pearson' else 'spearman'
                    correlation_matrix = data.corr(method=method)
                    
                    # Get lower triangle only to avoid redundancy
                    lower_triangle = correlation_matrix.where(
                        np.tril(np.ones(correlation_matrix.shape), k=-1).astype(bool)
                    )
                    
                    # Create pairs and get correlation values
                    pairs = []
                    values = []
                    for i in range(len(abundance_columns)):
                        for j in range(i):
                            pair_name = f"{abundance_columns[j]} vs {abundance_columns[i]}"
                            pairs.append(pair_name)
                            values.append(round(lower_triangle.iloc[i,j], 3))
                    
                    within_group_correlations[grouping_variable] = pd.Series(values)
    
            # Create Excel file
            buffer = io.BytesIO()
            with pd.ExcelWriter(buffer, engine='openpyxl') as writer:
                if within_group_correlations:
                    # Create summary sheet with all groups
                    combined_correlation_df = pd.concat(within_group_correlations, axis=1)
                    
                    # Calculate summary statistics
                    min_values = combined_correlation_df.min().round(3)
                    max_values = combined_correlation_df.max().round(3)
                    mean_values = combined_correlation_df.mean().round(3)
                    
                    summary_df = pd.DataFrame({
                        'Min': min_values,
                        'Max': max_values,
                        'Average': mean_values
                    }).T
                    
                    # Combine and write summary
                    combined_with_summary = pd.concat([combined_correlation_df, summary_df], axis=0)
                    combined_with_summary.to_excel(writer, sheet_name='Summary')
                    
                    # Create individual sheets for each group
                    for key, value in group_data.items():
                        grouping_variable = value['grouping_variable']
                        if grouping_variable in within_group_correlations:
                            values = within_group_correlations[grouping_variable]
                            pairs = []
                            for i in range(len(value['abundance_columns'])):
                                for j in range(i):
                                    pairs.append(f"{value['abundance_columns'][j]} vs {value['abundance_columns'][i]}")
                            
                            group_df = pd.DataFrame({
                                'Pair': pairs,
                                'Correlation': values
                            })
                            group_df.to_excel(writer, sheet_name=grouping_variable, index=False)
            
            return buffer.getvalue()
            
        except Exception as e:
            raise Exception(f"Error in replicate correlation analysis: {str(e)}")

    def summed_peptide_results(self):
        """
        Summarise peptide counts and summed abundance for every group.

        Group definitions and the data frame come from
        ``self._convert_group_data_dict()``. Abundance columns that are not
        present in the frame are reported and ignored. A peptide row is used
        when its 'unique ID' is set and at least one of the group's abundance
        columns holds a non-zero number.

        Returns:
            dict: Keyed by grouping variable. Each value contains
            'unique_peptides', 'total_Absorbance', 'total_sem',
            'abundance_sem', 'count_sem' and 'replicate_data' (the columns
            used plus per-replicate peptide counts and summed abundances).
            Groups without any usable column are omitted.
        """
        group_data, df = self._convert_group_data_dict()

        # Initialize dictionary to store results for all groups
        total_peptide_results_dict = {}

        for value in group_data.values():
            grouping_variable = value['grouping_variable']
            abundance_columns = value['abundance_columns']

            # Only work with columns that actually exist in the frame
            valid_abundance_cols = [col for col in abundance_columns if col in df.columns]
            missing_cols = [col for col in abundance_columns if col not in df.columns]
            if missing_cols:
                print(f"Warning: Abundance columns not found for group {grouping_variable}: {missing_cols}")
            if not valid_abundance_cols:
                print(f"Warning: No valid abundance columns found for group {grouping_variable}")
                continue

            temp_df = df[['unique ID'] + valid_abundance_cols].copy()

            # Convert abundance columns to numeric, forcing non-numeric values to NaN
            for col in valid_abundance_cols:
                temp_df[col] = pd.to_numeric(temp_df[col], errors='coerce')

            # NaN compares as "!= 0", so fill it first: a row needs a real non-zero value
            has_signal = temp_df[valid_abundance_cols].fillna(0).ne(0).any(axis=1)
            temp_df = temp_df[has_signal & temp_df['unique ID'].notna()]

            if temp_df.empty:
                print(f"Warning: No valid data for group {grouping_variable}")
                # Add empty results to maintain group in output
                total_peptide_results_dict[grouping_variable] = {
                    'unique_peptides': 0,
                    'total_Absorbance': 0,
                    'total_sem': 0,
                    'abundance_sem': 0,
                    'count_sem': 0,
                    'replicate_data': {
                        'abundance_columns': valid_abundance_cols,
                        'replicate_counts': [0] * len(valid_abundance_cols),
                        'replicate_abundances': [0] * len(valid_abundance_cols)
                    }
                }
                continue

            # Peptide count per replicate: unique IDs with a non-zero, non-null value
            replicate_counts = [
                temp_df.loc[temp_df[col].notna() & (temp_df[col] != 0), 'unique ID'].nunique()
                for col in valid_abundance_cols
            ]

            # SEM of the counts across replicates (sample standard deviation)
            if len(replicate_counts) > 1:
                count_sem = np.std(replicate_counts, ddof=1) / np.sqrt(len(replicate_counts))
            else:
                count_sem = 0

            # Abundance statistics: per-peptide mean across replicates, then summed
            abundances = temp_df[valid_abundance_cols].values.astype(float)
            peptide_means = np.nanmean(abundances, axis=1)
            total_abundance = np.nansum(peptide_means)

            # NOTE(review): unlike count_sem this uses the population std (ddof=0)
            # and divides by the number of columns even when some are NaN.
            # Kept as-is so reported values do not change; confirm intent.
            peptide_sems = np.nanstd(abundances, axis=1) / np.sqrt(abundances.shape[1])
            total_sem = np.sqrt(np.nansum(peptide_sems ** 2))

            # Unique peptides detected (> 0) in at least one replicate
            all_unique_peptides = temp_df[
                (temp_df[valid_abundance_cols] > 0).any(axis=1)
            ]['unique ID'].nunique()

            # Store results for this group
            total_peptide_results_dict[grouping_variable] = {
                'unique_peptides': all_unique_peptides,
                'total_Absorbance': total_abundance,
                'total_sem': total_sem,
                'abundance_sem': total_sem,
                'count_sem': count_sem,
                'replicate_data': {
                    'abundance_columns': valid_abundance_cols,
                    'replicate_counts': replicate_counts,
                    'replicate_abundances': [temp_df[col].replace(0, np.nan).sum() for col in valid_abundance_cols]
                }
            }

        return total_peptide_results_dict
                        
    def extract_sequences_by_name(self):
        """
        Build a table listing, per group, the peptides detected in that group.

        A peptide belongs to a group when the group's ``Avg_<group>`` column in
        ``self.merged_df`` is > 0. Groups whose average column is missing are
        reported with a warning and left out.

        Returns:
            pandas.DataFrame: One column per group (named after the grouping
            variable) holding that group's unique 'unique ID' values. Shorter
            columns are padded with None. The frame is empty when no group has
            an average column.
        """
        result_dict = {}
        available_columns = self.merged_df.columns

        for group_info in self.group_data.values():
            group_name = group_info['grouping_variable']
            avg_column = f"Avg_{group_name}"

            if avg_column not in available_columns:
                print(f"Warning: No valid columns found for grouping variable {group_name}")
                continue

            # Peptides with a positive group average count as present
            mask = self.merged_df[avg_column] > 0
            result_dict[group_name] = self.merged_df.loc[mask, 'unique ID'].unique().tolist()

        # default=0 keeps this from raising when no group contributed a column
        max_length = max((len(peptides) for peptides in result_dict.values()), default=0)

        # Pad every list to the same length so the columns line up
        padded_dict = {
            name: peptides + [None] * (max_length - len(peptides))
            for name, peptides in result_dict.items()
        }

        return pd.DataFrame(padded_dict)

    def _trigger_download(self, content, filename, mime_type):
        """Helper method to trigger file download"""
        if isinstance(content, str):
            content = content.encode('utf-8')
            
        b64_data = base64.b64encode(content).decode('utf-8')
        file_data = f"data:{mime_type};base64,{b64_data}"
        
        with self.status_output:
            self.status_output.clear_output(wait=True)
            display(HTML(f"""
                <div id="download_{filename}">
                    <a id="download_link_{filename}" 
                       href="{file_data}" 
                       download="{filename}"
                       style="display: none;"></a>
                    <script>
                        document.getElementById('download_link_{filename}').click();
                        setTimeout(() => {{
                            document.getElementById('download_{filename}').remove();
                        }}, 1000);
                    </script>
                </div>
            """))
            display(HTML(f'<div style="color: green">Successfully downloaded {filename}</div>'))

    def _handle_mbpdb_download(self, b):
        """Handle MBPDB results download"""
        try:
            if self.mbpdb_df is not None and 'function' in self.mbpdb_df.columns:
                timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
                filename = f"MBPDB_SEARCH_{timestamp}.tsv"
                content = self.mbpdb_df.to_csv(sep='\t', index=False)
                self._trigger_download(content, filename, 'text/tab-separated-values')
        except Exception as e:
            with self.status_output:
                self.status_output.clear_output(wait=True)
                display(HTML(f'<div style="color: red">Error downloading MBPDB results: {str(e)}</div>'))

    def _handle_group_download(self, b):
        """Handle group data download"""
        try:
            if self.group_data:
                # Convert enumerated format to simplified format
                simplified_data = {}
                for _, group_info in self.group_data.items():
                    group_name = group_info['grouping_variable']
                    abundance_cols = group_info['abundance_columns']
                    simplified_data[group_name] = abundance_cols
                
                # Create filename with timestamp
                timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
                filename = f"Categorical_variable_definitions_{timestamp}.json"
                
                # Convert to JSON with indentation
                content = json.dumps(simplified_data, indent=4)
                
                # Trigger download
                self._trigger_download(content, filename, 'application/json')
        except Exception as e:
            with self.status_output:
                self.status_output.clear_output(wait=True)
                display(HTML(f'<div style="color: red">Error downloading group data: {str(e)}</div>'))

    def _handle_dataset_download(self, b):
        """Handle Merged Dataset download"""
        try:
            if self.merged_df is not None:
                timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
                filename = f"Merged_Dataframe_{timestamp}.csv"
                content = self.merged_df.to_csv(index=False)
                self._trigger_download(content, filename, 'text/csv')
        except Exception as e:
            with self.status_output:
                self.status_output.clear_output(wait=True)
                display(HTML(f'<div style="color: red">Error downloading dataset: {str(e)}</div>'))
                
    def _handle_replicate_correlation_download(self, b):
        """Handle correlation export button click"""
        try:
            if self.merged_df is None or self.group_data is None:
                display(HTML('<div style="color: red; padding: 10px;">No data available for correlation analysis.</div>'))
                return
            
            # Get the Excel content
            excel_content = self._export_replicate_correlation_analysis()
            
            # Generate filename
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            correlation_type = self.correlation_type.value.lower()
            transform_type = "log10" if self.log_transform.value else "raw"
            filename = f"correlation_analysis_{correlation_type}_{transform_type}_{timestamp}.xlsx"
            
            self._trigger_download(excel_content, filename, 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')    
                
        except Exception as e:
            with self.status_output:
                self.status_output.clear_output(wait=True)
                display(HTML(f'<div style="color: red; padding: 10px;">Error exporting correlations: {str(e)}</div>'))
   
    def _handle_group_correlation_download(self, b):
        """Handle correlation export button click"""
        try:
            with self.status_output:
                self.status_output.clear_output(wait=True)
                
                if self.merged_df is None or self.group_data is None:
                    display(HTML('<div style="color: red; padding: 10px;">No data available for correlation analysis.</div>'))
                    return
                
                # Get the Excel content
                excel_content = self._export_group_correlation_analysis(
                    self.merged_df,
                    self.group_data
                )
                
                if excel_content is None:
                    display(HTML('<div style="color: red; padding: 10px;">No correlation data generated.</div>'))
                    return
                    
                # Generate filename
                timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
                correlation_type = self.correlation_type.value.lower()
                transform_type = "log10" if self.log_transform.value else "raw"
                filename = f"group_correlations_{correlation_type}_{transform_type}_{timestamp}.xlsx"
                
                # Create base64 encoded string for download
                b64_content = base64.b64encode(excel_content).decode()
                
                self._trigger_download(excel_content, filename, 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')
                
        except Exception as e:
            with self.status_output:
                self.status_output.clear_output(wait=True)
                display(HTML(f'<div style="color: red; padding: 10px;">Error exporting correlations: {str(e)}</div>'))
    def _handle_sequence_list_download(self, b):
        """Handle data export with automatic download"""
        sequences_by_name_df = self.extract_sequences_by_name()
        if self.merged_df is not None:
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            filename = f'List_of_Peptides_by_Sequences_{timestamp}.csv'
            csv_content = sequences_by_name_df.to_csv(index=False)
            self._trigger_download(csv_content, filename, 'text/csv')
        else:
            print("Please generate the analysis first.")
            
    def _handle_summed_peptide_download(self, b):
        summed_results = self.summed_peptide_results()
        if summed_results is not None:
            excel_content = self._export_summed_peptide_data(summed_results)
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        filename = f"summed_peptide_results_{timestamp}.xlsx"
        self._trigger_download(excel_content, filename, 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')
   
    def _handle_protein_download(self, b):
        self._export_protein_data()
        """Handle data export with automatic download"""
        if self.proteins_df is not None:
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            filename = f'protein_absorbance_analysis_{timestamp}.csv'
            csv_content = self.proteins_df.to_csv(index=False)
            self._trigger_download(csv_content, filename, 'text/csv')
        else:
            print("Please generate the analysis first.")
    
    def _handle_summed_function_download(self, b):
        """Handle summed function data download"""
        try:
            # Process the data
            combined_df, combined_count_df, combined_absorbance_df = self._process_functional_peptide_export_data()

            # Create Excel file in memory
            output = io.BytesIO()
            with pd.ExcelWriter(output, engine='openpyxl') as writer:
                combined_df.to_excel(writer, sheet_name='combined', index=True)
                combined_count_df.to_excel(writer, sheet_name='count', index=True)
                combined_absorbance_df.to_excel(writer, sheet_name='absorbance', index=True)
            
            # Get the value of the BytesIO buffer
            excel_data = output.getvalue()
            
            # Generate filename
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            filename = f"Processed_mbpdb_results_{timestamp}.xlsx"
            
            self._trigger_download(excel_data, filename, 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')
                
        except Exception as e:
            with self.export_output:
                self.export_output.clear_output(wait=True)
                display(HTML(f'<div style="color: red; padding: 10px;">Error exporting data: {str(e)}</div>'))

    def update_data(self, mbpdb_df=None, group_data=None, merged_df=None, protein_dict=None):
        """Store the latest datasets and sync which export controls are usable.

        Args:
            mbpdb_df: MBPDB results to keep on the manager (may be None).
            group_data: Group definitions; when falsy, every group-dependent
                control is disabled.
            merged_df: Merged dataset; when None, the dataset and protein
                export buttons are disabled.
            protein_dict: Protein lookup to keep on the manager (may be None).
        """
        self.mbpdb_df, self.group_data = mbpdb_df, group_data
        self.merged_df, self.protein_dict = merged_df, protein_dict

        # The MBPDB / summed-function buttons and the replicate-correlation
        # button are intentionally not toggled here.

        # Controls that need group definitions.
        groups_missing = not group_data
        for control in (
            self.export_sequence_list_button,
            self.export_summed_peptide_results_button,
            self.export_group_correlation_button,
            self.group_data_button,
            self.log_transform,
            self.correlation_type,
        ):
            control.disabled = groups_missing

        # Controls that need the merged dataset.
        dataset_missing = merged_df is None
        for control in (self.dataset_button, self.export_protein_data_button):
            control.disabled = dataset_missing

In [8]:
class DataProcessingController:
    def __init__(self, workflow):
        """Wire the processing controller to an existing workflow.

        Creates the export manager, the dataframe combiner, the
        "Generate/Update Data" button (initially disabled) and the output
        areas, then registers the callbacks that re-evaluate whether the
        button may be enabled.

        Args:
            workflow: Object exposing ``data_transformer``, ``group_processor``
                and ``protein_handler``.
        """
        self.workflow = workflow  # Store reference to workflow
        self.export_manager = ExportManager()

        transformer = workflow.data_transformer
        group_processor = workflow.group_processor
        self.data_transformer = transformer
        self.combiner = CombineAverageDataframes(
            transformer,
            group_processor,
            workflow.protein_handler,
        )
        self.merged_df = None

        # Button that starts processing; enabled once group data exists.
        button_options = dict(
            description=' Generate/Update Data',
            button_style='success',
            disabled=True,
            icon='refresh',
            layout=widgets.Layout(width='300px'),
            tooltip='Click to start data processing',
        )
        self.process_button = widgets.Button(**button_options)

        # One independent output area per UI region.
        for output_name in ('process_output', 'export_output', 'search_output',
                            'stats_output', 'grid_output'):
            setattr(self, output_name, widgets.Output())

        self.process_button.on_click(self._on_process_clicked)

        # Start with the export buttons disabled (no data yet).
        self.export_manager.update_data(None, None, None)

        # Instruction panel and tab headers in their initial state.
        self._initialize_instructions()
        self.create_headers_for_tabs()

        # Re-check the process button whenever group data may have changed.
        group_processor.group_uploader.observe(self._check_enable_process_button, names='value')
        for trigger in (group_processor.add_group_button, group_processor.no_group_button):
            trigger.on_click(self._check_enable_process_button)
        # Add observers for changes that would affect the step 4 message
        #self.workflow.data_transformer.mbpdb_uploader.observe(self._check_mbpdb_data_change, names='value')
        #self.workflow.data_transformer.mbpdb_results_from_search_placeholder.observe(self._check_mbpdb_data_change, names='value')
        

    def _check_enable_process_button(self, change=None):
        """Check if process button should be enabled based on group data"""
        # Enable button if group data exists
        has_group_data = bool(self.workflow.group_processor.group_data)
        self.process_button.disabled = not has_group_data
   
    def display_interactive_results(self, df):
        """Render the processed dataset in a searchable, read-only grid.

        Columns are grouped under a two-level header: 'Average Abundance' for
        names starting with ``Avg_``, 'MBPDB Search Results' for columns that
        also occur in ``self.data_transformer.mbpdb_results``, and
        'Peptidomic Data' for everything else. A text box filters rows
        case-insensitively. The term is used as a regular expression when it
        is a valid pattern and as literal text otherwise, so input such as
        ``(`` does not raise inside the widget callback.

        Args:
            df: DataFrame to show, or None to display a "No data to display"
                notice in ``self.stats_output``.
        """
        if df is None:
            with self.stats_output:
                self.stats_output.clear_output()
                display(HTML("<b style='color:red;'>No data to display</b>"))
            return

        # Create search widget
        search_widget = widgets.Text(
            placeholder='Search for data in rows...',
            description='Search:',
            layout=widgets.Layout(width='300px', height='30px', overflow='hidden'),
            style={'description_width': 'initial'}
        )

        # Resolve the MBPDB column names once instead of once per column, and
        # tolerate a transformer whose mbpdb_results is missing or None.
        mbpdb_results = getattr(self.data_transformer, 'mbpdb_results', None)
        if mbpdb_results is not None and not mbpdb_results.empty:
            mbpdb_columns = set(mbpdb_results.columns)
        else:
            mbpdb_columns = set()

        def get_column_category(col):
            """Determine category for each column"""
            if str(col).startswith('Avg_'):
                return 'Average Abundance'
            if col in mbpdb_columns:
                return 'MBPDB Search Results'
            return 'Peptidomic Data'

        # Create multi-level columns while preserving order
        column_tuples = [(get_column_category(col), col) for col in df.columns]

        df_display = df.copy()
        df_display.columns = pd.MultiIndex.from_tuples(column_tuples)

        def create_grid(df_to_display):
            grid = DataGrid(
                df_to_display,
                selection_mode='cell',
                editable=False,
                layout=widgets.Layout(height='600px')
            )
            grid.auto_fit_columns = True
            grid.base_row_size = 25
            grid.base_column_size = 150
            grid.auto_fit_params = {'area': 'column', 'padding': 10}
            return grid

        # df_display never changes after this point, so its string form is
        # built on the first search and reused for later keystrokes.
        search_cache = {}

        def find_matching_rows(term):
            """Return the rows of df_display with any cell matching ``term``."""
            if 'text' not in search_cache:
                search_cache['text'] = df_display.astype(str)
            try:
                re.compile(term)
                pattern = term
            except re.error:
                # Not a valid regular expression: search for it literally.
                pattern = re.escape(term)
            # Column-wise matching is vectorised; a row matches if any cell does.
            mask = search_cache['text'].apply(
                lambda column: column.str.contains(pattern, case=False, na=False)
            ).any(axis=1)
            return df_display[mask]

        def on_search_change(change):
            search_term = change['new'].strip()

            if search_term:
                visible_df = find_matching_rows(search_term)
                summary = f'<b style="color:green;">Found {len(visible_df)} matching rows out of {len(df_display)} total rows</b>'
            else:
                visible_df = df_display
                summary = f'<b style="color:orange;">No search term entered. Displaying all {len(df_display)} rows</b>'

            # Update stats output
            with self.stats_output:
                self.stats_output.clear_output()
                display(HTML(summary))

            # Always update grid output regardless of search term
            with self.grid_output:
                self.grid_output.clear_output()
                display(create_grid(visible_df))

        # Connect search widget to callback
        search_widget.observe(on_search_change, names='value')

        # Clear previous outputs
        self.search_output.clear_output()
        self.stats_output.clear_output()
        self.grid_output.clear_output()

        # Display search interface
        with self.search_output:
            display(search_widget)

        # Display initial stats
        with self.stats_output:
            display(HTML(f'<b style="color:blue;">Ready to search {len(df_display)} rows of data</b>'))

        # Initialize grid display
        with self.grid_output:
            display(create_grid(df_display))

    def _on_process_clicked(self, b):
        # Clear all outputs
        self.process_output.clear_output()
        self.search_output.clear_output()
        self.stats_output.clear_output()
        self.grid_output.clear_output()

        group_data = self.workflow.group_processor.group_data
        with self.process_output:
            # Pass the actual data_transformer, not the workflow
            self.combiner = CombineAverageDataframes(
                self.workflow.data_transformer,
                self.workflow.group_processor, 
                self.workflow.protein_handler
            )
            self.merged_df = self.combiner.process_data(group_data)
            
            if self.merged_df is not None:
                # Check if functional data exists in the processed data
                has_functional_data = (hasattr(self.merged_df, 'function') and 
                                    not self.merged_df['function'].isna().all())
                has_groups = bool(group_data)

                # Iterate through each key in the group_data dictionary
                has_no_reps = True  # Start with assumption that there are no replicates
                for group_id, group_info in group_data.items():
                    # Get the grouping variable name and abundance columns
                    abundance_columns = group_info['abundance_columns']
                    if len(abundance_columns) > 1:
                        has_no_reps = False
                        break

                # Check protein mapping status
                protein_mapping_submitted = False
                has_multiple_proteins = False
                
                # Check if protein mapping was submitted
                if hasattr(self.workflow.protein_handler, 'submit_button') and \
                hasattr(self.workflow.protein_handler, 'pd_results_cleaned'):
                    protein_mapping_submitted = True

                    
                    df = self.merged_df
                    
                    # Check Master Protein Accessions column
                    if 'Master Protein Accessions' in df.columns:
                        has_multiple_in_master = df['Master Protein Accessions'].str.contains(';', na=False).any()
                    else:
                        has_multiple_in_master = False
                        
                    # Check Positions in Proteins column
                    if 'Positions in Proteins' in df.columns:
                        has_multiple_in_positions = df['Positions in Proteins'].str.contains(';', na=False).any()
                    else:
                        has_multiple_in_positions = False
                        
                    has_multiple_proteins = has_multiple_in_master or has_multiple_in_positions
                
                # Update step 4 message to show success
                self._update_step4_message(
                    processed=True,
                    has_functional_data=has_functional_data,
                    has_groups=has_groups,
                    has_no_reps=has_no_reps,
                    protein_mapping_submitted=protein_mapping_submitted,
                    has_multiple_proteins=has_multiple_proteins
                )
                
                
                # Enable export buttons after successful processing
                self.export_manager.update_data(
                    mbpdb_df=self.workflow.data_transformer.mbpdb_results if hasattr(self.workflow.data_transformer, 'mbpdb_results') else None,
                    group_data=group_data,
                    merged_df=self.merged_df,
                    protein_dict=self.workflow.data_transformer.protein_dict if hasattr(self.workflow.data_transformer, 'protein_dict') else None,
                )
                
                self.display_interactive_results(self.merged_df)
            else:
                # Update step 4 message to show failure
                self._update_step4_message(
                    processed=False,
                    has_functional_data=False,
                    has_groups=bool(group_data),
                    has_no_reps=False,
                    protein_mapping_submitted=False,
                    has_multiple_proteins=False
                )
                
                display(HTML(f'<b style=\"color:red;\">Error: No data was processed.</b>'))
                # Keep export buttons disabled
                self.export_manager.update_data(None, None, None, None)

    def _on_export_clicked(self, b):
            """Handle export button click"""
            with self.export_output:
                self.export_output.clear_output(wait=True)
                
                # Update export manager with current data
                self.export_manager.update_data(
                    mbpdb_df=self.workflow.data_transformer.mbpdb_results if hasattr(self.workflow.data_transformer, 'mbpdb_results') else None,
                    group_data=self.workflow.group_processor.group_data,
                    merged_df=self.merged_df,
                    protein_dict=self.workflow.data_transformer.protein_dict if hasattr(self.workflow.data_transformer, 'protein_dict') else None,
                )
                
                # Display the export manager
                self.export_manager.display()
                
    def _initialize_instructions(self):
        """Initialize step four instructions with dynamic updating capability"""
        self.stepfour_base_message = """
            <div style='padding: 10px; background-color: #f8f9fa; border-left: 5px solid #007bff; margin: 10px 0;'>
                <h3>Step 4: Process & Export Data</h3>
                <p>Click the <b>Generate/Update Data</b> button to process your peptidomic data with the settings you've configured:</p>
                
                <details>
                    <summary><b>What happens when you generate data</b> (click to expand)</summary>
                    <ul style='list-style-type: circle;'>
                        <li>Combine your peptidomic data with MBPDB bioactivity information</li>
                        <li>Calculate group averages based on your defined categories</li>
                        <li>Apply protein mapping decisions to resolve multiple protein hits</li>
                        <li>Organize data with standardized column naming</li>
                    </ul>
                </details>
                
                <p>Use the <b>Explore & Export Data</b> tabs below to interactively view and download your processed data in various formats.</p>
                
                <details>
                    <summary><b>Available options after processing</b> (click to expand)</summary>
                    <ul style='list-style-type: circle;'>
                        <li>Merged Dataset: View and download the complete dataset with all peptide data</li>
                        <li>Functional Data: Export bioactivity information</li>
                        <li>Study Design: Export your study variable definitions</li>
                        <li>Correlative Analysis: Calculate and export correlation statistics</li>
                        <li>Peptide Summary: Export peptide counts and abundance values by group</li>
                        <li>Protein Analysis: Export protein-level distribution across experimental groups</li>
                        <li>List of Sequences: Export the peptide sequences by names for each experimental groups</li>
                    </ul>
                </details>
                {status_message}
            </div>
        """
        # Start with no status message 
        self.stepfour_output_html_message = self.stepfour_base_message.format(status_message="")
        
        self.stepfour_status_output = widgets.Output(
            layout=widgets.Layout(
                max_width='1000px',
                width='100%'
            )
        )
        with self.stepfour_status_output:
            display(HTML(self.stepfour_output_html_message))

    def _update_step4_message(self, processed=False, has_functional_data=False, has_groups=False,
                            has_no_reps=False, protein_mapping_submitted=False, has_multiple_proteins=False): 
        """Rebuild and redisplay the Step 4 status panel for the given state.

        Assembles three HTML fragments (processing steps, available exports,
        status banner), wraps them in a coloured container, stores the result
        on ``self.stepfour_output_html_message`` and re-renders it inside
        ``self.stepfour_status_output``.

        Side effects: besides the HTML, this method also sets ``disabled`` and
        ``button_style`` on ``self.export_manager.mbpdb_button``,
        ``export_summed_function_data_button`` and
        ``export_replicate_correlation_button`` in some branches (see the
        inline notes below).

        Args:
            processed: True when processing succeeded; switches the panel to
                the green "completed" styling and checkmark wording.
            has_functional_data: Whether functional data is present.
            has_groups: Whether study-variable groups are defined.
            has_no_reps: Whether no group has more than one abundance column.
            protein_mapping_submitted: Whether protein mapping was submitted.
            has_multiple_proteins: Whether peptides are still mapped to more
                than one protein.
        """
        background_color = "#f8f9fa"  # Default light gray background
        border_color = "#007bff"      # Default blue border
        
        if processed:
            # Data has been processed - show success with green background
            background_color = "#e8f5e9"  # Light green background
            border_color = "#4caf50"      # Green border
        
        # Generate dynamic processing steps list with checkmarks for completed steps
        if processed:
            steps_html = """
            <ul style='list-style-type: none;'>
                <li>✅ <b>Combined peptidomic data</b> into a unified <b>Merged Dataset</b></li>
            """
            
            # Add checkmark for MBPDB integration if functional data exists
            if has_functional_data:
                steps_html += """
                <li>✅ <b>Merged functional data</b> with peptidomic data</li>
                """
            else:
                steps_html += """
                <li>➤ <i>No functional data available to integrate</i></li>
                """
                
            # Add checkmark for group averages if groups exist
            if has_groups:
                # Groups without replicates: the text says abundance columns are
                # replicated into the "Avg_" columns instead of being averaged.
                if has_no_reps:
                    steps_html += """
                    <li>➤ <i>No group definitions available for averaging, abudnance columns are replicated for "Avg_" columns</i></li>
                    """
                else:
                    steps_html += """
                    <li>✅ <b>Calculated group averages</b> based on defined categories</li>
                    <li>✅ <b>Standardized column naming</b> with grouping information saved to abudnace columns and "Avg_" columns added</li>
                    """
            else:
                steps_html += """
                <li>➤ <i>No group definitions available for averaging</i></li>
                <li>✅ <b>Standardized basic column naming</b></li>
                """
            
            # Customize protein mapping message based on status
            if has_multiple_proteins:
                if protein_mapping_submitted:
                    steps_html += """
                    <li>➤ <i>Multiple protein are still mapped to shared peptides</i></li>
                    """
                else:
                    steps_html += """
                    <li>➤ <b>Used default protein mapping</b> <i>Multiple protein are still mapped to shared peptides</i></li>
                    """
            else:
                if protein_mapping_submitted:
                    steps_html += """
                    <li>✅ <b>Protein Mapping Processed</b>  no shared alignment detected - all peptides have unique protein assignments</li>
                    """
                else:
                    steps_html += """
                    <li>✅ <b>Used default protein mapping</b> no shared alignment detected - all peptides have unique protein assignments</li>
                    """

                
            steps_html += """
            </ul>
            """
        else:
            # Not processed yet, show pending steps
            steps_html = """
            <ul style='list-style-type: none;'>
                <li>➤ Combine your peptidomic data into a unified dataset</li>
            """
            
            if has_functional_data:
                steps_html += """
                <li>➤ Integrate MBPDB bioactivity information with peptidomic data</li>
                """
            else:
                steps_html += """
                <li>➤ <i>No functional data available to integrate</i></li>
                """
                
            if has_groups:
                steps_html += """
                <li>➤ Calculate group averages based on defined categories</li>
                <li>➤ Standardize column naming with grouping information</li>
                """
            else:
                steps_html += """
                <li>➤ <i>No group definitions available for averaging</i></li>
                <li>➤ Standardize basic column naming</li>
                """
            
            # Customize protein mapping message based on detected state
            if has_multiple_proteins:
                if protein_mapping_submitted:
                    steps_html += """
                    <li>➤ Apply protein mapping decisions to resolve multiple protein hits</li>
                    """
                else:
                    steps_html += """
                    <li>➤ Use default protein mapping for peptides with multiple protein assignments</li>
                    """
            else:
                steps_html += """
                <li>➤ Check for multiple protein mappings (none detected currently)</li>
                """
                
            steps_html += """
            </ul>
            """
        # Generate available exports list based on data availability
        export_html = """
        <p><b>Available export options:</b></p>
        <ul style='list-style-type: none;'>
        """
        
        # Always available if processed
        if processed:
            export_html += """
            <li>✅ <b>Merged Dataset:</b> Complete dataset with all peptide data</li>
            """
        else:
            export_html += """
            <li>➤ <b>Merged Dataset:</b> Complete dataset with all peptide data</li>
            """
        
        # Functional data exports
        # This section also drives the two functional-data buttons on the
        # export manager: enabled ('info') when processed with functional data,
        # disabled ('danger') when there is no functional data at all.
        if processed and has_functional_data:
            export_html += """
            <li>✅ <b>Functional Data:</b> Functional information avalibile</li>
            <li>✅ <b>Function Summary:</b> Summarized functional categories</li>
            """
            self.export_manager.mbpdb_button.button_style = 'info'
            self.export_manager.mbpdb_button.disabled = False
            self.export_manager.export_summed_function_data_button.button_style = 'info'
            self.export_manager.export_summed_function_data_button.disabled = False
        else:
            # Functional data present but not processed: only the text changes;
            # the button state is left as it was.
            # NOTE(review): the "No functional information avalible" wording is
            # shown here although has_functional_data is True - confirm intent.
            if has_functional_data:
                export_html += """
                <li>➤ <b>Functional Data:</b> No functional information avalible</li>
                <li>➤ <b>Function Summary:</b> Summarized functional categories</li>
                """
            else:
                export_html += """
                <li>❌ <b>Functional Data:</b> No bioactivity information available</li>
                <li>❌ <b>Function Summary:</b> No functional categories available</li>
                """
                self.export_manager.export_summed_function_data_button.button_style = 'danger'
                self.export_manager.export_summed_function_data_button.disabled = True
                self.export_manager.mbpdb_button.button_style = 'danger'
                self.export_manager.mbpdb_button.disabled = True
        # Group-based exports
        # Processed, grouped and with replicates: every group export is listed
        # as available and the replicate-correlation button is enabled.
        if processed and has_groups and not has_no_reps:
            export_html += """
            <li>✅ <b>Study Design:</b> Your defined study variable groups</li>
            <li>✅ <b>Peptide Summary:</b> Counts and abundance values by group</li>
            <li>✅ <b>Correlation Analysis:</b> Statistics between groups</li>
            <li>✅ <b>Correlation Analysis:</b> Statistics between replicates</li>
            <li>✅ <b>Protein Analysis:</b> Protein distribution across experimental groups</li>
            <li>✅ <b>Sequence Lists:</b> Peptide sequences by experimental groups</li>
            """
            self.export_manager.export_replicate_correlation_button.disabled = False
            self.export_manager.export_replicate_correlation_button.button_style = 'info'

        else:
            # NOTE(review): this branch does not test `processed`, so the
            # checkmarks below also appear for an unprocessed state that has
            # groups without replicates - confirm that is intended.
            if has_groups and has_no_reps:
                export_html += """
                <li>✅ <b>Study Design:</b> Your defined study variable with out groups or replicates declared</li>
                <li>➤ <b>Peptide Summary:</b> Counts and abundance valuesn, SEM data was not generated due to no replicate data</li>
                <li>✅ <b>Correlation Analysis:</b> Statistics between groups</li>
                <li>❌ <b>Correlation Analysis:</b> Statistics between replicates was not generated due to no replicate data</li>
                <li>✅ <b>Protein Analysis:</b> Protein distribution across experimental groups</li>
                <li>✅ <b>Sequence Lists:</b> Peptide sequences by experimental groups</li>
                """
                self.export_manager.export_replicate_correlation_button.disabled=True
                self.export_manager.export_replicate_correlation_button.button_style = 'danger'


            else:
                # NOTE(review): reached when no groups exist, but also when
                # processed is False with groups that do have replicates; in
                # that case "No study variables defined" is shown although
                # has_groups is True - confirm. The replicate-correlation
                # button is not touched in this branch.
                export_html += """
                <li>❌ <b>Study Design:</b> No study variables defined</li>
                <li>❌ <b>Peptide Summary:</b> Requires study variables</li>
                <li>❌ <b>Correlation Analysis:</b> Statistics between groups</li>
                <li>❌ <b>Correlation Analysis:</b> Statistics between replicates</li>
                <li>❌ <b>Protein Analysis:</b> Requires study variables</li>
                <li>❌ <b>Sequence Lists:</b> Requires study variables</li>
                """
        
        export_html += """
        </ul>
        """
        
        # Add status message based on processing state
        if processed:
            status_html = """
            <div style='background-color: #d4edda; padding: 8px; border-radius: 5px; margin-top: 10px;'>
                <p style='color: green; margin: 0;'><b>✅ Data processing completed successfully!</b> Navigate the tables below to view and export your data.</p>
                </div>
            """
            #if not has_functional_data and not has_groups:
            #    status_html += """
            #    <p style='color: orange; margin: 0;'>Note: Limited export options available due to missing functional data and study variables.</p>
            #    """
            #elif not has_functional_data:
            #    status_html += """
            #    <p style='color: orange; margin: 0;'>Note: Functional data exports unavailable (no functional data was provided or matched the list of sequences).</p>
            #    """
            #elif not has_groups:
            #    status_html += """
            #    <p style='color: orange; margin: 0;'>Note: Group-based exports unavailable (no study variables defined).</p>
            #    """
            
            #status_html += """
            #</div>
            #"""
        else:
            # Not processed: prompt either to run processing or to define
            # study variables first.
            if has_groups:
                status_html = """
                <div style='background-color: #fff3e0; padding: 8px; border-radius: 5px; margin-top: 10px;'>
                    <p style='color: #ff9800; margin: 0;'><b>⚠️ Ready to process data.</b> Click the Generate/Update Data button to proceed.</p>
                </div>
                """
            else:
                status_html = """
                <div style='background-color: #fff3e0; padding: 8px; border-radius: 5px; margin-top: 10px;'>
                    <p style='color: #ff9800; margin: 0;'><b>⚠️ Please define study variables in Step 3 before processing data.</b></p>
                </div>
                """
        
        # Create the complete HTML with dynamic background color
        message_html = f"""
            <div style='padding: 10px; background-color: {background_color}; border-left: 5px solid {border_color}; margin: 10px 0;'>
                <h3>Step 4: Process & Export Data</h3>
                <p>Click the <b>Generate/Update Data</b> button to process your peptidomic data:</p>
                
                <div style='margin-left: 15px;'>
                    <p><b>Processing steps:</b></p>
                    {steps_html}
                </div>
                
                <div style='margin-left: 15px;'>
                    {export_html}
                </div>
                
                {status_html}
            </div>
        """
        
        # Update the message with the new status
        self.stepfour_output_html_message = message_html
        
        # Update the display
        # clear_output here is the IPython function, called inside the Output
        # context manager, rather than the widget's own clear_output method.
        with self.stepfour_status_output:
            clear_output(wait=True)
            display(HTML(self.stepfour_output_html_message))
    
    def create_headers_for_tabs(self):
        """Store the descriptive HTML header text for each export tab on the instance.

        Sets seven string attributes (``merged_dataset_header``,
        ``functional_data_header``, ``study_definitions_header``,
        ``correlation_data_header``, ``summed_peptide_header``,
        ``protein_contribution_header`` and ``sequence_list_header``), each an
        HTML ``<p>`` snippet with an underlined title followed by a short
        description. Reads no instance state and returns ``None``.
        """
        # Create descriptive headers for each tab
        self.merged_dataset_header = "<p><u>Dataset Viewer & Exporter:</u><br> Download the complete merged dataset with all peptide data, bioactivity information, and group averages. Use the search box to filter and explore your data interactively.</p>"
        
        self.functional_data_header = "<p><u>Bioactivity Data:</u><br> Download information about peptide bioactivity from the MBPDB database, including both raw search results and summarized functional data with quantitative abundance and count values.</p>"
        
        self.study_definitions_header = "<p><u>Study Design:</u><br> Export your study variable definitions showing how samples were organized into experimental groups for analysis and visualization.</p>"
        
        self.correlation_data_header = "<p><u>Correlative Analysis:</u><br> Calculate and export correlation statistics between sample groups or technical replicates. Configure correlation type and data transformation options before exporting.</p>"
        
        self.summed_peptide_header = "<p><u>Peptide Summary:</u><br> Export summary data showing peptide counts and total abundance values for each experimental group, including both group averages and individual replicate details.</p>"
        
        self.protein_contribution_header = "<p><u>Protein Analysis:</u><br> Export protein-level analysis showing the relative contribution and distribution of proteins across your experimental groups based on both peptide counts and abundance values.</p>"
        
        # Typo fix: this user-facing text previously read "sequnces".
        self.sequence_list_header = "<p><u>Peptide Sequence List:</u><br> Export list of detected peptide unique IDs (sequences + modifications) by your experimental groups for venn diagram generation.</p>"

    def display(self):
        """Assemble and show the processing/export interface.

        Refreshes the tab header strings, builds a seven-page ``Tab`` widget of
        export controls (kept on ``self.button_container``), stacks it together
        with the status output, process button, process output and the export
        manager's status output in a fixed-width ``VBox``, and displays the result.
        """
        self.create_headers_for_tabs()
        exports = self.export_manager

        # Options row shown above the two correlation export buttons.
        correlation_options = widgets.HBox(
            [exports.correlation_type, exports.log_transform],
            layout=widgets.Layout(margin='0 0 15px 0'),
        )

        # One entry per tab, in display order: (tab title, widgets stacked in that tab).
        tab_specs = [
            ("Merged Dataset", [
                widgets.HTML(self.merged_dataset_header),
                exports.dataset_button,
                widgets.HTML("<br><u>Search & Explore Dataset</u>"),
                self.search_output,
                self.stats_output,
                self.grid_output,
            ]),
            ("Functional Data", [
                widgets.HTML(self.functional_data_header),
                exports.mbpdb_button,
                exports.export_summed_function_data_button,
            ]),
            ("Study Definitions", [
                widgets.HTML(self.study_definitions_header),
                exports.group_data_button,
            ]),
            ("Correlation Data", [
                widgets.HTML(self.correlation_data_header),
                correlation_options,
                exports.export_group_correlation_button,
                exports.export_replicate_correlation_button,
            ]),
            ("Peptide Summations", [
                widgets.HTML(self.summed_peptide_header),
                exports.export_summed_peptide_results_button,
            ]),
            ("Protein Contribution", [
                widgets.HTML(self.protein_contribution_header),
                exports.export_protein_data_button,
            ]),
            ("List of Sequences", [
                widgets.HTML(self.sequence_list_header),
                exports.export_sequence_list_button,
            ]),
        ]

        pages = [widgets.VBox(contents) for _, contents in tab_specs]
        tab = widgets.Tab(
            children=pages,
            layout=widgets.Layout(width='auto')  # Adjust width as needed
        )
        for position, (label, _) in enumerate(tab_specs):
            tab.set_title(position, label)

        self.button_container = tab

        # Fixed-width outer container holding every piece of the step.
        outer_layout = widgets.Layout(
            width='1000px',
            max_width='1000px',
            padding='5px',
            margin='10px 0',
            overflow='hidden'
        )
        stacked_widgets = [
            self.stepfour_status_output,
            self.process_button,
            self.process_output,
            self.button_container,
            exports.status_output,
        ]
        main_display = widgets.VBox(stacked_widgets, layout=outer_layout)

        # Inside the method body this name resolves to the module-level
        # display function, not to this method.
        display(main_display)

In [9]:
# Initialize workflow and controller
# Create the user-input workflow object and call its display() so its widgets
# appear in this cell's output (ProcessingWorkflow is defined in an earlier cell).
userinput_workflow = ProcessingWorkflow()
userinput_workflow.display()

# The controller is constructed with the workflow instance -- presumably so it
# can read the inputs collected there; confirm against
# DataProcessingController.__init__. Its display() call then renders the
# controller's own interface after the workflow's widgets.
process_and_export_controller = DataProcessingController(userinput_workflow)
process_and_export_controller.display()

VBox(children=(Output(layout=Layout(max_width='1000px', width='100%')), VBox(children=(HTML(value='\n         …

VBox(children=(Output(layout=Layout(max_width='1000px', width='100%')), RadioButtons(description='Process pept…

VBox(children=(Output(layout=Layout(max_width='1000px', width='100%')), GridspecLayout(children=(VBox(children…

VBox(children=(Output(layout=Layout(max_width='1000px', width='100%')), Button(button_style='success', descrip…