In [425]:
## Install/Import packages & define key variables and functions
# Run install script
# %chmod +x setup_jupyterlab.sh
# %./setup_jupyterlab.sh

# Import necessary libraries for the script to function.
import pandas as pd
import tempfile, csv, json, re, os, shutil, io, base64, time, subprocess, sqlite3, zipfile, base64
from io import StringIO, BytesIO
import numpy as np

#from django.conf import settings
from collections import defaultdict
from datetime import datetime

#import statsmodels.api as sm
#from statsmodels.formula.api import ols
#from statsmodels.stats.multicomp import pairwise_tukeyhsd
import warnings


from itertools import combinations
from ipydatagrid import DataGrid

from IPython.display import display, HTML, clear_output
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual

import traitlets
from traitlets import HasTraits, Instance, observe

# Module-level configuration and shared state

import _settings as settings

# Species lookup table read from the local `_settings` module. find_species()
# treats each entry as a sequence whose first element is the name to return and
# whose remaining elements are the terms searched for in a header.
# (A `global` statement is not needed here: names assigned at module scope are
# already module globals.)
spec_translate_list = settings.SPEC_TRANSLATE_LIST
# Set the default font to Calibri
#matplotlib.rcParams['font.family'] = 'Calibri'

# Create global output areas
protein_mapping_output = widgets.Output()
group_processing_output = widgets.Output()

def find_species(header, spec_translate_list):
    """Return the species name whose search terms occur in ``header``.

    Args:
        header: Text to inspect (matching is case-insensitive).
        spec_translate_list: Iterable of sequences. Element 0 of each entry is
            the name returned on a match; elements 1.. are the substrings
            looked for in ``header``. Element 0 itself is never searched for.

    Returns:
        Element 0 of the first entry with a matching term, or ``"unknown"``
        when no entry matches.
    """
    haystack = header.lower()
    for entry in spec_translate_list:
        # Only the aliases (everything after the first element) are matched.
        if any(alias.lower() in haystack for alias in entry[1:]):
            return entry[0]
    return "unknown"

def parse_headers():
    """Build a protein lookup table from ``protein_headers.txt``.

    Only lines starting with ``>`` are considered. Such a line is split on
    ``|`` and used only when it yields more than two fields and a non-empty
    second field; that second field becomes the dictionary key.

    Returns:
        dict mapping the second ``|`` field to
        ``{"name": <third field up to ' OS='>, "species": <find_species result>}``.
        When a key occurs more than once, the last occurrence wins.
    """
    fasta_dict = {}
    with open("protein_headers.txt", 'r') as handle:
        for raw_line in handle:
            header = raw_line.strip()
            if not header.startswith('>'):
                continue

            fields = header[1:].split('|')
            if len(fields) <= 2:
                continue

            accession = fields[1]
            if not accession:
                # Entries without an identifier are never stored.
                continue

            fasta_dict[accession] = {
                "name": re.split(r' OS=', fields[2])[0],
                # Species is looked up in the whole header line, not just one field.
                "species": find_species(header, spec_translate_list)
            }
    return fasta_dict


In [426]:
class DataTransformation(HasTraits):
    """Holds the loaded data frames and the widgets used to load/search them.

    The three DataFrame attributes below are traitlets ``Instance`` traits that
    accept a ``pandas.DataFrame`` or ``None``.
    """
    # Peptidomic data; assigned from _load_data() in _on_pd_upload_change.
    pd_results = Instance(pd.DataFrame, allow_none=True)
    # MBPDB data; assigned either from an uploaded file or from a database search.
    mbpdb_results = Instance(pd.DataFrame, allow_none=True)
    # pd_results_cleaned = Instance(pd.DataFrame, allow_none=True)
    # Initialised to an empty frame in __init__.
    search_results = Instance(pd.DataFrame, allow_none=True)
    protein_dict = {}  # Plain class-level dict (not a traitlets trait); __init__ rebinds it per instance

    def __init__(self):
        """Start with empty data frames, the default protein table and no widgets."""
        super().__init__()

        # Each DataFrame trait gets its own empty frame (same order as declared use).
        for frame_trait in ('pd_results', 'mbpdb_results', 'search_results'):
            setattr(self, frame_trait, pd.DataFrame())

        # Default protein lookup comes from the bundled header file.
        self.protein_dict = parse_headers()

        # Widget handles are filled in later by the UI set-up methods.
        for widget_slot in ('output_area', 'mbpdb_uploader', 'pd_uploader',
                            'fasta_uploader', 'search_widget', 'search_progress'):
            setattr(self, widget_slot, None)

    def create_download_link(self, file_path, label):
        """Return an HTML widget offering ``file_path`` as a download.

        Args:
            file_path: Path of the file to embed.
            label: Text shown as the link.

        Returns:
            ``widgets.HTML`` with a base64 ``data:`` link carrying the file
            content, or a red "not found" message when the path does not exist.
        """
        if not os.path.exists(file_path):
            # Nothing to embed: tell the user which path was missing.
            return widgets.HTML(f"""
                <span style="color: red; margin-left: 20px; font-size: 14px;">
                    File "{file_path}" not found!
                </span>
            """)

        # The whole file is inlined into the link as base64 text.
        with open(file_path, 'rb') as source:
            b64_content = base64.b64encode(source.read()).decode('utf-8')

        return widgets.HTML(f"""
                <a download="{os.path.basename(file_path)}" 
                   href="data:application/octet-stream;base64,{b64_content}" 
                   style="color: #0366d6; text-decoration: none; margin-left: 20px; font-size: 14px;">
                    {label}
                </a>
            """)

    def setup_search_ui(self, peptides):
        """Initialize and display the stand-alone search UI.

        Creates the similarity-threshold dropdown, the search button and a
        progress label, wires the button to ``_on_search_click`` and displays
        the resulting widget box.

        Args:
            peptides: Accepted for backward compatibility; not used here. The
                click handler extracts the sequences from ``self.pd_results``
                itself.
        """
        # Create dropdown for similarity threshold
        self.threshold_dropdown = widgets.Dropdown(
            options=list(range(0, 101, 10)),
            value=80,
            description='Similarity Threshold:',
            style={'description_width': 'initial'},
            layout=widgets.Layout(width='300px')
        )

        # Create search button
        self.search_button = widgets.Button(
            description='Search Peptides',
            button_style='primary',
            layout=widgets.Layout(width='200px')
        )

        # Progress indicator
        self.search_progress = widgets.HTML(
            value="",
            layout=widgets.Layout(margin='10px 0px')
        )

        children = [
            widgets.HBox([
                self.threshold_dropdown,
                self.search_button
            ], layout=widgets.Layout(align_items='center')),
            self.search_progress
        ]

        # _on_search_click renders into self.search_output_area. That widget is
        # otherwise only created by setup_data_loading_ui, so create (and show)
        # one here when it does not exist yet; an existing one is reused as-is
        # and not displayed a second time.
        if getattr(self, 'search_output_area', None) is None:
            self.search_output_area = widgets.Output()
            children.append(self.search_output_area)

        # Connect button click to handler
        self.search_button.on_click(self._on_search_click)

        # Create layout
        self.search_widget = widgets.VBox(children)

        display(self.search_widget)

    def _on_search_click(self, b):
        """Run the database search when the search button is pressed.

        All messages are rendered inside ``self.search_output_area``. On
        success ``self.mbpdb_results`` holds the (formatted) matches; on any
        exception it is reset to an empty DataFrame and the error is shown.

        Args:
            b: The button instance passed by ipywidgets (unused).
        """
        with self.search_output_area:
            clear_output()

            peptidomic = self.pd_results
            if peptidomic is None or peptidomic.empty:
                display(HTML("<b style='color:red'>Please upload peptidomic data first.</b>"))
                return

            try:
                # Pull the peptide sequences out of the uploaded table.
                self.peptides = self._extract_sequences(peptidomic)
                if not self.peptides:
                    display(HTML("<b style='color:red'>No valid sequences found in peptidomic data.</b>"))
                    return

                display(HTML(f"<b style='color:blue'>Found {len(self.peptides)} sequences. Searching database...</b>"))

                threshold = self.threshold_dropdown.value
                results = self._search_peptides_comprehensive(
                    self.peptides,
                    similarity_threshold=threshold
                )

                if results.empty:
                    # Keep the empty frame so downstream code sees "no data".
                    self.mbpdb_results = results
                    display(HTML("<b style='color:orange'>No matches found in the database.</b>"))
                else:
                    self.mbpdb_results = self._format_search_results_with_matches(results)
                    display(
                        HTML(f"<b style='color:green'>Search complete! Found {len(self.mbpdb_results)} matches</b>"))

            except Exception as exc:
                display(HTML(f"<b style='color:red'>Error: {str(exc)}</b>"))
                self.mbpdb_results = pd.DataFrame()

    def _search_peptides_comprehensive(self, peptides, similarity_threshold=100):
        """Search the peptide database for each query peptide.

        A peptide is looked up by exact sequence when ``similarity_threshold``
        is 100 or the peptide is shorter than 4 residues. Otherwise it is run
        through ``blastp`` (task ``blastp-short``, IDENTITY matrix) against a
        FASTA dump of all database peptides, and hits whose computed similarity
        reaches the threshold are fetched from the database.

        Args:
            peptides: Iterable of peptide sequences to search for.
            similarity_threshold: Minimum similarity (percent) for BLAST hits.

        Returns:
            DataFrame produced by ``_combine_results`` from the per-peptide
            result frames.

        Raises:
            subprocess.CalledProcessError: when ``makeblastdb``/``blastp`` fail.
            Database errors from pandas/sqlite3 propagate unchanged.
        """
        # WORK_DIRECTORY = '/home/kuhfeldrf/mbpdb/include/peptide/uploads/temp'
        # conn = sqlite3.connect('/home/kuhfeldrf/mbpdb/include/peptide/db.sqlite3')
        WORK_DIRECTORY = '../../uploads/temp'

        exact_match_query = """
                SELECT DISTINCT
                    ? as search_peptide,
                    pi.pid as protein_id,
                    p.id as peptide_id,
                    p.peptide,
                    pi.desc as protein_description,
                    pi.species,
                    p.intervals,
                    f.function,
                    r.additional_details,
                    r.ic50,
                    r.inhibition_type,
                    r.inhibited_microorganisms,
                    r.ptm,
                    r.title,
                    r.authors,
                    r.abstract,
                    r.doi,
                    'sequence' as search_type,
                    'IDENTITY' as scoring_matrix
                FROM peptide_peptideinfo p
                JOIN peptide_proteininfo pi ON p.protein_id = pi.id
                LEFT JOIN peptide_function f ON f.pep_id = p.id
                LEFT JOIN peptide_reference r ON r.func_id = f.id
                WHERE p.peptide = ?
                """

        work_path = self._create_work_directory(WORK_DIRECTORY)
        fasta_db_path = os.path.join(work_path, "db.fasta")
        query_path = os.path.join(work_path, "query.fasta")
        output_path = os.path.join(work_path, "blastp_short.out")

        results = []
        blast_db_ready = False

        conn = sqlite3.connect('../../db.sqlite3')
        try:
            # Iterate the argument (not self.peptides) so the method honours
            # whatever list the caller passes in.
            for peptide in peptides:
                if similarity_threshold == 100 or len(peptide) < 4:
                    df = pd.read_sql_query(exact_match_query, conn, params=[peptide, peptide])
                    results.append(df)
                    continue

                # Build the BLAST database lazily: exact-match-only searches
                # then need neither the full peptide dump nor makeblastdb.
                if not blast_db_ready:
                    db_peptides = pd.read_sql_query(
                        "SELECT p.id, p.peptide FROM peptide_peptideinfo p", conn
                    )
                    with open(fasta_db_path, 'w') as f:
                        for pep_id, sequence in zip(db_peptides['id'], db_peptides['peptide']):
                            f.write(f">{pep_id}\n{sequence}\n")
                    self._make_blast_db(fasta_db_path)
                    blast_db_ready = True

                # Run BLASTP search for similarity matching
                with open(query_path, "w") as query_file:
                    query_file.write(f">pep_query\n{peptide}\n")

                blast_args = [
                    "blastp",
                    "-query", query_path,
                    "-db", fasta_db_path,
                    "-outfmt", "6 std ppos qcovs qlen slen positive",
                    "-evalue", "1000",
                    "-word_size", "2",
                    "-matrix", "IDENTITY",
                    "-threshold", "1",
                    "-task", "blastp-short",
                    "-out", output_path
                ]
                subprocess.check_output(blast_args, stderr=subprocess.STDOUT)

                # Fresh per-peptide store so details from one query can never
                # be attached to the rows of another.
                extra_info = defaultdict(list)
                search_ids = self._process_blast_results(output_path, similarity_threshold, extra_info)

                if search_ids:
                    df = self._fetch_peptide_data(conn, peptide, search_ids)
                    self._add_blast_details(df, extra_info)
                    results.append(df)
        finally:
            # Always release the connection and prune old work directories,
            # even when BLAST or a query fails part-way through.
            conn.close()
            self._cleanup_work_directory(WORK_DIRECTORY)

        return self._combine_results(results)

    def _create_work_directory(self, base_dir):
        """Create a working directory for BLAST operations"""
        path = os.path.join(base_dir, f'work_{int(round(time.time() * 1000))}')
        os.makedirs(path)
        return path

    def _make_blast_db(self, library_fasta_path):
        """Run ``makeblastdb`` (protein type) on the given FASTA file.

        stderr is folded into stdout; a non-zero exit status raises
        ``subprocess.CalledProcessError``.
        """
        command = ['makeblastdb', '-in', library_fasta_path, '-dbtype', 'prot']
        subprocess.check_output(command, stderr=subprocess.STDOUT)

    def _process_blast_results(self, output_path, similarity_threshold, extra_info):
        """Process BLAST results and collect search IDs"""
        search_ids = []
        csv.register_dialect('blast_dialect', delimiter='\t')

        with open(output_path, "r") as output_file:
            blast_data = csv.DictReader(
                output_file,
                fieldnames=['query', 'subject', 'percid', 'align_len', 'mismatches',
                            'gaps', 'qstart', 'qend', 'sstart', 'send', 'evalue',
                            'bitscore', 'ppos', 'qcov', 'qlen', 'slen', 'numpos'],
                dialect='blast_dialect'
            )

            for row in blast_data:
                tlen = float(row['slen']) if float(row['slen']) > float(row['qlen']) else float(row['qlen'])
                simcalc = 100 * ((float(row['numpos']) - float(row['gaps'])) / tlen)

                if simcalc >= similarity_threshold:
                    search_ids.append(row['subject'])
                    extra_info[row['subject']] = [
                        f"{simcalc:.2f}", row['qstart'], row['qend'], row['sstart'],
                        row['send'], row['evalue'], row['align_len'], row['mismatches'],
                        row['gaps']
                    ]

        return search_ids

    def _fetch_peptide_data(self, conn, peptide, search_ids):
        """Fetch peptide data from database"""
        placeholders = ','.join(['?' for _ in search_ids])
        query = f"""
        SELECT DISTINCT
            ? as search_peptide,
            pi.pid as protein_id,
            p.id as peptide_id,
            p.peptide,
            pi.desc as protein_description,
            pi.species,
            p.intervals,
            f.function,
            r.additional_details,
            r.ic50,
            r.inhibition_type,
            r.inhibited_microorganisms,
            r.ptm,
            r.title,
            r.authors,
            r.abstract,
            r.doi,
            'sequence' as search_type,
            'IDENTITY' as scoring_matrix
        FROM peptide_peptideinfo p
        JOIN peptide_proteininfo pi ON p.protein_id = pi.id
        LEFT JOIN peptide_function f ON f.pep_id = p.id
        LEFT JOIN peptide_reference r ON r.func_id = f.id
        WHERE p.id IN ({placeholders})
        """

        return pd.read_sql_query(query, conn, params=[peptide] + search_ids)

    def _add_blast_details(self, df, extra_info):
        """Add BLAST details to DataFrame"""
        for idx, row in df.iterrows():
            if str(row['peptide_id']) in extra_info:
                blast_details = extra_info[str(row['peptide_id'])]
                df.at[idx, '% Alignment'] = blast_details[0]
                df.at[idx, 'Query start'] = blast_details[1]
                df.at[idx, 'Query end'] = blast_details[2]
                df.at[idx, 'Subject start'] = blast_details[3]
                df.at[idx, 'Subject end'] = blast_details[4]
                df.at[idx, 'e-value'] = blast_details[5]
                df.at[idx, 'Alignment length'] = blast_details[6]
                df.at[idx, 'Mismatches'] = blast_details[7]
                df.at[idx, 'Gap opens'] = blast_details[8]

    def _cleanup_work_directory(self, work_directory):
        """Clean up old work directories"""
        try:
            dirs = [f for f in os.scandir(work_directory) if f.is_dir()]
            dirs.sort(key=lambda x: os.path.getmtime(x.path), reverse=True)

            for dir_entry in dirs[25:]:
                try:
                    shutil.rmtree(dir_entry.path)
                except Exception:
                    pass
        except Exception:
            pass

    def _combine_results(self, results):
        """Combine and format final results"""
        if not results:
            mbpdb_columns = [
                'search_peptide', 'protein_id', 'peptide', 'protein_description',
                'species', 'intervals', 'function', 'additional_details', 'ic50',
                'inhibition_type', 'inhibited_microorganisms', 'ptm', 'title',
                'authors', 'abstract', 'doi', 'search_type', 'scoring_matrix'
            ]
            return pd.DataFrame(columns=mbpdb_columns)

        final_results = pd.concat(results, ignore_index=True)

        if 'peptide_id' in final_results.columns:
            final_results = final_results.drop('peptide_id', axis=1)

        sort_columns = ['search_peptide']
        if '% Alignment' in final_results.columns:
            sort_columns.append('% Alignment')

        return final_results.sort_values(
            sort_columns,
            ascending=[True] + [False] * (len(sort_columns) - 1)
        )

    def _format_search_results_with_matches(self, final_results):
        """Format search results with matches"""
        if '% Alignment' in final_results.columns:
            final_results['% Alignment'] = pd.to_numeric(
                final_results['% Alignment'],
                errors='coerce'
            )

        grouped = final_results.groupby(["search_peptide", "function"], as_index=False)
        aggregated_results = []
        processed_indices = set()

        for _, group in grouped:
            if len(group) > 1:
                aggregated_row = self._aggregate_group_data(group)
                aggregated_results.append(aggregated_row)
                processed_indices.update(group.index)

        remaining_rows = final_results.loc[~final_results.index.isin(processed_indices)]
        aggregated_df = pd.DataFrame(aggregated_results)

        return pd.concat([aggregated_df, remaining_rows], ignore_index=True)

    def _aggregate_group_data(self, group):
        """Aggregate data for a group of results"""

        def enumerate_field(field):
            if field in group.columns and not group[field].dropna().empty:
                valid_values = set(group[field].dropna().astype(str).str.strip())
                valid_values = {val for val in valid_values if val != ''}
                if len(valid_values) > 1:
                    return "; ".join([f"{i + 1}) {val}" for i, val in enumerate(valid_values)])
                elif len(valid_values) == 1:
                    return next(iter(valid_values))
                return ''
            return ''

        return {col: enumerate_field(col) for col in group.columns}

    def setup_data_loading_ui(self):
        """Initialize and display the data loading UI with integrated search and help tooltips.

        Creates the three upload widgets (peptidomic, MBPDB, FASTA), the
        database-search controls and two output areas, registers the change /
        click handlers and displays everything in one container. The created
        widgets are stored on ``self`` (``pd_uploader``, ``mbpdb_uploader``,
        ``fasta_uploader``, ``threshold_dropdown``, ``search_button``,
        ``output_area``, ``search_output_area``).
        """

        def create_help_icon(tooltip_text):
            """Create a help icon widget with tooltip"""
            icon_markup = '<i class="fa fa-question-circle" style="color: #007bff;"></i>'
            return widgets.HTML(
                f'<div title="{tooltip_text}" style="display: inline-block;">{icon_markup}</div>'
            )

        def create_labeled_uploader(widget, label, tooltip):
            """Create an uploader with a help icon (``label`` is currently not rendered)"""
            return widgets.HBox([
                widget,
                create_help_icon(tooltip)
            ], layout=widgets.Layout(align_items='center'))

        # Create file upload widgets with the same configurations
        self.mbpdb_uploader = widgets.FileUpload(
            accept='.csv,.txt,.tsv,.xlsx',
            multiple=False,
            description='Upload MBPDB File',
            layout=widgets.Layout(width='300px'),
            style={'description_width': 'initial'}
        )

        self.pd_uploader = widgets.FileUpload(
            accept='.csv,.txt,.tsv,.xlsx',
            multiple=False,
            description='Upload Peptidomic File',
            layout=widgets.Layout(width='300px'),
            style={'description_width': 'initial'}
        )

        self.fasta_uploader = widgets.FileUpload(
            accept='.fasta',
            multiple=True,
            description='Upload FASTA Files',
            layout=widgets.Layout(width='300px'),
            style={'description_width': 'initial'}
        )

        # Create search interface
        self.threshold_dropdown = widgets.Dropdown(
            options=list(range(0, 101, 10)),
            value=80,
            description='Similarity Threshold (%):',
            style={'description_width': 'initial'},
            layout=widgets.Layout(width='225px')
        )

        self.search_button = widgets.Button(
            description='Search Database',
            button_style='primary',
            layout=widgets.Layout(width='150px')
        )

        # Create output areas
        self.output_area = widgets.Output()
        self.search_output_area = widgets.Output()

        # The opening <div> previously lacked its style=' prefix, which left a
        # malformed tag; it now mirrors the "Option 2" header markup.
        mbpdb_box = widgets.HBox([
            widgets.HTML("""
                    <div style='margin-bottom: 5px;'>
                        <b>Option 1: Upload File</b>
                    </div>
                """),
            self.create_download_link(
                "example_MBPDB_search.tsv",
                "Example"
            )
        ])
        # Create MBPDB options section
        mbpdb_options = widgets.HBox([widgets.VBox([
            mbpdb_box,
            create_labeled_uploader(
                self.mbpdb_uploader,
                "MBPDB File",
                "Upload your own MBPDB file (optional)"
            )
        ]),
            widgets.HTML("<div style='margin: 0 20px; line-height: 100px;'><b>OR</b></div>"),
            widgets.VBox([
                widgets.HTML("<div style='font-weight: bold; margin-bottom: 5px;'>Option 2: Search Database</div>"),
                widgets.HBox([
                    self.threshold_dropdown,
                    self.search_button,
                    create_help_icon("Search peptides against the MBPDB (optional)")
                ], layout=widgets.Layout(align_items='center'))
            ])
        ], layout=widgets.Layout(align_items='center', margin='0'))

        # Create peptide file uploader box with example link
        peptide_box = widgets.HBox([
            create_labeled_uploader(
                self.pd_uploader,
                "Peptidomic File",
                "Upload peptide groups data from Proteome Discover export file (required)"
            ),
            self.create_download_link(
                "example_peptide_data.csv",
                "Example"
            )
        ], layout=widgets.Layout(align_items='center'))

        # Create FASTA uploader box with example link
        fasta_box = widgets.HBox([
            create_labeled_uploader(
                self.fasta_uploader,
                "FASTA Files",
                "Upload Protein FASTA file used in Proteome Discoverer Search (optional)"
            ),
            self.create_download_link(
                "example_fasta.fasta",
                "Example"
            )
        ], layout=widgets.Layout(align_items='center'))

        # Create main container
        main_container = widgets.VBox([
            widgets.HTML("<h3><u>Upload Peptidomic Data Files:</u></h3>"),
            peptide_box,
            widgets.HTML("<h3 style='margin-bottom: 0;'><u>MBPDB Data (Optional):</u></h3>"),
            mbpdb_options,
            widgets.HTML("<h3><u>Upload Protein FASTA Files (Optional):</u></h3>"),
            fasta_box,
            widgets.HTML("<br>"),
            widgets.HTML("<div style='margin-top: 10px;'></div>"),
            self.output_area,
            self.search_output_area
        ])

        # Register observers
        self.pd_uploader.observe(self._on_pd_upload_change, names='value')
        self.mbpdb_uploader.observe(self._on_mbpdb_upload_change, names='value')
        self.fasta_uploader.observe(self._on_fasta_upload_change, names='value')
        self.search_button.on_click(self._on_search_click)

        # Add Font Awesome CSS for help icons
        display(widgets.HTML("""
            <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.15.4/css/all.min.css">
        """))

        display(main_container)
    
    def _extract_sequences(self, df):
        """Extract sequences from peptidomic data"""
        if 'Sequence' not in df.columns:
            # First create Sequence column with NaN values
            df['Sequence'] = pd.NA
            
            def extract_sequence(annotated_seq):
                if pd.isna(annotated_seq):
                    return pd.NA
                
                # Split by comma if present to handle multiple sequences
                if ',' in annotated_seq:
                    sequences = []
                    for seq in annotated_seq.split(','):
                        seq = seq.strip()
                        # Handle [X].SEQUENCE.[X] format
                        if '.' in seq:
                            parts = seq.split('.')
                            if len(parts) > 1:
                                sequences.append(parts[1])
                        # Handle plain sequence
                        else:
                            sequences.append(seq)
                    return sequences
                
                # Single sequence case
                # Handle [X].SEQUENCE.[X] format
                if '.' in annotated_seq:
                    parts = annotated_seq.split('.')
                    if len(parts) > 1:
                        return parts[1]
                
                # Handle plain sequence
                return annotated_seq
            
            # Apply the extraction function and explode the results
            df['Sequence'] = df['Annotated Sequence'].apply(extract_sequence)
            # Explode sequences if they're in a list (from comma separation)
            df = df.explode('Sequence')
            
        return df['Sequence'].dropna().unique().tolist()

    def _on_pd_upload_change(self, change):
        if change['type'] == 'change' and change['name'] == 'value':
            with self.output_area:
                self.output_area.clear_output()
                if change['new'] and len(change['new']) > 0:
                    file_data = change['new'][0]
                    self.pd_results, pd_status = self._load_data(
                        file_data,
                        required_columns=['Positions in Proteins'],
                        file_type='Peptidomic'
                    )
                    if pd_status == 'yes' and self.pd_results is not None:
                        display(HTML(
                            f'<b style="color:green;">Peptidomic data imported with {self.pd_results.shape[0]} rows and {self.pd_results.shape[1]} columns.</b>'))

    def _on_mbpdb_upload_change(self, change):
        if change['type'] == 'change' and change['name'] == 'value':
            with self.output_area:
                self.output_area.clear_output()
                if change['new'] and len(change['new']) > 0:
                    file_data = change['new'][0]
                    self.mbpdb_results, mbpdb_status = self._load_data(
                        file_data,
                        required_columns=['Search peptide', 'Protein ID', 'Peptide'],
                        file_type='MBPDB'
                    )
                    if mbpdb_status == 'yes' and self.mbpdb_results is not None:
                        self.mbpdb_results.rename(columns={
                            'Search peptide': 'search_peptide',
                            'Protein ID': 'protein_id',
                            'Peptide': 'peptide',
                            'Protein description': 'protein_description',
                            'Species': 'species',
                            'Intervals': 'intervals',
                            'Function': 'function',
                            'Additional details': 'additional_details',
                            'IC50 (μM)': 'ic50',
                            'Inhibition type': 'inhibition_type',
                            'Inhibited microorganisms': 'inhibited_microorganisms',
                            'PTM': 'ptm',
                            'Title': 'title',
                            'Authors': 'authors',
                            'Abstract': 'abstract',
                            'DOI': 'doi',
                            'Search type': 'search_type',
                            'Scoring matrix': 'scoring_matrix',
                        }, inplace=True)
                        display(HTML(
                            f'<b style="color:green;">MBPDB file imported with {self.mbpdb_results.shape[0]} rows and {self.mbpdb_results.shape[1]} columns</b>'))

    def _on_fasta_upload_change(self, change):
        if change['type'] == 'change' and change['name'] == 'value':
            new_proteins = {}
            with self.output_area:
                self.output_area.clear_output()
                if change['new'] and len(change['new']) > 0:
                    for file_data in change['new']:
                        try:
                            if file_data.name.endswith('.fasta'):
                                parsed = self._parse_uploaded_fasta(file_data)
                                new_proteins.update(parsed)
                                #print(f"Parsed {len(parsed)} proteins from {file_data.name}")
                                display(HTML(f'<b style="color:green;">Successfully imported {file_data.name}</b>'))
                        except Exception as e:
                            print(f"Error: {str(e)}")

                    # Update protein_dict with new data
                    self.protein_dict = new_proteins
                    #print(f"Updated protein_dict with {len(new_proteins)} entries")

    def _load_data(self, file_obj, required_columns, file_type):
        """
        Load and validate uploaded data files, cleaning empty rows and validating data.
        
        Args:
            file_obj: Uploaded file object
            required_columns (list): List of required column names (either single names or pairs)
            file_type (str): Type of file being loaded ('MBPDB' or 'Peptidomic')
            
        Returns:
            tuple: (DataFrame or None, status string 'yes'/'no')
        """
        try:
            content = file_obj.content
            filename = file_obj.name
            extension = filename.split('.')[-1].lower()
            
            file_stream = io.BytesIO(content)
            
            # Load data based on file extension with multiple delimiter attempts
            if extension == 'csv':
                # Try different delimiters in order of common usage
                delimiters = [',', ';', '|', '\t']
                df = None
                successful_delimiter = None
                
                for delimiter in delimiters:
                    try:
                        # Reset file stream position
                        file_stream.seek(0)
                        temp_df = pd.read_csv(file_stream, sep=delimiter)
                        
                        # Check if we got more than one column
                        if len(temp_df.columns) > 1:
                            df = temp_df
                            successful_delimiter = delimiter
                            break
                    except:
                        continue
                        
                if df is None:
                    raise ValueError("Could not parse CSV file with any common delimiter (tried: comma, semicolon, pipe, tab)")
                
                # Show which delimiter was used
                #display(HTML(f'<b style="color:blue;">File parsed using delimiter: {successful_delimiter}</b>'))
                
            elif extension in ['txt', 'tsv']:
                # For txt/tsv files, try tab first, then other delimiters
                delimiters = ['\t', ',', ';', '|']
                df = None
                successful_delimiter = None
                
                for delimiter in delimiters:
                    try:
                        file_stream.seek(0)
                        temp_df = pd.read_csv(file_stream, sep=delimiter)
                        if len(temp_df.columns) > 1:
                            df = temp_df
                            successful_delimiter = delimiter
                            break
                    except:
                        continue
                        
                if df is None:
                    raise ValueError("Could not parse TXT/TSV file with any common delimiter")
                    
                #display(HTML(f'<b style="color:blue;">File parsed using delimiter: {successful_delimiter}</b>'))
                
            elif extension == 'xlsx':
                df = pd.read_excel(file_stream)
            else:
                raise ValueError("Unsupported file format. Please upload .csv, .txt, .tsv, or .xlsx files.")
            
            # Clean column names
            df.columns = df.columns.str.strip()
            
            # Drop empty rows
            df = df.dropna(how='all')
            df = df[~(df.astype(str).apply(lambda x: x.str.strip().eq('')).all(axis=1))]
            
            # Handle validation differently based on file type
            if file_type == 'MBPDB':
                # Use column pairs for MBPDB validation
                column_pairs = {
                    'Search peptide': 'search_peptide',
                    'Protein ID': 'protein_id',
                    'Peptide': 'peptide'
                }
                
                # Check for required columns in either format
                missing_pairs = []
                for orig_col, std_col in column_pairs.items():
                    if not (orig_col in df.columns or std_col in df.columns):
                        missing_pairs.append(f"'{orig_col}' or '{std_col}'")
                
                if missing_pairs:
                    display(HTML(f'<b style="color:red;">{file_type} File Error: Missing required columns: {", ".join(missing_pairs)}</b>'))
                    return None, 'no'
                
                # Validate non-empty required columns
                empty_pairs = []
                for orig_col, std_col in column_pairs.items():
                    col_to_check = orig_col if orig_col in df.columns else std_col
                    if df[col_to_check].isna().all() or (df[col_to_check].astype(str).str.strip() == '').all():
                        empty_pairs.append(f"'{orig_col}' or '{std_col}'")
                
                if empty_pairs:
                    display(HTML(f'<b style="color:red;">{file_type} File Error: Required columns are empty: {", ".join(empty_pairs)}</b>'))
                    return None, 'no'
                    
            else:
                # Standard validation for other file types
                if not set(required_columns).issubset(df.columns):
                    missing = set(required_columns) - set(df.columns)
                    display(HTML(f'<b style="color:red;">{file_type} File Error: Missing required columns: {", ".join(missing)}</b>'))
                    return None, 'no'
                
                # Validate non-empty required columns
                empty_required = []
                for col in required_columns:
                    if df[col].isna().all() or (df[col].astype(str).str.strip() == '').all():
                        empty_required.append(col)
                
                if empty_required:
                    display(HTML(f'<b style="color:red;">{file_type} File Error: Required columns are empty: {", ".join(empty_required)}</b>'))
                    return None, 'no'
            
            # Show success message
            display(HTML(f'<b style="color:green;">{file_type} file loaded successfully with {len(df)} rows after cleaning.</b>'))
            
            return df, 'yes'
            
        except Exception as e:
            display(HTML(f'<b style="color:red;">{file_type} File Error: {str(e)}</b>'))
            return None, 'no'
    
    def _parse_uploaded_fasta(self, file_data):
        """Parse uploaded FASTA file content"""
        fasta_dict = {}
        fasta_text = bytes(file_data.content).decode('utf-8')
        lines = fasta_text.split('\n')

        protein_id = ""
        protein_name = ""
        sequence = ""
        species = ""

        for line in lines:
            line = line.strip()
            if line.startswith('>'):
                if protein_id:
                    fasta_dict[protein_id] = {
                        "name": protein_name,
                        "sequence": sequence,
                        "species": species
                    }
                sequence = ""
                header_parts = line[1:].split('|')
                if len(header_parts) > 2:
                    protein_id = header_parts[1]
                    protein_name_full = re.split(r' OS=', header_parts[2])[0]
                    if ' ' in protein_name_full:
                        protein_name = protein_name_full
                    else:
                        protein_name = protein_name_full
                    species = self._find_species(line)
            else:
                sequence += line

        if protein_id:
            fasta_dict[protein_id] = {
                "name": protein_name,
                "sequence": sequence,
                "species": species
            }

        return fasta_dict

    def _find_species(self, header):
        """Find species in FASTA header"""
        header_lower = header.lower()
        for spec_group in spec_translate_list:
            for term in spec_group[1:]:
                if term.lower() in header_lower:
                    return spec_group[0]
        return "unknown"


In [427]:
class GroupProcessing:
    """Widget-driven helper for assigning data columns to named groups.

    Groups are kept in ``self.group_data`` as
    ``{"<number>": {"grouping_variable": str, "abundance_columns": list}}``.
    They can be created interactively (column multi-select plus a name box)
    or loaded from an uploaded JSON file of the form
    ``{"<group name>": ["<column>", ...], ...}``.
    """

    # Column-name patterns that are never offered as selectable columns.
    # Matching in setup_data() is a case-insensitive substring test.
    _COLUMNS_TO_EXCLUDE = (
        'Marked as', 'Number of Missed Cleavages', 'Missed Cleavages',
        'Checked', 'Confidence', 'Annotated Sequence', 'Unnamed: 3',
        'Modifications', 'Protein Groups', 'Proteins', 'PSMs',
        'Master Protein Accessions', 'Master Protein Descriptions', 'Description',
        # The comma after 'Positions in Proteins' was missing, which silently
        # concatenated it with the following literal into one bogus pattern.
        'Positions in Master Proteins', 'Positions in Proteins',
        'Modifications in Master Proteins',
        'Modifications in Master Proteins all Sites',
        'Theo MHplus in Da', 'Quan Info',
        'Confidence by Search Engine',
        'q-Value by Search Engine',
        'XCorr by Search Engine',
        'Percolator PEP by Search Engine', 'Percolator q-Value by Search Engine',
        'Percolator SVMScore by Search Engine',
        'PEP', 'q-Value', 'RT in min', 'RT in min by Search Engine',
        'Sequence', 'Sequence Length', 'search_peptide', 'Peptide', 'protein_id',
        'protein_description', 'Alignment', 'Species',
        'Intervals', 'function', 'unique ID', 'PEP (by Search Engine): Sequest HT',
        'SVM Score (by Search Engine): Sequest HT', 'SVM_Score',
        'XCorr (by Search Engine): Sequest HT', 'Qvality PEP', 'Qvality q-value',
        'Top Apex RT [min]', 'Top Apex RT in min', 'start', 'stop',
        'Abundance Ratio', 'Abundance Ratio Adj P-Value', 'Abundance Ratio log2',
        'Abundance Ratio P-Value', 'Abundances', 'Abundances Counts',
        'Abundances Grouped', 'Abundances Grouped Count', 'Abundances Grouped CV',
        'Abundances Normalized', 'Abundances Scaled', 'Charge by Search Engine',
        'Concatenated Rank by Search Engine', 'Delta Cn by Search Engine',
        'Delta M in ppm by Search Engine', 'Delta mz in Da by Search Engine',
        'Delta Score by Search Engine', 'Found in Sample Groups', 'Found in Samples',
        'Modifications all possible sites', 'mz in Da by Search Engine',
        'Number of Isoforms', 'Number of Protein Groups', 'Number of Proteins',
        'Protein Accessions', 'PSM Ambiguity', 'Rank by Search Engine',
        'Search Engine Rank by Search Engine', 'Score CHIMERYS Identification (by search engine)',
    )

    _EXCLUDE_SUBSTRINGS = (
        'Abundances by Bio Rep',
        'Count',
        'Origin',
        'Average_Abundance',
        'Avg_',
        'PEP by Search Engine',
        'SVM Score by Search Engine',
        'XCorr by Search Engine',
        'Top Apex RT',
    )

    def __init__(self):
        """Create the widgets and empty group state; no data is loaded here."""
        self.group_data = {}
        # Starts as the int 1; _add_group() later stores the *string* key of
        # the most recently added group here.
        self.group_number = 1
        self.filtered_columns = []
        # Set by update_data(); initialised so setup_data() can be called first.
        self.pd_results = None

        self.group_uploader = widgets.FileUpload(
            accept='.json',
            multiple=False,
            description='Upload Groups File',
            layout=widgets.Layout(width='300px'),
            style={'description_width': 'initial'}
        )
        self.group_uploader.observe(self._on_group_upload_change, names='value')

        # Output areas: group summaries / errors, and upload status messages.
        self.output = widgets.Output()
        self.gd_output_area = widgets.Output()

        # Multi-select listing the columns that survived filtering.
        self.column_dropdown = widgets.SelectMultiple(
            style={'description_width': 'initial'},
            disabled=False,
            layout=widgets.Layout(width='225px', height='300px')
        )

        column_dropdown_box = widgets.HBox(
            [
                widgets.HTML("<b>Absorbance Columns:&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</b>"),
                self.column_dropdown
            ],
            layout=widgets.Layout(width='400px', height='305px')
        )

        # Text box doubling as search term and as the new group's name.
        self.grouping_variable_text = widgets.Text(
            layout=widgets.Layout(width='230px'),
            style={'description_width': 'initial'}
        )

        text_box = widgets.HBox([
            widgets.HTML("<b>Search or Assigned Name:</b>"),
            self.grouping_variable_text
        ])
        self.group_box = widgets.VBox(
            [column_dropdown_box, text_box],
            layout=widgets.Layout(width='100%')
        )

        self.search_button = widgets.Button(
            description='Search',
            button_style='info',
            layout=widgets.Layout(margin='10px 10px 0 0')
        )

        self.add_group_button = widgets.Button(
            description='Add Group',
            button_style='success',
            layout=widgets.Layout(margin='10px 10px 0 0')
        )

        self.reset_file_button = widgets.Button(
            description='Reset Selection',
            button_style='warning',
            layout=widgets.Layout(margin='10px 10px 0 75px')
        )

        self.search_button.on_click(self._search_columns)
        self.add_group_button.on_click(self._add_group)
        self.reset_file_button.on_click(self._reset_selection)

    def setup_data(self):
        """Recompute the selectable columns from the current results table.

        Uses ``self.pd_results_cleaned`` when that attribute exists and holds
        a non-empty DataFrame, otherwise ``self.pd_results``. Columns whose
        name contains any exclusion pattern (case-insensitive) are dropped;
        the rest become the multi-select options and the inputs are reset.
        If no usable DataFrame is available, the options are emptied and an
        error message is shown in ``self.output``.
        """
        # getattr guards: pd_results_cleaned is only ever set from outside this
        # class and may be missing or None; neither case should raise here.
        cleaned = getattr(self, 'pd_results_cleaned', None)
        if cleaned is not None and not cleaned.empty:
            df = cleaned
        else:
            df = getattr(self, 'pd_results', None)

        if df is not None and not df.empty:
            patterns = [
                pattern.lower()
                for pattern in self._COLUMNS_TO_EXCLUDE + self._EXCLUDE_SUBSTRINGS
            ]
            # str() so non-string column labels cannot break the substring test.
            self.filtered_columns = [
                col for col in df.columns
                if not any(pattern in str(col).lower() for pattern in patterns)
            ]

            self.column_dropdown.options = self.filtered_columns
            self._reset_inputs()
        else:
            self.filtered_columns = []
            self.column_dropdown.options = []
            with self.output:
                self.output.clear_output()
                display(widgets.HTML('<b style="color:red;">No valid data available for processing.</b>'))

    def update_data(self, pd_results):
        """Store a new results table and refresh the selectable columns.

        Args:
            pd_results: DataFrame to store as ``self.pd_results``, or ``None``.
                With ``None`` the value is stored but nothing is refreshed.
        """
        self.pd_results = pd_results

        if pd_results is not None:
            self.setup_data()

            with self.output:
                self.output.clear_output()
                display(widgets.HTML('<b style="color:green;">Data updated successfully. Column selection refreshed.</b>'))

    def create_download_link(self, file_path, label):
        """Build an HTML widget offering ``file_path`` as an inline download.

        The file content is embedded in the link as a base64 ``data:`` URI.

        Args:
            file_path (str): Path of the file to offer.
            label (str): Link text.

        Returns:
            widgets.HTML: The download link, or a red "not found" message when
            ``file_path`` does not exist.
        """
        if os.path.exists(file_path):
            with open(file_path, 'rb') as f:
                content = f.read()
            b64_content = base64.b64encode(content).decode('utf-8')

            return widgets.HTML(f"""
                <a download="{os.path.basename(file_path)}" 
                   href="data:application/octet-stream;base64,{b64_content}" 
                   style="color: #0366d6; text-decoration: none; margin-left: 20px; font-size: 14px;">
                    {label}
                </a>
            """)
        else:
            return widgets.HTML(f"""
                <span style="color: red; margin-left: 20px; font-size: 14px;">
                    File "{file_path}" not found!
                </span>
            """)

    def display_group_selector(self):
        """Display the JSON upload control with a link to an example file."""
        group_box_uploader = widgets.HBox([
            self.group_uploader,
            self.create_download_link("example_group_definition.json", "Example")
        ], layout=widgets.Layout(align_items='center'))

        display(widgets.HTML("<h3><u>Upload Existing Group Dictionary:</u></h3>"))
        display(group_box_uploader, self.gd_output_area)

    def display_widgets(self):
        """Display the two-column UI: inputs on the left, results on the right."""
        grid = widgets.GridspecLayout(
            1, 2,  # rows, columns
            width='800px',
            grid_gap='5px',
        )

        input_container = widgets.VBox([
            widgets.HTML("<h3><u>Select New Grouping of Data:</u></h3>"),
            widgets.HTML('Now select the <b>absorbance columns</b> and assign the name of the <b>grouping variable</b>:'),
            self.group_box,
            widgets.HBox([self.search_button, self.add_group_button], layout=widgets.Layout(width='100%')),
            widgets.HBox([self.reset_file_button], layout=widgets.Layout(width='100%'))
        ], layout=widgets.Layout(
            width='400px',
            height='600px',
            overflow_y='auto'  # vertical scroll
        ))

        output_container = widgets.VBox([
            widgets.HTML("<h3><u>Group Selection Results:</u></h3>"),
            self.output
        ], layout=widgets.Layout(
            width='400px',
            height='600px',
            overflow_y='auto',  # vertical scroll
            padding='10px'
        ))

        grid[0, 0] = input_container
        grid[0, 1] = output_container

        display(grid)

    def _display_group_summary(self, group_number, group_name, selected_columns):
        """Render the summary of one group; call inside the target output context."""
        display(widgets.HTML(
            f"<b>Group {group_number}</b> created with <b>{len(selected_columns)} columns assigned</b>."
        ))
        display(widgets.HTML(f"<b>Grouping Variable:</b> {group_name}"))
        display(widgets.HTML(f"<b>Selected Columns:</b> {', '.join(selected_columns)}"))
        display(widgets.HTML("<hr style='border: 1px solid black;'>"))

    def _on_gd_submit(self, b, dropdown):
        """Load groups from the JSON file path selected in ``dropdown``.

        The file is expected to already be in the enumerated format used by
        ``self.group_data``. Existing groups are replaced. Errors are shown in
        ``self.gd_output_area`` rather than raised.

        Args:
            b: Button click argument (unused).
            dropdown: Widget whose ``value`` is the path of the file to load.
        """
        selected_file = dropdown.value
        with self.gd_output_area:
            clear_output()

            if selected_file == 'Select an existing grouping dictionary file':
                print("Please select a valid file.")
                return

            try:
                with open(selected_file, 'r') as file:
                    data = json.load(file)
                self.group_data = {}

                with self.output:
                    clear_output()
                    for group_number, group_info in data.items():
                        group_name = group_info.get('grouping_variable')
                        selected_columns = group_info.get('abundance_columns')

                        self.group_data[group_number] = {
                            'grouping_variable': group_name,
                            'abundance_columns': selected_columns
                        }
                        self._display_group_summary(group_number, group_name, selected_columns)

                display(widgets.HTML(f'<b style="color:green;">Successfully uploaded: {selected_file}</b>'))

            except Exception as e:
                display(widgets.HTML(f"<b style='color:red;'>An error occurred while processing the file: {str(e)}</b>"))

    def _search_columns(self, b):
        """Select every available column whose name contains the entered text.

        The match is a case-sensitive substring test against
        ``self.filtered_columns``. With an empty text box an error is shown.
        """
        group_name = self.grouping_variable_text.value
        if group_name:
            matching_columns = [col for col in self.filtered_columns if group_name in col]
            self.column_dropdown.value = matching_columns
        else:
            with self.output:
                clear_output()
                display(widgets.HTML('<b style="color:red;">Please enter a group name to search.</b>'))

    def _add_group(self, b):
        """Create a group from the entered name and the selected columns.

        The new group is stored under the next free numeric key (as a string)
        and summarised in ``self.output``; the inputs are then reset. If the
        name or the selection is empty, an error is shown and nothing is
        stored.
        """
        group_name = self.grouping_variable_text.value
        selected_columns = list(self.column_dropdown.value)

        if not (group_name and selected_columns):
            with self.output:
                display(widgets.HTML('<b style="color:red;">Please enter a group name and select at least one column.</b>'))
            return

        # Keys loaded through _on_gd_submit come straight from a JSON file and
        # need not be numeric, so only decimal keys take part in the numbering.
        numeric_keys = [int(key) for key in self.group_data if str(key).isdecimal()]
        self.group_number = str(max(numeric_keys, default=0) + 1)

        self.group_data[self.group_number] = {
            'grouping_variable': group_name,
            'abundance_columns': selected_columns
        }

        with self.output:
            self._display_group_summary(self.group_number, group_name, selected_columns)

        self._reset_inputs()

    def _reset_selection(self, b):
        """Discard all groups, clear both output areas and reset the inputs."""
        self.group_data = {}
        self.group_number = 1
        with self.gd_output_area:
            clear_output()
        with self.output:
            clear_output()
        self._reset_inputs()

    def _reset_inputs(self):
        """Clear the name text box and the column selection."""
        self.grouping_variable_text.value = ''
        self.column_dropdown.value = ()

    def _on_group_upload_change(self, change):
        """Load groups from an uploaded JSON file.

        The file must hold a JSON object mapping each group name to a list of
        column names. Groups are numbered "1", "2", ... in file order and
        written into ``self.group_data``. The structure is validated before
        any state is changed, so a malformed file leaves ``self.group_data``
        untouched. Errors are shown in ``self.gd_output_area``.

        Args:
            change: Traitlets change dict for the uploader's ``value`` trait.
        """
        if not (change['type'] == 'change' and change['name'] == 'value'):
            return

        with self.gd_output_area:
            uploads = change['new']
            if not uploads or len(uploads) == 0:
                return

            file_data = uploads[0]
            try:
                content = bytes(file_data.content).decode('utf-8')
                simplified_data = json.loads(content)

                if not isinstance(simplified_data, dict):
                    raise ValueError(
                        "expected a JSON object mapping group names to lists of column names"
                    )
                for group_name, abundance_cols in simplified_data.items():
                    if not isinstance(abundance_cols, list) or not all(
                        isinstance(col, str) for col in abundance_cols
                    ):
                        raise ValueError(
                            f"group '{group_name}' must map to a list of column names"
                        )

                # NOTE(review): numbering restarts at 1, so uploaded groups
                # overwrite existing groups with the same number while higher
                # numbered ones are kept - confirm this is intended.
                with self.output:
                    for position, (group_name, abundance_cols) in enumerate(simplified_data.items(), 1):
                        group_number = str(position)  # keys are strings everywhere
                        self.group_data[group_number] = {
                            'grouping_variable': group_name,
                            'abundance_columns': abundance_cols
                        }
                        self._display_group_summary(group_number, group_name, abundance_cols)

                display(widgets.HTML(f'<b style="color:green;">Successfully uploaded: {file_data.name}</b>'))

            except Exception as e:
                display(widgets.HTML(f"<b style='color:red;'>An error occurred while processing the file: {str(e)}</b>"))


In [428]:
class ProteinCombinationHandler(HasTraits):
    def __init__(self, data_transformer):
        """Set up handler state and the yes/no widget for protein-mapping processing.

        Args:
            data_transformer: Object exposing ``pd_results`` (captured here)
                and ``protein_dict`` (read through the ``protein_dict``
                property on each access).
        """
        super().__init__()

        # Data: keep the transformer itself plus the results it holds right now.
        self.data_transformer = data_transformer
        self.pd_results = data_transformer.pd_results
        self.pd_results_cleaned = None

        # Bookkeeping for per-combination decisions.
        self.user_decisions = {}
        self.decision_inputs = []
        self.multi_position_combinations = []

        # UI handles; everything except the output area starts out as None.
        self.protein_output_area = None
        self.submit_button = None
        self.reset_button = None
        self.progress = None
        self.protein_mapping_output_area = widgets.Output()

        # Radio buttons start disabled with no choice made.
        mapping_choices = [('Yes', True), ('No (skip)', False)]
        self.protein_mapping_widget = widgets.RadioButtons(
            options=mapping_choices,
            description='Process peptides mapped to multiple proteins?',
            disabled=True,
            style={'description_width': 'initial'},
            value=None
        )
        # Any change of the selected value triggers process_protein_mapping.
        self.protein_mapping_widget.observe(self.process_protein_mapping, names='value')


    @property
    def protein_dict(self):
        """Protein dictionary, looked up on the data transformer at every access."""
        transformer = self.data_transformer
        return transformer.protein_dict

    def _get_protein_combinations(self):
        """Extract unique protein combinations from the dataset with NaN handling"""
        if self.pd_results is None or self.pd_results.empty:
            return []

        protein_combinations = set()
        nan_warnings = {
            'positions': 0,
            'master_acc': 0,
            'unknown_added': 0
        }

        # Create a working copy of the dataframe
        working_df = self.pd_results.copy()

        # Track NaN counts before modification
        nan_warnings['positions'] = working_df['Positions in Proteins'].isna().sum()
        nan_warnings['master_acc'] = working_df['Master Protein Accessions'].isna().sum()

        # Replace NaN values with "Unknown" instead of dropping
        working_df['Positions in Proteins'] = working_df['Positions in Proteins'].fillna('Unknown')
        working_df['Master Protein Accessions'] = working_df['Master Protein Accessions'].fillna('Unknown')

        for _, row in working_df.iterrows():
            try:
                # Handle "Unknown" case specially
                if row['Positions in Proteins'] == 'Unknown':
                    position_proteins = ['Unknown']
                else:
                    position_proteins = [p.split()[0] for p in row['Positions in Proteins'].split('; ')]

                master_acc = row['Master Protein Accessions']

                # Check species of proteins in Positions in Proteins
                species_set = set()
                for protein in position_proteins:
                    if protein in self.protein_dict:
                        species_set.add(self.protein_dict[protein]['species'])
                    elif protein == 'Unknown':
                        species_set.add('Unknown')

                if (';' in master_acc or
                        ';' in row['Positions in Proteins'] or
                        len(species_set) > 1 or
                        'Unknown' in species_set):  # Include Unknown combinations
                    protein_combinations.add('; '.join(sorted(position_proteins)))
                    if 'Unknown' in position_proteins:
                        nan_warnings['unknown_added'] += 1

            except Exception as e:
                print(f"Warning: Error processing row {_}: {str(e)}")
                continue

        # Store warning message for display
        warning_message = []

        if nan_warnings['positions'] > 0:
            warning_message.append(
                f"{nan_warnings['positions']} rows with missing 'Positions in Proteins' were marked as Unknown")
        if nan_warnings['master_acc'] > 0:
            warning_message.append(
                f"{nan_warnings['master_acc']} rows with missing 'Master Protein Accessions' were marked as Unknown")
        if nan_warnings['unknown_added'] > 0:
            warning_message.append(f"{nan_warnings['unknown_added']} combinations now include Unknown proteins")

        # if warning_message:
        #   print("Warning: " + "; ".join(warning_message))

        self.multi_position_combinations = list(protein_combinations)
        return self.multi_position_combinations

    def process_protein_mapping(self, change):
        """React to the yes/no radio buttons and set ``pd_results_cleaned``.

        With 'Yes' selected the result of ``process_protein_combinations()``
        is stored; for any other value an unchanged copy of ``pd_results`` is
        stored. A status line is written to the mapping output area, which is
        cleared first.

        Args:
            change: Traitlets change notification (unused; the widget's
                current value is read directly).

        Returns:
            The value now stored in ``self.pd_results_cleaned``.
        """
        status_area = self.protein_mapping_output_area
        with status_area:
            status_area.clear_output()
            wants_processing = self.protein_mapping_widget.value == True
            if not wants_processing:
                self.pd_results_cleaned = self.pd_results.copy()
                display(HTML("<b style='color:green;'>Using original protein mappings.</b>"))
            else:
                display(HTML("<b style='color:green;'>Using user-defined protein mappings.</b>"))
                self.pd_results_cleaned = self.process_protein_combinations()

        return self.pd_results_cleaned

    def process_protein_combinations(self):
        """Process protein combinations in pd_results with Unknown handling"""
        if not self.pd_results.empty:
            df = self.pd_results.copy()

            # Fill NaN values with "Unknown"
            df['Positions in Proteins'] = df['Positions in Proteins'].fillna('Unknown')
            df['Master Protein Accessions'] = df['Master Protein Accessions'].fillna('Unknown')

            # Create warning display area
            warning_area = widgets.HTML(
                layout=widgets.Layout(
                    margin='10px 0',
                    padding='10px',
                    border='1px solid #ffeeba',
                    background_color='#fff3cd',
                    border_radius='4px'
                )
            )

            # Get combinations and track Unknown statistics
            combinations = self._get_protein_combinations()

            # Update warning area with statistics
            unknown_positions = (df['Positions in Proteins'] == 'Unknown').sum()
            unknown_master_acc = (df['Master Protein Accessions'] == 'Unknown').sum()

            if unknown_positions > 0 or unknown_master_acc > 0:
                warning_html = "<div><b>ℹ️ Notice:</b><ul style='margin: 5px 0'>"
                if unknown_positions > 0:
                    warning_html += f"<li>{unknown_positions} rows with missing 'Positions in Proteins' are marked as Unknown</li>"
                if unknown_master_acc > 0:
                    warning_html += f"<li>{unknown_master_acc} rows with missing 'Master Protein Accessions' are marked as Unknown</li>"
                warning_html += "</ul>These peptides will be preserved in the output.</div>"
                warning_area.value = warning_html
            else:
                warning_area = widgets.HTML(f"<br>")

            # Main container with warning area
            main_container = widgets.VBox([
                warning_area,
                widgets.HTML("""
                    <h3>Peptides Mapped to Multiple Proteins</h3>
                    <div style='margin-bottom: 15px;'>
                        Select how to handle each protein mapping combination in your dataset.
                        These combinations come from either:
                        <ul>
                            <li>Multiple proteins in Master Protein Accessions</li>
                            <li>Multiple proteins in Positions in Proteins</li>
                            <li>Proteins from different species</li>
                            <li>Unknown protein mappings (from missing values)</li>
                        </ul>
                    </div>
                """)
            ], layout=widgets.Layout(width='100%', padding='20px'))

            # Get combinations
            combinations = self._get_protein_combinations()

            def create_help_icon(self, tooltip_text):
                """Create a help icon widget with tooltip"""
                return f'<div title="{tooltip_text}" style="display: inline-block; margin-left: 4px;">' \
                       '<i class="fa fa-question-circle" style="color: #007bff; font-size: 14px;"></i>' \
                       '</div>'

            table_header = widgets.HTML("""
                            <div style="display: grid; grid-template-columns: 100px 100px 420px 200px auto; gap: 2px; margin-bottom: 10px; font-weight: bold; align-items: center;">
                                <div>
                                    Protein ID
                                    <span title="Unique identifier for the protein" style="display: inline-block; margin-left: 4px;">
                                        <i class="fa fa-question-circle" style="color: #007bff; font-size: 14px;"></i>
                                    </span>
                                </div>
                                <div>
                                    Species
                                    <span title="Source organism of the protein" style="display: inline-block; margin-left: 4px;">
                                        <i class="fa fa-question-circle" style="color: #007bff; font-size: 14px;"></i>
                                    </span>
                                </div>
                                <div>
                                    Description
                                    <span title="Full protein name or description" style="display: inline-block; margin-left: 4px;">
                                        <i class="fa fa-question-circle" style="color: #007bff; font-size: 14px;"></i>
                                    </span>
                                </div>
                                <div>
                                    Decision
                                    <span title="Available options:\n
            - 'new' - Create a separate row for this protein\n
            - 'remove' - Remove this protein from combination\n
            - 'asis' - Keep as part of current combination\n
            - 'Custom: (protein ID)': ie. Custom: P02666A1"
            style="display: inline-block; margin-left: 4px;">
                                        <i class="fa fa-question-circle" style="color: #007bff; font-size: 14px;"></i>
                                    </span>
                                </div>
                                <div>
                                    Status
                                    <span title="Color indicators:\n
            - Grey - Default option (not yet submitted)\n
            - Green - Option has been submitted" style="display: inline-block; margin-left: 4px;">
                                        <i class="fa fa-question-circle" style="color: #007bff; font-size: 14px;"></i>
                                    </span>
                                </div>
                            </div>
                            <hr style="margin: 0 0 10px 0;">
                        """)

            # Create input area
            input_area = widgets.VBox([table_header],
                                      layout=widgets.Layout(width='100%', margin='10px 0'))

            # Add rows for each combination
            self.decision_inputs = []
            self.status_displays = {}

            for combo_idx, combo in enumerate(combinations, 1):
                proteins = combo.split('; ')

                # Find rows with this combination
                combo_rows = []
                for _, row in df.iterrows():
                    if pd.isna(row['Positions in Proteins']):
                        continue
                    row_proteins = set(p.split()[0] for p in row['Positions in Proteins'].split('; '))
                    if row_proteins == set(proteins):
                        combo_rows.append(row)

                occurrences = len(combo_rows)

                # Add combination header
                input_area.children += (widgets.HTML(f"""
                    <div style="background-color: #f8f9fa; padding: 2px; margin: 5px 0; border-radius: 5px;">
                        <b>Combination {combo_idx}</b> ({occurrences} occurrences)
                    </div>
                """),)

                # Process each protein in the combination
                for protein in proteins:
                    species = "Unknown" if protein == 'Unknown' else self.protein_dict.get(protein, {}).get('species',
                                                                                                            "Unknown")
                    name = "Unknown Protein" if protein == 'Unknown' else self.protein_dict.get(protein, {}).get('name',
                                                                                                                 "Unknown")

                    # Set default decision based on Master Protein Accessions
                    default_decision = 'asis'  # Always keep Unknown proteins as-is
                    if protein != 'Unknown' and combo_rows:
                        first_row = combo_rows[0]
                        if not pd.isna(first_row['Master Protein Accessions']):
                            master_proteins = first_row['Master Protein Accessions'].split(';')
                            master_proteins = [p.strip() for p in master_proteins]
                            default_decision = 'new' if protein in master_proteins else 'remove'

                    # Create decision input
                    decision_input = widgets.Text(
                        layout=widgets.Layout(width='125px'),
                        value=default_decision
                    )
                    self.decision_inputs.append((combo, protein, decision_input))

                    # Create status display with initial status
                    status_text = {
                        'new': "Will be created as new row",
                        'remove': "Will be removed",
                        'asis': "Will keep as is",
                        'Custom: (protein ID)': "ie. Custom: P02666A1"
                    }
                    initial_status = status_text.get(default_decision, '')
                    status_display = widgets.HTML(f'<span style="color: gray">{initial_status}</span>')
                    self.status_displays[(combo, protein)] = status_display

                    # Create the row content
                    row_content = widgets.HTML(f"""
                    <div style="display: grid; grid-template-columns: 100px 100px 420px; gap: 2px; align-items: center;">
                            <div>{protein}</div>
                            <div>{species}</div>
                            <div>{name}</div>
                        </div>
                    """)

                    # Create container with all elements
                    container = widgets.HBox([
                        row_content,
                        widgets.HBox([decision_input], layout=widgets.Layout(width='150px', padding='0')),
                        widgets.HBox([status_display], layout=widgets.Layout(width='200px', padding='0'))
                    ], layout=widgets.Layout(
                        margin='2px 0',
                        display='flex',
                        align_items='center',
                        overflow='hidden',
                        width='100%'
                    ))

                    input_area.children += (container,)

            # Create buttons
            button_box = self._create_buttons()

            # Add output area
            self.protein_output_area = widgets.Output(
                layout=widgets.Layout(width='100%', margin='5px 0')
            )

            # Add all components
            main_container.children += (input_area, button_box, self.protein_output_area)

            self.pd_results_cleaned = df
            display(main_container)
            return df

    def _on_submit(self, button, df):
        """Validate the typed decisions and apply them to a copy of ``df``.

        The handler works in three passes:

        1. Read every non-empty decision from ``self.decision_inputs`` and mark
           its status display green.
        2. Validate each combination: a decision must be ``new``, ``remove``,
           ``asis`` or ``Custom: <protein ID>`` (case-insensitive), and ``asis``
           may not be mixed with the other kinds inside one combination.
        3. For every row whose set of protein IDs in 'Positions in Proteins'
           equals the combination, rename (custom), split off (new) or drop
           (remove) the affected proteins. Rows left without any position are
           removed; rows created by ``new`` are appended at the end.

        Protein IDs are compared exactly (first token of each position entry),
        so ``P02666`` never matches ``P02666A1``. A missing (NaN) value in
        'Master Protein Accessions' is tolerated and left untouched.

        Args:
            button: The clicked button (unused; required by the click callback).
            df (pandas.DataFrame): Table with 'Positions in Proteins' and
                'Master Protein Accessions' columns. It is not modified.

        Returns:
            pandas.DataFrame: ``df`` itself when validation fails, otherwise
            ``self.pd_results_cleaned`` (replaced only when processing ran to
            completion).
        """
        self.submit_button.disabled = True
        self.reset_button.disabled = True
        self.progress.value = 0

        def is_custom(decision):
            """Return True for a 'Custom: <id>' decision, ignoring case."""
            return decision.upper().startswith('CUSTOM:')

        def leading_id(position):
            """Return the first whitespace-separated token of a position entry."""
            tokens = position.split()
            return tokens[0] if tokens else ''

        def apply_decisions(row, positions, master_accs, protein_decisions):
            """Apply one combination's decisions to a single row.

            Returns the remaining positions, the remaining master accessions and
            the list of rows split off by 'new' decisions.
            """
            proteins_to_remove = []
            custom_changes = {}
            split_rows = []

            for protein, decision in protein_decisions.items():
                decision_upper = decision.upper()

                if decision_upper == 'NEW':
                    # The protein leaves the shared row and gets a row of its own.
                    proteins_to_remove.append(protein)
                    matching_position = next(
                        (p for p in positions if leading_id(p) == protein), None
                    )
                    if matching_position:
                        new_row = row.copy()
                        new_row['Positions in Proteins'] = matching_position
                        new_row['Master Protein Accessions'] = protein
                        split_rows.append(new_row)

                elif decision_upper == 'REMOVE':
                    proteins_to_remove.append(protein)

                elif is_custom(decision):
                    custom_changes[protein] = decision.split(':', 1)[1].strip()

            # Renames are applied before removals, as in the original flow.
            for protein, new_protein_id in custom_changes.items():
                for i, pos in enumerate(positions):
                    if leading_id(pos) == protein:
                        pos_parts = pos.split(' ', 1)
                        if len(pos_parts) > 1:
                            positions[i] = f"{new_protein_id} {pos_parts[1]}"
                master_accs = [new_protein_id if acc == protein else acc for acc in master_accs]

            if proteins_to_remove:
                # NOTE(review): entries that start with '[' carry no protein ID of
                # their own, so they are never matched here and stay on the row -
                # confirm that this is intended.
                positions = [pos for pos in positions if leading_id(pos) not in proteins_to_remove]
                master_accs = [acc for acc in master_accs if acc not in proteins_to_remove]

            return positions, master_accs, split_rows

        with self.protein_output_area:
            try:
                self.protein_output_area.clear_output()
                decisions_by_combo = {}
                rows_to_remove = set()
                new_rows = []
                total_inputs = len(self.decision_inputs)

                # First pass: collect all decisions
                for i, (combo, protein, input_widget) in enumerate(self.decision_inputs):
                    try:
                        decision = input_widget.value.strip()
                        if decision:
                            status_display = self.status_displays[(combo, protein)]
                            status_display.value = f'<span style="color: green">Decision: {decision}</span>'
                            decisions_by_combo.setdefault(combo, {})[protein] = decision
                    except Exception as e:
                        display(HTML(f"<b style='color:red;'>Error processing decision for {protein}: {str(e)}</b>"))

                    # Advance the bar even when an input failed so it never stalls.
                    self.progress.value = ((i + 1) / total_inputs * 25)

                # Second pass: validate decisions
                validation_errors = []

                for combo, protein_decisions in decisions_by_combo.items():
                    kinds = [decision.upper() for decision in protein_decisions.values()]
                    has_asis = 'ASIS' in kinds
                    has_other = any(
                        kind in ('NEW', 'REMOVE') or kind.startswith('CUSTOM:') for kind in kinds
                    )

                    if has_asis and has_other:
                        validation_errors.append(f"Combination '{combo}': ASIS cannot be used with other decision types")

                    # Validate individual decision formats
                    for protein, decision in protein_decisions.items():
                        if decision.upper() not in ['NEW', 'REMOVE', 'ASIS'] and not is_custom(decision):
                            validation_errors.append(f"Protein '{protein}': Invalid decision format '{decision}'")

                        if is_custom(decision) and not decision.split(':', 1)[1].strip():
                            validation_errors.append(f"Protein '{protein}': CUSTOM decision requires a protein ID after the colon")

                # If validation errors, stop processing
                if validation_errors:
                    error_message = "Cannot process due to the following errors:<br>"
                    for error in validation_errors:
                        error_message += f"• {error}<br>"
                    error_message += "<br>Valid combinations:<br>"
                    error_message += "• All proteins can be ASIS (no changes)<br>"
                    error_message += "• CUSTOM, NEW and REMOVE can be used together<br>"
                    error_message += "• ASIS cannot be used with other decision types (CUSTOM, NEW or REMOVE)<br>"

                    display(HTML(f"<b style='color:red;'>{error_message}</b>"))
                    self.progress.value = 0
                    # The buttons are re-enabled by the ``finally`` clause below.
                    return df

                # Third pass: process the dataframe
                if decisions_by_combo:
                    processed_df = df.copy()
                    processed_count = 0
                    total_combinations = len(decisions_by_combo)

                    for combo, protein_decisions in decisions_by_combo.items():
                        try:
                            # Protein IDs that make up this combination
                            proteins = [
                                part.split()[0]
                                for part in combo.split('; ')
                                if not part.startswith('[')
                            ]
                            all_asis = all(
                                decision.upper() == 'ASIS' for decision in protein_decisions.values()
                            )

                            # An all-ASIS combination changes nothing, so skip the row scan.
                            if not all_asis:
                                # One lookahead per protein: cheap pre-filter, the exact
                                # set comparison below makes the final call.
                                pattern = ''.join(f'(?=.*{re.escape(p)})' for p in proteins)

                                try:
                                    positions_column = processed_df['Positions in Proteins']
                                    mask = positions_column.notna() & positions_column.fillna('').str.contains(pattern, regex=True)

                                    for idx in processed_df[mask].index:
                                        row = processed_df.loc[idx]
                                        positions = row['Positions in Proteins'].split('; ')

                                        current_ids = {leading_id(pos) for pos in positions}
                                        current_ids = {pid for pid in current_ids if pid and not pid.startswith('[')}
                                        if current_ids != set(proteins):
                                            continue

                                        # Master accessions may be missing; split on ';' and
                                        # strip, the same way the default decisions are derived.
                                        raw_master = row['Master Protein Accessions']
                                        master_known = not pd.isna(raw_master)
                                        master_accs = (
                                            [acc.strip() for acc in raw_master.split(';')]
                                            if master_known else []
                                        )

                                        positions, master_accs, split_rows = apply_decisions(
                                            row, positions, master_accs, protein_decisions
                                        )
                                        new_rows.extend(split_rows)

                                        # Update the original row if there are positions left
                                        if positions:
                                            processed_df.at[idx, 'Positions in Proteins'] = '; '.join(positions)
                                            if master_known:
                                                processed_df.at[idx, 'Master Protein Accessions'] = '; '.join(master_accs) if master_accs else "Unknown"
                                        else:
                                            rows_to_remove.add(idx)

                                except Exception as regex_error:
                                    display(HTML(f"<b style='color:orange;'>Warning: Error in pattern matching: {str(regex_error)}</b>"))
                                    continue

                        except Exception as combo_error:
                            display(HTML(f"<b style='color:orange;'>Warning: Error processing combination {combo}: {str(combo_error)}</b>"))
                            continue

                        processed_count += 1
                        self.progress.value = 30 + (processed_count / total_combinations * 70)

                    # Final processing
                    if rows_to_remove:
                        processed_df = processed_df.drop(index=list(rows_to_remove))
                    if new_rows:
                        processed_df = pd.concat([processed_df, pd.DataFrame(new_rows)], ignore_index=True)

                    self.pd_results_cleaned = processed_df
                    self.progress.value = 100
                    display(HTML("<b style='color:green;'>Processing complete.</b>"))
                else:
                    display(HTML("<b style='color:orange;'>No decisions to process.</b>"))

            except Exception as e:
                display(HTML(f"<b style='color:red;'>Error in submit handler: {str(e)}</b>"))
                self.progress.value = 0

            finally:
                self.submit_button.disabled = False
                self.reset_button.disabled = False

        return self.pd_results_cleaned

    def create_help_icon(self, tooltip_text):
        """Wrap a question-circle icon in an HTML widget whose hover text is ``tooltip_text``.

        Args:
            tooltip_text: Inserted verbatim into the ``title`` attribute of the
                surrounding ``<div>``.

        Returns:
            The ``widgets.HTML`` instance built from the markup.
        """
        opening_tag = f'<div title="{tooltip_text}" style="display: inline-block;">'
        icon_tag = '<i class="fa fa-question-circle" style="color: #007bff;"></i>'
        markup = opening_tag + icon_tag + '</div>'
        return widgets.HTML(markup)

    def _create_buttons(self):
        """Build the Submit/Reset buttons and the progress bar, and wire the callbacks.

        The three widgets are stored on the instance as ``submit_button``,
        ``reset_button`` and ``progress``.

        Returns:
            A ``widgets.VBox`` holding the button row above the progress bar.
        """
        self.submit_button = widgets.Button(description="Submit", button_style='success', disabled=False)
        self.reset_button = widgets.Button(description="Reset", button_style='warning', disabled=False)

        progress_options = dict(
            value=0,
            min=0,
            max=100,
            description='Processing:',
            bar_style='info',
            style={'bar_color': '#0080ff'},
            orientation='horizontal',
        )
        self.progress = widgets.FloatProgress(layout=widgets.Layout(width='50%'), **progress_options)

        button_row = widgets.HBox([self.submit_button, self.reset_button])
        container = widgets.VBox([button_row, self.progress])

        def submit_clicked(clicked):
            # Each click works on a fresh copy of the loaded results.
            return self._on_submit(clicked, self.pd_results.copy())

        self.reset_button.on_click(self._on_reset_button_clicked)
        self.submit_button.on_click(submit_clicked)

        return container

    def _on_reset_button_clicked(self, b):
        """Restore every decision input to its default value.

        The default is derived from the first row of ``self.pd_results`` whose
        protein IDs in 'Positions in Proteins' equal the combination:

        * ``'new'`` when the protein is listed in that row's
          'Master Protein Accessions', ``'remove'`` when it is not;
        * ``'asis'`` when no such row exists, when its master accessions are
          missing, or when the protein is the ``'Unknown'`` placeholder
          (matching the defaults used when the inputs are first built).

        Also clears ``self.user_decisions``, resets ``self.pd_results_cleaned``
        to a copy of ``self.pd_results`` and reports the outcome in
        ``self.protein_output_area``.

        Args:
            b: The clicked button (unused; required by the click callback).
        """
        # Disable buttons during reset
        self.submit_button.disabled = True
        self.reset_button.disabled = True

        # Clear output area
        with self.protein_output_area:
            self.protein_output_area.clear_output()
            display(HTML("<b style='color:blue;'>Resetting options to defaults...</b>"))

        # Reset progress bar
        self.progress.value = 0

        status_text = {
            'new': "Will be created as new row",
            'remove': "Will be removed",
            'asis': "Will keep as is",
            'Custom: (protein ID)': "ie. Custom: P02666A1"
        }

        try:
            df = self.pd_results
            total_inputs = len(self.decision_inputs)

            def first_row_masters(combo):
                """Master accessions of the first row matching ``combo``, or None."""
                wanted = set(combo.split('; '))
                for _, row in df.iterrows():
                    if pd.isna(row['Positions in Proteins']):
                        continue
                    row_proteins = set(p.split()[0] for p in row['Positions in Proteins'].split('; '))
                    if row_proteins == wanted:
                        master = row['Master Protein Accessions']
                        if pd.isna(master):
                            return None
                        return [p.strip() for p in master.split(';')]
                return None

            # Every protein of a combination shares the same lookup, so scan the
            # table once per combination instead of once per input.
            masters_by_combo = {}

            for processed, (combo, protein, input_field) in enumerate(self.decision_inputs, 1):
                if combo not in masters_by_combo:
                    masters_by_combo[combo] = first_row_masters(combo)
                master_proteins = masters_by_combo[combo]

                # 'Unknown' placeholders always stay as-is.
                default_decision = 'asis'
                if protein != 'Unknown' and master_proteins is not None:
                    default_decision = 'new' if protein in master_proteins else 'remove'

                # Set input field value
                input_field.value = default_decision

                # Update status display
                status_display = self.status_displays[(combo, protein)]
                status_display.value = f'<span style="color: gray">{status_text[default_decision]}</span>'

                # Update progress
                self.progress.value = (processed / total_inputs) * 100

            # Reset internal state
            self.user_decisions = {}
            self.pd_results_cleaned = self.pd_results.copy()

            with self.protein_output_area:
                self.protein_output_area.clear_output()
                display(HTML("<b style='color:green;'>Reset complete. All options set to defaults.</b>"))

        except Exception as e:
            with self.protein_output_area:
                self.protein_output_area.clear_output()
                display(HTML(f"<b style='color:red;'>Error during reset: {str(e)}</b>"))

        finally:
            # Re-enable buttons
            self.submit_button.disabled = False
            self.reset_button.disabled = False

In [429]:
class ProcessingWorkflow:
    """Ties the data-loading, protein-mapping and group-processing components together.

    The workflow observes the data transformer's ``pd_results`` and
    ``protein_dict`` and reacts inside the shared ``protein_mapping_output``
    area.
    """

    def __init__(self):
        transformer = DataTransformation()
        self.data_transformer = transformer
        self.protein_handler = ProteinCombinationHandler(transformer)
        self.group_processor = GroupProcessing()

        # Subscribe to the two transformer attributes this workflow reacts to.
        subscriptions = (
            (self._handle_data_change, 'pd_results'),
            (self._handle_fasta_change, 'protein_dict'),
        )
        for callback, trait_name in subscriptions:
            transformer.observe(callback, names=[trait_name])

    def _handle_data_change(self, change):
        """React to new proteomics data (only ``pd_results`` changes are handled)."""
        if change.name != 'pd_results':
            return

        uploaded = change.new
        with protein_mapping_output:
            protein_mapping_output.clear_output()
            if uploaded is None:
                # Nothing loaded (any more): show the waiting notice.
                display(HTML("<h3>Multiple Protein Mappings</h3>"))
                display(HTML("<b style='color:orange;'>Waiting for proteomics data to be uploaded...</b>"))
            else:
                # Data is available: unlock the mapping widget and hand the data over.
                self.protein_handler.protein_mapping_widget.disabled = False
                self.protein_handler.pd_results = uploaded

        self.group_processor.update_data(uploaded)

    def _handle_fasta_change(self, change):
        """Announce an updated protein dictionary in the mapping output area."""
        dictionary_changed = change.new != change.old
        if not dictionary_changed:
            return

        # The dictionary is read from the transformer, so nothing is copied here.
        with protein_mapping_output:
            protein_mapping_output.clear_output()
            display(HTML("<h3>Multiple Protein Mappings</h3>"))
            protein_count = len(self.data_transformer.protein_dict)
            print(f"Using updated protein dictionary with {protein_count} proteins")

    def display(self):
        """Render the interface: data loading, protein mapping, then group processing."""
        self.data_transformer.setup_data_loading_ui()

        # Protein-mapping section
        spacer = widgets.HTML("<br>")
        display(spacer)
        heading = widgets.HTML("<h3><u>Protein Mapping</u></h3>")
        display(heading)
        display(self.protein_handler.protein_mapping_widget)
        display(protein_mapping_output)
        display(self.protein_handler.protein_mapping_output_area)

        # Group-processing section, rendered into its own output area
        display(widgets.HTML("<br>"))
        with group_processing_output:
            processor = self.group_processor
            processor.display_group_selector()
            processor.display_widgets()
        display(group_processing_output)


In [430]:
class CombineAverageDataframes:
    def __init__(self, data_transformer, group_processor, protein_handler):
        self.data_transformer = data_transformer
        self.group_processor = group_processor
        self.pd_results = data_transformer.pd_results
        self.mbpdb_results = data_transformer.mbpdb_results
        self.pd_results_cleaned = protein_handler.pd_results_cleaned if hasattr(protein_handler, 'pd_results_cleaned') and protein_handler.pd_results_cleaned is not None else pd.DataFrame()
        self._merged_df = None
        # Set up observer for data changes
        self.data_transformer.observe(self._handle_data_change, names=['pd_results', 'mbpdb_results'])
        
    @property  # Make protein_dict a property that always reads from data_transformer
    def protein_dict(self):
        """Protein lookup table, read from ``data_transformer.protein_dict`` on every access."""
        return self.data_transformer.protein_dict
        
    def _handle_data_change(self, change):
        """Handle changes in the input data."""
        if change.name == 'pd_results':
            self.pd_results = change.new
        elif change.name == 'mbpdb_results':
            self.mbpdb_results = change.new
        elif change.name == 'pd_results_cleaned':
            self.pd_results_cleaned = change.new        # Re-run interactive display
        clear_output()        
    @property
    def merged_df(self):
        """Most recent merged DataFrame stored by ``process_data``; ``None`` until processing succeeds."""
        return self._merged_df
        
    def add_protein_info(self, df):
        """
        Adds protein species and name information to the dataframe based on Master Protein Accessions,
        inserting them after Master Protein Accessions and before Positions in Proteins.
        
        Args:
            df (pandas.DataFrame): Input dataframe containing 'Master Protein Accessions' column
            
        Returns:
            pandas.DataFrame: DataFrame with added 'protein_species' and 'protein_name' columns
        """
        # First, make a copy to avoid modifying the original
        df = df.copy()
        
        # Create temporary columns
        df['protein_species'] = 'Unknown'
        df['protein_name'] = 'Unknown Protein'
        
        # Process each row
        for idx, row in df.iterrows():
            # Get the protein accessions - handle potential multiple proteins
            proteins = str(row['Master Protein Accessions']).split(';')
            
            # Process first protein in the list (primary protein)
            if proteins and proteins[0] != '' and proteins[0] != 'nan':
                protein = proteins[0].strip()
                df.at[idx, 'protein_species'] = self.protein_dict.get(protein, {}).get('species', "Unknown")
                df.at[idx, 'protein_name'] = self.protein_dict.get(protein, {}).get('name', "Unknown Protein")
        
        # Get all column names
        all_cols = list(df.columns)
        
        # Remove the new columns from their current position
        remaining_cols = [col for col in all_cols if col not in ['protein_species', 'protein_name']]
        
        # Find the position after 'Master Protein Accessions'
        insert_pos = remaining_cols.index('Master Protein Accessions') + 1
        
        # Create the new column order
        new_cols = (
            remaining_cols[:insert_pos] +  # Columns before and including Master Protein Accessions
            ['protein_species', 'protein_name'] +  # New columns
            remaining_cols[insert_pos:]  # Remaining columns
        )
        
        # Reorder the DataFrame with the new column order
        result_df = df.reindex(columns=new_cols)
        
        # Verify column order (optional debug print)
        # print("Column order:", new_cols)
        # print("Position of Master Protein Accessions:", insert_pos)
        
        return result_df
        
    def extract_bioactive_peptides(self):
        """
        Extracts the list of bioactive peptide matches from the imported MBPDB search.
        """
        if not self.mbpdb_results.empty:
            # Drop rows where protein_id is NaN or 'None'
            mbpdb_results_cleaned = self.mbpdb_results.copy()
            mbpdb_results_cleaned.dropna(subset=['search_peptide'], inplace=True)
            mbpdb_results_cleaned = mbpdb_results_cleaned[mbpdb_results_cleaned['protein_id'] != 'None']

            # Check if '% Alignment' column exists
            if '% Alignment' in mbpdb_results_cleaned.columns:
                agg_dict = {
                    'peptide': 'first', 
                    'protein_id': 'first',
                    'protein_description': 'first',
                    '% Alignment': 'first',
                    'species': 'first',
                    'intervals': 'first',
                    'function': lambda x: list(x.dropna().unique())
                }
            else:
                agg_dict = {
                    'peptide': 'first', 
                    #'search_peptide': 'first',
                    'protein_description': 'first',
                    'species': 'first',
                    'intervals': 'first',
                    'function': lambda x: list(x.dropna().unique())
                }

            # Perform the groupby and aggregation
            self.mbpdb_results_grouped = mbpdb_results_cleaned.groupby('search_peptide').agg(agg_dict).reset_index()

            # Flatten the 'function' list
            self.mbpdb_results_grouped['function'] = self.mbpdb_results_grouped['function'].apply(
                lambda x: '; '.join(x) if isinstance(x, list) else x
            )
            return mbpdb_results_cleaned, self.mbpdb_results_grouped
        else:
            return None, None
    
    def create_unique_id(self, row):
        """Creates a unique ID for each peptide row."""
        # Handle Sequence - convert list to comma-separated string if needed
        sequence = row['Sequence']
        if isinstance(sequence, list):
            sequence = ','.join(sequence)
        else:
            sequence = str(sequence).strip()
        
        # Create unique ID with modifications if present
        if pd.notna(row['Modifications']):
            unique_id = sequence + "_" + row['Modifications'].strip()
        else:
            unique_id = sequence
        
        # Ensure unique_id is a string and strip trailing underscores
        unique_id = str(unique_id).strip()
        return unique_id.rstrip('_')

    def process_pd_results(self, mbpdb_results_grouped):
        pd_results_cleaned = self.pd_results_cleaned
        
        # Process positions and accessions
        #pd_results_cleaned['Positions in Proteins'] = pd_results_cleaned['Positions in Proteins'].str.split(';', expand=False).str[0]
        #pd_results_cleaned['Master Protein Accessions'] = pd_results_cleaned['Master Protein Accessions'].str.split(';', expand=False).str[0]
                    
        # Handle NaN/Unknown values first
        pd_results_cleaned['Master Protein Accessions'] = pd_results_cleaned['Master Protein Accessions'].fillna('Unknown')
        pd_results_cleaned['Positions in Proteins'] = pd_results_cleaned['Positions in Proteins'].fillna('Unknown')
        
        # Create sequence column if needed
        # Create sequence column if needed
        if 'Sequence' not in pd_results_cleaned.columns:
            # First create Sequence column with NaN values
            pd_results_cleaned['Sequence'] = pd.NA
            
            def extract_sequence(annotated_seq):
                if pd.isna(annotated_seq):
                    return pd.NA
                
                # Case 1: [X].SEQUENCE.[X] format
                if '.' in annotated_seq:
                    parts = annotated_seq.split('.')
                    if len(parts) > 1:
                        return parts[1]
                
                # Case 2: Plain sequence like "LLL" or "WE"
                return annotated_seq
            
            # Apply the extraction function to all rows
            pd_results_cleaned['Sequence'] = pd_results_cleaned['Annotated Sequence'].apply(extract_sequence)
        
        # Create unique ID
        pd_results_cleaned['unique ID'] = pd_results_cleaned.apply(self.create_unique_id, axis=1)

        # Extract start and stop positions
        try:
            # Initialize start and stop columns with NaN
            pd_results_cleaned['start'] = pd.NA
            pd_results_cleaned['stop'] = pd.NA
            
            # Create mask for rows without semicolons (single positions) and not Unknown
            valid_position_mask = (~pd_results_cleaned['Positions in Proteins'].str.contains(';', na=False) & 
                                 (pd_results_cleaned['Positions in Proteins'] != 'Unknown'))
            
            # Process rows with single positions
            single_positions = pd_results_cleaned.loc[valid_position_mask, 'Positions in Proteins']
            if not single_positions.empty:
                extracted = single_positions.str.extract(r'\[(\d+)-(\d+)\]')
                
                # Convert to numeric and handle invalid values
                pd_results_cleaned.loc[valid_position_mask, 'start'] = pd.to_numeric(extracted[0], errors='coerce')
                pd_results_cleaned.loc[valid_position_mask, 'stop'] = pd.to_numeric(extracted[1], errors='coerce')
            
            # Convert to Int64 to handle missing values properly
            pd_results_cleaned['start'] = pd_results_cleaned['start'].astype('Int64')
            pd_results_cleaned['stop'] = pd_results_cleaned['stop'].astype('Int64')
        except Exception as e:
            print(f"Error processing positions: {str(e)}")
        
    
        # Reorder columns with unique ID and Sequence first
        remaining_cols = [col for col in pd_results_cleaned.columns 
                         if col not in ['unique ID', 'Sequence', 'Master Protein Accessions', 
                                      'Positions in Proteins', 'start', 'stop']]
        
        columns_order = ['unique ID', 'Sequence', 'Master Protein Accessions', 
                        'Positions in Proteins', 'start', 'stop'] + remaining_cols
        
        pd_results_cleaned = pd_results_cleaned[columns_order]
                
        # Merge with MBPDB results if available
        if self.mbpdb_results_grouped is not None and not self.mbpdb_results_grouped.empty:
            # First do the regular merge
            merged_df = pd.merge(pd_results_cleaned, self.mbpdb_results_grouped, 
                                right_on='search_peptide', left_on='unique ID', how='left')
            
            # Second pass: handle comma-separated unique IDs
            comma_mask = merged_df['unique ID'].str.contains(',', na=False)
            comma_rows = merged_df[comma_mask].copy()
            
            for idx, row in comma_rows.iterrows():
                # Split the unique ID
                unique_ids = row['unique ID'].split(',')
                
                # Check if any part matches with search_peptide
                matches = self.mbpdb_results_grouped[self.mbpdb_results_grouped['search_peptide'].isin(unique_ids)]

                if not matches.empty:
                    # Take the first match and update all MBPDB columns
                    match = matches.iloc[0]
                    for col in self.mbpdb_results_grouped.columns:
                        #if col != 'search_peptide':  # Don't overwrite unique ID
                        merged_df.loc[idx, col] = match[col]
        
            display(HTML("<b style='color:green;'>The MBPDB was successfully merged with the peptidomic data matching the Search Peptide and Unique ID columns (including comma-separated IDs).</b>"))
        
        else:
            merged_df = pd_results_cleaned.copy()
            merged_df['function'] = np.nan
            display(HTML("<b style='color:orange;'>No MBPDB was uploaded.</b>"))
            display(HTML("<b style='color:orange;'>The merged Dataframe contains only peptidomic data.</b>"))
        
        # Ensure columns are in correct order
        final_column_order = columns_order + [col for col in merged_df.columns if col not in columns_order]
        merged_df = merged_df[final_column_order]
        
        return merged_df
    
    def calculate_group_abundance_averages(self, df, group_data):
        """Calculates group abundance averages and SEMs, organizing them with averages first, then SEMs."""
        # Check if all average abundance columns already exist
        all_columns_exist = True
        for group_number, details in group_data.items():
            average_column_name = f"Avg_{details['grouping_variable']}"
            if average_column_name not in df.columns:
                all_columns_exist = False
                break
        
        if all_columns_exist:
            display(HTML('<b style="color:orange;">All average abundance columns already exist. Returning original DataFrame.</b>'))
            return df
        
        # If not all columns exist, proceed with calculations
        average_columns = {}
        
        # Calculate all averages and SEMs but store them separately
        for group_number, details in group_data.items():
            grouping_variable = details['grouping_variable']
            abundance_columns = details['abundance_columns']
            
            # Convert abundance columns to numeric
            for col in abundance_columns:
                df[col] = pd.to_numeric(df[col], errors='coerce')
            
            # Define column names
            average_column_name = f"Avg_{grouping_variable}"
        
            average_columns[average_column_name] = df[abundance_columns].mean(axis=1, skipna=True)
        
        # Combine the columns in the desired order (all averages, then all SEMs)
        new_columns = {**average_columns}
        
        # Add new columns to DataFrame
        df = pd.concat([df, pd.DataFrame(new_columns)], axis=1)
        
        if not df.empty:
            display(HTML('<b style="color:green;">Group average columns have been successfully added to the DataFrame.</b>'))
        return df

    
    def process_data(self, group_data):
        """Main method to process all data."""
        if hasattr(self, 'pd_results') and self.pd_results is not None and not self.pd_results.empty:
            try:
                # Extract and process bioactive peptides
                mbpdb_results_cleaned, self.mbpdb_results_grouped = self.extract_bioactive_peptides()
                
                if not hasattr(self, 'pd_results_cleaned') or self.pd_results_cleaned is None:
                    self.pd_results_cleaned = self.pd_results.copy()
                
                # Process PD results and merge with MBPDB
                merged_df_temp = self.process_pd_results(self.mbpdb_results_grouped)
                
                # Calculate abundance averages if group_data exists
                if group_data:
                    with warnings.catch_warnings():
                        warnings.simplefilter("ignore", UserWarning)
                        final_df_temp = self.calculate_group_abundance_averages(merged_df_temp, group_data)
                else:
                    final_df_temp = merged_df_temp
                    display(HTML("<b style='color:orange;'>No group data provided. Skipping abundance calculations.</b>"))
        
                
                # Store the final DataFrame and add protien name and species 
                final_df = self.add_protein_info(final_df_temp)
                self._merged_df = final_df

                return final_df

            except Exception as e:
                display(HTML(f"<b style='color:red;'>Error processing data: {str(e)}</b>"))
                return None
        else:
            display(HTML("<b style='color:red;'>No PD results data available for processing.</b>"))
            return None
            
    def update_data(self, pd_results):
        """Update data and refresh filtered columns"""
        self.pd_results = pd_results
        
        # Only update if we have valid data
        if pd_results is not None and not pd_results.empty:
            self.setup_data()
            
            # Update the dropdown with new filtered columns
            self.column_dropdown.options = self.filtered_columns
            
            with self.output:
                self.output.clear_output()
                display(widgets.HTML('<b style="color:green;">Data updated successfully. Column selection refreshed.</b>'))
        else:
            # Clear options if no data
            self.column_dropdown.options = []
            with self.output:
                self.output.clear_output()
                display(widgets.HTML('<b style="color:orange;">No data available for column selection.</b>'))

In [431]:
class ExportManager:
    """Class to manage all export operations with predefined buttons"""
    
    def __init__(self):
        """Create the status area and the three download buttons (all initially disabled)."""
        # Output area for status messages
        self.status_output = widgets.Output()

        def make_download_button(label, hint):
            """Build one disabled, 300px-wide 'info' button with a download icon."""
            return widgets.Button(
                description=label,
                icon='download',
                button_style='info',
                layout=widgets.Layout(width='300px'),
                tooltip=hint,
                disabled=True
            )

        self.mbpdb_button = make_download_button(
            'Download MBPDB Results',
            'Download the results from searching your peptides against the MBPDB database',
        )
        self.group_data_button = make_download_button(
            'Download Group Definitions',
            'Download the categorical variable definitions used for data grouping and analysis',
        )
        self.dataset_button = make_download_button(
            'Download Merged Dataset',
            'Download the complete merged dataset containing all processed data',
        )

        # Wire each button to its handler
        button_handlers = (
            (self.mbpdb_button, self._handle_mbpdb_download),
            (self.group_data_button, self._handle_group_download),
            (self.dataset_button, self._handle_dataset_download),
        )
        for button, handler in button_handlers:
            button.on_click(handler)

        # Italic description labels (mbpdb_desc / group_desc / dataset_desc) were sketched
        # here at one point but are disabled; only the buttons are shown.

        # Data the handlers export; filled in through update_data
        self.mbpdb_df = None
        self.group_data = None
        self.merged_df = None

        # Stack the buttons vertically
        self.button_container = widgets.VBox(
            [self.mbpdb_button, self.group_data_button, self.dataset_button],
            layout=widgets.Layout(width='310px'),
        )

    def _trigger_download(self, content, filename, mime_type):
        """Helper method to trigger file download"""
        if isinstance(content, str):
            content = content.encode('utf-8')
            
        b64_data = base64.b64encode(content).decode('utf-8')
        file_data = f"data:{mime_type};base64,{b64_data}"
        
        with self.status_output:
            self.status_output.clear_output(wait=True)
            display(HTML(f"""
                <div id="download_{filename}">
                    <a id="download_link_{filename}" 
                       href="{file_data}" 
                       download="{filename}"
                       style="display: none;"></a>
                    <script>
                        document.getElementById('download_link_{filename}').click();
                        setTimeout(() => {{
                            document.getElementById('download_{filename}').remove();
                        }}, 1000);
                    </script>
                </div>
            """))
            display(HTML(f'<div style="color: green">Successfully downloaded {filename}</div>'))

    def _handle_mbpdb_download(self, b):
        """Handle MBPDB results download"""
        try:
            if self.mbpdb_df is not None and 'function' in self.mbpdb_df.columns:
                timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
                filename = f"MBPDB_SEARCH_{timestamp}.tsv"
                content = self.mbpdb_df.to_csv(sep='\t', index=False)
                self._trigger_download(content, filename, 'text/tab-separated-values')
        except Exception as e:
            with self.status_output:
                self.status_output.clear_output(wait=True)
                display(HTML(f'<div style="color: red">Error downloading MBPDB results: {str(e)}</div>'))

    def _handle_group_download(self, b):
        """Handle group data download"""
        try:
            if self.group_data:
                # Convert enumerated format to simplified format
                simplified_data = {}
                for _, group_info in self.group_data.items():
                    group_name = group_info['grouping_variable']
                    abundance_cols = group_info['abundance_columns']
                    simplified_data[group_name] = abundance_cols
                
                # Create filename with timestamp
                timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
                filename = f"Categorical_variable_definitions_{timestamp}.json"
                
                # Convert to JSON with indentation
                content = json.dumps(simplified_data, indent=4)
                
                # Trigger download
                self._trigger_download(content, filename, 'application/json')
        except Exception as e:
            with self.status_output:
                self.status_output.clear_output(wait=True)
                display(HTML(f'<div style="color: red">Error downloading group data: {str(e)}</div>'))

    def _handle_dataset_download(self, b):
        """Handle Merged Dataset download"""
        try:
            if self.merged_df is not None:
                timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
                filename = f"Merged_Dataframe_{timestamp}.csv"
                content = self.merged_df.to_csv(index=False)
                self._trigger_download(content, filename, 'text/csv')
        except Exception as e:
            with self.status_output:
                self.status_output.clear_output(wait=True)
                display(HTML(f'<div style="color: red">Error downloading dataset: {str(e)}</div>'))

    def update_data(self, mbpdb_df=None, group_data=None, merged_df=None):
        """Update data and enable/disable buttons accordingly"""
        self.mbpdb_df = mbpdb_df
        self.group_data = group_data
        self.merged_df = merged_df
        
        # Enable/disable buttons based on data availability
        self.mbpdb_button.disabled = not (mbpdb_df is not None and 'function' in mbpdb_df.columns)
        self.group_data_button.disabled = not bool(group_data)
        self.dataset_button.disabled = merged_df is None

    def display(self):
        """Render the button column followed by the status message area."""
        for element in (self.button_container, self.status_output):
            display(element)

In [432]:
class DataProcessingController:
    def __init__(self, workflow):
        """Create the 'Process Data' button, the output areas and a disabled export panel.

        Args:
            workflow: Object exposing ``data_transformer`` (and, for processing,
                ``group_processor`` and ``protein_handler``).
        """
        self.workflow = workflow
        self.export_manager = ExportManager()
        self.data_transformer = self.workflow.data_transformer
        self.combiner = None
        self.merged_df = None

        # Processing button and its container
        self.process_button = widgets.Button(
            description='Process Data',
            button_style='success',
            icon='refresh',
            tooltip='Click to start data processing'
        )
        self.button_container = widgets.HBox([self.process_button])

        # One independent output area per UI region
        (self.process_output,
         self.export_output,
         self.search_output,
         self.stats_output,
         self.grid_output) = (widgets.Output() for _ in range(5))

        self.process_button.on_click(self._on_process_clicked)

        # Nothing has been processed yet, so every export button starts disabled
        self.export_manager.update_data(None, None, None)
        
    def display_interactive_results(self, df):
        """Display interactive grid with row search functionality"""
        if df is not None:
            # Create search widget
            search_widget = widgets.Text(
                placeholder='Search for data in rows...',
                description='Search:',
                layout=widgets.Layout(width='50%'),
                style={'description_width': 'initial'}
            )
            
            def get_column_category(col):
                """Determine category for each column"""
                if col.startswith('Avg_'):
                    return 'Average Abundance'
                elif col in self.data_transformer.mbpdb_results.columns:
                    return 'MBPDB Search Results'
                else:
                    return 'Peptidomic Data'

            # Create multi-level columns while preserving order
            column_tuples = [(get_column_category(col), col) for col in df.columns]
            
            df_display = df.copy()
            df_display.columns = pd.MultiIndex.from_tuples(column_tuples)
         
            def create_grid(df_to_display):
                grid = DataGrid(
                    df_to_display, 
                    selection_mode='cell', 
                    editable=False,
                    layout=widgets.Layout(height='600px')
                )
                grid.auto_fit_columns = True
                grid.base_row_size = 25
                grid.base_column_size = 150
                grid.auto_fit_params = {'area': 'column', 'padding': 10}
                return grid
            
            def on_search_change(change):
                with self.grid_output:
                    self.grid_output.clear_output()
                    
                    search_term = change['new'].strip()
                    if search_term:
                        str_df = df_display.astype(str)
                        mask = str_df.apply(
                            lambda row: row.str.contains(search_term, case=False, na=False).any(),
                            axis=1
                        )
                        filtered_df = df_display[mask]
                        
                        with self.stats_output:
                            self.stats_output.clear_output()
                            print(f"Found {len(filtered_df)} matching rows out of {len(df_display)} total rows")
                    else:
                        filtered_df = df_display
                        with self.stats_output:
                            self.stats_output.clear_output()
                    
                    display(create_grid(filtered_df))
            
            search_widget.observe(on_search_change, names='value')

            # Display search interface
            with self.search_output:
                self.search_output.clear_output()
                display(search_widget)
            
            # Initialize grid display
            with self.grid_output:
                self.grid_output.clear_output()
                display(create_grid(df_display))
            
        else:
            print("No data to display")

    def _on_process_clicked(self, b):
        # Clear all outputs
        self.process_output.clear_output()
        self.search_output.clear_output()
        self.stats_output.clear_output()
        self.grid_output.clear_output()
        
        with self.process_output:           
            # Pass the actual data_transformer, not the workflow
            self.combiner = CombineAverageDataframes(
                self.workflow.data_transformer,  # Pass the data_transformer directly
                self.workflow.group_processor, 
                self.workflow.protein_handler
            )
            self.merged_df = self.combiner.process_data(self.workflow.group_processor.group_data)
            
            if self.merged_df is not None:
                display(HTML(
                    f'<b style="color:green;">\nData processing completed successfully!</b>'))
 
                # Enable export buttons after successful processing
                self.export_manager.update_data(
                    mbpdb_df=self.workflow.data_transformer.mbpdb_results if hasattr(self.workflow.data_transformer, 'mbpdb_results') else None,
                    group_data=self.workflow.group_processor.group_data,
                    merged_df=self.merged_df
                )
                
                self.display_interactive_results(self.merged_df)
            else:
                print("Error: No data was processed")
                display(HTML(
                    f'<b style="color:red;">Error: No data was processed.</b>'))
                # Keep export buttons disabled
                self.export_manager.update_data(None, None, None)
                     
    def _on_export_clicked(self, b):
            """Handle export button click"""
            with self.export_output:
                self.export_output.clear_output(wait=True)
                
                # Update export manager with current data
                self.export_manager.update_data(
                    mbpdb_df=self.workflow.data_transformer.mbpdb_results if hasattr(self.workflow.data_transformer, 'mbpdb_results') else None,
                    group_data=self.workflow.group_processor.group_data,
                    merged_df=self.merged_df
                )
                
                # Display the export manager
                self.export_manager.display()
        
    def display(self):
        """Render the process button, its log, the export panel, then the search/grid areas."""
        for element in (self.button_container, self.process_output):
            display(element)

        # The export buttons are always shown; they stay disabled until data is processed
        display(HTML("<h3><u>Export Options:</u></h3>"))
        self.export_manager.display()

        for element in (self.search_output, self.stats_output, self.grid_output):
            display(element)

In [433]:
# Build the data-loading / protein-mapping / grouping interface and render it
workflow = ProcessingWorkflow()
workflow.display()

# Attach the processing and export controls to that workflow and render them below it
controller = DataProcessingController(workflow)
controller.display()

HTML(value='\n            <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.1…

VBox(children=(HTML(value='<h3><u>Upload Peptidomic Data Files:</u></h3>'), HBox(children=(HBox(children=(File…

HTML(value='<br>')

HTML(value='<h3><u>Protein Mapping</u></h3>')

RadioButtons(description='Process peptides mapped to multiple proteins?', disabled=True, options=(('Yes', True…

Output()

Output()

HTML(value='<br>')

Output()

Output()

HBox(children=(Button(button_style='success', description='Process Data', icon='refresh', style=ButtonStyle(),…

Output()

VBox(children=(Button(button_style='info', description='Download MBPDB Results', disabled=True, icon='download…

Output()

Output()

Output()

Output()