In [1]:
## Install/Import packages & define key variables and functions
# Run install script
# %chmod +x setup_jupyterlab.sh
# %./setup_jupyterlab.sh

# Import necessary libraries for the script to function.
import pandas as pd
import tempfile, csv, json, re, os, shutil, io, base64, time, subprocess, sqlite3, zipfile, base64
from io import StringIO, BytesIO
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import numpy as np
import matplotlib.gridspec as gridspec
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.patches import Patch

from django.conf import settings
from collections import defaultdict
from datetime import datetime

#import statsmodels.api as sm
#from statsmodels.formula.api import ols
#from statsmodels.stats.multicomp import pairwise_tukeyhsd
import warnings

from functools import partial
import seaborn as sns
from scipy.stats import pearsonr
from itertools import combinations
from ipydatagrid import DataGrid

from IPython.display import display, HTML, clear_output
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual

import traitlets
from traitlets import HasTraits, Instance, observe

# Global variable declaration

# NOTE: this rebinds the name `settings`; from here on it refers to the
# project-local `_settings` module, not the `django.conf.settings` object
# imported earlier in this cell.
import _settings as settings
# `global` at module scope has no effect; the assignment below already
# creates a module-level name.
global spec_translate_list
# Species lookup table passed to find_species(), which indexes each entry as
# [canonical species name, search term, search term, ...].
spec_translate_list = settings.SPEC_TRANSLATE_LIST
# Set the default font to Calibri (currently disabled)
#matplotlib.rcParams['font.family'] = 'Calibri'

def find_species(header, spec_translate_list):
    """Map a FASTA header to its canonical species label.

    Each entry of ``spec_translate_list`` is a sequence whose first item is
    the canonical species name and whose remaining items are search terms.
    Matching is a case-insensitive substring test against ``header``; groups
    and their terms are tried in order and the first hit wins.

    Returns the canonical name of the first matching group, or ``"unknown"``
    when none of the terms occurs in the header.
    """
    haystack = header.lower()
    canonical_hits = (
        group[0]
        for group in spec_translate_list
        for alias in group[1:]  # skip the canonical name itself
        if alias.lower() in haystack
    )
    return next(canonical_hits, "unknown")

def parse_headers():
    """Build a protein lookup table from ``protein_headers.txt``.

    Only lines starting with ``>`` are considered. A header is split on ``|``
    and is used only when it has more than two fields: the second field is
    the protein id, and the third field, cut at the first `` OS=``, is the
    protein name. The species is resolved with ``find_species`` against the
    module-level ``spec_translate_list``.

    Returns a dict mapping protein id to ``{"name": ..., "species": ...}``.
    Headers with an empty id are not stored; a repeated id keeps the values
    of its last occurrence.
    """
    proteins = {}
    with open("protein_headers.txt", 'r') as handle:
        for raw_line in handle:
            header = raw_line.strip()
            if not header.startswith('>'):
                continue
            fields = header[1:].split('|')
            if len(fields) <= 2:
                # Not in the expected db|accession|description layout.
                continue
            accession = fields[1]
            name = re.split(r' OS=', fields[2])[0]
            # Resolve the species from the whole header line.
            species = find_species(header, spec_translate_list)
            if accession:
                proteins[accession] = {
                    "name": name,
                    "species": species
                }
    return proteins


In [2]:
class DataTransformation(HasTraits):
    """Holds uploaded peptidomic data, MBPDB results and the related notebook widgets.

    The DataFrames below are declared as traitlets ``Instance`` traits limited
    to ``pandas.DataFrame``; each of them may also be ``None``.
    """
    # Peptidomic table produced by the peptidomic file upload handler.
    pd_results = Instance(pd.DataFrame, allow_none=True)
    # MBPDB matches, set either by the MBPDB file upload or by the database search.
    mbpdb_results = Instance(pd.DataFrame, allow_none=True)
    #pd_results_cleaned = Instance(pd.DataFrame, allow_none=True)
    # Initialised and reset to an empty frame together with the other results.
    search_results = Instance(pd.DataFrame, allow_none=True)
    

    def __init__(self):
        """Create an empty session and load the default protein header table.

        All result frames start as empty DataFrames and every widget handle
        starts as ``None``; the ``setup_*_ui`` methods create the widgets.
        ``proteins_dic`` is filled from ``parse_headers()``.
        """
        super().__init__()
        self.pd_results = pd.DataFrame()
        #self.pd_results_cleaned = pd.DataFrame()
        self.mbpdb_results = pd.DataFrame()
        self.search_results = pd.DataFrame()
        self.proteins_dic = parse_headers()
        # Widget handles; declared here so every attribute the other methods
        # read exists from construction onwards.
        self.output_area = None
        self.search_output_area = None
        self.mbpdb_uploader = None
        self.pd_uploader = None
        self.fasta_uploader = None
        self.reset_button = None
        self.threshold_dropdown = None
        self.search_button = None
        self.search_widget = None
        self.search_progress = None
             
    def setup_search_ui(self, peptides):
        """Build and display a standalone peptide search panel.

        The panel holds a similarity-threshold dropdown (0-100 in steps of 10,
        default 80), a search button and a progress label. The button is wired
        to ``_on_search_click``.

        Args:
            peptides: Not used by this method; kept so existing callers keep
                working. The click handler extracts the sequences from
                ``self.pd_results`` itself.
        """
        # Create dropdown for similarity threshold
        self.threshold_dropdown = widgets.Dropdown(
            options=list(range(0, 101, 10)),
            value=80,
            description='Similarity Threshold:',
            style={'description_width': 'initial'},
            layout=widgets.Layout(width='300px')
        )

        # Create search button
        self.search_button = widgets.Button(
            description='Search Peptides',
            button_style='primary',
            layout=widgets.Layout(width='200px')
        )

        # Progress indicator
        self.search_progress = widgets.HTML(
            value="",
            layout=widgets.Layout(margin='10px 0px')
        )

        panel_children = [
            widgets.HBox([
                self.threshold_dropdown,
                self.search_button
            ], layout=widgets.Layout(align_items='center')),
            self.search_progress
        ]

        # _on_search_click writes into self.search_output_area. When no other
        # UI has created that output widget yet, create it here and show it in
        # this panel; otherwise the first click would fail on the missing
        # attribute.
        if getattr(self, 'search_output_area', None) is None:
            self.search_output_area = widgets.Output()
            panel_children.append(self.search_output_area)

        # Connect button click to handler (the bound method already accepts
        # the button argument, so no wrapper is needed).
        self.search_button.on_click(self._on_search_click)

        # Create layout
        self.search_widget = widgets.VBox(panel_children)

        display(self.search_widget)

    def _on_search_click(self, b):
        """Run the database search for the uploaded peptides and report progress.

        All messages are rendered inside ``self.search_output_area``. On
        success ``self.mbpdb_results`` receives the formatted matches (or the
        empty result frame when nothing matched); on any exception an error
        message is shown and ``self.mbpdb_results`` is reset to an empty
        DataFrame.

        Args:
            b: The clicked button, as passed by ipywidgets; not used.
        """
        with self.search_output_area:
            clear_output()

            uploaded = self.pd_results
            if uploaded is None or uploaded.empty:
                display(HTML("<b style='color:red'>Please upload peptidomic data first.</b>"))
                return

            try:
                # Pull the distinct peptide sequences out of the upload.
                sequences = self._extract_sequences(uploaded)

                if sequences:
                    display(HTML(f"<b style='color:blue'>Found {len(sequences)} sequences. Searching database...</b>"))

                    hits = self._search_peptides_comprehensive(
                        sequences,
                        similarity_threshold=self.threshold_dropdown.value
                    )

                    if hits.empty:
                        # Nothing matched: keep the (empty) frame as the result.
                        self.mbpdb_results = hits
                        display(HTML("<b style='color:orange'>No matches found in the database.</b>"))
                    else:
                        self.mbpdb_results = self._format_search_results_with_matches(hits)
                        display(HTML(f"<b style='color:green'>Search complete! Found {len(self.mbpdb_results)} matches</b>"))
                else:
                    display(HTML("<b style='color:red'>No valid sequences found in peptidomic data.</b>"))

            except Exception as e:
                display(HTML(f"<b style='color:red'>Error: {str(e)}</b>"))
                self.mbpdb_results = pd.DataFrame()
    
    def _search_peptides_comprehensive(self, peptides, similarity_threshold=100):
        """Look up each peptide in the local MBPDB SQLite database.

        With ``similarity_threshold == 100`` every peptide is matched by exact
        sequence equality in SQL. For any other threshold all database
        peptides are dumped into a FASTA file, indexed with ``makeblastdb``,
        and each query peptide is searched with ``blastp`` (blastp-short task,
        IDENTITY matrix); hits are filtered by ``_process_blast_results``.

        The database (``../../db.sqlite3``) and the scratch area
        (``../../uploads/temp``) are relative paths, so they resolve against
        the current working directory.

        Args:
            peptides: Iterable of peptide sequences to search for.
            similarity_threshold: Minimum similarity in percent; 100 selects
                the exact-match path.

        Returns:
            The combined result DataFrame produced by ``_combine_results``.

        Raises:
            subprocess.CalledProcessError: If a BLAST command exits non-zero.
            Exceptions from sqlite3/pandas or missing BLAST binaries propagate
            unchanged; the database connection is closed in every case.
        """
        WORK_DIRECTORY = '../../uploads/temp'
        exact_match = similarity_threshold == 100
        results = []

        exact_query = """
                SELECT DISTINCT
                    ? as search_peptide,
                    pi.pid as protein_id,
                    p.id as peptide_id,
                    p.peptide,
                    pi.desc as protein_description,
                    pi.species,
                    p.intervals,
                    f.function,
                    r.additional_details,
                    r.ic50,
                    r.inhibition_type,
                    r.inhibited_microorganisms,
                    r.ptm,
                    r.title,
                    r.authors,
                    r.abstract,
                    r.doi,
                    'sequence' as search_type,
                    'IDENTITY' as scoring_matrix
                FROM peptide_peptideinfo p
                JOIN peptide_proteininfo pi ON p.protein_id = pi.id
                LEFT JOIN peptide_function f ON f.pep_id = p.id
                LEFT JOIN peptide_reference r ON r.func_id = f.id
                WHERE p.peptide = ?
                """

        conn = sqlite3.connect('../../db.sqlite3')
        try:
            if not exact_match:
                # The BLAST database is only needed for similarity searches,
                # so the exact-match path skips the dump and makeblastdb.
                work_path = self._create_work_directory(WORK_DIRECTORY)
                fasta_db_path = os.path.join(work_path, "db.fasta")

                db_peptides = pd.read_sql_query(
                    "SELECT p.id, p.peptide FROM peptide_peptideinfo p", conn
                )
                with open(fasta_db_path, 'w') as f:
                    f.writelines(
                        f">{pep_id}\n{sequence}\n"
                        for pep_id, sequence in zip(db_peptides['id'], db_peptides['peptide'])
                    )
                self._make_blast_db(fasta_db_path)

                # Paths and arguments are identical for every query peptide.
                query_path = os.path.join(work_path, "query.fasta")
                output_path = os.path.join(work_path, "blastp_short.out")
                blast_args = [
                    "blastp",
                    "-query", query_path,
                    "-db", fasta_db_path,
                    "-outfmt", "6 std ppos qcovs qlen slen positive",
                    "-evalue", "1000",
                    "-word_size", "2",
                    "-matrix", "IDENTITY",
                    "-threshold", "1",
                    "-task", "blastp-short",
                    "-out", output_path
                ]

            for peptide in peptides:
                if exact_match:
                    # The peptide is bound twice: once for the echoed
                    # search_peptide column and once for the WHERE clause.
                    results.append(
                        pd.read_sql_query(exact_query, conn, params=[peptide, peptide])
                    )
                    continue

                # Run BLASTP search for similarity matching
                with open(query_path, "w") as query_file:
                    query_file.write(f">pep_query\n{peptide}\n")

                subprocess.check_output(blast_args, stderr=subprocess.STDOUT)

                # Alignment details of the hits for this peptide, keyed by
                # subject (database peptide) id.
                extra_info = defaultdict(list)
                search_ids = self._process_blast_results(output_path, similarity_threshold, extra_info)

                if search_ids:
                    df = self._fetch_peptide_data(conn, peptide, search_ids)
                    self._add_blast_details(df, extra_info)
                    results.append(df)
        finally:
            # Release the connection even when a query or BLAST call fails.
            conn.close()

        self._cleanup_work_directory(WORK_DIRECTORY)

        return self._combine_results(results)
    
    def _create_work_directory(self, base_dir):
        """Create a working directory for BLAST operations"""
        path = os.path.join(base_dir, f'work_{int(round(time.time() * 1000))}')
        os.makedirs(path)
        return path
    
    def _make_blast_db(self, library_fasta_path):
        """Index a FASTA file as a protein BLAST database.

        Runs ``makeblastdb -in <path> -dbtype prot`` with stderr folded into
        stdout. The captured output is discarded; a non-zero exit status
        raises ``subprocess.CalledProcessError``.
        """
        command = ['makeblastdb', '-in', library_fasta_path, '-dbtype', 'prot']
        subprocess.check_output(command, stderr=subprocess.STDOUT)
    
    def _process_blast_results(self, output_path, similarity_threshold, extra_info):
        """Process BLAST results and collect search IDs"""
        search_ids = []
        csv.register_dialect('blast_dialect', delimiter='\t')
        
        with open(output_path, "r") as output_file:
            blast_data = csv.DictReader(
                output_file,
                fieldnames=['query', 'subject', 'percid', 'align_len', 'mismatches', 
                           'gaps', 'qstart', 'qend', 'sstart', 'send', 'evalue', 
                           'bitscore', 'ppos', 'qcov', 'qlen', 'slen', 'numpos'],
                dialect='blast_dialect'
            )
            
            for row in blast_data:
                tlen = float(row['slen']) if float(row['slen']) > float(row['qlen']) else float(row['qlen'])
                simcalc = 100 * ((float(row['numpos']) - float(row['gaps'])) / tlen)
                
                if simcalc >= similarity_threshold:
                    search_ids.append(row['subject'])
                    extra_info[row['subject']] = [
                        f"{simcalc:.2f}", row['qstart'], row['qend'], row['sstart'],
                        row['send'], row['evalue'], row['align_len'], row['mismatches'],
                        row['gaps']
                    ]
        
        return search_ids
    
    def _fetch_peptide_data(self, conn, peptide, search_ids):
        """Fetch peptide data from database"""
        placeholders = ','.join(['?' for _ in search_ids])
        query = f"""
        SELECT DISTINCT
            ? as search_peptide,
            pi.pid as protein_id,
            p.id as peptide_id,
            p.peptide,
            pi.desc as protein_description,
            pi.species,
            p.intervals,
            f.function,
            r.additional_details,
            r.ic50,
            r.inhibition_type,
            r.inhibited_microorganisms,
            r.ptm,
            r.title,
            r.authors,
            r.abstract,
            r.doi,
            'sequence' as search_type,
            'IDENTITY' as scoring_matrix
        FROM peptide_peptideinfo p
        JOIN peptide_proteininfo pi ON p.protein_id = pi.id
        LEFT JOIN peptide_function f ON f.pep_id = p.id
        LEFT JOIN peptide_reference r ON r.func_id = f.id
        WHERE p.id IN ({placeholders})
        """
        
        return pd.read_sql_query(query, conn, params=[peptide] + search_ids)
    
    def _add_blast_details(self, df, extra_info):
        """Add BLAST details to DataFrame"""
        for idx, row in df.iterrows():
            if str(row['peptide_id']) in extra_info:
                blast_details = extra_info[str(row['peptide_id'])]
                df.at[idx, '% Alignment'] = blast_details[0]
                df.at[idx, 'Query start'] = blast_details[1]
                df.at[idx, 'Query end'] = blast_details[2]
                df.at[idx, 'Subject start'] = blast_details[3]
                df.at[idx, 'Subject end'] = blast_details[4]
                df.at[idx, 'e-value'] = blast_details[5]
                df.at[idx, 'Alignment length'] = blast_details[6]
                df.at[idx, 'Mismatches'] = blast_details[7]
                df.at[idx, 'Gap opens'] = blast_details[8]
    
    def _cleanup_work_directory(self, work_directory):
        """Clean up old work directories"""
        try:
            dirs = [f for f in os.scandir(work_directory) if f.is_dir()]
            dirs.sort(key=lambda x: os.path.getmtime(x.path), reverse=True)
            
            for dir_entry in dirs[25:]:
                try:
                    shutil.rmtree(dir_entry.path)
                except Exception:
                    pass
        except Exception:
            pass
    
    def _combine_results(self, results):
        """Combine and format final results"""
        if not results:
            mbpdb_columns = [
                'search_peptide', 'protein_id', 'peptide', 'protein_description',
                'species', 'intervals', 'function', 'additional_details', 'ic50',
                'inhibition_type', 'inhibited_microorganisms', 'ptm', 'title',
                'authors', 'abstract', 'doi', 'search_type', 'scoring_matrix'
            ]
            return pd.DataFrame(columns=mbpdb_columns)
        
        final_results = pd.concat(results, ignore_index=True)
        
        if 'peptide_id' in final_results.columns:
            final_results = final_results.drop('peptide_id', axis=1)
            
        sort_columns = ['search_peptide']
        if '% Alignment' in final_results.columns:
            sort_columns.append('% Alignment')
            
        return final_results.sort_values(
            sort_columns,
            ascending=[True] + [False] * (len(sort_columns) - 1)
        )
    
    def _format_search_results_with_matches(self, final_results):
        """Format search results with matches"""
        if '% Alignment' in final_results.columns:
            final_results['% Alignment'] = pd.to_numeric(
                final_results['% Alignment'], 
                errors='coerce'
            )

        grouped = final_results.groupby(["search_peptide", "function"], as_index=False)
        aggregated_results = []
        processed_indices = set()

        for _, group in grouped:
            if len(group) > 1:
                aggregated_row = self._aggregate_group_data(group)
                aggregated_results.append(aggregated_row)
                processed_indices.update(group.index)

        remaining_rows = final_results.loc[~final_results.index.isin(processed_indices)]
        aggregated_df = pd.DataFrame(aggregated_results)
        
        return pd.concat([aggregated_df, remaining_rows], ignore_index=True)
    
    def _aggregate_group_data(self, group):
        """Aggregate data for a group of results"""
        def enumerate_field(field):
            if field in group.columns and not group[field].dropna().empty:
                valid_values = set(group[field].dropna().astype(str).str.strip())
                valid_values = {val for val in valid_values if val != ''}
                if len(valid_values) > 1:
                    return "; ".join([f"{i+1}) {val}" for i, val in enumerate(valid_values)])
                elif len(valid_values) == 1:
                    return next(iter(valid_values))
                return ''
            return ''

        return {col: enumerate_field(col) for col in group.columns}   
                
    def setup_data_loading_ui(self):
        """Initialize and display the data loading UI with integrated search and help tooltips.

        Creates the peptidomic, MBPDB and FASTA upload widgets, the
        similarity-threshold dropdown with its search button, the reset
        button and the two output areas, wires their callbacks and displays
        the assembled panel. Calling the method again builds a fresh set of
        widgets.

        This is the single definition of the method; the class previously
        defined it twice, and the earlier, shadowed copy passed the result of
        calling ``_on_search_click()`` to ``on_click`` instead of the handler
        itself.
        """

        def create_help_icon(tooltip_text):
            """Return an HTML widget showing a help icon with a hover tooltip."""
            icon_markup = '<i class="fa fa-question-circle" style="color: #007bff;"></i>'
            return widgets.HTML(
                f'<div title="{tooltip_text}" style="display: inline-block;">{icon_markup}</div>'
            )

        def create_labeled_uploader(widget, label, tooltip):
            """Place a widget next to its help icon.

            ``label`` is accepted for readability at the call sites but is
            not rendered; the widget's own description serves as the label.
            """
            return widgets.HBox([
                widget,
                create_help_icon(tooltip)
            ], layout=widgets.Layout(align_items='center'))

        # Create file upload widgets with the same configurations
        self.mbpdb_uploader = widgets.FileUpload(
            accept='.csv,.txt,.tsv,.xlsx',
            multiple=False,
            description='Upload MBPDB File',
            layout=widgets.Layout(width='300px'),
            style={'description_width': 'initial'}
        )

        self.pd_uploader = widgets.FileUpload(
            accept='.csv,.txt,.tsv,.xlsx',
            multiple=False,
            description='Upload Peptidomic File',
            layout=widgets.Layout(width='300px'),
            style={'description_width': 'initial'}
        )

        self.fasta_uploader = widgets.FileUpload(
            accept='.fasta',
            multiple=True,
            description='Upload FASTA Files',
            layout=widgets.Layout(width='300px'),
            style={'description_width': 'initial'}
        )

        # Create search interface
        self.threshold_dropdown = widgets.Dropdown(
            options=list(range(0, 101, 10)),
            value=80,
            description='Similarity Threshold (%):',
            style={'description_width': 'initial'},
            layout=widgets.Layout(width='225px')
        )

        self.search_button = widgets.Button(
            description='Search Database',
            button_style='primary',
            layout=widgets.Layout(width='150px')
        )

        # Reset button
        self.reset_button = widgets.Button(
            description='Reset',
            button_style='warning',
            layout=widgets.Layout(width='100px')
        )

        # Create output areas
        self.output_area = widgets.Output()
        self.search_output_area = widgets.Output()

        # Create the MBPDB section with side-by-side options
        mbpdb_options = widgets.HBox([
            widgets.VBox([
                widgets.HTML("<div style='font-weight: bold; margin-bottom: 5px;'>Option 1: Upload File</div>"),
                create_labeled_uploader(
                    self.mbpdb_uploader,
                    "MBPDB File",
                    "Upload your own MBPDB file (optional)"
                )
            ]),
            widgets.HTML("<div style='margin: 0 20px; line-height: 100px;'><b>OR</b></div>"),
            widgets.VBox([
                widgets.HTML("<div style='font-weight: bold; margin-bottom: 5px;'>Option 2: Search Database</div>"),
                widgets.HBox([
                    self.threshold_dropdown,
                    self.search_button,
                    create_help_icon("Search peptides against the MBPDB (optional)")
                ], layout=widgets.Layout(align_items='center'))
            ])
        ], layout=widgets.Layout(align_items='center', margin='0'))

        # Create main container
        main_container = widgets.VBox([
            widgets.HTML("<h3><u>Upload Peptidomic Data Files:</u></h3>"),
            create_labeled_uploader(
                self.pd_uploader,
                "Peptidomic File",
                "Upload peptide groups data from Proteome Discover export file (required)"
            ),
            widgets.HTML("<h3 style='margin-bottom: 0;'><u>MBPDB Data (Optional):</u></h3>"),
            mbpdb_options,
            widgets.HTML("<h3><u>Upload Protein FASTA Files (Optional):</u></h3>"),
            create_labeled_uploader(
                self.fasta_uploader,
                "FASTA Files",
                "Upload Protein FASTA file used in Proteome Discoverer Search (optional - This helps label proteins in data transformation)"
            ),
            widgets.HTML("<br>"),
            widgets.HBox([
                self.reset_button,
                create_help_icon("Reset all uploaded files")
            ], layout=widgets.Layout(align_items='center')),
            widgets.HTML("<div style='margin-top: 10px;'></div>"),  # Spacing
            self.output_area,
            self.search_output_area
        ])

        # Register observers; handlers are passed as bound methods, never called here.
        self.pd_uploader.observe(self._on_pd_upload_change, names='value')
        self.mbpdb_uploader.observe(self._on_mbpdb_upload_change, names='value')
        self.fasta_uploader.observe(self._on_fasta_upload_change, names='value')
        self.reset_button.on_click(self._reset_ui)
        self.search_button.on_click(self._on_search_click)

        # Add Font Awesome CSS for help icons
        display(widgets.HTML("""
            <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.15.4/css/all.min.css">
        """))
        display(main_container)

    
    def _extract_sequences(self, df):
        """Extract sequences from peptidomic data"""
        if 'Sequence' not in df.columns:
            if 'Annotated Sequence' in df.columns:
                sequences = df['Annotated Sequence'].str.split('.', expand=False).str[1]
                df = df.assign(Sequence=sequences)
            elif 'Positions in Proteins' in df.columns:  # Add any other potential column names
                df['Sequence'] = df['Positions in Proteins']
        return df['Sequence'].dropna().unique().tolist()
    
    
    def _reset_ui(self, b):
        """Clear every upload and all loaded results.

        Resets the ``_counter`` attribute and the value of the three upload
        widgets, replaces the result frames with empty DataFrames, reloads
        the protein table via ``parse_headers()`` and prints a confirmation
        in both output areas.

        Args:
            b: The clicked button, as passed by ipywidgets; not used.
        """
        uploaders = (self.mbpdb_uploader, self.pd_uploader, self.fasta_uploader)
        # Counters first, then the values, in the same widget order.
        for uploader in uploaders:
            uploader._counter = 0
        for uploader in uploaders:
            uploader.value = ()

        empty = pd.DataFrame
        self.pd_results = empty()
        self.mbpdb_results = empty()
        self.search_results = empty()
        self.proteins_dic = parse_headers()

        uploads_notice = '<b style="color:blue;">All uploads cleared.</b>'
        with self.output_area:
            clear_output()
            display(HTML(uploads_notice))

        search_notice = '<b style="color:blue;">Search results cleared.</b>'
        with self.search_output_area:
            clear_output()
            display(HTML(search_notice))
            
    def _on_pd_upload_change(self, change):
        if change['type'] == 'change' and change['name'] == 'value':
            with self.output_area:
                self.output_area.clear_output()
                if change['new'] and len(change['new']) > 0:
                    file_data = change['new'][0]
                    self.pd_results, pd_status = self._load_data(
                        file_data,
                        required_columns=['Positions in Proteins'],
                        file_type='Peptidomic'
                    )
                    if pd_status == 'yes' and self.pd_results is not None:
                        display(HTML(f'<b style="color:green;">Peptidomic data imported with {self.pd_results.shape[0]} rows and {self.pd_results.shape[1]} columns.</b>'))

    def _on_mbpdb_upload_change(self, change):
        if change['type'] == 'change' and change['name'] == 'value':
            with self.output_area:
                self.output_area.clear_output()
                if change['new'] and len(change['new']) > 0:
                    file_data = change['new'][0]
                    self.mbpdb_results, mbpdb_status = self._load_data(
                        file_data,
                        required_columns=['Search peptide', 'Protein ID', 'Peptide'],
                        file_type='MBPDB'
                    )
                    if mbpdb_status == 'yes' and self.mbpdb_results is not None:
                        self.mbpdb_results.rename(columns={
                            'Search peptide': 'search_peptide',
                            'Protein ID': 'protein_id',
                            'Peptide': 'peptide',
                            'Protein description': 'protein_description',
                            'Species': 'species',
                            'Intervals': 'intervals',
                            'Function': 'function',
                            'Additional details': 'additional_details',
                            'IC50 (μM)': 'ic50',
                            'Inhibition type': 'inhibition_type',
                            'Inhibited microorganisms': 'inhibited_microorganisms',
                            'PTM': 'ptm',
                            'Title': 'title',
                            'Authors': 'authors',
                            'Abstract': 'abstract',
                            'DOI': 'doi',
                            'Search type': 'search_type',
                            'Scoring matrix': 'scoring_matrix',
                            }, inplace=True)
                        display(HTML(f'<b style="color:green;">MBPDB file imported with {self.mbpdb_results.shape[0]} rows and {self.mbpdb_results.shape[1]} columns</b>'))
    
    def _on_fasta_upload_change(self, change):
        
        if change['type'] == 'change' and change['name'] == 'value':
            self.proteins_dic = {}
            with self.output_area:
                self.output_area.clear_output()
                if change['new'] and len(change['new']) > 0:
                    for file_data in change['new']:
                        try:
                            file_name = getattr(file_data, 'name', None)
                            if file_name and file_name.endswith('.fasta'):
                                new_proteins = self._parse_uploaded_fasta(file_data)
                                self.proteins_dic.update(new_proteins)
                                display(HTML(f'<b style="color:green;">Successfully imported FASTA file: {file_name} ({len(new_proteins)} proteins)</b>'))
                        except Exception as e:
                            display(HTML(f'<b style="color:red;">Error processing FASTA file: {str(e)}</b>'))       
    def _load_data(self, file_obj, required_columns, file_type):
        """
        Load and validate an uploaded data file.

        The file is parsed according to its extension (.csv, tab-delimited
        .txt/.tsv, or .xlsx). Column labels are converted to stripped strings,
        rows that are entirely missing or entirely blank are dropped, and the
        required columns are checked for presence and for containing at least
        one non-blank value. Status and error messages are rendered with
        ``display`` into whatever output context is active.

        Args:
            file_obj: Uploaded file object exposing ``content`` and ``name``.
            required_columns (list): Column names that must be present. Not
                consulted for 'MBPDB' files, which use a fixed set of columns
                accepted under either their original or snake_case name.
            file_type (str): Type of file being loaded ('MBPDB' or 'Peptidomic');
                also used as the prefix of displayed messages.

        Returns:
            tuple: (DataFrame or None, status string 'yes'/'no')
        """
        try:
            content = file_obj.content
            filename = file_obj.name
            extension = filename.split('.')[-1].lower()

            file_stream = io.BytesIO(content)

            # Load data based on file extension
            if extension == 'csv':
                df = pd.read_csv(file_stream)
            elif extension in ('txt', 'tsv'):
                df = pd.read_csv(file_stream, delimiter='\t')
            elif extension == 'xlsx':
                df = pd.read_excel(file_stream)
            else:
                raise ValueError("Unsupported file format. Please upload .csv, .txt, .tsv, or .xlsx files.")

            # Clean column names. Labels are stringified first so numeric
            # headers (e.g. from Excel) do not break the strip or turn into NaN.
            df.columns = [str(col).strip() for col in df.columns]

            # Drop empty rows
            df = df.dropna(how='all')
            df = df[~(df.astype(str).apply(lambda x: x.str.strip().eq('')).all(axis=1))]

            # Build (label shown to the user, column to inspect or None) pairs.
            if file_type == 'MBPDB':
                # MBPDB columns are accepted under either naming scheme.
                column_pairs = {
                    'Search peptide': 'search_peptide',
                    'Protein ID': 'protein_id',
                    'Peptide': 'peptide'
                }
                checks = []
                for orig_col, std_col in column_pairs.items():
                    if orig_col in df.columns:
                        present = orig_col
                    elif std_col in df.columns:
                        present = std_col
                    else:
                        present = None
                    checks.append((f"'{orig_col}' or '{std_col}'", present))
            else:
                checks = [
                    (col, col if col in df.columns else None)
                    for col in required_columns
                ]

            # Check for required columns (reported in declaration order)
            missing = [label for label, col in checks if col is None]
            if missing:
                display(HTML(f'<b style="color:red;">{file_type} File Error: Missing required columns: {", ".join(missing)}</b>'))
                return None, 'no'

            # Validate non-empty required columns
            empty = [
                label for label, col in checks
                if df[col].isna().all() or (df[col].astype(str).str.strip() == '').all()
            ]
            if empty:
                display(HTML(f'<b style="color:red;">{file_type} File Error: Required columns are empty: {", ".join(empty)}</b>'))
                return None, 'no'

            # Show success message
            display(HTML(f'<b style="color:green;">{file_type} file loaded successfully with {len(df)} rows after cleaning.</b>'))

            return df, 'yes'

        except Exception as e:
            # Any parsing or validation failure is reported to the user
            # rather than propagated to the widget callback.
            display(HTML(f'<b style="color:red;">{file_type} File Error: {str(e)}</b>'))
            return None, 'no'
    
    def _parse_uploaded_fasta(self, file_data):
        """Parse uploaded FASTA file content"""
        fasta_dict = {}
        fasta_text = bytes(file_data.content).decode('utf-8')
        lines = fasta_text.split('\n')
        
        protein_id = ""
        protein_name = ""
        sequence = ""
        species = ""
        
        for line in lines:
            line = line.strip()
            if line.startswith('>'):
                if protein_id:
                    fasta_dict[protein_id] = {
                        "name": protein_name,
                        "sequence": sequence,
                        "species": species
                    }
                sequence = ""
                header_parts = line[1:].split('|')
                if len(header_parts) > 2:
                    protein_id = header_parts[1]
                    protein_name_full = re.split(r' OS=', header_parts[2])[0]
                    if ' ' in protein_name_full:
                        protein_name = protein_name_full
                    else:
                        protein_name = protein_name_full
                    species = self._find_species(line)
            else:
                sequence += line
                
        if protein_id:
            fasta_dict[protein_id] = {
                "name": protein_name,
                "sequence": sequence,
                "species": species
            }
        
        return fasta_dict

    def _find_species(self, header):
        """Return the species name whose search terms occur in a FASTA header.

        Each entry of the module-level ``spec_translate_list`` is read as
        ``[species_name, term, term, ...]``. Matching is a case-insensitive
        substring test, and the first entry with a matching term wins.

        Args:
            header: FASTA header line.

        Returns:
            str: The first element of the matching entry, or "unknown".
        """
        lowered_header = header.lower()
        return next(
            (
                group[0]
                for group in spec_translate_list
                if any(alias.lower() in lowered_header for alias in group[1:])
            ),
            "unknown",
        )

    # Then to use it, we can create an observe function:
    def observe_data_changes(change):
        """Forward refreshed data to the globally defined downstream objects.

        When ``change`` has a ``new`` attribute, the module-level ``combiner``
        and ``setup_data`` objects are updated from the module-level
        ``data_transformer``.

        NOTE(review): this function sits in the class body but takes no
        ``self``, so it cannot be called as a bound method; it looks like a
        leftover module-level helper. Confirm whether it is still needed.
        """
        if not hasattr(change, 'new'):
            return
        combiner.update_data(data_transformer.pd_results, data_transformer.mbpdb_results)
        setup_data.update_data(data_transformer.pd_results)
    
        
    
    # Add this to DataTransformation class:
    def attach_observers(self, group_processor):
        """
        Register an observer that forwards ``pd_results`` changes.

        Whenever a change named 'pd_results' is reported, the current value
        of ``self.pd_results`` is passed to ``group_processor.update_data``.

        Args:
            group_processor: Instance of GroupProcessing class
        """
        watched_names = ['pd_results']

        def _forward_pd_results(change):
            if change.name not in watched_names:
                return
            group_processor.update_data(self.pd_results)

        self.observe(_forward_pd_results, names=watched_names)

In [3]:
class GroupProcessing:
    """Widget-based helper for assigning data columns to named groups.

    Groups are stored in ``group_data`` as
    ``{group_number: {'grouping_variable': name, 'abundance_columns': [...]}}``.
    They can be created interactively from the columns of ``pd_results`` or
    loaded from a JSON file with the same structure.
    """

    def __init__(self):
        """Create the widgets and wire up their callbacks."""
        self.group_data = {}
        self.group_number = 1
        self.filtered_columns = []
        # Set through update_data(); initialised so setup_data() is safe to
        # call before any data has arrived.
        self.pd_results = None
        self.group_uploader = widgets.FileUpload(
            accept='.json',
            multiple=False,
            description='Upload Groups File',
            layout=widgets.Layout(width='300px'),
            style={'description_width': 'initial'}
        )
        self.group_uploader.observe(self._on_group_upload_change, names='value')

        # Initialize output areas
        self.output = widgets.Output()
        self.gd_output_area = widgets.Output()

        # Initialize widgets for group selection
        self.column_dropdown = widgets.SelectMultiple(
            description='Absorbance',
            style={'description_width': 'initial'},
            disabled=False,
            layout=widgets.Layout(width='90%', height='300px')
        )

        self.grouping_variable_text = widgets.Text(
            description='Group Name',
            layout=widgets.Layout(width='90%'),
            style={'description_width': 'initial'}
        )

        # Initialize buttons
        self.search_button = widgets.Button(
            description='Search',
            button_style='info',
            layout=widgets.Layout(margin='10px 10px 0 0')
        )

        self.add_group_button = widgets.Button(
            description='Add Group',
            button_style='success',
            layout=widgets.Layout(margin='10px 10px 0 0')
        )

        self.reset_file_button = widgets.Button(
            description='Reset Selection',
            button_style='warning',
            layout=widgets.Layout(margin='10px 10px 0 75px')
        )

        # Set up button callbacks
        self.search_button.on_click(self._search_columns)
        self.add_group_button.on_click(self._add_group)
        self.reset_file_button.on_click(self._reset_selection)

    def update_data(self, pd_results):
        """Store new data and refresh the selectable columns.

        The success message is shown only when ``setup_data`` actually found
        usable data, so its "No valid data" error is no longer overwritten.

        Args:
            pd_results: DataFrame with the peptidomic data, or None.
        """
        self.pd_results = pd_results

        # Only update if we have valid data
        if pd_results is None:
            return

        if self.setup_data():
            with self.output:
                self.output.clear_output()
                display(widgets.HTML('<b style="color:green;">Data updated successfully. Column selection refreshed.</b>'))

    def setup_data(self):
        """Rebuild ``filtered_columns`` and the dropdown options from the data.

        A column is offered for selection unless its name contains
        (case-insensitively) one of the exclusion patterns below.

        Returns:
            bool: True when data was available and the options were
            refreshed, False when no usable data was found.
        """
        # NOTE(review): matching is by substring, so short patterns such as
        # 'PEP' or 'Count' also exclude any column that merely contains them.
        columns_to_exclude = [
            'Marked as', 'Number of Missed Cleavages', 'Missed Cleavages',
            'Checked', 'Confidence', 'Annotated Sequence', 'Unnamed: 3',
            'Modifications', 'Protein Groups', 'Proteins', 'PSMs',
            'Master Protein Accessions', 'Positions in Proteins',
            'Modifications in Proteins',
            'Theo MHplus in Da', 'Quan Info',
            'Confidence by Search Engine',
            'q-Value by Search Engine',
            'PEP by Search Engine',
            'SVM Score by Search Engine',
            'XCorr by Search Engine',
            'PEP', 'q-Value', 'Top Apex RT', 'RT in min',
            'Sequence', 'search_peptide', 'Peptide', 'protein_id',
            'protein_description', 'Alignment', 'Species',
            'Intervals', 'function', 'unique ID'
        ]

        exclude_substrings = [
            'Abundances by Bio Rep',
            'Count',
            'Origin',
            'Average_Abundance',
            'Avg_',
            'SEM_'
        ]

        # Use cleaned data if available, otherwise use original. getattr keeps
        # this safe when either attribute is unset or explicitly None.
        cleaned = getattr(self, 'pd_results_cleaned', None)
        if cleaned is not None and not cleaned.empty:
            df = cleaned
        else:
            df = getattr(self, 'pd_results', None)

        if df is None or df.empty:
            self.filtered_columns = []
            self.column_dropdown.options = []
            with self.output:
                self.output.clear_output()
                display(widgets.HTML('<b style="color:red;">No valid data available for processing.</b>'))
            return False

        blocked = [pattern.lower() for pattern in columns_to_exclude + exclude_substrings]
        # str() guards against non-string column labels (e.g. numeric headers).
        self.filtered_columns = [
            col for col in df.columns
            if not any(pattern in str(col).lower() for pattern in blocked)
        ]

        # Update dropdown options
        self.column_dropdown.options = self.filtered_columns
        self._reset_inputs()
        return True

    def display_group_selector(self):
        """Display the JSON file selector for group dictionaries"""
        display(widgets.HTML("<h3><u>Upload Existing Group Dictionary:</u></h3>"))
        display(self.group_uploader, self.gd_output_area)

    def display_widgets(self):
        """Display the main UI for group selection"""
        # Create main grid container
        grid = widgets.GridspecLayout(1, 2,  # Number of rows and columns
            width='1000px',
            grid_gap='5px',  # Adjust spacing between grid elements
        )

        # Create input container with vertical scroll
        input_container = widgets.VBox([
            widgets.HTML("<h3><u>Select New Grouping of Data:</u></h3>"),
            widgets.HTML('Now select the <b>absorbance columns</b> and assign the name of the <b>grouping variable</b>:'),
            self.column_dropdown,
            self.grouping_variable_text,
            # Create button layouts
            widgets.HBox([self.search_button, self.add_group_button]),
            widgets.HBox([self.reset_file_button])
        ], layout=widgets.Layout(
            width='95%',
            height='600px',
            overflow_y='auto'  # Add vertical scroll
        ))

        # Create output container with vertical scroll
        output_container = widgets.VBox([
            widgets.HTML("<h3><u>Group Selection Results:</u></h3>"),
            self.output
        ], layout=widgets.Layout(
            width='95%',
            height='600px',
            overflow_y='auto',  # Add vertical scroll
            padding='10px'
        ))

        # Add to grid
        grid[0, 0] = input_container  # Left column
        grid[0, 1] = output_container  # Right column

        display(grid)

    @staticmethod
    def _parse_group_file(data):
        """Validate a decoded group dictionary before any state is changed.

        Args:
            data: Object decoded from a group JSON file.

        Returns:
            dict: ``{group_number: {'grouping_variable', 'abundance_columns'}}``.

        Raises:
            ValueError: If ``data`` or one of its entries does not have the
                expected structure.
        """
        if not isinstance(data, dict):
            raise ValueError("Group file must contain a JSON object keyed by group number.")

        groups = {}
        for group_number, group_info in data.items():
            if not isinstance(group_info, dict):
                raise ValueError(f"Group {group_number} is not a JSON object.")
            selected_columns = group_info.get('abundance_columns')
            if not isinstance(selected_columns, (list, tuple)):
                raise ValueError(f"Group {group_number} has no 'abundance_columns' list.")
            groups[group_number] = {
                'grouping_variable': group_info.get('grouping_variable'),
                'abundance_columns': selected_columns
            }
        return groups

    @staticmethod
    def _next_group_number(group_data):
        """Return the next free group number as a string.

        Keys that cannot be read as integers (possible after loading a group
        file) are ignored instead of raising.
        """
        numbers = []
        for key in group_data or ():
            try:
                numbers.append(int(key))
            except (TypeError, ValueError):
                continue
        return str(max(numbers) + 1) if numbers else "1"

    def _display_group(self, group_number, group_name, selected_columns):
        """Render the summary of one group into the active output context."""
        display(widgets.HTML(
            f"<b>Group {group_number}</b> created with <b>{len(selected_columns)} columns assigned</b>."
        ))
        display(widgets.HTML(f"<b>Grouping Variable:</b> {group_name}"))
        display(widgets.HTML(f"<b>Selected Columns:</b> {', '.join(map(str, selected_columns))}"))
        display(widgets.HTML("<hr style='border: 1px solid black;'>"))

    def _register_groups(self, groups):
        """Store validated groups in ``group_data`` and display each one."""
        for group_number, group_info in groups.items():
            self.group_data[group_number] = group_info
            self._display_group(
                group_number,
                group_info['grouping_variable'],
                group_info['abundance_columns']
            )

    def _on_gd_submit(self, b, dropdown):
        """Load a group dictionary from the JSON file chosen in ``dropdown``.

        Existing groups are replaced, but only after the file has been read
        and validated, so a bad file leaves the current groups untouched.

        Args:
            b: Button that triggered the callback (unused).
            dropdown: Widget whose ``value`` is the path of the JSON file.
        """
        selected_file = dropdown.value
        with self.gd_output_area:
            clear_output()

            if selected_file == 'Select an existing grouping dictionary file':
                print("Please select a valid file.")
                return

            try:
                # Load and process JSON file
                with open(selected_file, 'r', encoding='utf-8') as file:
                    data = json.load(file)
                groups = self._parse_group_file(data)
                self.group_data = {}

                # Process groups
                with self.output:
                    clear_output()
                    self._register_groups(groups)

                display(widgets.HTML(f'<b style="color:green;">Successfully uploaded: {selected_file}</b>'))

            except Exception as e:
                display(widgets.HTML(f"<b style='color:red;'>An error occurred while processing the file: {str(e)}</b>"))

    def _search_columns(self, b):
        """Select every available column whose name contains the group name"""
        group_name = self.grouping_variable_text.value
        if group_name:
            # str() guards against non-string column labels.
            matching_columns = [col for col in self.filtered_columns if group_name in str(col)]
            self.column_dropdown.value = matching_columns
        else:
            with self.output:
                clear_output()
                display(widgets.HTML('<b style="color:red;">Please enter a group name to search.</b>'))

    def _add_group(self, b):
        """Add a new group built from the current name and column selection"""
        group_name = self.grouping_variable_text.value
        selected_columns = list(self.column_dropdown.value)

        if not (group_name and selected_columns):
            with self.output:
                display(widgets.HTML('<b style="color:red;">Please enter a group name and select at least one column.</b>'))
            return

        # Continue after the highest existing number, or start at "1".
        if not self.group_data:
            self.group_data = {}
        self.group_number = self._next_group_number(self.group_data)

        # Add new group data to the dictionary
        self.group_data[self.group_number] = {
            'grouping_variable': group_name,
            'abundance_columns': selected_columns
        }

        # Display output
        with self.output:
            self._display_group(self.group_number, group_name, selected_columns)

        self._reset_inputs()

    def _reset_selection(self, b):
        """Reset all selections and data"""
        self.group_data = {}
        self.group_number = 1
        with self.gd_output_area:
            clear_output()
        with self.output:
            clear_output()
        self._reset_inputs()

    def _reset_inputs(self):
        """Reset input fields"""
        self.grouping_variable_text.value = ''
        self.column_dropdown.value = ()

    def _on_group_upload_change(self, change):
        """Merge the groups of an uploaded JSON file into ``group_data``.

        Existing groups are kept; uploaded entries with the same key replace
        them. The file is validated before anything is stored.

        Args:
            change: Change dictionary with 'type', 'name' and 'new' entries.
        """
        if not (change['type'] == 'change' and change['name'] == 'value'):
            return

        with self.gd_output_area:
            uploads = change['new']
            if not (uploads and len(uploads) > 0):
                return

            file_data = uploads[0]
            try:
                content = bytes(file_data.content).decode('utf-8')
                groups = self._parse_group_file(json.loads(content))

                # Process groups without clearing previous entries
                with self.output:
                    self._register_groups(groups)

                display(widgets.HTML(f'<b style="color:green;">Successfully uploaded: {file_data.name}</b>'))

            except Exception as e:
                display(widgets.HTML(f"<b style='color:red;'>An error occurred while processing the file: {str(e)}</b>"))


In [4]:
class ProteinCombinationHandler(HasTraits):
    def __init__(self, data_transformer):
        """Set up the handler from an existing data transformer.

        ``pd_results`` and ``proteins_dic`` are read from ``data_transformer``
        once, at construction time. No observers are registered on the
        transformer here; per the original note, the surrounding workflow is
        responsible for pushing updates.

        Args:
            data_transformer: Object exposing ``pd_results`` and
                ``proteins_dic``.
        """
        super().__init__()

        # Data source and the values it held when this handler was created.
        self.data_transformer = data_transformer
        self.pd_results = data_transformer.pd_results
        self.proteins_dic = data_transformer.proteins_dic

        # Nothing has been processed or displayed yet.
        self.pd_results_cleaned = None
        self.protein_output_area = None

        # Bookkeeping for the interactive decision table.
        self.user_decisions = {}
        self.decision_inputs = []
        self.multi_position_combinations = []

    # Remove these methods as they're no longer needed
    # def _handle_pd_results_change(self, change):
    # def _handle_proteins_dic_change(self, change):
            
    def handle_combinations(self):
        """Ask whether peptides mapped to multiple proteins should be processed.

        Displays a Yes/No radio button and an output area. Choosing "Yes"
        stores the result of ``process_protein_combinations`` in
        ``self.pd_results_cleaned``; choosing "No" stores a copy of
        ``self.pd_results``.

        Returns:
            None when there is no data. Otherwise the value of
            ``self.pd_results_cleaned`` at the moment the widgets are
            displayed, i.e. before the user has made a choice in this call.
        """
        current = self.pd_results
        if current is None or current.empty:
            return None

        result_area = widgets.Output()
        choice = widgets.RadioButtons(
            options=[('Yes', True), ('No', False)],
            description='Process peptides mapped to multiple proteins?',
            style={'description_width': 'initial'},
            value=None
        )

        def process_choice(_):
            # Runs on every change of the radio button value.
            with result_area:
                clear_output()
                if not choice.value:
                    self.pd_results_cleaned = self.pd_results.copy()
                    display(HTML("<b>Using original protein mappings.</b>"))
                    return
                self.pd_results_cleaned = self.process_protein_combinations()
                display(HTML("<b style='color:green;'>Processed peptides mapped to multiple proteins.</b>"))

        choice.observe(process_choice, 'value')
        display(choice)
        display(result_area)

        return self.pd_results_cleaned
            
    def _get_protein_combinations(self):
       """Extract unique protein combinations from the dataset"""
       protein_combinations = set()
       
       for _, row in self.pd_results.iterrows():
           if pd.isna(row['Positions in Proteins']) or pd.isna(row['Master Protein Accessions']):
               continue
               
           position_proteins = [p.split()[0] for p in row['Positions in Proteins'].split('; ')]
           master_acc = row['Master Protein Accessions']
           
           # Check species of proteins in Positions in Proteins
           species_set = set()
           for protein in position_proteins:
               if protein in self.proteins_dic:
                   species_set.add(self.proteins_dic[protein]['species'])
           
           # Add to combinations if:
           # 1. Multiple proteins in master accession, OR
           # 2. Multiple proteins in Positions in Proteins, OR 
           # 3. Single protein in master accession but proteins in positions are from different species
           if ';' in master_acc or ';' in row['Positions in Proteins'] or len(species_set) > 1:
               protein_combinations.add('; '.join(sorted(position_proteins)))
       
       self.multi_position_combinations = list(protein_combinations)
       return self.multi_position_combinations
            
    def _count_combination_occurrences(self, df, proteins):
        """Count occurrences of a specific protein combination"""
        count = 0
        for _, row in df.iterrows():
            if pd.isna(row['Positions in Proteins']):
                continue
                
            row_proteins = set(p.split()[0] for p in row['Positions in Proteins'].split('; '))
            if row_proteins == set(proteins):
                count += 1
        return count 
        
    def _get_default_decision(self, protein, row):
        """
        Determine default decision based on protein's presence in Master Protein Accessions
        
        Args:
            protein: The protein ID to check
            row: DataFrame row containing Master Protein Accessions
            
        Returns:
            str: 'new' if protein is in Master Accessions, 'remove' if not
        """
        if pd.isna(row['Master Protein Accessions']):
            return 'remove'
        
        master_proteins = row['Master Protein Accessions'].split(';')
        master_proteins = [p.strip() for p in master_proteins]
        
        return 'new' if protein in master_proteins else 'remove'
    
    def process_protein_combinations(self):
        """Process protein combinations in pd_results"""
        if not self.pd_results.empty:
            df = self.pd_results.copy()
            
            # Create main container
            main_container = widgets.VBox([
                widgets.HTML("""
                    <h3>Peptides Mapped to Multiple Proteins</h3>
                    <div style='margin-bottom: 15px;'>
                        Select how to handle each protein mapping combination in your dataset.
                        These combinations come from either:
                        <ul>
                            <li>Multiple proteins in Master Protein Accessions</li>
                            <li>Multiple proteins in Positions in Proteins</li>
                            <li>Proteins from different species</li>
                        </ul>
                    </div>
                """)
            ], layout=widgets.Layout(width='100%', padding='20px'))
    
            # Get combinations
            combinations = self._get_protein_combinations()
                                    
            def create_help_icon(self, tooltip_text):
                """Create a help icon widget with tooltip"""
                return f'<div title="{tooltip_text}" style="display: inline-block; margin-left: 4px;">' \
                       '<i class="fa fa-question-circle" style="color: #007bff; font-size: 14px;"></i>' \
                       '</div>'
            
            # In your process_protein_combinations method:
            table_header = widgets.HTML("""
                            <div style="display: grid; grid-template-columns: 100px 100px 420px 200px auto; gap: 2px; margin-bottom: 10px; font-weight: bold; align-items: center;">
                                <div>
                                    Protein ID
                                    <span title="Unique identifier for the protein" style="display: inline-block; margin-left: 4px;">
                                        <i class="fa fa-question-circle" style="color: #007bff; font-size: 14px;"></i>
                                    </span>
                                </div>
                                <div>
                                    Species
                                    <span title="Source organism of the protein" style="display: inline-block; margin-left: 4px;">
                                        <i class="fa fa-question-circle" style="color: #007bff; font-size: 14px;"></i>
                                    </span>
                                </div>
                                <div>
                                    Description
                                    <span title="Full protein name or description" style="display: inline-block; margin-left: 4px;">
                                        <i class="fa fa-question-circle" style="color: #007bff; font-size: 14px;"></i>
                                    </span>
                                </div>
                                <div>
                                    Decision
                                    <span title="Available options:
            - 'new' - Create a separate row for this protein
            - 'remove' - Remove this protein from combination
            - 'asis' - Keep as part of current combination" style="display: inline-block; margin-left: 4px;">
                                        <i class="fa fa-question-circle" style="color: #007bff; font-size: 14px;"></i>
                                    </span>
                                </div>
                                <div>
                                    Status
                                    <span title="Color indicators:
            - Grey - Default option (not yet submitted)
            - Green - Option has been submitted" style="display: inline-block; margin-left: 4px;">
                                        <i class="fa fa-question-circle" style="color: #007bff; font-size: 14px;"></i>
                                    </span>
                                </div>
                            </div>
                            <hr style="margin: 0 0 10px 0;">
                        """)
            # Create input area
            input_area = widgets.VBox([table_header], 
                                    layout=widgets.Layout(width='100%', margin='10px 0'))
            
            # Add rows for each combination
            self.decision_inputs = []
            self.status_displays = {}
            
            for combo_idx, combo in enumerate(combinations, 1):
                proteins = combo.split('; ')
                
                # Find rows with this combination
                combo_rows = []
                for _, row in df.iterrows():
                    if pd.isna(row['Positions in Proteins']):
                        continue
                    row_proteins = set(p.split()[0] for p in row['Positions in Proteins'].split('; '))
                    if row_proteins == set(proteins):
                        combo_rows.append(row)
                
                occurrences = len(combo_rows)
                
                # Add combination header
                input_area.children += (widgets.HTML(f"""
                    <div style="background-color: #f8f9fa; padding: 2px; margin: 5px 0; border-radius: 5px;">
                        <b>Combination {combo_idx}</b> ({occurrences} occurrences)
                    </div>
                """),)
                
                # Process each protein in the combination
                for protein in proteins:
                    species = "Unknown"
                    name = "Unknown"
                    if protein in self.proteins_dic:
                        species = self.proteins_dic[protein]['species']
                        name = self.proteins_dic[protein]['name']
                    
                    # Set default decision based on Master Protein Accessions
                    default_decision = 'asis'
                    if combo_rows:
                        first_row = combo_rows[0]
                        if not pd.isna(first_row['Master Protein Accessions']):
                            master_proteins = first_row['Master Protein Accessions'].split(';')
                            master_proteins = [p.strip() for p in master_proteins]
                            default_decision = 'new' if protein in master_proteins else 'remove'
                    
                    # Create decision input
                    decision_input = widgets.Text(
                        layout=widgets.Layout(width='125px'),
                        value=default_decision
                    )
                    self.decision_inputs.append((combo, protein, decision_input))
                    
                    # Create status display with initial status
                    status_text = {
                        'new': "Will be created as new row",
                        'remove': "Will be removed",
                        'asis': "Will keep as is"
                    }
                    initial_status = status_text.get(default_decision, '')
                    status_display = widgets.HTML(f'<span style="color: gray">{initial_status}</span>')
                    self.status_displays[(combo, protein)] = status_display
                    
                    # Create the row content
                    row_content = widgets.HTML(f"""
                    <div style="display: grid; grid-template-columns: 100px 100px 420px; gap: 2px; align-items: center;">
                            <div>{protein}</div>
                            <div>{species}</div>
                            <div>{name}</div>
                        </div>
                    """)
                    
                    # Create container with all elements
                    container = widgets.HBox([
                        row_content,
                        widgets.HBox([decision_input], layout=widgets.Layout(width='150px', padding='0')),
                        widgets.HBox([status_display], layout=widgets.Layout(width='200px', padding='0'))
                    ], layout=widgets.Layout(
                        margin='2px 0',
                        display='flex',
                        align_items='center',
                        overflow='hidden', 
                        width='100%'
                    ))
                    
                    input_area.children += (container,)
    
            # Create buttons
            button_box = self._create_buttons()
            
            # Add output area
            self.protein_output_area = widgets.Output(
                layout=widgets.Layout(width='100%', margin='5px 0')
            )
            
            # Add all components
            main_container.children += (input_area, button_box, self.protein_output_area)
            
            self.pd_results_cleaned = df
            display(main_container)
            return df
            
    def _handle_remove_decision(self, df, index, row, positions, protein_to_remove):
        """Handle 'REMOVE' decision by removing the entire row"""
        # Simply drop the row
        df = df.drop(index)
        return df
            
    def _handle_new_decision(self, df, index, row, positions):
        """Handle 'NEW' decision for a row by removing original row and creating separate rows for each protein"""
        # Create new rows for each protein's position
        new_rows = []
        for pos in positions:
            new_row = row.copy()
            new_row['Positions in Proteins'] = pos
            protein_id = pos.split()[0]
            new_row['Master Protein Accessions'] = protein_id
            new_rows.append(new_row)
        
        # Remove the original row
        df = df.drop(index)
        
        # Add all new rows
        if new_rows:
            df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)
        
        return df 
       
    def _on_submit(self, button, df):
        """Apply the decisions typed into the decision inputs to a copy of ``df``.

        Each entry of ``self.decision_inputs`` is a ``(combo, protein, widget)``
        tuple. The widget text is stripped and upper-cased, then interpreted as:

        * ``NEW``    - emit a separate row for that protein; the original
          combined row is dropped.
        * ``REMOVE`` - the original combined row is dropped and no row is
          emitted for that protein.
        * ``ASIS``   - nothing is done for that protein.
        * any other non-empty text - used as a replacement accession via
          ``_handle_custom_protein_id``; only applied when no protein of the
          same combination is ``NEW`` or ``REMOVE``.

        Empty inputs are ignored. Errors are reported in
        ``self.protein_output_area`` instead of being raised.

        Args:
            button: The clicked button (unused).
            df: DataFrame to process; it is copied, not modified.

        Returns:
            ``self.pd_results_cleaned``, which is replaced by the processed
            copy when at least one decision was entered.
        """
        self.submit_button.disabled = True
        self.reset_button.disabled = True
        self.progress.value = 0

        with self.protein_output_area:
            try:
                self.protein_output_area.clear_output()

                # First pass: collect the non-empty decisions, grouped by combination.
                decisions_by_combo = {}
                total_inputs = len(self.decision_inputs)
                for i, (combo, protein, input_widget) in enumerate(self.decision_inputs):
                    decision = input_widget.value.strip().upper()
                    if decision:
                        status_display = self.status_displays[(combo, protein)]
                        status_display.value = f'<span style="color: green">Decision: {decision}</span>'
                        decisions_by_combo.setdefault(combo, {})[protein] = decision

                    # The first pass accounts for the first half of the progress bar.
                    self.progress.value = ((i + 1) / total_inputs * 50)

                # Second pass: rewrite the DataFrame according to the decisions.
                if decisions_by_combo:
                    df_processed = df.copy()
                    rows_to_remove = set()  # Labels of original rows to drop
                    new_rows = []  # Per-protein rows to append at the end
                    total_combinations = len(decisions_by_combo)

                    for processed_count, (combo, protein_decisions) in enumerate(
                            decisions_by_combo.items(), 1):
                        proteins = combo.split('; ')

                        # Cheap pre-filter: rows mentioning every accession of the
                        # combination. Accessions are escaped so characters such as
                        # '|' or '.' are matched literally, and na=False keeps rows
                        # with a missing position out of the mask (a NaN in the
                        # mask would make the boolean indexing below raise).
                        pattern = ''.join(f'(?=.*{re.escape(p)})' for p in proteins)
                        mask = df_processed['Positions in Proteins'].str.contains(
                            pattern, regex=True, na=False)

                        # One NEW/REMOVE anywhere in the combination means the
                        # combined row is replaced rather than edited in place.
                        drops_original = any(
                            decision in ('NEW', 'REMOVE')
                            for decision in protein_decisions.values()
                        )

                        for idx in df_processed[mask].index:
                            row = df_processed.loc[idx]
                            positions = row['Positions in Proteins'].split('; ')
                            current_proteins = [p.split()[0] for p in positions]

                            # Only process rows whose accessions are exactly this combination.
                            if '; '.join(sorted(current_proteins)) != combo:
                                continue

                            if drops_original:
                                rows_to_remove.add(idx)
                                for protein, decision in protein_decisions.items():
                                    if decision != 'NEW':
                                        continue
                                    # Compare the accession token exactly; a substring
                                    # test could select another protein's position.
                                    position = next(
                                        p for p in positions if p.split()[0] == protein)
                                    new_row = row.copy()
                                    new_row['Positions in Proteins'] = position
                                    new_row['Master Protein Accessions'] = protein
                                    new_rows.append(new_row)
                            else:
                                # Anything that is not ASIS here is a custom accession.
                                for protein, decision in protein_decisions.items():
                                    if decision != 'ASIS':
                                        df_processed = self._handle_custom_protein_id(
                                            df_processed, idx, positions, decision)

                        # The second pass accounts for the second half of the progress bar.
                        self.progress.value = 50 + (processed_count / total_combinations * 50)

                    # Remove marked rows
                    df_processed = df_processed.drop(index=list(rows_to_remove))

                    # Add all new rows
                    if new_rows:
                        df_processed = pd.concat([df_processed, pd.DataFrame(new_rows)], ignore_index=True)

                    self.pd_results_cleaned = df_processed

                display(HTML("<b style='color:green;'>Processing complete.</b>"))

            except Exception as e:
                # Top-level UI boundary: report the problem in the output area.
                display(HTML(f"<b style='color:red;'>Error: {str(e)}</b>"))

            finally:
                self.submit_button.disabled = False
                self.reset_button.disabled = False

        return self.pd_results_cleaned

                
    def create_help_icon(self, tooltip_text):
        """Build an HTML widget showing a question-mark icon with a hover tooltip.

        ``tooltip_text`` is inserted verbatim into the wrapper's ``title``
        attribute; no escaping is applied.
        """
        icon_markup = ''.join([
            f'<div title="{tooltip_text}" style="display: inline-block;">',
            '<i class="fa fa-question-circle" style="color: #007bff;"></i>',
            '</div>',
        ])
        return widgets.HTML(icon_markup)
    
    def _create_input_fields(self, input_area):
        """Append one bordered input group per multi-protein combination to ``input_area``.

        For every entry of ``self.multi_position_combinations`` a header with
        the number of matching rows is created, followed by one labelled
        'Decision:' text box per protein. ``self.user_decisions`` and
        ``self.decision_inputs`` are reset first.

        Args:
            input_area: Container widget whose ``children`` tuple is extended.
        """
        self.user_decisions = {}
        self.decision_inputs = []

        def create_labeled_decision(protein_info, description):
            """Create a labeled decision input with tooltip"""
            container = widgets.VBox([
                widgets.HTML(f"""
                <div style='margin-bottom: 5px;'>
                    <b>{protein_info}</b>
                    {self.create_help_icon(description).value}
                </div>
                """),
                widgets.Text(
                    description='Decision:',
                    layout=widgets.Layout(width='300px')
                )
            ], layout=widgets.Layout(margin='10px 0px'))
            return container

        for protein_combo in self.multi_position_combinations:
            proteins = protein_combo.split('; ')

            # Count rows mentioning every accession of the combination.
            # Accessions are escaped so regex metacharacters match literally,
            # and na=False keeps rows with a missing position from putting NaN
            # into the mask.
            pattern = ''.join(f'(?=.*{re.escape(p)})' for p in proteins)
            mask = self.pd_results['Positions in Proteins'].str.contains(
                pattern, regex=True, na=False)
            occurrences = int(mask.sum())

            # Create header for this combination
            combo_header = widgets.HTML(f"""
                <h4 style="display: flex; align-items: center;">
                    {occurrences} occurrences of Multiple protein combination
                    {self.create_help_icon("Multiple proteins were found mapping to the same peptide sequence").value}
                </h4>
            """)

            # Create container for this combination's inputs
            combo_container = widgets.VBox([combo_header])

            # Add individual protein inputs
            for protein in proteins:
                if protein in self.proteins_dic:
                    species = self.proteins_dic[protein]['species']
                    name = self.proteins_dic[protein]['name']
                    protein_info = f"{protein} ({species} - {name})"
                else:
                    protein_info = protein

                # Create tooltip description
                description = (
                    f"Select how to handle {protein}:\n"
                    "• 'new' - Create a separate row for this protein\n"
                    "• 'asis' - Keep as part of the current combination\n"
                    "• Enter a custom ID to replace this protein"
                )

                # Create input container for this protein
                protein_input = create_labeled_decision(protein_info, description)
                combo_container.children += (protein_input,)
                # NOTE(review): only the Text widget is stored here, while
                # _on_submit and _on_reset_button_clicked unpack
                # (combo, protein, widget) tuples from self.decision_inputs.
                # Confirm which shape callers of this method expect.
                self.decision_inputs.append(protein_input.children[1])  # Store the Text widget

            combo_container.layout.border = '1px solid #ddd'
            combo_container.layout.padding = '10px'
            combo_container.layout.margin = '10px 0px'
            combo_container.layout.border_radius = '5px'

            input_area.children += (combo_container,)
    def _process_rows(self, df):
        """Process DataFrame rows based on user decisions"""
        processed_df = df.copy()
        new_rows = []
        
        for index, row in processed_df.iterrows():
            positions_row = row['Positions in Proteins']
            master_acc = row['Master Protein Accessions']
            sequence = row['Annotated Sequence']
            
            # Get the decision for this specific entry
            decision = self.grid_data[index]['Decision']
            
            if decision == 'asis':
                continue
                
            elif decision == 'new':
                # Create separate rows
                positions = positions_row.split('; ')
                proteins = [p.split()[0] for p in positions]
                
                # Update first position in current row
                processed_df.at[index, 'Positions in Proteins'] = positions[0]
                processed_df.at[index, 'Master Protein Accessions'] = proteins[0]
                
                # Create new rows for additional positions
                for pos, prot in zip(positions[1:], proteins[1:]):
                    new_row = row.copy()
                    new_row['Positions in Proteins'] = pos
                    new_row['Master Protein Accessions'] = prot
                    new_rows.append(new_row)
                    
            elif decision.startswith('custom:'):
                # Handle custom protein ID
                new_protein_id = decision.split(':')[1]
                positions = positions_row.split('; ')
                new_positions = []
                for pos in positions:
                    num_range = pos[pos.index('['):] if '[' in pos else ''
                    new_positions.append(f"{new_protein_id} {num_range}")
                
                processed_df.at[index, 'Master Protein Accessions'] = new_protein_id
                processed_df.at[index, 'Positions in Proteins'] = '; '.join(new_positions)
        
        # Add all new rows
        if new_rows:
            processed_df = pd.concat([processed_df, pd.DataFrame(new_rows)], ignore_index=True)
        
        return processed_df


    def _handle_custom_protein_id(self, df, index, positions, new_accession):
        """Handle custom protein ID decision"""
        new_positions = []
        for pos in positions:
            num_range = pos[pos.index('['):] if '[' in pos else ''
            new_positions.append(f"{new_accession} {num_range}")
        
        df.at[index, 'Master Protein Accessions'] = new_accession
        df.at[index, 'Positions in Proteins'] = '; '.join(new_positions)
        
        return df

    def _create_buttons(self):
        """Build the Submit/Reset buttons and the progress bar.

        The widgets are stored on ``self.submit_button``, ``self.reset_button``
        and ``self.progress`` and their click handlers are registered.

        Returns:
            A ``VBox`` holding the button row above the progress bar.
        """
        def make_button(label, style):
            return widgets.Button(description=label, button_style=style, disabled=False)

        self.submit_button = make_button("Submit", 'success')
        self.reset_button = make_button("Reset", 'warning')

        progress_options = {
            'value': 0,
            'min': 0,
            'max': 100,
            'description': 'Processing:',
            'bar_style': 'info',
            'style': {'bar_color': '#0080ff'},
            'orientation': 'horizontal',
            'layout': widgets.Layout(width='50%'),
        }
        self.progress = widgets.FloatProgress(**progress_options)

        def submit_current_results(clicked):
            # Each submit starts again from a fresh copy of the loaded results.
            return self._on_submit(clicked, self.pd_results.copy())

        self.reset_button.on_click(self._on_reset_button_clicked)
        self.submit_button.on_click(submit_current_results)

        button_row = widgets.HBox([self.submit_button, self.reset_button])
        return widgets.VBox([button_row, self.progress])
        
    def _on_reset_button_clicked(self, b):
        """Reset every decision input to its default and discard processed results.

        The default for a protein is taken from the first row of
        ``self.pd_results`` whose set of position accessions equals the
        combination: ``'new'`` when the protein is listed in that row's
        'Master Protein Accessions', ``'remove'`` when it is not, and
        ``'asis'`` when no such row exists or its master accession is missing.

        Afterwards ``self.user_decisions`` is emptied and
        ``self.pd_results_cleaned`` is replaced by a fresh copy of
        ``self.pd_results``. Errors are reported in
        ``self.protein_output_area`` instead of being raised.

        Args:
            b: The clicked button (unused).
        """
        # Disable buttons during reset
        self.submit_button.disabled = True
        self.reset_button.disabled = True

        # Clear output area
        with self.protein_output_area:
            self.protein_output_area.clear_output()
            display(HTML("<b style='color:blue;'>Resetting options to defaults...</b>"))

        # Reset progress bar
        self.progress.value = 0

        status_text = {
            'new': "Will be created as new row",
            'remove': "Will be removed",
            'asis': "Will keep as is"
        }

        try:
            # Parse every row once instead of re-scanning the whole DataFrame
            # for each input: remember, per set of accessions, the master
            # accession of the first row that has exactly that set.
            first_master_by_proteins = {}
            for _, row in self.pd_results.iterrows():
                row_positions = row['Positions in Proteins']
                if pd.isna(row_positions):
                    continue
                row_proteins = frozenset(p.split()[0] for p in row_positions.split('; '))
                first_master_by_proteins.setdefault(row_proteins, row['Master Protein Accessions'])

            total_inputs = len(self.decision_inputs)

            for processed, (combo, protein, input_field) in enumerate(self.decision_inputs, 1):
                # Determine default decision
                default_decision = 'asis'
                master = first_master_by_proteins.get(frozenset(combo.split('; ')))
                # pd.isna covers both "no matching row" (None) and a missing value.
                if not pd.isna(master):
                    master_proteins = [p.strip() for p in master.split(';')]
                    default_decision = 'new' if protein in master_proteins else 'remove'

                # Set input field value
                input_field.value = default_decision

                # Update status display
                status_display = self.status_displays[(combo, protein)]
                status_display.value = f'<span style="color: gray">{status_text[default_decision]}</span>'

                # Update progress
                self.progress.value = (processed / total_inputs) * 100

            # Reset internal state
            self.user_decisions = {}
            self.pd_results_cleaned = self.pd_results.copy()

            with self.protein_output_area:
                self.protein_output_area.clear_output()
                display(HTML("<b style='color:green;'>Reset complete. All options set to defaults.</b>"))

        except Exception as e:
            # Top-level UI boundary: report the problem in the output area.
            with self.protein_output_area:
                self.protein_output_area.clear_output()
                display(HTML(f"<b style='color:red;'>Error during reset: {str(e)}</b>"))

        finally:
            # Re-enable buttons
            self.submit_button.disabled = False
            self.reset_button.disabled = False
    

    def _display_results(self):
        """Show a confirmation for every entry of ``self.user_decisions`` equal to 'NEW'."""
        for combo, decision in self.user_decisions.items():
            if decision != 'NEW':
                continue
            # NOTE(review): the second message interpolates ``decision``, which
            # is always 'NEW' at this point - confirm whether a replacement
            # accession was meant to be shown instead.
            success_line = f'<b>{combo}</b> <b style="color:green;">has been successfully processed.</b>'
            detail_line = f'&nbsp;&nbsp;&nbsp;&nbsp;The positions have been updated with the new protein ID "{decision}".'
            for markup in (success_line, detail_line):
                display(HTML(markup))



In [5]:
# Workflow initialization cell.
# Module-level Output widgets shared with ProcessingWorkflow: the protein
# mapping section is rendered into the first one and the group processing
# widgets into the second one.
protein_mapping_output = widgets.Output()
group_processing_output = widgets.Output()

# Create ProcessingWorkflow class with persistent UI elements
class ProcessingWorkflow:
    """Ties together data loading, protein-combination handling and group processing.

    The workflow owns one ``DataTransformation``, one
    ``ProteinCombinationHandler`` and one ``GroupProcessing`` instance and
    observes the transformer's ``pd_results`` and ``proteins_dic`` so the
    other two are refreshed when new data arrives. Output is rendered into
    the module-level ``protein_mapping_output`` and ``group_processing_output``
    widgets.
    """

    def __init__(self):
        """Create the three components and subscribe to the transformer's changes."""
        transformer = DataTransformation()
        self.data_transformer = transformer
        self.protein_handler = ProteinCombinationHandler(transformer)
        self.group_processor = GroupProcessing()

        # Set up observers
        transformer.observe(self._handle_data_change, names=['pd_results'])
        transformer.observe(self._handle_fasta_change, names=['proteins_dic'])

    def _handle_data_change(self, change):
        """Refresh the protein-mapping view and the group processor for new ``pd_results``."""
        if change.name != 'pd_results':
            return

        new_data = change.new

        # Both states show the same section heading; only the body differs.
        with protein_mapping_output:
            protein_mapping_output.clear_output()
            display(HTML("<h3>Multiple Protein Mappings</h3>"))
            if new_data is None:
                display(HTML("<b style='color:orange;'>Waiting for proteomics data to be uploaded...</b>"))
            else:
                self.protein_handler.pd_results = new_data
                self.protein_handler.handle_combinations()

        # Update group processor
        self.group_processor.update_data(new_data)

    def _handle_fasta_change(self, change):
        """Copy the transformer's ``proteins_dic`` to the handler, if the handler has one."""
        handler = self.protein_handler
        if hasattr(handler, 'proteins_dic'):
            handler.proteins_dic = self.data_transformer.proteins_dic

    def display(self):
        """Display the complete workflow interface"""
        # Data loading interface first
        self.data_transformer.setup_data_loading_ui()

        # Spacer, section title and the (always visible) protein mapping output
        leading_sections = (
            widgets.HTML("<br>"),
            widgets.HTML("<h3><u>Protein Mapping</u></h3>"),
            protein_mapping_output,
        )
        for section in leading_sections:
            display(section)

        # Create a handler on demand when none is set
        if self.protein_handler is None:
            with protein_mapping_output:
                self.protein_handler = ProteinCombinationHandler(self.data_transformer)
                self.protein_handler.handle_combinations()

        # Spacer before the group processing section
        display(widgets.HTML("<br>"))

        # Group processing section
        with group_processing_output:
            self.group_processor.display_group_selector()
            self.group_processor.display_widgets()
        display(group_processing_output)

# Build the workflow (this creates its components and registers the data
# observers) and render its interface in this cell.
workflow = ProcessingWorkflow()
workflow.display()

HTML(value='\n            <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.1…

VBox(children=(HTML(value='<h3><u>Upload Peptidomic Data Files:</u></h3>'), HBox(children=(FileUpload(value=()…

HTML(value='<br>')

HTML(value='<h3><u>Protein Mapping</u></h3>')

Output()

HTML(value='<br>')

Output()

In [6]:
class CombineAverageDataframes:
    def __init__(self, data_transformer, group_processor, protein_handler):
        self.data_transformer = data_transformer
        self.group_processor = group_processor
        self.pd_results = data_transformer.pd_results
        self.mbpdb_results = data_transformer.mbpdb_results
        self.pd_results_cleaned = protein_handler.pd_results_cleaned if hasattr(protein_handler, 'pd_results_cleaned') and protein_handler.pd_results_cleaned is not None else pd.DataFrame()
        self._merged_df = None
        # Set up observer for data changes
        self.data_transformer.observe(self._handle_data_change, names=['pd_results', 'mbpdb_results'])
        
    def _handle_data_change(self, change):
        """Mirror an observed change onto this object, then clear the cell output.

        A change named 'pd_results', 'mbpdb_results' or 'pd_results_cleaned'
        overwrites the attribute of the same name with ``change.new``.
        ``clear_output()`` is called for every change, tracked or not.
        """
        tracked_names = ('pd_results', 'mbpdb_results', 'pd_results_cleaned')
        if change.name in tracked_names:
            setattr(self, change.name, change.new)
        clear_output()
    @property
    def merged_df(self):
        """Return the merged DataFrame held in ``self._merged_df``.

        ``__init__`` sets ``_merged_df`` to ``None``, so this is ``None`` until
        some other code assigns it.
        """
        return self._merged_df

    def extract_bioactive_peptides(self):
        """
        Extracts the list of bioactive peptide matches from the imported MBPDB search.
        """
        if not self.mbpdb_results.empty:
            # Drop rows where protein_id is NaN or 'None'
            mbpdb_results_cleaned = self.mbpdb_results.copy()
            mbpdb_results_cleaned.dropna(subset=['search_peptide'], inplace=True)
            mbpdb_results_cleaned = mbpdb_results_cleaned[mbpdb_results_cleaned['protein_id'] != 'None']

            # Check if '% Alignment' column exists
            if '% Alignment' in mbpdb_results_cleaned.columns:
                agg_dict = {
                    'peptide': 'first', 
                    'protein_id': 'first',
                    'protein_description': 'first',
                    '% Alignment': 'first',
                    'species': 'first',
                    'intervals': 'first',
                    'function': lambda x: list(x.dropna().unique())
                }
            else:
                agg_dict = {
                    'peptide': 'first', 
                    'search_peptide': 'first',
                    'protein_description': 'first',
                    'species': 'first',
                    'intervals': 'first',
                    'function': lambda x: list(x.dropna().unique())
                }

            # Perform the groupby and aggregation
            mbpdb_results_grouped = mbpdb_results_cleaned.groupby('search_peptide').agg(agg_dict).reset_index()

            # Flatten the 'function' list
            mbpdb_results_grouped['function'] = mbpdb_results_grouped['function'].apply(
                lambda x: '; '.join(x) if isinstance(x, list) else x
            )
            return mbpdb_results_cleaned, mbpdb_results_grouped
        else:
            return None, None

    def create_unique_id(self, row):
        """Build the identifier of a peptide row.

        The result is the row's 'Sequence', followed by ``_`` and the stripped
        'Modifications' when that value is not missing. Trailing underscores
        are removed, so a blank modification yields the bare sequence.
        """
        modifications = row['Modifications']
        if pd.isna(modifications):
            identifier = row['Sequence']
        else:
            identifier = row['Sequence'] + "_" + modifications.strip()
        return identifier.rstrip('_')
    
    def process_pd_results(self, mbpdb_results_grouped):
        """Prepare the cleaned peptide table and merge the MBPDB summary into it.

        The 'Sequence' (when absent), 'unique ID', 'start' and 'stop' columns
        are added to ``self.pd_results_cleaned`` in place. 'start'/'stop' are
        only filled for rows whose 'Positions in Proteins' holds a single
        position (no ';'); other rows stay missing.

        Args:
            mbpdb_results_grouped: Per-search-peptide MBPDB summary, or
                ``None``/empty when no MBPDB data is available.

        Returns:
            The peptide table left-merged with the MBPDB summary on
            'unique ID' == 'search_peptide', or, without MBPDB data, a copy of
            the peptide table with an all-NaN 'function' column.
        """
        peptides = self.pd_results_cleaned

        # Derive the bare sequence from the annotated one when it is missing.
        if 'Sequence' not in peptides.columns:
            annotated = peptides['Annotated Sequence']
            peptides['Sequence'] = annotated.str.split('.', expand=False).str[1]

        peptides['unique ID'] = peptides.apply(self.create_unique_id, axis=1)

        # Extract start/stop for rows that map to exactly one position.
        bound_columns = ('start', 'stop')
        try:
            for column in bound_columns:
                peptides[column] = pd.NA

            is_single = ~peptides['Positions in Proteins'].str.contains(';', na=False)
            single_entries = peptides.loc[is_single, 'Positions in Proteins']

            if not single_entries.empty:
                bounds = single_entries.str.extract(r'\[(\d+)-(\d+)\]')
                # Non-numeric captures become missing values.
                for group_number, column in enumerate(bound_columns):
                    peptides.loc[is_single, column] = pd.to_numeric(bounds[group_number], errors='coerce')

            # Nullable integers keep the rows without a position as missing.
            for column in bound_columns:
                peptides[column] = peptides[column].astype('Int64')

        except Exception as e:
            print(f"Error processing protein positions: {str(e)}")

        # Put the identifying columns first, everything else after them.
        leading = ['unique ID', 'Sequence', 'Master Protein Accessions',
                   'Positions in Proteins', 'start', 'stop']
        trailing = [column for column in peptides.columns if column not in leading]
        peptides = peptides[leading + trailing]

        # Without MBPDB data only the peptidomic table is returned.
        if mbpdb_results_grouped is None or mbpdb_results_grouped.empty:
            merged_df = peptides.copy()
            merged_df['function'] = np.nan
            display(HTML("<b style='color:orange;'>No MBPDB was uploaded.</b>"))
            display(HTML("<b style='color:orange;'>The merged Dataframe contains only peptidomic data.</b>"))
            return merged_df

        merged_df = pd.merge(peptides, mbpdb_results_grouped,
                             right_on='search_peptide', left_on='unique ID', how='left')
        display(HTML("<b style='color:green;'>The MBPDB was successfully merged with the peptidomic data matching the Search Peptide and Unique ID columns.</b>"))
        return merged_df
    
    def calculate_group_abundance_sem_averages(self, df, group_data):
        """Append per-group mean and SEM columns to ``df``.

        For every entry of ``group_data`` the row-wise mean of its
        ``abundance_columns`` is stored as ``Avg_<grouping_variable>`` and the
        row-wise standard error of the mean (sample standard deviation divided
        by the square root of the number of non-NaN values) as
        ``SEM_<grouping_variable>``. All ``Avg_`` columns are placed before all
        ``SEM_`` columns.

        Args:
            df: DataFrame containing every abundance column named in
                ``group_data``. Those columns are coerced to numeric in place;
                values that cannot be parsed become NaN.
            group_data: Mapping of group key -> dict with the keys
                ``'grouping_variable'`` and ``'abundance_columns'``.

        Returns:
            ``df`` itself when every ``Avg_`` column is already present,
            otherwise a new DataFrame with the computed columns appended.
            Pre-existing ``Avg_``/``SEM_`` columns of the processed groups are
            replaced rather than duplicated.
        """
        # Nothing to compute when every group's average column is already there.
        if all(f"Avg_{details['grouping_variable']}" in df.columns
               for details in group_data.values()):
            display(HTML('<b style="color:orange;">All average abundance columns already exist. Returning original DataFrame.</b>'))
            return df

        # Averages and SEMs are collected separately so that the final layout
        # is "all averages, then all SEMs".
        average_columns = {}
        sem_columns = {}

        for details in group_data.values():
            grouping_variable = details['grouping_variable']
            abundance_columns = details['abundance_columns']

            # Convert abundance columns to numeric
            for col in abundance_columns:
                df[col] = pd.to_numeric(df[col], errors='coerce')

            abundances = df[abundance_columns]

            # Number of non-NaN values per row, used as n in the SEM formula
            n_samples = abundances.notna().sum(axis=1)

            average_columns[f"Avg_{grouping_variable}"] = abundances.mean(axis=1, skipna=True)
            sem_columns[f"SEM_{grouping_variable}"] = (
                abundances.std(axis=1, skipna=True) / np.sqrt(n_samples)
            )

        # Combine the columns in the desired order (all averages, then all SEMs)
        new_columns = {**average_columns, **sem_columns}

        # When only some of the columns existed beforehand, appending the
        # recomputed ones would leave duplicate column labels behind; drop the
        # stale versions first so each label appears exactly once.
        stale_columns = [name for name in new_columns if name in df.columns]
        if stale_columns:
            df = df.drop(columns=stale_columns)

        # Add new columns to DataFrame
        df = pd.concat([df, pd.DataFrame(new_columns)], axis=1)

        if not df.empty:
            display(HTML('<b style="color:green;">Group average abundance and Standard Error of Mean (SEM) columns have been successfully added to the DataFrame.</b>'))
        return df

    
    def process_data(self, group_data):
        """Run the full pipeline and return the merged DataFrame.

        Steps: extract the bioactive peptide results, make sure
        ``self.pd_results_cleaned`` is populated, merge via
        ``process_pd_results`` and, when ``group_data`` is truthy, append the
        group averages/SEMs. The result is also stored on ``self._merged_df``.

        Returns:
            The final DataFrame, or ``None`` when there are no PD results or
            any step raises (the error is shown in the notebook output).
        """
        source = getattr(self, 'pd_results', None)
        if source is None or source.empty:
            display(HTML("<b style='color:red;'>No PD results data available for processing.</b>"))
            return None

        try:
            # Only the grouped MBPDB table is needed for the merge step.
            _, grouped_mbpdb = self.extract_bioactive_peptides()

            # Fall back to the raw PD results when no cleaned copy exists yet.
            if getattr(self, 'pd_results_cleaned', None) is None:
                self.pd_results_cleaned = self.pd_results.copy()

            merged = self.process_pd_results(grouped_mbpdb)

            if not group_data:
                result = merged
                display(HTML("<b style='color:orange;'>No group data provided. Skipping abundance calculations.</b>"))
            else:
                # UserWarnings raised during the abundance step are suppressed.
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore", UserWarning)
                    result = self.calculate_group_abundance_sem_averages(merged, group_data)

            # Keep the outcome on the instance as well as returning it.
            self._merged_df = result
            return result
        except Exception as e:
            display(HTML(f"<b style='color:red;'>Error processing data: {str(e)}</b>"))
            return None
            
    def update_data(self, pd_results):
        """Store new PD results and refresh the column dropdown.

        With a non-empty DataFrame, ``setup_data`` is re-run and the dropdown
        is filled from ``self.filtered_columns``; otherwise the dropdown is
        emptied. Either way a status line replaces the content of
        ``self.output``.
        """
        self.pd_results = pd_results
        has_data = pd_results is not None and not pd_results.empty

        if has_data:
            # Rebuild derived state before exposing the new column choices.
            self.setup_data()
            self.column_dropdown.options = self.filtered_columns
            status_markup = '<b style="color:green;">Data updated successfully. Column selection refreshed.</b>'
        else:
            # No usable data: leave nothing selectable.
            self.column_dropdown.options = []
            status_markup = '<b style="color:orange;">No data available for column selection.</b>'

        with self.output:
            self.output.clear_output()
            display(widgets.HTML(status_markup))

In [7]:
class ExportManager:
    """Render in-notebook download links for the analysis outputs.

    Each ``export_*`` / ``setup_*`` method generates a file in memory and
    displays an HTML section whose link embeds that file as a base64
    ``data:`` URI.
    """

    # Characters that are not allowed in an Excel worksheet title.
    _INVALID_SHEET_CHARS = re.compile(r'[\[\]:*?/\\]')
    # Maximum worksheet title length accepted by Excel.
    _MAX_SHEET_NAME_LENGTH = 31

    def __init__(self):
        # CSS shared by every download section rendered by this manager.
        self.output_style = """
            <style>
            .download-link {
                background-color: #4CAF50;
                border: none;
                color: white;
                padding: 10px 20px;
                text-align: center;
                text-decoration: none;
                display: inline-block;
                font-size: 14px;
                margin: 4px 2px;
                cursor: pointer;
                border-radius: 4px;
            }
            .download-link:hover {
                background-color: #45a049;
            }
            .download-link:disabled {
                background-color: #cccccc;
                cursor: not-allowed;
            }
            .export-section {
                margin-bottom: 20px;
                padding: 15px;
                border-radius: 5px;
                background-color: #f8f9fa;
            }
            .export-description {
                color: #666;
                margin: 5px 0 15px 0;
                font-style: italic;
            }
            </style>
        """

    @classmethod
    def _safe_sheet_name(cls, name, used=()):
        """Return ``name`` turned into a valid, unused worksheet title.

        Invalid characters are replaced with ``_``, the title is cut to the
        maximum length and, if it collides (case-insensitively) with a title
        in ``used``, a numeric suffix is appended. Titles that are already
        valid and unused are returned unchanged.
        """
        cleaned = cls._INVALID_SHEET_CHARS.sub('_', str(name)).strip("'") or 'Sheet'
        limit = cls._MAX_SHEET_NAME_LENGTH
        taken = {str(existing).lower() for existing in used}

        candidate = cleaned[:limit]
        suffix = 1
        while candidate.lower() in taken:
            suffix += 1
            tag = f"_{suffix}"
            candidate = cleaned[:limit - len(tag)] + tag
        return candidate

    @staticmethod
    def _pivot_abundance(df, abundance_columns):
        """Pivot ``abundance_columns`` of ``df`` to a sample x peptide table.

        Rows are the sample (column) names, columns are the ``'unique ID'``
        values; repeated IDs are combined by ``pivot_table``'s default
        aggregation.
        """
        melted_df = df.melt(
            id_vars=['unique ID'],
            value_vars=abundance_columns,
            var_name='Sample',
            value_name='Abundance'
        )
        pivoted = melted_df.pivot_table(
            index='Sample',
            columns='unique ID',
            values='Abundance'
        )
        pivoted.index.name = 'Abundance Values'
        return pivoted

    @staticmethod
    def _build_bioactive_tables(results):
        """Turn the per-group dictionaries into the three export tables.

        Args:
            results: The 5-tuple returned by
                ``_bioactive_function_count_and_abundance_sum_avg``.

        Returns:
            ``(combined_df, combined_count_df, combined_absorbance_df)`` where
            rows are the total plus one row per function and columns are the
            grouping variables. ``combined_df`` holds display strings of the
            form ``"<abundance> (<count>)"`` or ``"-"`` when both are zero.
        """
        (summed_function_count, unique_function_counts,
         _unique_function_count_averages, unique_function_absorbance,
         summed_function_abundance) = results

        peptide_count_df = pd.DataFrame.from_dict(
            summed_function_count,
            orient='index',
            columns=['Counts of peptides']
        )
        function_count_df = pd.DataFrame.from_dict(
            unique_function_counts,
            orient='index'
        )
        # Fill AFTER concatenating: a group that has peptides but no annotated
        # function is missing from function_count_df and would otherwise leave
        # NaN cells that break the integer formatting below.
        combined_count_df = pd.concat(
            [peptide_count_df, function_count_df],
            axis=1
        ).fillna(0).astype(int).T

        peptide_absorbance_df = pd.DataFrame.from_dict(
            summed_function_abundance,
            orient='index',
            columns=['Summed Abundance']
        )
        function_absorbance_df = pd.DataFrame.from_dict(
            unique_function_absorbance,
            orient='index'
        )
        combined_absorbance_df = pd.concat(
            [peptide_absorbance_df, function_absorbance_df],
            axis=1
        ).fillna(0).T

        def format_cell(idx, col):
            abundance = combined_absorbance_df.loc[idx, col]
            # The total row pairs with the peptide count, function rows with
            # the count of the same function.
            count_row = 'Counts of peptides' if idx == 'Summed Abundance' else idx
            count = combined_count_df.loc[count_row, col]
            if abundance == 0 and count == 0:
                return "-"
            return f"{abundance:.2e} ({round(count)})"

        row_labels = combined_absorbance_df.index
        combined_df = pd.DataFrame(
            {col: [format_cell(idx, col) for idx in row_labels]
             for col in combined_absorbance_df.columns},
            index=row_labels,
            dtype=object
        )
        combined_df.rename(index={'Summed Abundance': 'Total'}, inplace=True)

        return combined_df, combined_count_df, combined_absorbance_df

    def _create_download_section(self, title, description, data_generator, mime_type):
        """Generate a file and display a section with its download link.

        Args:
            title: Heading shown above the link.
            description: Explanatory text shown under the heading.
            data_generator: Zero-argument callable returning
                ``(content, filename)``; ``content`` may be ``str`` or ``bytes``.
            mime_type: MIME type used in the ``data:`` URI.

        Any exception raised while generating or rendering is printed together
        with its traceback instead of propagating, so one failed export does
        not prevent the remaining sections from being shown.
        """
        try:
            # Generate the data immediately but efficiently
            content, filename = data_generator()

            if isinstance(content, str):
                content = content.encode('utf-8')

            # Convert to base64
            b64_data = base64.b64encode(content).decode('utf-8')
            file_data = f"data:{mime_type};base64,{b64_data}"

            html_content = f"""
            <div class="export-section">
                <h3><u>{title}</u></h3>
                <div class="export-description">
                    {description}
                </div>
                <a href="{file_data}" 
                   download="{filename}" 
                   class="download-link"
                   title="Click to download">
                    Download Data
                </a>
            </div>
            """

            display(HTML(self.output_style + html_content))

        except Exception as e:
            print(f"Error generating download: {str(e)}")
            import traceback
            traceback.print_exc()

    def export_mbpdb_results(self, df):
        """Export MBPDB results as TSV (skipped without a 'function' column)."""
        if 'function' not in df.columns:
            return

        def generate_mbpdb_data():
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            filename = f"MBPDB_SEARCH_{timestamp}.tsv"
            content = df.to_csv(sep='\t', index=False)
            return content.encode(), filename

        self._create_download_section(
            "Export MBPDB Search Results",
            "Download the results from searching your peptides against the MBPDB database",
            generate_mbpdb_data,
            'text/tab-separated-values'
        )

    def export_group_data(self, group_data):
        """Export group data as JSON"""
        def generate_group_data():
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            filename = f"Categorical_variable_definitions_{timestamp}.json"
            content = json.dumps(group_data, indent=4)
            return content.encode(), filename

        self._create_download_section(
            "Export Group Data",
            "Download the categorical variable definitions used for data grouping and analysis",
            generate_group_data,
            'application/json'
        )

    def export_dataframe(self, df):
        """Export DataFrame as CSV"""
        def generate_df_data():
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            filename = f"Merged_Dataframe_{timestamp}.csv"
            content = df.to_csv(index=False)
            return content.encode(), filename

        self._create_download_section(
            "Export Full Dataset",
            "Download the complete merged dataset containing all processed data",
            generate_df_data,
            'text/csv'
        )

    def setup_pivoted_data_export(self, merged_df, group_data):
        """Export one worksheet per group with abundances pivoted by sample.

        Groups whose pivot is empty are skipped. Worksheet titles are derived
        from each group's ``grouping_variable`` and sanitised so that invalid
        characters, over-long names or duplicates do not abort the export.
        """
        def generate_pivoted_data():
            # Build every sheet first so an export with nothing to write can
            # be reported clearly instead of failing while saving the workbook.
            sheets = {}
            for group_info in group_data.values():
                pivoted_df = self._pivot_abundance(
                    merged_df,
                    group_info['abundance_columns']
                )
                if pivoted_df.empty:
                    continue
                sheet_name = self._safe_sheet_name(
                    group_info['grouping_variable'], sheets
                )
                sheets[sheet_name] = pivoted_df

            if not sheets:
                raise ValueError("No abundance data available for the pivoted export")

            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            filename = f"pivoted_data_{timestamp}.xlsx"

            output = io.BytesIO()
            with pd.ExcelWriter(output, engine='openpyxl') as writer:
                for sheet_name, pivoted_df in sheets.items():
                    pivoted_df.to_excel(writer, sheet_name=sheet_name, index=True)

            return output.getvalue(), filename

        self._create_download_section(
            "Pivoted Peptide Data Export",
            "Download abundance values organized by sample and peptide ID",
            generate_pivoted_data,
            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
        )

    def export_bioactive_data(self, merged_df, group_data):
        """Export the bioactive function analysis as a three-sheet workbook.

        Nothing is rendered unless ``merged_df`` has a ``'function'`` column
        and ``group_data`` is non-empty.
        """
        if 'function' not in merged_df.columns or not group_data:
            return

        def generate_bioactive_data():
            results = self._bioactive_function_count_and_abundance_sum_avg(merged_df, group_data)
            # The result is always a 5-tuple, so test its contents rather than
            # the tuple itself to detect "nothing to export".
            if not any(results):
                raise ValueError("No bioactive data to export")

            combined_df, combined_count_df, combined_absorbance_df = (
                self._build_bioactive_tables(results)
            )

            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            filename = f"Processed_mbpdb_results_{timestamp}.xlsx"

            output = io.BytesIO()
            with pd.ExcelWriter(output, engine='openpyxl') as writer:
                combined_df.to_excel(writer, sheet_name='combined', index=True)
                combined_count_df.to_excel(writer, sheet_name='count', index=True)
                combined_absorbance_df.to_excel(writer, sheet_name='absorbance', index=True)

            return output.getvalue(), filename

        self._create_download_section(
            "Export Bioactive Function Analysis",
            "Download the bioactive function analysis results in Excel format containing three sheets: combined, count, and absorbance",
            generate_bioactive_data,
            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
        )

    def _bioactive_function_count_and_abundance_sum_avg(self, df, group_data):
        """Count peptides/functions and sum average abundances per group.

        For each group whose ``Avg_<grouping_variable>`` column exists, rows
        with a non-zero, non-NaN average are kept (one row per ``'unique ID'``)
        and the ``';'``-separated ``'function'`` values are split so that each
        function is counted and its abundance summed separately.

        Returns:
            A 5-tuple of dicts keyed by grouping variable:
            ``(summed_function_count, unique_function_counts,
            unique_function_count_averages, unique_function_absorbance,
            summed_function_abundance)``. Groups without a matching column or
            without any qualifying row are omitted; groups without any
            annotated function appear only in the two ``summed_*`` dicts.
        """
        summed_function_count = {}
        unique_function_counts = {}
        unique_function_count_averages = {}
        unique_function_absorbance = {}
        summed_function_abundance = {}

        for group_info in group_data.values():
            grouping_variable = group_info['grouping_variable']
            abundance_column = f'Avg_{grouping_variable}'

            # Groups whose averages were never computed cannot be summarised.
            if abundance_column not in df.columns:
                continue

            subset = df[['unique ID', 'function', abundance_column]]
            has_signal = (
                (subset[abundance_column] != 0)
                & subset[abundance_column].notna()
            )
            filtered_df = subset[has_signal].drop_duplicates(subset='unique ID')

            if filtered_df.empty:
                continue

            # Group totals over all detected peptides
            summed_function_abundance[grouping_variable] = filtered_df[abundance_column].sum()
            summed_function_count[grouping_variable] = filtered_df['unique ID'].nunique()

            # assign() builds a new frame, so the filtered slice is never
            # written to in place.
            split_functions = filtered_df['function'].fillna('').str.split(';')
            exploded_df = filtered_df.assign(function=split_functions).explode('function')
            exploded_df['function'] = exploded_df['function'].str.strip()
            exploded_df = exploded_df[exploded_df['function'] != '']

            if exploded_df.empty:
                continue

            # Count functions
            function_counts = exploded_df['function'].value_counts().to_dict()
            unique_function_counts[grouping_variable] = function_counts

            # Calculate function abundances
            function_grouped = exploded_df.groupby('function')[abundance_column].sum()
            unique_function_absorbance[grouping_variable] = function_grouped.to_dict()

            # One averaged column per group, so the "average" equals the count.
            num_columns_in_group = 1
            unique_function_count_averages[grouping_variable] = {
                func: count / num_columns_in_group
                for func, count in function_counts.items()
            }

        return (summed_function_count, unique_function_counts,
                unique_function_count_averages, unique_function_absorbance,
                summed_function_abundance)
        

In [8]:
class DataProcessingController:
    """Notebook UI that drives data processing, result browsing and export.

    The controller owns a "Process Data" and an "Export Data" button plus one
    output area per concern (processing log, export links, search box, search
    statistics, data grid) and delegates the actual work to the objects held
    by ``workflow``.
    """

    def __init__(self, workflow):
        """Build the widgets and wire the button callbacks.

        Args:
            workflow: Object exposing ``data_transformer``, ``group_processor``
                and ``protein_handler``; these are read when a button is
                clicked.
        """
        self.workflow = workflow  # Store reference to workflow
        self.export_manager = ExportManager()
        self.combiner = None
        self.merged_df = None

        # Create processing button
        self.process_button = widgets.Button(
            description='Process Data',
            button_style='success',
            tooltip='Click to start data processing'
        )

        # Create export button (initially disabled)
        self.export_button = widgets.Button(
            description='Export Data',
            button_style='info',
            tooltip='Process data first to enable export',
            layout=widgets.Layout(margin='0 0 0 10px'),  # Add margin to separate buttons
            disabled=True  # Start disabled
        )

        # Create button container
        self.button_container = widgets.HBox([self.process_button, self.export_button])

        # Create separate output areas
        self.process_output = widgets.Output()
        self.export_output = widgets.Output()
        self.search_output = widgets.Output()
        self.stats_output = widgets.Output()
        self.grid_output = widgets.Output()

        # Set up button callbacks
        self.process_button.on_click(self._on_process_clicked)
        self.export_button.on_click(self._on_export_clicked)

    @staticmethod
    def _column_category(col, mbpdb_columns):
        """Return the header group a result column is displayed under.

        Args:
            col: Column label of the merged DataFrame.
            mbpdb_columns: Collection of column labels that come from the
                MBPDB search results (empty when no MBPDB data is loaded).
        """
        if isinstance(col, str):
            if col.startswith('Avg_'):
                return 'Average Abundance'
            if col.startswith('SEM_'):
                return 'Standard Error Mean'
        if col in mbpdb_columns:
            return 'MBPDB Search Results'
        return 'Peptidomic Data'

    @staticmethod
    def _row_search_mask(df, search_term):
        """Return a boolean Series marking rows that contain ``search_term``.

        Every cell is compared as text, case-insensitively. The term is
        matched literally (not as a regular expression) so that characters
        such as ``(`` or ``[`` typed into the search box cannot raise.
        """
        as_text = df.astype(str)
        # Scan column by column and combine; equivalent to a per-row check
        # but avoids building a Series for every row.
        hits = as_text.apply(
            lambda column: column.str.contains(
                search_term, case=False, na=False, regex=False
            )
        )
        return hits.any(axis=1)

    def display_interactive_results(self, df):
        """Display interactive grid with row search functionality"""
        if df is None:
            print("No data to display")
            return

        # Create search widget
        search_widget = widgets.Text(
            placeholder='Search for data in rows...',
            description='Search:',
            layout=widgets.Layout(width='50%'),
            style={'description_width': 'initial'}
        )

        # The MBPDB results may be missing or None when no database search
        # was run; in that case no column is attributed to MBPDB.
        mbpdb_results = getattr(self.workflow.data_transformer, 'mbpdb_results', None)
        mbpdb_columns = set(mbpdb_results.columns) if mbpdb_results is not None else set()

        # Create multi-level columns while preserving order
        column_tuples = [
            (self._column_category(col, mbpdb_columns), col) for col in df.columns
        ]

        df_display = df.copy()
        df_display.columns = pd.MultiIndex.from_tuples(column_tuples)

        def create_grid(df_to_display):
            grid = DataGrid(
                df_to_display,
                selection_mode='cell',
                editable=False,
                layout=widgets.Layout(height='600px')
            )
            grid.auto_fit_columns = True
            grid.base_row_size = 25
            grid.base_column_size = 150
            grid.auto_fit_params = {'area': 'column', 'padding': 10}
            return grid

        def on_search_change(change):
            with self.grid_output:
                self.grid_output.clear_output()

                search_term = change['new'].strip()
                if search_term:
                    filtered_df = df_display[self._row_search_mask(df_display, search_term)]

                    with self.stats_output:
                        self.stats_output.clear_output()
                        print(f"Found {len(filtered_df)} matching rows out of {len(df_display)} total rows")
                else:
                    # An empty search box shows the full table again.
                    filtered_df = df_display
                    with self.stats_output:
                        self.stats_output.clear_output()

                display(create_grid(filtered_df))

        search_widget.observe(on_search_change, names='value')

        # Display search interface
        with self.search_output:
            self.search_output.clear_output()
            display(search_widget)

        # Initialize grid display
        with self.grid_output:
            self.grid_output.clear_output()
            display(create_grid(df_display))

    def _on_process_clicked(self, b):
        """Run the processing pipeline and show the resulting table."""
        # Clear all outputs except export
        self.process_output.clear_output()
        self.search_output.clear_output()
        self.stats_output.clear_output()
        self.grid_output.clear_output()

        with self.process_output:
            # Pass the actual data_transformer, not the workflow
            self.combiner = CombineAverageDataframes(
                self.workflow.data_transformer,  # Pass the data_transformer directly
                self.workflow.group_processor,
                self.workflow.protein_handler
            )
            self.merged_df = self.combiner.process_data(self.workflow.group_processor.group_data)

            if self.merged_df is not None:
                print("\nData processing completed successfully!")
                print(f"Final results row count: {self.merged_df.shape[0]}")
                print(f"Final results column count: {self.merged_df.shape[1]}")

                # Enable export button after successful processing
                self.export_button.disabled = False
                self.export_button.tooltip = 'Click to show export options'

                self.display_interactive_results(self.merged_df)
            else:
                print("Error: No data was processed")
                # Keep export button disabled if processing failed
                self.export_button.disabled = True

    def _on_export_clicked(self, b):
        """Render every available download section into the export area."""
        with self.export_output:
            clear_output()

            # Use workflow's data_transformer instance
            if (hasattr(self.workflow.data_transformer, 'mbpdb_results') and
                self.workflow.data_transformer.mbpdb_results is not None):
                self.export_manager.export_mbpdb_results(self.workflow.data_transformer.mbpdb_results)

            # Use workflow's group_processor instance
            self.export_manager.export_group_data(self.workflow.group_processor.group_data)

            if self.merged_df is not None:
                self.export_manager.export_bioactive_data(
                    self.merged_df,
                    self.workflow.group_processor.group_data
                )
                self.export_manager.export_dataframe(self.merged_df)
                self.export_manager.setup_pivoted_data_export(
                    self.merged_df,
                    self.workflow.group_processor.group_data
                )

    def display(self):
        """Display the complete interface"""
        display(self.button_container)
        display(self.process_output)
        display(self.export_output)
        display(self.search_output)
        display(self.stats_output)
        display(self.grid_output)

# Initialize the controller.
# NOTE(review): `workflow` is not defined in this cell; it is presumably
# created by an earlier notebook cell — confirm that cell has been run first.
controller = DataProcessingController(workflow)
# Display the interface: the button row followed by the process, export,
# search, statistics and grid output areas.
controller.display()

HBox(children=(Button(button_style='success', description='Process Data', style=ButtonStyle(), tooltip='Click …

Output()

Output()

Output()

Output()

Output()