# Data Transformation

In [27]:
## Install/Import packages & define key variables and functions
# Run install script
# %chmod +x setup_jupyterlab.sh
# %./setup_jupyterlab.sh

# Import necessary libraries for the script to function.
import pandas as pd
import csv, json, re, os, shutil, io, base64
from io import StringIO, BytesIO
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import numpy as np
import matplotlib.gridspec as gridspec
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.patches import Patch

#import statsmodels.api as sm
#from statsmodels.formula.api import ols
#from statsmodels.stats.multicomp import pairwise_tukeyhsd
import warnings

from functools import partial
import seaborn as sns
from scipy.stats import pearsonr
from itertools import combinations
from ipydatagrid import DataGrid

from IPython.display import display, HTML, clear_output
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual
from my_functions_datatransformation import (process_protein_combinations, setup_data_loading_ui, display_widgets, setup_widgets,
                                     initialize_settings, check_and_add_protein, process_pd_results, extract_bioactive_peptides, select_proteins,
                                     calculate_group_abundance_std_averages, export_dataframe, prompt_export_options, setup_widgets_vp, impute_missing_values,
                                     adjust_sequence_interval, export_heatmap_data_to_dict, export_group_data, display_grouping_dictionary_selector,
                                     replace_protein_accessions,check_sequence_alignment)
import traitlets
from traitlets import HasTraits, Instance, observe

# Global variable declaration
# The value returned by initialize_settings() (imported from
# my_functions_datatransformation) is injected into this notebook's global
# namespace, so every entry becomes a module-level name.
settings_dict = initialize_settings()
globals().update(settings_dict)
# NOTE(review): this import sits below the main import block; consider moving it up.
import _settings as settings
# NOTE(review): a `global` statement at module level has no effect; the
# assignment below already creates a module-level name.
global spec_translate_list
# Lookup table read by DataTransformation._find_species.
spec_translate_list = settings.SPEC_TRANSLATE_LIST
# Set the default font to Calibri
#matplotlib.rcParams['font.family'] = 'Calibri'


## Imports Proteome Discoverer Data and MBPDB Bioactive Peptide Matches

In [2]:
class DataTransformation(HasTraits):
    pd_results = Instance(pd.DataFrame, allow_none=True)
    mbpdb_results = Instance(pd.DataFrame, allow_none=True)
    
    def __init__(self):
        """Create a transformer with no data loaded and no widgets built yet."""
        super().__init__()

        # Data containers stay empty until the user uploads files.
        self.pd_results = pd.DataFrame()
        self.mbpdb_results = pd.DataFrame()
        self.pd_results_cleaned = pd.DataFrame()
        self.proteins_dic = {}

        # Widget handles; setup_data_loading_ui() replaces these placeholders.
        for widget_attr in ('output_area', 'mbpdb_uploader', 'pd_uploader',
                            'fasta_uploader', 'reset_button'):
            setattr(self, widget_attr, None)
    def setup_data_loading_ui(self):
        """Build the upload widgets, render them, and hook up their callbacks."""

        def make_uploader(label, extensions, allow_many):
            # All three uploaders share the same layout and description styling.
            return widgets.FileUpload(
                accept=extensions,
                multiple=allow_many,
                description=label,
                layout=widgets.Layout(width='300px'),
                style={'description_width': 'initial'}
            )

        tabular_types = '.csv,.txt,.tsv,.xlsx'
        self.mbpdb_uploader = make_uploader('Upload MBPDB File', tabular_types, False)
        self.pd_uploader = make_uploader('Upload Peptidomic File', tabular_types, False)
        self.fasta_uploader = make_uploader('Upload FASTA Files', '.fasta', True)

        self.reset_button = widgets.Button(description='Reset', button_style='warning')
        self.output_area = widgets.Output()

        # Render order: data uploads, FASTA uploads, then reset button + message area.
        display(HTML("<h3><u>Upload Data Files:</u></h3>"))
        display(self.mbpdb_uploader, self.pd_uploader)
        display(HTML("<h3><u>Upload Protein FASTA Files:</u></h3>"))
        display(self.fasta_uploader)
        display(self.reset_button, self.output_area)

        # Each uploader reports changes of its `value` trait to its own handler.
        upload_handlers = (
            (self.pd_uploader, self._on_pd_upload_change),
            (self.mbpdb_uploader, self._on_mbpdb_upload_change),
            (self.fasta_uploader, self._on_fasta_upload_change),
        )
        for uploader, handler in upload_handlers:
            uploader.observe(handler, names='value')
        self.reset_button.on_click(self._reset_ui)

    def _reset_ui(self, b):
        """Clear every upload widget and drop all loaded data.

        Args:
            b: The button instance passed by ``on_click`` (unused).
        """
        for uploader in (self.mbpdb_uploader, self.pd_uploader, self.fasta_uploader):
            uploader._counter = 0
            uploader.value = ()

        # Reset to empty frames -- the same state __init__ creates -- instead of
        # None: other methods of this class test `self.pd_results.empty`, which
        # raises AttributeError on None.
        # The cleaned copy is reset first so that observers triggered by the
        # pd_results assignment do not see stale cleaned data.
        self.pd_results_cleaned = pd.DataFrame()
        self.pd_results = pd.DataFrame()
        self.mbpdb_results = pd.DataFrame()
        self.proteins_dic = {}

        with self.output_area:
            self.output_area.clear_output()
            display(HTML('<b style="color:blue;">All uploads cleared.</b>'))

    def _on_pd_upload_change(self, change):
        """Load a newly uploaded peptidomic file and report the outcome.

        Args:
            change: Change notification from the uploader's ``value`` trait.
        """
        if not (change['type'] == 'change' and change['name'] == 'value'):
            return

        with self.output_area:
            self.output_area.clear_output()
            uploaded = change['new']
            if not (uploaded and len(uploaded) > 0):
                return

            loaded, status = self._load_data(
                uploaded[0],
                required_columns=['Positions in Proteins'],
                file_type='Peptidomic'
            )
            # Stored even when loading failed (loaded is None in that case).
            self.pd_results = loaded
            if status == 'yes' and self.pd_results is not None:
                n_rows, n_cols = self.pd_results.shape[0], self.pd_results.shape[1]
                display(HTML(f'<b style="color:green;">Peptidomic data imported with {n_rows} rows and {n_cols} columns.</b>'))

    def _on_mbpdb_upload_change(self, change):
        """Load a newly uploaded MBPDB file and report the outcome.

        Args:
            change: Change notification from the uploader's ``value`` trait.
        """
        if not (change['type'] == 'change' and change['name'] == 'value'):
            return

        with self.output_area:
            self.output_area.clear_output()
            uploaded = change['new']
            if not (uploaded and len(uploaded) > 0):
                return

            loaded, status = self._load_data(
                uploaded[0],
                required_columns=['Search peptide', 'Protein ID', 'Peptide'],
                file_type='MBPDB'
            )
            # Stored even when loading failed (loaded is None in that case).
            self.mbpdb_results = loaded
            if status == 'yes' and self.mbpdb_results is not None:
                n_rows, n_cols = self.mbpdb_results.shape[0], self.mbpdb_results.shape[1]
                display(HTML(f'<b style="color:green;">MBPDB file imported with {n_rows} rows and {n_cols} columns</b>'))

    def _on_fasta_upload_change(self, change):
        """Parse every uploaded FASTA file and merge its proteins into ``proteins_dic``.

        Args:
            change: Change notification from the uploader's ``value`` trait.
        """
        # Local import: only this handler needs it, and the name would be easy
        # to confuse with IPython's HTML at module level.
        from html import escape

        if change['type'] != 'change' or change['name'] != 'value':
            return

        with self.output_area:
            self.output_area.clear_output()
            for file_data in change['new'] or ():
                try:
                    file_name = getattr(file_data, 'name', None)
                    # Case-insensitive so that e.g. "proteins.FASTA" is accepted too.
                    if file_name and file_name.lower().endswith('.fasta'):
                        new_proteins = self._parse_uploaded_fasta(file_data)
                        self.proteins_dic.update(new_proteins)
                        # File names are user-controlled text: escape before embedding in HTML.
                        display(HTML(f'<b style="color:green;">Successfully imported FASTA file: {escape(file_name)} ({len(new_proteins)} proteins)</b>'))
                    else:
                        display(HTML('<b style="color:red;">Invalid file format. Please upload FASTA files only.</b>'))
                except Exception as e:
                    # One bad file must not stop the remaining files from loading.
                    display(HTML(f'<b style="color:red;">Error processing FASTA file: {escape(str(e))}</b>'))

    def _load_data(self, file_obj, required_columns, file_type):
        """Read an uploaded table into a DataFrame and check its columns.

        Args:
            file_obj: Uploaded file exposing ``content`` (bytes-like) and ``name``.
            required_columns: Column names that must be present after stripping
                whitespace from the header.
            file_type: Label used in error messages.

        Returns:
            ``(DataFrame, 'yes')`` on success, ``(None, 'no')`` when reading
            fails or a required column is missing (an error is displayed).
        """
        # The parser is chosen by file extension.
        readers = {
            'csv': lambda stream: pd.read_csv(stream),
            'txt': lambda stream: pd.read_csv(stream, delimiter='\t'),
            'tsv': lambda stream: pd.read_csv(stream, delimiter='\t'),
            'xlsx': lambda stream: pd.read_excel(stream),
        }
        try:
            raw_bytes = file_obj.content
            suffix = file_obj.name.split('.')[-1].lower()
            buffer = io.BytesIO(raw_bytes)

            reader = readers.get(suffix)
            if reader is None:
                raise ValueError("Unsupported file format.")
            table = reader(buffer)

            table.columns = table.columns.str.strip()

            missing = set(required_columns) - set(table.columns)
            if missing:
                display(HTML(f'<b style="color:red;">{file_type} File Error: Missing required columns: {", ".join(missing)}</b>'))
                return None, 'no'

            return table, 'yes'
        except Exception as e:
            display(HTML(f'<b style="color:red;">{file_type} File Error: {str(e)}</b>'))
            return None, 'no'

    def _parse_uploaded_fasta(self, file_data):
        """Parse an uploaded FASTA file into a dictionary of proteins.

        Only headers with at least three ``|``-separated fields are recognised:
        the second field becomes the accession, and the third field up to
        `` OS=`` becomes the protein name. Records with any other header layout
        are skipped.

        Args:
            file_data: Uploaded file exposing ``content`` (bytes-like).

        Returns:
            dict mapping accession -> ``{"name", "sequence", "species"}``.
        """
        # 'utf-8-sig' drops a leading byte-order mark; with plain 'utf-8' the
        # BOM would hide the first '>' and the first record would be lost.
        fasta_text = bytes(file_data.content).decode('utf-8-sig')

        fasta_dict = {}
        protein_id = ""
        protein_name = ""
        species = ""
        sequence_chunks = []

        for line in fasta_text.split('\n'):
            line = line.strip()

            if not line.startswith('>'):
                # Sequence lines only count while a recognised record is open.
                if protein_id:
                    sequence_chunks.append(line)
                continue

            # A new header closes the record that was being assembled.
            if protein_id:
                fasta_dict[protein_id] = {
                    "name": protein_name,
                    "sequence": ''.join(sequence_chunks),
                    "species": species
                }
            sequence_chunks = []

            header_parts = line[1:].split('|')
            if len(header_parts) > 2:
                protein_id = header_parts[1]
                protein_name = re.split(r' OS=', header_parts[2])[0]
                species = self._find_species(line)
            else:
                # Unrecognised header: forget the previous accession so this
                # record's sequence cannot overwrite the previous protein.
                protein_id = ""
                protein_name = ""
                species = ""

        # Store the final record (no header follows it).
        if protein_id:
            fasta_dict[protein_id] = {
                "name": protein_name,
                "sequence": ''.join(sequence_chunks),
                "species": species
            }

        return fasta_dict

    def _find_species(self, header):
        """Return the species label whose search terms occur in ``header``.

        Each entry of the module-level ``spec_translate_list`` is read as
        ``[label, term, term, ...]``; matching is case-insensitive and the first
        hit wins. Returns ``"unknown"`` when no term matches.
        """
        lowered = header.lower()
        hits = (
            group[0]
            for group in spec_translate_list
            for alias in group[1:]
            if alias.lower() in lowered
        )
        return next(hits, "unknown")
    
    def process_protein_combinations(self):
        """Build the UI for resolving peptides that map to several proteins.

        One text box is shown per distinct multi-accession value of
        'Master Protein Accessions'; the Submit button hands the entries to
        ``on_submit``.

        Returns:
            A copy of ``pd_results`` (also stored in ``pd_results_cleaned``)
            that ``on_submit`` later edits in place, or ``None`` when no
            peptidomic data is loaded or the accession column is missing.
        """
        accession_col = 'Master Protein Accessions'

        # pd_results is None after a failed upload, so test that before `.empty`.
        if self.pd_results is None or self.pd_results.empty:
            return None
        # The upload step only validates 'Positions in Proteins'.
        if accession_col not in self.pd_results.columns:
            display(HTML(f'<b style="color:red;">Peptidomic File Error: Missing required columns: {accession_col}</b>'))
            return None

        df = self.pd_results.copy()
        accessions = self.pd_results[accession_col]

        # na=False: without it, rows lacking an accession put NaN into the
        # mask and boolean indexing raises.
        num_multiple_entries = int(accessions.str.contains(';', na=False).sum())
        self.multi_protein_combinations = [
            combo for combo in accessions.dropna().unique()
            if isinstance(combo, str) and ';' in combo
        ]

        # Instructions for user actions
        html_content = """
            <h3>Options</h3>
            For each protein combination with multiple entries, you have two options:<br>
            1. <b>'new'</b> - Create a new row for each protein listed in the 'Master Protein Accessions' column and their corresponding 'Positions in Proteins'.<br>
            2. <b>Enter a Protein ID</b> - Replace the current protein combination with a custom Protein ID of your choice, updating 'Positions in Proteins' accordingly.
            """

        children = [
            widgets.HTML("<h3>Peptides Mapped to Multiple Proteins</h3>"),
            widgets.HTML("Peptides that have been identified and <b>mapped to multiple proteins</b> and the '<b>Master Protein Accessions</b>' and '<b>Positions in Proteins</b>' columns have multiple entries for a single peptide require special attention."),
            widgets.HTML(f"In your dataset, you have <b>{num_multiple_entries}</b> peptides mapped to multiple Master Protein Accessions."),
            widgets.HTML(html_content),
        ]

        self.user_decisions = {}
        self.decision_inputs = []

        # One decision box per combination.
        for combo in self.multi_protein_combinations:
            named_combo = self.fetch_protein_names(combo)
            # Exact match, because on_submit only rewrites rows whose accession
            # string equals the combination; a substring test would also count
            # rows of longer combinations that merely contain this one.
            occurrences = int((accessions == combo).sum())

            decision_box = widgets.Text(
                placeholder="Enter 'new', or a custom Protein ID",
                description='Decision:',
                layout=widgets.Layout(width='300px')
            )
            self.decision_inputs.append(decision_box)
            children.append(widgets.VBox([
                widgets.HTML(f"<b>{occurrences}</b> occurrences of<br><b>{named_combo}</b>."),
                decision_box
            ]))

        submit_button = widgets.Button(description="Submit", button_style='success')
        reset_button = widgets.Button(description="Reset Selection", button_style='warning')
        children.append(widgets.HBox([submit_button, reset_button]))

        # Left column: instructions and inputs. Right column: submit feedback.
        input_area = widgets.VBox(children, layout=widgets.Layout(width='100%'))
        self.protein_output_area = widgets.Output()
        grid = widgets.GridspecLayout(1, 2, width='1000px', grid_gap='5px')
        grid[0, 0] = input_area
        grid[0, 1] = self.protein_output_area

        reset_button.on_click(self.on_reset_button_clicked)
        # `df` is captured here, so Submit edits the same object that is stored
        # in pd_results_cleaned below.
        submit_button.on_click(lambda b: self.on_submit(b, df))
        self.pd_results_cleaned = df
        display(grid)
        return df
        
    def on_submit(self, button, df):
        """Apply the entered decision for each multi-protein combination to ``df``.

        A decision of ``new`` splits a row into one row per accession; any other
        non-empty text replaces the accession combination with that text
        (upper-cased). Combinations whose box is empty are left untouched.

        Args:
            button: The clicked button (unused).
            df: Working DataFrame; modified in place.

        Returns:
            The same ``df`` object.
        """
        accession_col = 'Master Protein Accessions'
        position_col = 'Positions in Proteins'

        with self.protein_output_area:
            self.protein_output_area.clear_output()

            # An empty box means "no decision". It must not be treated as a
            # custom ID, which would blank out the accession column.
            self.user_decisions = {}
            skipped_combos = []
            for combo, decision_input in zip(self.multi_protein_combinations, self.decision_inputs):
                decision = decision_input.value.strip().upper()
                if decision:
                    self.user_decisions[combo] = decision
                else:
                    skipped_combos.append(combo)

            pending_rows = []   # rows to append once iteration has finished
            unresolved = set()  # combinations with at least one row left unchanged

            for index, row in df.iterrows():
                proteins_row = row[accession_col]
                positions_row = row[position_col]
                if not isinstance(proteins_row, str) or proteins_row not in self.user_decisions:
                    continue
                if not isinstance(positions_row, str):
                    # Missing positions (NaN): nothing to split or rewrite.
                    unresolved.add(proteins_row)
                    continue

                decision = self.user_decisions[proteins_row]
                # Split on ';' and trim, so both "A; B" and "A;B" are handled.
                accessions = [part.strip() for part in proteins_row.split(';') if part.strip()]
                positions = [part.strip() for part in positions_row.split(';') if part.strip()]

                if decision == 'NEW':
                    # Pair each accession with the first unused position entry that mentions it.
                    accession_position_map = {}
                    remaining = list(positions)
                    for acc in accessions:
                        match = next((pos for pos in remaining if acc in pos), None)
                        if match is not None:
                            accession_position_map[acc] = match
                            remaining.remove(match)
                    acc_pos_pairs = list(accession_position_map.items())

                    if not acc_pos_pairs:
                        unresolved.add(proteins_row)
                        continue

                    # The current row keeps the first pair ...
                    df.at[index, accession_col] = acc_pos_pairs[0][0]
                    df.at[index, position_col] = acc_pos_pairs[0][1]

                    # ... and every further pair becomes a new row.
                    for acc, pos in acc_pos_pairs[1:]:
                        new_row = row.copy()
                        new_row[accession_col] = acc
                        new_row[position_col] = pos
                        pending_rows.append(new_row)
                else:
                    new_positions = []
                    for pos in positions:
                        num_range = pos[pos.index('['):] if '[' in pos else ''
                        new_positions.append(f"{decision} {num_range}")
                    df.at[index, accession_col] = decision
                    df.at[index, position_col] = '; '.join(new_positions)

            # Append after the loop so the frame is not enlarged while it is
            # being iterated, and pick a label that cannot overwrite a row.
            for new_row in pending_rows:
                next_label = len(df)
                while next_label in df.index:
                    next_label += 1
                df.loc[next_label] = new_row

            # Display output
            for combo, decision in self.user_decisions.items():
                if combo in unresolved:
                    display(HTML(f'<b>{combo}</b> <b style="color:red;">could not be fully processed: at least one row has no usable \'Positions in Proteins\' entry and was left unchanged.</b>'))
                    continue
                display(HTML(f'<b>{combo}</b> <b style="color:green;">has been successfully processed.</b>'))
                if decision == 'NEW':
                    display(HTML('&nbsp;&nbsp;&nbsp;&nbsp;Shared occurrences of the peptide have been separated, with each now assigned a unique protein ID in a new row.'))
                else:
                    display(HTML(f'&nbsp;&nbsp;&nbsp;&nbsp;The occurrences of the peptide with the shared combined protein ID "{combo}" have been replaced with "{decision}".'))
            for combo in skipped_combos:
                display(HTML(f'<b>{combo}</b> <b style="color:orange;">was left unchanged because no decision was entered.</b>'))
        return df
    
    def on_reset_button_clicked(self, b):
        """Show instructions for redoing the selection; no data is changed.

        Args:
            b: The clicked button (unused).
        """
        notice = HTML('<span style="color:red;">To reset "Mapped to Multiple Proteins" selection after hitting the submit button, <b>rerun the cell</b> and make the correct selections. This button <b>only</b> displays instructions</span>')
        feedback_area = self.protein_output_area
        with feedback_area:
            feedback_area.clear_output()
            display(notice)
        
    def fetch_protein_names(self, accession_str):
        """Render a '; '-separated accession string as HTML, one accession per line.

        Accessions found in ``proteins_dic`` are followed by their species and
        name in blue; unknown accessions are shown as-is.
        """
        def describe(acc):
            if acc not in self.proteins_dic:
                return acc
            entry = self.proteins_dic[acc]
            return f"{acc}<span style='color:blue'> ({entry['species']} - {entry['name']})</span>"

        return '<br>'.join(describe(acc) for acc in accession_str.split('; '))
    def handle_protein_combinations(self):
        """
        Ask whether peptides mapped to multiple proteins should be processed.

        'Yes' opens the combination UI via ``process_protein_combinations``;
        'No' copies the original data into ``pd_results_cleaned`` unchanged.
        """
        display(HTML("<h3>Multiple Protein Mappings</h3>"))

        choice = widgets.RadioButtons(
            options=[('Yes', True), ('No', False)],
            description='Process peptides mapped to multiple proteins?',
            style={'description_width': 'initial'},
            value=None  # This makes it start unchecked
        )
        output = widgets.Output()

        def process_choice(_):
            with output:
                clear_output()
                has_data = self.pd_results is not None and not self.pd_results.empty
                if choice.value:
                    # Call through `self`, not a notebook-level variable, so the
                    # method works whatever name the instance is bound to.
                    result = self.process_protein_combinations() if has_data else None
                    if result is None:
                        # Nothing to process: keep an empty frame, not None, so
                        # later `.empty` checks keep working.
                        self.pd_results_cleaned = pd.DataFrame()
                        display(HTML("<b style='color:red;'>No peptidomic data available. Upload a peptidomic file first.</b>"))
                    else:
                        self.pd_results_cleaned = result
                        display(HTML("<b style='color:green;'>Processed peptides mapped to multiple proteins.</b>"))
                else:
                    self.pd_results_cleaned = self.pd_results.copy() if self.pd_results is not None else pd.DataFrame()
                    display(HTML("<b>Using original protein mappings.</b>"))

        choice.observe(process_choice, 'value')
        display(choice)
        display(output)

    # Then to use it, we can create an observe function:
    def observe_data_changes(change):
        # NOTE(review): declared in the class body without `self`, so when it is
        # called on an instance the instance itself is bound to `change`.
        # NOTE(review): relies on the notebook-level names `combiner`,
        # `setup_data` and `data_transformer`; `combiner` and `setup_data` are
        # not defined in the part of the notebook visible here -- confirm they
        # exist before relying on this. attach_observers() below looks like the
        # intended replacement.
        if hasattr(change, 'new'):
            combiner.update_data(data_transformer.pd_results, data_transformer.mbpdb_results)
            setup_data.update_data(data_transformer.pd_results, data_transformer.pd_results_cleaned)
    
        
    
    # Add this to DataTransformation class:
    def attach_observers(self, group_processor):
        """
        Forward data changes on this transformer to a group processor.

        Args:
            group_processor: Object exposing
                ``update_data(pd_results, pd_results_cleaned)``.
        """
        # NOTE(review): the visible class header declares only pd_results and
        # mbpdb_results as traits; pd_results_cleaned is assigned as a plain
        # attribute, so presumably no notification fires for it -- confirm.
        watched = ['pd_results', 'pd_results_cleaned']

        def _push_to_group_processor(change):
            if change.name not in watched:
                return
            group_processor.update_data(self.pd_results, self.pd_results_cleaned)

        self.observe(_push_to_group_processor, names=watched)

In [3]:
# Cell 1: Create the instance and setup UI
# `data_transformer` is the notebook-wide DataTransformation instance that the
# later cells call.
data_transformer = DataTransformation()
# Renders the MBPDB, peptidomic and FASTA upload widgets plus the reset button.
data_transformer.setup_data_loading_ui()


FileUpload(value=(), accept='.csv,.txt,.tsv,.xlsx', description='Upload MBPDB File', layout=Layout(width='300p…

FileUpload(value=(), accept='.csv,.txt,.tsv,.xlsx', description='Upload Peptidomic File', layout=Layout(width=…

FileUpload(value=(), accept='.fasta', description='Upload FASTA Files', layout=Layout(width='300px'), multiple…



Output()

## Handles Peptides Matched to Multiple Proteins

In [4]:
# Then call the handle_protein_combinations method
# Shows a Yes/No prompt; the choice determines data_transformer.pd_results_cleaned.
data_transformer.handle_protein_combinations()

RadioButtons(description='Process peptides mapped to multiple proteins?', options=(('Yes', True), ('No', False…

Output()

In [5]:
# Disabled cell: the DataGrid preview of the uploaded peptidomic data is kept
# inside a string literal so it does not run; the trailing ';' suppresses the
# string from being echoed as cell output.
"""
from ipydatagrid import DataGrid

if data_transformer.pd_results is not None:
    grid = DataGrid(
        data_transformer.pd_results,
        selection_mode='cell',
        grid_style={'gridStroke': '#ddd'},
        base_row_size=25,
        base_column_size=100,
        auto_fit_columns=True,
        layout={'height': '300px', 'width': 'auto'}
    )
    display(grid)
else:
    print("No peptidomic data loaded yet")
""";

## Group Data by Categorical Variables

In [10]:
class GroupProcessing:
    def __init__(self):
        self.group_data = {}
        self.group_number = 1
        self.filtered_columns = []
        self.group_uploader = widgets.FileUpload(
        accept='.json',
        multiple=False,
        description='Upload Groups File',
        layout=widgets.Layout(width='300px'),
        style={'description_width': 'initial'}
        )
        self.group_uploader.observe(self._on_group_upload_change, names='value')
        
        # Initialize output areas
        self.output = widgets.Output()
        self.gd_output_area = widgets.Output()
        
        # Initialize widgets for group selection
        self.column_dropdown = widgets.SelectMultiple(
            description='Absorbance',
            style={'description_width': 'initial'},
            disabled=False,
            layout=widgets.Layout(width='90%', height='300px')
        )
        
        self.grouping_variable_text = widgets.Text(
            description='Group Name',
            layout=widgets.Layout(width='90%'),
            style={'description_width': 'initial'}
        )
        
        # Initialize buttons
        self.search_button = widgets.Button(
            description='Search',
            button_style='info',
            layout=widgets.Layout(margin='10px 10px 0 0')
        )
        
        self.add_group_button = widgets.Button(
            description='Add Group',
            button_style='success',
            layout=widgets.Layout(margin='10px 10px 0 0')
        )
        
        self.reset_file_button = widgets.Button(
            description='Reset Selection',
            button_style='warning',
            layout=widgets.Layout(margin='10px 10px 0 75px')
        )
        
        # Set up button callbacks
        self.search_button.on_click(self._search_columns)
        self.add_group_button.on_click(self._add_group)
        self.reset_file_button.on_click(self._reset_selection)
        

    def update_data(self, pd_results, pd_results_cleaned):
        """Update data and refresh filtered columns"""
        self.pd_results = pd_results
        self.pd_results_cleaned = pd_results_cleaned
        
        # Only update if we have valid data
        if pd_results is not None or pd_results_cleaned is not None:
            self.setup_data()
            
            # Update the dropdown with new filtered columns
            with self.output:
                self.output.clear_output()
                display(widgets.HTML('<b style="color:green;">Data updated successfully. Column selection refreshed.</b>'))
    
            
    def setup_data(self):
        """Initialize data and filters for the analysis"""
        # Define columns to exclude with more flexible matching
        columns_to_exclude = [
            'Marked as', 'Number of Missed Cleavages', 'Missed Cleavages',
            'Checked', 'Confidence', 'Annotated Sequence', 'Unnamed: 3', 
            'Modifications', 'Protein Groups', 'Proteins', 'PSMs', 
            'Master Protein Accessions', 'Positions in Proteins', 
            'Modifications in Proteins',
            'Theo MHplus in Da', 'Quan Info', 
            'Confidence by Search Engine', 
            'q-Value by Search Engine',
            'PEP by Search Engine',
            'SVM Score by Search Engine',
            'XCorr by Search Engine',
            'PEP', 'q-Value', 'Top Apex RT', 'RT in min',
            'Sequence', 'Search peptide', 'Peptide', 'Protein ID', 
            'Protein description', 'Alignment', 'Species', 
            'Intervals', 'Function', 'unique ID'
            ]
        
        exclude_substrings = [
            'Abundances by Bio Rep', 
            'Count', 
            'Origin',
            'Average_Abundance'  # Added to exclude average abundance columns
        ]
    
        # Use cleaned data if available, otherwise use original
        df = self.pd_results_cleaned if (hasattr(self, 'pd_results_cleaned') and 
                                       not self.pd_results_cleaned.empty) else self.pd_results
        
        if df is not None and not df.empty:
            # More flexible column filtering
            self.filtered_columns = []
            for col in df.columns:
                # Check if any exclusion pattern matches the column name
                should_exclude = any(excl.lower() in col.lower() for excl in columns_to_exclude)
                # Check if any substring pattern matches
                has_excluded_substring = any(sub.lower() in col.lower() for sub in exclude_substrings)
                
                if not should_exclude and not has_excluded_substring:
                    self.filtered_columns.append(col)
              
            # Update dropdown options
            self.column_dropdown.options = self.filtered_columns
            self._reset_inputs()
        else:
            self.filtered_columns = []
            self.column_dropdown.options = []
            with self.output:
                self.output.clear_output()
                display(widgets.HTML('<b style="color:red;">No valid data available for processing.</b>'))

    def display_group_selector(self):
        """Render the heading, upload control and message area for a saved group dictionary."""
        heading = widgets.HTML("<h3><u>Upload Existing Group Dictionary:</u></h3>")
        display(heading)
        display(self.group_uploader, self.gd_output_area)
        

    def display_widgets(self):
        """Render the two-column group-selection UI: inputs left, results right."""

        def scrolling_panel(**extra):
            # Both panels are fixed-height boxes with a vertical scroll bar.
            return widgets.Layout(width='95%', height='600px', overflow_y='auto', **extra)

        left_panel = widgets.VBox(
            [
                widgets.HTML("<h3><u>Select New Grouping of Data:</u></h3>"),
                widgets.HTML('Now select the <b>absorbance columns</b> and assign the name of the <b>grouping variable</b>:'),
                self.column_dropdown,
                self.grouping_variable_text,
                widgets.HBox([self.search_button, self.add_group_button]),
                widgets.HBox([self.reset_file_button]),
            ],
            layout=scrolling_panel(),
        )

        right_panel = widgets.VBox(
            [
                widgets.HTML("<h3><u>Group Selection Results:</u></h3>"),
                self.output,
            ],
            layout=scrolling_panel(padding='10px'),
        )

        # One row, two columns.
        grid = widgets.GridspecLayout(1, 2, width='1000px', grid_gap='5px')
        grid[0, 0] = left_panel
        grid[0, 1] = right_panel

        display(grid)
    def _on_gd_submit(self, b, dropdown):
        """Replace ``group_data`` with the groups stored in a JSON file on disk.

        Args:
            b: The clicked button (unused).
            dropdown: Widget whose ``value`` is the path of the chosen file.
        """
        chosen_path = dropdown.value
        placeholder = 'Select an existing grouping dictionary file'

        with self.gd_output_area:
            clear_output()

            if chosen_path == placeholder:
                print("Please select a valid file.")
                return

            try:
                with open(chosen_path, 'r') as handle:
                    loaded_groups = json.load(handle)
                # Groups from the file replace whatever was defined before.
                self.group_data = {}

                with self.output:
                    clear_output()
                    for key, info in loaded_groups.items():
                        label = info.get('grouping_variable')
                        columns = info.get('abundance_columns')

                        self.group_data[key] = {
                            'grouping_variable': label,
                            'abundance_columns': columns
                        }

                        display(widgets.HTML(
                            f"<b>Group {key}</b> created with <b>{len(columns)} columns assigned</b>."
                        ))
                        display(widgets.HTML(f"<b>Grouping Variable:</b> {label}"))
                        display(widgets.HTML(f"<b>Selected Columns:</b> {', '.join(columns)}"))
                        display(widgets.HTML("<hr style='border: 1px solid black;'>"))

                display(widgets.HTML(f'<b style="color:green;">Successfully uploaded: {chosen_path}</b>'))

            except Exception as e:
                display(widgets.HTML(f"<b style='color:red;'>An error occurred while processing the file: {str(e)}</b>"))
    
    def _search_columns(self, b):
        """Search for columns based on group name"""
        group_name = self.grouping_variable_text.value
        if group_name:
            matching_columns = [col for col in self.filtered_columns if group_name in col]
            self.column_dropdown.value = matching_columns
        else:
            with self.output:
                clear_output()
                display(widgets.HTML('<b style="color:red;">Please enter a group name to search.</b>'))
    
    def _add_group(self, b):
        """Add a new group to the data"""
        group_name = self.grouping_variable_text.value
        selected_columns = list(self.column_dropdown.value)
        
        if not (group_name and selected_columns):
            with self.output:
                display(widgets.HTML('<b style="color:red;">Please enter a group name and select at least one column.</b>'))
            return
        
        # If group_data exists, use next number, otherwise start at 1
        if self.group_data:
            # Convert existing keys to integers and find max
            existing_numbers = [int(k) for k in self.group_data.keys()]
            next_number = max(existing_numbers) + 1
            self.group_number = str(next_number)
        else:
            self.group_data = {}
            self.group_number = "1"
        
        # Add new group data to the dictionary
        self.group_data[self.group_number] = {
            'grouping_variable': group_name,
            'abundance_columns': selected_columns
        }
        
        # Display output
        with self.output:
            display(widgets.HTML(f"<b>Group {self.group_number}</b> created with <b>{len(selected_columns)} columns assigned</b>."))
            display(widgets.HTML(f"<b>Grouping Variable:</b> {group_name}"))
            display(widgets.HTML(f"<b>Selected Columns:</b> {', '.join(selected_columns)}"))
            display(widgets.HTML("<hr style='border: 1px solid black;'>"))
        
        self._reset_inputs()
        
    def _reset_selection(self, b):
        """Reset all selections and data"""
        self.group_data = {}
        self.group_number = 1
        with self.gd_output_area:
            clear_output()
        with self.output:
            clear_output()
        self._reset_inputs()
    
    def _reset_inputs(self):
        """Reset input fields"""
        self.grouping_variable_text.value = ''
        self.column_dropdown.value = ()

    def _on_group_upload_change(self, change):
        """Merge the groups from an uploaded JSON file into ``group_data``.

        Existing groups are kept; an uploaded group with the same key replaces
        the stored one.

        Args:
            change: Change notification from the uploader's ``value`` trait.
        """
        is_value_change = change['type'] == 'change' and change['name'] == 'value'
        if not is_value_change:
            return

        with self.gd_output_area:
            uploads = change['new']
            if not (uploads and len(uploads) > 0):
                return

            upload = uploads[0]
            try:
                parsed = json.loads(bytes(upload.content).decode('utf-8'))

                # Per-group details go to the results panel ...
                with self.output:
                    for key, info in parsed.items():
                        label = info.get('grouping_variable')
                        columns = info.get('abundance_columns')

                        self.group_data[key] = {
                            'grouping_variable': label,
                            'abundance_columns': columns
                        }

                        display(widgets.HTML(
                            f"<b>Group {key}</b> created with <b>{len(columns)} columns assigned</b>."
                        ))
                        display(widgets.HTML(f"<b>Grouping Variable:</b> {label}"))
                        display(widgets.HTML(f"<b>Selected Columns:</b> {', '.join(columns)}"))
                        display(widgets.HTML("<hr style='border: 1px solid black;'>"))

                # ... while the overall status is shown next to the uploader.
                display(widgets.HTML(f'<b style="color:green;">Successfully uploaded: {upload.name}</b>'))

            except Exception as e:
                display(widgets.HTML(f"<b style='color:red;'>An error occurred while processing the file: {str(e)}</b>"))


In [7]:
# Create the group-selection UI and connect it to the uploaded data.
group_processor = GroupProcessing()
group_processor.display_group_selector()
# group_processor.setup_data()  (disabled; update_data() calls setup_data() when data changes)
group_processor.display_widgets()
# From here on, changes observed on data_transformer are forwarded to group_processor.update_data().
data_transformer.attach_observers(group_processor)

HTML(value='<h3><u>Upload Existing Group Dictionary:</u></h3>')

FileUpload(value=(), accept='.json', description='Upload Groups File', layout=Layout(width='300px'))

Output()

GridspecLayout(children=(VBox(children=(HTML(value='<h3><u>Select New Grouping of Data:</u></h3>'), HTML(value…

## Transform & Export Data

In [8]:
class CombineAverageDataframes:
    """Merge peptidomic (PD) results with MBPDB matches and add group averages.

    The instance keeps its own references to the ``pd_results`` and
    ``mbpdb_results`` frames of ``data_transformer`` and refreshes them through
    an observer whenever either of those traits changes. The outcome of the
    last successful ``process_data`` call is available as ``merged_df``.
    """

    def __init__(self, data_transformer, group_processor):
        """Store the data sources and subscribe to changes of the input frames.

        Args:
            data_transformer: Object exposing ``pd_results``, ``mbpdb_results``
                and an ``observe(handler, names=[...])`` method.
            group_processor: Stored as-is; not read by the methods of this class.
        """
        self.data_transformer = data_transformer
        self.group_processor = group_processor
        self.pd_results = data_transformer.pd_results
        self.mbpdb_results = data_transformer.mbpdb_results
        # Work on a private copy so the uploaded PD frame is never modified
        self.pd_results_cleaned = self.pd_results.copy() if self.pd_results is not None else None
        self._merged_df = None
        # Set up observer for data changes
        self.data_transformer.observe(self._handle_data_change, names=['pd_results', 'mbpdb_results'])

    def _handle_data_change(self, change):
        """Handle changes in the input data (observer callback).

        Args:
            change: Change notification with ``name`` (the changed attribute)
                and ``new`` (its new value).
        """
        if change.name == 'pd_results':
            self.pd_results = change.new
        elif change.name == 'mbpdb_results':
            self.mbpdb_results = change.new

        # Restart from a fresh copy of the (possibly new) PD frame
        self.pd_results_cleaned = self.pd_results.copy() if self.pd_results is not None else None

        # Clear the output of the context this callback runs in
        clear_output()

    @property
    def merged_df(self):
        """Property to access the merged DataFrame (None until processing succeeded)."""
        return self._merged_df

    def extract_bioactive_peptides(self):
        """
        Extracts the list of bioactive peptide matches from the imported MBPDB search.

        Returns:
            tuple: ``(mbpdb_results_cleaned, mbpdb_results_grouped)`` where the
            first frame is the MBPDB table without rows whose 'Protein ID' is
            missing or the string 'None', and the second holds one row per
            'Search peptide' with the distinct 'Function' values joined by
            '; '. Returns ``(None, None)`` when no MBPDB results are available
            (``None`` or an empty frame).
        """
        # A missing upload (None) is treated like an empty one; previously None
        # raised AttributeError here and aborted the whole processing run.
        if self.mbpdb_results is None or self.mbpdb_results.empty:
            return None, None

        # Drop rows where 'Protein ID' is NaN or 'None'
        mbpdb_results_cleaned = self.mbpdb_results.dropna(subset=['Protein ID'])
        mbpdb_results_cleaned = mbpdb_results_cleaned[mbpdb_results_cleaned['Protein ID'] != 'None']

        # Columns reduced to their first value per search peptide; '% Alignment'
        # is optional and keeps its position after 'Protein description'.
        first_value_columns = ['Peptide', 'Protein ID', 'Protein description']
        if '% Alignment' in mbpdb_results_cleaned.columns:
            first_value_columns.append('% Alignment')
        first_value_columns += ['Species', 'Intervals']

        agg_dict = {column: 'first' for column in first_value_columns}
        agg_dict['Function'] = lambda x: list(x.dropna().unique())

        # Perform the groupby and aggregation
        mbpdb_results_grouped = mbpdb_results_cleaned.groupby('Search peptide').agg(agg_dict).reset_index()

        # Flatten the 'Function' list
        mbpdb_results_grouped['Function'] = mbpdb_results_grouped['Function'].apply(
            lambda x: '; '.join(x) if isinstance(x, list) else x
        )
        return mbpdb_results_cleaned, mbpdb_results_grouped

    def create_unique_id(self, row):
        """Creates a unique ID for each peptide row.

        Args:
            row: Mapping with 'Sequence' and 'Modifications' entries.

        Returns:
            str: ``<Sequence>_<Modifications>`` with the modifications stripped
            of surrounding whitespace, or just the sequence when 'Modifications'
            is missing; trailing underscores are removed.
        """
        if pd.notna(row['Modifications']):
            unique_id = row['Sequence'] + "_" + row['Modifications'].strip()
        else:
            unique_id = row['Sequence']
        # An empty modification string would otherwise leave a dangling '_'
        return unique_id.rstrip('_')

    def process_pd_results(self, mbpdb_results_grouped):
        """Processes the PD results and merges with MBPDB results.

        Args:
            mbpdb_results_grouped: Grouped MBPDB frame keyed by 'Search peptide',
                or None / empty when no MBPDB data is available.

        Returns:
            pandas.DataFrame: The cleaned PD frame, left-merged with the MBPDB
            frame on 'unique ID' == 'Search peptide' when MBPDB data is given;
            otherwise a copy of the cleaned PD frame with a NaN 'Function' column.
        """
        pd_results_cleaned = self.pd_results_cleaned

        # Keep only the first entry of ';'-separated positions and accessions
        for column in ('Positions in Proteins', 'Master Protein Accessions'):
            pd_results_cleaned[column] = pd_results_cleaned[column].str.split(';', expand=False).str[0]

        # Create sequence column if needed (second '.'-separated part of the annotated sequence)
        if 'Sequence' not in pd_results_cleaned.columns:
            pd_results_cleaned['Sequence'] = pd_results_cleaned['Annotated Sequence'].str.split('.', expand=False).str[1]

        # Create unique ID
        pd_results_cleaned['unique ID'] = pd_results_cleaned.apply(self.create_unique_id, axis=1)

        # Extract start and stop positions from the '[start-stop]' pattern
        try:
            extracted = pd_results_cleaned['Positions in Proteins'].str.extract(r'\[(\d+)-(\d+)\]')
            # Via float first so rows without a match (NaN) become <NA> in the nullable integer dtype
            pd_results_cleaned[['start', 'stop']] = extracted.astype(float).astype('Int64')
        except Exception as e:
            print(f"Error: {e}")

        # Reorder columns: identifying columns first, everything else in its existing order
        leading_columns = ['Master Protein Accessions', 'Positions in Proteins', 'start', 'stop']
        remaining_columns = [col for col in pd_results_cleaned.columns if col not in leading_columns]
        pd_results_cleaned = pd_results_cleaned[leading_columns + remaining_columns]

        # Merge with MBPDB results if available
        if mbpdb_results_grouped is not None and not mbpdb_results_grouped.empty:
            merged_df = pd.merge(pd_results_cleaned, mbpdb_results_grouped,
                                 right_on='Search peptide', left_on='unique ID', how='left')
            display(HTML("<b style='color:green;'>The MBPDB was successfully merged with the peptidomic data matching the Search Peptide and Unique ID columns.</b>"))
        else:
            merged_df = pd_results_cleaned.copy()
            merged_df['Function'] = np.nan
            display(HTML("<b style='color:orange;'>No MBPDB was uploaded.</b>"))
            display(HTML("<b style='color:orange;'>The merged Dataframe contains only peptidomic data.</b>"))

        return merged_df

    def calculate_group_abundance_averages(self, df, group_data):
        """Calculates group abundance averages.

        Args:
            df: Frame containing the abundance columns named in ``group_data``.
                Those columns are converted to numeric in place.
            group_data: Mapping of group key to a dict with 'grouping_variable'
                and 'abundance_columns'.

        Returns:
            pandas.DataFrame: ``df`` unchanged when every
            ``Average_Abundance_<grouping_variable>`` column already exists,
            otherwise ``df`` with one such column per group (row-wise mean,
            NaN values skipped).
        """
        # Check if all average abundance columns already exist
        expected_columns = [
            f"Average_Abundance_{details['grouping_variable']}" for details in group_data.values()
        ]
        if all(column in df.columns for column in expected_columns):
            display(HTML('<b style="color:orange;">All average abundance columns already exist. Returning original DataFrame.</b>'))
            return df

        # If not all columns exist, proceed with calculations
        new_columns = {}
        for details in group_data.values():
            grouping_variable = details['grouping_variable']
            abundance_columns = details['abundance_columns']

            # Convert abundance columns to numeric (unparseable values become NaN)
            for col in abundance_columns:
                df[col] = pd.to_numeric(df[col], errors='coerce')

            # Calculate averages
            average_column_name = f"Average_Abundance_{grouping_variable}"
            new_columns[average_column_name] = df[abundance_columns].mean(axis=1, skipna=True)

        # Add new columns to DataFrame. Averages that already existed are
        # replaced rather than appended, so no column name occurs twice.
        averages = pd.DataFrame(new_columns)
        stale_columns = [column for column in averages.columns if column in df.columns]
        df = pd.concat([df.drop(columns=stale_columns), averages], axis=1)
        if not df.empty:
            display(HTML('<b style="color:green;">Group average abundance columns have been successfully added to the DataFrame.</b>'))
        return df

    def process_data(self, group_data):
        """Main method to process all data.

        Args:
            group_data: Group definitions passed to
                ``calculate_group_abundance_averages``; when falsy the
                averaging step is skipped.

        Returns:
            pandas.DataFrame or None: The final merged frame (also stored for
            ``merged_df``), or None when no PD results are loaded or any step
            raised; the error is shown in the notebook output instead.
        """
        pd_results = getattr(self, 'pd_results', None)
        if pd_results is None or pd_results.empty:
            display(HTML("<b style='color:red;'>No PD results data available for processing.</b>"))
            return None

        try:
            # Extract and process bioactive peptides
            _, mbpdb_results_grouped = self.extract_bioactive_peptides()

            if getattr(self, 'pd_results_cleaned', None) is None:
                self.pd_results_cleaned = self.pd_results.copy()

            # Process PD results and merge with MBPDB
            merged_df_temp = self.process_pd_results(mbpdb_results_grouped)

            # Calculate abundance averages if group_data exists
            if group_data:
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore", UserWarning)
                    final_df = self.calculate_group_abundance_averages(merged_df_temp, group_data)
            else:
                final_df = merged_df_temp
                display(HTML("<b style='color:orange;'>No group data provided. Skipping abundance calculations.</b>"))

            # Store the final DataFrame
            self._merged_df = final_df
            return final_df
        except Exception as e:
            # Report the failure in the notebook instead of breaking the widget callback
            display(HTML(f"<b style='color:red;'>Error processing data: {str(e)}</b>"))
            return None

    def display_interactive_results(self, df):
        """Show ``df`` in a read-only DataGrid, or print a notice when it is None."""
        if df is not None:
            # Create DataGrid
            grid = DataGrid(df, selection_mode='cell', editable=False)
            grid.auto_fit_columns = True
            grid.base_row_size = 25
            grid.base_column_size = 150
            grid.auto_fit_params = {'area': 'column', 'padding': 10}

            # Display the grid
            display(grid)
        else:
            print("No data to display")

    def update_data(self, pd_results, mbpdb_results):
        """Update the input data and refresh the display.

        Args:
            pd_results: New PD results frame, or None.
            mbpdb_results: New MBPDB results frame, or None.
        """
        self.pd_results = pd_results
        self.mbpdb_results = mbpdb_results
        self.pd_results_cleaned = pd_results.copy() if pd_results is not None else None

        # Clear previous outputs and rerun interactive display
        clear_output()

        # Display button and output
        # NOTE(review): `confirm_button` and `output` are not defined in this class;
        # they must exist as notebook-level globals or this raises NameError — confirm
        # or remove these two calls.
        display(confirm_button)
        display(output)


In [38]:

class ExportManager:
    """Class to manage all export operations with download link generation"""

    def __init__(self):
        # CSS injected next to every generated link so it renders as a green button
        self.output_style = """
            <style>
            .download-link {
                background-color: #4CAF50;
                border: none;
                color: white;
                padding: 10px 20px;
                text-align: center;
                text-decoration: none;
                display: inline-block;
                font-size: 14px;
                margin: 4px 2px;
                cursor: pointer;
                border-radius: 4px;
            }
            </style>
        """

    @staticmethod
    def _escape_html(text):
        """Escape ``text`` for use in HTML bodies and double-quoted attributes."""
        # Function-scope import: `html` is not among the notebook-level imports
        from html import escape
        return escape(str(text), quote=True)

    def _generate_download_link(self, content, filename, filetype='text/csv'):
        """Generate a download link for any content.

        Args:
            content: A DataFrame (serialised as CSV without the index), a dict
                (serialised as indented JSON), a str (encoded to bytes) or bytes.
            filename: File name offered to the browser; HTML-escaped so that
                quotes or angle brackets cannot break the generated markup.
            filetype: MIME type used in the data URI.

        Returns:
            str: HTML snippet with the style block and an ``<a>`` element whose
            ``href`` is a base64 data URI of the content.
        """
        if isinstance(content, pd.DataFrame):
            content = content.to_csv(index=False)
        elif isinstance(content, dict):
            content = json.dumps(content, indent=4)

        if isinstance(content, str):
            content = content.encode()

        b64 = base64.b64encode(content).decode()
        safe_name = self._escape_html(filename)
        safe_type = self._escape_html(filetype)
        return f"""
            {self.output_style}
            <a download="{safe_name}" href="data:{safe_type};base64,{b64}" class="download-link">
                Download {safe_name}
            </a>
        """

    def _display_named_export(self, heading_html, prompt_html, extension, result_title, build_link):
        """Show a file-name box and a button that renders a download link on click.

        Args:
            heading_html: HTML shown as the section heading.
            prompt_html: HTML shown above the file-name box.
            extension: Lower-case extension (with dot) appended when missing.
            result_title: Title shown above the generated link.
            build_link: Callable taking the final file name and returning the
                link HTML; called on every click so it sees current data.
        """
        display(HTML(heading_html))
        display(HTML(prompt_html))

        name_widget = widgets.Text(
            value='',
            placeholder='Enter file name',
            description='File name:',
            disabled=False
        )
        save_button = widgets.Button(
            description='Generate Download',
            button_style='success'
        )
        output = widgets.Output()

        def on_save_clicked(b):
            with output:
                output.clear_output()
                # Whitespace-only input counts as "no name"
                filename = name_widget.value.strip()
                if not filename:
                    display(HTML("<b style='color:red;'>Please enter a file name</b>"))
                    return

                # Case-insensitive so 'data.CSV' is not turned into 'data.CSV.csv'
                if not filename.lower().endswith(extension):
                    filename += extension

                download_link = build_link(filename)
                display(HTML(f"<h4>{result_title}</h4><hr style='border:1px solid grey;'>{download_link}"))

        save_button.on_click(on_save_clicked)
        display(name_widget, save_button, output)

    def export_group_data(self, group_data):
        """Export group data as JSON with download link.

        Args:
            group_data: Dict of group definitions; serialised when the button
                is clicked, so later changes to the same dict are included.
        """
        self._display_named_export(
            "<h3><u>Export Group Data</u></h3>",
            "Enter a name for your group data JSON file:",
            '.json',
            "Group Data Export:",
            lambda filename: self._generate_download_link(group_data, filename, 'application/json'),
        )

    def export_dataframe(self, df):
        """Export DataFrame as CSV with download link.

        Args:
            df: DataFrame written as CSV (without its index) on click.
        """
        self._display_named_export(
            "<h3><u>Export Full Dataset</u></h3>",
            "Enter a name for your CSV file:",
            '.csv',
            "DataFrame Export:",
            lambda filename: self._generate_download_link(df, filename),
        )

    def setup_volcano_plot_export(self, merged_df, group_data):
        """Setup and handle volcano plot data export.

        Shows a button that, on click, builds one CSV download link per group:
        the group's abundance columns pivoted to one row per sample and one
        column per 'unique ID'.

        Args:
            merged_df: Frame with a 'unique ID' column and the abundance columns.
            group_data: Dict of group key to a dict with 'grouping_variable'
                and 'abundance_columns'.
        """
        display(HTML("<h3><u>Volcano Plot Exporting</u></h3>"))

        save_button = widgets.Button(
            description='Generate Downloads',
            button_style='success'
        )
        output = widgets.Output()

        def create_pivoted_df(df, abundance_columns):
            # Long format first: one row per (unique ID, sample) pair
            melted_df = df.melt(
                id_vars=['unique ID'],
                value_vars=abundance_columns,
                var_name='Sample',
                value_name='Abundance'
            )
            # Back to wide with samples as rows; repeated unique IDs are
            # combined by pivot_table's default aggregation
            return melted_df.pivot_table(
                index='Sample',
                columns='unique ID',
                values='Abundance'
            )

        def on_save_clicked(b):
            with output:
                output.clear_output()
                download_links = []

                for group_info in group_data.values():
                    pivoted_df = create_pivoted_df(
                        merged_df,
                        group_info['abundance_columns']
                    )

                    if not pivoted_df.empty:
                        filename = f"{group_info['grouping_variable']}___all-peptides___volcano_plot.csv"
                        # NOTE(review): the link is built with to_csv(index=False), so the
                        # 'Sample' index (the sample names) is not written to the CSV —
                        # confirm that the consumer of this file expects that.
                        download_link = self._generate_download_link(pivoted_df, filename)
                        download_links.append((filename, download_link))

                if download_links:
                    output_html = "<h4>Volcano Plot Exports:</h4><hr style='border:1px solid grey;'>"
                    for filename, link in download_links:
                        output_html += f"<p><b>{self._escape_html(filename)}</b>:<br>{link}</p>"
                    display(HTML(output_html))
                else:
                    display(HTML("<b style='color:red;'>No data available for export</b>"))

        save_button.on_click(on_save_clicked)
        display(save_button, output)

In [37]:
class DataProcessingController:
    """Button-driven front end that runs the processing and shows export options.

    Relies on the notebook-level globals ``data_transformer`` and
    ``group_processor``, which are looked up when the button is clicked.
    """

    def __init__(self):
        """Create the button and output areas; nothing is processed yet."""
        self.export_manager = ExportManager()
        self.combiner = None
        self.merged_df = None

        # Create processing button
        self.process_button = widgets.Button(
            description='Process Data',
            button_style='success',
            tooltip='Click to start data processing'
        )
        self.process_output = widgets.Output()
        self.export_output = widgets.Output()

        # Set up button callback
        self.process_button.on_click(self._on_process_clicked)

    def _detach_previous_combiner(self):
        """Unregister the previous combiner's observer before it is replaced.

        Every CombineAverageDataframes registers ``_handle_data_change`` on its
        data transformer; without this, each click would leave one more stale
        observer behind.
        """
        if self.combiner is None:
            return
        try:
            self.combiner.data_transformer.unobserve(
                self.combiner._handle_data_change,
                names=['pd_results', 'mbpdb_results']
            )
        except ValueError:
            # Handler was already removed; nothing left to detach
            pass
        self.combiner = None

    def _on_process_clicked(self, b):
        """Run the processing pipeline and show the result grid and export options.

        Args:
            b: The clicked button (unused).
        """
        # Export widgets from an earlier run are bound to the old result;
        # remove them so a failed re-run does not leave them on screen.
        self.export_output.clear_output()

        with self.process_output:
            clear_output()
            print("Processing data...")

            # Create the combiner (replacing, and detaching, any previous one)
            self._detach_previous_combiner()
            self.combiner = CombineAverageDataframes(data_transformer, group_processor)

            # Process the data
            self.merged_df = self.combiner.process_data(group_processor.group_data)

            if self.merged_df is not None:
                print("\nData processing completed successfully!")
                print(f"Final results shape: {self.merged_df.shape}")

                # Display results
                grid = DataGrid(self.merged_df)
                grid.auto_fit_columns = True
                display(grid)

                # Show export options
                self._show_export_options()
            else:
                print("Error: No data was processed")

    def _show_export_options(self):
        """Render the export widgets into the export output area."""
        with self.export_output:
            clear_output()
            display(HTML("<h2>Export:</h2>"))

            # Export group data
            self.export_manager.export_group_data(group_processor.group_data)

            # Only show DataFrame and volcano plot exports if data is processed
            if self.merged_df is not None:
                self.export_manager.export_dataframe(self.merged_df)
                self.export_manager.setup_volcano_plot_export(
                    self.merged_df,
                    group_processor.group_data
                )

    def display(self):
        """Display the complete interface"""
        display(self.process_button)
        display(self.process_output)
        display(self.export_output)

# Initialize the controller (builds the button and output areas; processing
# only starts when the 'Process Data' button is clicked)
controller = DataProcessingController()

# Display the interface: the button followed by the processing and export output areas
controller.display()

Button(button_style='success', description='Process Data', style=ButtonStyle(), tooltip='Click to start data p…

Output()

Output()

In [39]:
# Disabled legacy export cell: the triple-quoted string below is a no-op expression
# (the trailing ';' suppresses its echo in the notebook output).
# NOTE(review): it refers to a module-level `combiner`, which the cells shown here never
# define (DataProcessingController keeps its own `self.combiner`) — confirm before
# re-enabling, or delete this cell, since DataProcessingController._show_export_options
# already offers the same three exports.
"""
# Create an instance of the ExportManager
export_manager = ExportManager()

# Display the export section header
display(HTML(f"<h2>Export:</h2>"))

# Export group data with download link
export_manager.export_group_data(group_processor.group_data)

# Export full dataset with download link
export_manager.export_dataframe(combiner.merged_df)

# Setup and handle volcano plot exports with download links
export_manager.setup_volcano_plot_export(combiner.merged_df, group_processor.group_data)
""";