# core

> Processors that turn the output tables of proteomics search tools (Spectronaut, DIA-NN) into protein quantification DataFrames

In [None]:
#| default_exp core

In [None]:
#| hide
from nbdev.showdoc import *
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [None]:
#| export
class DatasetAnalysis:
    """Base class for the dataset processors.

    Meant to hold the functions shared by the analyses
    of proteomics data; it defines no behaviour of its own yet.
    """

In [None]:
#| export
import pandas as pd  # imported in this exported cell: the notebook's other pandas import sits in a hidden, non-exported cell


class SpectronautProcessor(DatasetAnalysis):
    """Turn a Spectronaut report into a table of protein quantities.

    The report is a tab-separated file with one ``...PG.Quantity`` column
    per run, each paired with a ``...PG.IsSingleHit`` flag column.
    Quantities flagged as single hits (or as ``Filtered``) are replaced
    by NaN, and the remaining columns are renamed to ``condition.replica``
    using a mapping table.

    Parameters
    ----------
    file_name : path of the Spectronaut report (tab-separated).
    column_mapping : path of the tab-separated mapping table, see
        `parse_column_mapping`. An empty value disables the renaming.
    """

    # Flag values that mark a quantity as unusable. Spectronaut flags can
    # arrive as real booleans or as strings, depending on how pandas parsed
    # the column, so both spellings are listed.
    _SINGLE_HIT_VALUES = [True, 'True', 'TRUE', 'Filtered']

    def __init__(self, file_name='', column_mapping=''):
        self.filename = file_name
        # An empty path means "no renaming" rather than a failed read of ''.
        self.column_mapping = (
            self.parse_column_mapping(column_mapping) if column_mapping else {}
        )

    def filter_protein_quantification(self, df):
        """Return the quantity columns of *df* with flagged values set to NaN.

        Every column whose name contains ``PG.Quantity`` is kept. A value is
        masked when the matching ``PG.IsSingleHit`` column holds one of
        `_SINGLE_HIT_VALUES`; any other flag (``False``, ``'False'``,
        ``'FALSE'``, a missing value, ...) leaves the quantity untouched.

        Raises
        ------
        KeyError
            If a quantity column has no matching ``PG.IsSingleHit`` column.
        """
        quant_cols = [col for col in df.columns if 'PG.Quantity' in col]
        flag_cols = [
            col.replace('PG.Quantity', 'PG.IsSingleHit') for col in quant_cols
        ]

        # Build an explicit boolean mask. The previous ``replace`` on the
        # string flags only worked because pandas silently downcast the
        # object columns to bool, a behaviour that is deprecated.
        single_hit = df[flag_cols].isin(self._SINGLE_HIT_VALUES)

        # Relabel positionally so the mask aligns with the quantity columns.
        single_hit.columns = quant_cols

        return df[quant_cols].mask(single_hit)

    def parse_column_mapping(self, mapping_file):
        """Build the ``{original column: 'condition.replica'}`` dictionary.

        *mapping_file* is a tab-separated table with exactly three columns
        named ``col_name``, ``condition`` and ``replica``. ``col_name`` may
        list either the quantity or the ``IsSingleHit`` column of a run;
        both are mapped to the quantity column name.
        """
        mapping_df = pd.read_csv(mapping_file, sep='\t')

        assert mapping_df.shape[1] == 3, (
            "the column mapping must have exactly 3 columns "
            "(col_name, condition, replica), "
            f"found {mapping_df.shape[1]}"
        )

        # regex=False: the dots are literal. Read as a regular expression
        # (the default before pandas 2.0) '.' would match any character.
        original_names = mapping_df['col_name'].str.replace(
            '.IsSingleHit', '.Quantity', regex=False
        )

        # Cast both parts so purely numeric condition labels also work.
        new_names = (
            mapping_df['condition'].astype(str)
            + '.'
            + mapping_df['replica'].astype(str)
        )

        return dict(zip(original_names, new_names))

    def process(self):
        """Read the report, mask flagged quantities and rename the columns.

        Returns the DataFrame used for the quantification analysis.
        """
        df = pd.read_csv(self.filename, sep="\t")
        filtered_quantification = self.filter_protein_quantification(df)
        return filtered_quantification.rename(columns=self.column_mapping)


In [None]:
# Build a processor from the Spectronaut report and from the
# tab-separated table that says how its columns are renamed
processor = SpectronautProcessor(
    file_name="../toy_datasets/spectronaut_output.tsv",
    column_mapping="../toy_datasets/spectronaut_column_mapping.tsv",
)

# Run the processing and preview the first rows of the result
filtered_quantification = processor.process()
filtered_quantification.head()

Unnamed: 0,CELL34.1,CELL34.2,CELL34.3,CELL37.1,CELL37.2,CELL37.3,CELL40.1,CELL40.2,CELL40.3,SN34.1,SN34.2,SN34.3,SN37.1,SN37.2,SN37.3,SN40.1,SN40.2,SN40.3
0,100761.148438,102093.578125,111970.757812,99120.09,98929.71,104091.296875,92301.05,85210.16,93039.21,105977.328125,175823.1875,172380.53125,117730.5546875,119413.4453125,99135.140625,129270.6875,86490.8203125,
1,527764.3125,500388.4375,550439.875,532226.5,546521.3,526252.875,518442.2,549384.5,540686.5,1803379.5,2018498.25,2165891.0,1756412.75,1795155.625,1982255.25,1549031.375,1549369.25,1828279.0
2,944657.1875,960022.125,869661.0,1005103.0,1223561.0,260486.875,461302.8,325681.3,258715.0,,,230601.203125,1692870.25,1343.4593505859375,,,,
3,447414.59375,458718.5625,461863.6875,514026.5,490502.5,490315.96875,543306.5,519208.1,527695.4,38077.0546875,70483.5,116990.828125,70142.5546875,82265.25,81564.125,112112.515625,130479.8203125,93836.3203125
4,860323.9375,824793.8125,827789.5625,1057513.0,974005.4,848213.1875,1076790.0,1079964.0,1100252.0,3568417.75,266359.4375,496068.875,1402994.25,904210.75,589144.375,1438730.875,239517.953125,1274326.0


In [None]:
# Dimensions of the processed table, as (rows, columns)
filtered_quantification.shape

(49, 18)

In [None]:
#| export
import pandas as pd  # imported in this exported cell: the notebook's other pandas import sits in a hidden, non-exported cell


class DIAnnProcessor(DatasetAnalysis):
    """Processor for DIA-NN output (work in progress).

    Parameters
    ----------
    file_name : path of the tab-separated DIA-NN table read by `process`.
    peptides_name : stored as ``self.peptide``; not used by this class yet.
    column_mapping : path of the tab-separated mapping table, see
        `parse_column_mapping`. An empty value disables the renaming.
    """

    def __init__(self, file_name='', peptides_name='', column_mapping=''):
        self.filename = file_name
        # An empty path means "no renaming" rather than a failed read of ''.
        self.column_mapping = (
            self.parse_column_mapping(column_mapping) if column_mapping else {}
        )
        self.peptide = peptides_name

    def parse_column_mapping(self, mapping_file):
        """Build the ``{original column: 'condition.replica'}`` dictionary.

        *mapping_file* is a tab-separated table with the columns
        ``col_name``, ``condition`` and ``replica``.

        Raises
        ------
        ValueError
            If one of the three columns is missing.
        """
        mapping_df = pd.read_csv(mapping_file, sep='\t')

        required = ('col_name', 'condition', 'replica')
        missing = [name for name in required if name not in mapping_df.columns]
        if missing:
            raise ValueError(
                f"column mapping is missing the column(s): {', '.join(missing)}"
            )

        new_names = (
            mapping_df['condition'].astype(str)
            + '.'
            + mapping_df['replica'].astype(str)
        )
        return dict(zip(mapping_df['col_name'], new_names))

    def filter_protein_quantification(self, df):
        """Select and filter the quantification columns of *df*.

        Not implemented: neither this class nor its base class defined this
        method, so `process` could never run. The stub makes that failure
        explicit until the DIA-NN filtering rules are written.
        NOTE(review): drop this stub if a shared implementation is added to
        the base class, otherwise it will shadow it.
        """
        raise NotImplementedError(
            "DIAnnProcessor.filter_protein_quantification is not implemented: "
            "the filtering rules for DIA-NN output still have to be defined"
        )

    def process(self):
        """Read the DIA-NN table, filter it and rename the columns.

        Returns the DataFrame used for the quantification analysis.
        """
        df = pd.read_csv(self.filename, sep="\t")
        filtered_quantification = self.filter_protein_quantification(df)
        return filtered_quantification.rename(columns=self.column_mapping)


In [None]:
#filtered_quantification.tail(50)

In [None]:
#| hide
import nbdev

# Write the cells marked for export to the package modules
nbdev.nbdev_export()