# core

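> Helpers to load Spectronaut and DIA-NN protein quantification tables, rename their columns to `condition.replica`, and visualise missing values and intensity distributions.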

In [None]:
#| default_exp core

In [None]:
#| hide
from nbdev.showdoc import *
import pandas as pd
import numpy as np
import os
import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
#| export
class DatasetViz():
    """Plot missing-value patterns and value distributions of a data frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to visualise. The default palette is derived from
        ``df.shape[1]``, so ``df`` must be a data frame whenever
        ``palette`` is omitted.
    palette : sequence of colors, optional
        Colors handed to the KDE and box plots, one per column of ``df``.
        When omitted (or empty) every column is drawn in red (``'r'``).
    """
    def __init__(self, df='', palette = False):
        self.df = df
        try:
            use_default = not palette
        except ValueError:
            # numpy arrays and pandas Series refuse plain truth-testing;
            # fall back to an explicit emptiness check
            use_default = len(palette) == 0
        if use_default:
            self.palette = ['r']*df.shape[1]
        else:
            self.palette = palette
        
    def analyse_missing_values(self, figsize=(8, 4)):
        """Show the missingno bar, matrix and dendrogram plots of ``self.df``.

        ``figsize`` is passed to the bar and matrix plots; the dendrogram
        is drawn with missingno's own default size.
        """
        
        #visualization of missing data
        with plt.style.context('ggplot'):

            ax=msno.bar(self.df, figsize=figsize)
            # Set global font size to 8
            plt.rcParams['font.size'] = 8
            plt.rcParams['axes.titlesize'] = 8
            plt.rcParams['axes.labelsize'] = 8
            plt.rcParams['xtick.labelsize'] = 8
            plt.rcParams['ytick.labelsize'] = 8
            plt.rcParams['legend.fontsize'] = 8
            plt.rcParams['figure.titlesize'] = 8            
            
            plt.title('Missing Data Analysis', size=12)
            ax.set_ylabel('Fraction of data points',size=12)
            #plt.savefig(os.path.join(OUT_FOLDER,'1_missing_value_bar.png'))
            plt.tight_layout()
            plt.show()
            
            ax=msno.matrix(self.df, figsize=figsize)
            plt.tight_layout()
            plt.show()  
            
            
            ax=msno.dendrogram(self.df)
            plt.tight_layout()
            plt.show()
            
            
    def analyse_values_distribution(self,  figsize=(8, 4), do_log = True):
        """Show a KDE plot and a box plot of the values in every column.

        Parameters
        ----------
        figsize : tuple
            Size used for both figures.
        do_log : bool
            When True the values are log10-transformed before plotting.
        """
        fig,ax=plt.subplots(figsize=figsize)
        df = self.df
        
        if do_log:
            df = np.log10(df)
            # log10(0) is -inf, which cannot be used for density
            # estimation; treat such points as missing instead
            df = df.replace([np.inf, -np.inf], np.nan)
        
        df.plot(kind='kde', color=self.palette, alpha=0.5, ax=ax)   
        plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
        plt.title('Value Distribution')
        plt.xlabel('Intensity')
        plt.show()
        
        
        fig,ax=plt.subplots(figsize=figsize)
        # outliers are hidden (showfliers=False) in the box plot
        sns.boxplot(data = df, showfliers=False, palette=self.palette,ax=ax)
        plt.title('Value Distribution')
        plt.xlabel('Sample')
        plt.ylabel('Intensity')
        plt.xticks(rotation=45,ha='right')
        plt.show()
        
        

In [None]:
#| export
def norm_loading(df):
    """Rescale every column so that its median matches a common target.

    The target is the mean of the per-column medians; each column is
    multiplied by ``target / column_median``. The medians, the target and
    the scaling factors are printed along the way.

    Returns the rescaled data frame; ``df`` itself is not modified.
    """
    column_medians = df.median(axis=0)
    print(column_medians)

    common_level = np.mean(column_medians)
    print(common_level)

    scaling = common_level / column_medians
    print(scaling)

    # one factor per column, aligned on the column labels
    return df.multiply(scaling, axis='columns')

In [None]:
#| export
class DatasetAnalysis():
    """Class to store common functions
    for the analysis of proteomics data"""
    
    def parse_column_mapping(self, mapping_file):
        """Build the dictionary used to rename quantification columns.

        The mapping file is a table with the columns ``col_name``,
        ``condition`` and ``replica``. A path ending in ``.csv`` is read
        as comma-separated, anything else as tab-separated.

        Parameters
        ----------
        mapping_file : str
            Path of the mapping table.

        Returns
        -------
        dict
            Maps each ``col_name`` (with ``.IsSingleHit`` swapped for
            ``.Quantity``) to the new name ``condition.replica``.

        Raises
        ------
        AssertionError
            If the table has fewer than 3 columns.
        """
        # Read the mapping file into a DataFrame
        if mapping_file.endswith('.csv'):
            mapping_df = pd.read_csv(mapping_file)
        else:    
            mapping_df = pd.read_csv(mapping_file, sep='\t')
        # Assert that the DataFrame has at least 3 columns
        assert mapping_df.shape[1] >= 3, \
            'mapping file needs at least 3 columns (col_name, condition, replica)'

        # Replace '.IsSingleHit' with '.Quantity' in the 'col_name' column.
        # regex=False: the pattern is a literal suffix; as a regular
        # expression its leading '.' would match any character.
        mapping_df['col_name'] = mapping_df['col_name'].str.replace(
            '.IsSingleHit', '.Quantity', regex=False)

        # Create a new column 'new_col' with the format 'condition.replica'
        mapping_df['new_col'] = mapping_df['condition'] + '.' + mapping_df['replica'].astype(str)

        # Create a dictionary mapping the original column names to the new column names
        mapping_dict = dict(zip(mapping_df['col_name'], mapping_df['new_col']))

        return mapping_dict 

    def replace_zeros(self, df):
        """Return a copy of ``df`` where zeros (``0`` and the string
        ``'0'``) are replaced by NaN."""
        df = df.replace('0',0)
        df = df.replace(0,np.nan)
        return df  
   
    

In [None]:
#| export
class SpectronautProcessor(DatasetAnalysis):
    """Class to make a Spectronaut output
    ready for quantification.

    Parameters
    ----------
    file_name : str
        Path of the tab-separated Spectronaut report.
    column_mapping : str
        Path of the mapping table; it is parsed immediately with
        ``parse_column_mapping`` and stored as a rename dictionary.
    """
    def __init__(self, file_name='', column_mapping=''):
        self.filename = file_name
        self.column_mapping = self.parse_column_mapping(column_mapping)
        
    #prepare the dataset for analysis    
    def filter_protein_quantification(self, df):
        """Return the ``PG.Quantity`` columns with unreliable values blanked.

        For every column whose name contains ``PG.Quantity`` the matching
        ``PG.IsSingleHit`` column is used as a mask: where it is true (or
        the string ``'Filtered'``) the quantity is set to NaN. Zeros are
        replaced by NaN as well.
        """
        quant_cols = [n for n in df.columns if 'PG.Quantity' in n]

        # Create a mask DataFrame based on IsSingleHit columns
        mask = df[[n.replace('PG.Quantity', 'PG.IsSingleHit') for n in quant_cols]]
        mask.columns = quant_cols

        # Replace strings with their corresponding boolean values
        replacements = {'Filtered': True, 'False': False, 
                        'True': True, 'FALSE': False, 'TRUE': True}
        # infer_objects() turns object columns that now hold only booleans
        # into real bool columns; DataFrame.replace is deprecating the
        # implicit downcast this code used to rely on, and the mask must
        # be boolean to be inverted and applied below
        mask = mask.replace(replacements).infer_objects()

        # Get the data DataFrame with only the quantification columns
        selection = df[quant_cols]

        # Apply the mask to the data DataFrame
        filtered_selection = selection.mask(mask)
        filtered_selection = self.replace_zeros(filtered_selection)

        return filtered_selection
    

    #this function apply the logic of getting the dataframe
    #for quantification analysis
    def process(self):
        """Read the report, filter the quantities and rename the columns
        with the mapping dictionary; returns the resulting data frame."""
        df = pd.read_csv(self.filename, sep="\t")
        filtered_quantification = self.filter_protein_quantification(df)
        filtered_quantification = filtered_quantification.rename(self.column_mapping,axis=1)
        return filtered_quantification


In [None]:
# Build a processor from the Spectronaut report and the table that
# maps the raw column names to condition/replica names
processor = SpectronautProcessor(
    file_name="../toy_datasets/spectronaut_output.tsv",
    column_mapping="../toy_datasets/spectronaut_column_mapping.tsv",
)

# Run the pipeline and preview the filtered, renamed quantification table
filtered_quantification = processor.process()
filtered_quantification.head()

Unnamed: 0,CELL34.1,CELL34.2,CELL34.3,CELL37.1,CELL37.2,CELL37.3,CELL40.1,CELL40.2,CELL40.3,SN34.1,SN34.2,SN34.3,SN37.1,SN37.2,SN37.3,SN40.1,SN40.2,SN40.3
0,100761.148438,102093.578125,111970.757812,99120.09,98929.71,104091.296875,92301.05,85210.16,93039.21,105977.328125,175823.1875,172380.53125,117730.5546875,119413.4453125,99135.140625,129270.6875,86490.8203125,
1,527764.3125,500388.4375,550439.875,532226.5,546521.3,526252.875,518442.2,549384.5,540686.5,1803379.5,2018498.25,2165891.0,1756412.75,1795155.625,1982255.25,1549031.375,1549369.25,1828279.0
2,944657.1875,960022.125,869661.0,1005103.0,1223561.0,260486.875,461302.8,325681.3,258715.0,,,230601.203125,1692870.25,1343.4593505859375,,,,
3,447414.59375,458718.5625,461863.6875,514026.5,490502.5,490315.96875,543306.5,519208.1,527695.4,38077.0546875,70483.5,116990.828125,70142.5546875,82265.25,81564.125,112112.515625,130479.8203125,93836.3203125
4,860323.9375,824793.8125,827789.5625,1057513.0,974005.4,848213.1875,1076790.0,1079964.0,1100252.0,3568417.75,266359.4375,496068.875,1402994.25,904210.75,589144.375,1438730.875,239517.953125,1274326.0


In [None]:
filtered_quantification.shape

(49, 18)

In [None]:
#| export
class DIAnnProcessor(DatasetAnalysis):
    """Prepare a DIA-NN output table for quantification.

    Parameters
    ----------
    file_name : str
        Path of the tab-separated DIA-NN output.
    peptides_count : str
        Path of a tab-separated table with the columns ``Var1`` and
        ``Freq``; rows with ``Freq`` of at least 2 are kept.
    column_mapping : str
        Path of the mapping table, parsed immediately with
        ``parse_column_mapping``.
    """
    def __init__(self, file_name='', peptides_count='', column_mapping=''):
        self.filename = file_name
        self.column_mapping = self.parse_column_mapping(column_mapping)
        self.peptides_count = peptides_count

    def filter_protein_quantification(self, df):
        """Keep the rows of ``df`` whose label has ``Freq >= 2`` in the
        peptide-count table, then replace zeros by NaN."""
        # 'Var1' / 'Freq' are the names written by the R script that
        # creates the peptide count file
        counts = pd.read_csv(self.peptides_count, sep='\t').set_index('Var1')
        enough_peptides = counts.loc[counts['Freq'] >= 2]
        kept_rows = df.loc[enough_peptides.index.values]
        return self.replace_zeros(kept_rows)

    def fix_col_names(self, df):
        '''
        Shorten the column names of a DIA-NN table.

        Each name is cut down to the part after the last backslash and
        every occurrence of '.dia' is removed from it. The columns of
        `df` are renamed in place and `df` is returned. Ordering the
        columns is left to `process`.
        '''
        df.columns = [
            name.split('\\')[-1].replace('.dia', '')
            for name in df.columns
        ]
        return df

    def process(self):
        """Read, filter, clean and rename the table, then return its
        columns in the order of the mapping dictionary's values."""
        raw_table = pd.read_csv(self.filename, sep="\t")
        quantification = self.fix_col_names(
            self.filter_protein_quantification(raw_table)
        )
        quantification = quantification.rename(self.column_mapping, axis=1)
        return quantification[self.column_mapping.values()]


In [None]:
# Build a processor from the DIA-NN output file, the per-protein
# peptide counts and the table used to rename the columns
processor = DIAnnProcessor(
    file_name="../toy_datasets/DIA-NN_output.txt",
    peptides_count="../toy_datasets/DIA-NN_peptides_counts.txt",
    column_mapping="../toy_datasets/DIA-NN_column_mapping.csv",
)
# Process the file and get the filtered protein quantification DataFrame
#filtered_quantification = processor.process()
#filtered_quantification.head()

Unnamed: 0,WT_1.1,WT_1.2,WT_1.3,MUT3_1.1,MUT3_1.2,MUT3_1.3,MUT4_1.1,MUT4_1.2,MUT4_1.3,WTSOL_2.1,...,WTSOL_3.6,WTINS_3.4,WTINS_3.5,WTINS_3.6,MUT4SOL_3.1,MUT4SOL_3.2,MUT4SOL_3.3,MUT4INS_3.1,MUT4INS_3.2,MUT4INS_3.3
GFP.BLA,,,,9937347.0,10681540.0,9845878.0,9728128.0,10243330.0,9316970.0,,...,,,,,5944689.0,6888683.0,6956789.0,,,
Tb05.5K5.100:mRNA-p1;Tb927.5.4450:mRNA-p1,3090158.0,2574684.0,2751917.0,2333518.0,2596232.0,2407138.0,2508370.0,2562637.0,2565928.0,2804626.0,...,3112300.0,1033351.0,919455.1,1039817.0,2824497.0,2757964.0,2403540.0,1216043.0,1053073.0,1048340.0
Tb05.5K5.110:mRNA-p1;Tb927.5.4460:mRNA-p1,34131140.0,31778920.0,32899600.0,29899770.0,30039790.0,30271490.0,31465550.0,30740120.0,30751750.0,35122070.0,...,35746940.0,1975402.0,1821761.0,1814954.0,32790100.0,32082540.0,31902980.0,1931748.0,1978563.0,1991076.0
Tb05.5K5.120:mRNA-p1;Tb927.5.4470:mRNA-p1,4962111.0,5049931.0,4865019.0,4767877.0,4826146.0,4738838.0,4982286.0,4989706.0,4672616.0,616871.4,...,1333413.0,8680057.0,8580248.0,8776530.0,1135220.0,907409.1,999225.8,8560026.0,8473488.0,8810763.0
Tb05.5K5.130:mRNA-p1;Tb927.5.4480:mRNA-p1,37736860.0,33704110.0,28153740.0,23281510.0,28895540.0,30851690.0,31117060.0,31348720.0,26408440.0,4603374.0,...,8123416.0,80853950.0,83972100.0,82864660.0,6640407.0,7224879.0,7921410.0,85851870.0,82679730.0,85374140.0


In [None]:
#processor.analyse_missing_values(filtered_quantification,figsize=(16,4))

In [None]:
#dataset_viz = DatasetViz(df = filtered_quantification)
#dataset_viz.analyse_missing_values(figsize=(16,4))

In [None]:
#dataset_viz.analyse_values_distribution(figsize=(16,4))

In [None]:
#filtered_quantification.tail(50)

In [None]:
#| hide
# Run nbdev's export step for this project
import nbdev
nbdev.nbdev_export()