# Notebook for preparing protein-sorted abundance files from processed proteomic data files

---

After processing the raw proteomic data in the software of your choice (the Payne lab uses MaxQuant and Perseus; instructions are included in the README document), please:


*   Upload the processed text file and insert its path where indicated in the code below.
*   Upload the Excel file from UniProt containing all proteins in the proteome of interest, and insert its path where indicated in the code below.



Import Statements
--

In [None]:
# Mount Google Drive (Google Colab only) so that the input and output paths
# under /content/drive used by the cells below can be resolved.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import pandas as pd
import csv
import numpy as np
import scipy
import re
from scipy.stats import trim_mean

In [None]:
# Optional UniProt FASTA-style database prefix ('sp|' for Swiss-Prot or 'tr|'
# for TrEMBL), followed by the run of characters up to the next '|' or the
# end of the string. Compiled once because the function is applied row by row.
_ACCESSION_PATTERN = re.compile(r'(?:(?:sp|tr)\|)?([^|\r]+)')


def extract_string(input_string):
    """Return the leading protein identifier text from a protein-ID string.

    Carriage returns are removed first. An optional leading ``sp|`` or ``tr|``
    database prefix is skipped, and everything up to the next ``|`` (or the
    end of the string) is returned. Semicolon-separated protein groups such as
    ``'A2I7N0;Q3ZEJ6'`` contain no ``|`` and are therefore returned unchanged.

    Args:
        input_string: The raw identifier text (must be a ``str``).

    Returns:
        The extracted identifier, or ``None`` when the string contains no
        usable characters (for example an empty string).
    """
    cleaned = input_string.replace('\r', '')  # drop stray carriage returns
    match = _ACCESSION_PATTERN.search(cleaned)
    return match.group(1) if match else None


Load Processed Proteomic Data
--

Read in the processed proteomics .txt file

In [None]:
# Load the tab-separated Perseus export into the DataFrame `a`.
# NOTE(review): lineterminator='\n' leaves any '\r' from Windows line endings
# attached to the last column's header/values - presumably why column names
# are stripped and extract_string removes '\r' later on; confirm.
a=pd.read_csv('/content/drive/MyDrive/ProteinCoronaPredict_PayneLab/Input_Data/Proteomics/Perseus_Files/Bov_SP_06252024_Pers.txt', sep='\t', lineterminator='\n')

In [None]:
# Display the column names of the loaded table so the available
# quantification columns can be checked.
a.columns

Index(['Intensity 31_200_PS Carb_10', 'Intensity 32_200_PS Carb_100',
       'Intensity 33_small_Citrate_10', 'Intensity 34_large_Citrate_10',
       'Intensity 35_small_PEI_10', 'Intensity 36_large_PEI_10',
       'Intensity 37_large_PVP@AU_100', 'Intensity 38_small_PVP@AU_10',
       'Intensity 39_small_PVP@AU_100', 'Intensity 40_large_Citrate_100',
       'Intensity 41_large_PEI_100', 'Intensity 42_small_Citrate_100',
       'Intensity 43_small_PEI_100', 'Intensity 44_large_PVP@AU_10',
       'Intensity 45_large_PEG@Au_100', 'Intensity 46_large_PEI@Au_100',
       'Intensity 47_small_PEI@Au_100', 'Intensity 48_200_PS Carb@ PEG_100',
       'Intensity 49_200_PS Carb@ PEG_100', 'Intensity 50_FBS',
       'Intensity 51_SPQC1', 'Intensity 52_SPQC2', 'Intensity 53_SPQC3',
       'iBAQ 31_200_PS Carb_10', 'iBAQ 32_200_PS Carb_100',
       'iBAQ 33_small_Citrate_10', 'iBAQ 34_large_Citrate_10',
       'iBAQ 35_small_PEI_10', 'iBAQ 36_large_PEI_10',
       'iBAQ 37_large_PVP@AU_100', 'iBAQ 

The code cell below verifies that the column 'Majority protein IDs' is present in the input .txt file.

In [None]:
# Remove the extra whitespace from column names
a.columns = a.columns.str.strip()

# Replace NaN values with zero ### should this be done in Perseus first or is it okay here?
a.fillna(0, inplace=True)

# Filter columns and perform data concatenation
samples = a.filter(like='Intensity ').copy() #like= can be 'Intensity ', 'Top3 ', or 'iBAQ '
text_columns = ['Majority protein IDs']
text = a[text_columns].copy()
data = pd.concat([samples, text], axis=1)

# Apply extract_string function
data['Majority protein IDs'] = data['Majority protein IDs'].apply(extract_string)

# Print the resulting DataFrame
print(data.columns)
data.head(25)


Index(['Intensity 31_200_PS Carb_10', 'Intensity 32_200_PS Carb_100',
       'Intensity 33_small_Citrate_10', 'Intensity 34_large_Citrate_10',
       'Intensity 35_small_PEI_10', 'Intensity 36_large_PEI_10',
       'Intensity 37_large_PVP@AU_100', 'Intensity 38_small_PVP@AU_10',
       'Intensity 39_small_PVP@AU_100', 'Intensity 40_large_Citrate_100',
       'Intensity 41_large_PEI_100', 'Intensity 42_small_Citrate_100',
       'Intensity 43_small_PEI_100', 'Intensity 44_large_PVP@AU_10',
       'Intensity 45_large_PEG@Au_100', 'Intensity 46_large_PEI@Au_100',
       'Intensity 47_small_PEI@Au_100', 'Intensity 48_200_PS Carb@ PEG_100',
       'Intensity 49_200_PS Carb@ PEG_100', 'Intensity 50_FBS',
       'Intensity 51_SPQC1', 'Intensity 52_SPQC2', 'Intensity 53_SPQC3',
       'Majority protein IDs'],
      dtype='object')


Unnamed: 0,Intensity 31_200_PS Carb_10,Intensity 32_200_PS Carb_100,Intensity 33_small_Citrate_10,Intensity 34_large_Citrate_10,Intensity 35_small_PEI_10,Intensity 36_large_PEI_10,Intensity 37_large_PVP@AU_100,Intensity 38_small_PVP@AU_10,Intensity 39_small_PVP@AU_100,Intensity 40_large_Citrate_100,...,Intensity 45_large_PEG@Au_100,Intensity 46_large_PEI@Au_100,Intensity 47_small_PEI@Au_100,Intensity 48_200_PS Carb@ PEG_100,Intensity 49_200_PS Carb@ PEG_100,Intensity 50_FBS,Intensity 51_SPQC1,Intensity 52_SPQC2,Intensity 53_SPQC3,Majority protein IDs
0,#!{Type}E,E,E,E,E,E,E,E,E,E,...,E,E,E,E,E,E,E,E,E,T
1,0,0,0,4868800,12841000,0,10031000,0,8823200,25223000,...,0,0,0,0,0,54812000,7079300,7460400,25913000,A2I7M9
2,5750500,0,6179200,29202000,10419000,37879000,157260000,3491300,172580000,162570000,...,33192000,1102400,19913000,0,2565600,330430016,52667000,96897000,112370000,A2I7N0;Q3ZEJ6
3,0,0,0,0,0,0,0,0,0,0,...,2829000,0,0,0,0,29073000,0,0,0,A2I7N1
4,0,0,0,0,0,0,21393000,4459100,23326000,33325000,...,15416000,3883600,0,0,0,53483000,17526000,24811000,17598000,A2I7N2
5,8701900,2736100,82242000,116270000,117630000,126700000,449929984,56725000,459680000,692840000,...,173010000,12242000,105910000,11394000,22451000,1192499968,431300000,335150016,368540000,A2I7N3
6,0,1098500,0,0,0,0,0,0,0,0,...,0,0,0,2982400,16490000,0,0,0,0,A2VE23
7,0,0,0,0,0,0,0,0,0,60552000,...,0,0,0,0,0,0,0,0,0,A2VE99
8,0,2393200,13147000,9260800,0,0,0,0,0,0,...,0,0,0,3896700,10045000,0,0,0,0,Q58DS5;Q2HJI8;A4FV54;Q2HJH2;Q1RMR4;P10948;P110...
9,0,0,7893500,13742000,26382000,33397000,70589000,12098000,118180000,143470000,...,20129000,3488600,138460000,0,0,0,63205000,76583000,48905000,Q5E9E2;A4IF97


Merge Processed Proteomic Data with Uniprot Knowledge Database
--

For the variable 'prot_file', provide the path to the Excel file downloaded from UniProtKB for the proteome of interest. Download the Swiss-Prot entries for a Swiss-Prot (SP) analysis, or both the Swiss-Prot and TrEMBL entries for a TrEMBL analysis.

Expected Excel file columns: 'Entry', 'Protein names', 'Length', 'Mass', 'Sequence'.

In [None]:
# For each quantification method: attach UniProt protein names/masses to the
# Perseus table, normalise every sample column to percent abundance, sort the
# proteins by mean abundance, and write the result to CSV and Excel.

# ---- Configuration ----------------------------------------------------------
# Column-name prefixes of the quantification methods to export (one file each).
quant_method = ['Intensity ', 'iBAQ ', 'Top3 ']
text_columns = ['Majority protein IDs']
uniprot_type = [' Swiss', ' Swiss + TrEMBLE']

# Label for the Uniprot database in use; Swiss-Prot or Swiss-Prot + TrEMBL.
# Only used to build the identifier that is printed for each method.
uniprot_file = uniprot_type[0]  # Adjust index as needed

# Below: link directory to downloaded UniProt file; currently just the SP file being used
prot_file = '/content/drive/MyDrive/ProteinCoronaPredict_PayneLab/Input_Data/Proteomics/Uniprot_Files/Bovine_swiss_05_04_24.xlsx'

# Extract filename from the file path (its first three characters prefix the identifier)
filename = os.path.basename(prot_file)

# Perseus txt file (same file as loaded in the "Load Processed Proteomic Data" cell)
perseus_file = '/content/drive/MyDrive/ProteinCoronaPredict_PayneLab/Input_Data/Proteomics/Perseus_Files/Bov_SP_06252024_Pers.txt'

zero_replace = 1 * (10 ** 5)  # not used in this cell; kept in case other cells read it
descriptor_cols = 4           # leading non-sample columns: Entry, prot2, prot, MW
scale_factor = 100
version = 'complete'          # to help identify files - CHANGE NAME TO YOUR FILE

# ---- Load inputs once (they do not depend on the quantification method) ------
perseus_raw = pd.read_csv(perseus_file, sep='\t', lineterminator='\n')

# Remove the extra whitespace from column names
perseus_raw.columns = perseus_raw.columns.str.strip()

# Uniprot Database file:
prot_details = pd.read_excel(prot_file)

# Row 0 of the Perseus export is not data: in the preview output above it holds
# the column-type annotations ('#!{Type}E', 'E', ..., 'T'), so drop it.
perseus_raw = perseus_raw.drop([0])

# NOTE(review): 'Majority protein IDs' groups separated by semicolons are NOT
# split into separate rows. A multi-ID group presumably will not equal any
# single Uniprot 'Entry', in which case the row is dropped further down -
# confirm that this is intended.

for i in quant_method:
    identifier = filename[:3] + uniprot_file + ' ' + i
    print(identifier)  # print to verify

    # Keep only the protein ID column and this method's sample columns.
    samples = perseus_raw.filter(like=i)
    text = perseus_raw[text_columns]
    data = pd.concat([text, samples], axis=1)

    # DATA PARSING Section
    # Cast to str first so non-string values (e.g. NaN floats) do not break
    # extract_string, then reduce each value to its identifier text.
    data['Majority protein IDs'] = (
        data['Majority protein IDs'].astype(str).apply(extract_string)
    )
    data = data.rename(columns={'Majority protein IDs': 'Entry'})

    # Join protein names/masses from Uniprot. Despite the variable name this is
    # a LEFT join: every Perseus row is kept, unmatched rows get NaN.
    inner_join = data.merge(prot_details, on='Entry', how='left', suffixes=('_left', '_right'))
    inner_join.to_csv('merge.csv')  # debug dump in the working directory, overwritten per method

    # .values bypasses index alignment: `data` is indexed from 1 (row 0 was
    # dropped) while the merge result is re-indexed from 0. This relies on the
    # merge returning exactly one row per `data` row; duplicate 'Entry' values
    # in the Uniprot file that match a protein would make insert() raise.
    data.insert(1, 'MW', inner_join['Mass'].values)
    data.insert(1, 'prot', inner_join['Protein names'].values)
    #data.insert(1, 'GO', inner_join['Gene ontology (biological process)'].values) # Can uncomment if pulling this info from Uniprot database

    # Short protein name: the text before the first '(' and before the first '['.
    data.insert(1, 'prot2', data['prot'].str.split("(").str[0])
    data['prot2'] = data['prot2'].str.split("[").str[0]

    # Drop proteins without a Uniprot name. Assign the result instead of using
    # a chained inplace replace(), which pandas warns will stop working in 3.0.
    data['prot'] = data['prot'].replace('', np.nan)
    data.dropna(subset=['prot'], inplace=True)

    # DATA CALCULATIONS and SORTING
    data_norm = data.copy()
    value_cols = data_norm.columns[descriptor_cols:]

    # The sample columns were read as text (because of the annotation row).
    # Replace them with float columns; assigning by column label gives real
    # float64 columns, whereas an iloc slice assignment may keep object dtype.
    data_norm[value_cols] = data_norm[value_cols].astype(float)

    # NOTE(review): each of the three steps below divides a column by one
    # column-wide scalar, so the final values are (up to floating-point
    # rounding) value / column_sum * 100. The earlier steps are kept as in the
    # original pipeline.

    # Normalizing using Trimmean (10% trimmed from each tail)
    for col in value_cols:
        current_trimmean = trim_mean(data_norm[col], 0.1)
        if current_trimmean == 0:
            current_trimmean = 1  # avoid division by zero
        data_norm[col] /= current_trimmean

    # Scaling data after normalizing
    for col in value_cols:
        mean_val = data_norm[col].mean()
        if mean_val != 0:
            data_norm[col] = (data_norm[col] / mean_val) * scale_factor

    # Turn into percent
    for col in value_cols:
        total = data_norm[col].sum()
        if total != 0:
            data_norm[col] = (data_norm[col] / total) * scale_factor

    # Sort by average abundance across samples (highest first)
    data_norm_out = data_norm.copy()
    data_norm_out['avg'] = data_norm_out.iloc[:, descriptor_cols:].mean(axis=1)
    data_norm_out.sort_values('avg', ascending=False, inplace=True)
    data_norm_out.drop(columns=['avg'], inplace=True)

    # Shorten each sample column name to the two characters that follow the
    # method prefix (e.g. 'Intensity 31_200_PS Carb_10' -> '31').
    x = len(i)
    data_norm_out.rename(
        columns={col: col[x:x + 2] for col in data_norm_out.columns[descriptor_cols:]},
        inplace=True,
    )

    # Save
    # NOTE(review): `i` ends with a space, so the file names contain a space
    # before '_complete'; left unchanged because other code may expect them.
    data_norm_out.to_csv(f'/content/drive/MyDrive/ProteinCoronaPredict_PayneLab/Input_Data/Proteomics/Abundance_Files/Bov_Swiss_{i}_{version}.csv', index=False)
    data_norm_out.to_excel(f'/content/drive/MyDrive/ProteinCoronaPredict_PayneLab/Input_Data/Proteomics/Abundance_Files/Bov_Swiss_{i}_{version}.xlsx', index=False)

Bov Swiss Intensity 


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['prot'].replace('', np.nan, inplace=True)


Bov Swiss iBAQ 


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['prot'].replace('', np.nan, inplace=True)


Bov Swiss Top3 


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['prot'].replace('', np.nan, inplace=True)
