Import Statements
--

In [None]:
# Mount Google Drive at /content/drive; every file path used below lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import pandas as pd
import csv
import numpy as np
import scipy #as stats
import re
from scipy import stats
from scipy.stats import trim_mean

In [None]:
def extract_string(input_string):
    """Return the accession part of a protein identifier string.

    Carriage returns are removed, an optional UniProt database prefix
    (``sp|`` or ``tr|``) is skipped, and the text up to the next ``|`` (or the
    end of the string) is returned.

    Parameters
    ----------
    input_string : str or scalar
        Identifier such as ``'sp|P02769|ALBU_BOVIN'`` or a bare ``'P02769'``.
        Non-string cells (for example the ``0`` written by ``fillna(0)``) are
        converted with ``str()`` instead of raising ``AttributeError``.

    Returns
    -------
    str or None
        The extracted identifier, or ``None`` when the input is ``None`` or
        contains nothing before the first ``|`` to extract.
    """
    if input_string is None:
        return None

    # Coerce first: numeric cells have no .replace() method.
    cleaned = str(input_string).replace('\r', '')

    # 'tr|' (TrEMBL) is skipped as well as 'sp|' (Swiss-Prot); previously a
    # 'tr|ACC|NAME' header returned the literal 'tr' instead of the accession.
    # '\r' is already stripped above, so it no longer needs excluding here.
    match = re.search(r'(?:(?:sp|tr)\|)?([^|]+)', cleaned)
    return match.group(1) if match else None

def drop_after_semicolon(input_string):
    """Keep only the text before the first ``;``.

    Parameters
    ----------
    input_string : str or any
        Typically a ``;``-separated list of protein IDs.

    Returns
    -------
    str or any
        For strings, the part before the first ``;`` (the whole string when
        there is none). Non-string values such as ``None`` are returned
        unchanged, so the output of ``extract_string`` (which may be ``None``)
        can be passed straight in without raising ``TypeError``.
    """
    if not isinstance(input_string, str):
        return input_string
    # maxsplit=1: only the first field is needed.
    return input_string.split(";", 1)[0]

Remove carriage return character from Perseus text file

In [None]:
# Load the file
# Tab-separated Perseus text export stored on the mounted Google Drive.
file_path = '/content/drive/MyDrive/Predicting_the_Protein_Corona_Vijgen/Input_Data/Proteomics/Perseus_Files/Bov_SP_06252024_Pers.txt'
# Rows are split on '\n' only.
# NOTE(review): presumably this leaves any '\r' from Windows line endings inside the
# data so it can be stripped explicitly in the following cells - confirm.
data = pd.read_csv(file_path, sep='\t', lineterminator='\n')

In [None]:
# Inspect the column names exactly as they were read from the file (before any stripping).
data.columns

Index(['Intensity 31_200_PS Carb_10', 'Intensity 32_200_PS Carb_100',
       'Intensity 33_small_Citrate_10', 'Intensity 34_large_Citrate_10',
       'Intensity 35_small_PEI_10', 'Intensity 36_large_PEI_10',
       'Intensity 37_large_PVP@AU_100', 'Intensity 38_small_PVP@AU_10',
       'Intensity 39_small_PVP@AU_100', 'Intensity 40_large_Citrate_100',
       'Intensity 41_large_PEI_100', 'Intensity 42_small_Citrate_100',
       'Intensity 43_small_PEI_100', 'Intensity 44_large_PVP@AU_10',
       'Intensity 45_large_PEG@Au_100', 'Intensity 46_large_PEI@Au_100',
       'Intensity 47_small_PEI@Au_100', 'Intensity 48_200_PS Carb@ PEG_100',
       'Intensity 49_200_PS Carb@ PEG_100', 'Intensity 50_FBS',
       'Intensity 51_SPQC1', 'Intensity 52_SPQC2', 'Intensity 53_SPQC3',
       'iBAQ 31_200_PS Carb_10', 'iBAQ 32_200_PS Carb_100',
       'iBAQ 33_small_Citrate_10', 'iBAQ 34_large_Citrate_10',
       'iBAQ 35_small_PEI_10', 'iBAQ 36_large_PEI_10',
       'iBAQ 37_large_PVP@AU_100', 'iBAQ 

In [None]:
# Remove the extra whitespace from column names
data.columns = data.columns.str.strip()

# Replace NaN values with zero (reassigned instead of inplace=True)
data = data.fillna(0)

# fillna(0) also writes integer zeros into 'Majority protein IDs' wherever an ID
# was missing, and extract_string calls str.replace on its argument. Cast to str
# first (as the processing cell further down does) so a missing ID cannot raise
# AttributeError here.
protein_ids = data['Majority protein IDs'].astype(str)

# Apply extract_string function (strips '\r' and the database prefix)
protein_ids = protein_ids.apply(extract_string)

# Apply drop_after_semicolon function (keep only the first ID of each group)
data['Majority protein IDs'] = protein_ids.apply(drop_after_semicolon)

# Print the resulting DataFrame columns
print(data.columns)

# Display the first 50 cleaned protein IDs
print(data['Majority protein IDs'].head(50))

# Save the updated DataFrame to a new file
data.to_csv('/content/drive/MyDrive/Predicting_the_Protein_Corona_Vijgen/Input_Data/Proteomics/Perseus_Files/Updated/Bov_SP_06252024_Pers.txt', sep='\t', index=False)

Index(['Intensity 31_200_PS Carb_10', 'Intensity 32_200_PS Carb_100',
       'Intensity 33_small_Citrate_10', 'Intensity 34_large_Citrate_10',
       'Intensity 35_small_PEI_10', 'Intensity 36_large_PEI_10',
       'Intensity 37_large_PVP@AU_100', 'Intensity 38_small_PVP@AU_10',
       'Intensity 39_small_PVP@AU_100', 'Intensity 40_large_Citrate_100',
       'Intensity 41_large_PEI_100', 'Intensity 42_small_Citrate_100',
       'Intensity 43_small_PEI_100', 'Intensity 44_large_PVP@AU_10',
       'Intensity 45_large_PEG@Au_100', 'Intensity 46_large_PEI@Au_100',
       'Intensity 47_small_PEI@Au_100', 'Intensity 48_200_PS Carb@ PEG_100',
       'Intensity 49_200_PS Carb@ PEG_100', 'Intensity 50_FBS',
       'Intensity 51_SPQC1', 'Intensity 52_SPQC2', 'Intensity 53_SPQC3',
       'iBAQ 31_200_PS Carb_10', 'iBAQ 32_200_PS Carb_100',
       'iBAQ 33_small_Citrate_10', 'iBAQ 34_large_Citrate_10',
       'iBAQ 35_small_PEI_10', 'iBAQ 36_large_PEI_10',
       'iBAQ 37_large_PVP@AU_100', 'iBAQ 

In [None]:
# Configuration for this run
quant_method = ['Intensity '] #'iBAQ ', 'Top3 '
text_columns = ['Majority protein IDs']
uniprot_type = [' Swiss'] #' Swiss + TrEMBLE'

# Set Uniprot Database file based on uniprot_type; Swiss-Prot or Swiss-Prot + TrEMBLE
uniprot_file = uniprot_type[0]  # Adjust index as needed
prot_file = '/content/drive/MyDrive/Predicting_the_Protein_Corona_Vijgen/Input_Data/Proteomics/Uniprot_Files/Bovine_swiss_05_04_24.xlsx'

# Extract filename from the file path
filename = os.path.basename(prot_file)

# Uniprot Database file. It does not depend on the quantification method, so it
# is read once here instead of once per loop iteration.
prot_details = pd.read_excel(prot_file)

for i in quant_method:
    print(filename) # print to verify

    # Perseus txt file (from Updated subfolder):
    data = pd.read_csv('/content/drive/MyDrive/Predicting_the_Protein_Corona_Vijgen/Input_Data/Proteomics/Perseus_Files/Updated/Bov_SP_06252024_Pers.txt', sep='\t', lineterminator='\n')

    # Remove the extra whitespace from column names
    data.columns = data.columns.str.strip()

    # Label used in the output file name: first three characters of the Uniprot
    # file name + database label + quantification prefix.
    identifier = filename[:3] + uniprot_file + ' ' + i
    print(identifier) # print to verify

    # Drop the first data row (index label 0). read_csv has already consumed the
    # header line, so this is NOT the column headers.
    # NOTE(review): presumably this is a Perseus annotation row rather than a
    # real protein - confirm against the input file.
    data = data.drop([0])
    samples = data.filter(like=i)
    text = data[text_columns]
    # Drop all columns that are not pertinent from Perseus output
    data = pd.concat([text, samples], axis=1)

    # DATA PARSING Section
    zero_replace = 1 * (10 ** 5)  # NOTE: not referenced anywhere else in this cell

    # Preprocess 'Majority protein IDs' column to handle float values
    data['Majority protein IDs'] = data['Majority protein IDs'].astype(str)

    # Apply extract_string function
    data['Majority protein IDs'] = data['Majority protein IDs'].apply(extract_string)

    data = data.rename(columns={'Majority protein IDs': 'Entry'})

    # Join Protein Names from the protein IDs (left join: every row of `data` is kept).
    # The looked-up columns are inserted positionally via .values below, which
    # requires the merge result to have exactly one row per row of `data`,
    # i.e. 'Entry' must not be duplicated in prot_details.
    inner_join = data.merge(prot_details, left_on='Entry', right_on='Entry', how='left', suffixes=('_left', '_right'))
    inner_join.to_csv('merge.csv')
    data.insert(1, 'MW', inner_join['Mass'].values)
    data.insert(1, 'prot', inner_join['Protein names'].values)
    #data.insert(1, 'GO', inner_join['Gene ontology (biological process)'].values) # can uncomment if pulling this info from Uniprot database

    # Short protein name: text before the first '(' and before the first '['
    data.insert(1, 'prot2', data['prot'].str.split("(").str[0])
    data['prot2'] = data['prot2'].str.split("[").str[0]

    # Treat empty names as missing and drop proteins without a Uniprot match.
    # Assign the result back: an inplace replace on data['prot'] is a chained
    # operation and does not update `data` under pandas Copy-on-Write.
    data['prot'] = data['prot'].replace('', np.nan)
    data = data.dropna(subset=['prot'])


    # DATA CALCULATIONS and SORTING
    # Leading descriptor columns at this point: Entry, prot2, prot, MW
    descriptor_cols = 4

    # Convert the sample columns to float. astype by column label replaces the
    # columns; assigning through .iloc[:, k:] sets values in place on pandas >= 2.0
    # and would leave object-dtype columns as object.
    data_norm = data.copy()
    sample_cols = list(data_norm.columns[descriptor_cols:])
    data_norm = data_norm.astype({col: float for col in sample_cols})

    # Normalizing using Trimmean (10% cut from each tail); a zero trimmed mean is
    # replaced by 1 so the column is left unchanged instead of dividing by zero.
    for col in sample_cols:
        current_trimmean = trim_mean(data_norm[col], 0.1)
        if current_trimmean == 0:
            current_trimmean = 1
        data_norm[col] /= current_trimmean

    # Scaling data after normalizing
    scale_factor = 100
    for col in sample_cols:
        mean_val = data_norm[col].mean()
        if mean_val != 0:  # Avoid division by zero
            data_norm[col] = (data_norm[col] / mean_val) * scale_factor

    # Turn into Percent (each sample column sums to scale_factor).
    # NOTE(review): this last rescaling divides out the per-column constants
    # applied by the two steps above, so they only affect the result through
    # floating-point rounding - confirm whether they are still wanted.
    for col in sample_cols:
        total = data_norm[col].sum()
        if total != 0:  # Avoid division by zero
            data_norm[col] = (data_norm[col] / total) * scale_factor

    # Sort by average abundance across samples (descending); 'avg' is only a sort key
    data_norm_out = data_norm.copy()
    data_norm_out['avg'] = data_norm_out.iloc[:, descriptor_cols:].mean(axis=1)
    data_norm_out = data_norm_out.sort_values('avg', ascending=False).drop(columns=['avg'])

    # Rename each sample column to the two characters that follow the quant
    # prefix, e.g. 'Intensity 31_200_PS Carb_10' -> '31'.
    # NOTE: sample numbers that are not exactly two characters long would be
    # truncated or pick up the following character.
    x = len(i)
    short_names = {col: col[x:x+2] for col in data_norm_out.columns[descriptor_cols:]}
    data_norm_out = data_norm_out.rename(columns=short_names)

    # Save the processed data NEED TO CHANGE FILE PATH MOVING FORWARD
    data_norm_out.to_csv(f'/content/drive/MyDrive/Predicting_the_Protein_Corona_Vijgen/Input_Data/Proteomics/Abundance_Files/test/{identifier}_v2.csv', index=False)

Bovine_swiss_05_04_24.xlsx
Bov Swiss Intensity 
