In [9]:
### These codes are for selecting ecoli psms to be used as internal standards

def ecoli_psm_selection(unlabeled_file, labeled_file, control_file, top_n, control_tsv_output):
    
    #takes the psm_tsv from each file and extracts the ecoli peptides that: 
    #1. have only one instance (difference in charge counts as a unique separate instances)
    #2. have intensity data in the psm file
    #3. psms in the labeled and unlabeled file that are found in the control file (they do not need to be found in both the labeled and unlabeled file)
    # the top n is the top best psms based on having the lowest eculidean distance to the best fit line between the control and either the labeled or unlabeled samples. (I found that generally, keeping top_n to 200 results in a r value of > 0.90)
    # returns 3 dataframes, 1. the top n ecoli peptides in the unlabeled sample, 2. the top n ecoli peptides in the labeled sample, and control ecoli peptides.
    
    import pandas as pd
    from pandas import merge
    import numpy as no
    from scipy.linalg import svd
    
    # first handling the control df
    control_df = intial_ecoli_filter(pd.read_csv(control_file, sep='\t'))

    # second handling the unlabeled df
    unlabeled_df = intial_ecoli_filter(pd.read_csv(unlabeled_file, sep='\t'))
    
    # third handling the labeled df
    labeled_df = intial_ecoli_filter(pd.read_csv(labeled_file, sep='\t'))
    
    # finding the best 200 of unique ids between the control and unlabeled df to be used as an internal standard
    unlabeled_ecoli_std_peptides_list = find_best_ecoli_standards(control_df, unlabeled_df, top_n)
    
    # finding the best 200 of unique ids between the control and labeled df to be used as an internal standard
    labeled_ecoli_std_peptides_list = find_best_ecoli_standards(control_df, labeled_df, top_n)
    
    # creating a final dataframe with all the information of the top n ecoli psms to be used as internal standards in the unlabeled and labeled sample
    unlabeled_ecoli_std_peptides_df = unlabeled_df[unlabeled_df['Unique_ID'].isin(unlabeled_ecoli_std_peptides_list)]
    labeled_ecoli_std_peptides_df = labeled_df[labeled_df['Unique_ID'].isin(labeled_ecoli_std_peptides_list)]
    
    control_df.to_csv(control_tsv_output, sep = "\t")
    return(unlabeled_ecoli_std_peptides_df, labeled_ecoli_std_peptides_df, control_df)

    
def intial_ecoli_filter(ecoli_df):
    #1. takes the psm df and then filters out psms that have intensity data
    #2. creates a unique_id containing the modified peptide + Charge and then filters unique values 
    #3. normalizes the intensity data to the mean value
    from statistics import mean
    
    # if the intensity > 0
    ecoli_df= ecoli_df.loc[(ecoli_df['Intensity'] > 0)]
    # if the psm is from ecoli
    ecoli_df= ecoli_df[ecoli_df['Entry Name'].str.contains("ECO")]
    # creating a unique_identifier using the modified peptide _ charge to then filter based on unique instances
    ecoli_df['Unique_ID'] = ecoli_df["Modified Peptide"] + "_" + ecoli_df['Charge'].apply(str)
    ecoli_df = ecoli_df.drop_duplicates(subset=['Unique_ID'], keep=False)
    
    #normalizing the ecoli intensity data
    average_intensity = mean(ecoli_df["Intensity"].tolist())
    ecoli_df['Normalized Intensity'] = ecoli_df['Intensity'] / average_intensity
    
    return(ecoli_df)

def find_best_ecoli_standards(df1, df2, best_number):
    # merges two df based on common "Unique ID" resulting in the "Normalized Intensity_x" and "Normalized Intensity_y" columns
    # top_number is the psms with the lowest euclidean distance to the best fit line from plotting the x and y normalized intensity
    # will return a list of ecoli psms 
    import scipy as sp
    
    merged_df = pd.merge(df1, df2, on= 'Unique_ID')
    
    #creating a new column with the euclidian distance 
    points = merged_df[['Normalized Intensity_x', 'Normalized Intensity_y']].values
    plane_normal = np.array([1.0, 1.0])
    projected_points = project_points(points, plane_normal)
    best_fit_line, centroid = fit_line(projected_points)
    distances = []
    for point in projected_points:
        distance = euclidean_distance(point, centroid, best_fit_line)
        distances.append(distance)
    merged_df['Distances'] = distances
    
    #filtering out the lowest 'best_number', Distances values for the top matches
    merged_df = merged_df.nsmallest(best_number, ['Distances'])
    
    #un comment if you want to see the r value
    r, p = sp.stats.pearsonr(x=merged_df['Normalized Intensity_x'], y=merged_df['Normalized Intensity_y'])
    print(r)
    
    return(merged_df['Unique_ID'].tolist())
    
    
import numpy as np
from scipy.linalg import svd
import pandas as pd

def project_points(points, plane_normal):
    # Normalize the plane normal
    plane_normal /= np.linalg.norm(plane_normal)

    # Calculate the projection matrix
    projection_matrix = np.eye(2) - np.outer(plane_normal, plane_normal)

    # Project the points onto the plane
    projected_points = np.dot(points, projection_matrix)

    return projected_points

def fit_line(points):
    # Find the centroid of the points
    centroid = np.mean(points, axis=0)

    # Subtract the centroid from the points
    centered_points = points - centroid

    # Perform singular value decomposition
    _, _, vh = svd(centered_points, full_matrices=False)

    # The best fit line is given by the right singular vector corresponding to the smallest singular value
    best_fit_line = vh[-1]

    return best_fit_line, centroid

def euclidean_distance(point, line_point, line_direction):
    # Calculate the vector from the line point to the input point
    vector = point - line_point

    # Calculate the perpendicular distance as the norm of the cross product
    distance = np.linalg.norm(np.cross(vector, line_direction))

    return distance
    
# ecoli_psm_selection('Unlabeled_files/c1_ul_02_psm.tsv', 'Labeled_files/C1/c1_l_01_psm.tsv', 'test_files/ecoli_std_psm.tsv', 200)


In [11]:
### Selecting all Hek psms that have unlabeled and labeled modifications
### This is old code, but it works

### Returns a DF with the peptide, modified peptide, rt, intensity, reported modifications, and protein

# Analysis 1o2
C1=["14.0156","28.0313"]
C2=["28.0313","56.0626"]
C3=["42.0470","84.0939"]
C4=["56.0625","112.1253"]
C5=["70.0782","140.1564"]
C6=["84.0939","168.1878"]

def labeled_hek_selection(L_psm, C_chain):
    # extracts labeled psms that can be used for filtering
    # C_chain can be any of the C1 - C6 w/o quotes
    import pandas as pd
    df = pd.read_csv(L_psm, sep='\t')
    all_peptides = df['Peptide'].tolist()
    unique_peptides = [x for x in all_peptides if all_peptides.count(x) == 1]
    labeled_peptides_4_df = []
    with open(L_psm, "r") as psm_open:
        psm_split = psm_open.read().split("\n")[1:-1]
        for each_line in psm_split:
            line_split = each_line.split("\t")
            #designates if it is unique or not for later filtering purposes if needed.
            if line_split[2] in unique_peptides:  
                unique = 'yes'
            else:
                unique = 'no'
            if line_split[3] != "":
                #all labeled peptides need to have a modified version in the 4th column in excell
                if "HUMAN" in line_split[30]:
                    # only looking at human psms
                    peptide = line_split[2]
                    mod_peptide = line_split[3]
                    modifications = line_split[27]

                    # counts how many carbon chains are on the psm
                    num_of_carbon = 0
                    mono = 0
                    di = 0
                    mono += 1*modifications.count(C_chain[0])
                    num_of_carbon += 1*modifications.count(C_chain[0])
                    di += 1*modifications.count(C_chain[1])
                    num_of_carbon += 2*modifications.count(C_chain[1])

                    charge = line_split[7]
                    protein = line_split[30]
                    rt = float(line_split[8])

                    #Because the tsv file does not give the whole intensity, only the XeY version
                    if "E" in line_split[26]:
                        intensity = int(float(line_split[26].split("E")[0])*(10**int(line_split[26].split("E")[1])))
                    else: 
                        intensity = int(float(line_split[26]))
                    modifications = line_split[27]
                    if intensity > 0:
                        labeled_peptides_4_df.append([peptide, mod_peptide, unique, charge, rt, intensity, modifications, num_of_carbon, str(mono)+","+str(di), protein])

    labeled_peptide_df = pd.DataFrame(labeled_peptides_4_df, columns = ["peptide","modified_peptide", 'unique', 'charge', "rt", "intensity", "modification", "carbon chains", "label_frequency",  "protein"]).sort_values(by=["rt"])
    return(labeled_peptide_df)

### Selecting all unlabeled HEK psms that:
# are unique

def unlabeled_hek_selection(U_psm):
    # extracts unlabeled psms that can be used for filtering
    import pandas as pd
    df=pd.read_csv(U_psm, sep='\t')
    aL_peptides = df['Peptide'].tolist()
    unique_peptides = [x for x in aL_peptides if aL_peptides.count(x) == 1]
    completely_labeled_peptides_4_df = []
    completely_unlabeled_peptides_4_df = []
    with open(U_psm, "r") as psm_open:
        psm_split = psm_open.read().split("\n")[1:-1]
        for each_line in psm_split:
            line_split = each_line.split("\t")
            if line_split[2] in unique_peptides:
            #needs to be unique
                if "HUMAN" in line_split[30]:
                    # only looking at human psms
                    peptide = line_split[2]
                    charge = line_split[7]
                    mod_peptide = line_split[3]
                    protein = line_split[30]
                    rt = float(line_split[8])
                    if "E" in line_split[26]:
                        intensity = int(float(line_split[26].split("E")[0])*(10**int(line_split[26].split("E")[1])))
                    else: 
                        intensity = int(float(line_split[26]))
                    modifications = line_split[27]
                    
                    if intensity > 0:
                        if "[" not in mod_peptide: 
                            #any aldehyde modification has this character, because its labeled ecoli + unlabeled hek, all labeled hek proteins should be false
                            completely_unlabeled_peptides_4_df.append([peptide, mod_peptide, charge, rt, intensity, modifications, "0,0", protein])
    completely_unlabeled_peptide_df = pd.DataFrame(completely_unlabeled_peptides_4_df, columns = ["peptide","modified_peptide", 'charge',"rt", "intensity", "modification", "label_frequency", "protein"]).sort_values(by=["rt"])
    return(completely_unlabeled_peptide_df)

### Takes all the commonly identified psms between labeled and unlabeled HEK

def human_psm_selection(U_df, L_df):
    #takes the common psms between the unlabeled and the labeled dataframe, these can be used for intensity analysis
    U_pep_list = U_df['peptide'].tolist()
    L_pep_list = L_df['peptide'].tolist()
    common_peptides = set(U_pep_list).intersection(set(L_pep_list))
    filtered_U_psm = U_df.loc[U_df['peptide'].isin(common_peptides)]
    filtered_L_psm = L_df.loc[L_df['peptide'].isin(common_peptides)]
    return(filtered_U_psm, filtered_L_psm)

In [18]:
# This code lists the preceeding and succeeding ecoli psm information that can be used for the intensity transformation downstream

def intensity_transformation_analysis(hek_df, ecoli_df, which_sample):
    # this works for either labeled or unlabeled hek as long as you pair up the selected unlabeled hek df (first output from human_psm_selection()) and the selected unlabeled ecoli df (first output from ecoli_psm_selection()) and the labeled dfs (second outputs)
    # will return the hek_df with aditional columns: "preceding peptide ID", "preceding int", 'preceding rt', 'preceding delta rt', "succeeding peptide ID", "succeeding int", 'succeeding rt', 'succeeding delta rt'
    # all the column with the "preceding" title are values of the nearest ecoli peptide that elutes BEFORE the hek peptide and all the columns with the "succeeding" title are values of the nearest ecoli peptide that elutes AFTER the hek peptide
    # KEEP IN MIND: These are the values of the ecoli peptides that were run with the hek peptides, and so we can derive the ratio between the HEK and Ecoli directly. To make a comparison between the unlabeled and labeled peptides, we need to project the peptide ID back to the control dataframe to get the conversion ratio.
    # Which sample is either: "Labeled" or "Unlabeled" this will specift wiether the intensity ratio should be HEK/Ecoli for the labeled sample or Ecoli/HEK for the unlabeled sample as specified by the intensity transformation equation.
    # output_file_name is the name of the tsv file
    
    import math
    import pandas as pd
    import peptides
    import numpy as np

    # Initialize new columns in hek_df to store the preceding and succeeding values
    hek_df["preceding Ecoli peptide ID"] = ''
    hek_df['preceding Ecoli int'] = np.nan
    if which_sample == "Labeled":
        hek_df['preceding HEK/Ecoli int ratio'] = np.nan
    if which_sample == "Unlabeled":
        hek_df['preceding Ecoli/HEK int ratio'] = np.nan
    hek_df['preceding Ecoli rt'] = np.nan
    hek_df['preceding Ecoli delta rt'] = np.nan
    
    hek_df["succeeding Ecoli peptide ID"] = ''
    hek_df['succeeding Ecoli int'] = np.nan
    if which_sample == "Labeled":
        hek_df['succeeding HEK/Ecoli int ratio'] = np.nan
    if which_sample == "Unlabeled":
        hek_df['succeeding Ecoli/HEK int ratio'] = np.nan
    hek_df['succeeding Ecoli rt'] = np.nan
    hek_df['succeeding Ecoli delta rt'] = np.nan
    
    # Iterate over each row in hek_df
    for index, row in hek_df.iterrows():

        retention_time = row['rt']
        hek_int = row['intensity']

        # Find the closest preceding matching retention time in ecoli_df
        preceding_rt = ecoli_df.loc[ecoli_df['Retention'] < retention_time, 'Retention'].max()
        preceding_peptide_ID = ecoli_df.loc[ecoli_df['Retention'] == preceding_rt, 'Unique_ID'].values
        preceding_peptide_int = ecoli_df.loc[ecoli_df['Retention'] == preceding_rt, 'Intensity'].values if not pd.isnull(preceding_rt) else np.nan


        # Find the closest succeeding matching retention time in ecoli_df
        succeeding_rt = ecoli_df.loc[ecoli_df['Retention'] > retention_time, 'Retention'].min()
        succeeding_peptide_ID = ecoli_df.loc[ecoli_df['Retention'] == succeeding_rt, 'Unique_ID'].values
        succeeding_peptide_int = ecoli_df.loc[ecoli_df['Retention'] == succeeding_rt, 'Intensity'].values if not pd.isnull(succeeding_rt) else np.nan
        
        # Calculate the absolute difference for preceding and succeeding times
        preceding_delta_rt = retention_time - preceding_rt if not pd.isnull(preceding_rt) else np.nan
        succeeding_delta_rt = succeeding_rt - retention_time if not pd.isnull(succeeding_rt) else np.nan
        
        # Calculate the intensity ratios for preceding and succeeding times
        
        if which_sample == "Labeled":
            preceding_int_ratio = hek_int/preceding_peptide_int if not pd.isnull(preceding_rt) else np.nan
            succeeding_int_ratio = hek_int/succeeding_peptide_int if not pd.isnull(succeeding_rt) else np.nan
        if which_sample == "Unlabeled":
            preceding_int_ratio = preceding_peptide_int/hek_int if not pd.isnull(preceding_rt) else np.nan
            succeeding_int_ratio = succeeding_peptide_int/hek_int if not pd.isnull(succeeding_rt) else np.nan
            
        # Update the new columns in hek_df
        hek_df.at[index, 'preceding Ecoli rt'] = preceding_rt
        hek_df.at[index, 'preceding Ecoli delta rt'] = preceding_delta_rt
        hek_df.at[index, 'preceding Ecoli int'] = preceding_peptide_int
        if which_sample == "Labeled":
            hek_df.at[index, 'preceding HEK/Ecoli int ratio'] = preceding_int_ratio
        if which_sample == "Unlabeled":
            hek_df.at[index, 'preceding Ecoli/HEK int ratio'] = preceding_int_ratio
        hek_df.at[index, "preceding Ecoli peptide ID"] = preceding_peptide_ID[0] if len(preceding_peptide_ID) > 0 else np.nan
        
        hek_df.at[index, 'succeeding Ecoli rt'] = succeeding_rt
        hek_df.at[index, 'succeeding Ecoli delta rt'] = succeeding_delta_rt
        hek_df.at[index, 'succeeding Ecoli int'] = succeeding_peptide_int
        if which_sample == "Labeled":
            hek_df.at[index, 'succeeding HEK/Ecoli int ratio'] = succeeding_int_ratio
        if which_sample == "Unlabeled":
            hek_df.at[index, 'succeeding Ecoli/HEK int ratio'] = succeeding_int_ratio
        hek_df.at[index, "succeeding Ecoli peptide ID"] = succeeding_peptide_ID[0] if len(succeeding_peptide_ID) > 0 else np.nan
        
    return(hek_df)
    

    
# intensity_transformation_analysis(c1_U_hek_df, ecoli_U_df)

In [17]:
### The following first 3 functions calculate the hydrophobicity of the peptides being ClogP, Gravy Score, and BBindex
### The last function adds the data to the output df from intensity transformation analysis and outputs it to a final tsv file for analysis

def logp_calc_manual(peptide:str, charge:str, carbon:str, label_frequency:str):
    # peptide should be the peptide in string format, no modifications. 
    # charge should be the one listed in the tsv file. The charge is used to denote if there is a charge to n-term, R or K (-0.7168) or if there is a charge on a H (-0.5809) All values are verfied through Chemdraw.
    # carbon is used to deonte the aldehyde length as the different carbons result in different set changes to clog-p, it is also the key to the mod_dict which denotes this change for [monoalkylation, dialkylation]
    # labeleding frequency is from the df and is denoted as "# of monolabeled , # of dilabeled"
    # the function will return 3 different values:
    
    # 1. The clogp genereated from rdkit
    # 2. The charged and labeled clogp using the modification to clogp from charges on the histidine, n-term, lysine, and arginine as well as the amount of mono or di -labeleding
    # 3. The molecule denoted in smiles format
    
    import rdkit
    from rdkit.Chem import Descriptors
    from rdkit.Chem import Crippen
    from rdkit import Chem
    
    # mod_dict contains all the changes to clogp for the monoalkylation and dialkylation for each carbon.
    mod_dict = {"C1":[-0.05113,-0.0974],
            "C2":[0.3414,0.6828],
            "C3":[0.72907,1.46057],
            "C4":[1.11917,2.24077],
            "C5":[1.50927,3.02097],
            "C6":[1.89937,3.70134]}
    
    mono_di_labeled_values = mod_dict[carbon]
    
    smiles=Chem.MolToSmiles(Chem.MolFromFASTA(peptide))

    mol=Chem.MolFromSmiles(smiles)
    clogp = Descriptors.MolLogP(mol)
    
    # assessing the the amount of charges on n-term, lysine, and arginine vs histidine. I am assuming charges are more likely to be on K, R and n-termini vs Histidine
    # starting with 1 for the n-termini
    non_his_charge_sites = 1
    non_his_charge_sites += peptide.count("K")
    non_his_charge_sites += peptide.count("R")
    
    histidine_charge_sites = int(charge) - non_his_charge_sites
    
    # calculating the difference from charges
    charged_labeled_clogp = clogp + non_his_charge_sites*-0.7168 + histidine_charge_sites*-0.5809
    
    # calculating the difference from the labeles
    # monolabeled
    charged_labeled_clogp = clogp + int(label_frequency.split(",")[0]) * mono_di_labeled_values[0] + int(label_frequency.split(",")[1]) * mono_di_labeled_values[1]
    
    return(clogp, charged_labeled_clogp, smiles)
    
    
def gravy_calc(pep_seq, normalization = 1):
    # calculates the average gravy value for amino acids in a peptide
    aa_dict = {'A': 1.8,
               'C': 2.5,
               'D': -3.5,
               'E': -3.5,
               'F': 2.8,
               'G': -0.4,
               'H': -3.2,
               'I': 4.5,
               'K': -3.9,
               'L': 3.8,
               'M': 1.9,
               'N': -3.5,
               'P': -1.6,
               'Q': -3.5,
               'R': -4.5,
               'S': -0.8,
               'T': -0.7,
               'V': 4.2,
               'W': -0.9,
               'Y': -1.3,
               "X": 0
              }
    if normalization == 1:
        pep_hp = sum(aa_dict[i] for i in pep_seq) / len(pep_seq)
    elif normalization == 0:
        pep_hp = sum(aa_dict[i] for i in pep_seq)
    else:
        print('Normalzation only allows 0 or 1, default is 1, meaning the values is normalized by total pep length')     
    return (pep_hp)


### used to calculate bb_index score    
def bb_index(pep_seq, normalization = 1):

    aa_dict = {'A': 2.55,
               'C': 1.506,
               'D': 2.55,
               'E': 2.133,
               'F': -6.36,
               'G': 3.39,
               'H': 2.89,
               'I': -6.06,
               'K': 1.92,
               'L': -6.9,
               'M': -2.76,
               'N': 3.72,
               'P': -0.71,
               'Q': 4.05,
               'R': 2.89,
               'S': 1.76,
               'T': 1.21,
               'V': -3.14,
               'W': -5.02,
               'Y': -5.98,
               "X": 0
              }
    if normalization == 1:
        pep_hp = sum(aa_dict[i] for i in pep_seq) / len(pep_seq)
    elif normalization == 0:
        pep_hp = sum(aa_dict[i] for i in pep_seq)
    else:
        print('Normalzation only allows 0 or 1, default is 1, meaning the values is normalized by total pep length')     
    return (pep_hp)
    

def peptide_hydrophobicity_analysis(df_from_intensity_transformation, aldehyde, output_tsv_name):
    ### Adds the data to the output df from intensity transformation analysis and outputs it to a final tsv file for analysis
    df_from_intensity_transformation["Gravy Score"] = df_from_intensity_transformation.apply(lambda row: pd.Series(gravy_calc(row['peptide'])), axis=1)
    df_from_intensity_transformation["BB Index"] = df_from_intensity_transformation.apply(lambda row: pd.Series(bb_index(row['peptide'])), axis=1)
    df_from_intensity_transformation[["ClogP", "Charge Labeled ClogP", "SMILES"]] = df_from_intensity_transformation.apply(lambda row: pd.Series(logp_calc_manual(row['peptide'], row['charge'], aldehyde, row['label_frequency'])), axis=1)
    df_from_intensity_transformation.to_csv(output_tsv_name, sep = "\t")
    return(df_from_intensity_transformation)

# peptide_hydrophobicity_analysis(c1_labeled_df, "C1", "testy.tsv")
# logp_calc_manual('SNVSDAHVAQSTR', 3, "C1", "1,1")

In [35]:
### same control used for all samples
ecoli_control = 'Input_files/Control_files/ecoli_std_psm.tsv'

### C1
print("C1 Ecoli r-value (unlabeled, labeled): ")
c1_unlabeled_sample = 'Input_files/Unlabeled_files/c1_ul_02_psm.tsv'
c1_ecoli_labeled = "Input_files/Labeled_files/C1/c1_l_01_psm.tsv"
c1_hek_labeled = "Input_files/Labeled_files/C1/c1_l_01_psm.tsv"

final_hek_unlabeled_tsv = "Processed_files/Unlabeled_files/C1_Unlabeled_Hek.tsv"
final_hek_labeled_tsv = "Processed_files/Labeled_files/C1_Labeled_Hek.tsv"

#finding the selected hek and ecoli psms
ecoli_U_df, ecoli_L_df, ecoli_C_df = ecoli_psm_selection(c1_unlabeled_sample, c1_ecoli_labeled, ecoli_control, 200, "Processed_files/Control_files/ecoli_std.tsv")
c1_U_hek_df, c1_L_hek_df = human_psm_selection(unlabeled_hek_selection(c1_unlabeled_sample), labeled_hek_selection(c1_hek_labeled, C1))

#summarizing the preceeding and succeeding ecoli psms to all the hek labeled and unlabeled psms
c1_preceding_succeeding_labeled_df = intensity_transformation_analysis(c1_L_hek_df, ecoli_L_df, "Labeled")    
c1_preceding_succeeding_unlabeled_df = intensity_transformation_analysis(c1_U_hek_df, ecoli_U_df,  "Unlabeled")

#outputing the data with the hydrophobicity info
peptide_hydrophobicity_analysis(c1_preceding_succeeding_labeled_df, "C1", final_hek_unlabeled_tsv)
peptide_hydrophobicity_analysis(c1_preceding_succeeding_labeled_df, "C1", final_hek_labeled_tsv)

### C2
print("C2 Ecoli r-value (unlabeled, labeled): ")
c2_unlabeled_sample = 'Input_files/Unlabeled_files/c2-c6_ul_01_psm.tsv'
c2_ecoli_labeled = "Input_files/Labeled_files/C2/c2_ecoli_l_02_psm.tsv"
c2_hek_labeled = "Input_files/Labeled_files/C2/c2_hek_l_02_psm.tsv"

final_hek_unlabeled_tsv = "Processed_files/Unlabeled_files/C2_Unlabeled_Hek.tsv"
final_hek_labeled_tsv = "Processed_files/Labeled_files/C2_Labeled_Hek.tsv"

#finding the selected hek and ecoli psms
ecoli_U_df, ecoli_L_df, ecoli_C_df = ecoli_psm_selection(c2_unlabeled_sample, c2_ecoli_labeled, ecoli_control, 200, "Processed_files/Control_files/ecoli_std.tsv")
c2_U_hek_df, c2_L_hek_df = human_psm_selection(unlabeled_hek_selection(c2_unlabeled_sample), labeled_hek_selection(c2_hek_labeled, C2))


#summarizing the preceeding and succeeding ecoli psms to all the hek labeled and unlabeled psms
c2_preceding_succeeding_labeled_df = intensity_transformation_analysis(c2_L_hek_df, ecoli_L_df, "Labeled")    
c2_preceding_succeeding_unlabeled_df = intensity_transformation_analysis(c2_U_hek_df, ecoli_U_df,  "Unlabeled")

#outputing the data with the hydrophobicity info
peptide_hydrophobicity_analysis(c2_preceding_succeeding_labeled_df, "C2", final_hek_unlabeled_tsv)
peptide_hydrophobicity_analysis(c2_preceding_succeeding_labeled_df, "C2", final_hek_labeled_tsv)

### C3
print("C3 Ecoli r-value (unlabeled, labeled): ")
c3_unlabeled_sample = 'Input_files/Unlabeled_files/c2-c6_ul_01_psm.tsv'
c3_ecoli_labeled = "Input_files/Labeled_files/C3/c3_ecoli_l_02_psm.tsv"
c3_hek_labeled = "Input_files/Labeled_files/C3/c3_hek_l_02_psm.tsv"

final_hek_unlabeled_tsv = "Processed_files/Unlabeled_files/C3_Unlabeled_Hek.tsv"
final_hek_labeled_tsv = "Processed_files/Labeled_files/C3_Labeled_Hek.tsv"

#finding the selected hek and ecoli psms
ecoli_U_df, ecoli_L_df, ecoli_C_df = ecoli_psm_selection(c3_unlabeled_sample, c3_ecoli_labeled, ecoli_control, 200, "Processed_files/Control_files/ecoli_std.tsv")
c3_U_hek_df, c3_L_hek_df = human_psm_selection(unlabeled_hek_selection(c3_unlabeled_sample), labeled_hek_selection(c3_hek_labeled, C3))


#summarizing the preceeding and succeeding ecoli psms to all the hek labeled and unlabeled psms
c3_preceding_succeeding_labeled_df = intensity_transformation_analysis(c3_L_hek_df, ecoli_L_df, "Labeled")    
c3_preceding_succeeding_unlabeled_df = intensity_transformation_analysis(c3_U_hek_df, ecoli_U_df,  "Unlabeled")

#outputing the data with the hydrophobicity info
peptide_hydrophobicity_analysis(c3_preceding_succeeding_labeled_df, "C3", final_hek_unlabeled_tsv)
peptide_hydrophobicity_analysis(c3_preceding_succeeding_labeled_df, "C3", final_hek_labeled_tsv)

### C4
print("C4 Ecoli r-value (unlabeled, labeled): ")
c4_unlabeled_sample = 'Input_files/Unlabeled_files/c2-c6_ul_01_psm.tsv'
c4_ecoli_labeled = "Input_files/Labeled_files/C4/c4_ecoli_l_02_psm.tsv"
c4_hek_labeled = "Input_files/Labeled_files/C4/c4_hek_l_02_psm.tsv"

final_hek_unlabeled_tsv = "Processed_files/Unlabeled_files/C4_Unlabeled_Hek.tsv"
final_hek_labeled_tsv = "Processed_files/Labeled_files/C4_Labeled_Hek.tsv"

#finding the selected hek and ecoli psms
ecoli_U_df, ecoli_L_df, ecoli_C_df = ecoli_psm_selection(c4_unlabeled_sample, c4_ecoli_labeled, ecoli_control, 200, "Processed_files/Control_files/ecoli_std.tsv")
c4_U_hek_df, c4_L_hek_df = human_psm_selection(unlabeled_hek_selection(c4_unlabeled_sample), labeled_hek_selection(c4_hek_labeled, C4))


#summarizing the preceeding and succeeding ecoli psms to all the hek labeled and unlabeled psms
c4_preceding_succeeding_labeled_df = intensity_transformation_analysis(c4_L_hek_df, ecoli_L_df, "Labeled")    
c4_preceding_succeeding_unlabeled_df = intensity_transformation_analysis(c4_U_hek_df, ecoli_U_df,  "Unlabeled")

#outputing the data with the hydrophobicity info
peptide_hydrophobicity_analysis(c4_preceding_succeeding_labeled_df, "C4", final_hek_unlabeled_tsv)
peptide_hydrophobicity_analysis(c4_preceding_succeeding_labeled_df, "C4", final_hek_labeled_tsv)

### C5
print("C5 Ecoli r-value (unlabeled, labeled): ")
c5_unlabeled_sample = 'Input_files/Unlabeled_files/c2-c6_ul_01_psm.tsv'
c5_ecoli_labeled = "Input_files/Labeled_files/C5/c5_ecoli_l_01_psm.tsv"
c5_hek_labeled = "Input_files/Labeled_files/C5/c5_hek_l_01_psm.tsv"

final_hek_unlabeled_tsv = "Processed_files/Unlabeled_files/C5_Unlabeled_Hek.tsv"
final_hek_labeled_tsv = "Processed_files/Labeled_files/C5_Labeled_Hek.tsv"

#finding the selected hek and ecoli psms
ecoli_U_df, ecoli_L_df, ecoli_C_df = ecoli_psm_selection(c5_unlabeled_sample, c5_ecoli_labeled, ecoli_control, 200, "Processed_files/Control_files/ecoli_std.tsv")
c5_U_hek_df, c5_L_hek_df = human_psm_selection(unlabeled_hek_selection(c5_unlabeled_sample), labeled_hek_selection(c5_hek_labeled, C5))


#summarizing the preceeding and succeeding ecoli psms to all the hek labeled and unlabeled psms
c5_preceding_succeeding_labeled_df = intensity_transformation_analysis(c5_L_hek_df, ecoli_L_df, "Labeled")    
c5_preceding_succeeding_unlabeled_df = intensity_transformation_analysis(c5_U_hek_df, ecoli_U_df,  "Unlabeled")

#outputing the data with the hydrophobicity info
peptide_hydrophobicity_analysis(c5_preceding_succeeding_labeled_df, "C5", final_hek_unlabeled_tsv)
peptide_hydrophobicity_analysis(c5_preceding_succeeding_labeled_df, "C5", final_hek_labeled_tsv)

### C6
print("C6 Ecoli r-value (unlabeled, labeled): ")
c6_unlabeled_sample = 'Input_files/Unlabeled_files/c2-c6_ul_01_psm.tsv'
c6_ecoli_labeled = "Input_files/Labeled_files/C6/c6_ecoli_l_02_psm.tsv"
c6_hek_labeled = "Input_files/Labeled_files/C6/c6_hek_l_02_psm.tsv"

final_hek_unlabeled_tsv = "Processed_files/Unlabeled_files/C6_Unlabeled_Hek.tsv"
final_hek_labeled_tsv = "Processed_files/Labeled_files/C6_Labeled_Hek.tsv"

#finding the selected hek and ecoli psms
ecoli_U_df, ecoli_L_df, ecoli_C_df = ecoli_psm_selection(c6_unlabeled_sample, c6_ecoli_labeled, ecoli_control, 200, "Processed_files/Control_files/ecoli_std.tsv")
c6_U_hek_df, c6_L_hek_df = human_psm_selection(unlabeled_hek_selection(c6_unlabeled_sample), labeled_hek_selection(c6_hek_labeled, C6))


#summarizing the preceeding and succeeding ecoli psms to all the hek labeled and unlabeled psms
c6_preceding_succeeding_labeled_df = intensity_transformation_analysis(c6_L_hek_df, ecoli_L_df, "Labeled")    
c6_preceding_succeeding_unlabeled_df = intensity_transformation_analysis(c6_U_hek_df, ecoli_U_df,  "Unlabeled")

#outputing the data with the hydrophobicity info
peptide_hydrophobicity_analysis(c6_preceding_succeeding_labeled_df, "C6", final_hek_unlabeled_tsv)
peptide_hydrophobicity_analysis(c6_preceding_succeeding_labeled_df, "C6", final_hek_labeled_tsv)

C1 Ecoli r-value (unlabeled, labeled): 
0.9092555369162675
0.9681027103193446
C2 Ecoli r-value (unlabeled, labeled): 
0.9523248411728156
0.8940109330685092
C3 Ecoli r-value (unlabeled, labeled): 
0.9523248411728156
0.9873316097663668
C4 Ecoli r-value (unlabeled, labeled): 
0.9523248411728156
0.9916451967742617
C5 Ecoli r-value (unlabeled, labeled): 
0.9523248411728156
0.9901629461767284
C6 Ecoli r-value (unlabeled, labeled): 
0.9523248411728156
0.9852790471750184


Unnamed: 0,peptide,modified_peptide,unique,charge,rt,intensity,modification,carbon chains,label_frequency,protein,...,succeeding Ecoli peptide ID,succeeding Ecoli int,succeeding HEK/Ecoli int ratio,succeeding Ecoli rt,succeeding Ecoli delta rt,Gravy Score,BB Index,ClogP,Charge Labeled ClogP,SMILES
0,QRQPPLLGDHPAEYGGPHGGYHSHYHDEGYGPPPPHYEGR,n[169]QRQPPLLGDHPAEYGGPHGGYHSHYHDEGYGPPPPHYEGR,yes,6,1370.7963,30010352,N-term(168.1878),2,01,sp|P14866|HNRPL_HUMAN,...,n[29]AIGEAK[156]DDDTADILTAASR_3,62205996.0,0.482435,1379.3599,8.5636,-1.665000,0.619225,-18.35166,-14.65032,CC(C)C[C@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@@H]1CC...
4,GGGQIIPTAR,n[85]GGGQIIPTAR,no,2,1616.5505,20913772,N-term(84.0939),1,10,sp|P13639|EF2_HUMAN,...,n[29]VYSGVVNSGDTVLNSVK[156]_3,233705136.0,0.089488,1633.0502,16.4997,-0.070000,0.804000,-5.81643,-3.91706,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(N)=O)NC(=O)CNC...
10,RGEAHLAVNDFELAR,n[85]RGEAHLAVNDFELAR,no,3,1688.6953,10456103,N-term(84.0939),1,10,sp|Q02790|FKBP4_HUMAN,...,n[29]FGDVGADTLGHIAEACAK[156]_3,68831008.0,0.151910,1704.8721,16.1768,-0.440000,0.463067,-7.56046,-5.66109,CC(C)C[C@H](NC(=O)[C@H](Cc1c[nH]cn1)NC(=O)[C@H...
16,SEEAHAEDSVMDHHFR,n[169]SEEAHAEDSVMDHHFR,no,5,1799.5223,20396804,N-term(168.1878),2,01,sp|Q8NC51-2|PAIRB_HUMAN,...,n[29]ALEIEEMQLK[156]_2,78187608.0,0.260870,1800.1161,0.5938,-1.293750,1.213688,-9.94903,-6.24769,CSCC[C@H](NC(=O)[C@@H](NC(=O)[C@H](CO)NC(=O)[C...
17,SEEAHAEDSVMDHHFR,n[169]SEEAHAEDSVMDHHFR,no,4,1799.7818,24315040,N-term(168.1878),2,01,sp|Q8NC51-2|PAIRB_HUMAN,...,n[29]ALEIEEMQLK[156]_2,78187608.0,0.310983,1800.1161,0.3343,-1.293750,1.213688,-9.94903,-6.24769,CSCC[C@H](NC(=O)[C@@H](NC(=O)[C@H](CO)NC(=O)[C...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
998,YQILPLHSQIPR,n[169]YQILPLHSQIPR,yes,3,2521.9601,104222719,N-term(168.1878),2,01,sp|Q08211|DHX9_HUMAN,...,,,,,,-0.283333,-1.473333,-3.37143,0.32991,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(N)=O)NC(=O)[C@...
999,VDATEESDLAQQYGVR,n[169]VDATEESDLAQQYGVR,yes,3,2522.3374,30247799,N-term(168.1878),2,01,sp|P07237|PDIA1_HUMAN,...,,,,,,-0.806250,0.791000,-10.50573,-6.80439,CC(C)C[C@H](NC(=O)[C@H](CC(=O)O)NC(=O)[C@H](CO...
1001,TVVSGLVNHVPLEQMQNR,n[169]TVVSGLVNHVPLEQMQNR,yes,3,2523.8627,45179120,N-term(168.1878),2,01,sp|Q12904-2|AIMP1_HUMAN,...,,,,,,-0.133333,-0.000944,-10.00443,-6.30309,CSCC[C@H](NC(=O)[C@H](CCC(N)=O)NC(=O)[C@H](CCC...
1002,ADEGISFR,n[169]ADEGISFR,yes,2,2524.1482,47391308,N-term(168.1878),2,01,sp|Q06830|PRDX1_HUMAN,...,,,,,,-0.450000,0.356625,-4.67293,-0.97159,CC[C@H](C)[C@H](NC(=O)CNC(=O)[C@H](CCC(=O)O)NC...
