Drive Mount and Package Import

In [None]:
from google.colab import drive
drive.mount('/content/drive')

!pip install bioinfokit
!pip install seaborn
!pip install seaborn[stats]

from IPython.display import display, HTML
from pandas.core.algorithms import duplicated
from pandas.errors import AccessorRegistrationWarning
from scipy import stats as st
from scipy.stats import ttest_ind_from_stats
from scipy.stats import combine_pvalues
from bioinfokit import analys, visuz
import math
import os
import glob
import pandas as pd
import re
import numpy as np
from numpy.ma.core import mean
import matplotlib as mpl
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import csv
import statistics
from sklearn.impute import KNNImputer


In [None]:
# Set Global Variables
# Plot thresholds: significance on the -10*log10(p) scale, fold change on the log2 scale.
significance_cutoff = -10 * np.log10(0.05)
change_cutoff = np.log2(1)
# FC_sex is interpolated into the path of the combined fold-change CSVs.
FC_sex = 'Male-Female'
ontologyDB_sex = 'Male-Female'
# Module-level comparison label read by the top-level scaling functions.
comparison = None

In [None]:
# Data Scaling functions
def auto_scaling(df, column=None):
  """Autoscale a fold-change column: subtract its mean, divide by its standard deviation.

  Args:
    df: DataFrame holding the column to scale; the column is overwritten in place.
    column: Name of the column to scale. When omitted, falls back to
      f'{comparison}_Fold_change' built from the module-level `comparison`.

  Returns:
    The same DataFrame object with the scaled column.
  """
  if column is None:
    # Legacy behaviour: the column name depends on the global `comparison`.
    column = f'{comparison}_Fold_change'
  values = df[column]
  # Mean and std are taken from the unscaled values (np.std defaults to ddof=0).
  df[column] = (values - np.mean(values)) / np.std(values)
  return(df)

def range_scaling(df):
  """Range-scale the input: subtract its mean and divide by (max - min).

  Unlike the other scaling helpers, this one works on the whole object passed
  in rather than on a single named column, and returns a new object.
  """
  # All three statistics are taken from the unscaled input.
  span = np.max(df) - np.min(df)
  centred = df - np.mean(df)
  return(centred / span)

def pareto_scaling(df, column=None):
  """Pareto-scale a fold-change column: subtract its mean, divide by sqrt(std).

  Args:
    df: DataFrame holding the column to scale; the column is overwritten in place.
    column: Name of the column to scale. When omitted, falls back to
      f'{comparison}_Fold_change' built from the module-level `comparison`.

  Returns:
    The same DataFrame object with the scaled column.
  """
  if column is None:
    # Legacy behaviour: the column name depends on the global `comparison`.
    column = f'{comparison}_Fold_change'
  values = df[column]
  # Mean and std are taken from the unscaled values (np.std defaults to ddof=0).
  df[column] = (values - np.mean(values)) / np.sqrt(np.std(values))
  return(df)

def vast_scaling(df, column=None):
  """VAST-scale a fold-change column: autoscale it, then multiply by mean/std.

  Args:
    df: DataFrame holding the column to scale; the column is overwritten in place.
    column: Name of the column to scale. When omitted, falls back to
      f'{comparison}_Fold_change' built from the module-level `comparison`.

  Returns:
    The same DataFrame object with the scaled column.
  """
  if column is None:
    # Legacy behaviour: the column name depends on the global `comparison`.
    column = f'{comparison}_Fold_change'
  values = df[column]
  col_mean = np.mean(values)
  col_std = np.std(values)
  # Autoscale first, then weight by mean/std of the unscaled values.
  df[column] = ((values - col_mean) / col_std) * (col_mean / col_std)
  return(df)

def level_scaling(df, column=None):
  """Level-scale a fold-change column: subtract its mean, then divide by that mean.

  Args:
    df: DataFrame holding the column to scale; the column is overwritten in place.
    column: Name of the column to scale. When omitted, falls back to
      f'{comparison}_Fold_change' built from the module-level `comparison`.

  Returns:
    The same DataFrame object with the scaled column.
  """
  if column is None:
    # Legacy behaviour: the column name depends on the global `comparison`.
    column = f'{comparison}_Fold_change'
  values = df[column]
  col_mean = np.mean(values)
  df[column] = (values - col_mean) / col_mean
  return(df)

This section analyzes the Peaks protein.csv outputs.

In [None]:
# Normalization (Aguilan et Al) and stats workflow
from numpy.core.shape_base import atleast_3d

# Establish the location of the protein.csv files, and create a list of all filenames
# NOTE(review): `sex` is not assigned in the cells shown above (only FC_sex / ontologyDB_sex are);
# confirm it is defined before this cell runs.
where_are_the_files = f'/content/drive/MyDrive/ApoE Analysis January-11-2023/{sex}/Peaks_DB_Data'
os.chdir(where_are_the_files)
all_file_names = [i for i in glob.glob('*.{}'.format('csv'))]

# Create a dataframe for each comparison; the loop appends every file's PV/FC rows to these
E2vE3_PVFC_DF = pd.DataFrame()
E4vE3_PVFC_DF = pd.DataFrame()
E4vE2_PVFC_DF = pd.DataFrame()

# This loop uses the filename list to normalize and analyze every file individually
for dataSet in all_file_names:
  # Drop only the extension. str.strip('.csv') removes any of the characters '.', 'c', 's', 'v'
  # from BOTH ends of the name, which mangles names that start or end with those letters.
  fileName = os.path.splitext(dataSet)[0]
  dataDF = pd.read_csv(dataSet)

  # Set the 'Accession' as the index, filter out proteins with <2 unique peptides, keep rows flagged 'Top', and keep only the 'Area' columns.
  dataDF = dataDF.set_index('Accession')
  dataDF = dataDF.loc[(dataDF['#Unique'] >= 2),]
  dataDF = dataDF.loc[dataDF['Top'] == True,]
  dataDF = dataDF.filter(regex='Area')

  # Replace NaN with 0 so that missing values and zeros are counted the same way, then count non-zero values per protein
  dataDF = dataDF.replace(np.nan,0)
  dataDF['nonZero_Count'] = (dataDF != 0).astype(int).sum(axis=1)

  # Create a genotype specific dataframe for the purpose of filtering out proteins with (n-1) valid values in each genotype sample set
  E2_DF = dataDF.filter(regex='A2').copy()
  E3_DF = dataDF.filter(regex='A3').copy()
  E4_DF = dataDF.filter(regex='A4').copy()

  # Calculate number of samples (columns) in each condition
  cols_E2_DF = len(E2_DF.axes[1])
  cols_E3_DF = len(E3_DF.axes[1])
  cols_E4_DF = len(E4_DF.axes[1])

  # Calculate the number of non-zero values in each condition
  E2_DF['E2_nonZero_Count'] = (E2_DF != 0).astype(int).sum(axis=1)
  E3_DF['E3_nonZero_Count'] = (E3_DF != 0).astype(int).sum(axis=1)
  E4_DF['E4_nonZero_Count'] = (E4_DF != 0).astype(int).sum(axis=1)

  # Keep only proteins with at least n-1 non-zero values (at most one missing value) in the genotype
  E2_DF = E2_DF.loc[(E2_DF['E2_nonZero_Count'] >= (cols_E2_DF-1)),]
  E3_DF = E3_DF.loc[(E3_DF['E3_nonZero_Count'] >= (cols_E3_DF-1)),]
  E4_DF = E4_DF.loc[(E4_DF['E4_nonZero_Count'] >= (cols_E4_DF-1)),]

  # Create comparison specific dataframes for analysis; dropna() removes proteins that did not pass the filter in both genotypes, filter() drops the count columns
  E2vE3_comparison_DF = pd.concat([E2_DF,E3_DF],axis=1).dropna().filter(regex='Area').copy()
  E4vE3_comparison_DF = pd.concat([E4_DF,E3_DF],axis=1).dropna().filter(regex='Area').copy()
  E4vE2_comparison_DF = pd.concat([E4_DF,E2_DF],axis=1).dropna().filter(regex='Area').copy()

  # Create a dataframe with all genotypes; remaining zeros become NaN so they can be imputed after normalization
  E4vE3vE2_comparison_DF = pd.concat([E4_DF,E3_DF,E2_DF],axis=1).dropna().filter(regex='Area').copy()
  E4vE3vE2_comparison_DF = E4vE3vE2_comparison_DF.replace(0,np.nan)
  # This function returns the normalized dataset
  def slope_normalize(input_df):
    """Log2-transform, centre, slope-normalize and impute an area table.

    Steps: log2 of a copy of the input (with -inf turned into NaN), subtract
    each column's mean, divide each column by the slope of its regression
    against the per-row average, then fill missing values with a 2-neighbour
    KNN imputer. A density plot of the result is drawn as a side effect.
    Returns the normalized frame restricted to the 'Area' columns.
    """
    # Log2 transform; zeros become -inf, which are treated as missing.
    log_df = np.log2(input_df.copy()).replace(-np.inf, np.nan)

    # Normalize by the average: centre every sample on its own mean.
    for column in log_df.columns:
      log_df[column] = log_df[column] - np.mean(log_df[column])

    # Normalize by the slope of each sample against the per-protein average.
    # 'protein_avg' is itself part of the loop; it regresses against itself.
    log_df['protein_avg'] = np.mean(log_df, axis=1)
    for column in log_df.columns:
      reference = log_df['protein_avg']
      observed = log_df[column]
      usable = ~np.isnan(reference) & ~np.isnan(observed)
      fit = st.linregress(reference[usable], observed[usable])
      log_df[column] = log_df[column] / fit[0]
    area_df = log_df.filter(regex='Area')

    # Impute missing values using KNN imputer
    knn = KNNImputer(n_neighbors=2)
    filled = knn.fit_transform(area_df)
    normalized_DF = pd.DataFrame(filled, columns=area_df.columns, index=area_df.index)
    normalized_DF.plot.density(figsize=(10, 10))
    return(normalized_DF)

  # Normalize the dataframe that contains all three genotypes in a single pass
  E4vE3vE2_norm_DF = slope_normalize(E4vE3vE2_comparison_DF)

  # Create normalized dataframes for each comparison by selecting the matching genotype columns
  E2vE3_norm_DF = E4vE3vE2_norm_DF.filter(regex='A2|A3').copy()
  E4vE3_norm_DF = E4vE3vE2_norm_DF.filter(regex='A4|A3').copy()
  E4vE2_norm_DF = E4vE3vE2_norm_DF.filter(regex='A4|A2').copy()

  # Store the normalized comparisons as CSVs, one file per input dataset and comparison
  # NOTE(review): `sex` is not assigned in the visible cells - confirm it is defined before this runs.
  E2vE3_norm_DF.to_csv(f'/content/drive/MyDrive/ApoE Analysis January-11-2023/{sex}/Normalized_Peaks_Data/{fileName}_E2vE3_Normalized_DataSet.csv',index=True)
  E4vE3_norm_DF.to_csv(f'/content/drive/MyDrive/ApoE Analysis January-11-2023/{sex}/Normalized_Peaks_Data/{fileName}_E4vE3_Normalized_DataSet.csv',index=True)
  E4vE2_norm_DF.to_csv(f'/content/drive/MyDrive/ApoE Analysis January-11-2023/{sex}/Normalized_Peaks_Data/{fileName}_E4vE2_Normalized_DataSet.csv',index=True)

  # This function tests the datasets for equality of variance using an F-test; based on the results of the F-test, it will calculate statistical significance for each comparison; Finally, it will calcualte the Fold change
  def stats_calc(input_df,comparison=None):
    """Compute a per-protein P-value and fold change for one genotype comparison.

    For every protein (row) an F-test on the two genotypes' sample variances
    decides between Student's t-test (equal variances) and Welch's t-test
    (unequal variances). The fold change is mean(b) - mean(a) on the values
    as given.

    Args:
      input_df: DataFrame indexed by protein whose column names contain the
        genotype tags ('A2', 'A3', 'A4').
      comparison: One of 'E2vE3', 'E4vE3', 'E4vE2'.

    Returns:
      DataFrame indexed by 'Accession' with columns 'P_val', 'variable',
      'Fold_change' and '-log10(PV)' (computed as -10*log10(P_val)).

    Raises:
      ValueError: if `comparison` is not one of the supported labels.
    """
    # (reference genotype a, compared genotype b) for every supported comparison
    allele_pairs = {
      'E2vE3': ('A3', 'A2'),
      'E4vE3': ('A3', 'A4'),
      'E4vE2': ('A2', 'A4'),
    }
    if comparison not in allele_pairs:
      raise ValueError(f'Unknown comparison {comparison!r}; expected one of {sorted(allele_pairs)}')
    allele_a, allele_b = allele_pairs[comparison]

    # Create genotype specific dataframes
    comparison_DF_a = input_df.filter(regex=allele_a).copy()
    comparison_DF_b = input_df.filter(regex=allele_b).copy()

    # Create dictionaries to store PVs and FCs
    comparison_p_val_dict = {}
    comparison_FC_dict = {}

    for protein in input_df.index.to_list():
      a = comparison_DF_a.loc[protein,]
      b = comparison_DF_b.loc[protein,]

      # F-test for equality of variance: the larger variance goes in the numerator,
      # and its degrees of freedom are the numerator degrees of freedom.
      variance_a = np.var(a, ddof=1)
      variance_b = np.var(b, ddof=1)
      if variance_a > variance_b:
        f_value = variance_a / variance_b
        df_num, df_den = len(a) - 1, len(b) - 1
      else:
        f_value = variance_b / variance_a
        df_num, df_den = len(b) - 1, len(a) - 1

      # Upper-tail probability in BOTH cases. Taking the CDF when variance_b is the
      # larger one returned a value close to 1 and so never selected Welch's test.
      f_PV = st.f.sf(f_value, df_num, df_den)

      # Equal variances only when the F-test is clearly non-significant; ties at 0.05
      # and undefined F-tests (NaN) fall back to Welch's test instead of storing nothing.
      equal_var = bool(f_PV > 0.05)
      comparison_p_val_dict[protein] = st.ttest_ind(a,b, equal_var=equal_var, alternative='two-sided',nan_policy='omit')[1]

      # Calculate FC and store it in FC dictionary
      comparison_FC_dict[protein] = (np.mean(b)) - (np.mean(a))

    # Create a dataframe that contains both the PV and the FC
    comparison_PVFC = pd.DataFrame({
      'P_val': pd.Series(comparison_p_val_dict, dtype=float),
      'variable': f'{comparison}_FC',
      'Fold_change': pd.Series(comparison_FC_dict, dtype=float),
    })
    comparison_PVFC.index.name = 'Accession'
    comparison_PVFC['-log10(PV)'] = np.log10(comparison_PVFC['P_val'])*-10
    return(comparison_PVFC)

  # Use the stats_calc() function to calculate the PV and FC for each comparison of the current file
  E2vE3_DF = stats_calc(E2vE3_norm_DF,comparison='E2vE3')
  E4vE3_DF = stats_calc(E4vE3_norm_DF,comparison='E4vE3')
  E4vE2_DF = stats_calc(E4vE2_norm_DF,comparison='E4vE2')

  # Append this file's PV and FC rows to the accumulators created before the loop.
  # These rows are added before the BH correction below, so they carry no BH columns.
  E2vE3_PVFC_DF = pd.concat([E2vE3_PVFC_DF,E2vE3_DF])
  E4vE3_PVFC_DF = pd.concat([E4vE3_PVFC_DF,E4vE3_DF])
  E4vE2_PVFC_DF = pd.concat([E4vE2_PVFC_DF,E4vE2_DF])

  # This function Calculates the benjamini hochberg P-value correction for the DataSet
  def performBH_correction(input_df):
    """Add Benjamini-Hochberg adjusted P-values to a PV/FC table.

    The previous version stored (rank / number of tests) * 0.25, which is the
    BH critical value and depends on the rank only, so thresholding it flagged
    a fixed fraction of proteins whatever their P-values were. This version
    stores the adjusted P-value p * m / rank, made monotone from the largest
    P-value downwards and capped at 1.

    Args:
      input_df: DataFrame with a 'P_val' column; its index is moved into a
        regular column by reset_index(). The caller's frame is not modified.

    Returns:
      A new DataFrame sorted by ascending 'P_val' with a fresh integer index
      and the added columns 'Benjamini_Hochberg_pval' and '-10*log10(BH)'.
    """
    # Sorting keeps the original index values, so re-index to get rank order
    input_df = input_df.reset_index().sort_values(by='P_val').reset_index(drop=True)

    p_sorted = input_df['P_val'].astype(float).to_numpy()
    tested = ~np.isnan(p_sorted)          # missing P-values are not counted as tests
    n_tests = int(tested.sum())

    adjusted = np.full(len(p_sorted), np.nan)
    if n_tests > 0:
      ranks = np.arange(1, n_tests + 1)
      scaled = p_sorted[tested] * n_tests / ranks
      # Running minimum from the largest P-value down keeps the adjusted values monotone
      adjusted[tested] = np.minimum(np.minimum.accumulate(scaled[::-1])[::-1], 1.0)

    input_df['Benjamini_Hochberg_pval'] = adjusted
    input_df['-10*log10(BH)'] = np.log10(input_df['Benjamini_Hochberg_pval'].astype(float))*-10
    return(input_df)

  # Use performBH_correction() to add the BH columns; the returned frames are sorted by P_val
  # and carry 'Accession' as a regular column.
  E2vE3_DF = performBH_correction(E2vE3_DF)
  E4vE3_DF = performBH_correction(E4vE3_DF)
  E4vE2_DF = performBH_correction(E4vE2_DF)

  # Store the dataframes with the PV and FC calculations, one CSV per comparison and input file
  # NOTE(review): `sex` is not assigned in the visible cells - confirm it is defined before this runs.
  E2vE3_DF.to_csv(f'/content/drive/MyDrive/ApoE Analysis January-11-2023/{sex}/PVFC/E2vE3_PVFC_{fileName}.csv',index=True)
  E4vE3_DF.to_csv(f'/content/drive/MyDrive/ApoE Analysis January-11-2023/{sex}/PVFC/E4vE3_PVFC_{fileName}.csv',index=True)
  E4vE2_DF.to_csv(f'/content/drive/MyDrive/ApoE Analysis January-11-2023/{sex}/PVFC/E4vE2_PVFC_{fileName}.csv',index=True)

  # Create Volcano plots for the comparisons
  def plotVP(input_df,comparison=None):
    """Draw and save a volcano plot of 'Fold_change' against '-10*log10(BH)'.

    Args:
      input_df: DataFrame with 'Fold_change' and '-10*log10(BH)' columns. A
        'cutoff' column is added to (or overwritten on) this same object.
      comparison: Label used only in the name of the saved SVG file.

    Returns:
      The object returned by sns.scatterplot.

    NOTE(review): `FC` and `sex` are read as globals but are not assigned in
    the visible cells (the globals cell defines `change_cutoff`, not `FC`) -
    confirm where they are defined.
    """
    # No copy is made: the 'cutoff' column below is written onto the caller's frame
    df = input_df
    sns.set_style('white')
    plt.figure(figsize=(10, 10))
    sns.set(style='white', context='talk')
    # Dotted guide lines at +/- the fold-change threshold and at the significance threshold
    plt.axvline(x=FC, ymin=0, ymax=1, linestyle=':',color='gray')
    plt.axvline(x=-FC, ymin=0, ymax=1, linestyle=':',color='gray')
    plt.axhline(y=significance_cutoff, xmin=0, xmax=1, linestyle=':',color='gray')
    plt.xlabel("Fold_change")
    plt.ylabel("-10*log10(BH)")
    plt.title(f'P-value vs. Fold Change',fontsize=30)
    plt.xlim(-6, 6)

    # 'cutoff' is the hue category: 100 = default, 250 = significant and up, 15 = significant and down
    df['cutoff'] = ''

    # NOTE(review): for FC > 0 the condition (x < FC) | (x > -FC) holds for every non-NaN value,
    # so the next two lines assign the default 100 to all such rows; the last two lines then overwrite it.
    df.loc[((df['Fold_change'] < FC) | (df['Fold_change'] > -FC)) & (df['-10*log10(BH)'] < significance_cutoff),['cutoff']] = 100
    df.loc[((df['Fold_change'] < FC) | (df['Fold_change'] > -FC)) & (df['-10*log10(BH)'] >= significance_cutoff),['cutoff']] = 100
    df.loc[((df['Fold_change'] >= FC)) & (df['-10*log10(BH)'] >= significance_cutoff),['cutoff']] = 250
    df.loc[((df['Fold_change'] <= -FC)) & (df['-10*log10(BH)'] >= significance_cutoff),['cutoff']] = 15

    VP = sns.scatterplot(x='Fold_change',
                        y='-10*log10(BH)',
                        data=df,
                        legend=False,
                        palette='cividis',
                        hue='cutoff',
                        s = 350,
                        alpha=0.8)

    # The figure is written to Drive; `fileName` comes from the enclosing loop
    plt.savefig(f'/content/drive/MyDrive/ApoE Analysis January-11-2023/{sex}/Figures/{fileName}_{comparison}_VP.svg',format="svg",transparent=True)

    return(VP)

  # Draw and save one volcano plot per comparison for the current file
  plotVP(E2vE3_DF,'E2vE3')
  plotVP(E4vE3_DF,'E4vE3')
  plotVP(E4vE2_DF,'E4vE2')

  # Last statement of the loop body; execution moves on to the next file either way
  continue

The following section combines the fold changes and P-values from each dataset into a single P-value and fold change for each protein.

In [None]:
# NOTE(review): `sex` is not assigned in the visible cells - confirm it is defined before this cell runs.
where_are_the_files = f'/content/drive/MyDrive/ApoE Analysis January-11-2023/{sex}/PVFC'
os.chdir(where_are_the_files)
all_file_names = [i for i in glob.glob('*.{}'.format('csv'))]

# Collect the per-file tables by comparison and concatenate once at the end,
# instead of re-copying a growing dataframe on every iteration.
pvfc_frames = {'E2vE3': [], 'E4vE3': [], 'E4vE2': []}

for dataSet in all_file_names:
  # Drop only the extension; str.strip('.csv') strips the characters '.', 'c', 's', 'v' from both ends.
  fileName = os.path.splitext(dataSet)[0]
  PVFC_DF = pd.read_csv(dataSet).set_index('Accession').filter(regex='P_val|Fold_change')

  # The comparison label is part of the file name; the first matching label wins
  for label, frames in pvfc_frames.items():
    if label in dataSet:
      frames.append(PVFC_DF)
      break

# A comparison without any file stays an empty dataframe, as before
primary_E2vE3_PVFC = pd.concat(pvfc_frames['E2vE3']) if pvfc_frames['E2vE3'] else pd.DataFrame()
primary_E4vE3_PVFC = pd.concat(pvfc_frames['E4vE3']) if pvfc_frames['E4vE3'] else pd.DataFrame()
primary_E4vE2_PVFC = pd.concat(pvfc_frames['E4vE2']) if pvfc_frames['E4vE2'] else pd.DataFrame()

def combine_PV_and_FC(input_PVFC_DF):
  """Combine per-dataset P-values and fold changes into one row per protein.

  P-values of a protein are combined with scipy's combine_pvalues using the
  'mudholkar_george' method (the result column is nevertheless named
  'fisher_combined_PV'). Fold changes are averaged after undoing the log2
  and are then log2-transformed again. Benjamini-Hochberg adjusted P-values
  are added and a volcano plot is drawn as a side effect.

  Args:
    input_PVFC_DF: DataFrame indexed by 'Accession' (one row per protein per
      dataset) with 'P_val' and 'Fold_change' columns.

  Returns:
    DataFrame indexed by 'Accession' with 'fisher_combined_PV', 'Fold_change',
    '-10log10(comb_PV)', 'Benjamini_Hochberg_pval', '-10*log10(BH)' and the
    'cutoff' column written by the plotting helper.
  """
  protein_list = input_PVFC_DF.index.drop_duplicates()

  # Combine the P-values reported for each protein across datasets
  pv_df = input_PVFC_DF.copy().filter(regex='P_val')
  combined_PV_dict = {}

  for protein in protein_list:
    vals = pv_df['P_val'][protein].tolist()
    # ndmin=1 because a protein seen in a single dataset yields a scalar
    pv_array = np.array(vals, ndmin=1)
    combined_PV_dict[protein] = combine_pvalues(pv_array,method='mudholkar_george',weights=None)[1]

  # Calculate Average fold-change in linear space, then return to log2
  fc_df = input_PVFC_DF.copy().filter(regex='Fold_change')
  fc_df = 2**(fc_df)
  fc_df = fc_df.reset_index()
  fc_df = fc_df.groupby(by='Accession',as_index=True).mean()
  fc_df = np.log2(fc_df['Fold_change'])

  clean_PV_DF = pd.DataFrame.from_dict(combined_PV_dict.items()).rename(columns={0 : 'Accession',1 : 'fisher_combined_PV'}).set_index('Accession')
  cleaned_PVFC_DF = pd.concat([clean_PV_DF,fc_df],axis=1)
  cleaned_PVFC_DF['-10log10(comb_PV)'] = np.log10(cleaned_PVFC_DF['fisher_combined_PV'])*-10

  def performBH_correction(input_df):
    """Return the table sorted by combined P-value with BH adjusted P-values added.

    The previous version stored (rank / number of tests) * 0.25, the BH
    critical value, which depends on the rank only. The adjusted P-value
    p * m / rank (monotone, capped at 1) is stored instead.
    """
    # Sorting keeps the original index values, so re-index to get rank order
    input_df = input_df.reset_index().sort_values(by='fisher_combined_PV').reset_index(drop=True)

    p_sorted = input_df['fisher_combined_PV'].astype(float).to_numpy()
    tested = ~np.isnan(p_sorted)          # missing P-values are not counted as tests
    n_tests = int(tested.sum())

    adjusted = np.full(len(p_sorted), np.nan)
    if n_tests > 0:
      ranks = np.arange(1, n_tests + 1)
      scaled = p_sorted[tested] * n_tests / ranks
      # Running minimum from the largest P-value down keeps the adjusted values monotone
      adjusted[tested] = np.minimum(np.minimum.accumulate(scaled[::-1])[::-1], 1.0)

    input_df['Benjamini_Hochberg_pval'] = adjusted
    input_df['-10*log10(BH)'] = np.log10(input_df['Benjamini_Hochberg_pval'].astype(float))*-10
    return(input_df)

  cleaned_PVFC_DF = performBH_correction(cleaned_PVFC_DF).set_index('Accession')

  def plotVP(input_df):
    """Draw the volcano plot and print the number of significant up/down proteins.

    NOTE(review): `FC` is read as a global but is not assigned in the visible
    cells (the globals cell defines `change_cutoff`) - confirm where it is set.
    """
    # No copy is made: the 'cutoff' column is written onto the caller's frame
    df = input_df
    sns.set_style('white')
    plt.figure(figsize=(10, 10))
    sns.set(style='white', context='talk')
    plt.axvline(x=FC, ymin=0, ymax=1, linestyle=':',color='gray')
    plt.axvline(x=-FC, ymin=0, ymax=1, linestyle=':',color='gray')
    plt.axhline(y=significance_cutoff, xmin=0, xmax=1, linestyle=':',color='gray')
    plt.xlabel("Fold_change")
    plt.ylabel("-10*log10(BH)")
    plt.title(f'P-value vs. Fold Change',fontsize=30)
    # plt.xlim(-4,4)

    df['cutoff'] = ''

    # NOTE(review): for FC > 0 the condition (x < FC) | (x > -FC) holds for every non-NaN value,
    # so these two lines set the default '-' that the next two lines overwrite.
    df.loc[((df['Fold_change'] < FC) | (df['Fold_change'] > -FC)) & (df['-10*log10(BH)'] < significance_cutoff),['cutoff']] = '-'
    df.loc[((df['Fold_change'] < FC) | (df['Fold_change'] > -FC)) & (df['-10*log10(BH)'] > significance_cutoff),['cutoff']] = '-'
    df.loc[((df['Fold_change'] >= FC)) & (df['-10*log10(BH)'] >= significance_cutoff),['cutoff']] = 'sigUp'
    df.loc[((df['Fold_change'] <= -FC)) & (df['-10*log10(BH)'] >= significance_cutoff),['cutoff']] = 'sigDown'

    sigUp = len(df.loc[(df['cutoff'] == 'sigUp'),['cutoff']].index)
    sigDown = len(df.loc[(df['cutoff'] == 'sigDown'),['cutoff']].index)

    print(sigUp,sigDown)

    VP = sns.scatterplot(x='Fold_change',
                        y='-10*log10(BH)',
                        data=df,
                        legend=False,
                        palette='cividis',
                        hue='cutoff',
                        s = 350,
                        alpha=0.8)
    return(VP)

  plotVP(cleaned_PVFC_DF)
  return(cleaned_PVFC_DF)

# Combine the per-dataset statistics into one table per comparison (each call also draws a volcano plot)
primary_E2vE3_stats = combine_PV_and_FC(primary_E2vE3_PVFC)
primary_E4vE3_stats = combine_PV_and_FC(primary_E4vE3_PVFC)
primary_E4vE2_stats = combine_PV_and_FC(primary_E4vE2_PVFC)

# Store the combined tables; 'Accession' is the index and is written as the first CSV column
# NOTE(review): `sex` is not assigned in the visible cells - confirm it is defined before this runs.
primary_E2vE3_stats.to_csv(f'/content/drive/MyDrive/ApoE Analysis January-11-2023/{sex}/primary_PVFC/E2vE3_primary_PVFC.csv',index=True)
primary_E4vE3_stats.to_csv(f'/content/drive/MyDrive/ApoE Analysis January-11-2023/{sex}/primary_PVFC/E4vE3_primary_PVFC.csv',index=True)
primary_E4vE2_stats.to_csv(f'/content/drive/MyDrive/ApoE Analysis January-11-2023/{sex}/primary_PVFC/E4vE2_primary_PVFC.csv',index=True)


This portion requires a StringDB Multiprotein Tool output, an ID map from UniProt (Accession to Gene Name), and the abundance fold changes from the analysis above.

In [None]:
def string_conc_analysis(comparison = None):
  """Summarise protein abundance fold changes per STRING ontology term.

  Reads the combined PV/FC table of `comparison`, range-scales the fold
  changes, maps accessions to STRING protein IDs, and for every ontology
  term with at least 25 % coverage collects the member fold changes, the
  significant members, the average fold change and a one-sample t-test
  against 1. BH adjusted P-values are added and the table is written to CSV.

  Args:
    comparison: Comparison label used in file and column names
      (called below with 'E2vE3', 'E4vE3' and 'E4vE2').

  Returns:
    DataFrame with one row per ontology term that has a usable P-value.
  """
  fc_col = f'{comparison}_Fold_change'
  labels_col = 'matching proteins in your network (labels)'

  # Prepare the dataframe with the fold changes for each protein in the comparison.
  # The folder is 'primary_PVFC', the one the combine step writes to (was misspelled 'primaryy_PVFC').
  comparison_DF = pd.read_csv(f'/content/drive/MyDrive/ApoE Analysis January-11-2023/{FC_sex}/primary_PVFC/{comparison}_primary_PVFC.csv').set_index('Accession').rename(columns={'Fold_change':f'{comparison}_Fold_change'}).filter(regex=f'{comparison}|BH')

  # Scale Fold Change Data using Range Scaling
  def range_scaling(df):
    """Centre the fold-change column on its mean and divide by (max - min)."""
    df_min = np.min(df[fc_col])
    df_max = np.max(df[fc_col])
    df_mean = np.mean(df[fc_col])

    df[fc_col] = df[fc_col] - df_mean
    df[fc_col] = df[fc_col] / (df_max - df_min)

    df = df.filter(regex=f'{comparison}|BH')

    return(df)

  comparison_DF = range_scaling(comparison_DF)

  # Reverse the log2 for the calculations below
  comparison_DF[fc_col] = 2**comparison_DF[fc_col]

  # Keep only the part of the accession before the first '|' to match the ID map
  comparison_DF['Accession'] = [accession.split('|')[0] for accession in comparison_DF.index.to_list()]
  comparison_DF = comparison_DF.set_index('Accession')

  full_comparison_DF = comparison_DF

  STRING_ID_MAP = pd.read_csv('/content/drive/MyDrive/ApoE Analysis January-11-2023/String_ID_Map.tsv', sep='\t').rename(columns={'From':'Accession','To':'Protein_ID'})
  STRING_ID_MAP = STRING_ID_MAP.sort_values(by='Accession')
  STRING_ID_MAP = STRING_ID_MAP.drop_duplicates('Accession').set_index('Accession')

  # Proteins without a STRING ID (or without a fold change) are dropped here
  full_comparison_DF = pd.concat([full_comparison_DF,STRING_ID_MAP],axis=1).dropna()

  # Read string DB output and turn the comma separated member labels into lists
  string_annot = pd.read_csv(f'/content/drive/MyDrive/ApoE Analysis January-11-2023/String All Genotype Analysis.tsv',sep='\t')
  string_annot[labels_col] = string_annot[labels_col][:].str.split(r',')

  string_annot['Ontology_coverage (%)'] = string_annot['observed gene count']/string_annot['background gene count']
  string_annot = string_annot.loc[string_annot['Ontology_coverage (%)'] >= 0.250,]

  ontologies_of_interest = ['GO Process','GO Function','GO Component','KEGG','Reactome','WikiPathways']
  string_annot = string_annot.loc[string_annot['#category'].isin(ontologies_of_interest)].reset_index(drop=True)

  # Proteins that have a significant change in concentration
  sig_proteins = full_comparison_DF.loc[full_comparison_DF['-10*log10(BH)'] > np.log10(0.05)*-10, ]

  # For every ontology term collect the member fold changes and the significant members.
  # The lists are gathered first and assigned as whole columns; the former chained
  # assignment df[col][i] = value is not guaranteed to write into the dataframe.
  fold_change_lists = []
  sig_protein_lists = []
  for members in string_annot[labels_col]:
    protein_in_ontology = full_comparison_DF.loc[full_comparison_DF['Protein_ID'].isin(members)]
    fold_change_lists.append(protein_in_ontology[fc_col].values.tolist())
    sig_protein_lists.append(sig_proteins.loc[sig_proteins['Protein_ID'].isin(members)].index.to_list())

  string_annot[f'{comparison}'] = pd.Series(fold_change_lists, index=string_annot.index, dtype=object)
  string_annot[f'measurement_count'] = [len(fold_changes) for fold_changes in fold_change_lists]
  string_annot[f'sig_proteins_in_ont'] = pd.Series(sig_protein_lists, index=string_annot.index, dtype=object)
  string_annot[f'sig_proteins_count'] = [len(sig_members) for sig_members in sig_protein_lists]

  string_annot[f'sig_proteins_ont_coverage'] = string_annot['sig_proteins_count']/string_annot['background gene count']

  # One-sample t-test of the (linear) fold changes against 1 and average fold change per ontology;
  # terms with fewer than two measurements get no value.
  ontology_p_values = []
  ontology_avg_fc = []
  for fold_changes in fold_change_lists:
    if len(fold_changes) > 1:
      ontology_avg_fc.append(np.mean(fold_changes))
      ontology_p_values.append(st.ttest_1samp(fold_changes, 1 , nan_policy='omit', alternative='two-sided')[1])
    else:
      ontology_avg_fc.append(None)
      ontology_p_values.append(None)

  string_annot[f'{comparison}_PV_from_Ontology_FC'] = pd.Series(ontology_p_values, index=string_annot.index, dtype=float)
  string_annot[f'{comparison}_Avg_Ontology_FC'] = pd.Series(ontology_avg_fc, index=string_annot.index, dtype=float)

  # Log transform FCs and PVs
  string_annot[f'{comparison}_log_PV'] = np.log10(string_annot[f'{comparison}_PV_from_Ontology_FC'].astype(float))*-10
  string_annot[f'{comparison}_log_FC'] = np.log2(string_annot[f'{comparison}_Avg_Ontology_FC'].astype(float))*10

  string_annot = string_annot.dropna(subset=[f'{comparison}_log_PV'])
  string_annot = string_annot.sort_values(by=f'{comparison}_PV_from_Ontology_FC')
  string_annot = string_annot.drop_duplicates(subset=['term ID'], keep='first')

  # Calculate BH P-value correction
  def performBH_correction(input_df,comparison=None):
    """Return the ontology table sorted by P-value with BH adjusted P-values added.

    Rows whose -10*log10(P) is not above 0 are removed first. The previous
    version stored (rank / number of tests) * 0.25, the BH critical value,
    which depends on the rank only; the adjusted P-value p * m / rank
    (monotone, capped at 1) is stored instead.
    """
    pv_col = f'{comparison}_PV_from_Ontology_FC'
    input_df = input_df.loc[input_df[f'{comparison}_log_PV'] > 0,]
    # Sorting keeps the original index values, so re-index to get rank order
    input_df = input_df.reset_index().sort_values(by=pv_col).reset_index(drop=True)

    p_sorted = input_df[pv_col].astype(float).to_numpy()
    tested = ~np.isnan(p_sorted)          # missing P-values are not counted as tests
    n_tests = int(tested.sum())

    adjusted = np.full(len(p_sorted), np.nan)
    if n_tests > 0:
      ranks = np.arange(1, n_tests + 1)
      scaled = p_sorted[tested] * n_tests / ranks
      # Running minimum from the largest P-value down keeps the adjusted values monotone
      adjusted[tested] = np.minimum(np.minimum.accumulate(scaled[::-1])[::-1], 1.0)

    input_df[f'{comparison}_BH_from_Ontology_FC'] = adjusted
    input_df[f'{comparison}_-10*log10(BH)'] = np.log10(input_df[f'{comparison}_BH_from_Ontology_FC'].astype(float))*-10
    return(input_df)

  Ontology_Comparison = performBH_correction(string_annot,comparison=f'{comparison}')

  Ontology_Comparison.to_csv(f'/content/drive/MyDrive/ApoE Analysis January-11-2023/Male-Female/String_Ontology_Stats/{comparison}_SDB_Concentration_Stats.csv')

  return(Ontology_Comparison)

# Run the ontology abundance analysis once per comparison; each call also writes its result CSV
string_E2vE3_conc_df = string_conc_analysis(comparison='E2vE3')
string_E4vE3_conc_df = string_conc_analysis(comparison='E4vE3')
string_E4vE2_conc_df = string_conc_analysis(comparison='E4vE2')

This section combines the Rate values output from deuterate and performs an ontology analysis similar to the abundance analysis.

In [None]:
def create_Rate_DF(comparison=None):
  """Build a table of log2 turnover-rate fold changes for all three comparisons.

  Reads every CSV in the rate folder, keeps analytes with more than one
  unique peptide, a positive numeric rate and R2 above 0.6, averages the
  rates per analyte and genotype, log2-transforms them and subtracts the
  genotypes pairwise. The full table (with the protein ID guide) is written
  to Drive as a side effect, and the working directory is changed.

  Args:
    comparison: Not used by this function; kept for the existing call sites.

  Returns:
    DataFrame with the columns 'E2vE3_rate_FC', 'E4vE3_rate_FC' and
    'E4vE2_rate_FC'.
  """
  protein_ID_guide = pd.read_csv('/content/drive/MyDrive/ApoE Turnover Data/Protein_ID_Guide/Protein_ID_Guide_File.csv').set_index('Accession')

  # Where are the Rate files located
  where_are_the_files = f'/content/drive/MyDrive/ApoE Turnover Data/Female_Rates_Peptide_FDR_2'
  os.chdir(where_are_the_files)
  all_file_names = [i for i in glob.glob('*.{}'.format('csv'))]

  # Filtered rate tables per genotype, concatenated once after the loop
  rate_frames = {'A2': [], 'A3': [], 'A4': []}

  # Filter each rate file and sort it into its genotype
  for dataSet in all_file_names:
    # Drop only the extension; str.strip('.csv') strips the characters '.', 'c', 's', 'v' from both ends.
    fileName = os.path.splitext(dataSet)[0]
    TR_df = pd.read_csv(dataSet).set_index('analyte_id').filter(regex='Combined')

    TR_df = TR_df.fillna(0)
    TR_df = TR_df.loc[TR_df['Combined uniques'].astype(int) > 1,]
    # Remove the text placeholders before the rate column is compared as a number
    TR_df = TR_df.loc[TR_df['Combined rate'] != 'Insufficient Timepoints',]
    TR_df = TR_df.loc[TR_df['Combined rate'] != 'value could not be determined',]
    TR_df = TR_df.loc[TR_df['Combined rate'].astype(float) > 0,]
    TR_df = TR_df.loc[TR_df['Combined R2'].astype(float) > 0.6,]
    TR_df = TR_df.filter(regex='rate')

    # The genotype tag is taken from the file name; raw string so that \d is a regex digit class
    allele = re.findall(r'A\d|M\d', fileName)[0]
    TR_df = TR_df.rename(columns={'Combined rate' : f'{allele}'})
    # Files tagged with anything other than A2/A3/A4 are read but not used
    if allele in rate_frames:
      rate_frames[allele].append(TR_df)

  def _mean_log2_rate(allele):
    """Average the rates of one genotype per analyte and log2-transform them."""
    frames = rate_frames[allele]
    rates = pd.concat(frames) if frames else pd.DataFrame()
    rates[allele] = rates[allele].astype(float)
    rates = rates.groupby(by='analyte_id',as_index=True).mean()
    rates[allele] = np.log2(rates[allele])
    return rates

  # Calculate the average rate for each protein from the different rate files
  a2_df = _mean_log2_rate('A2')
  a3_df = _mean_log2_rate('A3')
  a4_df = _mean_log2_rate('A4')

  # Create comparison specific dataframes; dropna() keeps proteins present in both genotypes and in the ID guide
  E2vE3_rates_df = pd.concat([a2_df,a3_df,protein_ID_guide],axis=1).dropna()
  E4vE3_rates_df = pd.concat([a3_df,a4_df,protein_ID_guide],axis=1).dropna()
  E4vE2_rates_df = pd.concat([a2_df,a4_df,protein_ID_guide],axis=1).dropna()

  # Calculate fold change as a difference of log2 rates
  E2vE3_rates_df['E2vE3_rate_FC'] = ((E2vE3_rates_df['A2'] - E2vE3_rates_df['A3']) )
  E4vE3_rates_df['E4vE3_rate_FC'] = ((E4vE3_rates_df['A4'] - E4vE3_rates_df['A3']) )
  E4vE2_rates_df['E4vE2_rate_FC'] = ((E4vE2_rates_df['A4'] - E4vE2_rates_df['A2']) )

  # Concatenate the comparison dataframes and store them together with the protein ID guide
  Rate_FC_DF = pd.concat([E2vE3_rates_df['E2vE3_rate_FC'],E4vE3_rates_df['E4vE3_rate_FC'],E4vE2_rates_df['E4vE2_rate_FC'],protein_ID_guide],axis=1)
  Rate_FC_DF.to_csv('/content/drive/MyDrive/ApoE Turnover Data/Rate_FC_DF')
  full_Rate_FC_df = Rate_FC_DF.filter(regex='FC')

  print('This function outputs a rate FC dataframe; the FC was calculated with the log2 rate values, FC = [log2(B) - log(A)]')

  return(full_Rate_FC_df)

def string_rate_analysis(comparison=None):
  """Score STRING ontology terms by the turnover-rate fold changes of their proteins.

  For one genotype comparison the function:
    1. takes the `{comparison}_rate_FC` column returned by `create_Rate_DF`
       and auto-scales it (subtract the mean, divide by the population std);
    2. attaches STRING protein IDs to the accessions via String_ID_Map.tsv;
    3. keeps STRING terms with coverage >= 0.25 from the selected categories;
    4. collects the scaled fold changes of every protein in each term, runs a
       two-sided one-sample t-test against 0 for terms with more than two
       values, and stores the p-value and the mean fold change;
    5. adds the rank-based BH column and writes the table to
       `{comparison}_SDB_Rate_Stats.csv`.

  Args:
    comparison: Column prefix of the comparison, e.g. 'E2vE3'.

  Returns:
    The per-term statistics DataFrame that was written to disk.
  """
  labels_col = 'matching proteins in your network (labels)'
  fc_col = f'{comparison}_rate_FC'
  rate_col = f'{comparison}_Rate'
  pv_col = f'{comparison}_PV_from_Ontology_Rate_FC'
  avg_col = f'{comparison}_Avg_Ontology_Rate_FC'
  log_pv_col = f'{comparison}_Rate_log_PV'

  # Filter and scale rate data.
  # NOTE(review): this needs the create_Rate_DF definition that returns
  # `*_rate_FC` columns. A later cell redefines create_Rate_DF with
  # `*_Fold_change` columns, so re-running this cell after that one fails.
  Rate_FC_df = create_Rate_DF(comparison=None).filter(regex=f'{comparison}|ID').dropna()

  # Auto-scaling: centre on the mean and divide by the population std (np.std, ddof=0).
  fold_change = Rate_FC_df[fc_col]
  Rate_FC_df = Rate_FC_df.assign(**{fc_col: (fold_change - np.mean(fold_change)) / np.std(fold_change)})
  scaled_RDF = Rate_FC_df.filter(regex='rate')

  # Match protein accession to protein ID for string analysis
  STRING_ID_MAP = (pd.read_csv('/content/drive/MyDrive/ApoE Analysis January-11-2023/String_ID_Map.tsv', sep='\t')
                   .rename(columns={'From':'Accession','To':'Protein_ID'})
                   .sort_values(by='Accession')
                   .drop_duplicates('Accession')
                   .set_index('Accession'))
  full_Rate_FC_df = pd.concat([scaled_RDF,STRING_ID_MAP],axis=1).dropna()

  # Format the string file to create readable protein lists for each ontology
  string_annot = pd.read_csv('/content/drive/MyDrive/ApoE Analysis January-11-2023/String All Genotype Analysis.tsv',sep='\t')
  string_annot[labels_col] = string_annot[labels_col].str.split(',')

  # Despite the "(%)" in the name this is a fraction here; 0.25 means 25 % coverage.
  string_annot['Ontology_coverage (%)'] = string_annot['observed gene count']/string_annot['background gene count']
  string_annot = string_annot.loc[string_annot['Ontology_coverage (%)'] >= 0.25]

  ontologies_of_interest = ['GO Process','GO Function','GO Component','KEGG','Reactome','WikiPathways']
  string_annot = string_annot.loc[string_annot['#category'].isin(ontologies_of_interest)].reset_index(drop=True)

  # One list of scaled fold changes per ontology term. The column is assigned
  # in one go: the previous element-wise `df[col][i] = ...` is chained
  # indexing, which warns in classic pandas and is lost under Copy-on-Write.
  # Terms without a label list (NaN after the split) get an empty list.
  string_annot[rate_col] = pd.Series(
      [full_Rate_FC_df.loc[full_Rate_FC_df['Protein_ID'].isin(protein_list if isinstance(protein_list, list) else []),
                           fc_col].dropna().values.tolist()
       for protein_list in string_annot[labels_col]],
      index=string_annot.index,
      dtype=object)

  # One-sample t-test of the scaled log fold changes against 0 (no change),
  # plus their mean; terms with two or fewer values keep None in both columns.
  string_annot[pv_col] = None
  string_annot[avg_col] = None
  for row_idx, term_fold_changes in string_annot[rate_col].items():
    if len(term_fold_changes) > 2:
      string_annot.at[row_idx, pv_col] = st.ttest_1samp(term_fold_changes, 0, nan_policy='omit', alternative='two-sided')[1]
      string_annot.at[row_idx, avg_col] = np.mean(term_fold_changes)

  rate_string_analysis = string_annot.copy()

  rate_string_analysis[log_pv_col] = np.log10(rate_string_analysis[pv_col].astype(float))*-10
  rate_string_analysis[f'{comparison}_Rate_log_FC'] = rate_string_analysis[avg_col].astype(float)

  # Untested terms have a NaN p-value and are dropped here.
  rate_string_analysis = rate_string_analysis.dropna(subset=[log_pv_col])
  rate_string_analysis = rate_string_analysis.sort_values(by=pv_col)

  # Calculate BH P-value correction
  def performBH_correction(input_df,comparison=None):
    """Rank the terms by p-value and attach (rank / number of tests) * 0.25.

    NOTE(review): the stored value is the Benjamini-Hochberg critical value
    for a 0.25 false-discovery rate, not an adjusted p-value; it depends only
    on the rank of each term.
    """
    bh_col = f'{comparison}_BH_from_Ontology_Rate_FC'
    ranked = input_df.loc[input_df[f'{comparison}_Rate_log_PV'] > 0]
    ranked = ranked.reset_index()  # the previous row label is kept as an 'index' column
    ranked = ranked.sort_values(by=f'{comparison}_PV_from_Ontology_Rate_FC')
    ranked = ranked.reset_index(drop=True)  # after this the row label is rank - 1
    total_rows = len(ranked.index)
    ranked[bh_col] = (np.arange(1, total_rows + 1) / total_rows) * 0.25
    ranked[f'{comparison}_Rate_-10*log10(BH)'] = np.log10(ranked[bh_col].astype(float))*-10
    return(ranked)

  rate_string_analysis = performBH_correction(rate_string_analysis,comparison=f'{comparison}')

  rate_string_analysis.to_csv(f'/content/drive/MyDrive/ApoE Analysis January-11-2023/Male-Female/String_Ontology_Stats/{comparison}_SDB_Rate_Stats.csv')

  return(rate_string_analysis)

# Run the rate-based ontology analysis once per genotype comparison
# (each call also writes its `{comparison}_SDB_Rate_Stats.csv` file).
(string_E2vE3_rate_df,
 string_E4vE3_rate_df,
 string_E4vE2_rate_df) = [string_rate_analysis(comparison=comparison_name)
                          for comparison_name in ('E2vE3', 'E4vE3', 'E4vE2')]

This section combines the rate ontology analysis with the abundance ontology analysis for a given comparison. It also merges the results of the E2vE3 and E4vE3 comparisons into one table and draws the proteostasis plots.

In [None]:
# This code concatenates the rate string analysis and the concentration string analysis
def create_RNC_DF(comparison=None):
  """Join the concentration and rate STRING-ontology statistics of one comparison.

  Reads `{comparison}_SDB_Concentration_Stats.csv` and
  `{comparison}_SDB_Rate_Stats.csv` from
  `{directory}/Male-Female/String_Ontology_Stats`, aligns them on 'term ID',
  drops every term with a missing value in any retained column and returns a
  fixed set of columns.

  Args:
    comparison: Column prefix of the comparison, e.g. 'E2vE3'.

  Returns:
    DataFrame indexed by 'term ID' with the category, the term description
    with its protein count, the concentration and rate log fold changes, the
    concentration `-10*log10(BH)` score, the ontology coverage in percent,
    the matching protein labels and the significant-protein columns.
  """
  stats_dir = f'{directory}/Male-Female/String_Ontology_Stats'

  # Read the concentration SDB analysis
  SDB_conc_Stats = pd.read_csv(f'{stats_dir}/{comparison}_SDB_Concentration_Stats.csv')
  SDB_conc_Stats['Ontology_coverage (%)'] = (SDB_conc_Stats['observed gene count']/SDB_conc_Stats['background gene count'])*100
  # Raw string: `\d` in a normal string literal is an invalid escape sequence.
  SDB_conc_Stats = (SDB_conc_Stats
                    .set_index('term ID')
                    .filter(regex=r'E\dvE\d_log_FC|term.description|PV|labels|observed|BH|coverage|sig')
                    .rename(columns={f'{comparison}' : f'{comparison}_FCs',
                                     f'{comparison}_log_FC' : f'{comparison}_Conc_log_FC'}))

  # Read the rate SDB analysis; only the mean rate fold change and the category are needed
  SDB_rate_Stats = pd.read_csv(f'{stats_dir}/{comparison}_SDB_Rate_Stats.csv')
  SDB_rate_Stats = SDB_rate_Stats.set_index('term ID').filter(regex=f'{comparison}_Rate_log_FC|category')

  # Combine the concentration and rate SDB analysis; dropna keeps complete rows only
  RnC_Df = pd.concat([SDB_conc_Stats,SDB_rate_Stats],axis=1).dropna()
  RnC_Df = RnC_Df.rename(columns={'observed gene count':'Ontology Protein Count'})
  RnC_Df["term description (Count)"] = (RnC_Df['term description']).astype(str) + " " +"(" + (RnC_Df['Ontology Protein Count'].astype(int)).astype(str) + ")"

  # Prefix the significant-protein columns so two comparisons can sit side by side
  RnC_Df = RnC_Df.rename(columns={'sig_proteins_in_ont':f'{comparison}_sig_proteins_in_ont',
                                  'sig_proteins_count':f'{comparison}_sig_proteins_count'})

  RnC_Df = RnC_Df[['#category',
                  'term description (Count)',
                  f'{comparison}_Conc_log_FC',
                  f'{comparison}_Rate_log_FC',
                  f'{comparison}_-10*log10(BH)',
                  'Ontology_coverage (%)',
                  'matching proteins in your network (labels)',
                  f'{comparison}_sig_proteins_in_ont',
                  f'{comparison}_sig_proteins_count']]

  return(RnC_Df)

# Build the combined concentration/rate ontology table for every comparison.
RnC_Df_for_lists_E2vE3 = create_RNC_DF(comparison='E2vE3')
RnC_Df_for_lists_E4vE3 = create_RNC_DF(comparison='E4vE3')
RnC_Df_for_lists_E4vE2 = create_RNC_DF(comparison='E4vE2')


# Create a combined dataframe of both comparisons with only significant ontologies.
RnC_Df_E2vE3 = RnC_Df_for_lists_E2vE3.copy()
RnC_Df_E4vE3 = RnC_Df_for_lists_E4vE3.copy()

# Side-by-side join on 'term ID'; the shared descriptive columns appear twice
# until the duplicates are removed further down.
RNC_DF_both_comparisons = pd.concat([RnC_Df_E2vE3,RnC_Df_E4vE3],axis=1)

RNC_DF_both_comparisons["E4_proteostasis"] = None
RNC_DF_both_comparisons["E2_proteostasis"] = None

# Label each ontology by the signs of its concentration and rate fold changes
# (each versus E3). A fold change of exactly 0 leaves the label as None.
# Leading underscores keep these helper names from clashing with notebook globals.
for _allele in ('E2', 'E4'):
  _conc_fc = RNC_DF_both_comparisons[f'{_allele}vE3_Conc_log_FC']
  _rate_fc = RNC_DF_both_comparisons[f'{_allele}vE3_Rate_log_FC']
  for _quadrant, _label in (((_conc_fc > 0) & (_rate_fc > 0), '⬆synthesis'),
                            ((_conc_fc < 0) & (_rate_fc > 0), '⬆Degradation'),
                            ((_conc_fc < 0) & (_rate_fc < 0), '⬇synthesis'),
                            ((_conc_fc > 0) & (_rate_fc < 0), '⬇Degradation')):
    RNC_DF_both_comparisons.loc[_quadrant, f'{_allele}_proteostasis'] = _label

# Keep the first copy of each duplicated column, then drop ontologies with any
# missing value (absent from one comparison, or without a proteostasis label).
RNC_DF_both_comparisons = RNC_DF_both_comparisons.loc[:,~RNC_DF_both_comparisons.columns.duplicated()].dropna()

RNC_DF_both_comparisons = RNC_DF_both_comparisons[['#category','Ontology_coverage (%)','term description (Count)','E2_proteostasis','E4_proteostasis','E2vE3_-10*log10(BH)','E4vE3_-10*log10(BH)','E2vE3_Conc_log_FC','E2vE3_Rate_log_FC','E4vE3_Conc_log_FC','E4vE3_Rate_log_FC','matching proteins in your network (labels)','E2vE3_sig_proteins_in_ont','E2vE3_sig_proteins_count','E4vE3_sig_proteins_in_ont','E4vE3_sig_proteins_count']]

# Ontologies whose BH score exceeds -10*log10(0.05) in at least one comparison.
# `.copy()` makes this an independent frame, so later column edits do not raise
# SettingWithCopyWarning.
sig_RNC_DF_both_comparisons = RNC_DF_both_comparisons.loc[
    (RNC_DF_both_comparisons['E2vE3_-10*log10(BH)'] > np.log10(0.05)*-10)
    | (RNC_DF_both_comparisons['E4vE3_-10*log10(BH)'] > np.log10(0.05)*-10)].copy()



# Two-colour palette for the proteostasis scatter plots; plot_Quadrants maps
# the first colour to insignificant and the second to significant ontologies.
colors = [
    '#00375F',
    '#730707',
]
custom_pal = sns.color_palette(palette=colors)

def plot_Quadrants(input_df,
           plot_title=None,
           Conc_column_name=None,
           Rate_column_name=None,
           fractions=None,
           comparison=None,
           style=None,
           list_of_interest=None,
           Ontology_name=None):
  """Scatter the ontology ΔAbundance against ΔRate for one comparison.

  Each point is one ontology term, coloured by whether its
  `{comparison}_-10*log10(BH)` score exceeds -10*log10(0.025). The figure is
  saved as `{comparison}_Proteostasis_Plot.svg` under
  `{directory}/{FC_sex}/String_Ontology_Stats` and then shown.

  Args:
    input_df: Table holding `{comparison}_Conc_log_FC`,
      `{comparison}_Rate_log_FC` and `{comparison}_-10*log10(BH)` columns.
    comparison: Column prefix of the comparison, e.g. 'E2vE3'.
    style: Forwarded to `sns.scatterplot(style=...)`.
    plot_title, Conc_column_name, Rate_column_name, fractions,
    list_of_interest, Ontology_name: accepted but not read by the body; the
      plotted columns are always derived from `comparison`.

  Returns:
    The Axes returned by `sns.scatterplot`.
  """
  bh_column = f'{comparison}_-10*log10(BH)'
  points = input_df.copy().filter(regex='log.FC|term|labels|observed|BH')

  plt.figure(figsize=(3,3))
  sns.set(style='ticks', context='paper', color_codes=True,rc={'axes.linewidth': 1})

  # Dotted cross-hairs through the origin split the plot into four quadrants.
  plt.axvline(x=0, ymin=0, ymax=1, linestyle=':',color='gray',linewidth=2)
  plt.axhline(y=0, xmin=0, xmax=1, linestyle=':',color='gray',linewidth=2)
  plt.xlabel("ΔAbundance (arb. unit)", fontweight ='bold', size=10)
  plt.ylabel("ΔRate (arb. unit)", fontweight ='bold',size=10)
  plt.title(f'{comparison} Ontology ΔAbundance vs ΔRate',fontweight ='bold',size=10, pad=5, wrap=False)
  plt.tick_params(axis='both', which='major', labelsize=10,length = 2,color = 'black',width =1)

  points = points.sort_values(by=bh_column)

  # Count both groups first so the counts can be shown in the legend labels.
  is_significant = points[bh_column] > np.log10(0.025)*-10
  sig_len = int(is_significant.sum())
  insig_len = len(points) - sig_len

  points["significance"] = f"Insignificant ({insig_len})"
  points.loc[is_significant, 'significance'] = f"Significant ({sig_len})"
  # The table is sorted a second time, as before, so the row (draw) order is unchanged.
  points = points.sort_values(by=bh_column)

  VP = sns.scatterplot(data=points,
                       x=f'{comparison}_Conc_log_FC',
                       y=f'{comparison}_Rate_log_FC',
                       hue='significance',
                       hue_order = [f"Insignificant ({insig_len})",f"Significant ({sig_len})"],
                       palette=custom_pal,
                       style=style,
                       legend=True,
                       s = 50,
                       alpha=0.7,
                       edgecolor="black",
                       linewidth=0.4)

  plt.legend(loc=8, prop={'size':8},title_fontsize=10,markerscale=.75,borderpad=0.1,borderaxespad=-5, ncols=2)
  sns.despine(top=True, right=True, left=False, bottom=False, offset=None, trim=False)

  plt.savefig(f'{directory}/{FC_sex}/String_Ontology_Stats/{comparison}_Proteostasis_Plot.svg',format="svg",transparent=True, dpi=1200,bbox_inches='tight')

  plt.show()

  return(VP)


# Proteostasis plot for E2 versus E3.
plot_Quadrants(
    RnC_Df_for_lists_E2vE3,
    plot_title='Rate vs Abundance',
    Conc_column_name='E2vE3_log_FC',
    Rate_column_name='E2vE3_Rate_log_FC',
    fractions='E2vE3',
    comparison='E2vE3',
    style=None,
    list_of_interest=None,
    Ontology_name=None,
)

# Proteostasis plot for E4 versus E3.
plot_Quadrants(
    RnC_Df_for_lists_E4vE3,
    plot_title='Rate vs Abundance',
    Conc_column_name='E4vE3_log_FC',
    Rate_column_name='E4vE3_Rate_log_FC',
    fractions='E4vE3',
    comparison='E4vE3',
    style=None,
    list_of_interest=None,
    Ontology_name=None,
)

This section reuses parts of the earlier code to create box plots for the ontologies of interest.

In [None]:
def create_comparison_PVFC(comparison=None):
  """Load the abundance fold changes of one comparison, range-scale them and add STRING IDs.

  Args:
    comparison: Column prefix of the comparison, e.g. 'E2vE3'; it also selects
      `{comparison}_primary_PVFC.csv` under `{directory}/{FC_sex}/primary_PVFC`.

  Returns:
    DataFrame indexed by accession (the text before the first '|') with the
    range-scaled `{comparison}_Fold_change`, any other column matching the
    comparison name or 'BH', and the `Protein_ID` from String_ID_Map.tsv.
    Rows with a missing value in any column are dropped.
  """
  fc_col = f'{comparison}_Fold_change'

  # Prepare the dataframe with the fold changes for each protein in the comparison
  pvfc = pd.read_csv(f'{directory}/{FC_sex}/primary_PVFC/{comparison}_primary_PVFC.csv')
  pvfc = pvfc.set_index('Accession')
  pvfc = pvfc.rename(columns={'Fold_change':fc_col})
  pvfc = pvfc.filter(regex=f'{comparison}|BH')

  # Range scaling: centre on the mean, then divide by (max - min)
  raw_fc = pvfc[fc_col]
  fc_min = np.min(raw_fc)
  fc_max = np.max(raw_fc)
  fc_mean = np.mean(raw_fc)
  pvfc[fc_col] = (raw_fc - fc_mean) / (fc_max - fc_min)

  # Keep only the part of each accession before the first '|' so that it
  # lines up with the 'Accession' key of the STRING ID map
  pvfc['Accession'] = [accession.split('|')[0] for accession in pvfc.index.to_list()]
  pvfc = pvfc.set_index('Accession')

  id_map = pd.read_csv(f'{directory}/String_ID_Map.tsv', sep='\t')
  id_map = id_map.rename(columns={'From':'Accession','To':'Protein_ID'})
  # One STRING ID per accession: the first row after sorting by accession
  id_map = id_map.sort_values(by='Accession').drop_duplicates('Accession')
  id_map = id_map.set_index('Accession')

  return pd.concat([pvfc,id_map],axis=1).dropna()

def create_Rate_DF(comparison=None):
  """Build the auto-scaled log2 turnover-rate fold change of one comparison.

  Every CSV in the rate folder is filtered (more than one unique, a positive
  numeric 'Combined rate', 'Combined R2' > 0.6), the rates are averaged per
  analyte and allele group, log2-transformed and subtracted pairwise. The
  fold change of the requested comparison is then auto-scaled (mean 0,
  population std 1) and joined to the STRING protein IDs.

  NOTE: this definition replaces the earlier `create_Rate_DF` of this
  notebook and returns `*_Fold_change` columns, whereas
  `string_rate_analysis` reads `*_rate_FC` columns.

  Args:
    comparison: 'E2vE3', 'E4vE3' or 'E4vE2'.

  Returns:
    DataFrame indexed by accession with `{comparison}_Fold_change` and
    `Protein_ID`; rows missing either value are dropped.

  Raises:
    ValueError: if `comparison` is not one of the three supported names.
    FileNotFoundError: if the rate folder contains no CSV file.
  """
  alleles = ('A2', 'A3', 'A4')
  # comparison name -> (allele in the numerator, allele in the denominator)
  comparison_alleles = {'E2vE3': ('A2', 'A3'),
                        'E4vE3': ('A4', 'A3'),
                        'E4vE2': ('A4', 'A2')}
  if comparison not in comparison_alleles:
    raise ValueError(f'comparison must be one of {sorted(comparison_alleles)}, got {comparison!r}')
  fc_col = f'{comparison}_Fold_change'

  # Where are the Rate files located
  where_are_the_files = '/content/drive/MyDrive/ApoE Turnover Data/Female_Rates_Peptide_FDR_2'
  # The working directory change is kept on purpose: it outlives this call
  # and later cells may rely on it.
  os.chdir(where_are_the_files)
  all_file_names = glob.glob('*.csv')
  if not all_file_names:
    raise FileNotFoundError(f'No rate CSV files found in {where_are_the_files}')

  # Filter each rate file and collect the allele specific rate columns
  frames_by_allele = {allele: [] for allele in alleles}
  for dataSet in all_file_names:
    TR_df = pd.read_csv(dataSet).set_index('analyte_id')

    TR_df = TR_df.fillna(0)
    TR_df = TR_df.loc[TR_df['Combined uniques'].astype(int) > 1]
    # Text placeholders must go before the column can be compared as a float
    for unusable in ('Insufficient Timepoints', 'value could not be determined'):
      TR_df = TR_df.loc[TR_df['Combined rate'] != unusable]
    TR_df = TR_df.loc[TR_df['Combined rate'].astype(float) > 0]
    TR_df = TR_df.loc[TR_df['Combined R2'].astype(float) > 0.6]

    for allele in alleles:
      frames_by_allele[allele].append(
          TR_df.loc[TR_df['group_name'] == allele]
          .filter(regex='Combined rate')
          .rename(columns={'Combined rate' : f'{allele}_Combined_rate'}))

  # Calculate the average rate for each protein from the different rate files
  log2_rates = {}
  for allele in alleles:
    rate_col = f'{allele}_Combined_rate'
    allele_df = pd.concat(frames_by_allele[allele])  # one concat instead of one per file
    allele_df[rate_col] = allele_df[rate_col].astype(float)
    allele_df = allele_df.groupby(by='analyte_id',as_index=True).mean()
    allele_df[rate_col] = np.log2(allele_df[rate_col])
    log2_rates[allele] = allele_df

  # Calculate fold change as a difference of log2 rates, for proteins measured in both alleles
  fold_changes = []
  for name, (numerator, denominator) in comparison_alleles.items():
    # Lower-numbered allele first, which keeps the row order of the original concat calls
    first, second = sorted((numerator, denominator))
    paired = pd.concat([log2_rates[first],log2_rates[second]],axis=1).dropna()
    fold_changes.append((paired[f'{numerator}_Combined_rate'] - paired[f'{denominator}_Combined_rate'])
                        .rename(f'{name}_Fold_change'))

  # Concatenate the comparison fold changes
  Rate_FC_DF = pd.concat(fold_changes,axis=1)

  print('This function outputs a rate FC dataframe; the FC was calculated with the log2 rate values, FC = [log2(B) - log(A)]')

  # Auto-scaling of the requested comparison (np.std is the population std, ddof=0)
  fold_change = Rate_FC_DF[fc_col].dropna()
  scaled_RDF = ((fold_change - np.mean(fold_change)) / np.std(fold_change)).to_frame()

  STRING_ID_MAP = pd.read_csv('/content/drive/MyDrive/ApoE Analysis January-11-2023/String_ID_Map.tsv', sep='\t').rename(columns={'From':'Accession','To':'Protein_ID'})
  STRING_ID_MAP = STRING_ID_MAP.sort_values(by='Accession')
  STRING_ID_MAP = STRING_ID_MAP.drop_duplicates('Accession').set_index('Accession')

  scaled_RDF = pd.concat([scaled_RDF,STRING_ID_MAP],axis=1).dropna()

  return(scaled_RDF)

def create_ont_of_int_df(comparison=None):
  """Load the curated ontologies-for-figures table and label the proteostasis quadrants.

  Two columns are added, `E4_proteostasis` and `E2_proteostasis`. Each is
  filled from the signs of the matching `*_Conc_log_FC` / `*_Rate_log_FC`
  pair and stays None when either fold change is 0 or missing.

  Args:
    comparison: Not read by this function; the table always covers E2vE3 and E4vE3.

  Returns:
    The loaded table with the two label columns appended.
  """
  ontology_table = pd.read_csv('/content/drive/MyDrive/ApoE Analysis January-11-2023/ontologies_for_figures.tsv',sep='\t')

  for label_column in ("E4_proteostasis", "E2_proteostasis"):
    ontology_table[label_column] = None

  for allele in ('E2', 'E4'):
    conc_fc = ontology_table[f'{allele}vE3_Conc_log_FC']
    rate_fc = ontology_table[f'{allele}vE3_Rate_log_FC']
    quadrants = (
        ((conc_fc > 0) & (rate_fc > 0), '⬆synthesis'),
        ((conc_fc < 0) & (rate_fc > 0), '⬆Degradation'),
        ((conc_fc < 0) & (rate_fc < 0), '⬇synthesis'),
        ((conc_fc > 0) & (rate_fc < 0), '⬇Degradation'),
    )
    for in_quadrant, label in quadrants:
      ontology_table.loc[in_quadrant, f'{allele}_proteostasis'] = label

  return ontology_table

# Table of curated ontologies with proteostasis labels, shared by the box-plot calls below.
ontologies_of_interest_df = create_ont_of_int_df(comparison=str(comparison))


def ontology_group_boxp_n_heatm(ontologies_of_interest_df=None,comparison=None):
  """Draw one ΔAbundance / ΔRate box plot per ontology group for a comparison.

  For every value of 'ontology_group' in the ontologies table, the terms
  whose `{comparison}_-10*log10(BH)` exceeds -10*log10(0.05) are looked up in
  the STRING export (coverage >= 0.25, selected categories only). The
  per-protein abundance fold changes (range-scaled, multiplied by 10) and
  rate fold changes (auto-scaled) of each term are then plotted side by side.
  Groups without any such term are skipped. Only box plots are drawn; no
  heat map is produced.

  Args:
    ontologies_of_interest_df: Table as returned by `create_ont_of_int_df`;
      loaded with that function when None.
    comparison: 'E2vE3', 'E4vE3' or 'E4vE2'.

  Returns:
    The long-format table (one row per protein, term and measure) of the last
    plotted group, or an empty DataFrame when no group was plotted.
  """
  # Local import: `wrap` is not imported in the notebook's import cell.
  import textwrap

  fc_col = f'{comparison}_Fold_change'
  labels_col = 'matching proteins in your network (labels)'

  comparison_DF = create_comparison_PVFC(comparison=f'{comparison}')
  scaled_RDF = create_Rate_DF(comparison=f'{comparison}')
  if ontologies_of_interest_df is None:
    ontologies_of_interest_df = create_ont_of_int_df(comparison=f'{comparison}')

  # Group names come from the full table, term IDs only from significant rows,
  # so a group can end up with an empty term list.
  ontology_groups = ontologies_of_interest_df['ontology_group'].drop_duplicates().to_list()
  significant_terms = ontologies_of_interest_df.loc[ontologies_of_interest_df[f'{comparison}_-10*log10(BH)'] > np.log10(0.05)*-10]
  ontology_groups_dict = {
      ontology_group: significant_terms.loc[significant_terms['ontology_group'] == ontology_group,'term ID'].values.tolist()
      for ontology_group in ontology_groups}

  # Read and pre-filter the STRING DB output once; it is the same for every group.
  all_string_annot = pd.read_csv(f'{directory}/String All Genotype Analysis.tsv',sep='\t')
  all_string_annot[labels_col] = all_string_annot[labels_col].str.split(',')
  all_string_annot["term description (Count)"] = (all_string_annot['term description']).astype(str) + " " +"(" + (all_string_annot['observed gene count'].astype(int)).astype(str) + ")"
  all_string_annot['Ontology_coverage (%)'] = all_string_annot['observed gene count']/all_string_annot['background gene count']
  all_string_annot = all_string_annot.loc[all_string_annot['Ontology_coverage (%)'] >= 0.25]
  ontologies_of_interest = ['GO Process','GO Function','GO Component','KEGG','Reactome','WikiPathways']
  all_string_annot = all_string_annot.loc[all_string_annot['#category'].isin(ontologies_of_interest)].reset_index(drop=True)

  def collect_term_members(string_annot, term_IDs, source_df, measure_label, drop_missing=False):
    """Return one frame per term holding the rows of `source_df` that belong to it."""
    frames = []
    for term_ID in term_IDs:
      term_row = string_annot.loc[string_annot['term ID'] == term_ID].iloc[0]
      members = source_df.loc[source_df['Protein_ID'].isin(term_row[labels_col])].reset_index(drop=True)
      if drop_missing:
        members = members.dropna()
      members['Ontology'] = term_row['term description (Count)']
      members['Abundance_FC/Rate_FC'] = measure_label
      frames.append(members)
    return frames

  def create_box_plot(input_df=None, group_name=None, n_terms=1):
    """Draw the grouped box plot with overlaid points for one ontology group."""
    plt.figure(figsize=(3.5,n_terms*.99))
    plot_df = input_df.copy()
    # Wrap long term names so the y tick labels stay narrow
    plot_df['Ontology'] = ['\n'.join(textwrap.wrap(name, 25)) for name in plot_df['Ontology']]

    flierprops = dict(markerfacecolor='gray', markersize=3,
                      linestyle='none', markeredgecolor='gray',alpha=0.5)

    # make grouped boxplot and save it in a variable
    bp = sns.boxplot(x=fc_col,
                     y='Ontology',
                     data=plot_df,
                     palette="colorblind",
                     hue='Abundance_FC/Rate_FC',
                     fliersize=3,
                     flierprops=flierprops,
                     saturation=1,
                     showmeans=True,
                     meanprops={"marker": "o",
                                "markeredgecolor": "black",
                                "markersize": "5"},
                     showfliers = False)

    bp = sns.stripplot(x=fc_col,
                       y='Ontology',
                       data=plot_df,
                       palette="colorblind",
                       hue='Abundance_FC/Rate_FC')

    # Boxes and points each contribute legend entries; only the first two are used below
    handles, labels = bp.get_legend_handles_labels()

    plt.xlabel("Fold Change (arb. units)", fontweight ='bold', size=10)
    plt.ylabel("Protein Ontology (Protein Count)", fontweight ='bold',size=10)

    custom_params = {'axes.linewidth':2}

    # NOTE(review): called after the axes exist, so this presumably only styles
    # figures created later - confirm whether it should move above plt.figure.
    sns.set(style='ticks', context='paper',font='sans-serif', font_scale=1, color_codes=True,rc=custom_params)

    plt.axvline(x=change_cutoff, ymin=0, ymax=1, linestyle=':',color='gray')
    plt.axvline(x=-change_cutoff, ymin=0, ymax=1, linestyle=':',color='gray')
    plt.title(f'{group_name} Ontologies\n{comparison}',fontsize=12,fontweight ='bold',loc='center')
    plt.tick_params(axis='both', which='major', labelsize=10,length = 5,color = 'black',width =3)
    plt.yticks(rotation=30)
    plt.axvline(x=change_cutoff, ymin=0, ymax=1, linestyle='--',color='red')

    # specify just one legend
    bp.legend(handles[0:2], labels[0:2],
              prop={'size':10},title_fontsize=10,markerscale=2,borderpad=0.5,loc=8,borderaxespad=-6, ncols=2)

    sns.despine(top=True, right=True, left=False, bottom=False, offset=None, trim=False)

    plt.show()

  proteins_in_ontology = pd.DataFrame()

  for Ontology_name, ontologies in ontology_groups_dict.items():
    string_annot = all_string_annot.loc[all_string_annot['term ID'].isin(ontologies)].reset_index(drop=True)
    term_IDs = string_annot['term ID'].values.tolist()
    if not term_IDs:
      # No significant, sufficiently covered term in this group: nothing to plot
      continue

    # Abundance rows first: scaled by 10 and ordered by term
    abundance = pd.concat(collect_term_members(string_annot, term_IDs, comparison_DF, 'ΔAbundance'),ignore_index=True)
    abundance[fc_col] = abundance[fc_col]*10
    abundance = abundance.sort_values(by='Ontology')

    # Rate rows are appended afterwards, in term order
    rate_frames = collect_term_members(string_annot, term_IDs, scaled_RDF, 'ΔRate', drop_missing=True)
    proteins_in_ontology = pd.concat([abundance, *rate_frames],ignore_index=True)

    create_box_plot(input_df=proteins_in_ontology, group_name=Ontology_name, n_terms=len(ontologies))

  return(proteins_in_ontology)


# Box plots of the ontology groups: E4 versus E3 first, then E2 versus E3.
ontology_group_boxp_n_heatm(
    ontologies_of_interest_df=ontologies_of_interest_df,
    comparison='E4vE3',
)
ontology_group_boxp_n_heatm(
    ontologies_of_interest_df=ontologies_of_interest_df,
    comparison='E2vE3',
)