In [13]:
import json
import os

import numpy as np
import pandas as pd
from scipy.stats import ttest_ind
from statsmodels.stats.multitest import multipletests

In [14]:
# path = r"C:\Users\neil_\OneDrive\Desktop\New UHN\repo\heart-failure\data\proteomics_matrices\proteinGroups\log2_norm\ds-1.8_w0.3\NP_A_proteinGroup_matrix.log2.norm.ds-1.8.w0.3.tsv"
# read_proteomics_file(path)
# #read_prot_as_feature_df(path, "Genes")

In [15]:
def pivot_long_proteomics(df, intensity_col, gene_col="Protein Group", sample_col="Sample Name"):
    """
    Converts a long-format proteomics table to a wide samples x proteins matrix.

    Parameters:
        df (pd.DataFrame): raw long-format dataframe (one row per measurement)
        intensity_col (str): column name for intensity values
        gene_col (str): column name for gene/protein ID
        sample_col (str): column name for sample ID

    Returns:
        pd.DataFrame: wide-format with samples as rows (index named after
        `sample_col`) and genes/proteins as columns (named after `gene_col`).
        Repeated (sample, protein) measurements are averaged; combinations
        that never occur in `df` are left as NaN (rows are not filtered for
        completeness).

    Raises:
        KeyError: if `intensity_col`, `gene_col` or `sample_col` is not a
        column of `df`.
    """
    # Fail early with a message naming every missing column instead of the
    # less specific KeyError raised from inside pivot_table.
    missing = [col for col in (gene_col, sample_col, intensity_col) if col not in df.columns]
    if missing:
        raise KeyError(f"Column(s) not found in dataframe: {missing}")

    pivoted = df.pivot_table(
        index=gene_col,
        columns=sample_col,
        values=intensity_col,
        aggfunc='mean'  # in case duplicates exist
    )
    # Transpose so that each row is a sample and each column a protein.
    return pivoted.T

In [16]:
# Raw protein-group panel export: a tab-separated, long-format table.
path = r"C:\Users\neil_\OneDrive\Desktop\New UHN\repo\carotid_plaque_2\data\data_received\Proteograph_Protein_Group_Panel.tsv"
protein_group_np_df = pd.read_csv(path, sep="\t")

# Reshape to a samples x protein-groups matrix using the "Intensities Log10" column.
protein_group_df = pivot_long_proteomics(
    df=protein_group_np_df,
    intensity_col="Intensities Log10",
)

In [17]:
# Display the raw long-format table exactly as read from the TSV.
protein_group_np_df

Unnamed: 0,Sample Name,Plate ID,Protein Group,Intensities Log10,DIA-NN Normalized Intensities Log10,Median Normalized Intensities Log10,Median80 Normalized Intensities Log10,PepCal Intensities Log10,PepCal Batch Intensities Log10,Protein Names,Gene Names,Biological Process,Molecular Function,Cellular Component
0,473627584286,2025us0137,A0A024R4E5,3.785528,3.683357,3.775900,3.706922,3.745368,3.751758,"Isoform of Q00341, High density lipoprotein bi...",HDLBP,,,
1,3497568,2025us0137,A0A024R4E5,4.204998,3.861297,3.968393,3.773923,4.144631,4.151020,"Isoform of Q00341, High density lipoprotein bi...",HDLBP,,,
2,3764247,2025us0137,A0A024R4E5,3.240921,3.127083,3.194525,3.134218,3.206956,3.213346,"Isoform of Q00341, High density lipoprotein bi...",HDLBP,,,
3,46994949,2025us0137,A0A024R4E5,3.121686,3.164418,3.153627,3.113034,3.088550,3.094939,"Isoform of Q00341, High density lipoprotein bi...",HDLBP,,,
4,4377307,2025us0137,A0A024R4E5,3.892703,3.845887,3.939642,3.895633,3.865911,3.872301,"Isoform of Q00341, High density lipoprotein bi...",HDLBP,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1642740,3441929,2025us0136,Q8TCU4;Q8TCU4-2,3.587517,3.663231,3.635887,3.649876,3.670165,3.670165,Centrosome-associated protein ALMS1;Isoform of...,ALMS1;ALMS1,endosomal transport [GO:0016197]; regulation o...,molecular_function [GO:0003674]; microtubule b...,centriole [GO:0005814]; spindle pole [GO:00009...
1642741,3764234,2025us0135,Q8TCU4;Q8TCU4-2,3.791865,4.055126,3.905419,3.988811,3.861354,3.670165,Centrosome-associated protein ALMS1;Isoform of...,ALMS1;ALMS1,endosomal transport [GO:0016197]; regulation o...,molecular_function [GO:0003674]; microtubule b...,centriole [GO:0005814]; spindle pole [GO:00009...
1642742,473627584357,2025us0136,P13535,4.096952,4.056714,4.112647,4.083071,4.167278,4.167278,Myosin-8,MYH8,skeletal muscle contraction [GO:0003009]; prot...,microfilament motor activity [GO:0000146]; cal...,cytoplasm [GO:0005737]; cytosol [GO:0005829]; ...
1642743,3330416,2025us0137,Q9UPS8,4.422133,4.399793,4.442645,4.533094,4.316386,4.084084,Ankyrin repeat domain-containing protein 26,ANKRD26,negative regulation of fat cell differentiatio...,protein binding [GO:0005515],centrosome [GO:0005813]


In [18]:
# Display the pivoted matrix: one row per sample, one column per protein group.
protein_group_df

Protein Group,A0A024R4E5,A0A024R6I7;A0A0G2JRN3,A0A067XG54;A0A804HIW2,A0A075B6H7,A0A075B6H9,A0A075B6I0,A0A075B6I1,A0A075B6I9,A0A075B6J1,A0A075B6J6,...,Q9Y6X5,Q9Y6Y0,Q9Y6Y8,Q9Y6Y9,Q9Y6Z7,R4GMX3,V9GYE7,V9GYJ8,X5CMH5,X6R8S9
Sample Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1171547,3.223473,4.599702,4.539886,4.583185,4.201430,4.557852,4.185262,4.199298,,,...,3.324891,,3.444025,3.648374,5.335976,,4.401444,4.737867,,
1293728,3.447194,3.802972,4.435182,4.432298,4.256502,4.200245,,4.174031,,,...,3.975678,3.474205,3.815025,3.689304,5.262773,,4.149881,4.717541,,
1354335,3.109029,4.150103,4.651963,4.668380,4.056409,4.435465,4.541784,4.456149,3.639617,,...,3.787213,,3.577760,3.935054,5.563267,,4.111441,5.220491,4.309879,
1529876,3.699543,4.470080,4.648747,4.649616,4.010497,3.905307,4.060533,4.439933,,,...,4.195095,,3.821646,3.415045,5.373434,,4.397660,4.597698,,
1571611,3.785136,5.678896,4.362492,5.061246,4.872729,3.993571,4.033218,4.314890,3.866061,,...,4.378670,3.143008,4.267123,3.820691,5.080411,,4.012736,4.428084,4.508046,3.084583
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
473627545420,3.447698,4.214774,4.458441,5.065181,4.066853,3.926851,3.816284,4.232220,,,...,4.187608,3.256368,3.839617,3.861246,5.278037,,4.299215,4.750153,4.208793,
473627584286,3.785528,5.761459,4.715582,4.621158,4.413726,4.753044,3.790723,4.638473,,,...,4.092650,3.878624,4.125953,3.438122,5.385823,,4.265580,4.773798,4.363936,
473627584357,4.024324,3.497897,4.523099,4.888786,4.227338,4.634991,3.900255,4.448840,,,...,3.962559,3.891428,4.016360,3.496675,5.339089,,4.160171,4.888978,,
473627624600,,3.967460,4.766132,4.696746,4.435573,4.314108,4.293127,4.478214,,,...,4.091906,,4.003232,3.530528,5.495092,,4.218223,4.682849,,


## Load Clinical

In [19]:
# Sample description sheet (comma-separated); rows are indexed by "Sample Name".
path = r"C:\Users\neil_\OneDrive\Desktop\New UHN\repo\carotid_plaque_2\data\data_received\2025US-1242-06_2025_sample_description.csv"
clinical_df = pd.read_csv(
    path,
    sep=",",
    index_col="Sample Name",
)
clinical_df


Unnamed: 0_level_0,Sample ID,Cuatom_Status,Sample Type,Species,Condition,Description,Custom_Sex,Custom_Age,Diseaseseverity,Custom_Hypertension,Custom_Dyslipidemia,Custom_Active smoker,Custom_Ever smoker,Custom_DM
Sample Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
3316837,3316837,ND,Plasma,Human,1-C,Human,M,77,,YES,YES,NO,NO,YES
473627746005,473627746005,D,Plasma,Human,2-SC,Human,M,79,Low,NO,YES,NO,YES,NO
3478950,3478950,ND,Plasma,Human,1-C,Human,F,54,,NO,NO,NO,YES,NO
3670126,3670126,D,Plasma,Human,2-SC,Human,M,69,Low,YES,NO,NO,NO,YES
3440510,3440510,ND,Plasma,Human,1-C,Human,M,59,,YES,NO,NO,NO,NO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3789227,3789227,D,Plasma,Human,4-S,Human,M,74,Severe,YES,YES,YES,YES,NO
3563073,3563073,D,Plasma,Human,3-A,Human,M,73,Moderate,NO,NO,NO,YES,NO
3343133,3343133,D,Plasma,Human,3-A,Human,M,66,Moderate,YES,YES,NO,YES,NO
3432353,3432353,D,Plasma,Human,3-A,Human,M,75,Severe,YES,YES,NO,YES,NO


In [20]:
# Print the distinct non-null values of each clinical column that has
# between 2 and 9 of them; columns with fewer or more are skipped.
for var in clinical_df.columns:
    distinct_values = clinical_df[var].dropna().unique()
    n_distinct = len(distinct_values)
    if 1 < n_distinct < 10:
        print(f"{var}: {distinct_values}")

Cuatom_Status: ['ND' 'D']
Condition: ['1-C' '2-SC' '3-A' '4-S']
Custom_Sex: ['M' 'F']
Diseaseseverity: ['Low' 'Severe' 'Moderate-severe' 'Moderate' 'Mild-moderate' 'Mild']
Custom_Hypertension: ['YES' 'NO' 'UNKNOWN']
Custom_Dyslipidemia: ['YES' 'NO' 'UNKNOWN' '????']
Custom_Active smoker: ['NO' 'YES' 'UNKNOWN']
Custom_Ever smoker: ['NO' 'YES' 'UNKNOWN']
Custom_DM: ['YES' 'NO' 'YES ' 'UNKNOWN']


In [21]:
# Lookup from protein-group ID to its "Gene Names" entry. Protein groups repeat
# across rows of the long table; the value from the last row of each group wins.
protgroup_gene_map = (
    protein_group_np_df[["Protein Group", "Gene Names"]]
    .set_index("Protein Group")["Gene Names"]
    .to_dict()
)

In [22]:
# Save protgroup_gene_map, clinical_df, and protein_group_df to the current
# working directory. Each filename is defined once so the write calls and the
# printed summary cannot drift apart.
clinical_csv_path = 'clinical_df.csv'
protein_group_csv_path = 'protein_group_df.csv'
gene_map_json_path = 'protgroup_gene_map.json'

# Save DataFrames as CSV files (the index is written as the first column)
clinical_df.to_csv(clinical_csv_path)
protein_group_df.to_csv(protein_group_csv_path)

# Save protgroup_gene_map as JSON
# NOTE(review): if any "Gene Names" value is missing (NaN), json.dump writes it
# as the non-standard token NaN -- confirm downstream readers accept that.
with open(gene_map_json_path, 'w', encoding='utf-8') as f:
    json.dump(protgroup_gene_map, f, indent=2)

print("Saved files:")
for saved_path in (clinical_csv_path, protein_group_csv_path, gene_map_json_path):
    print(f"- {saved_path}")




Saved files:
- clinical_df.csv
- protein_group_df.csv
- protgroup_gene_map.json
