Identifying Common Proteins Between the Different Datasets

In [18]:
# Import the packages we may need
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [19]:
# Import the data

Petrera_df=pd.read_excel("Petrera_JProteome_S2.xlsx")
# Petrera data set source: https://pubs.acs.org/doi/full/10.1021/acs.jproteome.0c00641?casa_token=jmZDPVZOvegAAAAA%3ARv_oH-9X2AfOxbH826lXOOUjBr8xhagsxlecoH2jrUE_aaJUsM1bINZ_g4RxtZSuMI0B1D3th1VilHUy8w

Gonzalez_df=pd.read_excel("Gonzales_CancerRes_TableS2.xlsx")
# Gonzalez data set source: https://aacrjournals.org/cancerrescommun/article/4/7/1726/746418

Gao_df= pd.read_excel("Gao_JImmunoTher_Supp.xlsx")
# Gao data set source: https://pmc.ncbi.nlm.nih.gov/articles/PMC10836376/

Alvez_df = pd.read_excel("pancancer_olink_data_biostudies_v2.xlsx")
# Alvez data set source: https://pmc.ncbi.nlm.nih.gov/articles/PMC10354027/

In [14]:
# Description of papers
# Petrera measured 728 plasma proteins in 173 human plasma samples from a Southern German population-based cohort
# Gonzalez measured 92 plasma proteins in 54 patients with refractory or relapsed hodgkins lymphoma
# Gao measured 92 plasma proteins in 91 patients with esophageal cancer before and after ICI treatment
# Alvez measured 1463 proteins in 1477 patients from twelve cancer types, including acute myeloid leukemia (AML) (n = 50), 
# chronic lymphocytic leukemia (CLL) (n = 48), diffuse large B-cell lymphoma (DLBCL) (n = 55), myeloma (n = 38), 
# colorectal cancer (n = 221), lung cancer (n = 268), glioma (n = 145), breast cancer (n = 152), cervical cancer (n = 102), 
# endometrial cancer (n = 101), ovarian cancer (n = 134), and prostate cancer (n = 163).

In [15]:
# Separate the dfs based on olinkID and uniprot IDs

Petrera_df_olinkID=Petrera_df['OlinkID']
Petrera_df_uniprot= Petrera_df['UniprotID']

Gonzalez_df_olinkID=Gonzalez_df['OlinkID']
Gonzalez_df_uniprot = Gonzalez_df['UniprotID']

Gao_df_olinkID=Gao_df['OlinkID']
Gao_df_uniprot=Gao_df['UniprotID']

Alvez_df_olinkID=Alvez_df['OlinkID']
Alvez_df_uniprot=Alvez_df['UniProt']


In [16]:
# Find the list of uniprot IDs that are in common for the 4 papers of interest
incommon=list(set(Alvez_df_uniprot) & set(Petrera_df_uniprot) & set(Gao_df_uniprot) & set(Gonzalez_df_uniprot))
len(incommon)

49

In [17]:
print(incommon) # prints the uniprot IDs that are in common between the different datasets

['Q14116', 'P49763', 'P01127', 'O95727', 'P22301', 'Q9UQV4', 'P09382', 'P10147', 'P09601', 'O00182', 'P10747', 'P18627', 'P05231', 'Q13241', 'P12544', 'Q9NP84', 'Q9BQ51', 'P50591', 'Q01151', 'P09237', 'P09038', 'Q02763', 'P14210', 'Q16790', 'P35968', 'P43489', 'P10144', 'P42701', 'Q8WXI7', 'O76036', 'P01133', 'P39900', 'P48023', 'P01730', 'P09341', 'Q15389', 'P05113', 'P26842', 'P32970', 'O75144', 'P48061', 'Q92583', 'P07585', 'O75509', 'P20718', 'O43927', 'P29965', 'P13500', 'P15692']
