Shortening the Alvez (Pancancer) Dataset to the 49 Proteins

In [1]:
# Import the packages we may need
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [2]:
# Load the spreadsheet holding the list of proteins in common
Proteinsincommon = pd.read_excel(io="Proteinsincommon.xlsx")

In [3]:
# Load the pancancer dataset from CSV
Pancancer_df = pd.read_csv(filepath_or_buffer="pancancer.csv")

In [4]:
# Preview the first 15 rows of the raw pancancer data
Pancancer_df.head(n=15)

Unnamed: 0,Sample_ID,Cancer,Assay,OlinkID,UniProt,Panel,NPX
0,AML_1,AML,AARSD1,OID21311,Q9BTE6,Oncology,5.01745
1,AML_2,AML,AARSD1,OID21311,Q9BTE6,Oncology,2.8679
2,AML_3,AML,AARSD1,OID21311,Q9BTE6,Oncology,4.58825
3,AML_4,AML,AARSD1,OID21311,Q9BTE6,Oncology,1.97255
4,AML_5,AML,AARSD1,OID21311,Q9BTE6,Oncology,3.45995
5,AML_6,AML,AARSD1,OID21311,Q9BTE6,Oncology,5.74925
6,AML_7,AML,AARSD1,OID21311,Q9BTE6,Oncology,3.0529
7,AML_8,AML,AARSD1,OID21311,Q9BTE6,Oncology,4.1634
8,AML_10,AML,AARSD1,OID21311,Q9BTE6,Oncology,2.6189
9,AML_11,AML,AARSD1,OID21311,Q9BTE6,Oncology,2.4197


In [5]:
# Shorten the dataset to only those 49 proteins of interest.
#
# `Series.isin` treats a DataFrame argument as an iterable, and iterating a
# DataFrame yields its column labels -- not its cell values. Passing
# `Proteinsincommon` directly therefore only works when the UniProt IDs sit in
# the spreadsheet's header row. Make that explicit, and fall back to the cell
# values when the header matches nothing (IDs stored down a column instead).
protein_ids = set(Proteinsincommon.columns)
is_protein_of_interest = Pancancer_df['UniProt'].isin(protein_ids)

if not is_protein_of_interest.any():
    # Header labels matched no UniProt ID: look inside the sheet instead.
    # NaN cells are dropped so that missing UniProt values are never selected.
    protein_ids = set(pd.Series(Proteinsincommon.to_numpy().ravel()).dropna())
    is_protein_of_interest = Pancancer_df['UniProt'].isin(protein_ids)

if not is_protein_of_interest.any():
    # Fail loudly rather than carrying an empty frame into the pivot/export.
    raise ValueError(
        "None of the identifiers in Proteinsincommon match Pancancer_df['UniProt']"
    )

# .copy() makes the subset an independent frame instead of a slice of Pancancer_df
Pancancer_df_shortened = Pancancer_df.loc[is_protein_of_interest].copy()

In [6]:
# Display the shortened dataframe; for a long frame the notebook shows only
# the first and last rows, with the middle elided.
Pancancer_df_shortened

Unnamed: 0,Sample_ID,Cancer,Assay,OlinkID,UniProt,Panel,NPX
2843,AML_1,AML,ANGPT1,OID20740,Q15389,Inflammation,3.87015
2844,AML_2,AML,ANGPT1,OID20740,Q15389,Inflammation,0.55795
2845,AML_3,AML,ANGPT1,OID20740,Q15389,Inflammation,1.30175
2846,AML_4,AML,ANGPT1,OID20740,Q15389,Inflammation,-2.39660
2847,AML_5,AML,ANGPT1,OID20740,Q15389,Inflammation,0.28030
...,...,...,...,...,...,...,...
2076598,PRC_159,PRC,VEGFA,OID20650,P15692,Inflammation,1.60260
2076599,PRC_160,PRC,VEGFA,OID20650,P15692,Inflammation,1.28940
2076600,PRC_161,PRC,VEGFA,OID20650,P15692,Inflammation,1.22800
2076601,PRC_162,PRC,VEGFA,OID20650,P15692,Inflammation,0.64845


In [7]:
# Distinct cancer labels that remain after filtering
set(Pancancer_df_shortened.loc[:, 'Cancer'])

{'AML',
 'BRC',
 'CLL',
 'CRC',
 'CVX',
 'ENDC',
 'GLIOM',
 'LUNGC',
 'LYMPH',
 'MYEL',
 'OVC',
 'PRC'}

In [8]:
# Count the distinct UniProt IDs left in the shortened dataset (expected: 49)
Pancancer_df_shortened['UniProt'].nunique(dropna=False)

49

In [9]:
# Reshape from long to wide: one row per (Sample_ID, Cancer) pair and one
# column of NPX values per UniProt ID.
# Note: pivot_table's default aggregation is the mean, so repeated readings of
# the same protein for the same sample are averaged into a single value.
Pancancer_pivot = (
    Pancancer_df_shortened
    .pivot_table(values='NPX', index=['Sample_ID', 'Cancer'], columns='UniProt')
    .reset_index()  # turn Sample_ID / Cancer back into ordinary columns
)
Pancancer_pivot.head(n=15)

UniProt,Sample_ID,Cancer,O00182,O43927,O75144,O75509,O76036,O95727,P01127,P01133,...,Q02763,Q13241,Q14116,Q15389,Q16790,Q8WXI7,Q92583,Q9BQ51,Q9NP84,Q9UQV4
0,AML_1,AML,1.5923,3.1517,-0.4436,0.1274,-0.57045,1.29635,4.97385,5.5042,...,0.85,0.2094,1.0845,3.87015,-1.1462,2.4212,2.04445,0.50115,0.4492,-2.7778
1,AML_10,AML,1.1667,0.3065,0.10625,0.36865,-0.0882,0.1487,0.8392,1.60795,...,0.4762,-0.1125,0.0494,0.23345,0.0032,0.3153,0.83345,-0.12485,-0.0795,1.4669
2,AML_11,AML,1.41675,0.44485,0.0047,0.03025,0.1798,0.5592,1.62395,2.6982,...,0.6146,0.4089,0.16355,1.1175,-0.35315,-1.6681,0.38805,0.02615,-0.16465,1.0576
3,AML_12,AML,0.98105,1.0981,0.3438,0.32815,-0.13315,,-3.2107,-1.52715,...,0.76085,0.85425,0.0316,-1.28095,-0.07265,-0.44885,-0.62225,0.1993,-0.07115,-0.5041
4,AML_13,AML,2.1388,3.8005,-0.4592,0.0457,-0.4247,-0.94935,3.5168,4.11555,...,0.53765,-0.26875,1.0281,1.89,-0.09185,-0.482,0.88205,0.85705,0.5448,0.5192
5,AML_14,AML,4.5651,1.2373,1.2381,1.2057,-0.11685,-0.1653,0.7772,2.2749,...,1.29115,-0.83805,1.1933,1.31915,1.1544,0.34115,0.31695,0.2088,2.0033,-0.0272
6,AML_15,AML,3.17215,0.3957,-0.51475,1.18285,1.5708,1.2836,-0.52645,0.25475,...,0.6901,1.3189,1.73905,0.7526,-0.475,1.3278,-0.37605,0.25185,0.4402,0.5736
7,AML_16,AML,1.33105,0.4054,0.06235,1.0402,0.11895,2.03875,-3.72715,-1.9597,...,0.56625,0.84855,1.57205,-2.9437,-0.12315,1.1097,-0.37655,0.6684,0.3244,0.083
8,AML_17,AML,3.26195,5.1031,,1.71945,1.86235,,1.65395,2.776,...,0.2785,2.1178,0.08635,1.6174,0.8036,,7.30745,,0.89465,1.6354
9,AML_18,AML,3.37575,1.41015,0.3999,1.01035,0.7239,0.8102,-2.32635,-0.7101,...,0.9314,1.5402,2.40685,-1.3081,1.35865,1.2928,-2.10125,0.86385,2.26055,1.0129


In [23]:
# Write the wide, shortened table to disk, leaving out the row index
Pancancer_pivot.to_csv(path_or_buf='pancancer_cleaned.csv', index=False)