Shortening the Gao (Esophageal Cancer) Pretreatment Data to the 49 Proteins in Common

In [1]:
# Import the packages we may need
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [2]:
# Load the "proteins in common" workbook; its column labels are used
# further down to select the matching columns from the Gao data.
Proteinsincommon = pd.read_excel(
    io="DataCleaning_and_ExploratoryAnalysis/Proteinsincommon.xlsx",
)

In [3]:
# Load the Gao pretreatment measurements into a DataFrame.
# Data set source: https://pmc.ncbi.nlm.nih.gov/articles/PMC10836376/
Gao_pretreatment_df = pd.read_csv(
    filepath_or_buffer="DataCleaning_and_ExploratoryAnalysis/Gao_pretreatment.csv",
)

In [4]:
# Preview the first rows (up to 15) of the raw Gao table
Gao_pretreatment_df.head(n=15)

Unnamed: 0,sample_id,P10145,Q07011,Q02763,P80098,P29965,P01583,Q9BZW8,P01133,Q15389,...,P35225,P78556,P01375,Q13241,P10144,Q01151,"P29459,P29460",P09603,Unnamed: 93,Unnamed: 94
0,R21043026,4.89898,6.01419,6.73672,1.35202,5.95495,-2.79663,5.65495,8.67672,7.897,...,0.41607,6.12544,2.12797,5.33351,5.92401,2.47693,5.64796,9.40197,plate1_1362562541.csv,Pass
1,R21043028,6.10887,5.49825,7.37053,1.29614,7.98518,-2.6627,6.26213,10.21614,9.16064,...,-0.00857,9.15514,2.46813,5.66605,5.49523,2.39335,6.81496,9.82524,plate2_1362562385.csv,Pass
2,R21043030,4.44095,5.60867,7.59944,0.94615,4.94,-2.74069,5.82213,7.39997,6.78645,...,-0.12533,6.16322,2.61357,5.52813,3.98589,2.71992,6.83475,9.16964,plate1_1362562541.csv,Pass
3,R21043032,7.6427,5.4599,7.20598,2.49907,8.19881,-2.09092,5.98716,10.72494,9.73018,...,0.72216,9.18583,2.71404,5.86032,5.50904,1.80735,6.23084,9.63688,plate1_1362562541.csv,Pass
4,R21043034,7.26992,6.62226,7.10424,2.59908,8.9658,-0.06198,6.94189,11.08722,9.84664,...,0.18909,8.48542,3.57668,6.36265,6.26009,2.79629,7.62235,9.78305,plate2_1362562385.csv,Pass
5,R21043036,5.35178,5.86844,6.61774,1.75922,7.56376,-2.3027,5.60631,9.47385,8.19895,...,0.90151,7.61078,2.43857,4.89465,4.93348,2.57071,6.03402,9.37822,plate2_1362562385.csv,Pass
6,R21043038,6.14246,6.00516,6.94127,1.78955,2.79903,-2.18047,5.08345,5.25143,5.58798,...,0.14716,7.464,2.65736,5.39196,3.63704,2.23685,6.24256,9.36871,plate2_1362562385.csv,Pass
7,R21043040,5.59958,4.92705,6.78342,1.44002,6.91034,-1.34021,5.57711,9.07897,8.46699,...,0.96647,6.35105,2.37421,5.09528,4.53309,2.19132,6.16382,9.34945,plate2_1362562385.csv,Pass
8,R21043042,5.09819,5.37177,7.27319,1.94442,7.21094,-2.56134,5.88887,9.2983,9.10919,...,-0.13798,5.82493,2.15704,5.25893,4.00356,2.58802,6.37464,8.98988,plate2_1362562385.csv,Pass
9,R21043044,4.96994,5.54274,7.42369,1.16354,7.43981,-2.50504,5.95695,9.24985,8.53798,...,0.7167,7.06332,2.50351,5.24105,5.11205,2.34324,5.92109,9.19859,plate2_1362562385.csv,Pass


In [5]:
# Keep only sample_id plus the columns named by the Proteinsincommon column
# labels (the 49 proteins of interest); every other raw column is left out.
# NOTE(review): that presumably includes the trailing "Unnamed: 93" /
# "Unnamed: 94" columns seen in the preview — confirm they are not needed.
Gao_pretreatment_df_shortened = Gao_pretreatment_df[
    ["sample_id", *Proteinsincommon.columns]
]

In [6]:
# Preview the first rows (up to 15) of the shortened table
Gao_pretreatment_df_shortened.head(n=15)

Unnamed: 0,sample_id,P10144,O75509,P09038,O75144,P12544,P18627,P15692,Q9UQV4,P09382,...,P01730,Q02763,Q01151,P49763,Q14116,O95727,Q9NP84,P48061,Q8WXI7,P43489
0,R21043026,5.92401,7.44433,2.3817,4.98006,6.28307,1.96712,6.61329,4.53648,6.47513,...,3.36083,6.73672,2.47693,7.18993,8.07464,4.98924,3.68838,0.96529,0.30757,4.82277
1,R21043028,5.49523,7.01627,3.76988,4.79317,6.69784,1.55523,8.45701,4.58424,6.75024,...,3.36096,7.37053,2.39335,7.04327,8.52729,5.14125,3.16032,1.1338,-0.01885,4.93184
2,R21043030,3.98589,7.27124,1.44624,5.18132,6.13571,1.92803,6.92434,4.7286,6.64692,...,3.11736,7.59944,2.71992,7.05853,7.8727,5.0295,3.49077,0.80583,-0.34209,5.5662
3,R21043032,5.50904,6.56424,3.77024,4.28462,6.73423,1.8269,7.94358,3.23996,6.85275,...,3.2859,7.20598,1.80735,6.98395,7.94973,5.04637,3.1926,1.26882,0.59406,5.16836
4,R21043034,6.26009,7.29889,4.90694,5.18895,7.59606,2.41357,10.14572,5.64368,7.19276,...,4.21606,7.10424,2.79629,7.68547,9.2099,6.14203,3.87951,1.04312,1.76947,6.02129
5,R21043036,4.93348,6.93244,3.67537,4.79652,6.10295,1.61246,8.24951,3.20451,6.81832,...,3.06623,6.61774,2.57071,7.04752,7.74294,4.96999,3.71103,0.90291,-0.10825,4.73799
6,R21043038,3.63704,7.04549,0.87948,4.74908,6.10655,1.54659,7.4854,4.81166,6.26755,...,3.47003,6.94127,2.23685,7.21077,8.03204,5.08869,3.58088,1.02229,1.10607,4.89466
7,R21043040,4.53309,6.72454,2.92216,4.6228,6.31824,1.55509,8.30347,4.0925,6.70359,...,3.21781,6.78342,2.19132,6.50469,7.59073,4.75515,3.14225,0.7879,0.89192,4.76504
8,R21043042,4.00356,7.22066,1.95481,4.69552,5.82734,1.892,8.44715,5.62985,6.56869,...,2.84354,7.27319,2.58802,6.74611,8.77556,4.7707,3.41334,0.84785,1.27568,5.33802
9,R21043044,5.11205,7.11492,2.33451,4.98824,6.16728,1.89634,8.06577,3.39461,6.60876,...,3.11258,7.42369,2.34324,6.8047,7.35493,5.01268,3.88897,1.19893,0.27582,4.70439


In [62]:
# Write the shortened table to CSV, leaving the row index out of the file
Gao_pretreatment_df_shortened.to_csv(
    path_or_buf='DataCleaning_and_ExploratoryAnalysis/Gao_cleaned.csv',
    index=False,
)