In [1]:
"""
Perform PCA on the transcriptome data using sklearn's standardization (StandardScaler) and PCA implementations.
Create a CSV, "95_percent_var_PCs.csv", containing the principal components (PCs) that account for 95% of the variation in the microarray data.

"""

import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from matplotlib import pyplot as plt


In [2]:
# Load the expression matrix from the Excel workbook in the working directory.
expression_df = pd.read_excel(io="GSE132903_Matrix_Normalized.xlsx")

In [3]:
# Preview the raw layout. As loaded, each row is a probe and each sample is a
# column; PCA needs the probes (features) in the columns instead.
expression_df.head(n=5)

Unnamed: 0,ID_REF,Illumina probe name,ND_1_08-81,ND_2_08-85,ND_4_08-72,ND_5_08-83,ND_6_97-53,ND_7_98-19,ND_8_97-46,ND_9_97-17,...,ND_14_97-09,ND_20_99-29,ND_15_97-10,ND_21_99-22,ND_16_97-02,ND_22_99-02,ND_17_97-14,ND_23_98-22,ND_18_97-37,ND_24_98-32
0,6450255,ILMN_1762337,6.250487,6.213403,6.072612,6.136687,6.235316,6.186059,6.157642,6.228337,...,6.156161,6.136243,6.095036,6.3015,6.156794,6.211105,6.159308,6.120674,6.1633,6.147267
1,2570615,ILMN_2055271,6.629155,7.811384,6.728518,6.440821,6.290671,6.52192,6.481466,6.490025,...,6.263134,6.281289,6.362918,6.44063,6.520428,6.301552,6.529516,6.333581,6.309065,6.360013
2,6370619,ILMN_1736007,6.237866,6.055326,6.234923,6.218833,6.210193,6.180704,6.236182,6.261672,...,6.279721,6.212151,6.208651,6.120526,6.203077,6.212418,6.191249,6.198287,6.216233,6.247808
3,2600039,ILMN_2383229,6.044737,6.097212,6.055573,6.220438,6.076194,6.144904,6.081975,6.174246,...,6.114173,6.106865,6.143669,6.153107,6.183167,6.060196,6.083065,6.148202,6.098208,6.101815
4,2650615,ILMN_1806310,6.206666,6.363891,6.305701,6.227695,6.278323,6.301026,6.354064,6.294581,...,6.14749,6.208215,6.329623,6.208439,6.185513,6.198625,6.295551,6.129278,6.226315,6.239366


In [4]:
# Remove the ID_REF column (it's redundant w/ the probe name column).
# The column is dropped by name rather than by position so that re-running
# this cell is a no-op: a positional slice (columns[1:]) reassigned to the same
# variable would strip 'Illumina probe name' on a second run and break the
# transpose step that follows. errors="ignore" tolerates ID_REF already being
# absent.
expression_df = expression_df.drop(columns=["ID_REF"], errors="ignore")
expression_df.head()

Unnamed: 0,Illumina probe name,ND_1_08-81,ND_2_08-85,ND_4_08-72,ND_5_08-83,ND_6_97-53,ND_7_98-19,ND_8_97-46,ND_9_97-17,ND_10_97-19,...,ND_14_97-09,ND_20_99-29,ND_15_97-10,ND_21_99-22,ND_16_97-02,ND_22_99-02,ND_17_97-14,ND_23_98-22,ND_18_97-37,ND_24_98-32
0,ILMN_1762337,6.250487,6.213403,6.072612,6.136687,6.235316,6.186059,6.157642,6.228337,6.168465,...,6.156161,6.136243,6.095036,6.3015,6.156794,6.211105,6.159308,6.120674,6.1633,6.147267
1,ILMN_2055271,6.629155,7.811384,6.728518,6.440821,6.290671,6.52192,6.481466,6.490025,6.389241,...,6.263134,6.281289,6.362918,6.44063,6.520428,6.301552,6.529516,6.333581,6.309065,6.360013
2,ILMN_1736007,6.237866,6.055326,6.234923,6.218833,6.210193,6.180704,6.236182,6.261672,6.166677,...,6.279721,6.212151,6.208651,6.120526,6.203077,6.212418,6.191249,6.198287,6.216233,6.247808
3,ILMN_2383229,6.044737,6.097212,6.055573,6.220438,6.076194,6.144904,6.081975,6.174246,6.088439,...,6.114173,6.106865,6.143669,6.153107,6.183167,6.060196,6.083065,6.148202,6.098208,6.101815
4,ILMN_1806310,6.206666,6.363891,6.305701,6.227695,6.278323,6.301026,6.354064,6.294581,6.23099,...,6.14749,6.208215,6.329623,6.208439,6.185513,6.198625,6.295551,6.129278,6.226315,6.239366


In [5]:
# Reshape the matrix so that every row is a sample and every column is a probe.
# First, set aside the probe identifiers; they become the column labels.
probe_names = expression_df['Illumina probe name'].to_list()
# Everything after the first column (the probe name column) is a sample column.
cols = list(expression_df.columns[1:])
# Flip the sample columns into rows.
sample_rows_df = expression_df[cols].T
# Label the transposed columns with the probe names.
sample_rows_df.columns = probe_names
# Name the row index (the former sample column labels).
sample_rows_df = sample_rows_df.rename_axis("Sample_ ID")
sample_rows_df.head()

Unnamed: 0_level_0,ILMN_1762337,ILMN_2055271,ILMN_1736007,ILMN_2383229,ILMN_1806310,ILMN_1779670,ILMN_1705025,ILMN_1814316,ILMN_2359168,ILMN_1731507,...,ILMN_2348512,ILMN_1743643,ILMN_1794932,ILMN_1723439,ILMN_1656676,ILMN_2371169,ILMN_1701875,ILMN_1786396,ILMN_1653618,ILMN_2137536
Sample_ ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ND_1_08-81,6.250487,6.629155,6.237866,6.044737,6.206666,6.250487,6.12547,8.315913,8.274257,7.187185,...,6.557572,6.596731,6.31729,6.080668,11.360604,7.181341,9.055721,8.856049,7.568357,8.703117
ND_2_08-85,6.213403,7.811384,6.055326,6.097212,6.363891,6.21723,6.38051,7.579899,7.50459,6.361856,...,6.414877,6.300365,6.258881,6.03334,10.674451,7.348808,9.051958,8.534578,7.145082,8.126596
ND_4_08-72,6.072612,6.728518,6.234923,6.055573,6.305701,6.176223,6.163069,8.544403,8.463371,7.535209,...,6.540391,6.544077,6.212729,6.192897,11.323182,7.328814,9.234449,8.738885,8.106824,8.831392
ND_5_08-83,6.136687,6.440821,6.218833,6.220438,6.227695,6.195175,6.055975,8.599468,8.474713,7.534324,...,6.510822,6.756633,6.267512,6.179123,11.275523,7.087884,8.728555,8.626707,8.17642,8.684191
ND_6_97-53,6.235316,6.290671,6.210193,6.076194,6.278323,6.193169,6.202457,8.662215,8.56271,7.650545,...,6.476036,6.535137,6.316976,6.086432,11.262469,7.020743,8.6581,8.456142,8.106194,8.559834


In [6]:
# Standardizing the data
sample_ids = sample_rows_df.index
#This creates a numpy array X with standardized values. Basically a straight up 2-D array version of 
#the sample_row_df dataframe, without the pandas functionality or row labels.
x = StandardScaler().fit_transform(sample_rows_df.values)
# PCA should be run on standardized data
# Performing the PCA. PC generation stops once 95% of variability is accounted for
pca = PCA(0.95)
x = pca.fit_transform(x)
PC_count = pca.n_components_
print(f"{PC_count} priciple components account for 95% of the variability")
# re-create our dataframe from principle components
cols = [f"PC{i+1}" for i in range(PC_count)]
PC_df = pd.DataFrame(x, columns=cols, index=sample_ids)
PC_df 

164 priciple components account for 95% of the variability


Unnamed: 0_level_0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,...,PC155,PC156,PC157,PC158,PC159,PC160,PC161,PC162,PC163,PC164
Sample_ ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ND_1_08-81,57.330124,23.045619,82.483203,22.309241,-29.618373,-51.148577,24.506510,-13.222223,9.207557,-0.947670,...,4.437143,-8.570612,6.801249,-3.342435,-7.021174,3.158304,-4.764394,0.831382,3.143811,3.433116
ND_2_08-85,181.148028,165.044855,62.972181,-39.871016,-74.609379,67.399473,27.826739,1.780192,6.113208,-23.799985,...,-0.467337,-3.103248,0.306505,0.474706,1.960845,2.147851,1.405745,-3.654829,6.401186,-7.401313
ND_4_08-72,47.619128,23.089666,44.311498,23.622168,-49.021438,-33.640211,58.885460,2.775350,4.131780,-1.484267,...,4.198575,-0.913238,9.335808,-7.566890,-5.904901,-3.262774,2.346872,-7.843685,-9.204211,-3.122260
ND_5_08-83,-35.013961,-6.461728,67.779944,27.184108,-22.621570,-16.537329,6.711061,-22.186922,21.166937,1.348871,...,-28.498039,-21.258731,-6.117294,23.857926,28.749086,8.015407,19.269738,-2.339070,8.811300,28.575758
ND_6_97-53,-102.428761,-24.653259,45.106721,12.187463,-22.791963,31.930690,-19.002523,-20.654506,15.154517,3.584463,...,14.638042,-16.201469,-1.805665,-0.386177,22.384590,8.371111,8.532146,7.681583,-3.736170,-0.836166
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ND_22_99-02,-85.321295,28.841297,-21.512914,6.847360,44.395997,6.001832,-29.714869,5.508041,12.910735,-18.607705,...,-1.652723,15.111913,11.646395,-2.100308,-4.675291,9.745008,-8.036022,13.463041,20.275203,39.173895
ND_17_97-14,14.738886,45.958683,-22.858915,20.325361,44.151650,-48.891768,-33.023873,-3.932280,3.311100,-26.770617,...,-28.698568,7.951479,-7.557061,3.791076,-9.401843,26.491233,-0.003258,-7.008191,-0.370877,-18.030035
ND_23_98-22,-39.301522,59.856340,9.562294,-11.982849,85.945696,-50.328740,3.644226,12.912212,4.155163,-11.715315,...,5.067764,0.241010,-4.112621,0.961365,-1.251713,-0.255466,-1.167985,0.045679,0.165858,-0.245710
ND_18_97-37,-121.218331,3.205736,51.236828,20.505798,44.982516,31.690640,-17.111978,7.342242,20.322962,-28.626948,...,-11.720260,-22.575734,-2.151127,0.763352,6.869325,-34.008305,9.855683,-4.996336,-12.858077,-26.652143


In [7]:
# Build a "target" column holding each sample's class label:
# non-diseased (ND) or Alzheimer's disease (AD).
# A sample is labelled ND when its ID starts with 'ND'; every other sample is
# labelled AD.
is_ND = PC_df.index.str.startswith('ND').tolist()
target = pd.Series(
    ["ND" if flag else "AD" for flag in is_ND],
    index=PC_df.index,
    name="target",
)
# Append the labels as the last column alongside the principal components.
data = pd.concat([PC_df, target], axis=1)
data

Unnamed: 0_level_0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,...,PC156,PC157,PC158,PC159,PC160,PC161,PC162,PC163,PC164,target
Sample_ ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ND_1_08-81,57.330124,23.045619,82.483203,22.309241,-29.618373,-51.148577,24.506510,-13.222223,9.207557,-0.947670,...,-8.570612,6.801249,-3.342435,-7.021174,3.158304,-4.764394,0.831382,3.143811,3.433116,ND
ND_2_08-85,181.148028,165.044855,62.972181,-39.871016,-74.609379,67.399473,27.826739,1.780192,6.113208,-23.799985,...,-3.103248,0.306505,0.474706,1.960845,2.147851,1.405745,-3.654829,6.401186,-7.401313,ND
ND_4_08-72,47.619128,23.089666,44.311498,23.622168,-49.021438,-33.640211,58.885460,2.775350,4.131780,-1.484267,...,-0.913238,9.335808,-7.566890,-5.904901,-3.262774,2.346872,-7.843685,-9.204211,-3.122260,ND
ND_5_08-83,-35.013961,-6.461728,67.779944,27.184108,-22.621570,-16.537329,6.711061,-22.186922,21.166937,1.348871,...,-21.258731,-6.117294,23.857926,28.749086,8.015407,19.269738,-2.339070,8.811300,28.575758,ND
ND_6_97-53,-102.428761,-24.653259,45.106721,12.187463,-22.791963,31.930690,-19.002523,-20.654506,15.154517,3.584463,...,-16.201469,-1.805665,-0.386177,22.384590,8.371111,8.532146,7.681583,-3.736170,-0.836166,ND
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ND_22_99-02,-85.321295,28.841297,-21.512914,6.847360,44.395997,6.001832,-29.714869,5.508041,12.910735,-18.607705,...,15.111913,11.646395,-2.100308,-4.675291,9.745008,-8.036022,13.463041,20.275203,39.173895,ND
ND_17_97-14,14.738886,45.958683,-22.858915,20.325361,44.151650,-48.891768,-33.023873,-3.932280,3.311100,-26.770617,...,7.951479,-7.557061,3.791076,-9.401843,26.491233,-0.003258,-7.008191,-0.370877,-18.030035,ND
ND_23_98-22,-39.301522,59.856340,9.562294,-11.982849,85.945696,-50.328740,3.644226,12.912212,4.155163,-11.715315,...,0.241010,-4.112621,0.961365,-1.251713,-0.255466,-1.167985,0.045679,0.165858,-0.245710,ND
ND_18_97-37,-121.218331,3.205736,51.236828,20.505798,44.982516,31.690640,-17.111978,7.342242,20.322962,-28.626948,...,-22.575734,-2.151127,0.763352,6.869325,-34.008305,9.855683,-4.996336,-12.858077,-26.652143,ND


In [8]:
# Write the principal-component scores to a CSV for use in training/testing
# models. The sample IDs are written as the index column.
# NOTE(review): this saves PC_df, not `data`, so the "target" label column is
# not in the file -- confirm that is intended.
PC_df.to_csv(path_or_buf="95_percent_var_PCs.csv")