In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.io
import plotly.express as px

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [3]:
# The dataset includes the gene expression values of five
# types of cancer: lung adenocarcinoma (LUAD),
# breast invasive carcinoma (BRCA), kidney renal clear cell
# carcinoma (KIRC), lung squamous cell carcinoma (LUSC),
# and uterine corpus endometrial carcinoma (UCEC).

# https://data.mendeley.com/datasets/sf5n64hydt/1
# https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GPL20573

In [4]:
# Path to the MATLAB file that holds the expression matrix, the cancer-type
# labels and the gene ids. Named `mat_path` rather than `dir` so the built-in
# dir() is not shadowed for the rest of the kernel session.
mat_path = os.path.join('data', 'cancer types.mat')
# loadmat returns a dict that maps each stored variable name to its array.
mat = scipy.io.loadmat(mat_path)
mat

{'__header__': b'MATLAB 5.0 MAT-file, Platform: PCWIN64, Created on: Tue Aug 01 17:46:29 2017',
 '__version__': '1.0',
 '__globals__': [],
 'cancerTypes': array([[array(['BRCA'], dtype='<U4')],
        [array(['BRCA'], dtype='<U4')],
        [array(['BRCA'], dtype='<U4')],
        ...,
        [array(['UCEC'], dtype='<U4')],
        [array(['UCEC'], dtype='<U4')],
        [array(['UCEC'], dtype='<U4')]], dtype=object),
 'data': array([[44.02354236,  9.21628619, 11.31907844, ...,  4.2956195 ,
          8.76876844,  1.        ],
        [29.74615746,  9.7656002 , 40.540128  , ...,  3.81901941,
          5.75850117,  1.        ],
        [35.79931536,  9.88478062,  3.886043  , ...,  4.38238737,
          5.30617722,  1.        ],
        ...,
        [14.66541424, 20.19564581,  6.7034771 , ...,  1.83021579,
          2.35529186,  5.        ],
        [31.11602192, 14.56206615,  9.12158501, ...,  1.17613142,
          1.89129728,  5.        ],
        [74.04662769, 13.00396207,  9.47227364

In [5]:
# List the variables stored in the .mat file, one per line.
# Iterating a dict yields its keys, so it can be joined directly.
print('\n'.join(mat))

__header__
__version__
__globals__
cancerTypes
data
geneIds


In [6]:
# Cancer-type label for each row (sample) of the expression data;
# the shape shows how many labelled samples the file holds.
mat['cancerTypes'].shape

(2086, 1)

In [7]:
# loadmat wraps every label in nested arrays: take the single column first,
# then unwrap the one string stored inside each cell.
label_cells = mat['cancerTypes'][:, 0]
cancerTypes = [cell[0] for cell in label_cells]
cancerTypes[:5]

['BRCA', 'BRCA', 'BRCA', 'BRCA', 'BRCA']

In [8]:
# Gene expression matrix, one row per sample. It carries one more column than
# there are gene ids; a later cell treats that last column as the cancer-type
# code and slices it off.
mat['data'].shape

(2086, 972)

In [9]:
# Gene names, stored as a single row with one entry per gene.
mat['geneIds'].shape

(1, 971)

In [10]:
# Keep only the gene-expression columns. The trailing column of mat['data']
# holds the cancer-type code, so the number of columns to keep is taken from
# the gene-id list instead of being hard-coded as 971.
n_genes = mat['geneIds'].shape[1]
data = mat['data'][:, :n_genes]
data.shape

(2086, 971)

In [11]:
# Unwrap each gene id from its one-element array to get a flat list of names.
gene_id_cells = mat['geneIds'][0]
genes = [cell[0] for cell in gene_id_cells]
genes[:5]

['AARS', 'ABCB6', 'ABCC5', 'ABCF1', 'ABCF3']

In [12]:
# Column 971 of the raw matrix stores the cancer type as a 1-based numeric
# code; subtract one so the classes start at 0.
encoded_types = mat['data'][:, 971] - 1
np.unique(encoded_types)

array([0., 1., 2., 3., 4.])

In [13]:
# Sanity check: one encoded label per sample row.
encoded_types.shape

(2086,)

In [14]:
# Expression table: samples as rows, genes as named columns.
df = pd.DataFrame(data, columns=genes)
df

Unnamed: 0,AARS,ABCB6,ABCC5,ABCF1,ABCF3,ABHD4,ABHD6,ABL1,ACAA1,ACAT2,...,ZMIZ1,ZMYM2,ZNF131,ZNF274,ZNF318,ZNF395,ZNF451,ZNF586,ZNF589,ZW10
0,44.023542,9.216286,11.319078,33.215176,16.901427,9.031338,1.109961,20.017821,16.724363,10.494192,...,29.275809,21.706486,16.315579,4.224009,8.602081,23.762341,8.302416,1.408731,4.295620,8.768768
1,29.746157,9.765600,40.540128,30.169134,20.047393,32.237287,2.460624,17.029112,28.346167,17.017284,...,65.896789,12.815215,10.150965,8.914809,6.797915,15.379187,11.420690,6.599729,3.819019,5.758501
2,35.799315,9.884781,3.886043,29.984211,17.135946,21.273727,1.501203,20.598204,25.855152,12.275738,...,44.571276,14.344729,11.224647,7.870991,7.724003,25.762396,8.628786,4.104879,4.382387,5.306177
3,26.490401,7.085828,10.804003,23.482255,17.044085,14.880104,1.299056,14.978582,31.214294,10.015235,...,51.223656,13.660995,9.730124,7.804760,5.030966,8.964868,7.990036,4.251886,3.702483,7.500498
4,27.632466,7.642971,3.670265,16.584843,20.375321,22.174600,1.553541,14.909150,54.435490,13.392213,...,55.927277,16.650019,8.584938,7.485410,5.945771,9.205302,8.761025,4.656969,3.827945,7.939863
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2081,23.758989,15.394219,10.883932,21.691852,20.209391,30.078659,0.978559,13.002383,28.629486,6.387968,...,19.716977,10.900725,4.188131,2.499783,2.422833,10.061337,3.966998,2.323246,42.449235,2.661792
2082,27.990210,28.998590,8.701462,27.579071,29.770012,15.744797,3.759037,13.468529,20.548527,5.557939,...,38.833607,13.409906,8.146828,3.390904,3.621281,18.706152,4.272848,1.199481,3.990134,1.665184
2083,14.665414,20.195646,6.703477,19.648529,12.530305,24.321260,2.263398,9.642926,30.248579,22.157856,...,29.583893,4.310888,7.920039,2.731255,4.168946,4.470333,3.064729,1.341491,1.830216,2.355292
2084,31.116022,14.562066,9.121585,13.831678,15.535040,41.278765,1.044817,8.012867,8.701291,4.777847,...,15.115717,5.303415,3.716170,1.802522,1.466401,4.209371,2.633839,0.908784,1.176131,1.891297


In [15]:
# Feature matrix: the plain numpy values behind the expression frame.
X = df.values
X

array([[44.02354236,  9.21628619, 11.31907844, ...,  1.40873072,
         4.2956195 ,  8.76876844],
       [29.74615746,  9.7656002 , 40.540128  , ...,  6.59972888,
         3.81901941,  5.75850117],
       [35.79931536,  9.88478062,  3.886043  , ...,  4.10487884,
         4.38238737,  5.30617722],
       ...,
       [14.66541424, 20.19564581,  6.7034771 , ...,  1.34149059,
         1.83021579,  2.35529186],
       [31.11602192, 14.56206615,  9.12158501, ...,  0.9087842 ,
         1.17613142,  1.89129728],
       [74.04662769, 13.00396207,  9.47227364, ...,  4.25296664,
         4.33718205,  9.33315169]])

In [16]:
# Target vector: the zero-based numeric cancer-type codes.
y = encoded_types
y

array([0., 0., 0., ..., 4., 4., 4.])

In [17]:
# Standardise every gene column so that no single highly expressed gene
# dominates the projections computed from this matrix.
scaler = StandardScaler()
scaled_X = scaler.fit_transform(X)
scaled_X

array([[ 0.17836082, -0.38885229, -0.25400484, ..., -1.04988521,
         0.24837591,  0.77270934],
       [-0.5282126 , -0.32843441,  0.64026652, ...,  1.12750571,
         0.05306413, -0.16047082],
       [-0.22864791, -0.31532601, -0.481483  , ...,  0.08102801,
         0.28393357, -0.30069084],
       ...,
       [-1.27454343,  0.81874406, -0.39525918, ..., -1.07808943,
        -0.76195199, -1.21546267],
       [-0.46041939,  0.19911865, -0.3212562 , ..., -1.25959035,
        -1.02999721, -1.35930057],
       [ 1.66417313,  0.0277461 , -0.31052384, ...,  0.1431442 ,
         0.26540833,  0.94766764]])

In [18]:
# Project the standardised expression matrix onto its first three principal
# components. random_state is fixed because scikit-learn's automatic solver
# choice can select a randomized SVD for a matrix of this size, which would
# otherwise let the components vary slightly between runs.
pca = PCA(n_components=3, random_state=42)
principal_components = pca.fit_transform(scaled_X)
principal_df = pd.DataFrame(data=principal_components, columns=['PC1', 'PC2', 'PC3'])
principal_df

Unnamed: 0,PC1,PC2,PC3
0,1.944555,-5.781887,-0.076202
1,6.685335,-10.197388,-8.019990
2,8.753383,-3.274183,-9.731205
3,5.971544,0.610894,-9.492820
4,4.992904,-1.638601,-12.268776
...,...,...,...
2081,-2.172339,20.524243,-5.028996
2082,1.122024,17.474742,-3.003930
2083,-8.299643,12.981320,-0.523182
2084,-16.151747,13.746504,-4.113257


In [19]:
# Attach the cancer-type label of every sample to its PCA coordinates.
# The label column is created with its final name, so no rename is needed.
type_column = pd.DataFrame({'cancer_type': cancerTypes})
final_df = pd.concat([principal_df, type_column], axis=1)
final_df

Unnamed: 0,PC1,PC2,PC3,cancer_type
0,1.944555,-5.781887,-0.076202,BRCA
1,6.685335,-10.197388,-8.019990,BRCA
2,8.753383,-3.274183,-9.731205,BRCA
3,5.971544,0.610894,-9.492820,BRCA
4,4.992904,-1.638601,-12.268776,BRCA
...,...,...,...,...
2081,-2.172339,20.524243,-5.028996,UCEC
2082,1.122024,17.474742,-3.003930,UCEC
2083,-8.299643,12.981320,-0.523182,UCEC
2084,-16.151747,13.746504,-4.113257,UCEC


In [20]:
# pca.components_ has one row per component. Pair each row with its component
# name and place the gene names first, giving one table row per gene with its
# loading on every component.
component_names = ['PC1', 'PC2', 'PC3']
loadings = pd.DataFrame({'gene': genes, **dict(zip(component_names, pca.components_))})
loadings

Unnamed: 0,gene,PC1,PC2,PC3
0,AARS,0.038193,0.008083,0.034775
1,ABCB6,0.001169,0.036301,0.033037
2,ABCC5,0.031167,0.002968,0.033554
3,ABCF1,0.057150,0.012712,0.027749
4,ABCF3,0.031732,0.025511,0.026951
...,...,...,...,...
966,ZNF395,-0.048116,-0.030328,0.027157
967,ZNF451,0.028892,-0.061565,0.005646
968,ZNF586,0.035258,-0.025104,-0.045507
969,ZNF589,0.020521,0.022526,-0.032895


In [21]:
# Aggregate score per gene: the plain (signed) sum of its loadings on the
# three components.
# NOTE(review): positive and negative loadings cancel in this sum, so a gene
# with large loadings of opposite sign can score near zero — confirm that a
# signed sum is the intended ranking criterion (vs. absolute or squared loadings).
loadings['sum'] = loadings['PC1'] + loadings['PC2'] + loadings['PC3']
loadings

Unnamed: 0,gene,PC1,PC2,PC3,sum
0,AARS,0.038193,0.008083,0.034775,0.081051
1,ABCB6,0.001169,0.036301,0.033037,0.070507
2,ABCC5,0.031167,0.002968,0.033554,0.067689
3,ABCF1,0.057150,0.012712,0.027749,0.097610
4,ABCF3,0.031732,0.025511,0.026951,0.084193
...,...,...,...,...,...
966,ZNF395,-0.048116,-0.030328,0.027157,-0.051287
967,ZNF451,0.028892,-0.061565,0.005646,-0.027026
968,ZNF586,0.035258,-0.025104,-0.045507,-0.035353
969,ZNF589,0.020521,0.022526,-0.032895,0.010151


In [22]:
# Rank the genes from the highest to the lowest summed loading.
loadings.sort_values('sum', ascending=False)

Unnamed: 0,gene,PC1,PC2,PC3,sum
130,CDC45,0.051073,0.043605,0.060150,0.154828
115,CCNB2,0.059255,0.039169,0.050602,0.149027
633,PLK1,0.056618,0.048097,0.040673,0.145388
60,AURKB,0.042797,0.062401,0.038787,0.143985
126,CDC20,0.046564,0.057282,0.039734,0.143580
...,...,...,...,...,...
286,FAM63A,0.025570,-0.038748,-0.077295,-0.090473
39,ARHGEF12,-0.016270,-0.073907,-0.003623,-0.093800
33,APBB2,0.015465,-0.056570,-0.060082,-0.101186
722,REEP5,0.005292,-0.057269,-0.056504,-0.108482


In [23]:
# Interactive 3-D scatter of the samples in PCA space, one colour per cancer type.
# Marker styling is built up front: small points with a thin dark outline.
marker_outline = dict(width=1, color='DarkSlateGrey')
marker_style = dict(size=5, line=marker_outline)

fig = px.scatter_3d(
    final_df,
    x='PC1',
    y='PC2',
    z='PC3',
    color='cancer_type',
    width=1000,
    height=800,
    opacity=1,
)
fig.update_traces(marker=marker_style)
fig.show()

In [24]:
# Embed the standardised expression matrix in three dimensions with t-SNE.
# t-SNE is stochastic, so random_state is fixed to make the embedding
# repeatable across kernel restarts.
tsne = TSNE(n_components=3, random_state=42).fit_transform(scaled_X)
# The embedding gets its own frame name so the PCA scores held in
# `principal_df` are not overwritten by this cell.
tsne_df = pd.DataFrame(data=tsne, columns=['t-SNE_1', 't-SNE_2', 't-SNE_3'])
# `final_df` is rebound here on purpose: the plotting cell that follows reads
# the t-SNE columns and the cancer-type label from it.
final_df = pd.concat([tsne_df, pd.DataFrame({'cancer_type': cancerTypes})], axis=1)
final_df

Unnamed: 0,t-SNE_1,t-SNE_2,t-SNE_3,cancer_type
0,4.679261,-7.638051,3.367977,BRCA
1,10.229391,-5.269011,-3.974849,BRCA
2,6.664847,-1.421313,-7.457870,BRCA
3,4.638742,-0.692407,-7.130258,BRCA
4,6.067872,1.786575,-9.265186,BRCA
...,...,...,...,...
2081,-0.940870,8.518971,-3.357358,UCEC
2082,-0.854386,9.668539,-0.067360,UCEC
2083,-3.655662,7.822003,-0.684412,UCEC
2084,-6.337566,6.259916,-3.429480,UCEC


In [25]:
# Interactive 3-D scatter of the samples in t-SNE space, one colour per cancer type.
# Marker styling is built up front: small points with a thin dark outline.
marker_outline = dict(width=1, color='DarkSlateGrey')
marker_style = dict(size=5, line=marker_outline)

fig = px.scatter_3d(
    final_df,
    x='t-SNE_1',
    y='t-SNE_2',
    z='t-SNE_3',
    color='cancer_type',
    width=1000,
    height=800,
    opacity=1,
)
fig.update_traces(marker=marker_style)
fig.show()