In [155]:
import cptac

cptac.list_datasets()

Unnamed: 0_level_0,Description,Data reuse status,Publication link
Dataset name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Brca,breast cancer,no restrictions,https://pubmed.ncbi.nlm.nih.gov/33212010/
Ccrcc,clear cell renal cell carcinoma (kidney),no restrictions,https://pubmed.ncbi.nlm.nih.gov/31675502/
Colon,colorectal cancer,no restrictions,https://pubmed.ncbi.nlm.nih.gov/31031003/
Endometrial,endometrial carcinoma (uterine),no restrictions,https://pubmed.ncbi.nlm.nih.gov/32059776/
Gbm,glioblastoma,no restrictions,https://pubmed.ncbi.nlm.nih.gov/33577785/
Hnscc,head and neck squamous cell carcinoma,no restrictions,https://pubmed.ncbi.nlm.nih.gov/33417831/
Lscc,lung squamous cell carcinoma,no restrictions,https://pubmed.ncbi.nlm.nih.gov/34358469/
Luad,lung adenocarcinoma,no restrictions,https://pubmed.ncbi.nlm.nih.gov/32649874/
Ovarian,high grade serous ovarian cancer,no restrictions,https://pubmed.ncbi.nlm.nih.gov/27372738/
Pdac,pancreatic ductal adenocarcinoma,no restrictions,https://pubmed.ncbi.nlm.nih.gov/34534465/


In [156]:
cptac.download(dataset='Ccrcc')

ccrcc = cptac.Ccrcc()

                                          

In [157]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [158]:
# 1. Find the top 5 most differentially expressed proteins between Stage I and Stage III patients

clinical_data = ccrcc.get_clinical()
rna_data = ccrcc.get_transcriptomics()
protein_data = ccrcc.get_proteomics()

protein_data.columns = protein_data.columns.get_level_values(0) 
protein_data = protein_data.dropna(axis=1)

tumor_mask = clinical_data.loc[:, "Sample_Tumor_Normal"] == "Tumor"
clinical_data = clinical_data.loc[tumor_mask, :]

stage_I_mask = clinical_data.loc[:, "tumor_stage_pathological"] == "Stage I"
stage_III_mask = clinical_data.loc[:, "tumor_stage_pathological"] == "Stage III"

stage_I_patients = (clinical_data.index[stage_I_mask]).dropna()
stage_III_patients = (clinical_data.index[stage_III_mask]).dropna()


In [168]:
data = [stage_I_patients.values, stage_III_patients.values]

protein_stage_I = protein_data.loc[stage_I_patients, :]
protein_stage_III = protein_data.loc[stage_III_patients, :]

protein_diff = protein_stage_I.mean() - protein_stage_III.mean()
protein_diff = protein_diff.abs()
protein_diff = protein_diff.sort_values()
protein_data = protein_data.loc[tumor_mask, :]

protein_list = ["FTL", "HBZ", "HBA2", "CMA1", "HBB"]

top_5_protein = protein_data.loc[:, protein_list]


In [169]:
protein_diff

Name
ACADM      0.000011
SMCHD1     0.000045
ZC3H11A    0.000054
EXOSC3     0.000071
NAXE       0.000083
             ...   
HBB        0.557303
CMA1       0.583464
HBA2       0.589100
HBZ        0.603453
FTL        0.862165
Length: 6665, dtype: float64

In [171]:
rna_data = np.log2(rna_data).dropna(axis=1)
rna_stage_I = rna_data.loc[stage_I_patients, :]
rna_stage_III = rna_data.loc[stage_III_patients, :]

rna_diff = rna_diff.dropna()
rna_diff = rna_stage_I.mean() - rna_stage_III.mean()
rna_diff = rna_diff.abs()
rna_diff = rna_diff.sort_values()
rna_data = rna_data[tumor_mask]

rna_list = ["CC2D2B", "CCRL2", "TNC", "CBWD7", "PTTG1"]

top_5_rna = rna_data.loc[:, rna_list]
rna_diff


  result = func(self.values, **kwargs)
  rna_data = rna_data[tumor_mask]


Name
SNRPN     0.000003
RBM14     0.000015
PMPCB     0.000017
VPS35     0.000020
SNRPD3    0.000021
            ...   
PTTG1     0.410133
CBWD7     0.410572
TNC       0.422949
CCRL2     0.502770
CC2D2B    0.510390
Length: 9806, dtype: float64

In [172]:
# 2. Create a new dataframe: rows = patients, cols = expression values of genes
top_10 = pd.concat([top_5_protein, top_5_rna], axis=1)


In [173]:
# 3. Create a separate list of the patients' cancer stages

top_10["tumor_stage_pathological"] = clinical_data.loc[:, "tumor_stage_pathological"]

top_10

Name,FTL,HBZ,HBA2,CMA1,HBB,CC2D2B,CCRL2,TNC,CBWD7,PTTG1,tumor_stage_pathological
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
C3L-00004,-1.724339,-0.370098,-0.786307,-0.918273,-0.762287,0.175725,0.893170,1.226618,0.410942,1.080103,Stage III
C3L-00010,-0.363228,0.240576,-0.081263,0.747860,-0.034944,0.027322,-0.180820,1.006059,0.288196,0.702747,Stage I
C3L-00011,-0.977364,-0.087641,-0.418663,-0.255054,-0.371128,0.280647,0.151815,0.561198,0.508859,2.292182,Stage IV
C3L-00026,1.301930,0.981930,0.780756,0.049086,0.855097,0.929820,0.144756,1.776796,0.672007,1.047153,Stage I
C3L-00079,-1.496648,-0.441854,-1.014088,-0.620829,-0.939448,0.209431,0.161999,1.343163,-0.542388,2.404735,Stage III
...,...,...,...,...,...,...,...,...,...,...,...
C3N-01646,-1.817120,-0.724017,-1.180566,-0.000699,-1.127855,0.333554,-0.175023,2.115398,-2.178603,0.759980,Stage III
C3N-01648,0.459593,-0.396638,-0.672383,-0.611409,-0.605164,0.346245,2.099780,1.693689,-0.309480,1.305223,Stage II
C3N-01649,-1.066512,-0.109630,-0.584516,0.322161,-0.527777,0.990790,0.241445,2.320249,-1.896508,0.567970,Stage III
C3N-01651,-1.895398,-0.096922,-0.649448,-0.302394,-0.571897,-0.155102,-0.035762,1.822354,0.081185,1.608496,Stage II


In [182]:
# 4. Scale and encode data

from sklearn.preprocessing import StandardScaler
from umap import UMAP
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
unencoded_columns = top_10.loc[:, "tumor_stage_pathological"]
encoded_columns = encoder.fit_transform(unencoded_columns)

scaler = StandardScaler()

scaled_data = scaler.fit_transform(top_10.iloc[:, 0:10])
embedding_scaled = UMAP().fit_transform(scaled_data)

data = scaled_data[:, 0:9]
target = scaled_data[:, 10]

In [197]:
top_10.shape

(110, 11)

In [188]:
# 5. Create a train test split

from sklearn.model_selection import train_test_split

target = top_10.loc[:, "tumor_stage_pathological"]

X_train, X_test, y_train, y_test = train_test_split(scaled_data, target, train_size = 0.7)

scaler = StandardScaler()

scaler.fit(X_train)

X_train = scaler.transform(X_train)

X_test = scaler.transform(X_test)


In [189]:
# 6. Test all 4 classification models

from sklearn.neighbors import KNeighborsClassifier # default number of neighbors looked at is 5
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB

In [199]:
classifier = KNeighborsClassifier()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

accuracy = sum(y_pred == y_test) / len(y_test)

print(f'Using the KNeighborsClassifier, the model was able to accurately predict {accuracy * 100}% of the test set.')

classifier = DecisionTreeClassifier()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

accuracy = sum(y_pred == y_test) / len(y_test)

print(f'Using the DecisionTreeClassifier, the model was able to accurately predict {accuracy * 100}% of the test set.')

classifier = MLPClassifier()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

accuracy = sum(y_pred == y_test) / len(y_test)

print(f'Using the MLPClassifier, the model was able to accurately predict {accuracy * 100}% of the test set.')

classifier = GaussianNB()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

accuracy = sum(y_pred == y_test) / len(y_test)

print(f'Using the GaussianNB, the model was able to accurately predict {accuracy * 100}% of the test set.')


Using the KNeighborsClassifier, the model was able to accurately predict 63.63636363636363% of the test set.
Using the DecisionTreeClassifier, the model was able to accurately predict 36.36363636363637% of the test set.
Using the MLPClassifier, the model was able to accurately predict 54.54545454545454% of the test set.
Using the GaussianNB, the model was able to accurately predict 54.54545454545454% of the test set.




In [200]:
# 7. Compare 4 mean accuracies.

# According to the results,
# KNeighbors    - 63.6% - "Best"
# DecisionTree  - 36.3% - "Worst"
# MLPClassifier - 54.5%
# GaussianNB    - 54.5%

# If I did my code right, none of these models seem to predict the data very well.

# All of these values imply the set size is a multiple of 11 - while this initially caused confusion if the final 
# tumor_stage_pathological was included (making number of columns = 11), I also realized the number of rows (X values)
# is 110, so maybe there is no problem?