After speaking with Kayla and Mahija, we determined that my code does not run because my Mac is from 2016. As a result, I kept getting the "kernel is restarting" error and none of my ML code was able to run, so I was not able to find any bugs directly by running the code.

# Setting Working Directory

In [2]:
import os

# Make the course analysis_data folder the current working directory.
analysis_dir = '/Users/meganrajan/Documents/qbio490/qbio_490_meganrajan/analysis_data'
os.chdir(analysis_dir)

Importing Libraries

In [3]:
# 1. Import the cptac package.
import cptac

# 2. Ask cptac which data sets are available.
cptac.list_datasets()

# 3. One-time download of the PDAC data set. Left commented out
#    (presumably because the data is already cached locally -- confirm).
# cptac.download(dataset="pdac")

# 4. Load the PDAC data set; every table used later is read from `pdac`.
pdac = cptac.Pdac()

Checking that pdac index is up-to-date...



                                         

Creating separate variables to store each aspect of the pdac dataset. 

In [6]:
import pandas as pd
import numpy as np

# Pull the three tables used in this analysis out of the pdac object.
clinical_data = pdac.get_clinical()
proteomics_data = pdac.get_proteomics()
transcriptomics_data = pdac.get_transcriptomics()

# log2(x + 1) transform of the transcriptomics table; the +1 keeps zeros finite.
transcriptomics_data_log = np.log2(transcriptomics_data + 1)

# Keep only the patients whose pathological stage is Stage I or Stage III ...
is_stage_of_interest = clinical_data['tumor_stage_pathological'].isin(['Stage I', 'Stage III'])
clinical_filtered = clinical_data[is_stage_of_interest]

# ... and select those same patients from both omics tables.
kept_patients = clinical_filtered.index
proteomics_filtered = proteomics_data.loc[kept_patients]
transcriptomics_filtered = transcriptomics_data_log.loc[kept_patients]

def find_top_differentials(data, clinical_data, num_features=5,
                           stage_column='tumor_stage_pathological',
                           stages=('Stage I', 'Stage III')):
    """Return the features whose group means differ most between two stages.

    Args:
        data: DataFrame of feature values, one row per patient and one
            column per feature.
        clinical_data: DataFrame indexed by the same patient labels as
            ``data`` and containing ``stage_column``.
        num_features: How many feature names to return.
        stage_column: Column of ``clinical_data`` that holds the group label.
        stages: The two group labels to compare.

    Returns:
        A list of up to ``num_features`` column labels of ``data``, ordered
        from the largest to the smallest absolute difference in group means.
    """
    first_stage, second_stage = stages

    # Look the labels up per row of `data`. Rows of `data` that have no
    # clinical record get NaN and therefore land in neither group, instead of
    # making the boolean selection fail on mismatched indexes.
    stage_labels = clinical_data[stage_column].reindex(data.index)

    first_group = data.loc[(stage_labels == first_stage).to_numpy()]
    second_group = data.loc[(stage_labels == second_stage).to_numpy()]

    mean_diffs = (first_group.mean() - second_group.mean()).abs()
    return mean_diffs.nlargest(num_features).index.tolist()

# Five most stage-differential features from each omics table.
top_5_proteins = find_top_differentials(proteomics_filtered, clinical_filtered)
top_5_rnas = find_top_differentials(transcriptomics_filtered, clinical_filtered)

# Merge the two lists and drop duplicates while keeping a stable order
# (proteins first). Going through set() would return the genes in an order
# that can change from one run to the next.
combined_genes = list(dict.fromkeys(top_5_proteins + top_5_rnas))

In [7]:
# NOTE(review): this hard-coded list replaces whatever combined_genes held
# before this cell -- confirm that is intended.
combined_genes = ['A1BG', 'A1CF', 'A1BG-AS1', 'A2M', 'A2ML1', 'A4GALT', 'A2M-AS1']

# Start from an empty feature table with one row per filtered patient.
X_data = pd.DataFrame(index=clinical_filtered.index)

# (column suffix, table checked for the gene, table the values are read from)
omics_sources = (
    ('_prot', proteomics_data, proteomics_filtered),
    ('_rna', transcriptomics_data, transcriptomics_filtered),
)

# For every gene add a protein column and then an RNA column, skipping any
# table that has no column for that gene.
for gene in combined_genes:
    for suffix, full_table, filtered_table in omics_sources:
        if gene not in full_table.columns:
            continue
        X_data[gene + suffix] = filtered_table[gene]



3) Create a separate list of the patients’ cancer stages, i.e., tumor_stage_pathological (y data).

In [8]:
# Labels (y data): the pathological tumor stage of each patient in clinical_filtered.
stage_column = 'tumor_stage_pathological'
y_data = clinical_filtered[stage_column]

# Print the first few entries as a sanity check.
print(y_data.head())

Patient_ID
C3L-00102    Stage III
C3L-00598    Stage III
C3L-00625    Stage III
C3L-01124    Stage III
C3L-01662    Stage III
Name: tumor_stage_pathological, dtype: object


# Scaling and Encoding

In [9]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Encode the stage labels with a LabelEncoder.
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y_data)

# Scale the feature table with a StandardScaler.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_data)

# Wrap the scaled values in a DataFrame that reuses X_data's row and column labels.
X_scaled_df = pd.DataFrame(data=X_scaled, index=X_data.index, columns=X_data.columns)

# Preview the first rows of the scaled features and the first encoded labels.
preview_rows = 5
print(X_scaled_df.head(preview_rows))
print(y_encoded[:preview_rows])


            A1BG_prot  A1BG_rna  A1CF_prot  A1CF_rna  A1BG-AS1_rna  A2M_prot  \
Patient_ID                                                                     
C3L-00102    0.768001  1.292106  -0.699052  0.513265     -0.774917  2.139522   
C3L-00598    1.788376 -1.625292  -0.521252 -2.666078     -2.381984 -0.216494   
C3L-00625    1.240868  0.530270  -0.340637 -0.098307      0.611999  0.999953   
C3L-01124    0.408271 -0.437940   0.042364  0.623502      0.586025 -0.665417   
C3L-01662   -0.471062 -0.558806   1.098742  0.577210     -0.985918 -1.394530   

             A2M_rna  A2ML1_prot  A2ML1_rna  A4GALT_prot  A4GALT_rna  \
Patient_ID                                                             
C3L-00102   0.459260   -0.208410   0.441047          NaN   -0.175757   
C3L-00598  -1.086554    2.869159   1.796662          NaN    0.709494   
C3L-00625   1.300586   -0.064713  -1.216539          NaN    0.666852   
C3L-01124  -0.273611   -0.491213   0.538500          NaN   -0.404821   
C3L-016

# Train - Test Split

In [12]:
# Drop every patient that has a missing feature value, together with the
# matching encoded label.
# The keep-mask is built before any rows are removed: y_encoded is a plain
# NumPy array with no patient index, so it has to be filtered by position
# (indexing it with the patient-ID index raises IndexError).
complete_rows = X_scaled_df.notna().all(axis=1).to_numpy()
X_scaled_df = X_scaled_df[complete_rows]
y_encoded = y_encoded[complete_rows]

# These are the names the classifier cell passes to train_test_split.
X_dropped, y_dropped = X_scaled_df, y_encoded


Unnamed: 0_level_0,A1BG_prot,A1BG_rna,A1CF_prot,A1CF_rna,A1BG-AS1_rna,A2M_prot,A2M_rna,A2ML1_prot,A2ML1_rna,A4GALT_prot,A4GALT_rna,A2M-AS1_rna
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
C3L-04495,-0.287694,0.348168,0.616971,0.527305,0.063042,-0.532384,0.575585,-0.516936,0.116699,-0.556454,-0.817089,0.841702
C3N-03853,0.563852,-1.468778,0.396759,0.738249,-1.912184,0.054646,0.003123,-0.698128,0.132536,-0.684315,-0.546884,-1.485782
C3N-04126,-0.113697,0.209781,1.297216,0.400467,0.463202,0.015162,-0.420453,1.879075,1.178788,-0.486936,0.415214,0.109233
C3N-04283,1.115155,0.116764,0.81808,-0.885342,-0.639117,0.445902,-2.924896,0.599181,1.057887,1.727705,1.59839,-1.50162


# Test Code of 4 Different Classifiers

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# The four models to compare, keyed by display name.
classifiers = {
    "KNeighbors": KNeighborsClassifier(),
    "DecisionTree": DecisionTreeClassifier(),
    "MLP": MLPClassifier(max_iter=1000),
    "GaussianNB": GaussianNB()
}

# One list of per-run test accuracies for every model.
accuracies = {label: [] for label in classifiers}

num_runs = 10

# For each model, repeat a 75/25 train-test split num_runs times, using the
# run number as the split seed, and record the test accuracy of every run.
for name, clf in classifiers.items():
    run_scores = accuracies[name]
    for seed in range(num_runs):
        split = train_test_split(X_dropped, y_dropped, test_size=0.25, random_state=seed)
        X_train, X_test, y_train, y_test = split

        clf.fit(X_train, y_train)
        run_scores.append(accuracy_score(y_test, clf.predict(X_test)))

# Find accuracy of different classifiers and compare

In [None]:
# Average each classifier's accuracy over all of its runs.
mean_accuracies = {name: np.mean(acc) for name, acc in accuracies.items()}

# Rank by mean accuracy, best first. Sorting the (name, accuracy) pairs without
# a key would order them alphabetically by name instead.
sorted_classifiers = sorted(
    mean_accuracies.items(), key=lambda item: item[1], reverse=True
)

for name, accuracy in sorted_classifiers:
    print(f"{name}: {accuracy:.4f}")

# First entry is the most accurate model, last entry the least accurate.
best_model_name, best_model_accuracy = sorted_classifiers[0]
worst_model_name, worst_model_accuracy = sorted_classifiers[-1]

best_model_name

Because my code is not running, I am not able to detect any possible bugs or determine the actual worst and best model.