In [31]:
from functools import partial

import numpy as np
import pandas as pd
import statsmodels.api as sm
# NOTE: the alias on the next line is immediately rebound by
# `import matplotlib.pyplot as plt`, so `plt` refers to the pyplot module.
from matplotlib.pyplot import subplots as plt
import matplotlib.pyplot as plt
from sklearn.base import clone
from sklearn.decomposition import PCA
from sklearn.ensemble import \
     (RandomForestRegressor as RF,
      GradientBoostingRegressor as GBR,
      RandomForestClassifier as RC,
      GradientBoostingClassifier as GBC)
from sklearn.metrics import (accuracy_score,
                             log_loss)
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold as KF
from sklearn.model_selection import \
     (cross_validate,
      cross_val_score,
      KFold,
      ShuffleSplit)
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import (DecisionTreeClassifier as DTC,
                          DecisionTreeRegressor as DTR,
                          plot_tree,
                          export_text)

from ISLP import load_data, confusion_table
from ISLP.models import sklearn_sm

In [32]:
# Load the tumor-type labels; the CSV's unnamed first column is relabelled
# "Sample" so it can serve as a named key column.
TumorType = (
    pd.read_csv("labels.csv")
    .rename(columns={"Unnamed: 0": "Sample"})
)
TumorType

Unnamed: 0,Sample,Class
0,sample_0,PRAD
1,sample_1,LUAD
2,sample_2,PRAD
3,sample_3,PRAD
4,sample_4,BRCA
...,...,...
796,sample_796,BRCA
797,sample_797,LUAD
798,sample_798,COAD
799,sample_799,PRAD


In [33]:
# Load the gene-expression matrix and relabel its unnamed first column
# as "Sample" for easier retrieval.
GeneRaw = (
    pd.read_csv("data.csv")
    .rename(columns={"Unnamed: 0": "Sample"})
)
GeneRaw.head()

Unnamed: 0,Sample,gene_0,gene_1,gene_2,gene_3,gene_4,gene_5,gene_6,gene_7,gene_8,...,gene_20521,gene_20522,gene_20523,gene_20524,gene_20525,gene_20526,gene_20527,gene_20528,gene_20529,gene_20530
0,sample_0,0.0,2.017209,3.265527,5.478487,10.431999,0.0,7.175175,0.591871,0.0,...,4.926711,8.210257,9.723516,7.22003,9.119813,12.003135,9.650743,8.921326,5.286759,0.0
1,sample_1,0.0,0.592732,1.588421,7.586157,9.623011,0.0,6.816049,0.0,0.0,...,4.593372,7.323865,9.740931,6.256586,8.381612,12.674552,10.517059,9.397854,2.094168,0.0
2,sample_2,0.0,3.511759,4.327199,6.881787,9.87073,0.0,6.97213,0.452595,0.0,...,5.125213,8.127123,10.90864,5.401607,9.911597,9.045255,9.788359,10.09047,1.683023,0.0
3,sample_3,0.0,3.663618,4.507649,6.659068,10.196184,0.0,7.843375,0.434882,0.0,...,6.076566,8.792959,10.14152,8.942805,9.601208,11.392682,9.694814,9.684365,3.292001,0.0
4,sample_4,0.0,2.655741,2.821547,6.539454,9.738265,0.0,6.566967,0.360982,0.0,...,5.996032,8.891425,10.37379,7.181162,9.84691,11.922439,9.217749,9.461191,5.110372,0.0


In [37]:
# Attach each sample's tumor class to its expression profile.
# The join key is named explicitly rather than relying on pandas to infer it
# from whichever column names the two frames happen to share, and
# validate="one_to_one" makes the merge fail loudly if a sample ID is
# duplicated on either side instead of silently multiplying rows.
data = (
    pd.merge(
        left=GeneRaw,
        right=TumorType,
        on="Sample",
        validate="one_to_one",
    )
    .drop(columns=["Sample"])           # the ID is not a feature
    .rename(columns={"Class": "Tumor"})
)
data.head()

Unnamed: 0,gene_0,gene_1,gene_2,gene_3,gene_4,gene_5,gene_6,gene_7,gene_8,gene_9,...,gene_20522,gene_20523,gene_20524,gene_20525,gene_20526,gene_20527,gene_20528,gene_20529,gene_20530,Tumor
0,0.0,2.017209,3.265527,5.478487,10.431999,0.0,7.175175,0.591871,0.0,0.0,...,8.210257,9.723516,7.22003,9.119813,12.003135,9.650743,8.921326,5.286759,0.0,PRAD
1,0.0,0.592732,1.588421,7.586157,9.623011,0.0,6.816049,0.0,0.0,0.0,...,7.323865,9.740931,6.256586,8.381612,12.674552,10.517059,9.397854,2.094168,0.0,LUAD
2,0.0,3.511759,4.327199,6.881787,9.87073,0.0,6.97213,0.452595,0.0,0.0,...,8.127123,10.90864,5.401607,9.911597,9.045255,9.788359,10.09047,1.683023,0.0,PRAD
3,0.0,3.663618,4.507649,6.659068,10.196184,0.0,7.843375,0.434882,0.0,0.0,...,8.792959,10.14152,8.942805,9.601208,11.392682,9.694814,9.684365,3.292001,0.0,PRAD
4,0.0,2.655741,2.821547,6.539454,9.738265,0.0,6.566967,0.360982,0.0,0.0,...,8.891425,10.37379,7.181162,9.84691,11.922439,9.217749,9.461191,5.110372,0.0,BRCA


In [43]:
# Cross-validate a 5-component PCA on the expression matrix.
# NOTE: PCA is unsupervised, so the "test_score" values are PCA.score results,
# i.e. the average log-likelihood of the held-out samples under the fitted
# model. They describe how well the components fit unseen data; they do not
# rank individual genes, and the Tumor labels passed as `y` are ignored by PCA.
cross_val = KF(n_splits=5, random_state=42, shuffle=True)

# random_state pins the result when PCA picks a randomized SVD solver
# (presumably the case with this many features - TODO confirm for the
# installed scikit-learn version); it has no effect on deterministic solvers.
model = PCA(n_components=5, random_state=42)

cv_score = cross_validate(
    model,
    X=data.drop(columns=["Tumor"]),
    y=data["Tumor"],
    cv=cross_val,
    n_jobs=1,
)
cv_score
    

{'fit_time': array([0.46602082, 0.72822523, 0.45893884, 0.41708183, 0.37581325]),
 'score_time': array([57.25280261, 48.65269184, 49.03814745, 49.46982193, 50.24018645]),
 'test_score': array([-54351.16504937, -54483.3514019 , -54452.20608983, -54311.29768792,
        -54255.66073726])}

In [45]:
# Repeat the cross-validation of `model` with scikit-learn's default splitter.
# NOTE: an integer `cv` combined with a non-classifier estimator means plain
# KFold without shuffling, so these folds differ from a shuffled, seeded KFold.
# `cross_val_score` is imported in the notebook's top import cell.
scores = cross_val_score(
    model,
    X=data.drop(columns=["Tumor"]),
    y=data["Tumor"],
    cv=5,
)
print("Cross-Validation Scores:", scores)

Cross-Validation Scores: [-54367.37099217 -54439.31911278 -54365.20798832 -54293.21737383
 -54392.43553778]
