In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
data=pd.read_csv('train_call.tsv', sep='\t')

In [3]:
data = data.transpose()

In [4]:
data2 = data
data = data.drop(data.index[[1, 2, 3]])

In [5]:
samples = pd.read_csv('train_clinical.txt', sep='\t')
samples = samples.set_index('Sample', drop=False).rename_axis(None)

In [6]:
data_simplified = data.drop("Chromosome")
data_simplified.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2824,2825,2826,2827,2828,2829,2830,2831,2832,2833
Array.129,0,0,0,0,0,0,0,0,0,0,...,2,2,2,2,0,1,1,1,1,1
Array.34,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
Array.67,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
Array.24,0,0,0,0,0,0,0,-1,0,0,...,0,0,0,0,0,0,0,0,0,0
Array.22,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1


In [7]:
samples.head()

Unnamed: 0,Sample,Subgroup
Array.129,Array.129,HER2+
Array.34,Array.34,HR+
Array.67,Array.67,HR+
Array.24,Array.24,Triple Neg
Array.22,Array.22,Triple Neg


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [9]:
features = list(data.columns)

newsamples = samples["Subgroup"]

In [10]:
X = data_simplified[features]
y = newsamples

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
rf = RandomForestClassifier(random_state=0)

rf.fit(X_train,y_train)

y_pred = rf.predict(X_test)

print("Classification report:\n", classification_report(y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

Classification report:
               precision    recall  f1-score   support

       HER2+       1.00      0.86      0.92         7
         HR+       0.60      0.86      0.71         7
  Triple Neg       0.75      0.50      0.60         6

    accuracy                           0.75        20
   macro avg       0.78      0.74      0.74        20
weighted avg       0.78      0.75      0.75        20

Confusion matrix:
 [[6 1 0]
 [0 6 1]
 [0 3 3]]


In [12]:
# Compute the feature importances
importances = rf.feature_importances_

important_features_dict = {}
for idx, val in enumerate(rf.feature_importances_):
    important_features_dict[idx] = val

important_features_list = sorted(important_features_dict,
                                 key=important_features_dict.get,
                                 reverse=True)

print(important_features_list[:60])

[2184, 2213, 2219, 2206, 851, 853, 1904, 1068, 2211, 301, 764, 1059, 2108, 2026, 833, 692, 385, 2024, 68, 2017, 2116, 856, 1656, 471, 854, 998, 623, 1883, 843, 757, 2162, 837, 743, 2818, 2751, 1678, 1663, 2205, 1910, 2419, 1972, 855, 2756, 478, 860, 1644, 766, 2203, 174, 2208, 861, 721, 389, 2207, 835, 790, 1732, 2198, 476, 1667]


In [13]:
new_features_df = data_simplified[[2184, 2213, 2219, 2206, 851, 853, 1904, 1068, 2211, 301, 764, 1059, 2108, 2026, 833, 692, 385, 2024, 68, 2017, 2116, 856, 1656, 471, 854, 998, 623, 1883, 843, 757, 2162, 837, 743, 2818, 2751, 1678, 1663, 2205, 1910, 2419, 1972, 855, 2756, 478, 860, 1644, 766, 2203, 174, 2208, 861, 721, 389, 2207, 835, 790, 1732, 2198, 476, 1667]
]

In [14]:
from sklearn.model_selection import LeaveOneOut, GridSearchCV, KFold
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from numpy import mean, std

X = new_features_df
y = newsamples

cv = KFold()
cv1 = LeaveOneOut()

param_grid = { 
    'n_estimators': [100, 200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'criterion' :['gini', 'entropy']
}

# fit model
model = RandomForestClassifier(random_state=1)

#inner k fold
best_model = GridSearchCV(model, param_grid = param_grid, scoring = 'accuracy', cv=cv,verbose=20,n_jobs=-1)

best_model.fit(X,y)

#outer k fold. Use k-fold twice.
scores = cross_val_score(best_model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)

# report performance
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Accuracy: 0.810 (0.058)


In [15]:
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold

outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)
inner_cv = KFold(n_splits=5, shuffle=True, random_state=42)

In [16]:
#this is nested cross-validation working and priniting accuracies. However I'm not sure how to extract the best model out of all the 5 cross validated models. Probably bne

In [26]:
nested_cv_scores = []
i = -1

for train_index, test_index in outer_cv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]

    param_grid = { 
    'n_estimators': [100, 200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'criterion' :['gini', 'entropy']}
    
    # Perform grid search with cross-validation on the inner loop
    grid_search = GridSearchCV(rf, param_grid = param_grid, cv=inner_cv, scoring='accuracy')
    result = grid_search.fit(X_train, y_train.ravel())
    best_model = result.best_estimator_

    # Evaluate the model on the test set and append the score to the list
    test_score = best_model.score(X_test, y_test)
    nested_cv_scores.append(test_score)
    print(f"Test set accuracy: {test_score:.2f}")
    
    #probably here i need some kind of if statement to extract the final best cross validated model. something like
    i = i + 1
    if test_score > nested_cv_scores[i]:
        GOAT_model = best_model
    

Test set accuracy: 0.80
Test set accuracy: 0.75
Test set accuracy: 0.85
Test set accuracy: 0.80
Test set accuracy: 0.85


NameError: name 'GOAT_model' is not defined

In [18]:
#below here is the graveyard

In [19]:


rf = RandomForestClassifier(random_state=0)



rf.fit(X_train,y_train)

y_pred = rf.predict(X_test)

print("Classification report:\n", classification_report(y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

Classification report:
               precision    recall  f1-score   support

       HER2+       1.00      1.00      1.00         6
         HR+       0.88      0.88      0.88         8
  Triple Neg       0.83      0.83      0.83         6

    accuracy                           0.90        20
   macro avg       0.90      0.90      0.90        20
weighted avg       0.90      0.90      0.90        20

Confusion matrix:
 [[6 0 0]
 [0 7 1]
 [0 1 5]]


In [20]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

# evaluate a given model using cross-validation
def evaluate_model(model, X, y):
 # define the evaluation procedure
 cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
 # evaluate the model and collect the results
 scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
 return scores

In [21]:
evaluate_model(rf, X, y)

array([0.9, 0.9, 1. , 0.8, 0.8, 0.9, 0.5, 0.8, 0.9, 1. , 0.9, 0.9, 0.9,
       0.8, 0.7, 0.7, 1. , 0.8, 0.5, 0.9, 0.9, 0.8, 1. , 0.9, 0.8, 0.9,
       0.8, 0.9, 0.8, 0.7])