In [160]:
# Clear all user-defined names from the kernel namespace before anything else
# runs; -f skips the interactive confirmation prompt.
%reset -f

In [161]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
%matplotlib inline

## Import the data

In [162]:
# Read the two tab-separated input tables from the working directory.
# X: the call table, loaded as stored on disk with a default integer index.
X = pd.read_csv(
    'train_call.tsv',
    sep='\t',
)
# y: the clinical table, with its first column used as the row index.
y = pd.read_csv(
    'train_clinical.txt',
    sep='\t',
    index_col=0,
)

## Transpose X and remove the first 4 rows

In [163]:
# Swap rows and columns, then keep everything except the first four rows of the
# transposed frame (i.e. the first four columns of the file as loaded).
# NOTE(review): presumably those four are annotation columns rather than
# samples - confirm against the input file.
X = X.T.tail(-4)

## Define inner and outer loops

In [164]:
# Outer splitter: produces the train/test folds used to estimate performance.
outer_cv = KFold(
    n_splits=5,
    random_state=42,
    shuffle=True,
)
# Inner splitter: handed to the grid search for hyperparameter selection.
# Same configuration as the outer splitter (5 shuffled folds, seed 42).
inner_cv = KFold(
    n_splits=5,
    random_state=42,
    shuffle=True,
)

## Create a pipeline with RFE, PCA and a classifier (in this case, an SVM classifier):

In [165]:
# Three-stage model that the grid search tunes:
#   'rfe'        - recursive feature elimination driven by a linear SVM,
#   'reduce_dim' - PCA applied to the features that RFE keeps,
#   'classify'   - the final SVM classifier.
# The PCA and SVC objects are created once and handed to the pipeline, rather
# than building spare instances that the pipeline never used.
pca = PCA()
svm = SVC(kernel='linear')
pipeline = Pipeline([
    # random_state pins the pseudo-random shuffling LinearSVC performs when it
    # solves the dual problem, so repeated runs select the same features.
    ('rfe', RFE(LinearSVC(random_state=42))),
    ('reduce_dim', pca),
    ('classify', svm),
])

## Define the hyperparameter search space for PCA, SVM and RFE:

In [166]:
# Hyperparameter search space, keyed as '<pipeline step>__<parameter>'.
#
# * The SVM candidates 'linear' and 'rbf' are kernel names, so they belong to
#   `classify__kernel`; `C` is the numeric regularisation strength and cannot
#   take them. The grid value overrides the kernel the pipeline was built with.
# * PCA cannot extract more components than the number of features it is
#   given, and here it only sees the features RFE keeps. One sub-grid is built
#   per RFE size and holds only the component counts that fit, so the search
#   does not spend time on combinations that can only fail.
#
# GridSearchCV accepts a list of grids and explores the union of them.
param_grid = [
    {
        'rfe__n_features_to_select': [n_features],
        'reduce_dim__n_components': [
            n_components
            for n_components in (15, 25, 35)
            if n_components <= n_features
        ],
        'classify__kernel': ['linear', 'rbf'],
    }
    for n_features in (20, 30, 40)
]

In [167]:
# Accumulator for the outer-fold test accuracies; one value is appended per
# outer fold by the nested cross-validation loop.
nested_cv_scores = []

## Perform grid search with cross-validation on the inner loop:

In [168]:
# Turn the label table into a flat 1-D NumPy array: it is indexed positionally
# by the fold indices, and scikit-learn expects 1-D targets, so flattening once
# here means no later step has to remember to do it.
# NOTE(review): assumes the clinical table holds exactly one label column and
# that its rows are in the same order as the rows of X - confirm.
y = np.array(y).ravel()

In [None]:
# Nested cross-validation: each outer fold holds out a test set, the grid
# search tunes the pipeline on the remaining samples using the inner splitter,
# and the refitted best model is scored on the held-out fold.

# Start from an empty list whenever this cell runs, so re-executing it does not
# stack a second set of fold scores on top of the previous ones.
nested_cv_scores = []

for train_index, test_index in outer_cv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    # np.ravel accepts both an (n, 1) column and an already flat array, so fit
    # and score always receive 1-D targets.
    y_train, y_test = np.ravel(y[train_index]), np.ravel(y[test_index])

    # Perform grid search with cross-validation on the inner loop
    grid_search = GridSearchCV(pipeline, param_grid, cv=inner_cv, scoring='accuracy')
    grid_search.fit(X_train, y_train)

    # Evaluate the model on the test set and append the score to the list
    test_score = grid_search.score(X_test, y_test)
    nested_cv_scores.append(test_score)
    print(f"Test set accuracy: {test_score:.2f}")

## Calculate the mean and standard deviation of the nested cross-validation scores:

In [None]:
# Summarise the outer-fold accuracies. The spread is NumPy's default
# population standard deviation (ddof=0).
mean_score = np.asarray(nested_cv_scores).mean()
std_score = np.asarray(nested_cv_scores).std()

In [None]:
# Report the mean and standard deviation of the outer-fold accuracies,
# formatted to two decimal places.
print(f"Nested cross-validation mean accuracy: {mean_score:.2f} (std: {std_score:.2f})")