In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

def preprocess_data(rna_data_path, prediction_data_path, train_data_path, val_data_path, test_data_path,
                    merged_data_path='C:/Users/eirin/OneDrive/Υπολογιστής/ML4CRC/merged.csv'):
    """Merge the RNA table with the label table, split 60/20/20, scale, and save the splits.

    Steps:
      1. Read both CSVs and rename the first column of each to ``SampleID``.
      2. Transpose the RNA table so its original columns become rows keyed by
         ``SampleID`` (presumably samples are stored column-wise in the raw
         file -- TODO confirm), then inner-join it with the prediction table.
      3. Optionally dump the merged frame to ``merged_data_path``.
      4. Fill missing ``msi_status`` values with the most frequent status and
         drop rows whose status is ``'Indeterminate'``.
      5. Keep only int64/float64 feature columns, split into train/val/test,
         standardize with statistics fitted on the training split only, and
         write each scaled split (features + ``msi_status``) to CSV.

    Args:
        rna_data_path: CSV with the RNA data.
        prediction_data_path: CSV containing ``msi_status`` per sample.
        train_data_path: Output CSV for the scaled training split.
        val_data_path: Output CSV for the scaled validation split.
        test_data_path: Output CSV for the scaled test split.
        merged_data_path: Output CSV for the merged (pre-imputation) frame.
            Defaults to the location that used to be hard-coded; pass ``None``
            to skip writing this file.

    Returns:
        ``(X_train_scaled, y_train, X_val_scaled, y_val, X_test_scaled, y_test)``
        where the ``X_*`` values are the scaler outputs and the ``y_*`` values
        are ``msi_status`` Series that keep their original row index.
    """
    rna_data = pd.read_csv(rna_data_path)
    prediction_data = pd.read_csv(prediction_data_path)

    # Reassign instead of inplace=True so the inputs are never mutated behind the scenes.
    rna_data = rna_data.rename(columns={rna_data.columns[0]: 'SampleID'})
    prediction_data = prediction_data.rename(columns={prediction_data.columns[0]: 'SampleID'})

    transposed_rna_data = (
        rna_data
        .set_index('SampleID')
        .transpose()
        .reset_index()
        .rename(columns={'index': 'SampleID'})
    )
    merged_data = pd.merge(prediction_data, transposed_rna_data, on='SampleID', how='inner')
    if merged_data_path is not None:
        merged_data.to_csv(merged_data_path, index=False)

    # Impute missing labels with the most frequent status.
    # Assign the result back: `df[col].fillna(..., inplace=True)` operates on a
    # temporary Series and does not update `merged_data` under Copy-on-Write.
    most_frequent = merged_data['msi_status'].value_counts().idxmax()
    merged_data['msi_status'] = merged_data['msi_status'].fillna(most_frequent)

    # Filter out 'Indeterminate' MSI status
    filtered_data = merged_data[merged_data['msi_status'] != 'Indeterminate']

    features = (
        filtered_data
        .drop(columns=['SampleID', 'msi_status'])
        .select_dtypes(include=['int64', 'float64'])
    )
    target = filtered_data['msi_status']

    # NOTE(review): neither split is stratified; with imbalanced msi_status
    # classes consider passing stratify=... here.
    # First carve off 20% as the test set ...
    X_temp, X_test, y_temp, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
    # ... then 25% of the remaining 80% as validation (0.25 x 0.8 = 0.2 overall).
    X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)

    # Fit the scaler on the training split only, then apply it to the other splits.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)

    def _save_split(scaled_values, original_features, labels, path):
        # Labels get a fresh 0..n-1 index so they line up row-by-row with the scaled features.
        scaled_frame = pd.DataFrame(scaled_values, columns=original_features.columns)
        pd.concat([scaled_frame, labels.reset_index(drop=True)], axis=1).to_csv(path, index=False)

    _save_split(X_train_scaled, X_train, y_train, train_data_path)
    _save_split(X_val_scaled, X_val, y_val, val_data_path)
    _save_split(X_test_scaled, X_test, y_test, test_data_path)

    return X_train_scaled, y_train, X_val_scaled, y_val, X_test_scaled, y_test

# Raw inputs: the RNA count table and the per-sample prediction/label table.
rna_data_path = 'C:/Users/eirin/OneDrive/Υπολογιστής/ML4CRC/data/raw/tcga_rna_count_data_crc.csv'
prediction_data_path = 'C:/Users/eirin/OneDrive/Υπολογιστής/ML4CRC/data/raw/prediction_file_crc.csv'

# Output CSVs for the scaled splits. The training split is written to
# train_features_smote.csv in the project root (an earlier revision used
# data/processed/train_data.csv); validation and test go to data/processed/.
train_data_path = "C:/Users/eirin/OneDrive/Υπολογιστής/ML4CRC/train_features_smote.csv"
val_data_path = 'C:/Users/eirin/OneDrive/Υπολογιστής/ML4CRC/data/processed/val_data.csv'
test_data_path = 'C:/Users/eirin/OneDrive/Υπολογιστής/ML4CRC/data/processed/test_data.csv'

# Run the preprocessing pipeline and keep all six returned splits as notebook globals.
(
    X_train_scaled, y_train,
    X_val_scaled, y_val,
    X_test_scaled, y_test,
) = preprocess_data(
    rna_data_path,
    prediction_data_path,
    train_data_path,
    val_data_path,
    test_data_path,
)

# Print every split, one after another, in the order they were returned.
print(X_train_scaled, y_train, X_val_scaled, y_val, X_test_scaled, y_test, sep='\n')

[[-0.79233082  3.90501345  4.88281915 ...  0.          0.
  -0.06030227]
 [-0.72948477 -1.08030466 -1.00500744 ...  0.          0.
  -0.06030227]
 [-0.4362032  -0.30332765  0.13984773 ...  0.          0.
  -0.06030227]
 ...
 [ 1.49107568  1.1563734   0.95760142 ...  0.          0.
  -0.06030227]
 [-1.10656107 -1.37327096 -1.33210891 ...  0.          0.
  -0.06030227]
 [-1.10656107 -0.92272141 -0.67790596 ...  0.          0.
  -0.06030227]]
202      MSS
274    MSI-H
124      MSS
350    MSI-L
282    MSI-H
       ...  
384      MSS
178      MSS
374      MSS
309    MSI-H
332      MSS
Name: msi_status, Length: 276, dtype: object
[[-0.39430584  0.75461426  0.63049995 ...  1.          0.
  -0.06030227]
 [-0.60379267  0.72249359  0.63049995 ...  0.          0.
  -0.06030227]
 [-0.03817822  1.28256041  1.2847029  ...  0.          0.
  -0.06030227]
 ...
 [-0.9389716   0.53610042  0.63049995 ...  0.          0.
  -0.06030227]
 [ 0.35984677  0.02650833  0.13984773 ...  0.          0.
  -0.06030227

In [20]:
# Train the SVM model
from sklearn.svm import SVC

# Baseline model: a linear-kernel support vector classifier with C=0.25 and a
# fixed random_state. `classifier` is reused by later cells (prediction on the
# test split and as the estimator handed to the grid search).
classifier = SVC(C=0.25, kernel = 'linear', random_state = 0)
# Fit on the standardized training split produced by preprocess_data.
# Kept as the cell's final expression, so its return value is what the cell displays.
classifier.fit(X_train_scaled, y_train)

In [3]:
# Confusion matrix precision recall f1-score

from sklearn.metrics import confusion_matrix, classification_report

# Predict on the held-out test split, then print the confusion matrix followed
# by the per-class precision / recall / F1 report.
y_pred = classifier.predict(X_test_scaled)
print(
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep='\n',
)


[[ 9  0  2]
 [ 0  3 20]
 [ 1  3 54]]
              precision    recall  f1-score   support

       MSI-H       0.90      0.82      0.86        11
       MSI-L       0.50      0.13      0.21        23
         MSS       0.71      0.93      0.81        58

    accuracy                           0.72        92
   macro avg       0.70      0.63      0.62        92
weighted avg       0.68      0.72      0.66        92



Hyperparameter Tuning

In [6]:
from sklearn.model_selection import GridSearchCV

# Two sub-grids: a linear kernel searched over C only, and an RBF kernel
# searched over C and gamma.
parameters = [
    {'C': [0.25, 0.5, 0.75, 1], 'kernel': ['linear']},
    {'C': [0.25, 0.5, 0.75, 1], 'kernel': ['rbf'], 'gamma': [0.1, 0.2, 0.3, 0.4, 0.5]},
]

# 10-fold cross-validated search on the training split, scored by accuracy and
# run on all available cores. fit() returns the fitted search object itself.
grid_search = GridSearchCV(
    estimator=classifier,
    param_grid=parameters,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
).fit(X_train_scaled, y_train)

# Best mean cross-validated score and the parameter combination that achieved it.
best_accuracy, best_parameters = grid_search.best_score_, grid_search.best_params_
print(best_accuracy, best_parameters, sep='\n')

0.7472934472934473
{'C': 0.25, 'kernel': 'linear'}


Hydra Optuna

In [21]:
import joblib
import optuna
from sklearn.svm import SVC
from sklearn.metrics import f1_score

class SVMModel:
    """Small wrapper around ``sklearn.svm.SVC``.

    Bundles training, prediction, macro-F1 evaluation, persistence with joblib,
    and an Optuna-driven hyperparameter search.

    Attributes:
        random_state: Seed forwarded to every ``SVC`` created by ``train``.
        model: The most recently fitted ``SVC``; ``None`` until ``train`` runs.
    """

    def __init__(self, random_state=42):
        self.random_state = random_state
        self.model = None

    def train(self, X_train, y_train, C=1.0, kernel='rbf', gamma='scale', degree=3):
        """Fit a fresh SVC with the given hyperparameters and store it on ``self.model``.

        Args:
            X_train: Training features.
            y_train: Training labels.
            C: Regularization parameter passed to ``SVC``.
            kernel: Kernel name passed to ``SVC``.
            gamma: Kernel coefficient setting passed to ``SVC``.
            degree: Polynomial degree passed to ``SVC``. 3 is SVC's own default,
                so existing callers that omit it get the same model as before.
        """
        self.model = SVC(
            C=C,
            kernel=kernel,
            degree=degree,
            gamma=gamma,
            random_state=self.random_state
        )
        self.model.fit(X_train, y_train)

    def predict(self, X):
        """Return the stored model's predictions for ``X`` (``train`` must have run first)."""
        return self.model.predict(X)

    def evaluate(self, X_test, y_test):
        """Return the macro-averaged F1 score of the stored model on ``(X_test, y_test)``."""
        predictions = self.predict(X_test)
        return f1_score(y_test, predictions, average='macro')

    def save_model(self, filepath):
        """Serialize the stored model to ``filepath`` with ``joblib.dump``."""
        joblib.dump(self.model, filepath)

    def tune_hyperparameters(self, X_train, y_train, X_test, y_test, n_trials=100):
        """Search SVC hyperparameters with Optuna, then retrain with the best ones.

        Each trial trains on ``(X_train, y_train)`` and is scored by macro F1 on
        ``(X_test, y_test)``. After the search, the model is retrained with the
        best parameters and its macro F1 on the training data is computed.

        NOTE(review): hyperparameters are selected on whatever is passed as
        ``X_test``/``y_test``, so the reported best score is optimistically
        biased for that data. Consider passing a validation split here and
        keeping the real test split for a final, one-off evaluation.

        Args:
            X_train: Training features.
            y_train: Training labels.
            X_test: Features used to score each trial.
            y_test: Labels used to score each trial.
            n_trials: Number of Optuna trials.

        Returns:
            ``(best_params, best_score, train_f1)``: the best trial's parameter
            dict, its macro F1, and the macro F1 of the retrained model on the
            training data.
        """
        def objective(trial):
            # Define the hyperparameter configuration space.
            # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
            C = trial.suggest_float('C', 0.1, 10.0, log=True)
            kernel = trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid'])
            # `degree` only matters for the polynomial kernel; 3 is SVC's default.
            degree = trial.suggest_int('degree', 2, 5) if kernel == 'poly' else 3
            # `gamma` is only sampled for kernels that use it.
            if kernel in ['rbf', 'poly', 'sigmoid']:
                gamma = trial.suggest_categorical('gamma', ['scale', 'auto'])
            else:
                gamma = 'scale'

            # Train the model with suggested hyperparameters (degree is now actually applied)
            self.train(
                X_train, y_train,
                C=C,
                kernel=kernel,
                gamma=gamma,
                degree=degree
            )

            # Evaluate the model
            test_f1 = self.evaluate(X_test, y_test)

            print(f"Iteration {trial.number}: Test F1 Score: {test_f1}, Parameters: {trial.params}")

            return test_f1

        # Create a study object and optimize the objective function
        study = optuna.create_study(direction='maximize')
        study.optimize(objective, n_trials=n_trials)

        best_params = study.best_params
        best_score = study.best_value

        # Retrain with best parameters and evaluate on the training set.
        # 'gamma' and 'degree' are conditional parameters: they are absent from
        # best_params when the winning kernel did not sample them, so fall back
        # to the same defaults the objective used instead of raising KeyError.
        self.train(
            X_train, y_train,
            C=best_params['C'],
            kernel=best_params['kernel'],
            gamma=best_params.get('gamma', 'scale'),
            degree=best_params.get('degree', 3)
        )

        train_f1 = self.evaluate(X_train, y_train)

        print(f"Best Parameters: {best_params}")
        print(f"Best Test F1 Score: {best_score}")
        print(f"Train F1 Score with Best Parameters: {train_f1}")

        return best_params, best_score, train_f1


# Backward-compatible alias: this used to be a module-level function taking the
# model as its first argument, so `tune_hyperparameters(model, ...)` keeps working
# alongside the new `model.tune_hyperparameters(...)`.
tune_hyperparameters = SVMModel.tune_hyperparameters


         