In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

def _write_split(scaled_features, feature_names, labels, output_path):
    """Write one scaled feature matrix plus its label column to ``output_path`` as CSV."""
    split_frame = pd.DataFrame(scaled_features, columns=feature_names)
    # The scaled matrix gets a fresh 0..n-1 index, so the labels are reset to the
    # same positional index before the column-wise concat; otherwise rows would
    # be aligned on the original (shuffled) index labels.
    split_frame = pd.concat([split_frame, labels.reset_index(drop=True)], axis=1)
    split_frame.to_csv(output_path, index=False)


def preprocess_data(rna_data_path, prediction_data_path, train_data_path, val_data_path, test_data_path,
                    merged_data_path='C:/Users/eirin/OneDrive/Υπολογιστής/ML4CRC/merged.csv',
                    stratify=False, random_state=42):
    """Merge RNA data with the label file, split 60/20/20 and standardize.

    Steps performed:
      1. Read both CSV files and rename the first column of each to ``SampleID``.
      2. Transpose the RNA table so that its original column headers become the
         ``SampleID`` values (presumably the raw file is genes x samples --
         confirm against the data).
      3. Inner-join the label table with the transposed RNA table on ``SampleID``
         and optionally dump the merged table to ``merged_data_path``.
      4. Drop rows whose ``msi_status`` is missing or equal to ``'Indeterminate'``.
      5. Keep only ``int64`` / ``float64`` columns as features.
      6. Split into train / validation / test (0.6 / 0.2 / 0.2).
      7. Fit a ``StandardScaler`` on the training features only and apply it to
         all three splits.
      8. Write each scaled split, with its ``msi_status`` column, to CSV.

    Parameters
    ----------
    rna_data_path, prediction_data_path : str or path-like
        Input CSV files. The first column of each is used as the sample key.
    train_data_path, val_data_path, test_data_path : str or path-like
        Output CSV files for the scaled splits.
    merged_data_path : str, path-like or None, optional
        Where to write the merged (unfiltered, unscaled) table. The default is
        the location the notebook has always written to; pass ``None`` to skip
        writing this intermediate file.
    stratify : bool, optional
        If True, both splits are stratified on ``msi_status`` so every split
        keeps the class proportions. Defaults to False (plain random split).
    random_state : int, optional
        Seed passed to both ``train_test_split`` calls.

    Returns
    -------
    tuple
        ``(X_train_scaled, y_train, X_val_scaled, y_val, X_test_scaled, y_test)``.
        The ``X_*`` values are whatever ``StandardScaler`` returns; the ``y_*``
        values are pandas Series that keep their index labels from the
        filtered table.
    """
    rna_data = pd.read_csv(rna_data_path)
    prediction_data = pd.read_csv(prediction_data_path)

    # Normalise the key column name on both tables so they can be joined.
    rna_data = rna_data.rename(columns={rna_data.columns[0]: 'SampleID'})
    prediction_data = prediction_data.rename(columns={prediction_data.columns[0]: 'SampleID'})

    # After the transpose the former column headers sit in the index;
    # reset_index() exposes them as a column called 'index'.
    transposed_rna_data = (
        rna_data.set_index('SampleID')
        .transpose()
        .reset_index()
        .rename(columns={'index': 'SampleID'})
    )
    merged_data = pd.merge(prediction_data, transposed_rna_data, on='SampleID', how='inner')
    if merged_data_path is not None:
        merged_data.to_csv(merged_data_path, index=False)

    # Rows without a usable label cannot be used for supervised training.
    merged_data = merged_data.dropna(subset=['msi_status'])
    filtered_data = merged_data[merged_data['msi_status'] != 'Indeterminate']

    features = (
        filtered_data
        .drop(columns=['SampleID', 'msi_status'])
        .select_dtypes(include=['int64', 'float64'])
    )
    target = filtered_data['msi_status']

    # First carve off 20% as the test set ...
    X_temp, X_test, y_temp, y_test = train_test_split(
        features, target, test_size=0.2, random_state=random_state,
        stratify=target if stratify else None,
    )
    # ... then 25% of the remaining 80% as validation (0.25 x 0.8 = 0.2 overall).
    X_train, X_val, y_train, y_val = train_test_split(
        X_temp, y_temp, test_size=0.25, random_state=random_state,
        stratify=y_temp if stratify else None,
    )

    # The scaler is fitted on the training split only, so validation and test
    # statistics never influence the transformation.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)

    _write_split(X_train_scaled, X_train.columns, y_train, train_data_path)
    _write_split(X_val_scaled, X_val.columns, y_val, val_data_path)
    _write_split(X_test_scaled, X_test.columns, y_test, test_data_path)

    return X_train_scaled, y_train, X_val_scaled, y_val, X_test_scaled, y_test

# Single project root: edit this one line to point the notebook at another checkout.
PROJECT_DIR = 'C:/Users/eirin/OneDrive/Υπολογιστής/ML4CRC'
RAW_DIR = f'{PROJECT_DIR}/data/raw'
PROCESSED_DIR = f'{PROJECT_DIR}/data/processed'

# Inputs read by preprocess_data.
rna_data_path = f'{RAW_DIR}/tcga_rna_count_data_crc.csv'
prediction_data_path = f'{RAW_DIR}/prediction_file_crc.csv'

# Outputs written by preprocess_data (one CSV per split).
train_data_path = f'{PROCESSED_DIR}/train_data.csv'
val_data_path = f'{PROCESSED_DIR}/val_data.csv'
test_data_path = f'{PROCESSED_DIR}/test_data.csv'

X_train_scaled, y_train, X_val_scaled, y_val, X_test_scaled, y_test = preprocess_data(
    rna_data_path,
    prediction_data_path,
    train_data_path,
    val_data_path,
    test_data_path,
)


# Echo every split in train -> validation -> test order, features before labels.
for split_output in (X_train_scaled, y_train, X_val_scaled, y_val, X_test_scaled, y_test):
    print(split_output)

[[-0.46356716 -0.58191878 -0.21586507 ... -0.06201737  0.
  -0.06201737]
 [ 1.14803496  0.4577952   0.48839472 ... -0.06201737  0.
  -0.06201737]
 [ 1.75238575  0.58602294  0.31232977 ... -0.06201737  0.
  -0.06201737]
 ...
 [ 2.79992713  0.02459928 -0.74405992 ... -0.06201737  0.
  -0.06201737]
 [ 1.12788993  2.48806895  2.07297926 ... -0.06201737  0.
  -0.06201737]
 [-1.10820801 -1.46120303 -1.44831971 ... -0.06201737  0.
  -0.06201737]]
151    MSI-H
187      MSS
62       MSS
204      MSS
400    MSI-H
       ...  
119    MSI-H
389      MSS
219      MSS
293      MSS
434    MSI-L
Name: msi_status, Length: 261, dtype: object
[[ 0.82571453  1.30619166  0.84052462 ... -0.06201737  0.
  -0.06201737]
 [-0.9470478   0.51704838  0.66445967 ... -0.06201737  0.
  -0.06201737]
 [-1.16864309 -1.11491392 -0.92012486 ... -0.06201737  0.
  -0.06201737]
 ...
 [-1.0276279  -0.62812442 -0.56799497 ... -0.06201737  0.
  -0.06201737]
 [-0.42327711  0.51638795  0.66445967 ... -0.06201737  0.
  -0.06201737

In [7]:
# Train the SVM model

from sklearn.svm import SVC

# Baseline model: linear-kernel SVM, seeded so repeated runs give the same fit.
svm_settings = {'C': 0.25, 'kernel': 'linear', 'random_state': 0}
classifier = SVC(**svm_settings)
classifier.fit(X_train_scaled, y_train)

In [8]:
# Confusion matrix and per-class precision, recall and F1-score on the validation set

from sklearn.metrics import confusion_matrix, classification_report

# Score the fitted classifier on the validation split.
y_pred = classifier.predict(X_val_scaled)

# Raw counts first (rows = true class, columns = predicted class) ...
val_confusion = confusion_matrix(y_val, y_pred)
print(val_confusion)

# ... then the per-class precision / recall / F1 summary.
val_report = classification_report(y_val, y_pred)
print(val_report)


[[11  1  1]
 [ 0  1 12]
 [ 4  6 51]]
              precision    recall  f1-score   support

       MSI-H       0.73      0.85      0.79        13
       MSI-L       0.12      0.08      0.10        13
         MSS       0.80      0.84      0.82        61

    accuracy                           0.72        87
   macro avg       0.55      0.59      0.57        87
weighted avg       0.69      0.72      0.70        87



Hyperparameter Tuning

In [6]:
from sklearn.model_selection import GridSearchCV

# Search space: the same C values for both kernels; gamma only applies to RBF.
c_values = [0.25, 0.5, 0.75, 1]
gamma_values = [0.1, 0.2, 0.3, 0.4, 0.5]
parameters = [
    {'C': list(c_values), 'kernel': ['linear']},
    {'C': list(c_values), 'kernel': ['rbf'], 'gamma': list(gamma_values)},
]

# 10-fold cross-validated accuracy on the training split, using all CPU cores.
grid_search = GridSearchCV(
    estimator=classifier,
    param_grid=parameters,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)
grid_search.fit(X_train_scaled, y_train)

# Best mean cross-validated score and the parameter combination that achieved it.
best_accuracy, best_parameters = grid_search.best_score_, grid_search.best_params_
for search_result in (best_accuracy, best_parameters):
    print(search_result)

0.7472934472934473
{'C': 0.25, 'kernel': 'linear'}
