Ονοματεπώνυμο: Ζαμάγιας Μιχαήλ Ανάργυρος

ΑΜ: ΤΠ5000

## Import modules

In [1]:
from typing import Tuple

import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
# import matplotlib.pyplot as plt

# Data pre-processing

## Read datasets

In [2]:
# Tab-separated input files, in order: training set, testing set, KEGG data.
datasets = ['LungTrain.txt', 'LungTest.txt', 'KEGG_DATA.txt']
training_dataset, testing_dataset, kegg_dataset = map(
    lambda path: pd.read_csv(path, sep='\t'),
    datasets,
)

In [3]:
def preprocess_data(dataset: pd.DataFrame) -> Tuple[pd.Series, pd.Series, pd.DataFrame]:
    """Split a raw table into gene names, sample labels and a feature frame.

    The first column of ``dataset`` is treated as the gene identifiers and
    every remaining column as one sample. A sample column whose name contains
    ``'Cancer'`` is labelled ``'Cancer'``; otherwise, one whose name contains
    ``'Normal'`` is labelled ``'Normal'``; any other column keeps its original
    name as its label.

    Args:
        dataset: Read dataset, with genes as rows and samples as columns.

    Returns:
        A tuple ``(genes, labels, features)``:
            -   ``genes``: the first column of ``dataset``.
            -   ``labels``: one label per sample column, indexed by the
                column's position in ``dataset`` (so the index starts at 1).
            -   ``features``: the sample columns transposed to one row per
                sample, with the labels as the row index.
    """
    dataset_genes = dataset.iloc[:, 0]
    dataset_labels = pd.Series(
        [
            'Cancer' if 'Cancer' in column
            else 'Normal' if 'Normal' in column
            else column
            for column in dataset.columns
        ]
    )
    # The first entry belongs to the gene column, not to a sample.
    sample_labels = dataset_labels[1:]
    # Drop the gene column *before* transposing: transposing a frame that
    # mixes the string gene column with numeric sample columns would turn
    # every value into ``object`` dtype.
    altered_dataset = (
        dataset.iloc[:, 1:]
        .set_axis(sample_labels.to_numpy(), axis='columns')
        .T
    )
    return dataset_genes, sample_labels, altered_dataset


In [4]:
# Replace each raw table with its (genes, labels, processed frame) split.
(
    (training_genes, training_labels, training_dataset),
    (testing_genes, testing_labels, testing_dataset),
) = [preprocess_data(raw_table) for raw_table in (training_dataset, testing_dataset)]

# Data analysis

## Support Vector Machines model

### Train the SVM model with the training dataset

In [5]:
# Convert the processed training frame and its labels to NumPy arrays,
# then fit an SVC with default settings on them.
train_features, train_labels = np.array(training_dataset), np.array(training_labels)
svm_model = SVC()
svm_model.fit(train_features, train_labels)

### Prepare the testing dataset for the trained SVM model

In [6]:
# Convert the processed testing frame and its labels to NumPy arrays.
test_features, test_labels = np.array(testing_dataset), np.array(testing_labels)

### Calculate its prediction, confusion matrix and accuracy

In [7]:
# Predict a label for every testing sample.
svm_prediction = svm_model.predict(test_features)
# Score the model on the testing samples against their true labels.
svm_accuracy = svm_model.score(test_features, test_labels)
# Cross-tabulate the true labels against the predicted ones.
svm_confusion_matrix = confusion_matrix(y_true=test_labels, y_pred=svm_prediction)

### Print its prediction, confusion matrix and accuracy

In [8]:
# Report the predictions, the confusion matrix and the accuracy, one per line.
print(svm_prediction, svm_confusion_matrix, svm_accuracy, sep='\n')

['Normal' 'Normal' 'Cancer' 'Cancer' 'Normal' 'Cancer' 'Normal' 'Cancer'
 'Normal' 'Normal' 'Normal' 'Normal' 'Cancer' 'Normal' 'Cancer' 'Normal'
 'Cancer' 'Normal' 'Cancer' 'Cancer' 'Cancer' 'Cancer' 'Cancer' 'Normal'
 'Cancer' 'Normal' 'Cancer' 'Cancer' 'Normal' 'Normal']
[[14  0]
 [ 1 15]]
0.9666666666666667


In [9]:
# TODO: Train a decision tree model on the training dataset (k-fold cross-validation is not required), then print its accuracy and the fitted tree.

In [10]:
# TODO: Run the trained decision tree model on the testing dataset, print the prediction for each testing sample, and print the confusion matrix.

In [11]:
# TODO: Comment on the two models (SVM and decision tree) and explain which of the two is more efficient, if either is.

## Gene selection
