In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import svm, metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from preprocess import preprocess, load_dataset

# Classification of cell types with RNA-seq data

## Preparation

In [2]:
# Load data and preprocess
datadir = 'data/muraro'
data = load_dataset(datadir, 'muraro')
X, y = preprocess(data)

In [3]:
# Train/Test splitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, shuffle=True)

## Algorithms

In [4]:
def evaluate_classifier(clf, X_train, X_test, y_train, y_test, average='macro'):
    clf.fit(X_train, y_train)
    prediction = clf.predict(X_test)
    precision, recall, f1, support = metrics.precision_recall_fscore_support(y_test, prediction,
                                                                         average=average,
                                                                         zero_division='warn')
    return precision, recall, f1

In [5]:
eval_log = {}

# SVM classifier with linear kernel
clf = svm.SVC(kernel='linear')
precision, recall, f1 = evaluate_classifier(clf, X_train, X_test, y_train, y_test, average='micro')
eval_log['svm-linear'] = (precision, recall, f1)

# SVM classifier with polynomial kernel
clf = svm.SVC(kernel='poly', degree=3)
precision, recall, f1 = evaluate_classifier(clf, X_train, X_test, y_train, y_test, average='micro')
eval_log['svm-poly'] = (precision, recall, f1)

# SVM classifier with RBF kernel
clf = svm.SVC(kernel='rbf')
precision, recall, f1 = evaluate_classifier(clf, X_train, X_test, y_train, y_test, average='micro')
eval_log['svm-rbf'] = (precision, recall, f1)

# SVM classifier with sigmoid kernel
clf = svm.SVC(kernel='sigmoid')
precision, recall, f1 = evaluate_classifier(clf, X_train, X_test, y_train, y_test, average='micro')
eval_log['svm-sigmoid'] = (precision, recall, f1)

In [6]:
num_labels = len(np.unique(y))

# kNN classifier with Euclidean distance
clf = KNeighborsClassifier(n_neighbors=num_labels, metric='euclidean')
precision, recall, f1 = evaluate_classifier(clf, X_train, X_test, y_train, y_test, average='micro')
eval_log['knn-euclidean'] = (precision, recall, f1)

# kNN classifier with Manhattan distance
clf = KNeighborsClassifier(n_neighbors=num_labels, metric='manhattan')
precision, recall, f1 = evaluate_classifier(clf, X_train, X_test, y_train, y_test, average='micro')
eval_log['knn-manhattan'] = (precision, recall, f1)

In [7]:
# Multi-layer perceptron classifier
clf = MLPClassifier()
precision, recall, f1 = evaluate_classifier(clf, X_train, X_test, y_train, y_test, average='micro')
eval_log['mlp'] = (precision, recall, f1)

In [8]:
eval_log

{'svm-linear': (0.9874213836477987, 0.9874213836477987, 0.9874213836477987),
 'svm-poly': (0.8522012578616353, 0.8522012578616353, 0.8522012578616353),
 'svm-rbf': (0.9654088050314465, 0.9654088050314465, 0.9654088050314465),
 'svm-sigmoid': (0.949685534591195, 0.949685534591195, 0.949685534591195),
 'knn-euclidean': (0.9748427672955975, 0.9748427672955975, 0.9748427672955975),
 'knn-manhattan': (0.9748427672955975, 0.9748427672955975, 0.9748427672955975),
 'mlp': (0.9779874213836478, 0.9779874213836478, 0.9779874213836478)}