In [8]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import svm, metrics
from sklearn.neighbors import KNeighborsClassifier, DistanceMetric
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from preprocess import preprocess, load_dataset

# Classification of cell types with RNA-seq data

## Preparation

In [35]:
# Load data and preprocess
datadir = 'data/muraro'
data = load_dataset(datadir, 'muraro')
X, y = preprocess(data)
num_labels = len(np.unique(y))
num_features = X.shape[1]

In [36]:
# Train/Test splitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, shuffle=True)

## Algorithms

In [37]:
def evaluate_classifier(clf, X_train, X_test, y_train, y_test, average='macro'):
    clf.fit(X_train, y_train)
    prediction = clf.predict(X_test)
    precision, recall, f1, support = metrics.precision_recall_fscore_support(y_test, prediction,
                                                                         average=average,
                                                                         zero_division='warn')
    return precision, recall, f1

In [5]:
eval_log = {}

# SVM classifier with linear kernel
clf = svm.SVC(kernel='linear')
precision, recall, f1 = evaluate_classifier(clf, X_train, X_test, y_train, y_test, average='micro')
eval_log['svm-linear'] = (precision, recall, f1)

# SVM classifier with polynomial kernel
clf = svm.SVC(kernel='poly', degree=3)
precision, recall, f1 = evaluate_classifier(clf, X_train, X_test, y_train, y_test, average='micro')
eval_log['svm-poly'] = (precision, recall, f1)

# SVM classifier with RBF kernel
clf = svm.SVC(kernel='rbf')
precision, recall, f1 = evaluate_classifier(clf, X_train, X_test, y_train, y_test, average='micro')
eval_log['svm-rbf'] = (precision, recall, f1)

# SVM classifier with sigmoid kernel
clf = svm.SVC(kernel='sigmoid')
precision, recall, f1 = evaluate_classifier(clf, X_train, X_test, y_train, y_test, average='micro')
eval_log['svm-sigmoid'] = (precision, recall, f1)

In [6]:
# kNN classifier with Euclidean distance
clf = KNeighborsClassifier(n_neighbors=num_labels, metric='euclidean')
precision, recall, f1 = evaluate_classifier(clf, X_train, X_test, y_train, y_test, average='micro')
eval_log['knn-euclidean'] = (precision, recall, f1)

# kNN classifier with Manhattan distance
clf = KNeighborsClassifier(n_neighbors=num_labels, metric='manhattan')
precision, recall, f1 = evaluate_classifier(clf, X_train, X_test, y_train, y_test, average='micro')
eval_log['knn-manhattan'] = (precision, recall, f1)

In [7]:
# Multi-layer perceptron classifier
clf = MLPClassifier()
precision, recall, f1 = evaluate_classifier(clf, X_train, X_test, y_train, y_test, average='micro')
eval_log['mlp'] = (precision, recall, f1)

In [8]:
eval_log

{'svm-linear': (0.9874213836477987, 0.9874213836477987, 0.9874213836477987),
 'svm-poly': (0.8522012578616353, 0.8522012578616353, 0.8522012578616353),
 'svm-rbf': (0.9654088050314465, 0.9654088050314465, 0.9654088050314465),
 'svm-sigmoid': (0.949685534591195, 0.949685534591195, 0.949685534591195),
 'knn-euclidean': (0.9748427672955975, 0.9748427672955975, 0.9748427672955975),
 'knn-manhattan': (0.9748427672955975, 0.9748427672955975, 0.9748427672955975),
 'mlp': (0.9779874213836478, 0.9779874213836478, 0.9779874213836478)}

## Kernel-based kNN

In [78]:
def rbf_kernel_dist(x, y, gamma):
    return 1 - np.exp(- gamma * ((x - y) ** 2).sum())

def poly_kernel_dist(x, y, gamma, r=0., d=3):
    Kxx = (r + gamma * (x ** 2).sum()) ** d
    Kyy = (r + gamma * (y ** 2).sum()) ** d
    Kxy = (r + gamma * np.dot(x, y)) ** d
    return Kxx + Kyy - 2 * Kxy

def sigmoid_kernel_dist(x, y, gamma, r=0.):
    Kxx = np.tanh(r + gamma * (x ** 2).sum())
    Kyy = np.tanh(r + gamma * (y ** 2).sum())
    Kxy = np.tanh(r + gamma * np.dot(x, y))
    return Kxx + Kyy - 2 * Kxy

In [74]:
clf = KNeighborsClassifier(n_neighbors=num_labels, metric=rbf_kernel_dist, 
                           metric_params={'gamma' : 1 / num_features})
clf.fit(X_train, y_train)
prediction = clf.predict(X_test)
print(f"Classification report for classifier {clf}:\n"
      f"{metrics.classification_report(y_test, prediction)}\n")

Classification report for classifier KNeighborsClassifier(metric=<function rbf_kernel_dist at 0x7fa591dff700>,
                     metric_params={'gamma': 5.2879276611495956e-05},
                     n_neighbors=9):
              precision    recall  f1-score   support

      acinar       1.00      0.35      0.52        37
       alpha       0.97      0.97      0.97       118
        beta       0.93      0.94      0.94        70
       delta       1.00      0.96      0.98        26
        duct       0.56      0.97      0.71        40
 endothelial       1.00      1.00      1.00         3
 mesenchymal       1.00      0.69      0.82        13
          pp       1.00      0.73      0.84        11

    accuracy                           0.87       318
   macro avg       0.93      0.83      0.85       318
weighted avg       0.92      0.87      0.87       318




In [80]:
clf = KNeighborsClassifier(n_neighbors=num_labels, metric=poly_kernel_dist, 
                           metric_params={'gamma' : 1 / num_features})
clf.fit(X_train, y_train)
prediction = clf.predict(X_test)
print(f"Classification report for classifier {clf}:\n"
      f"{metrics.classification_report(y_test, prediction)}\n")

Classification report for classifier KNeighborsClassifier(metric=<function poly_kernel_dist at 0x7fa591fa4af0>,
                     metric_params={'gamma': 5.2879276611495956e-05},
                     n_neighbors=9):
              precision    recall  f1-score   support

      acinar       1.00      0.95      0.97        37
       alpha       0.98      1.00      0.99       118
        beta       0.93      0.96      0.94        70
       delta       1.00      1.00      1.00        26
        duct       0.87      0.97      0.92        40
 endothelial       1.00      0.67      0.80         3
 mesenchymal       1.00      0.77      0.87        13
          pp       1.00      0.73      0.84        11

    accuracy                           0.96       318
   macro avg       0.97      0.88      0.92       318
weighted avg       0.96      0.96      0.96       318




In [81]:
clf = KNeighborsClassifier(n_neighbors=num_labels, metric=sigmoid_kernel_dist, 
                           metric_params={'gamma' : 1 / num_features})
clf.fit(X_train, y_train)
prediction = clf.predict(X_test)
print(f"Classification report for classifier {clf}:\n"
      f"{metrics.classification_report(y_test, prediction)}\n")

Classification report for classifier KNeighborsClassifier(metric=<function sigmoid_kernel_dist at 0x7fa5957eb670>,
                     metric_params={'gamma': 5.2879276611495956e-05},
                     n_neighbors=9):
              precision    recall  f1-score   support

      acinar       0.00      0.00      0.00        37
       alpha       0.00      0.00      0.00       118
        beta       0.21      0.11      0.15        70
       delta       0.00      0.00      0.00        26
        duct       0.12      0.75      0.21        40
 endothelial       0.00      0.00      0.00         3
 mesenchymal       1.00      0.08      0.14        13
          pp       0.00      0.00      0.00        11

    accuracy                           0.12       318
   macro avg       0.17      0.12      0.06       318
weighted avg       0.10      0.12      0.06       318




  _warn_prf(average, modifier, msg_start, len(result))
