In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import svm, metrics
from sklearn.model_selection import train_test_split

Muraro: 19140 features, 2126 samples.

In [2]:
# Loading data
samples = pd.read_csv('data/muraro/data.csv', delimiter='\t').transpose()
samples.index.name = 'cell'
samples.reset_index(inplace=True)
labels = pd.read_csv('data/muraro/cell_type_annotation_Cels2016.csv', 
                        delimiter='\t', skiprows=1, names=['cell', 'label'])

# Formatting data
samples['cell'] = samples['cell'].astype('string')
labels['cell'] = labels['cell'].astype('string')
labels['cell'] = labels['cell'].apply(lambda name: name.replace('.', '-'))

# Merge samples with features to labels
data = samples.merge(labels, on='cell', how='inner')

In [3]:
# Data formatted before passing to pre-processing
data

Unnamed: 0,cell,A1BG-AS1__chr19,A1BG__chr19,A1CF__chr10,A2M-AS1__chr12,A2ML1__chr12,A2M__chr12,A4GALT__chr22,A4GNT__chr3,AAAS__chr12,...,ZWINT__chr10,ZXDA__chrX,ZXDB__chrX,ZXDC__chr3,ZYG11A__chr1,ZYG11B__chr1,ZYX__chr7,ZZEF1__chr17,ZZZ3__chr1,label
0,D28-1_1,0.0,0.000000,6.071431,0.000000,0.0,0.000000,0.000000,0.0,1.001958,...,0.0,0.000000,1.001958,5.049473,0.000000,2.007853,0.000000,0.000000,0.000000,alpha
1,D28-1_2,0.0,0.000000,0.000000,0.000000,0.0,5.049473,1.001958,0.0,0.000000,...,0.0,0.000000,1.001958,9.162012,0.000000,6.071431,5.049473,4.031579,1.001958,endothelial
2,D28-1_3,0.0,1.001958,2.007853,0.000000,0.0,0.000000,0.000000,0.0,0.000000,...,0.0,1.001958,0.000000,1.001958,0.000000,1.001958,0.000000,1.001958,0.000000,delta
3,D28-1_4,0.0,1.001958,6.071431,0.000000,0.0,0.000000,0.000000,0.0,1.001958,...,0.0,0.000000,2.007853,10.200553,1.001958,6.071431,1.001958,3.017717,2.007853,beta
4,D28-1_5,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,...,0.0,0.000000,0.000000,3.017717,0.000000,3.017717,0.000000,0.000000,0.000000,unclear
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2121,D30-8_90,0.0,0.000000,3.017717,0.000000,0.0,0.000000,0.000000,0.0,1.001958,...,0.0,0.000000,2.007853,1.001958,0.000000,1.001958,0.000000,0.000000,0.000000,alpha
2122,D30-8_91,0.0,0.000000,0.000000,1.001958,0.0,11.243324,0.000000,0.0,1.001958,...,0.0,0.000000,1.001958,3.017717,0.000000,4.031579,0.000000,4.031579,4.031579,mesenchymal
2123,D30-8_92,0.0,0.000000,3.017717,0.000000,0.0,0.000000,0.000000,0.0,0.000000,...,0.0,1.001958,1.001958,7.097484,0.000000,2.007853,1.001958,0.000000,1.001958,alpha
2124,D30-8_93,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,1.001958,...,0.0,0.000000,0.000000,1.001958,0.000000,3.017717,8.127667,2.007853,0.000000,duct


In [4]:
# Preprocessing

DROP_LABELS = ['unclear']

# Filter out cells with 'unclear' label
data = data.loc[~data['label'].isin(DROP_LABELS)]

# Find features with values = 0 across all cells
columns = data.columns[1:-1]  # Exclude 'cell' and 'label' columns
nonzero_columns = []
for col in columns:
    if (data[col] != 0).any():
        nonzero_columns.append(col)
        
# Filter out columns with values = 0 across all cells
data = data[['cell', 'label'] + nonzero_columns]

# Filter out cells with number of detected genes less than 3 MAD 
# from the median of number of detected genes
num_detected = data.iloc[:, 2:].apply(lambda features: np.sum(features != 0), axis=1)
median_num_detected = np.median(num_detected)
absolute_deviation = np.abs(num_detected - median_num_detected)
mad = np.median(absolute_deviation)
data['num_detected'] = num_detected
data = data.loc[median_num_detected - data['num_detected'] <= 3 * mad]

# Extract the features and the labels
X = data.iloc[:, 2:-1].to_numpy()
y = data.loc[:, 'label'].to_numpy()

In [5]:
# Pre-processed data
data

Unnamed: 0,cell,label,A1BG__chr19,A1CF__chr10,A2M-AS1__chr12,A2ML1__chr12,A2M__chr12,A4GALT__chr22,A4GNT__chr3,AAAS__chr12,...,ZWINT__chr10,ZXDA__chrX,ZXDB__chrX,ZXDC__chr3,ZYG11A__chr1,ZYG11B__chr1,ZYX__chr7,ZZEF1__chr17,ZZZ3__chr1,num_detected
0,D28-1_1,alpha,0.000000,6.071431,0.000000,0.0,0.000000,0.000000,0.0,1.001958,...,0.0,0.000000,1.001958,5.049473,0.000000,2.007853,0.000000,0.000000,0.000000,5448
1,D28-1_2,endothelial,0.000000,0.000000,0.000000,0.0,5.049473,1.001958,0.0,0.000000,...,0.0,0.000000,1.001958,9.162012,0.000000,6.071431,5.049473,4.031579,1.001958,6464
2,D28-1_3,delta,1.001958,2.007853,0.000000,0.0,0.000000,0.000000,0.0,0.000000,...,0.0,1.001958,0.000000,1.001958,0.000000,1.001958,0.000000,1.001958,0.000000,5212
3,D28-1_4,beta,1.001958,6.071431,0.000000,0.0,0.000000,0.000000,0.0,1.001958,...,0.0,0.000000,2.007853,10.200553,1.001958,6.071431,1.001958,3.017717,2.007853,7318
5,D28-1_13,duct,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,...,0.0,0.000000,1.001958,2.007853,0.000000,0.000000,1.001958,1.001958,4.031579,5911
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2121,D30-8_90,alpha,0.000000,3.017717,0.000000,0.0,0.000000,0.000000,0.0,1.001958,...,0.0,0.000000,2.007853,1.001958,0.000000,1.001958,0.000000,0.000000,0.000000,5145
2122,D30-8_91,mesenchymal,0.000000,0.000000,1.001958,0.0,11.243324,0.000000,0.0,1.001958,...,0.0,0.000000,1.001958,3.017717,0.000000,4.031579,0.000000,4.031579,4.031579,7404
2123,D30-8_92,alpha,0.000000,3.017717,0.000000,0.0,0.000000,0.000000,0.0,0.000000,...,0.0,1.001958,1.001958,7.097484,0.000000,2.007853,1.001958,0.000000,1.001958,6209
2124,D30-8_93,duct,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,1.001958,...,0.0,0.000000,0.000000,1.001958,0.000000,3.017717,8.127667,2.007853,0.000000,7165


## SVM Demo

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=True)

In [7]:
clf = svm.SVC(kernel='rbf')
clf.fit(X_train, y_train)
predicted = clf.predict(X_test)

In [8]:
predicted

array(['beta', 'beta', 'alpha', 'alpha', 'alpha', 'beta', 'delta',
       'alpha', 'beta', 'alpha', 'alpha', 'acinar', 'delta', 'beta',
       'beta', 'alpha', 'alpha', 'beta', 'acinar', 'alpha', 'acinar',
       'duct', 'alpha', 'beta', 'delta', 'alpha', 'alpha', 'alpha',
       'alpha', 'acinar', 'beta', 'duct', 'alpha', 'mesenchymal', 'beta',
       'alpha', 'alpha', 'alpha', 'alpha', 'alpha', 'beta', 'mesenchymal',
       'beta', 'acinar', 'alpha', 'alpha', 'delta', 'beta', 'alpha',
       'acinar', 'alpha', 'duct', 'alpha', 'beta', 'alpha', 'duct',
       'alpha', 'acinar', 'duct', 'beta', 'alpha', 'alpha', 'delta',
       'alpha', 'acinar', 'beta', 'duct', 'duct', 'delta', 'alpha',
       'acinar', 'alpha', 'alpha', 'beta', 'alpha', 'delta', 'alpha',
       'alpha', 'alpha', 'acinar', 'alpha', 'alpha', 'beta', 'delta',
       'alpha', 'duct', 'duct', 'alpha', 'pp', 'alpha', 'alpha', 'acinar',
       'alpha', 'alpha', 'duct', 'alpha', 'beta', 'alpha', 'alpha',
       'alpha', 'bet

In [9]:
print(f"Classification report for classifier {clf}:\n"
      f"{metrics.classification_report(y_test, predicted)}\n")

Classification report for classifier SVC():
              precision    recall  f1-score   support

      acinar       1.00      0.96      0.98        24
       alpha       1.00      0.99      0.99        99
        beta       0.94      0.97      0.96        34
       delta       1.00      1.00      1.00        17
        duct       0.88      0.95      0.91        22
 endothelial       0.00      0.00      0.00         1
 mesenchymal       1.00      1.00      1.00         6
          pp       1.00      1.00      1.00         9

    accuracy                           0.98       212
   macro avg       0.85      0.86      0.86       212
weighted avg       0.97      0.98      0.97       212




  _warn_prf(average, modifier, msg_start, len(result))
