# LDA classification of CyTOF

#### Verifying results of *Predicting cell types in single cell mass cytometry data*

In [14]:
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import numpy as np
from sklearn.metrics import f1_score
import os
from sklearn.model_selection import KFold
from sklearn import metrics
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure


### Sample of data
8 features (markers) over a large number of cells per CSV

In [15]:
# Names applied to the columns of the header-less sample CSV.
# NOTE(review): assumes the CSV columns are stored in exactly this order — confirm against the data source.
markers = ['CCR6','CD20','CD45','CD14','CD16','CD8','CD3','CD4']

# Load one sample file (no header row in the file) and label its columns.
sample = pd.read_csv("../Data/MultiCenter/Samples/sample01.csv", header=None)
sample.columns = markers

# Preview the first ten rows.
sample.head(10)

Unnamed: 0,CCR6,CD20,CD45,CD14,CD16,CD8,CD3,CD4
0,17.89,10.231,790.55,2.7417,0.0,1.0653,179.6,45.063
1,0.0,4.6372,1112.7,468.41,0.25689,2.4362,1.7811,13.506
2,0.002912,7.2418,1294.2,624.25,0.0,0.49315,2.3514,2.0904
3,2.9118,2.7268,1321.2,579.82,0.0,0.91006,1.5757,8.3613
4,0.0,4.5986,778.36,0.0,0.0,0.0,253.72,38.567
5,501.77,921.67,934.45,2.6003,0.0,0.043438,2.6143,1.787
6,0.75311,2.8833,964.34,383.89,4.1172,0.0,3.5132,1.442
7,0.15742,0.0,748.81,80.876,1.5933,4.5509,4.1106,5.1319
8,0.0,1.7064,965.98,1.452,65.134,0.18693,0.008875,0.0
9,0.0,2.5197,1308.4,0.2186,16.537,83.937,4.5476,1.5487


X-fold validation with one file/sample being left out each iteration and used as test:
CV-samples in paper

In [16]:
# Sorted directory listings. Later cells pair labelfiles[i] with samplefiles[i],
# so both lists are sorted the same way.
# NOTE(review): this pairing relies on label and sample files sorting into the
# same order and on neither directory containing stray files — confirm.
labelfiles = sorted(os.listdir("../Data/MultiCenter/Labels/"))
samplefiles = sorted(os.listdir("../Data/MultiCenter/Samples/"))



## Stratify MultiCenter per sample
Train 16 iterations of MultiCenter with 15 samples, leaving one out <br>
Predict labels for sample which was excluded from training <br>
Save predictions to file and prediction accuracy to variable <br>
MultiCenter CV-samples in paper

In [17]:
# Leave-one-sample-out cross-validation: with n_splits equal to the number of
# sample files, every fold holds out exactly one file as the test set.
kfoldsplitter = KFold(n_splits=len(samplefiles))
for train, test in kfoldsplitter.split(labelfiles):
    # Stack ALL training samples (and their labels) for this fold before fitting.
    # LinearDiscriminantAnalysis.fit() is not incremental: calling it once per
    # file would discard the previous fit and leave a model trained only on the
    # last training file.
    X = np.concatenate([
        pd.read_csv("../Data/MultiCenter/Samples/" + samplefiles[trainingset], header=None).values
        for trainingset in train
    ]) #train
    y = np.concatenate([
        pd.read_csv("../Data/MultiCenter/Labels/" + labelfiles[trainingset], header=None).values.ravel()
        for trainingset in train
    ]) #labels

    classifier = LinearDiscriminantAnalysis()
    classifier.fit(X, y)

    # Held-out sample for this fold (test contains a single index).
    testdata = pd.read_csv("../Data/MultiCenter/Samples/" + samplefiles[test[0]], header=None).values #test
    # True labels of the held-out sample; not used in this cell, kept available for later evaluation.
    testlabels = pd.read_csv("../Data/MultiCenter/Labels/" + labelfiles[test[0]], header=None).values.ravel() #testlabels

    # Predict the held-out sample with the model trained on all other samples
    # and save the predictions to file, one label per line.
    prediction = classifier.predict(testdata)
    with open('../Results/Predictions/MultiCenter LDA/CV-sample/predict_'+samplefiles[test[0]], 'w') as f:
        for item in prediction:
            f.write("%s\n" % item)
    print("Done testing %s" % samplefiles[test[0]])


Done testing sample01.csv
Done testing sample02.csv
Done testing sample03.csv
Done testing sample04.csv
Done testing sample05.csv
Done testing sample06.csv
Done testing sample07.csv
Done testing sample08.csv
Done testing sample09.csv
Done testing sample10.csv
Done testing sample11.csv
Done testing sample12.csv
Done testing sample13.csv
Done testing sample14.csv
Done testing sample15.csv
Done testing sample16.csv


## Stratify MultiCenter 5-fold

In [18]:
# NOTE(review): KFold defaults to shuffle=False, so the five folds are contiguous
# row blocks in file order — confirm this is the intended CV-cells protocol.
kfoldsplitter = KFold(n_splits=5)

#Load all samples and labels for 5-fold CV
# Read every file once and concatenate in a single call. Growing a frame with
# DataFrame.append inside a loop re-copies all data each iteration, and
# DataFrame.append no longer exists in pandas >= 2.0.
sampleframes = []
labelframes = []
for labelname, samplename in zip(labelfiles, samplefiles):
    sampleframes.append(pd.read_csv("../Data/MultiCenter/Samples/" + samplename, header=None))
    labelframes.append(pd.read_csv("../Data/MultiCenter/Labels/" + labelname, header=None))

# Row indices are kept per file (not reset), as before; later cells select rows
# by position. An empty file list yields empty frames instead of an error.
samplesheap = pd.concat(sampleframes) if sampleframes else pd.DataFrame()
labelsheap = pd.concat(labelframes) if labelframes else pd.DataFrame()



The number of predictions/true labels matches the original number of instances

In [19]:


# 1-based fold counter, used only to name the per-fold output file.
batch = 1
for train, test in kfoldsplitter.split(samplesheap):
    # Positional row selection for this fold: training rows with their labels,
    # and the held-out rows to predict.
    traindata = samplesheap.iloc[train].values
    trainlabels = labelsheap.iloc[train].values.ravel()
    testdata = samplesheap.iloc[test].values

    # A fresh model per fold: fit on the training rows, predict the held-out rows.
    classifier = LinearDiscriminantAnalysis().fit(traindata, trainlabels)
    prediction = classifier.predict(testdata)

    # Report how many predictions this fold produced.
    print(len(prediction))

    # Write the predictions to file, one label per line.
    outpath = '../Results/Predictions/MultiCenter LDA/CV-cells/predict_batch_' + str(batch) +'.csv'
    with open(outpath, 'w') as f:
        f.writelines("%s\n" % item for item in prediction)

    print("Done testing %s" % str(batch))
    batch += 1

185937
Done testing 1
185937
Done testing 2
185937
Done testing 3
185937
Done testing 4
185937
Done testing 5


In [20]:
# Print the total number of rows that were loaded, then drop both large
# frames so the kernel no longer holds references to them.
print(len(samplesheap))

del labelsheap, samplesheap

929685
