# LDA classification of CyTOF

#### Verifying the results of "Predicting cell types in single cell mass cytometry data"

In [4]:
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import numpy as np
from sklearn.metrics import f1_score
import os
from sklearn.model_selection import KFold
from sklearn import metrics
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure


### Sample of data
39 features (one per marker listed below) over a large number of cells per CSV

In [7]:
# Names assigned positionally to the columns of a sample CSV.
markers = [
    'Ter119', 'CD45.2', 'Ly6G', 'IgD', 'CD11c', 'F480', 'CD3', 'NKp46',
    'CD23', 'CD34', 'CD115', 'CD19', '120g8', 'CD8', 'Ly6C', 'CD4',
    'CD11b', 'CD27', 'CD16_32', 'SiglecF', 'Foxp3', 'B220', 'CD5', 'FceR1a',
    'TCRgd', 'CCR7', 'Sca1', 'CD49b', 'cKit', 'CD150', 'CD25', 'TCRb',
    'CD43', 'CD64', 'CD138', 'CD103', 'IgM', 'CD44', 'MHCII',
]

# The file is read without a header row, so the marker names are attached
# after loading; the assignment fails if the column count does not match.
sample_path = "../Data/PANORAMA/Samples/Samples01.csv"
sample = pd.read_csv(sample_path, header=None)
sample.columns = markers

# Preview the first ten rows.
sample.head(10)

Unnamed: 0,Ter119,CD45.2,Ly6G,IgD,CD11c,F480,CD3,NKp46,CD23,CD34,...,CD150,CD25,TCRb,CD43,CD64,CD138,CD103,IgM,CD44,MHCII
0,-0.30416,1.3123,-0.28697,-0.20859,1.4027,1.2065,-0.21372,-0.054569,-0.23933,-0.28224,...,0.87474,-0.016584,-0.099003,5.7203,0.66856,-0.098138,-0.22934,-0.24672,3.346,-0.050778
1,-0.33094,0.19375,-0.009644,-0.33023,-0.33983,1.3274,-0.32119,-0.29426,-0.21541,-0.29241,...,-0.3042,-0.31898,-0.23943,0.29711,0.43763,0.023776,-0.30394,0.20291,3.9978,0.34463
2,-0.30554,1.745,1.0514,-0.27056,-0.14313,-0.30412,-0.19982,-0.261,-0.25309,-0.34176,...,-0.22832,-0.31539,-0.23872,-0.25098,-0.33615,-0.18939,-0.13723,5.1083,0.39405,3.6663
3,-0.34303,2.0117,-0.17223,-0.33583,-0.342,2.0031,-0.28428,-0.18604,0.30058,-0.29058,...,-0.2637,-0.22253,-0.26603,-0.15527,2.7679,-0.23938,-0.023883,-0.034218,4.4852,0.70171
4,-0.19893,1.8105,-0.30043,-0.32335,-0.34281,0.60875,-0.31348,0.57074,-0.24489,-0.096765,...,-0.29652,-0.13683,1.123,3.5553,0.69628,0.10203,-0.016809,-0.26499,4.786,0.60626
5,-0.21946,0.61753,-0.29855,-0.22917,-0.29682,1.0927,-0.25765,-0.33593,-0.29593,0.40998,...,-0.28084,0.27857,-0.33204,0.18901,2.2902,0.22934,-0.20956,0.37675,3.7926,0.66965
6,-0.28746,0.95229,0.018152,-0.31537,-0.13907,2.2274,-0.1974,-0.14814,-0.23947,0.71752,...,-0.30203,-0.24237,-0.29407,0.092166,0.21569,-0.32049,0.55912,-0.047981,3.184,0.61627
7,-0.32242,0.9039,0.29245,0.67402,0.033029,1.9748,-0.33136,0.19163,-0.062204,1.5187,...,1.1633,-0.22519,-0.15124,4.1138,-0.12754,-0.099417,-0.27024,-0.31543,4.8717,0.9354
8,-0.26139,0.2981,0.075409,-0.32091,-0.21694,-0.20411,-0.3365,-0.28278,-0.21323,-0.29208,...,0.027988,-0.34094,-0.29122,3.3148,0.12707,0.29847,-0.2668,0.06375,3.5569,0.004834
9,-0.25832,0.70085,0.17934,-0.31544,-0.31511,0.53833,-0.027095,-0.30274,-0.21848,-0.15244,...,-0.15976,-0.32048,0.23973,0.043636,2.2528,0.50706,-0.30717,-0.18014,3.6351,-0.11215


X-fold validation with one file/sample being left out each iteration and used as test:
CV-samples in paper

In [13]:
# os.listdir returns entries in arbitrary order; sort both listings so the
# two lists can be indexed in parallel by the cells below.
# NOTE(review): this presumably relies on label and sample files sharing a
# naming scheme that sorts identically — confirm against the data directory.
labelfiles = sorted(os.listdir("../Data/PANORAMA/Labels/"))
samplefiles = sorted(os.listdir("../Data/PANORAMA/Samples/"))



## Stratify PANORAMA per sample
Train 10 iterations of PANORAMA with 9 samples, leaving one out <br>
Predict labels for sample which was excluded from training <br>
Save predictions to file and prediction accuracy to variable <br>
PANORAMA CV-samples in paper

In [14]:
# Leave-sample-out cross-validation: each fold trains on the files selected by
# `train` and predicts the held-out file(s) selected by `test`.
kfoldsplitter = KFold(n_splits=10)
for train, test in kfoldsplitter.split(labelfiles):
    # LinearDiscriminantAnalysis.fit() refits from scratch on every call, so
    # fitting once per file would leave a model trained on the LAST file only.
    # Stack every training file first and fit a single time on all of them.
    X = pd.concat(
        [pd.read_csv("../Data/PANORAMA/Samples/" + samplefiles[trainingset], header=None)
         for trainingset in train],
        ignore_index=True,
    ).values #train
    y = pd.concat(
        [pd.read_csv("../Data/PANORAMA/Labels/" + labelfiles[trainingset], header=None)
         for trainingset in train],
        ignore_index=True,
    ).values.ravel() #labels
    classifier = LinearDiscriminantAnalysis()
    classifier.fit(X, y)

    # With exactly 10 files every fold holds out one file; iterating over the
    # whole fold makes sure no held-out file is skipped if there are more.
    for testindex in test:
        testdata = pd.read_csv("../Data/PANORAMA/Samples/" + samplefiles[testindex], header=None).values #test
        # True labels of the held-out file; loaded here but not used in this cell.
        testlabels = pd.read_csv("../Data/PANORAMA/Labels/" + labelfiles[testindex], header=None).values.ravel() #testlabels
        # predict the held-out file with the model trained on the other files & save to file
        prediction = classifier.predict(testdata)
        with open('../Results/Predictions/PANORAMA LDA/CV-sample/predict_'+samplefiles[testindex], 'w') as f:
            # One predicted label per line, in the row order of the test file.
            for item in prediction:
                f.write("%s\n" % item)
        print("Done testing %s" % samplefiles[testindex])


Done testing Samples01.csv
Done testing Samples02.csv
Done testing Samples03.csv
Done testing Samples04.csv
Done testing Samples05.csv
Done testing Samples06.csv
Done testing Samples07.csv
Done testing Samples08.csv
Done testing Samples09.csv
Done testing Samples10.csv


## Stratify PANORAMA 5-fold

In [16]:
kfoldsplitter = KFold(n_splits=5)

#Load all samples and labels for 5-fold CV
# Collect the per-file frames in lists and concatenate once at the end:
# DataFrame.append inside a loop re-copies the growing frame on every
# iteration and no longer exists in pandas >= 2.0.
sampleframes = []
labelframes = []
for labelname, samplename in zip(labelfiles, samplefiles):
    sampleframes.append(pd.read_csv("../Data/PANORAMA/Samples/" + samplename, header=None))
    labelframes.append(pd.read_csv("../Data/PANORAMA/Labels/" + labelname, header=None))

# Like the former append calls, concat keeps each file's own row index; the
# cells below select rows positionally with .iloc, so duplicates are harmless.
samplesheap = pd.concat(sampleframes)
labelsheap = pd.concat(labelframes)
del sampleframes, labelframes

# Every cell needs exactly one label, otherwise positional splits misalign.
assert len(samplesheap) == len(labelsheap), "sample/label row counts differ"



The number of predictions/true labels matches the original number of instances

In [17]:


batch = 1
for train, test in kfoldsplitter.split(samplesheap):
    # Slice this fold's rows positionally out of the pooled frames.
    traindata = samplesheap.iloc[train].values
    trainlabels = labelsheap.iloc[train].values.ravel()
    testdata = samplesheap.iloc[test].values

    # fit() returns the estimator itself, so construction and training chain.
    classifier = LinearDiscriminantAnalysis().fit(traindata, trainlabels)
    prediction = classifier.predict(testdata)

    # Report how many rows were predicted in this fold.
    print(len(prediction))

    # One predicted label per line, in the order of the fold's test rows.
    outpath = '../Results/Predictions/PANORAMA LDA/CV-cells/predict_batch_' + str(batch) +'.csv'
    with open(outpath, 'w') as f:
        f.writelines("%s\n" % item for item in prediction)
    print("Done testing %s" % str(batch))
    batch += 1

102878
Done testing 1
102877
Done testing 2
102877
Done testing 3
102877
Done testing 4
102877
Done testing 5


In [18]:
#remove enormous datasets from memory
# Show the pooled row count one last time, then drop both frames so the
# kernel can release them.
print(samplesheap.shape[0])

del labelsheap, samplesheap

514386
