<a href="https://colab.research.google.com/github/moqri/deep_cell/blob/master/starter_code/blood_cells_machine_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Download the data Atlas of human blood data from 
# https://singlecell.broadinstitute.org/single_cell/study/SCP43/atlas-of-human-blood-dendritic-cells-and-monocytes
# and put in the /data folder
# in particular, you need the 
# 1. raw_expression_matrix.txt (for input)
# 2. metadata.txt (for labels)

In [0]:
# if you already have a compressed copy of the data (single_cell_portal_blood.tar.gz):
 !mkdir data
 !tar -xf /content/single_cell_portal_blood.tar.gz --directory /content/data/

In [0]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn import svm
import itertools


In [0]:
data_path='/content/data/'
folder='single_cell_portal_blood/'
expression_file='raw_expression_matrix.txt'
cell_file='metadata.txt'
expression_df=pd.read_table(data_path+folder+expression_file,index_col=0).transpose()
cell_df=pd.read_table(data_path+folder+cell_file,skiprows=1,index_col=0)

In [5]:
expression_df.head()

Gene.ID,1/2-SBSRNA4,5S_RRNA,5_8S_RRNA,7SK,A1BG,A1BG-AS1,A1CF,A2LD1,A2M,A2ML1,A2MP1,A4GALT,A4GNT,AA06,AAA1,AAAS,AACS,AACSP1,AADAC,AADACL2,AADACL3,AADACL4,AADAT,AAGAB,AAK1,AAMP,AANAT,AARS,AARS2,AARSD1,AASDH,AASDHPPT,AASS,AATF,AATK,AATK-AS1,AB059369,AB062081,AB062083,AB074162,...,ZRANB2,ZRANB2-AS1,ZRANB3,ZRSR1,ZRSR2,ZSCAN1,ZSCAN10,ZSCAN12,ZSCAN12P1,ZSCAN16,ZSCAN18,ZSCAN2,ZSCAN20,ZSCAN21,ZSCAN22,ZSCAN23,ZSCAN29,ZSCAN30,ZSCAN4,ZSCAN5A,ZSCAN5B,ZSWIM1,ZSWIM2,ZSWIM3,ZSWIM4,ZSWIM5,ZSWIM6,ZSWIM7,ZUFSP,ZW10,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
CD141_P10_S73_rsem,0.0,0.0,0.0,101.04,13.45,0.0,0.21,0.0,0.0,0.6,0.0,23.31,0.0,0.0,0.0,103.19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.53,133.73,0.0,0.0,0.0,103.18,0.0,0.0,0.0,94.7,0.0,0.0,0.0,10.45,0.0,0.0,...,0.0,0.0,0.0,0.0,3.54,0.0,0.0,0.0,0.0,0.0,1.28,0.0,0.0,0.0,0.0,1.29,1.42,0.0,0.0,0.0,0.0,3.37,0.0,0.0,0.0,0.0,0.0,35.72,0.0,0.0,0.0,0.0,0.0,0.0,20.58,0.0,1.34,72.15,30.18,0.0
CD141_P10_S74_rsem,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.96,0.89,148.31,0.0,156.6,0.0,2.8,0.0,0.0,0.0,2.31,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,10.15,0.0,0.0,0.0,0.0,0.0,2.07,3.78,0.0,0.0,1.21,0.0,2.92,0.0,0.0,0.0,0.0,2.24,0.0,0.0,0.0,0.0,0.86,0.0,0.0,204.13,0.0,0.0,0.0,0.0,61.17,3.54,3.08,0.0,5.4,0.0
CD141_P10_S75_rsem,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.73,0.0,0.0,0.0,130.95,0.0,1.31,0.0,0.0,0.0,0.0,0.0,0.0,1.08,0.0,0.0,308.15,0.0,0.0,22.92,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,26.44,0.0,0.0,0.0,0.0,0.0,0.0,1.85,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,149.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,43.33,1.61,0.0,0.0,0.0,0.0,0.0,52.43,0.23,0.87,1.43,54.6,50.47,0.0
CD141_P10_S76_rsem,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.04,3.71,0.0,0.0,0.0,0.0,0.0,0.0,46.79,10.98,110.61,0.0,0.0,2.17,174.4,0.0,25.82,0.0,86.93,0.0,0.0,0.0,0.0,0.0,0.0,...,3.77,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.45,0.0,0.0,0.0,0.0,0.0,94.12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.82,1.93,46.58,0.0,0.0
CD141_P10_S77_rsem,2.35,0.0,0.0,93.63,0.0,0.0,0.0,0.0,0.0,0.77,0.0,0.0,0.0,0.0,0.0,0.0,143.85,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.07,0.0,0.0,0.0,1.23,2.54,0.0,0.0,0.72,0.0,0.0,0.0,0.0,8.99,0.0,0.0,...,188.57,0.0,0.0,0.0,0.0,0.0,0.0,0.46,0.0,0.0,0.35,0.84,0.0,0.0,1.71,1.2,1.88,0.0,0.0,0.0,0.0,2.3,0.0,0.0,0.0,0.0,0.0,0.2,26.03,0.0,0.0,0.0,0.0,0.0,0.0,2.03,2.42,17.59,0.0,0.0


In [6]:
cell_df.head()

Unnamed: 0_level_0,group
TYPE,Unnamed: 1_level_1
CD141_P10_S73,DC1
CD141_P10_S74,DC5
CD141_P10_S75,DC1
CD141_P10_S76,DC1
CD141_P10_S77,DC1


In [7]:
cell_df['group'].value_counts()

DC4      173
DC6      173
DC1      165
Mono1    163
Mono2    122
DC3      107
DC2       94
Mono3     31
DC5       30
Mono4     20
Name: group, dtype: int64

In [0]:
expression_df.index=expression_df.index.str[:-5]
expression_df=expression_df.loc[cell_df.index]
cell_df['group'] = pd.Categorical(cell_df['group'])
cell_df['group'] = cell_df['group'].cat.codes

In [0]:
expression_array=expression_df.values
expression_array=expression_array[:,~np.all(expression_array == 0, axis=0)] # remove all zero columns
expression_array_log=np.log(expression_array+1)
cell_array=cell_df.values

expression_array_top=SelectKBest(chi2, k=20).fit_transform(expression_array_log, cell_array)
expression_array_normalized=np.round(expression_array_top) #(expression_array_top / np.linalg.norm(expression_array_top),3)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(
    expression_array_normalized, cell_array, test_size=0.33, random_state=1)
clf = svm.SVC(decision_function_shape='ovo')
y_predict=clf.fit(X_train, y_train)
y_predict=clf.predict(X_test)
accuracy_score(y_test, y_predict)


  y = column_or_1d(y, warn=True)


0.8342696629213483

In [11]:
predictions=pd.DataFrame(data={'label':list(itertools.chain(*y_test)),'predicted':list(y_predict)})
predictions.head()

Unnamed: 0,label,predicted
0,3,3
1,1,1
2,3,3
3,0,0
4,3,3


In [12]:
%tensorflow_version 2.x
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf

TensorFlow 2.x selected.


In [13]:
dataset=tf.data.Dataset.from_tensor_slices((expression_array_normalized,cell_array ))
dataset

<TensorSliceDataset shapes: ((20,), (1,)), types: (tf.float64, tf.int8)>

In [14]:
for feat, targ in dataset.take(5):
  print ('Features: {}, Target: {}'.format(feat, targ))

Features: [0. 0. 7. 5. 0. 6. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.], Target: [0]
Features: [5. 6. 0. 0. 0. 1. 5. 0. 0. 0. 0. 0. 6. 2. 0. 8. 0. 0. 0. 4.], Target: [4]
Features: [0. 0. 7. 5. 0. 6. 2. 0. 0. 0. 0. 0. 0. 0. 0. 1. 2. 0. 0. 1.], Target: [0]
Features: [0. 0. 8. 6. 0. 7. 0. 0. 0. 0. 0. 0. 0. 0. 0. 3. 0. 0. 0. 0.], Target: [0]
Features: [0. 0. 7. 0. 0. 7. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.], Target: [0]


In [0]:
train_dataset = dataset.shuffle(len(cell_df)).batch(1)

In [0]:
def get_compiled_model():
  model = tf.keras.Sequential([
    tf.keras.layers.Dense(5, activation='relu'),
    tf.keras.layers.Dense(10, activation='softmax')
  ])

  model.compile(optimizer='adam',
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy'])
  return model

In [18]:
model = get_compiled_model()
model.fit(train_dataset, epochs=10)
model.evaluate(x_test,  y_test, verbose=2)

Train for 1078 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


NameError: ignored