<a href="https://colab.research.google.com/github/moqri/deep_cell/blob/master/starter_code/intestin_cells.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Download the data PBMC cell data from 
# https://singlecell.broadinstitute.org/single_cell/study/SCP424
# and put in the /data folder
# in particular, you need the 
# 1. raw_expression_matrix.txt (for input)
# 2. metadata.txt (for labels)

In [3]:
# if you already have a compressed copy of the data (single_cell_portal_intestine.tar.gz) on Google Drive:
from google.colab import drive
drive.mount('/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /gdrive


In [0]:
compressed_file_path='/gdrive/My\ Drive/pr/data/single_cell_portal_pbmc.tar.gz'
!mkdir data
!tar -xf {compressed_file_path} --directory /content/data/

In [0]:
data_path='/content/data/'
folder='single_cell_portal_pbmc/'
expression_file='counts.read.txt'
cell_file='cells.read.txt'

In [0]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn import svm
import itertools
import scipy

In [7]:
expression_df=pd.read_table(data_path+folder+expression_file,delimiter=' ',skiprows=1)
expression_df.shape

(37755755, 3)

In [8]:
d=expression_df.head()
d

Unnamed: 0,33694,45017,37755755
0,6,1,13
1,32,1,20
2,39,1,5
3,42,1,3
4,45,1,74


In [0]:
s=scipy.sparse.coo_matrix((d.iloc[:,2], (d.iloc[:,0], d.iloc[:,1])))
n=s.toarray()

In [10]:
cell_df=pd.read_table(data_path+folder+cell_file,skiprows=1,index_col=0)
cell_df.index.rename('Cell_ID',inplace=1)
cell_df=cell_df[['group.1']]
cell_df.columns=['Cell_Type']
cell_df.shape

KeyError: ignored

In [0]:
cell_df.head()

In [0]:
cell_df['Cell_Type'].value_counts()

In [0]:
shared=set(cell_df.index)&set(expression_df.index)
expression_df=expression_df.loc[shared]
cell_df=cell_df.loc[shared]

In [0]:
cell_df['Cell_Type'] = pd.Categorical(cell_df['Cell_Type'])
cell_df['Cell_Type'] = cell_df['Cell_Type'].cat.codes

In [0]:
#expression_df_norm=(expression_df+1).transform(np.log)
expression_df_norm=np.round(expression_df,3)
selector = SelectKBest(chi2, k=1000)

In [0]:
selector.fit(expression_df, cell_df)
cols = selector.get_support(indices=True)
# Create new dataframe with only desired columns, or overwrite existing
expression_top_df = expression_df.iloc[:,cols]
expression_top_df.head()

In [0]:
expression_array_top=expression_top_df.values
cell_array=cell_df.values
expression_array_normalized=np.round(expression_array_top) 

In [0]:
np.random.seed(1)
X_train, X_test, y_train, y_test = train_test_split(
    expression_array_normalized, cell_array, test_size=0.33, random_state=1)
clf = svm.SVC(decision_function_shape='ovo')
y_predict=clf.fit(X_train, y_train)
y_predict=clf.predict(X_test)
accuracy_score(y_test, y_predict)

In [0]:
predictions=pd.DataFrame(data={'label':list(itertools.chain(*y_test)),'predicted':list(y_predict)})
predictions.head()

In [0]:
%tensorflow_version 2.x
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf

In [0]:
tf.random.set_seed(1)

In [0]:
dataset=tf.data.Dataset.from_tensor_slices((expression_array_normalized,cell_array ))
dataset

In [0]:
for feat, targ in dataset.take(1):
  print ('Features: {}, Target: {}'.format(feat, targ))

In [0]:
cell_type_count=len(cell_df['Cell_Type'].value_counts())

In [0]:
def get_compiled_model():
  model = tf.keras.Sequential([
    tf.keras.layers.Dense(100, activation='relu'),
    tf.keras.layers.Dense(cell_type_count, activation='softmax')
  ])

  model.compile(optimizer='adam',
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy'])
  return model

In [0]:
model = get_compiled_model()
model.fit(X_train, y_train, epochs=10)
model.evaluate(X_test,  y_test, verbose=2)