<a href="https://colab.research.google.com/github/moqri/deep_cell/blob/master/starter_code/intestin_cells.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Download the data Atlas of human intestin data from 
# https://singlecell.broadinstitute.org/single_cell/study/SCP44
# and put in the /data folder
# in particular, you need the 
# 1. raw_expression_matrix.txt (for input)
# 2. metadata.txt (for labels)

In [11]:
# if you already have a compressed copy of the data (single_cell_portal_intestine.tar.gz) on Google Drive:
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [12]:
compressed_file_path='/gdrive/My\ Drive/pr/cs230/data/single_cell_portal_intestin.tar.gz'
!mkdir data
!tar -xf {compressed_file_path} --directory /content/data/

mkdir: cannot create directory ‘data’: File exists


In [0]:
data_path='/content/data/'
folder='single_cell_portal_intestin/'
expression_file='atlas_Log2Tpm_round2.txt'
cell_file='atlas_metadata.txt'

In [0]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn import svm
import itertools

In [15]:
expression_df=pd.read_table(data_path+folder+expression_file,index_col=0).transpose()
expression_df.index.rename('Cell_ID',inplace=1)
expression_df.shape

(7216, 15971)

In [16]:
expression_df.head()

GENE,Mrpl15,Lypla1,Gm37988,Tcea1,Atp6v1h,Rb1cc1,4732440D04Rik,Fam150a,St18,Pcmtd1,Gm26901,Gm30414,Rrs1,Adhfe1,Mybl1,Vcpip1,1700034P13Rik,Sgk3,Mcmdc2,Snhg6,Tcf24,Cops5,Cspp1,Arfgef1,Slco5a1,Prdm14,Ncoa2,Tram1,Lactb2,Xkr9,Trpa1,Kcnb2,Terf1,Sbspon,Rpl7,Rdh10,Stau2,Ube2w,Tceb1,Tmem70,...,4930552P12Rik,Tcf7l2,Habp2,Nrap,Gm17197,Casp7,Plekhs1,Dclre1a,Nhlrc2,Adrb1,Ccdc186,Tdrd1,Vwa2,Afap1l2,Ablim1,B230217O12Rik,Fam160b1,Trub1,Atrnl1,Gfra1,Pnliprp1,Pnliprp2,Hspa12a,Eno4,Shtn1,Slc18a2,Pdzd8,Rps12-ps3,Rab11fip2,Fam204a,Cacul1,Nanos1,Eif3a,Fam45a,Sfxn4,Prdx3,Grk5,Zfp950,Gm7102,Csf2ra
Cell_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
AAACATTGTTTGGG_M1,2.1,2.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.4,0.0,0.0,0.0,0.0,0.0,3.64,0.0,0.0,0.0,1.4,1.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.4,0.0,0.0,0.0,0.0,2.56,0.0,0.0,0.0,0.0
AAACCGTGCCAAGT_M1,2.13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.41,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAACGCACAGCCTA_M1,1.81,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.77,0.0,0.0,0.0,1.17,1.81,...,0.0,0.0,1.17,0.0,0.0,1.17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.17,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAACGCACCAGATC_M1,1.71,1.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.71,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.71,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.42,0.0,0.0,0.0,1.71,0.0,...,0.0,0.0,0.0,0.0,0.0,2.14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.1,0.0,0.0,0.0,3.76,0.0,0.0,0.0,0.0,1.1,0.0,0.0
AAACGCACCGCCTT_M1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.3,0.0,0.0,0.0,0.0,0.0,4.61,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.3,0.0,0.0,1.97,0.0,0.0,1.3,0.0,0.0,0.0,0.0


In [17]:
cell_df=pd.read_table(data_path+folder+cell_file,skiprows=1,index_col=0)
cell_df.index.rename('Cell_ID',inplace=1)
cell_df=cell_df[['group.1']]
cell_df.columns=['Cell_Type']
cell_df.shape

(28723, 1)

In [18]:
cell_df.head()

Unnamed: 0_level_0,Cell_Type
Cell_ID,Unnamed: 1_level_1
AAACATTGTTTGGG_M1,Enterocyte.Immature.Distal
AAACCGTGCCAAGT_M1,Tuft
AAACGCACAGCCTA_M1,Goblet
AAACGCACCAGATC_M1,Enteroendocrine
AAACGCACCGCCTT_M1,Stem


In [19]:
cell_df['Cell_Type'].value_counts()

Stem                            6803
TA                              4164
Goblet                          2827
Enterocyte                      2499
Enterocyte.Progenitor           2178
TA.Early                        1970
EP                              1957
Tuft                             840
Enterocyte.Progenitor.Early      829
Paneth                           789
Enteroendocrine                  634
Enterocyte.Mature.Proximal       581
Enterocyte.Immature.Distal       512
TA.G2                            410
TA.G1                            408
Enterocyte.Progenitor.Late       404
Endocrine                        380
Enterocyte.Immature.Proximal     297
Enterocyte.Mature.Distal         241
Name: Cell_Type, dtype: int64

In [0]:
shared=set(cell_df.index)&set(expression_df.index)
expression_df=expression_df.loc[shared]
cell_df=cell_df.loc[shared]

In [0]:
cell_df['Cell_Type'] = pd.Categorical(cell_df['Cell_Type'])
cell_df['Cell_Type'] = cell_df['Cell_Type'].cat.codes

In [0]:
#expression_df_norm=(expression_df+1).transform(np.log)
expression_df_norm=np.round(expression_df,3)
selector = SelectKBest(chi2, k=1000)

In [23]:
selector.fit(expression_df, cell_df)
cols = selector.get_support(indices=True)
# Create new dataframe with only desired columns, or overwrite existing
expression_top_df = expression_df.iloc[:,cols]
expression_top_df.head()

GENE,St18,Trpa1,Ptpn18,Fhl2,Myo1b,Hspd1,Hspe1,Spats2l,Eef1b2,Cps1,Vil1,Cdk5r2,Fev,Cryba2,Ptprn,Resp18,Scg2,Itm2c,4933407L21Rik,Ncl,Ptma,Alpi,Inpp5d,Ramp1,Hes6,Ano7,Dtymk,Gal3st2,Pam,Tmem37,Lct,Mcm6,Nucks1,Btg2,Chil1,Ube2t,Rgs2,Rgs13,Pla2g4a,Glul,...,Hsbp1l1,Cndp2,Cyb5a,2010003K11Rik,Cst6,Slc25a45,Cdc42ep2,Naaladl1,Ehd1,Gm14964,Prdx5,Slc3a2,Eef1g,Ahnak,Incenp,Syt7,Tkfc,Ms4a10,Ms4a18,AW112010,Ms4a8a,Rfk,Aldh1a1,Gda,Asah2,Papss2,Ifit1bl2,Kif20b,Kif11,Rbp4,Hells,Cyp2c55,Cyp2c65,Cyp2c66,Cyp2c29,Cyp2c68,Sfrp5,Abcc2,Xpnpep1,Acsl5
Cell_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
GATACTCTTAGCCA_M3,0.0,0.0,0.0,0.0,0.0,1.64,2.88,0.0,3.25,0.0,1.64,0.0,0.0,0.0,0.0,0.0,0.0,1.64,0.0,0.0,5.13,0.0,0.0,0.0,1.64,0.0,1.64,0.0,0.0,0.0,0.0,1.64,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.64,...,0.0,0.0,2.39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.39,4.48,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.64,2.39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.64,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TGCAACGATGGAAA_M6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.97,0.0,4.17,0.0,0.0,0.0,0.0,0.0,0.0,2.14,0.0,0.0,1.71,1.1,0.0,0.0,0.0,0.0,1.1,0.0,0.0,2.14,1.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3.87,3.49,0.0,2.47,3.16,1.71,0.0,0.0,0.0,2.14,2.14,1.71,0.0,0.0,0.0,2.47,2.97,1.71,1.1,3.49,3.87,0.0,1.1,0.0,1.1,1.71,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.47,2.47
CACTCCGATGGTCA_M1,0.0,0.0,0.0,0.0,0.0,2.67,3.55,0.0,2.67,2.67,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.67,3.55,0.0,0.0,0.0,0.0,0.0,2.67,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3.55,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.67,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.67,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AGCTCGCTACGGTT_M3,0.0,0.0,0.0,1.27,0.0,0.0,1.27,0.0,1.94,0.0,1.94,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.27,0.0,0.0,0.0,0.0,0.0,0.0,1.27,0.0,0.0,1.27,1.27,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3.45,2.39,0.0,2.74,0.0,0.0,2.74,0.0,0.0,1.94,1.94,2.74,0.0,1.27,0.0,0.0,1.27,3.01,0.0,2.74,1.27,0.0,1.27,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.27,0.0,0.0,1.94,0.0,0.0,3.25,3.01
TCAAGGACGCGAGA_M5,0.0,0.0,1.69,0.0,0.0,1.69,4.05,0.0,4.15,2.72,1.08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.45,4.67,0.0,0.0,0.0,0.0,0.0,3.73,0.0,0.0,0.0,0.0,1.69,1.08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.15,1.08,0.0,0.0,0.0,1.08,0.0,1.08,0.0,1.08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [0]:
expression_array_top=expression_top_df.values
cell_array=cell_df.values
expression_array_normalized=np.round(expression_array_top) 

In [25]:
np.random.seed(1)
X_train, X_test, y_train, y_test = train_test_split(
    expression_array_normalized, cell_array, test_size=0.33, random_state=1)
clf = svm.SVC(decision_function_shape='ovo')
y_predict=clf.fit(X_train, y_train)
y_predict=clf.predict(X_test)
accuracy_score(y_test, y_predict)

  y = column_or_1d(y, warn=True)


0.7518891687657431

In [26]:
predictions=pd.DataFrame(data={'label':list(itertools.chain(*y_test)),'predicted':list(y_predict)})
predictions.head()

Unnamed: 0,label,predicted
0,10,10
1,5,5
2,5,5
3,10,10
4,5,5


In [27]:
%tensorflow_version 2.x
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf

TensorFlow 2.x selected.


In [0]:
tf.random.set_seed(1)

In [29]:
dataset=tf.data.Dataset.from_tensor_slices((expression_array_normalized,cell_array ))
dataset

<TensorSliceDataset shapes: ((1000,), (1,)), types: (tf.float64, tf.int8)>

In [30]:
for feat, targ in dataset.take(1):
  print ('Features: {}, Target: {}'.format(feat, targ))

Features: [0. 0. 0. 0. 0. 2. 3. 0. 3. 0. 2. 0. 0. 0. 0. 0. 0. 2. 0. 0. 5. 0. 0. 0.
 2. 0. 2. 0. 0. 0. 0. 2. 0. 0. 0. 0. 0. 0. 0. 2. 2. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 2. 2. 0. 0. 2. 0. 0. 2. 0. 0. 2. 0. 2. 0. 0. 0. 0. 2. 0. 0. 2.
 2. 2. 0. 0. 0. 0. 0. 0. 2. 2. 2. 0. 2. 0. 0. 0. 2. 0. 0. 0. 0. 4. 0. 0.
 0. 0. 3. 0. 0. 2. 0. 3. 0. 0. 0. 2. 0. 2. 0. 2. 0. 0. 2. 0. 0. 0. 0. 0.
 3. 0. 2. 2. 0. 0. 0. 3. 0. 0. 0. 0. 2. 0. 2. 2. 0. 0. 0. 0. 0. 0. 0. 4.
 0. 0. 0. 0. 2. 0. 3. 0. 0. 2. 0. 0. 2. 0. 0. 0. 0. 3. 2. 0. 0. 0. 0. 0.
 2. 0. 0. 0. 4. 2. 0. 2. 3. 2. 2. 2. 0. 0. 2. 2. 0. 0. 0. 0. 2. 0. 0. 0.
 0. 0. 0. 2. 2. 0. 4. 2. 0. 2. 0. 0. 0. 0. 0. 4. 0. 0. 0. 2. 0. 0. 0. 0.
 5. 0. 2. 0. 8. 0. 2. 2. 0. 0. 2. 0. 2. 0. 2. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 2. 0. 5. 0. 6. 5. 0. 0. 0. 2. 0. 0. 3. 2. 2. 0. 0. 0. 2. 0. 0. 0. 0.
 0. 0. 2. 0. 0. 0. 2. 0. 2. 2. 0. 0. 0. 0. 0. 0. 0. 0. 0. 2. 3. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 6. 0. 0. 0. 0. 0. 2. 0. 0. 2. 0. 0. 0.
 0. 0. 0. 3. 0. 0. 0. 0. 3. 0. 0. 0. 0. 0

In [0]:
cell_type_count=len(cell_df['Cell_Type'].value_counts())

In [0]:
def get_compiled_model():
  model = tf.keras.Sequential([
    tf.keras.layers.Dense(100, activation='relu'),
    tf.keras.layers.Dense(cell_type_count, activation='softmax')
  ])

  model.compile(optimizer='adam',
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy'])
  return model

In [45]:
model = get_compiled_model()
model.fit(X_train, y_train, epochs=10)
model.evaluate(X_test,  y_test, verbose=2)

Train on 4834 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
2382/2382 - 0s - loss: 0.9273 - accuracy: 0.7326


[0.92727497838106, 0.7325777]