In [1]:
import os, sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scanpy as sc
import torch

sys.path.append("./Tangram") 
import tangram as tg

## Convert spatial data

In [2]:
# Slide-seq bead coordinates: read the CSV, using its first column as the row index
path = os.path.join('data', 'puck_coords.txt')
sp_coords = pd.read_csv(path, index_col=0)
# Bare last expression so the notebook renders the table
sp_coords

Unnamed: 0_level_0,x,y
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1
TTCTCCAAGGCCG,1478.453901,1810.524823
TACTAAAGAATTA,2224.000000,1230.397849
TCTCTTAGTTGGC,1914.742424,2221.954545
GCTTTTCGTTCCC,1682.151515,3886.575758
TACGGGCGAAAAG,2416.261194,4216.902985
...,...,...
CTACCGTGCGGCG,5176.223776,3567.398601
CCGATATGCGGCG,2222.324324,3906.189189
TTGGTATCGCCGC,3187.536082,2031.268041
TTCTTATCGCCGC,3686.817460,3833.047619


In [3]:
# Slide-seq counts: read the CSV, using its first column as the row index;
# the remaining columns are named after genes
path = os.path.join('data', 'puck_counts.txt')
sp_counts = pd.read_csv(path, index_col=0)
# Bare last expression so the notebook renders the table
sp_counts

Unnamed: 0_level_0,0610007P14Rik,0610009B22Rik,0610009E02Rik,0610009L18Rik,0610009O20Rik,0610010F05Rik,0610010K14Rik,0610011F06Rik,0610030E20Rik,0610037L13Rik,...,mt-Tl1,mt-Tl2,mt-Tm,mt-Tp,mt-Tq,mt-Tr,mt-Ts2,mt-Tt,mt-Tv,n-R5-8s1
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TTCTCCAAGGCCG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TACTAAAGAATTA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCTCTTAGTTGGC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GCTTTTCGTTCCC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TACGGGCGAAAAG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CTACCGTGCGGCG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CCGATATGCGGCG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TTGGTATCGCCGC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TTCTTATCGCCGC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# The coordinate and count tables must list the same barcodes in the same
# order; Index.equals compares the two indices element by element.
assert sp_coords.index.equals(sp_counts.index), (
    "sp_coords and sp_counts do not share the same barcode index (or its order)"
)

In [6]:
# Build the spatial AnnData from the count table, then attach each
# coordinate column to the per-barcode annotations
ad_sp = sc.AnnData(sp_counts)
for coord in ('x', 'y'):
    ad_sp.obs[coord] = sp_coords[coord]
ad_sp

AnnData object with n_obs × n_vars = 27261 × 17919
    obs: 'x', 'y'

In [7]:
# Inspect the per-barcode annotations (the x / y columns assigned above)
ad_sp.obs

Unnamed: 0_level_0,x,y
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1
TTCTCCAAGGCCG,1478.453901,1810.524823
TACTAAAGAATTA,2224.000000,1230.397849
TCTCTTAGTTGGC,1914.742424,2221.954545
GCTTTTCGTTCCC,1682.151515,3886.575758
TACGGGCGAAAAG,2416.261194,4216.902985
...,...,...
CTACCGTGCGGCG,5176.223776,3567.398601
CCGATATGCGGCG,2222.324324,3906.189189
TTGGTATCGCCGC,3187.536082,2031.268041
TTCTTATCGCCGC,3686.817460,3833.047619


In [8]:
# Inspect the per-gene annotations; no columns were added, so only the index is shown
ad_sp.var

0610007P14Rik
0610009B22Rik
0610009E02Rik
0610009L18Rik
0610009O20Rik
...
mt-Tr
mt-Ts2
mt-Tt
mt-Tv
n-R5-8s1


In [9]:
# Peek at the expression matrix stored in the spatial AnnData
ad_sp.X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [10]:
# Optional: persist the spatial AnnData to disk (left disabled; uncomment to write)
# ad_sp.write_h5ad('data/corrected_ad_sp.h5ad')

## Convert single-cell data

In [2]:
# Single-cell metadata (cell-type annotation): read the CSV, using its first
# column as the row index
path = os.path.join('data', 'puck_cell_cluster.txt')
# Disabled alternative that would read cluster/subcluster as categoricals.
# NOTE(review): with that dtype the cluster labels would presumably stop
# matching the integer keys of `celltype_dict` used for renaming — confirm
# before re-enabling.
# sc_meta = pd.read_csv(path, 
#                       index_col=0, 
#                       dtype={'cluster':'category', 'subcluster':'category'} 
#                      )
sc_meta = pd.read_csv(path, index_col=0)
sc_meta

Unnamed: 0,cluster,subcluster,reason
P60CBRep1P1_GACTCTACACCC,4,4-1,
P60CBRep1P1_TATTATATCTAA,2,2,curation
P60CBRep1P1_GCGTCGCCAGTT,10,10-3,
P60CBRep1P1_CTATCGATTTCN,2,2,curation
P60CBRep1P1_GTACCTGAGCCT,4,4-3,
...,...,...,...
P60CBRep3P2_TATCTTGCTCTG,1,1,small_cell
P60CBRep3P2_ATTCAAGAAGGC,1,1,small_cell
P60CBRep3P2_GTGGCATTTCCA,1,1,small_cell
P60CBRep3P2_GCGCGACACTGC,1,1,small_cell


In [6]:
# Cluster id -> cell-type name for the integer ids found in `sc_meta.cluster`.
# NOTE(review): "Muraland Tip" looks like it may be missing a separator —
# confirm against the dataset's cluster legend before changing the label.
celltype_dict = {
    1: "Granular",
    2: "Purkinje",
    3: "PV Interneurons",
    4: "Other Interneurons",
    5: "Microglia",
    6: "Oligo- Poly-dendrocyte",
    7: "Bergmann Glia",
    8: "Astrocytes",
    9: "Choroid Plexus",
    10: "Endothelial Stalk",
    11: "Muraland Tip",
}

# Swap the numeric ids in the `cluster` column for their names. `replace`
# returns a new frame, so `sc_meta` itself keeps the original ids.
cluster_renaming = {"cluster": celltype_dict}
sc_meta_named = sc_meta.replace(to_replace=cluster_renaming)
sc_meta_named

Unnamed: 0,cluster,subcluster,reason
P60CBRep1P1_GACTCTACACCC,Other Interneurons,4-1,
P60CBRep1P1_TATTATATCTAA,Purkinje,2,curation
P60CBRep1P1_GCGTCGCCAGTT,Endothelial Stalk,10-3,
P60CBRep1P1_CTATCGATTTCN,Purkinje,2,curation
P60CBRep1P1_GTACCTGAGCCT,Other Interneurons,4-3,
...,...,...,...
P60CBRep3P2_TATCTTGCTCTG,Granular,1,small_cell
P60CBRep3P2_ATTCAAGAAGGC,Granular,1,small_cell
P60CBRep3P2_GTGGCATTTCCA,Granular,1,small_cell
P60CBRep3P2_GCGCGACACTGC,Granular,1,small_cell


In [7]:
# Single-cell counts: read the CSV, using its first column as the row index.
# The path is built with os.path.join like the other loaders in this notebook
# instead of a hard-coded 'data/...' string.
sc_counts = pd.read_csv(os.path.join('data', 'puck_dge.txt'), index_col=0)
sc_counts

Unnamed: 0,0610010F05Rik,0610030E20Rik,1110004E09Rik,1110019D14Rik,1110037F02Rik,1110046J04Rik,1110059G10Rik,1190003K10Rik,1200014J11Rik,1300002E11Rik,...,Zmynd11,Zmynd8,Znhit6,Znrd1as,Zranb1,Zranb2,Zrsr1,Zrsr2,Zyg11b,Zzef1
P60CBRep1P1_GACTCTACACCC,1,0,3,0,0,0,0,0,0,0,...,1,0,0,0,0,2,1,1,2,0
P60CBRep1P1_TATTATATCTAA,1,1,0,0,0,0,2,0,2,0,...,2,1,0,0,6,7,4,0,0,3
P60CBRep1P1_GCGTCGCCAGTT,0,0,2,0,1,0,2,0,2,1,...,3,0,0,0,0,1,0,3,2,0
P60CBRep1P1_CTATCGATTTCN,0,0,1,1,1,0,1,0,0,0,...,3,1,0,0,0,6,1,0,0,1
P60CBRep1P1_GTACCTGAGCCT,0,0,3,0,0,1,0,0,1,0,...,2,0,1,0,0,6,2,2,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P60CBRep3P2_TATCTTGCTCTG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
P60CBRep3P2_ATTCAAGAAGGC,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
P60CBRep3P2_GTGGCATTTCCA,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
P60CBRep3P2_GCGCGACACTGC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [8]:
# The metadata and count tables must list the same cells in the same order;
# Index.equals compares the two indices element by element.
assert sc_meta.index.equals(sc_counts.index), (
    "sc_meta and sc_counts do not share the same cell index (or its order)"
)

In [9]:
# Build the single-cell AnnData from the count table, then copy the named
# cluster labels and the subcluster labels into the per-cell annotations
ad_sc = sc.AnnData(sc_counts)
for label in ('cluster', 'subcluster'):
    ad_sc.obs[label] = sc_meta_named[label]
ad_sc

AnnData object with n_obs × n_vars = 26139 × 2505
    obs: 'cluster', 'subcluster'

In [10]:
# Inspect the per-cell annotations (the cluster / subcluster columns assigned above)
ad_sc.obs

Unnamed: 0,cluster,subcluster
P60CBRep1P1_GACTCTACACCC,Other Interneurons,4-1
P60CBRep1P1_TATTATATCTAA,Purkinje,2
P60CBRep1P1_GCGTCGCCAGTT,Endothelial Stalk,10-3
P60CBRep1P1_CTATCGATTTCN,Purkinje,2
P60CBRep1P1_GTACCTGAGCCT,Other Interneurons,4-3
...,...,...
P60CBRep3P2_TATCTTGCTCTG,Granular,1
P60CBRep3P2_ATTCAAGAAGGC,Granular,1
P60CBRep3P2_GTGGCATTTCCA,Granular,1
P60CBRep3P2_GCGCGACACTGC,Granular,1


In [17]:
# Inspect the per-gene annotations; no columns were added, so only the index is shown
ad_sc.var

0610010F05Rik
0610030E20Rik
1110004E09Rik
1110019D14Rik
1110037F02Rik
...
Zranb2
Zrsr1
Zrsr2
Zyg11b
Zzef1


In [18]:
# Peek at the expression matrix stored in the single-cell AnnData
ad_sc.X

array([[1., 0., 3., ..., 1., 2., 0.],
       [1., 1., 0., ..., 0., 0., 3.],
       [0., 0., 2., ..., 3., 2., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [5]:
# Unused drafts of the cluster-id -> cell-type mapping, kept commented out.
# `metacell_dict` uses string keys; the second block repeats the
# integer-keyed `celltype_dict` that the renaming cell already defines.
# metacell_dict = {'1':"Granular",
#                 '2':"Purkinje",
#                 '3':"PV Interneurons",
#                 '4':"Other Interneurons",
#                 '5':"Microglia",
#                 '6':"Oligo- Poly-dendrocyte",
#                 '7':"Bergmann Glia",
#                 '8':"Astrocytes",
#                 '9':"Choroid Plexus",
#                 '10':"Endothelial Stalk",
#                 '11':"Muraland Tip"}

# celltype_dict = {1:"Granular",
#                 2:"Purkinje",
#                 3:"PV Interneurons",
#                 4:"Other Interneurons",
#                 5:"Microglia",
#                 6:"Oligo- Poly-dendrocyte",
#                 7:"Bergmann Glia",
#                 8:"Astrocytes",
#                 9:"Choroid Plexus",
#                 10:"Endothelial Stalk",
#                 11:"Muraland Tip"}

In [11]:
# Optional: persist the single-cell AnnData to disk (left disabled; uncomment to write)
# ad_sc.write_h5ad('data/corrected_ad_sc.h5ad')

  if is_string_dtype(df[key]) and not is_categorical(df[key])
... storing 'cluster' as categorical
... storing 'subcluster' as categorical
