# TISCH2에서 데이터 가져오기

- Welcome to TISCH2 BETA 190 datasets 6,297,320 cells

Tumor Immune Single-cell Hub 2 (TISCH2) is a scRNA-seq database focusing on tumor microenvironment (TME). TISCH2 provides detailed cell-type annotation at the single-cell level, enabling the exploration of TME across different cancer types.

접속 주소: http://tisch.comp-genomics.org/gallery/

# 필요한 파일

- NSCLC_GSE127471_expression.h5
- NSCLC_GSE127471_CellMetainfo_table.tsv


In [1]:
library(hdf5r)
library(tidyverse)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.2 ──
[32m✔[39m [34mggplot2[39m 3.4.1     [32m✔[39m [34mpurrr  [39m 1.0.1
[32m✔[39m [34mtibble [39m 3.1.8     [32m✔[39m [34mdplyr  [39m 1.1.0
[32m✔[39m [34mtidyr  [39m 1.3.0     [32m✔[39m [34mstringr[39m 1.5.0
[32m✔[39m [34mreadr  [39m 2.1.4     [32m✔[39m [34mforcats[39m 1.0.0
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m     masks [34mstats[39m::filter()
[31m✖[39m [34mpurrr[39m::[32mflatten_df()[39m masks [34mhdf5r[39m::flatten_df()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m        masks [34mstats[39m::lag()


In [2]:
h5_data <- hdf5r::H5File$new('../input/NSCLC_GSE127471_expression.h5', mode = 'r')
df_meta <- read_tsv("../input/NSCLC_GSE127471_CellMetainfo_table.tsv")

feature_matrix <- Matrix::sparseMatrix(
  i = h5_data[['matrix/indices']][],
  p = h5_data[['matrix/indptr']][],
  x = h5_data[['matrix/data']][],
  dimnames = list(
    h5_data[['matrix/features/name']][],
    h5_data[['matrix/barcodes']][]
  ),
  dims = h5_data[['matrix/shape']][],
  index1 = FALSE
)

# Specify rows to keep
keep_rows <- c("TIGIT","PVR","CD226","PVRL2","LILRB1", "LILRB2", "LILRB4", "HLA-G")         
m_subset <- feature_matrix[rownames(feature_matrix) %in% keep_rows, ] # Extract rows from matrix
# mat_subset

m_subset %>% as.matrix() %>%
  as_tibble(t(.), rownames="Cell") -> df_mat

# df_mat <- as.data.frame(as.matrix(m_subset))
# df_mat <- as_tibble(t(df_mat), rownames="Cell")
df <- df_meta %>% left_join(., df_mat)
head(df)

#write.csv(t(df), "../output/230220_GSE127471.csv")

[1mRows: [22m[34m1108[39m [1mColumns: [22m[34m11[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (8): Cell, Celltype (malignancy), Celltype (major-lineage), Celltype (mi...
[32mdbl[39m (3): UMAP_1, UMAP_2, Cluster

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1m[22mJoining with `by = join_by(Cell)`


Cell,UMAP_1,UMAP_2,Celltype (malignancy),Celltype (major-lineage),Celltype (minor-lineage),Cluster,Patient,Sample,Source,⋯,TAAGCGTTCGCCCTTA-1,TACTCGCAGGATCGCA-1,TCAGCTCGTCTGGAGA-1,TCTATTGAGCCCAATT-1,TGACGGCTCTAACCGA-1,TGAGGGAAGCCGATTT-1,TGCCCTAAGCTACCGC-1,TGGTTCCGTGTCCTCT-1,TTCGGTCGTCCGTGAC-1,TTGGAACCATAGGATA-1
<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
AACGTTGGTCATACTG-1,-1.411341,6.299751,Immune cells,B,B,7,P1,P1,PBMC,⋯,,,,,,,,,,
AAGACCTCACAGGCCT-1,-1.24077,6.12752,Immune cells,B,B,7,P1,P1,PBMC,⋯,,,,,,,,,,
ACCGTAAGTCTTGTCC-1,-1.207882,6.265944,Immune cells,B,B,7,P1,P1,PBMC,⋯,,,,,,,,,,
ACGCAGCGTGTGCCTG-1,-1.991852,6.321991,Immune cells,B,B,7,P1,P1,PBMC,⋯,,,,,,,,,,
ACGGAGACAAGCCTAT-1,-2.926991,6.180597,Immune cells,B,B,7,P1,P1,PBMC,⋯,,,,,,,,,,
ACGGCCATCACCGGGT-1,-2.652965,6.081234,Immune cells,B,B,7,P1,P1,PBMC,⋯,,,,,,,,,,


# Todo

함수로 만들어서 반복문을 하면 좋을 것 같다

In [None]:
#write.csv(t(df), "../output/230220_GSE127471.csv")

In [13]:
library(fs)
files <- dir_ls('../input/', glob='*.csv')


In [19]:
files[1]

# cell 마다 데이터베이스 다운로드 하게 만들기


In [4]:
file_name <- "NSCLC_GSE127471"
# Specify rows to keep
keep_rows <- c("TIGIT","PVR","CD226","PVRL2","LILRB1", "LILRB2", "LILRB4", "HLA-G")         

# load dataset
df_meta <- read_tsv(paste("../input/",file_name,"_CellMetainfo_table.tsv", sep=""))
h5_data <- hdf5r::H5File$new(paste("../input/",file_name,"_expression.h5", sep=""), mode = 'r')

feature_matrix <- Matrix::sparseMatrix(
  i = h5_data[['matrix/indices']][],
  p = h5_data[['matrix/indptr']][],
  x = h5_data[['matrix/data']][],
  dimnames = list(
    h5_data[['matrix/features/name']][],
    h5_data[['matrix/barcodes']][]
  ),
  dims = h5_data[['matrix/shape']][],
  index1 = FALSE
)
rm(h5_data) # for saving memory


m_subset <- feature_matrix[rownames(feature_matrix) %in% keep_rows, ] # Extract rows from matrix
rm(feature_matrix)

df_mat <- as.data.frame(as.matrix(m_subset))
df_mat <- as_tibble(t(df_mat), rownames="Cell")

df <- df_meta %>% left_join(., df_mat)
head(df)

# to save files
write.csv(df, paste("../output/",file_name,".csv", sep=""))

[1mRows: [22m[34m1108[39m [1mColumns: [22m[34m11[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (8): Cell, Celltype (malignancy), Celltype (major-lineage), Celltype (mi...
[32mdbl[39m (3): UMAP_1, UMAP_2, Cluster

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1m[22mJoining with `by = join_by(Cell)`


Cell,UMAP_1,UMAP_2,Celltype (malignancy),Celltype (major-lineage),Celltype (minor-lineage),Cluster,Patient,Sample,Source,Stage,TIGIT,HLA-G,CD226,PVRL2,LILRB2,LILRB1,LILRB4
<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
AACGTTGGTCATACTG-1,-1.411341,6.299751,Immune cells,B,B,7,P1,P1,PBMC,Primary,0,0,0,0,0,0.0,0
AAGACCTCACAGGCCT-1,-1.24077,6.12752,Immune cells,B,B,7,P1,P1,PBMC,Primary,0,0,0,0,0,0.0,0
ACCGTAAGTCTTGTCC-1,-1.207882,6.265944,Immune cells,B,B,7,P1,P1,PBMC,Primary,0,0,0,0,0,0.0,0
ACGCAGCGTGTGCCTG-1,-1.991852,6.321991,Immune cells,B,B,7,P1,P1,PBMC,Primary,0,0,0,0,0,0.0,0
ACGGAGACAAGCCTAT-1,-2.926991,6.180597,Immune cells,B,B,7,P1,P1,PBMC,Primary,0,0,0,0,0,1.062495,0
ACGGCCATCACCGGGT-1,-2.652965,6.081234,Immune cells,B,B,7,P1,P1,PBMC,Primary,0,0,0,0,0,0.0,0


위에서 복사한 셀을 아래에 붙여 넣기

In [5]:
file_name <- "SKCM_GSE120575_aPD1aCTLA4"
# Specify rows to keep
keep_rows <- c("TIGIT","PVR","CD226","PVRL2","LILRB1", "LILRB2", "LILRB4", "HLA-G")         

# load dataset
df_meta <- read_tsv(paste("../input/",file_name,"_CellMetainfo_table.tsv", sep=""))
h5_data <- hdf5r::H5File$new(paste("../input/",file_name,"_expression.h5", sep=""), mode = 'r')

feature_matrix <- Matrix::sparseMatrix(
  i = h5_data[['matrix/indices']][],
  p = h5_data[['matrix/indptr']][],
  x = h5_data[['matrix/data']][],
  dimnames = list(
    h5_data[['matrix/features/name']][],
    h5_data[['matrix/barcodes']][]
  ),
  dims = h5_data[['matrix/shape']][],
  index1 = FALSE
)
rm(h5_data) # for saving memory


m_subset <- feature_matrix[rownames(feature_matrix) %in% keep_rows, ] # Extract rows from matrix
rm(feature_matrix)

df_mat <- as.data.frame(as.matrix(m_subset))
df_mat <- as_tibble(t(df_mat), rownames="Cell")

df <- df_meta %>% left_join(., df_mat)
head(df)

# to save files
write.csv(df, paste("../output/",file_name,".csv", sep=""))

[1mRows: [22m[34m16291[39m [1mColumns: [22m[34m14[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (11): Cell, Celltype (malignancy), Celltype (major-lineage), Celltype (m...
[32mdbl[39m  (3): UMAP_1, UMAP_2, Cluster

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1m[22mJoining with `by = join_by(Cell)`


Cell,UMAP_1,UMAP_2,Celltype (malignancy),Celltype (major-lineage),Celltype (minor-lineage),Cluster,Response,Treatment,Patient,⋯,Gender,Stage,TimePoint,PVR,LILRB1,LILRB2,CD226,TIGIT,LILRB4,HLA-G
<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,⋯,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
B3_P5_M15,12.26936,2.67050714,Immune cells,B,B,3,Responder,anti-PD1,P1,⋯,Male,Metastatic,Post,0,1.347674,0,0,0,0,0.16553912
B6_P6_M15,12.90418,-0.12057095,Immune cells,B,B,3,Responder,anti-PD1,P1,⋯,Male,Metastatic,Post,0,0.550023,0,0,0,0,0.15967263
D7_P4_M15,12.39708,2.98989935,Immune cells,B,B,3,Responder,anti-PD1,P1,⋯,Male,Metastatic,Post,0,0.9109808,0,0,0,0,0.1629677
A12_P3_M55_L001,13.2724,-0.02708223,Immune cells,B,B,3,Non-responder,anti-PD1,P1,⋯,Male,Metastatic,Post,0,0.0,0,0,0,0,0.09291377
A12_P4_M55_L001,11.99764,1.58152444,Immune cells,B,B,3,Non-responder,anti-PD1,P1,⋯,Male,Metastatic,Post,0,0.0,0,0,0,0,0.0589158
A2_P2_M55_L001,13.00366,0.97222597,Immune cells,B,B,3,Non-responder,anti-PD1,P1,⋯,Male,Metastatic,Post,0,0.4105434,0,0,0,0,0.15003802


In [6]:
file_name <- "SKCM_GSE72056"
# Specify rows to keep
keep_rows <- c("TIGIT","PVR","CD226","PVRL2","LILRB1", "LILRB2", "LILRB4", "HLA-G")         

# load datasetB
df_meta <- read_tsv(paste("../input/",file_name,"_CellMetainfo_table.tsv", sep=""))
h5_data <- hdf5r::H5File$new(paste("../input/",file_name,"_expression.h5", sep=""), mode = 'r')

feature_matrix <- Matrix::sparseMatrix(
  i = h5_data[['matrix/indices']][],
  p = h5_data[['matrix/indptr']][],
  x = h5_data[['matrix/data']][],
  dimnames = list(
    h5_data[['matrix/features/name']][],
    h5_data[['matrix/barcodes']][]
  ),
  dims = h5_data[['matrix/shape']][],
  index1 = FALSE
)
rm(h5_data) # for saving memory


m_subset <- feature_matrix[rownames(feature_matrix) %in% keep_rows, ] # Extract rows from matrix
rm(feature_matrix)

df_mat <- as.data.frame(as.matrix(m_subset))
df_mat <- as_tibble(t(df_mat), rownames="Cell")

df <- df_meta %>% left_join(., df_mat)
head(df)

# to save files
write.csv(df, paste("../output/",file_name,".csv", sep=""))

[1mRows: [22m[34m4645[39m [1mColumns: [22m[34m13[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (10): Cell, Celltype (malignancy), Celltype (major-lineage), Celltype (m...
[32mdbl[39m  (3): UMAP_1, UMAP_2, Cluster

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1m[22mJoining with `by = join_by(Cell)`


Cell,UMAP_1,UMAP_2,Celltype (malignancy),Celltype (major-lineage),Celltype (minor-lineage),Celltype (original),Cluster,Celltype,Patient,Source,Gender,Stage,HLA-G,LILRB4,CD226,LILRB1,TIGIT,LILRB2,PVR
<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Cy72_CD45_H02_S758_comb,6.902168,-9.808507,Immune cells,B,B,B,1,B,Patient72,Tumor,Female,Metastatic,0.1244284,0,0,0.0,0,0,0.0
Cy72_CD45_A07_S679_comb,7.220452,-10.508887,Immune cells,B,B,B,1,B,Patient72,Tumor,Female,Metastatic,0.0434421,0,0,0.0,0,0,0.01038673
Cy71_CD45_H05_S569_comb,6.830885,-10.316877,Immune cells,B,B,Malignant,1,Malignant,Patient71,Tumor,Male,Metastatic,0.0,0,0,0.1409566,0,0,0.0
Cy74_CD45_B08_S404_comb,7.106164,-11.553431,Immune cells,B,B,B,1,B,Patient74,Tumor,Male,Metastatic,0.6685714,0,0,0.0,0,0,0.0
Cy72_CD45_G09_S753_comb,6.830161,-10.858874,Immune cells,B,B,Malignant,1,Malignant,Patient72,Tumor,Female,Metastatic,0.0,0,0,0.09906139,0,0,0.0
Cy74_CD45_C03_S411_comb,7.539371,-10.379721,Immune cells,B,B,B,1,B,Patient74,Tumor,Male,Metastatic,0.0,0,0,0.0,0,0,0.0


In [7]:
file_name <- "HNSC_GSE103322"
# Specify rows to keep
keep_rows <- c("TIGIT","PVR","CD226","PVRL2","LILRB1", "LILRB2", "LILRB4", "HLA-G")         

# load dataset
df_meta <- read_tsv(paste("../input/",file_name,"_CellMetainfo_table.tsv", sep=""))
h5_data <- hdf5r::H5File$new(paste("../input/",file_name,"_expression.h5", sep=""), mode = 'r')

feature_matrix <- Matrix::sparseMatrix(
  i = h5_data[['matrix/indices']][],
  p = h5_data[['matrix/indptr']][],
  x = h5_data[['matrix/data']][],
  dimnames = list(
    h5_data[['matrix/features/name']][],
    h5_data[['matrix/barcodes']][]
  ),
  dims = h5_data[['matrix/shape']][],
  index1 = FALSE
)
rm(h5_data) # for saving memory


m_subset <- feature_matrix[rownames(feature_matrix) %in% keep_rows, ] # Extract rows from matrix
rm(feature_matrix)

df_mat <- as.data.frame(as.matrix(m_subset))
df_mat <- as_tibble(t(df_mat), rownames="Cell")

df <- df_meta %>% left_join(., df_mat)
head(df)

# to save files
write.csv(df, paste("../output/",file_name,".csv", sep=""))

[1mRows: [22m[34m5902[39m [1mColumns: [22m[34m15[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (12): Cell, Celltype (malignancy), Celltype (major-lineage), Celltype (m...
[32mdbl[39m  (3): UMAP_1, UMAP_2, Cluster

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1m[22mJoining with `by = join_by(Cell)`


Cell,UMAP_1,UMAP_2,Celltype (malignancy),Celltype (major-lineage),Celltype (minor-lineage),Celltype (original),Cluster,Site,Celltype,⋯,Gender,Stage,TNMstage,HLA-G,LILRB4,CD226,LILRB1,TIGIT,LILRB2,PVR
<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<dbl>,<chr>,<chr>,⋯,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
HN25_P25_B08_S308_comb,5.387136,4.733791,Immune cells,CD4Tconv,CD4Tn,Dendritic,15,Lymph node,Dendritic,⋯,Female,Primary,II,0.1215125,0.0,0,0.0,0.0,0.0,0.0
HN26_P13_A02_S98_comb,5.397628,4.681664,Immune cells,CD4Tconv,CD4Tn,Dendritic,15,Lymph node,Dendritic,⋯,Male,Primary,III,0.4069121,0.0,0,0.0,0.0,0.0,0.005795162
HN26_P26_G10_S370_comb,5.412244,4.775695,Immune cells,CD4Tconv,CD4Tn,Dendritic,15,Lymph node,Dendritic,⋯,Male,Primary,III,0.1650751,0.0,0,0.0,0.0,0.0,0.0
HN26_P6_B09_S21_comb,10.841489,-1.543836,Immune cells,CD4Tconv,CD4Tn,T cell,9,Primary,T cell,⋯,Male,Primary,III,0.3732727,0.0,0,0.0,0.3594962,0.0,0.0
HNSCC26_P12_E08_S248_comb,8.661125,5.069691,Immune cells,CD4Tconv,CD4Tn,Macrophage,15,Lymph node,Macrophage,⋯,Male,Primary,III,0.1154432,0.01283061,0,0.8654621,0.0,0.005332531,0.0
HN25_P5_C11_S35_comb,5.380356,4.68756,Immune cells,CD4Tconv,CD4Tn,Dendritic,15,Primary,Dendritic,⋯,Female,Primary,II,0.1856017,0.0,0,0.0,0.0,0.0,0.0


In [8]:
file_name <- "BRCA_GSE114727_inDrop"
# Specify rows to keep
keep_rows <- c("TIGIT","PVR","CD226","PVRL2","LILRB1", "LILRB2", "LILRB4", "HLA-G")         

# load dataset
df_meta <- read_tsv(paste("../input/",file_name,"_CellMetainfo_table.tsv", sep=""))
h5_data <- hdf5r::H5File$new(paste("../input/",file_name,"_expression.h5", sep=""), mode = 'r')

feature_matrix <- Matrix::sparseMatrix(
  i = h5_data[['matrix/indices']][],
  p = h5_data[['matrix/indptr']][],
  x = h5_data[['matrix/data']][],
  dimnames = list(
    h5_data[['matrix/features/name']][],
    h5_data[['matrix/barcodes']][]
  ),
  dims = h5_data[['matrix/shape']][],
  index1 = FALSE
)
rm(h5_data) # for saving memory


m_subset <- feature_matrix[rownames(feature_matrix) %in% keep_rows, ] # Extract rows from matrix
rm(feature_matrix)

df_mat <- as.data.frame(as.matrix(m_subset))
df_mat <- as_tibble(t(df_mat), rownames="Cell")

df <- df_meta %>% left_join(., df_mat)
head(df)

# to save files
write.csv(df, paste("../output/",file_name,".csv", sep=""))

[1mRows: [22m[34m19676[39m [1mColumns: [22m[34m13[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (10): Cell, Celltype (malignancy), Celltype (major-lineage), Celltype (m...
[32mdbl[39m  (3): UMAP_1, UMAP_2, Cluster

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1m[22mJoining with `by = join_by(Cell)`


Cell,UMAP_1,UMAP_2,Celltype (malignancy),Celltype (major-lineage),Celltype (minor-lineage),Cluster,Sample,Patient,Source,⋯,Stage,TNMstage,HLA-G,PVR,TIGIT,CD226,LILRB4,LILRB2,LILRB1,PVRL2
<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,⋯,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
BC04BLOOD4_218,-8.800593,14.55176,Immune cells,B,B,10,BC04BLOOD4,BC04,PBMC,⋯,Primary,I,0,0,0,0,0,0,0,0
BC04BLOOD4_272,-8.305215,11.86706,Immune cells,B,B,15,BC04BLOOD4,BC04,PBMC,⋯,Primary,I,0,0,0,0,0,0,0,0
BC04BLOOD4_290,-7.171502,12.08958,Immune cells,B,B,15,BC04BLOOD4,BC04,PBMC,⋯,Primary,I,0,0,0,0,0,0,0,0
BC04BLOOD4_340,-8.384596,12.34753,Immune cells,B,B,15,BC04BLOOD4,BC04,PBMC,⋯,Primary,I,0,0,0,0,0,0,0,0
BC04BLOOD4_498,-7.047964,12.00084,Immune cells,B,B,15,BC04BLOOD4,BC04,PBMC,⋯,Primary,I,0,0,0,0,0,0,0,0
BC04BLOOD4_715,-8.249105,12.2762,Immune cells,B,B,15,BC04BLOOD4,BC04,PBMC,⋯,Primary,I,0,0,0,0,0,0,0,0


In [9]:
file_name <- "BRCA_GSE114727_10X"
# Specify rows to keep
keep_rows <- c("TIGIT","PVR","CD226","PVRL2","LILRB1", "LILRB2", "LILRB4", "HLA-G")         

# load dataset
df_meta <- read_tsv(paste("../input/",file_name,"_CellMetainfo_table.tsv", sep=""))
h5_data <- hdf5r::H5File$new(paste("../input/",file_name,"_expression.h5", sep=""), mode = 'r')

feature_matrix <- Matrix::sparseMatrix(
  i = h5_data[['matrix/indices']][],
  p = h5_data[['matrix/indptr']][],
  x = h5_data[['matrix/data']][],
  dimnames = list(
    h5_data[['matrix/features/name']][],
    h5_data[['matrix/barcodes']][]
  ),
  dims = h5_data[['matrix/shape']][],
  index1 = FALSE
)
rm(h5_data) # for saving memory


m_subset <- feature_matrix[rownames(feature_matrix) %in% keep_rows, ] # Extract rows from matrix
rm(feature_matrix)

df_mat <- as.data.frame(as.matrix(m_subset))
df_mat <- as_tibble(t(df_mat), rownames="Cell")

df <- df_meta %>% left_join(., df_mat)
head(df)

# to save files
write.csv(df, paste("../output/",file_name,".csv", sep=""))

[1mRows: [22m[34m28678[39m [1mColumns: [22m[34m13[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (10): Cell, Celltype (malignancy), Celltype (major-lineage), Celltype (m...
[32mdbl[39m  (3): UMAP_1, UMAP_2, Cluster

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1m[22mJoining with `by = join_by(Cell)`


Cell,UMAP_1,UMAP_2,Celltype (malignancy),Celltype (major-lineage),Celltype (minor-lineage),Cluster,Patient,Sample,Source,Gender,Stage,TNMstage,TIGIT,HLA-G,CD226,PVR,PVRL2,LILRB1
<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
BC09_TUMOR1@AAACCTGAGTTAAGTG-1,2.230674,2.0426747,Immune cells,CD4Tconv,Th17,0,BC09,BC09_TUMOR1,Tumor,Female,Primary,II,0.0,0,0.0,0,0,0
BC09_TUMOR1@AAACCTGCAACACCCG-1,2.157552,5.5871748,Immune cells,CD4Tconv,CD4Tn,1,BC09,BC09_TUMOR1,Tumor,Female,Primary,II,0.0,0,0.9804128,0,0,0
BC09_TUMOR1@AAACCTGTCCCTGACT-1,1.537627,3.9877924,Immune cells,CD4Tconv,CD4Tn,1,BC09,BC09_TUMOR1,Tumor,Female,Primary,II,1.314215,0,0.0,0,0,0
BC09_TUMOR1@AAACGGGAGCATGGCA-1,4.82421,2.1798624,Immune cells,CD4Tconv,Th17,0,BC09,BC09_TUMOR1,Tumor,Female,Primary,II,0.0,0,0.0,0,0,0
BC09_TUMOR1@AAACGGGCACTAAGTC-1,-7.379394,0.1972042,Immune cells,CD4Tconv,Tfh,6,BC09,BC09_TUMOR1,Tumor,Female,Primary,II,1.996102,0,0.9516879,0,0,0
BC09_TUMOR1@AAACGGGCAGCGTTCG-1,4.982963,1.4641977,Immune cells,CD4Tconv,Th17,0,BC09,BC09_TUMOR1,Tumor,Female,Primary,II,0.0,0,0.0,0,0,0


In [10]:
file_name <- "NSCLC_EMTAB6149"
# Specify rows to keep
keep_rows <- c("TIGIT","PVR","CD226","PVRL2","LILRB1", "LILRB2", "LILRB4", "HLA-G")         

# load dataset
df_meta <- read_tsv(paste("../input/",file_name,"_CellMetainfo_table.tsv", sep=""))
h5_data <- hdf5r::H5File$new(paste("../input/",file_name,"_expression.h5", sep=""), mode = 'r')

feature_matrix <- Matrix::sparseMatrix(
  i = h5_data[['matrix/indices']][],
  p = h5_data[['matrix/indptr']][],
  x = h5_data[['matrix/data']][],
  dimnames = list(
    h5_data[['matrix/features/name']][],
    h5_data[['matrix/barcodes']][]
  ),
  dims = h5_data[['matrix/shape']][],
  index1 = FALSE
)
rm(h5_data) # for saving memory


m_subset <- feature_matrix[rownames(feature_matrix) %in% keep_rows, ] # Extract rows from matrix
rm(feature_matrix)

df_mat <- as.data.frame(as.matrix(m_subset))
df_mat <- as_tibble(t(df_mat), rownames="Cell")

df <- df_meta %>% left_join(., df_mat)
head(df)

# to save files
write.csv(df, paste("../output/",file_name,".csv", sep=""))

[1mRows: [22m[34m40218[39m [1mColumns: [22m[34m11[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (8): Cell, Celltype (malignancy), Celltype (major-lineage), Celltype (mi...
[32mdbl[39m (3): UMAP_1, UMAP_2, Cluster

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1m[22mJoining with `by = join_by(Cell)`


Cell,UMAP_1,UMAP_2,Celltype (malignancy),Celltype (major-lineage),Celltype (minor-lineage),Celltype (original),Cluster,DbCluster,Source,Stage,TIGIT,HLA-G,CD226,PVR,LILRB2,LILRB1,LILRB4
<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
AGTTCTACGCATAC_1,1.842451,-16.59187,Others,Alveolar,Alveolar,Alveolar,13,Alveolar,Tumor,Primary,0,0,0,0,0,0,0
ATACCGGATCGTGA_1,1.75373,-16.29929,Others,Alveolar,Alveolar,Alveolar,13,Alveolar,Tumor,Primary,0,0,0,0,0,0,0
GAACGTTGATGGTC_1,1.481534,-15.80473,Others,Alveolar,Alveolar,Alveolar,13,Alveolar,Tumor,Primary,0,0,0,0,0,0,0
GCACGGTGTAACCG_1,3.88733,-14.12924,Others,Alveolar,Alveolar,Alveolar,13,Alveolar,Tumor,Primary,0,0,0,0,0,0,0
TCAGACGAGCAAGG_1,1.627641,-16.31171,Others,Alveolar,Alveolar,Alveolar,13,Alveolar,Tumor,Primary,0,0,0,0,0,0,0
GCGCACGACAGATC_3,-7.19263,-15.41526,Others,Alveolar,Alveolar,Malignant,13,Malignant,Tumor,Primary,0,0,0,0,0,0,0


In [11]:
file_name <- "BRCA_GSE176078"
# Specify rows to keep
keep_rows <- c("TIGIT","PVR","CD226","PVRL2","LILRB1", "LILRB2", "LILRB4", "HLA-G")         

# load dataset
df_meta <- read_tsv(paste("../input/",file_name,"_CellMetainfo_table.tsv", sep=""))
h5_data <- hdf5r::H5File$new(paste("../input/",file_name,"_expression.h5", sep=""), mode = 'r')

feature_matrix <- Matrix::sparseMatrix(
  i = h5_data[['matrix/indices']][],
  p = h5_data[['matrix/indptr']][],
  x = h5_data[['matrix/data']][],
  dimnames = list(
    h5_data[['matrix/features/name']][],
    h5_data[['matrix/barcodes']][]
  ),
  dims = h5_data[['matrix/shape']][],
  index1 = FALSE
)
rm(h5_data) # for saving memory


m_subset <- feature_matrix[rownames(feature_matrix) %in% keep_rows, ] # Extract rows from matrix
rm(feature_matrix)

df_mat <- as.data.frame(as.matrix(m_subset))
df_mat <- as_tibble(t(df_mat), rownames="Cell")

df <- df_meta %>% left_join(., df_mat)
head(df)

# to save files
write.csv(df, paste("../output/",file_name,".csv", sep=""))

[1mRows: [22m[34m89471[39m [1mColumns: [22m[34m13[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (10): Cell, Celltype (malignancy), Celltype (major-lineage), Celltype (m...
[32mdbl[39m  (3): UMAP_1, UMAP_2, Cluster

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1m[22mJoining with `by = join_by(Cell)`


Cell,UMAP_1,UMAP_2,Celltype (malignancy),Celltype (major-lineage),Celltype (minor-lineage),Celltype (original),Cluster,Subtype,Celltype_major,⋯,Celltype_minor,Patient,TIGIT,HLA-G,CD226,PVR,PVRL2,LILRB2,LILRB1,LILRB4
<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<dbl>,<chr>,<chr>,⋯,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
CID4495_AAGCCGCAGGCATGGT,-7.615879,-6.630866,Immune cells,B,B,Endothelial,8,TNBC,Endothelial,⋯,Endothelial_Endothelial ACKR1+,CID4495,0,0,0,0,0,0,0,2.761641
CID4495_GATCGATTCACGAAGG,-7.629208,-6.569772,Immune cells,B,B,Endothelial,8,TNBC,Endothelial,⋯,Endothelial_Endothelial ACKR1+,CID4495,0,0,0,0,0,0,0,2.109138
CID4495_CATCAAGCAGTGACAG,-7.570159,-6.324082,Immune cells,B,B,CAFs,8,TNBC,CAFs,⋯,CAFs_Inflammatory-CAFs,CID4495,0,0,0,0,0,0,0,3.010868
CID4495_CACCTTGCACCGAAAG,-9.622888,-3.392681,Immune cells,B,B,CAFs,8,TNBC,CAFs,⋯,CAFs_Inflammatory-CAFs,CID4495,0,0,0,0,0,0,0,0.0
CID4495_TGCGCAGGTTGGTAAA,-7.676848,-6.092321,Immune cells,B,B,CAFs,8,TNBC,CAFs,⋯,CAFs_Inflammatory-CAFs,CID4495,0,0,0,0,0,0,0,2.286632
CID4495_TGGGAAGCACTCAGGC,-7.65175,-6.488243,Immune cells,B,B,CAFs,8,TNBC,CAFs,⋯,CAFs_Inflammatory-CAFs,CID4495,0,0,0,0,0,0,0,0.0


In [12]:
file_name <- "AML_GSE116256"
# Specify rows to keep
keep_rows <- c("TIGIT","PVR","CD226","PVRL2","LILRB1", "LILRB2", "LILRB4", "HLA-G")         

# load dataset
df_meta <- read_tsv(paste("../input/",file_name,"_CellMetainfo_table.tsv", sep=""))
h5_data <- hdf5r::H5File$new(paste("../input/",file_name,"_expression.h5", sep=""), mode = 'r')

feature_matrix <- Matrix::sparseMatrix(
  i = h5_data[['matrix/indices']][],
  p = h5_data[['matrix/indptr']][],
  x = h5_data[['matrix/data']][],
  dimnames = list(
    h5_data[['matrix/features/name']][],
    h5_data[['matrix/barcodes']][]
  ),
  dims = h5_data[['matrix/shape']][],
  index1 = FALSE
)
rm(h5_data) # for saving memory


m_subset <- feature_matrix[rownames(feature_matrix) %in% keep_rows, ] # Extract rows from matrix
rm(feature_matrix)

df_mat <- as.data.frame(as.matrix(m_subset))
df_mat <- as_tibble(t(df_mat), rownames="Cell")

df <- df_meta %>% left_join(., df_mat)
head(df)

# to save files
write.csv(df, paste("../output/",file_name,".csv", sep=""))

[1mRows: [22m[34m38348[39m [1mColumns: [22m[34m15[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (12): Cell, Celltype (malignancy), Celltype (major-lineage), Celltype (m...
[32mdbl[39m  (3): UMAP_1, UMAP_2, Cluster

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1m[22mJoining with `by = join_by(Cell)`


Cell,UMAP_1,UMAP_2,Celltype (malignancy),Celltype (major-lineage),Celltype (minor-lineage),Celltype (original),Cluster,PredictionRefined,Sample,⋯,Patient,Gender,Stage,CD226,HLA-G,LILRB1,LILRB2,LILRB4,PVR,TIGIT
<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<dbl>,<chr>,<chr>,⋯,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
AML328-D113_ATTAAGCATTTC,-3.304277,-4.353379,Immune cells,B,B,B,18,normal,AML328-D113,⋯,AML328,Female,Primary,0,0,1.924949,0,0,0,0
AML328-D113_GAAATGCCCCAT,-3.453818,-4.270767,Immune cells,B,B,B,18,normal,AML328-D113,⋯,AML328,Female,Primary,0,0,0.0,0,0,0,0
AML328-D113_GGGTGGCGAAAC,-3.33842,-4.399727,Immune cells,B,B,B,18,normal,AML328-D113,⋯,AML328,Female,Primary,0,0,0.0,0,0,0,0
AML328-D113_TAACTAAGCTAC,-3.46575,-4.229091,Immune cells,B,B,B,18,normal,AML328-D113,⋯,AML328,Female,Primary,0,0,0.0,0,0,0,0
AML328-D113_TCGTTAGTTATT,-3.474983,-4.395324,Immune cells,B,B,B,18,normal,AML328-D113,⋯,AML328,Female,Primary,0,0,1.305719,0,0,0,0
AML328-D113_TCTAGATTCCGA,-3.43236,-4.451101,Immune cells,B,B,B,18,normal,AML328-D113,⋯,AML328,Female,Primary,0,0,0.0,0,0,0,0


In [None]:
file_name <- "AML_GSE116256"
# Specify rows to keep
keep_rows <- c("TIGIT","PVR","CD226","PVRL2","LILRB1", "LILRB2", "LILRB4", "HLA-G")         

# load dataset
df_meta <- read_tsv(paste("../input/",file_name,"_CellMetainfo_table.tsv", sep=""))
h5_data <- hdf5r::H5File$new(paste("../input/",file_name,"_expression.h5", sep=""), mode = 'r')

feature_matrix <- Matrix::sparseMatrix(
  i = h5_data[['matrix/indices']][],
  p = h5_data[['matrix/indptr']][],
  x = h5_data[['matrix/data']][],
  dimnames = list(
    h5_data[['matrix/features/name']][],
    h5_data[['matrix/barcodes']][]
  ),
  dims = h5_data[['matrix/shape']][],
  index1 = FALSE
)
rm(h5_data) # for saving memory


m_subset <- feature_matrix[rownames(feature_matrix) %in% keep_rows, ] # Extract rows from matrix
rm(feature_matrix)

df_mat <- as.data.frame(as.matrix(m_subset))
df_mat <- as_tibble(t(df_mat), rownames="Cell")

df <- df_meta %>% left_join(., df_mat)
head(df)

# to save files
write.csv(df, paste("../output/",file_name,".csv", sep=""))