# Add clone assignments to Seurat object and analyze clonal transcriptional profiles

In [5]:
library(ggplot2)
library(ggrepel)
library(plyr)
library(dplyr)
library(Seurat)
library(tidyr) 
library(DescTools)
library(pheatmap)
library(purrr)
library(ggbeeswarm)
library(forcats)

# Color palettes and helper functions
source("../sc_analysis_palettes_and_helpers.r")

dat_name <- "allClones_new"

## Add high-resolution clone assignments to primary tumor data

### Load full Seurat object


In [374]:
# Load full Seurat object

load('../transcriptome_analysis/data/allmerged_final_named_complete.Robj')


In [375]:
# Subset for MULTI-seq datasets with lineage info

Idents(all.comb) <- 'orig.ident'
dat <- subset(all.comb, idents = c('multi_seq_2_s1','multi_seq_2_s2',
                                   'multi_allos_1_tum',
                                   'multi_bAllos_1_tum',
                                   'multi_bAllos_2_tum',
                                   'multi_bAllos_3_tum',
                                   'multi_bAllos_4_PTs_S1','multi_bAllos_4_PTs_S2',
                                   'multiseq_17_S1','multiseq_17_S2'))

rm(all.comb)

### Load clone assignments

In [376]:
sample_specs <- read.delim('./data/PT_sample_specs.csv', sep = ';', stringsAsFactors = F)

In [377]:
allClones <- data.frame()
allClones_withDets <- data.frame()

Idents(dat) <- 'dataset'

for(i in 1:nrow(sample_specs)){


    sample_it <- sample_specs$sample[i]
    dataset_it <- sample_specs$dataset_name[i]
    
    
    dat_sub <- subset(dat, idents = dataset_it)
    
    
    clones_i <- data.frame()
    clones_i_det <- data.frame()

    for(y in unique(dat_sub$fish_id)){
        
        print(paste0('dataset: ',dataset_it,', fish:',y))

        if(file.exists(paste0(sample_specs$sample_wd[i],sample_specs$sample_name[i],'_',y,'_aut.csv'))){
            clones_aut <- read.delim(paste0(sample_specs$sample_wd[i],sample_specs$sample_name[i],'_',y,'_aut.csv'), sep = ',', stringsAsFactors = F, row.names = 1)
            clones_all_det <- read.delim(paste0(sample_specs$sample_wd[i],'original_',sample_specs$sample_name[i],'_',y,'_aut.csv'), sep = ',', stringsAsFactors = F, row.names = 1)
        }else{
            print('file does not exist!')
            next
        }
        
        
        if(dataset_it %in% c('multi_seq_05','multi_seq_06','multi_seq_08')){
            
            clones_aut$Barcode <- paste0(sample_specs$bc_extension[i],clones_aut$Barcode,'-1')
            clones_all_det$Barcode <- paste0(sample_specs$bc_extension[i],clones_all_det$Barcode,'-1')

        }else{
            
            clones_aut$Barcode <- paste0(sample_specs$bc_extension[i],clones_aut$Barcode)
            clones_all_det$Barcode <- paste0(sample_specs$bc_extension[i],clones_all_det$Barcode)

            }
        
        if(dataset_it %in% c('multi_seq_07','multi_seq_09')){
        
            clones_all_det$dataset <- dataset_it
        
        }
        
        print(head(clones_aut$Barcode))
        print(table(clones_aut$Barcode %in% rownames(dat_sub@meta.data)))
        clones_aut <- clones_aut[clones_aut$Barcode %in% rownames(dat_sub@meta.data),]
        
        clones_all_det <- clones_all_det[,c('Barcode','dataset','comb_seq_id','CIGAR','Gene','fish_clone','common_cluster')]

        
        clones_aut <- clones_aut[,c('Barcode','fish_clone','common_cluster')]
        clones_aut$dataset <- dataset_it
        clones_aut$fish_clone <- paste0(dataset_it,'_fish_',y,'_clone_',clones_aut$fish_clone)
        
        
        clones_aut$BC_cloneID <- paste0(clones_aut$Barcode,'_',clones_aut$fish_clone)
        clones_aut <- clones_aut[!duplicated(clones_aut$BC_cloneID),]

        print(paste0('# of duplicated barcodes: ',length((duplicated(clones_aut$Barcode)[duplicated(clones_aut$Barcode) == T]))))
        
        clones_i <- rbind(clones_i, clones_aut)
        clones_i_det <- rbind(clones_i_det, clones_all_det)
        rm(clones_aut)
    }

    allClones <- rbind(allClones, clones_i)
    allClones_withDets <- rbind(allClones_withDets, clones_i_det)
    rm(clones_i)


}

[1] "dataset: multi_seq_02, fish:5"
[1] "multi_2_multi_seq_2_s1_GCAGCTGAGTTGGCGA-1"
[2] "multi_2_multi_seq_2_s1_TACCTGCAGCTTAAGA-1"
[3] "multi_2_multi_seq_2_s1_TCTTGCGCAATGCAGG-1"
[4] "multi_2_multi_seq_2_s2_CTCTCGAGTCGGCTAC-1"
[5] "multi_2_multi_seq_2_s2_GATTCTTTCCGCGGAT-1"
[6] "multi_2_multi_seq_2_s2_GCATTAGAGCTTTGTG-1"

TRUE 
 100 
[1] "# of duplicated barcodes: 0"
[1] "dataset: multi_seq_02, fish:3"
[1] "multi_2_multi_seq_2_s1_AAACGCTCATCGTTCC-1"
[2] "multi_2_multi_seq_2_s1_AAAGAACAGTAGCTCT-1"
[3] "multi_2_multi_seq_2_s1_AAAGGGCTCATTCGTT-1"
[4] "multi_2_multi_seq_2_s1_AAAGTGATCCGGTAAT-1"
[5] "multi_2_multi_seq_2_s1_AACAACCTCTGCTTTA-1"
[6] "multi_2_multi_seq_2_s1_AACACACGTCTCGGGT-1"

TRUE 
 721 
[1] "# of duplicated barcodes: 0"
[1] "dataset: multi_seq_02, fish:4"
[1] "multi_2_multi_seq_2_s2_AAGGAATAGGAGTATT-1"
[2] "multi_2_multi_seq_2_s2_ACTTTGTAGGCCTAAG-1"
[3] "multi_2_multi_seq_2_s2_AGCGATTAGATACGAT-1"
[4] "multi_2_multi_seq_2_s2_CTGAGCGAGAGGCGTT-1"
[5] "multi_2_multi_seq_2_s2_TA

In [378]:
write.csv(allClones, paste0('allClones_merged_PTs',dat_name,'.csv'), quote = F)
write.csv(allClones_withDets, paste0('allClones_merged_PTs_withAllDetails',dat_name,'.csv'), quote = F)

### Merge clones with metadata

In [379]:
allClones <- read.delim(paste0('allClones_merged_PTs',dat_name,'.csv'), stringsAsFactors = F, header = T, row.names = 1, sep = ',')

In [380]:
metadat <- dat@meta.data
metadat$Barcode <- rownames(metadat)
dim(metadat[!metadat$Barcode %in% allClones$Barcode,])
dim(metadat)

In [381]:
metadat_ext <- left_join(metadat, allClones, by = 'Barcode')
rownames(metadat_ext) <- metadat_ext$Barcode
table(rownames(metadat_ext) == rownames(metadat))


 TRUE 
77779 

In [382]:
write.csv(metadat_ext, paste0('metadata_with_clones_merged_PTs_allCTs',dat_name,'.csv'), quote = F)

metadat_ext <- metadat_ext[metadat_ext$celltype_overall == 'NB',]

write.csv(metadat_ext, paste0('metadata_with_clones_merged_PTs_NBcells',dat_name,'.csv'), quote = F)

## Load module scores and merge with clone info and other metadata
### For zebrafish NB modules

In [7]:
# Load final module table
modules_pan <- read.delim('../gene_modules/final_gene_modules_list.csv', sep = ',', stringsAsFactors = F, header = T)

# Re-format into list
modules_pan <- as.list(modules_pan)

# Double-check that common gene names contain correct set of special characters
modules_pan <- lapply(modules_pan, function(x) gsub("si\\.","si:",x))
modules_pan <- lapply(modules_pan, function(x) gsub("zgc\\.","zgc:",x))
modules_pan <- lapply(modules_pan, function(x) gsub("mt\\.","mt-",x))
modules_pan <- lapply(modules_pan, function(x) gsub("dkey\\.","dkey-",x))
modules_pan <- lapply(modules_pan, function(x) gsub("h211\\.","h211-",x))
modules_pan <- lapply(modules_pan, function(x) gsub("1073\\.","1073-",x))
modules_pan <- lapply(modules_pan, function(x) gsub("1073\\.","1073-",x))
modules_pan <- lapply(modules_pan, function(x) gsub("ch73\\.","ch73-",x))

# Remove empty entries
modules_pan <- lapply(modules_pan, function(x) {
  # Remove NA and empty string "" entries
  x[!(is.na(x) | x == "")]
})

# Remove modules that are too short
modules_pan <- modules_pan[lengths(modules_pan) > 5]

In [6]:
# Load Seurat metadata with high-resolution clone assignments
metadat_ext <- read.delim(paste0('metadata_with_clones_merged_PTs_NBcells',dat_name,'.csv'), stringsAsFactors = F, sep = ',', row.names = 1, header = T)


### Load AUCell module expression scores

In [29]:
load('../transcriptome_analysis/data/moduleScores_AUC_NBcells_final_modules_allmerged_final_all_perSample.Robj')

In [30]:
# Merge all scores from different datasets into one dataframe
indtumours <- enrich_list

indtumours_merge <- data.frame()

for(i in 1:length(indtumours)){

    if(is.matrix(indtumours[[i]]) == T){
        indtumours_merge <- rbind(indtumours_merge, indtumours[[i]])
    }else{
        next
    }
}

indtumours_merge <- indtumours_merge[,sort(colnames(indtumours_merge))]

rm(indtumours)

In [32]:
indtumours_merge$Barcode <- rownames(indtumours_merge)

metadat_ext$Barcode <- rownames(metadat_ext)

table(rownames(indtumours_merge) %in% rownames(metadat_ext))


FALSE  TRUE 
38970 62886 

In [33]:
# Only keep cells that belong to a clone
metadat_scores_auc <- indtumours_merge[indtumours_merge$Barcode %in% rownames(metadat_ext),]

rm(indtumours_merge)
dim(metadat_scores_auc)

### Load 0-1 module expression scores

In [391]:
load('../transcriptome_analysis/data/moduleScores_Barkley_NBcells_final_modules_allmerged_final_all_perSample.Robj')

In [35]:
# Merge all scores from different datasets into one dataframe
indtumours <- enrich_list

indtumours_merge <- data.frame()

for(i in 1:length(indtumours)){

    if(is.matrix(indtumours[[i]]) == T){
        indtumours[[i]] <- t(indtumours[[i]])
        indtumours_merge <- rbind(indtumours_merge, indtumours[[i]])
    }else{
        next
    }
}

indtumours_merge <- indtumours_merge[,sort(colnames(indtumours_merge))]

rm(indtumours)

In [37]:
indtumours_merge$Barcode <- rownames(indtumours_merge)

metadat_ext$Barcode <- rownames(metadat_ext)

table(rownames(indtumours_merge) %in% rownames(metadat_ext))


FALSE  TRUE 
38970 62886 

In [38]:
metadat_scores_bark <- indtumours_merge[indtumours_merge$Barcode %in% rownames(metadat_ext),]


### Load countsums style module scores

In [396]:
load("../transcriptome_analysis/data/moduleScores_CountSum_NBcells_final_modules__allmerged_final_all_perSample.Robj")

In [40]:
# Merge all scores from different datasets into one dataframe
indtumours <- enrich_list

indtumours_merge <- data.frame()

for(i in 1:length(indtumours)){

    if(is.data.frame(indtumours[[i]]) == T){
        indtumours_merge <- rbind(indtumours_merge, indtumours[[i]])
    }else{
        next
    }
}

indtumours_merge <- indtumours_merge[,sort(colnames(indtumours_merge))]


rm(indtumours)

In [42]:
indtumours_merge$Barcode <- rownames(indtumours_merge)

metadat_ext$Barcode <- rownames(metadat_ext)

table(rownames(indtumours_merge) %in% rownames(metadat_ext))


FALSE  TRUE 
38970 62886 

In [43]:
# Only keep cells that belong to a clone
metadat_scores_counts <- indtumours_merge[indtumours_merge$Barcode %in% rownames(metadat_ext),]

rm(indtumours_merge)
dim(metadat_scores_counts)

### Harmonize all scores and add them to metadata and as dimensionality reductions

In [44]:
table(colnames(metadat_scores_bark) == colnames(metadat_scores_auc))
table(colnames(metadat_scores_bark) == colnames(metadat_scores_counts))
table(colnames(metadat_scores_auc) == colnames(metadat_scores_counts))



TRUE 
  31 


TRUE 
  31 


TRUE 
  31 

#### Merge Seurat metadata with clone info with scores

In [45]:
# First make module names in column names from each approach unique.
metadat_scores_bark_met <- metadat_scores_bark
colnames(metadat_scores_bark_met) <- paste0('barkley_',colnames(metadat_scores_bark_met))
metadat_scores_bark_met$Barcode <- metadat_scores_bark_met$barkley_Barcode 
metadat_scores_bark_met$barkley_Barcode <- NULL

metadat_scores_auc_met <- metadat_scores_auc
colnames(metadat_scores_auc_met) <- paste0('auc_',colnames(metadat_scores_auc_met))
metadat_scores_auc_met$Barcode <- metadat_scores_auc_met$auc_Barcode 
metadat_scores_bark_met$auc_Barcode <- NULL

metadat_scores_counts_met <- metadat_scores_counts
colnames(metadat_scores_counts_met) <- paste0('counts_',colnames(metadat_scores_counts_met))
metadat_scores_counts_met$Barcode <- metadat_scores_counts_met$counts_Barcode 
metadat_scores_counts_met$counts_Barcode <- NULL

In [46]:
# Merge all with metadata
metadat_ext_ext <- right_join(metadat_ext, metadat_scores_bark_met, by = 'Barcode')
dim(metadat_ext_ext)
metadat_ext_ext <- left_join(metadat_ext_ext, metadat_scores_auc_met, by = 'Barcode')
dim(metadat_ext_ext)
metadat_ext_ext <- left_join(metadat_ext_ext, metadat_scores_counts_met, by = 'Barcode')
dim(metadat_ext_ext)

rownames(metadat_ext_ext) <- metadat_ext_ext$Barcode

In [404]:
# prevent column duplication
metadat_ext_ext <- metadat_ext_ext[,!colnames(metadat_ext_ext) %in% colnames(dat@meta.data)]
table(colnames(metadat_ext_ext) %in% colnames(dat@meta.data))


FALSE 
   97 

In [405]:
# prepare for merging
dat@meta.data$Barcode <- rownames(dat@meta.data)
metadat_ext_ext$Barcode <- rownames(metadat_ext_ext)

table(rownames(metadat_ext_ext) %in% rownames(dat@meta.data))
table(rownames(dat@meta.data) %in% rownames(metadat_ext_ext))


 TRUE 
62886 


FALSE  TRUE 
14893 62886 

In [406]:
# merge and check whether new object row names match Seurat metadata rownames
metadat_ext <- left_join(dat@meta.data, metadat_ext_ext, by = 'Barcode')
rownames(metadat_ext) <- metadat_ext$Barcode

table(rownames(metadat_ext) == rownames(dat@meta.data))
table(rownames(dat@meta.data) %in% rownames(metadat_ext))


 TRUE 
77779 


 TRUE 
77779 

In [407]:
# Add to metadata
dat@meta.data <- metadat_ext

##### Subset Seurat object for NB cells with scores and clone assignments

In [408]:
# Only keep cells that belong to the primary tumour and have a module score
dim(dat)
dat <- subset(dat, cells = rownames(metadat_ext_ext))
dim(dat)


#### Add Scores as dimensionality reduction objects to Seurat object

In [25]:
module_names <- colnames(metadat_scores_auc)
module_names <- module_names[module_names != 'Barcode']

metadat_scores_bark <- metadat_scores_bark[rownames(metadat_scores_bark) %in% rownames(dat@meta.data),]
metadat_scores_bark <- metadat_scores_bark[rownames(dat@meta.data),]
metadat_scores_bark$Barcode <- NULL
nmf_obj_bark <- as.matrix(metadat_scores_bark)
colnames(nmf_obj_bark) <- paste0('bark', c(1:ncol(metadat_scores_bark)))

metadat_scores_auc <- metadat_scores_auc[rownames(metadat_scores_auc) %in% rownames(dat@meta.data),]
metadat_scores_auc <- metadat_scores_auc[rownames(dat@meta.data),]
metadat_scores_auc$Barcode <- NULL
nmf_obj_auc <- as.matrix(metadat_scores_auc)
colnames(nmf_obj_auc) <- paste0('auc', c(1:ncol(metadat_scores_auc)))

metadat_scores_counts <- metadat_scores_counts[rownames(metadat_scores_counts) %in% rownames(dat@meta.data),]
metadat_scores_counts <- metadat_scores_counts[rownames(dat@meta.data),]
metadat_scores_counts$Barcode <- NULL
nmf_obj_counts <- as.matrix(metadat_scores_counts)
colnames(nmf_obj_counts) <- paste0('count', c(1:ncol(metadat_scores_counts)))


ERROR: Error in rownames(dat@meta.data): object 'dat' not found


In [49]:
# Save order of module names in the data
write.csv(module_names, './data/mod_name_order_in_data.csv', quote = F)

In [411]:
# Add to Seurat object
dat[["scores_bark"]] <- CreateDimReducObject(embeddings = nmf_obj_bark, key = "bark_", assay = DefaultAssay(dat))
dat[["scores_auc"]] <- CreateDimReducObject(embeddings = nmf_obj_auc, key = "auc_", assay = DefaultAssay(dat))
dat[["scores_count"]] <- CreateDimReducObject(embeddings = nmf_obj_counts, key = "count_", assay = DefaultAssay(dat))


In [412]:
# Save
saveRDS(dat, file = paste0('seur_obj_NBcells_with_clones_and_scores_DR_NB_modules_',dat_name,'.rds'))
