In [1]:
library(data.table)
library(tidyverse)
library(conveniencefunctions)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.3.6     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.7     [32m✔[39m [34mdplyr  [39m 1.0.9
[32m✔[39m [34mtidyr  [39m 1.2.0     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 2.1.2     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mbetween()[39m   masks [34mdata.table[39m::between()
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m    masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mfirst()[39m     masks [34mdata.table[39m::first()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m       masks [34mstats[39m::lag()
[31m✖[39m [34mdplyr[39m::[32mlast()[39m      masks [34mdata.table[39m::last()
[31m✖[39m [34mpurrr[39m::[32mtranspose()[39m masks [34mdata.table[39m::transpose(

|Function                      |Shortcut         |
|:-----------------------------|:----------------|
|alignAssign                   |Ctrl+Q           |
|alignCursor                   |Ctrl+Shift+Q     |
|toggle_subsection             |Ctrl+Y           |
|toggle_subsubsection          |Ctrl+Shift+Y     |
|insertFormals                 |Ctrl+Shift+Alt+I |
|assignFormals                 |Ctrl+Shift+Alt+O |
|fixCommas                     |Ctrl+Alt+Z       |
|exposeAsArgument              |Ctrl+Alt+C       |
|toggle_mclapply               |Ctrl+Shift+G     |
|insert_debugonce              |Ctrl+Shift+Alt+G |
|transform_subsection          |Ctrl+J           |
|initiate_or_delete_subsection |Ctrl+Shift+J     |
|renumber_sections             |Ctrl+Shift+R     |
|insert_loopdebugger           |Ctrl+Shift+I     |
|toggle_blabla                 |Ctrl+Alt+O       |
|extract_importFrom            |Ctrl+Alt+I       |
|refactor_functionCall         |Ctrl+Alt+U       |
|insertHistory                 

In [2]:
## general functions to process data from all donors

# Turn one donor's binarized matrix into long-format CDR3B / pMHC records.
#
# Args:
#   file: path to a 'vdj_v1_hs_aggregated_<donor>_binarized_matrix.csv' file.
#
# Returns:
#   A list with three tables:
#     sample     - cell_clono_cdr3_aa, value, HLA, Epitope, Antigen, Source
#     sample_pos - rows of `sample` where value is TRUE, without `value`
#     sample_neg - rows of `sample` where value is FALSE, without `value`
#
# Side effects:
#   Prints a progress message and writes '10X-<donor>-loaded.csv',
#   '10X-<donor>-pos-loaded.csv' and '10X-<donor>-neg-loaded.csv' to the
#   working directory.
preprocessing <- function(file) {
    # Use the file name only, so the donor label does not depend on the
    # directory the file was listed from.
    donor <- sub('^vdj_v1_hs_aggregated_', '', basename(file))
    donor <- sub('_binarized_matrix\\.csv$', '', donor)
    cat('Preprocessing', donor, '...')

    dat <- fread(file)
    binder_cols <- grep('_binder', colnames(dat), value = TRUE)
    if (length(binder_cols) == 0) {
        stop('No *_binder columns found in ', file, call. = FALSE)
    }

    # Wide to long: one row per (cell, complex). Melting the data.table itself
    # avoids the redirect to reshape2 that melt() performs for data frames.
    dat <- melt(dat, id.vars = 'cell_clono_cdr3_aa', measure.vars = binder_cols,
                variable.name = 'complex', variable.factor = FALSE)
    dat <- as.data.frame(dat)

    # Drop every TRA chain; rows left empty had no TRB chain at all.
    dat$cell_clono_cdr3_aa <- gsub('TRA:[A-Z]*.?(;|$)', '', dat$cell_clono_cdr3_aa)
    dat <- dat[dat$cell_clono_cdr3_aa != '', ]

    # remove observations with ambiguous CDR3B
    dat <- dat[!grepl('TRB.*TRB.*', dat$cell_clono_cdr3_aa), ]
    # Strip the chain tag, then the leading C and trailing F of the sequence.
    dat$cell_clono_cdr3_aa <- gsub('TRB:', '', dat$cell_clono_cdr3_aa)
    dat$cell_clono_cdr3_aa <- sub('^C', '', dat$cell_clono_cdr3_aa)
    dat$cell_clono_cdr3_aa <- sub('F$', '', dat$cell_clono_cdr3_aa)

    # Complex names look like '<HLA>_<Epitope>_<Antigen>_<Source>'; split once
    # and pick fields by position (missing trailing fields become NA).
    dat$complex <- gsub('_binder', '', dat$complex)
    fields <- strsplit(dat$complex, split = '_', fixed = TRUE)
    nth_field <- function(i) vapply(fields, function(f) f[i], character(1))

    # Unwrap 'NR(<allele>)' literally; a character class such as '[(NR)]'
    # would also delete any N or R inside an allele name.
    dat$HLA <- sub('^NR\\((.*)\\)$', '\\1', nth_field(1))
    dat$Epitope <- nth_field(2)
    dat$Antigen <- nth_field(3)
    dat$Source <- nth_field(4)

    # After dropping TRA, the same CDR3B/complex can carry both labels:
    # remove such conflicting combinations entirely.
    dat <- unique(dat)
    dat <- dat %>%
        group_by(cell_clono_cdr3_aa, complex, HLA, Epitope, Antigen, Source) %>%
        filter(n_distinct(value) == 1) %>%
        ungroup()

    dat$complex <- NULL
    dat.pos <- dat[dat$value, ]
    dat.pos$value <- NULL
    dat.neg <- dat[!dat$value, ]
    dat.neg$value <- NULL

    write.table(dat, paste0('10X-', donor, '-loaded.csv'),
                quote = FALSE, append = FALSE, row.names = FALSE, sep = ',')
    write.table(dat.pos, paste0('10X-', donor, '-pos-loaded.csv'),
                quote = FALSE, append = FALSE, row.names = FALSE, sep = ',')
    write.table(dat.neg, paste0('10X-', donor, '-neg-loaded.csv'),
                quote = FALSE, append = FALSE, row.names = FALSE, sep = ',')

    list('sample' = dat, 'sample_pos' = dat.pos, 'sample_neg' = dat.neg)
}

In [3]:
# Collect only the per-donor binarized matrices from the parent directory;
# the anchored pattern ignores any other 'vdj_v1_hs_aggregated_*' files there.
files <- list.files('..', pattern = '^vdj_v1_hs_aggregated_.*_binarized_matrix\\.csv$', full.names = TRUE)
files

In [4]:
# Run the per-donor pipeline on every listed file, keeping listing order.
output <- lapply(files, function(path) preprocessing(path))

Preprocessing donor1 ...



Preprocessing donor2 ...



Preprocessing donor3 ...



Preprocessing donor4 ...



In [5]:
# Union of the per-donor tables. Folding the outer merge over `output` works
# for any number of donors instead of exactly four.
samples <- lapply(output, function(res) res$sample)
all <- Reduce(function(x, y) merge(x, y, by = colnames(x), all = TRUE), samples)
# Per-donor dimensions, then the dimensions of the merged table.
lapply(samples, dim)
dim(all)

In [6]:
# Preview the first rows of the merged table.
head(all)

Unnamed: 0_level_0,cell_clono_cdr3_aa,value,HLA,Epitope,Antigen,Source
Unnamed: 0_level_1,<chr>,<lgl>,<chr>,<chr>,<chr>,<chr>
1,AAAETGSAGELF,False,A0101,SLEGGGLGY,NC,
2,AAAETGSAGELF,False,A0101,STEGGGLAY,NC,
3,AAAETGSAGELF,False,A0101,VTEHDTLLY,IE-1,CMV
4,AAAETGSAGELF,False,A0201,ALIAPVHAV,NC,
5,AAAETGSAGELF,False,A0201,CLGGLLTMV,LMP-2A,EBV
6,AAAETGSAGELF,False,A0201,CLLGTYTQDV,Kanamycin-B-dioxygenase,


In [7]:
# Drop every (CDR3B, HLA, Epitope, Antigen, Source) combination that carries
# both labels once the donors are pooled.
all <- ungroup(
    filter(
        group_by(all, cell_clono_cdr3_aa, HLA, Epitope, Antigen, Source),
        n_distinct(value) == 1
    )
)
dim(all)

In [8]:
# Split the pooled table by label: TRUE rows vs FALSE rows.
all.pos <- all[all$value, ]
all.neg <- all[!all$value, ]
dim(all.pos)
dim(all.neg)

In [9]:
# Export the pooled table and its positive / negative subsets as CSV.
write.table(all, '10X-all-loaded.csv',
            quote = FALSE, append = FALSE, row.names = FALSE, sep = ',')
write.table(all.pos, '10X-all-pos-loaded.csv',
            quote = FALSE, append = FALSE, row.names = FALSE, sep = ',')
write.table(all.neg, '10X-all-neg-loaded.csv',
            quote = FALSE, append = FALSE, row.names = FALSE, sep = ',')

In [10]:
# Ignore HLA: keep only CDR3B, epitope and label, then de-duplicate.
all_wohla <- all[, c('cell_clono_cdr3_aa', 'Epitope', 'value')]
dim(all_wohla)
all_wohla <- all_wohla[!duplicated(all_wohla), , drop = FALSE]
dim(all_wohla)

In [11]:
# Re-check for conflicting labels per (CDR3B, Epitope); the dimensions are
# expected to stay the same.
all_wohla <- ungroup(
    filter(
        group_by(all_wohla, cell_clono_cdr3_aa, Epitope),
        n_distinct(value) == 1
    )
)
dim(all_wohla)

In [12]:
# Export the HLA-agnostic table as CSV.
write.table(all_wohla, '10X-all-without-HLA.csv',
            quote = FALSE, append = FALSE, row.names = FALSE, sep = ',')

In [13]:
# Split the HLA-agnostic table by label and drop the label column from each half.
all_wohla.pos <- all_wohla[all_wohla$value, ]
all_wohla.pos$value <- NULL
all_wohla.neg <- all_wohla[!all_wohla$value, ]
all_wohla.neg$value <- NULL
dim(all_wohla.pos)
dim(all_wohla.neg)

In [14]:
# Export the HLA-agnostic positive and negative pairs as CSV.
write.table(all_wohla.pos, '10X-pos-without-HLA.csv',
            quote = FALSE, append = FALSE, row.names = FALSE, sep = ',')
write.table(all_wohla.neg, '10X-neg-without-HLA.csv',
            quote = FALSE, append = FALSE, row.names = FALSE, sep = ',')

In [15]:
# Preview the positive (CDR3B, Epitope) pairs.
head(all_wohla.pos)

cell_clono_cdr3_aa,Epitope
<chr>,<chr>
AAAETGSAGELF,KLGGALQAK
AAGEMFGLGETQY,AVFDRKSDAK
AAGGASYNEQF,KLGGALQAK
AAGGGSEMNTEAF,KLGGALQAK
AAGLASNEQF,KLGGALQAK
AAGSSGNQPQH,RAKFKQLL


In [16]:
# List the distinct HLA codes present in the pooled table.
unique(all$HLA)

In [17]:
# Reformat compact allele codes such as 'A0101' as 'HLA-A*01:01'.
# A single anchored substitution only rewrites values still in the compact
# form, so re-running this cell leaves already formatted values unchanged
# (the previous unanchored gsub('A', 'A*') chain would produce 'A**...').
all$HLA <- sub('^([A-Z]+)([0-9]{2})([0-9]+)$', 'HLA-\\1*\\2:\\3', all$HLA)

In [18]:
# from https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7204072/, filtering alleles having AF < 10
# NOTE(review): the unit and direction of the threshold are not visible here —
# presumably alleles with allele frequency below 10% were filtered out; confirm
# against the paper.
# Some entries carry a third field (e.g. 'HLA-B*40:01:02'), so they can never
# equal the two-field names stored in all$HLA.
viet_alleles <- c("HLA-A*02:03", "HLA-A*02:07", "HLA-A*11:01", "HLA-A*24:02", "HLA-A*29:01", "HLA-A*33:03",
                  "HLA-B*07:05", "HLA-B*15:02", "HLA-B*15:25", "HLA-B*38:02", "HLA-B*40:01:02", "HLA-B*46:01", "HLA-B*58:01",
                  "HLA-C*01:02", "HLA-C*03:02:02", "HLA-C*03:04", "HLA-C*04:01", "HLA-C*04:03", "HLA-C*07:02", "HLA-C*08:01", "HLA-C*15:05:02",
                  "HLA-DRB1*03:01", "HLA-DRB1*04:05", "HLA-DRB1*08:03", "HLA-DRB1*09:01:02", "HLA-DRB1*10:01", "HLA-DRB1*12:02", "HLA-DRB1*15:02",
                  "HLA-DQB1*02:01", "HLA-DQB1*03:01", "HLA-DQB1*03:03", "HLA-DQB1*04:01", "HLA-DQB1*05:01", "HLA-DQB1*05:02", "HLA-DQB1*06:01"
                 )

In [19]:
# Distinct allele names present in the pooled table.
dat_alleles <- unique(all[['HLA']])
dat_alleles

In [20]:
# Alleles found both in the reference list and in the data, followed by the
# label counts for the rows carrying them.
intersect(viet_alleles, dat_alleles)
table(all[all$HLA %in% viet_alleles & all$HLA %in% dat_alleles, c('HLA', 'value')])

             value
HLA            FALSE   TRUE
  HLA-A*11:01 125270    942
  HLA-A*24:02 254101     40

In [21]:
# Reference alleles that contain a data allele name as a substring without
# being an exact match (e.g. a three-field name extending a two-field one).
# NOTE(review): str_escape() is presumably a regex-escaping helper; it emits the
# "Remember to cat()" message seen in the output — confirm which package
# provides it and that its result is safe to use directly as a pattern.
viet_over_dat_alleles <- viet_alleles[grepl(paste(str_escape(dat_alleles), collapse = "|"), viet_alleles) & !viet_alleles %in% dat_alleles]
viet_over_dat_alleles
# Data alleles that occur inside at least one of those reference alleles;
# the empty `else {}` yields NULL, which unlist() drops.
dat_alleles_short <- unlist(lapply(dat_alleles, function(x) if(any(suppressMessages(grepl(str_escape(x), viet_over_dat_alleles)))) {x} else {}))
dat_alleles_short
# Row counts (overall and by label) for those data alleles.
table(all[all$HLA %in% dat_alleles_short, 'HLA'])
table(all[all$HLA %in% dat_alleles_short, c('HLA', 'value')])

Remember to cat() to see result with single escapes.



NULL

< table of extent 0 >

< table of extent 0 x 0 >

In [22]:
# Reverse check: data alleles that contain a reference allele name as a
# substring without being an exact match.
# NOTE(review): relies on the same str_escape() helper as the previous cell —
# confirm its escaping behaviour.
dat_over_viet_alleles <- dat_alleles[grepl(paste(str_escape(viet_alleles), collapse = "|"), dat_alleles) & !dat_alleles %in% viet_alleles]
dat_over_viet_alleles
# Row counts (overall and by label) for those data alleles.
table(all[all$HLA %in% dat_over_viet_alleles, 'HLA'])
table(all[all$HLA %in% dat_over_viet_alleles, c('HLA', 'value')])

Remember to cat() to see result with single escapes.



< table of extent 0 >

< table of extent 0 x 0 >

In [23]:
# Keep only the columns used downstream, then de-duplicate.
all <- all[, c('cell_clono_cdr3_aa', 'HLA', 'Epitope', 'value')]
all <- unique(all)
dim(all)

In [24]:
# Preview the reduced table (CDR3B, HLA, Epitope, value).
head(all)

cell_clono_cdr3_aa,HLA,Epitope,value
<chr>,<chr>,<chr>,<lgl>
AAAETGSAGELF,HLA-A*01:01,SLEGGGLGY,False
AAAETGSAGELF,HLA-A*01:01,STEGGGLAY,False
AAAETGSAGELF,HLA-A*01:01,VTEHDTLLY,False
AAAETGSAGELF,HLA-A*02:01,ALIAPVHAV,False
AAAETGSAGELF,HLA-A*02:01,CLGGLLTMV,False
AAAETGSAGELF,HLA-A*02:01,CLLGTYTQDV,False


In [25]:
# Split by label and drop the label column by name rather than by position,
# so the split keeps working if the column order of `all` changes.
all.pos <- all[all$value, colnames(all) != 'value']
all.neg <- all[!all$value, colnames(all) != 'value']
dim(all.pos)
dim(all.neg)

In [26]:
# Export the final positive and negative (CDR3B, HLA, Epitope) tables as CSV.
write.table(all.pos, '10X-pos-analyzed.csv',
            quote = FALSE, append = FALSE, row.names = FALSE, sep = ',')
write.table(all.neg, '10X-neg-analyzed.csv',
            quote = FALSE, append = FALSE, row.names = FALSE, sep = ',')

# Preprocessing on donor 1

In [27]:
# Load donor 1's binarized matrix as a plain data frame for step-by-step inspection.
donor1 <- fread('../vdj_v1_hs_aggregated_donor1_binarized_matrix.csv', data.table = FALSE)
head(donor1)

Unnamed: 0_level_0,barcode,donor,cell_clono_cdr3_aa,cell_clono_cdr3_nt,CD3,CD19,CD45RA,CD4,CD8a,CD14,⋯,B0702_RPHERNGFTVL_pp65_CMV_binder,B0801_RAKFKQLL_BZLF1_EBV_binder,B0801_ELRRKMMYM_IE-1_CMV_binder,B0801_FLRGRAYGL_EBNA-3A_EBV_binder,A0101_SLEGGGLGY_NC_binder,A0101_STEGGGLAY_NC_binder,A0201_ALIAPVHAV_NC_binder,A2402_AYSSAGASI_NC_binder,B0702_GPAESAAGL_NC_binder,NR(B0801)_AAKGRGAAL_NC_binder
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>
1,AAACCTGAGACAAAGG-4,donor1,TRA:CAASVSIWTGTASKLTF;TRA:CAAWDMEYGNKLVF;TRB:CAISDPGLAGGGGEQFF,TRA:TGTGCAGCAAGCGTTAGTATTTGGACCGGCACTGCCAGTAAACTCACCTTT;TRA:TGTGCCGCCTGGGACATGGAATATGGAAACAAGCTGGTCTTT;TRB:TGTGCCATCAGTGACCCCGGACTAGCGGGAGGCGGGGGGGAGCAGTTCTTC,2125,0,912,1,2223,4,⋯,False,False,False,False,False,False,False,False,False,False
2,AAACCTGAGACTGTAA-34,donor1,TRB:CASDTPVGQFF,TRB:TGTGCCAGCGATACCCCGGTTGGGCAGTTCTTC,1023,0,2028,2,3485,1,⋯,False,False,False,False,False,False,False,False,False,False
3,AAACCTGAGAGCCCAA-5,donor1,TRA:CASYTDKLIF;TRB:CASSGGSISTDTQYF,TRA:TGTGCTTCCTACACCGACAAGCTCATCTTT;TRB:TGCGCCAGCAGTGGCGGGAGTATTAGCACAGATACGCAGTATTTT,1598,3,3454,4,3383,1,⋯,False,False,False,False,False,False,False,False,False,False
4,AAACCTGAGAGCTGCA-24,donor1,TRB:CASSGGQSSYEQYF,TRB:TGCGCCAGCAGTGGCGGACAGAGCTCCTACGAGCAGTACTTC,298,1,880,1,2389,1,⋯,False,False,False,False,False,False,False,False,False,False
5,AAACCTGAGAGGGATA-8,donor1,TRA:CAASGYGNTGRRALTF;TRB:CASSQDPAGGYNEQFF,TRA:TGTGCAGCAAGCGGGTATGGAAACACGGGCAGGAGAGCACTTACTTTT;TRB:TGCGCCAGCAGCCAAGACCCAGCGGGGGGGTACAATGAGCAGTTCTTC,1036,0,2457,2,3427,3,⋯,False,False,False,False,False,False,False,False,False,False
6,AAACCTGAGAGTGAGA-23,donor1,TRA:CAAHLSNFGNEKLTF;TRB:CATSRDRGHGDTIYF,TRA:TGTGCAGCACACTTATCTAACTTTGGAAATGAGAAATTAACCTTT;TRB:TGTGCCACCAGCAGAGATCGGGGCCATGGGGACACCATATATTTT,1729,1,39,160,5671,5,⋯,False,False,False,False,False,False,False,False,False,False


In [28]:
# Drop columns 1-2 and 4-18 by position, keeping column 3 (cell_clono_cdr3_aa)
# and everything from column 19 onwards.
drop_idx <- c(1:2, 4:18)
donor1 <- donor1[, -drop_idx]
head(donor1)

Unnamed: 0_level_0,cell_clono_cdr3_aa,A0101_VTEHDTLLY_IE-1_CMV,A0201_KTWGQYWQV_gp100_Cancer,A0201_ELAGIGILTV_MART-1_Cancer,A0201_CLLWSFQTSA_Tyrosinase_Cancer,A0201_IMDQVPFSV_gp100_Cancer,A0201_SLLMWITQV_NY-ESO-1_Cancer,A0201_KVAELVHFL_MAGE-A3_Cancer,A0201_KVLEYVIKV_MAGE-A1_Cancer,A0201_CLLGTYTQDV_Kanamycin-B-dioxygenase,⋯,B0702_RPHERNGFTVL_pp65_CMV_binder,B0801_RAKFKQLL_BZLF1_EBV_binder,B0801_ELRRKMMYM_IE-1_CMV_binder,B0801_FLRGRAYGL_EBNA-3A_EBV_binder,A0101_SLEGGGLGY_NC_binder,A0101_STEGGGLAY_NC_binder,A0201_ALIAPVHAV_NC_binder,A2402_AYSSAGASI_NC_binder,B0702_GPAESAAGL_NC_binder,NR(B0801)_AAKGRGAAL_NC_binder
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>
1,TRA:CAASVSIWTGTASKLTF;TRA:CAAWDMEYGNKLVF;TRB:CAISDPGLAGGGGEQFF,0,0,0,0,0,0,0,0,0,⋯,False,False,False,False,False,False,False,False,False,False
2,TRB:CASDTPVGQFF,0,0,0,0,0,0,0,0,0,⋯,False,False,False,False,False,False,False,False,False,False
3,TRA:CASYTDKLIF;TRB:CASSGGSISTDTQYF,0,0,0,0,0,0,0,0,1,⋯,False,False,False,False,False,False,False,False,False,False
4,TRB:CASSGGQSSYEQYF,0,0,0,0,0,0,0,0,0,⋯,False,False,False,False,False,False,False,False,False,False
5,TRA:CAASGYGNTGRRALTF;TRB:CASSQDPAGGYNEQFF,0,0,0,0,0,0,1,0,0,⋯,False,False,False,False,False,False,False,False,False,False
6,TRA:CAAHLSNFGNEKLTF;TRB:CATSRDRGHGDTIYF,0,0,0,0,0,0,0,0,0,⋯,False,False,False,False,False,False,False,False,False,False


In [29]:
# Keep the CDR3 column plus every column whose name contains '_binder'.
keep_col <- grep('_binder', colnames(donor1), value = TRUE)
donor1 <- donor1[, c('cell_clono_cdr3_aa', keep_col)]
ncol(donor1)

In [30]:
# wide to long dataframe
# One row per (cell, binder column); the former column name goes to 'complex'
# and the cell content to 'value'.
# NOTE(review): donor1 is a plain data frame here (read with data.table = F),
# so which melt() method runs depends on the installed data.table/reshape2 —
# confirm this still works on the target environment.
donor1 <- melt(donor1, id.vars = c('cell_clono_cdr3_aa'), variable.name = 'complex')
# Label balance, then a preview of the long table.
table(donor1$value)
head(donor1)




  FALSE    TRUE 
2312027   14273 

Unnamed: 0_level_0,cell_clono_cdr3_aa,complex,value
Unnamed: 0_level_1,<chr>,<fct>,<lgl>
1,TRA:CAASVSIWTGTASKLTF;TRA:CAAWDMEYGNKLVF;TRB:CAISDPGLAGGGGEQFF,A0101_VTEHDTLLY_IE-1_CMV_binder,False
2,TRB:CASDTPVGQFF,A0101_VTEHDTLLY_IE-1_CMV_binder,False
3,TRA:CASYTDKLIF;TRB:CASSGGSISTDTQYF,A0101_VTEHDTLLY_IE-1_CMV_binder,False
4,TRB:CASSGGQSSYEQYF,A0101_VTEHDTLLY_IE-1_CMV_binder,False
5,TRA:CAASGYGNTGRRALTF;TRB:CASSQDPAGGYNEQFF,A0101_VTEHDTLLY_IE-1_CMV_binder,False
6,TRA:CAAHLSNFGNEKLTF;TRB:CATSRDRGHGDTIYF,A0101_VTEHDTLLY_IE-1_CMV_binder,False


In [31]:
# Rows listing more than one TRB chain are ambiguous and may hurt training;
# count them against the full table size.
dim(donor1)
sum(grepl('TRB.*TRB.*', donor1$cell_clono_cdr3_aa))

In [32]:
# Remove every TRA chain from the CDR3 string, then drop rows left empty
# (cells that had no TRB chain).
donor1$cell_clono_cdr3_aa <- gsub(
    'TRA:[A-Z]*.?(;|$)', '',
    donor1$cell_clono_cdr3_aa
)
donor1 <- donor1[donor1$cell_clono_cdr3_aa != '', ]
dim(donor1)

In [33]:
#donor1 <- donor1 %>% 
#mutate(cell_clono_cdr3_aa = strsplit(cell_clono_cdr3_aa, ";")) %>% 
#    unnest(cell_clono_cdr3_aa)
#head(donor1)

In [34]:
# Discard rows whose CDR3 string still names two or more TRB chains.
donor1 <- donor1[
    !grepl('TRB.*TRB.*', donor1$cell_clono_cdr3_aa),
]
dim(donor1)

In [35]:
# Strip the 'TRB:' tag, then the leading C and the trailing F of the sequence.
# The anchored patterns match at most once, so sub() is equivalent to gsub().
donor1$cell_clono_cdr3_aa <- sub(
    'F$', '',
    sub('^C', '', gsub('TRB:', '', donor1$cell_clono_cdr3_aa))
)
head(donor1)

Unnamed: 0_level_0,cell_clono_cdr3_aa,complex,value
Unnamed: 0_level_1,<chr>,<fct>,<lgl>
1,AISDPGLAGGGGEQF,A0101_VTEHDTLLY_IE-1_CMV_binder,False
2,ASDTPVGQF,A0101_VTEHDTLLY_IE-1_CMV_binder,False
3,ASSGGSISTDTQY,A0101_VTEHDTLLY_IE-1_CMV_binder,False
4,ASSGGQSSYEQY,A0101_VTEHDTLLY_IE-1_CMV_binder,False
5,ASSQDPAGGYNEQF,A0101_VTEHDTLLY_IE-1_CMV_binder,False
6,ATSRDRGHGDTIY,A0101_VTEHDTLLY_IE-1_CMV_binder,False


In [36]:
# melt() returned 'complex' as a factor; convert it to character before the
# string operations that follow.
donor1$complex <- as.character(donor1$complex)
# Disabled experiment: would duplicate the 'NC' field in complex names.
#donor1$complex <- gsub('NC_', 'NC_NC_', donor1$complex)

In [37]:
# Complex names look like '<HLA>_<Epitope>_<Antigen>_<Source>': remove the
# '_binder' suffix, split once, and pick fields by position (a missing
# trailing field becomes NA).
donor1$complex <- gsub('_binder', '', donor1$complex)
complex_fields <- strsplit(as.character(donor1$complex), split = '_', fixed = TRUE)
nth_field <- function(i) vapply(complex_fields, function(f) f[i], character(1))
# Unwrap 'NR(<allele>)' literally; the character class '[(NR)]' would also
# delete any N or R occurring inside an allele name.
donor1$HLA <- sub('^NR\\((.*)\\)$', '\\1', nth_field(1))
donor1$Epitope <- nth_field(2)
donor1$Antigen <- nth_field(3)
donor1$Source <- nth_field(4)
head(donor1)

Unnamed: 0_level_0,cell_clono_cdr3_aa,complex,value,HLA,Epitope,Antigen,Source
Unnamed: 0_level_1,<chr>,<chr>,<lgl>,<chr>,<chr>,<chr>,<chr>
1,AISDPGLAGGGGEQF,A0101_VTEHDTLLY_IE-1_CMV,False,A0101,VTEHDTLLY,IE-1,CMV
2,ASDTPVGQF,A0101_VTEHDTLLY_IE-1_CMV,False,A0101,VTEHDTLLY,IE-1,CMV
3,ASSGGSISTDTQY,A0101_VTEHDTLLY_IE-1_CMV,False,A0101,VTEHDTLLY,IE-1,CMV
4,ASSGGQSSYEQY,A0101_VTEHDTLLY_IE-1_CMV,False,A0101,VTEHDTLLY,IE-1,CMV
5,ASSQDPAGGYNEQF,A0101_VTEHDTLLY_IE-1_CMV,False,A0101,VTEHDTLLY,IE-1,CMV
6,ATSRDRGHGDTIY,A0101_VTEHDTLLY_IE-1_CMV,False,A0101,VTEHDTLLY,IE-1,CMV


In [38]:
# De-duplicate identical rows and compare the dimensions before and after.
dim(donor1)
donor1 <- donor1[!duplicated(donor1), , drop = FALSE]
dim(donor1)

In [39]:
# With the TRA chain removed, the same TRB/HLA/Epitope combination can appear
# with both labels; drop every combination that does.
donor1 <- ungroup(
    filter(
        group_by(donor1, cell_clono_cdr3_aa, complex, HLA, Epitope, Antigen, Source),
        n_distinct(value) == 1
    )
)
dim(donor1)

In [40]:
# Number of distinct (CDR3B, HLA, Epitope) combinations left.
dim(distinct(donor1, cell_clono_cdr3_aa, HLA, Epitope))

In [41]:
# Drop the raw complex name, then split by label; each half loses the label column.
donor1$complex <- NULL
donor1.pos <- donor1[donor1$value, colnames(donor1) != 'value']
nrow(donor1.pos)
donor1.neg <- donor1[!donor1$value, colnames(donor1) != 'value']
nrow(donor1.neg)