In [1]:
library(data.table)
library(conveniencefunctions)
library(VennDiagram)
library(readxl)

Loading required package: ggplot2



|Function                      |Shortcut         |
|:-----------------------------|:----------------|
|alignAssign                   |Ctrl+Q           |
|alignCursor                   |Ctrl+Shift+Q     |
|toggle_subsection             |Ctrl+Y           |
|toggle_subsubsection          |Ctrl+Shift+Y     |
|insertFormals                 |Ctrl+Shift+Alt+I |
|assignFormals                 |Ctrl+Shift+Alt+O |
|fixCommas                     |Ctrl+Alt+Z       |
|exposeAsArgument              |Ctrl+Alt+C       |
|toggle_mclapply               |Ctrl+Shift+G     |
|insert_debugonce              |Ctrl+Shift+Alt+G |
|transform_subsection          |Ctrl+J           |
|initiate_or_delete_subsection |Ctrl+Shift+J     |
|renumber_sections             |Ctrl+Shift+R     |
|insert_loopdebugger           |Ctrl+Shift+I     |
|toggle_blabla                 |Ctrl+Alt+O       |
|extract_importFrom            |Ctrl+Alt+I       |
|refactor_functionCall         |Ctrl+Alt+U       |
|insertHistory                 

Loading required package: grid

Loading required package: futile.logger



# Load data and check for exceptions

In [2]:
# Read the TBAdb workbook (default sheet) into `dat`, then show its
# dimensions and a preview of the first rows.
dat <- read_excel(path = "TBAdb.xlsx")
dim(dat)
head(dat, n = 6L)

ICDname,Disease.name,Category,Antigen,Antigen.sequence,HLA,Locus,CDR3.alpha.aa,CDR3.beta.aa,CDR3.alpha.nt,⋯,Cell.subtype,Prepare.method,Evaluate.method,Case.num,Control.type,Control.num,Filteration,Journal,Pubmed.id,Grade
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,⋯,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>
A15,Tuberculosis,Pathogen,CFP10,TAAQAAVVRFQEAAN,DRB1*15:03,TRA-TRB,CIEHTNSGGSNYKLTF,CASSLEETQYF,-,⋯,CD4,Multiple PCR,Antigen-specific ex vivo proliferation,22,-,-,-,Nature,28636589,5
A15,Tuberculosis,Pathogen,CFP10,TAAQAAVVRFQEAAN,DRB1*15:03,TRA-TRB,CIVHTNSGGSNYKLTF,CASSPEETQYF,-,⋯,CD4,Multiple PCR,Antigen-specific ex vivo proliferation,22,-,-,-,Nature,28636589,5
A15,Tuberculosis,Pathogen,CFP10,TAAQAAVVRFQEAAN,DRB1*15:03,TRA-TRB,CIVKTNSGGSNYKLTF,CASSFEETQYF,-,⋯,CD4,Multiple PCR,Antigen-specific ex vivo proliferation,22,-,-,-,Nature,28636589,5
A15,Tuberculosis,Pathogen,ESAT-6;CFP-10,-,-,TRB,-,CASGRPYEQYF,-,⋯,T,5'RACE,Statistical analysis,25,Health,15,Stimulated with ESAT-6 or CFP-10,Tuberculosis,23845455,2
A15,Tuberculosis,Pathogen,ESAT-6;CFP-10,-,-,TRB,-,CASSFLERGLFFYEQYF,-,⋯,T,5'RACE,Statistical analysis,25,-,15,Stimulated with ESAT-6 or CFP-10,Tuberculosis,23845455,2
A15,Tuberculosis,Pathogen,ESAT-6;CFP-10,-,-,TRB,-,CASSFLLRGAFFYEQYF,-,⋯,T,5'RACE,Statistical analysis,25,Health,15,Stimulated with ESAT-6 or CFP-10,Tuberculosis,23845455,2


In [3]:
# Strip one leading "C" and one trailing "F" from every beta-chain CDR3
# string. Both patterns are anchored, so at most one character is removed
# at each end.
cdr3_beta <- sub("^C", "", dat[["CDR3.beta.aa"]])
dat[["CDR3.beta.aa"]] <- sub("F$", "", cdr3_beta)

In [4]:
# TRUE if at least one cell anywhere in `dat` is NA.
anyNA(dat)

In [5]:
# Report whether any beta-chain CDR3 or any epitope sequence contains a
# punctuation character (POSIX class [[:punct:]]); one TRUE/FALSE per column.
any(grepl("[[:punct:]]", dat[["CDR3.beta.aa"]]))
any(grepl("[[:punct:]]", dat[["Antigen.sequence"]]))

In [6]:
# Report whether any beta-chain CDR3 or any epitope sequence contains a
# space character; one TRUE/FALSE per column.
any(grepl(" ", dat[["CDR3.beta.aa"]], fixed = TRUE))
any(grepl(" ", dat[["Antigen.sequence"]], fixed = TRUE))

In [7]:
# Drop rows whose beta-chain CDR3 contains any punctuation character.
# This also removes rows whose value is only a "-" placeholder, because "-"
# is itself punctuation.
cdr3_has_punct <- grepl("[[:punct:]]", dat$CDR3.beta.aa)
dat <- dat[!cdr3_has_punct, ]
dim(dat)

In [8]:
# Drop rows whose epitope (Antigen.sequence) contains any punctuation
# character, e.g. the "-" placeholder visible in the data preview.
epitope_has_punct <- grepl("[[:punct:]]", dat$Antigen.sequence)
dat <- dat[!epitope_has_punct, ]
dim(dat)

In [9]:
# Keep only the columns used downstream and collapse duplicated rows.
core_cols <- c(
  "Disease.name", "Category", "Antigen",
  "Antigen.sequence", "HLA", "CDR3.beta.aa"
)
dat <- unique(dat[, core_cols])
dim(dat)

In [10]:
# Preview the first six rows of the cleaned table.
head(dat, n = 6L)

Disease.name,Category,Antigen,Antigen.sequence,HLA,CDR3.beta.aa
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
Tuberculosis,Pathogen,CFP10,TAAQAAVVRFQEAAN,DRB1*15:03,ASSLEETQY
Tuberculosis,Pathogen,CFP10,TAAQAAVVRFQEAAN,DRB1*15:03,ASSPEETQY
Tuberculosis,Pathogen,CFP10,TAAQAAVVRFQEAAN,DRB1*15:03,ASSFEETQY
Tuberculosis,Pathogen,Rv1195,ADTLQSIGATTVASN,DRB1*15:03,ASSVALASGANVLT
Tuberculosis,Pathogen,Rv1195,ADTLQSIGATTVASN,DRB1*15:03,ASSVALATGEQY
Tuberculosis,Pathogen,Rv1195,ADTLQSIGATTVASN,DRB1*15:03,ASSVALQGVHTQY


In [11]:
# Write the cleaned table to 'TBAdb-loaded.csv' as comma-separated text,
# overwriting any existing file (append = FALSE), without row names.
# Logical arguments are spelled out as FALSE: `F` is an ordinary variable
# that can be reassigned, which would silently change the output format.
# NOTE(review): quoting is disabled, so a field containing a comma would
# shift columns in the output; confirm no column can contain ','.
write.table(
  dat, "TBAdb-loaded.csv",
  quote = FALSE, row.names = FALSE, sep = ",", append = FALSE
)

# Analysis

In [12]:
# List the distinct antigen names present in the table.
unique(dat[["Antigen"]])

In [13]:
# List the distinct disease names present in the table.
unique(dat[["Disease.name"]])

In [14]:
# List the distinct values of the Category column.
unique(dat[["Category"]])

In [15]:
# Preview the first six rows before narrowing the column set.
head(dat, n = 6L)

Disease.name,Category,Antigen,Antigen.sequence,HLA,CDR3.beta.aa
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
Tuberculosis,Pathogen,CFP10,TAAQAAVVRFQEAAN,DRB1*15:03,ASSLEETQY
Tuberculosis,Pathogen,CFP10,TAAQAAVVRFQEAAN,DRB1*15:03,ASSPEETQY
Tuberculosis,Pathogen,CFP10,TAAQAAVVRFQEAAN,DRB1*15:03,ASSFEETQY
Tuberculosis,Pathogen,Rv1195,ADTLQSIGATTVASN,DRB1*15:03,ASSVALASGANVLT
Tuberculosis,Pathogen,Rv1195,ADTLQSIGATTVASN,DRB1*15:03,ASSVALATGEQY
Tuberculosis,Pathogen,Rv1195,ADTLQSIGATTVASN,DRB1*15:03,ASSVALQGVHTQY


In [16]:
# Narrow the table to disease, epitope, HLA and CDR3, then collapse the rows
# that became duplicates once the other columns were dropped.
hla_cols <- c("Disease.name", "Antigen.sequence", "HLA", "CDR3.beta.aa")
dat <- unique(dat[, hla_cols])
dim(dat)

In [17]:
# Count the distinct HLA annotations, then show them in sorted order.
hla_values <- unique(dat$HLA)
length(hla_values)
sort(hla_values)

In [18]:
# Row count per raw HLA annotation.
table(dat[["HLA"]])


                                                                         - 
                                                                         8 
                                                                   A*01:01 
                                                                        53 
A*01:01-A*02:01/B*08:01-B*57:01/Cw*06:02-Cw*07:01/DRB1*08:0321-DRB1*15:011 
                                                                         2 
                                                                      A*02 
                                                                      3923 
                                                                   A*02:01 
                                                                       626 
                                                                      A*11 
                                                                        16 
                                                                   A*24:02 
           

In [19]:
# HLA alleles are not needed for CDR3-epitope prediction, so this table is
# built before any HLA-based filtering: rows with a missing or ambiguous HLA
# annotation are kept and only distinct epitope/CDR3 pairs are retained.
pair_cols <- c("Antigen.sequence", "CDR3.beta.aa")
dat_wohla <- unique(dat[, pair_cols])
dim(dat_wohla)

In [20]:
# Preview the epitope/CDR3 pairs, then write them to 'TBAdb-without-HLA.csv'
# as comma-separated text, overwriting any existing file.
# Logical arguments are spelled out as FALSE: `F` is an ordinary variable
# that can be reassigned, which would silently change the output format.
head(dat_wohla)
write.table(
  dat_wohla, "TBAdb-without-HLA.csv",
  quote = FALSE, row.names = FALSE, append = FALSE, sep = ","
)

Antigen.sequence,CDR3.beta.aa
<chr>,<chr>
TAAQAAVVRFQEAAN,ASSLEETQY
TAAQAAVVRFQEAAN,ASSPEETQY
TAAQAAVVRFQEAAN,ASSFEETQY
ADTLQSIGATTVASN,ASSVALASGANVLT
ADTLQSIGATTVASN,ASSVALATGEQY
ADTLQSIGATTVASN,ASSVALQGVHTQY


In [21]:
# Keep only rows with a single, known HLA value: drop the "-" placeholder and
# every annotation containing "/" (combined multi-allele entries), then
# tabulate what remains.
hla_is_placeholder <- dat$HLA == "-"
hla_is_combined <- grepl("/", dat$HLA, fixed = TRUE)
dat <- dat[!(hla_is_placeholder | hla_is_combined), ]
table(dat$HLA)


   A*01:01       A*02    A*02:01       A*11    A*24:02  A-A*01:01  A-A*02:01 
        53       3923        626         16        106        199         14 
    A-B*08  A-B*08:01     A-B*18       B*07    B*07:02       B*08    B*08:01 
        20         34          8         14        108         46         99 
      B*15       B*27    B*27:05    B*35:01    B*35:08       B*42    B*42:01 
         5         16         92         55         34          1        306 
   B*44:05       B*57    B*57:01    B*57:03 DPB1*02:01        DQ2  DR3*02:02 
         1         33         31         49         14         10          3 
DRB1*04:01 DRB1*15:03 
        20          7 

In [22]:
# Normalise allele names: remove a leading "A-" (present on values such as
# "A-B*08" and "A-A*01:01" in the preceding table) and prepend "HLA-".
# The pattern is anchored to the start and applied once per value, so an
# "A-" occurring elsewhere inside a name is never deleted by accident.
dat$HLA <- sub("^A-", "", dat$HLA)
dat$HLA <- paste0("HLA-", dat$HLA)

In [23]:
# Row count per normalised HLA allele name.
table(dat[["HLA"]])


   HLA-A*01:01       HLA-A*02    HLA-A*02:01       HLA-A*11    HLA-A*24:02 
           252           3923            640             16            106 
      HLA-B*07    HLA-B*07:02       HLA-B*08    HLA-B*08:01       HLA-B*15 
            14            108             66            133              5 
      HLA-B*18       HLA-B*27    HLA-B*27:05    HLA-B*35:01    HLA-B*35:08 
             8             16             92             55             34 
      HLA-B*42    HLA-B*42:01    HLA-B*44:05       HLA-B*57    HLA-B*57:01 
             1            306              1             33             31 
   HLA-B*57:03 HLA-DPB1*02:01        HLA-DQ2  HLA-DR3*02:02 HLA-DRB1*04:01 
            49             14             10              3             20 
HLA-DRB1*15:03 
             7 

In [24]:
# Reference allele list, grouped by locus, taken from
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7204072/
# NOTE(review): the original note reads "filtering alleles having AF < 10";
# confirm the unit and the direction of that cut-off against the paper.
viet_alleles <- c(
  # HLA-A
  "HLA-A*02:03", "HLA-A*02:07", "HLA-A*11:01", "HLA-A*24:02",
  "HLA-A*29:01", "HLA-A*33:03",
  # HLA-B
  "HLA-B*07:05", "HLA-B*15:02", "HLA-B*15:25", "HLA-B*38:02",
  "HLA-B*40:01:02", "HLA-B*46:01", "HLA-B*58:01",
  # HLA-C
  "HLA-C*01:02", "HLA-C*03:02:02", "HLA-C*03:04", "HLA-C*04:01",
  "HLA-C*04:03", "HLA-C*07:02", "HLA-C*08:01", "HLA-C*15:05:02",
  # HLA-DRB1
  "HLA-DRB1*03:01", "HLA-DRB1*04:05", "HLA-DRB1*08:03", "HLA-DRB1*09:01:02",
  "HLA-DRB1*10:01", "HLA-DRB1*12:02", "HLA-DRB1*15:02",
  # HLA-DQB1
  "HLA-DQB1*02:01", "HLA-DQB1*03:01", "HLA-DQB1*03:03", "HLA-DQB1*04:01",
  "HLA-DQB1*05:01", "HLA-DQB1*05:02", "HLA-DQB1*06:01"
)

In [25]:
# Distinct normalised allele names present in the data set.
dat_alleles <- unique(dat[["HLA"]])

In [26]:
# Display the distinct allele names found in the data set.
dat_alleles

In [27]:
# Alleles that appear verbatim in both the reference list and the data set,
# followed by the number of data rows carrying each of them.
shared_alleles <- intersect(viet_alleles, dat_alleles)
shared_alleles
table(dat[dat$HLA %in% shared_alleles, "HLA"])

HLA
HLA-A*24:02 
        106 

In [33]:
# Reference alleles that contain some data-set allele as a substring without
# being an exact member of the data set (e.g. a reference "HLA-A*02:03"
# against a data-set "HLA-A*02").
# NOTE(review): str_escape() is not defined in this file; presumably it
# escapes regex metacharacters such as "*" in the allele names -- confirm.
dat_allele_regex <- paste(str_escape(dat_alleles), collapse = "|")
contains_dat_allele <- grepl(dat_allele_regex, viet_alleles)
is_exact_dat_allele <- viet_alleles %in% dat_alleles
viet_over_dat_alleles <- viet_alleles[contains_dat_allele & !is_exact_dat_allele]
viet_over_dat_alleles

# Strip the final ":xx" field from those alleles and count the data rows whose
# HLA equals the shortened name, overall and broken down by disease.
viet_coarse_alleles <- gsub("[:]..$", "", viet_over_dat_alleles)
rows_viet_coarse <- dat$HLA %in% viet_coarse_alleles
table(dat[rows_viet_coarse, "HLA"])
table(dat[rows_viet_coarse, c("HLA", "Disease.name")])

Remember to cat() to see result with single escapes.



HLA
HLA-A*02 HLA-A*11 HLA-B*07 HLA-B*15 
    3923       16       14        5 

          Disease.name
HLA         CMV  EBV Human immunodeficiency virus (HIV)
  HLA-A*02 3912   11                                  0
  HLA-A*11    0   16                                  0
  HLA-B*07   14    0                                  0
  HLA-B*15    0    0                                  5

In [34]:
# Data-set alleles that contain some reference allele as a substring without
# being an exact member of the reference list (the mirror image of
# viet_over_dat_alleles).
# NOTE(review): str_escape() is not defined in this file; presumably it
# escapes regex metacharacters such as "*" in the allele names -- confirm.
viet_allele_regex <- paste(str_escape(viet_alleles), collapse = "|")
contains_viet_allele <- grepl(viet_allele_regex, dat_alleles)
is_exact_viet_allele <- dat_alleles %in% viet_alleles
dat_over_viet_alleles <- dat_alleles[contains_viet_allele & !is_exact_viet_allele]
dat_over_viet_alleles

# Strip the final ":xx" field from those alleles and count the data rows whose
# HLA equals the shortened name, overall and broken down by disease.
dat_coarse_alleles <- gsub("[:]..$", "", dat_over_viet_alleles)
rows_dat_coarse <- dat$HLA %in% dat_coarse_alleles
table(dat[rows_dat_coarse, "HLA"])
table(dat[rows_dat_coarse, c("HLA", "Disease.name")])

Remember to cat() to see result with single escapes.



< table of extent 0 >

< table of extent 0 x 0 >

In [36]:
# Final table: CDR3, HLA and epitope only, with duplicated rows collapsed.
final_cols <- c("CDR3.beta.aa", "HLA", "Antigen.sequence")
dat <- unique(dat[, final_cols])
dim(dat)

In [38]:
# Write the final table to 'TBAdb-analyzed.csv' as comma-separated text,
# overwriting any existing file, without row names or quoting.
# Logical arguments are spelled out as FALSE: `F` is an ordinary variable
# that can be reassigned, which would silently change the output format.
write.table(
  dat, "TBAdb-analyzed.csv",
  quote = FALSE, append = FALSE, row.names = FALSE, sep = ","
)