In [1]:
library(data.table)
library(conveniencefunctions)
library(VennDiagram)

Loading required package: ggplot2



|Function                      |Shortcut         |
|:-----------------------------|:----------------|
|alignAssign                   |Ctrl+Q           |
|alignCursor                   |Ctrl+Shift+Q     |
|toggle_subsection             |Ctrl+Y           |
|toggle_subsubsection          |Ctrl+Shift+Y     |
|insertFormals                 |Ctrl+Shift+Alt+I |
|assignFormals                 |Ctrl+Shift+Alt+O |
|fixCommas                     |Ctrl+Alt+Z       |
|exposeAsArgument              |Ctrl+Alt+C       |
|toggle_mclapply               |Ctrl+Shift+G     |
|insert_debugonce              |Ctrl+Shift+Alt+G |
|transform_subsection          |Ctrl+J           |
|initiate_or_delete_subsection |Ctrl+Shift+J     |
|renumber_sections             |Ctrl+Shift+R     |
|insert_loopdebugger           |Ctrl+Shift+I     |
|toggle_blabla                 |Ctrl+Alt+O       |
|extract_importFrom            |Ctrl+Alt+I       |
|refactor_functionCall         |Ctrl+Alt+U       |
|insertHistory                 

Loading required package: grid

Loading required package: futile.logger



# Load data and check for exceptions

In [2]:
# Read the McPAS-TCR export as a plain data.frame (not a data.table), so the
# base-R `dat[rows, cols]` indexing used in the following cells applies.
# FALSE is spelled out: `F` is an ordinary variable that can be reassigned.
dat <- fread('McPAS-TCR.csv', data.table = FALSE)
# Quick look at the size and the first rows of the raw table
dim(dat)
head(dat)

Unnamed: 0_level_0,CDR3.alpha.aa,CDR3.beta.aa,Species,Category,Pathology,Pathology.Mesh.ID,Additional.study.details,Antigen.identification.method,Single.cell,NGS,⋯,TRAV,TRAJ,TRBV,TRBD,TRBJ,Reconstructed.J.annotation,CDR3.beta.nt,Mouse.strain,PubMed.ID,Remarks
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<chr>,<chr>,⋯,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<int>,<chr>
1,,CASSDAGANTEVF,Mouse,Pathogens,Lymphocytic choriomeningitis virus (LCMV),D008217,,2.1,No,No,⋯,,,TRBV8-1,,TRBJ1-1,No,,P14,1716213,
2,,CASSDAGAYAEQF,Mouse,Pathogens,Lymphocytic choriomeningitis virus (LCMV),D008217,,2.1,No,No,⋯,,,TRBV8-1,,TRBJ2-1,No,,P14,1716213,
3,,CASSDAGGAAEVF,Mouse,Pathogens,Lymphocytic choriomeningitis virus (LCMV),D008217,,2.1,No,No,⋯,,,TRBV8-3,,TRBJ1-1,No,,P14,1716213,
4,,CASSDAGHSPLYF,Mouse,Pathogens,Lymphocytic choriomeningitis virus (LCMV),D008217,,2.1,No,No,⋯,,,TRBV8-1,,TRBJ1-6,No,,P14,1716213,
5,,CASSDAWGGAEQYF,Mouse,Pathogens,Lymphocytic choriomeningitis virus (LCMV),D008217,,2.1,No,No,⋯,,,TRBV8-3,,TRBJ2-6,No,,P14,1716213,
6,,CASSDGANTEVF,Mouse,Pathogens,Lymphocytic choriomeningitis virus (LCMV),D008217,,2.1,No,No,⋯,,,TRBV8-1,,TRBJ1-1,No,,P14,1716213,


In [3]:
# Trim one leading "C" and then one trailing "F" from every beta-chain CDR3.
# Both patterns are anchored, so each can match at most once per string;
# NA entries remain NA.
dat$CDR3.beta.aa <- sub('F$', '', sub('^C', '', dat$CDR3.beta.aa))

In [4]:
# TRUE if at least one cell anywhere in the table is NA
any(is.na(dat))

In [5]:
# Check for punctuation characters in the CDR3 and epitope sequences.
# Each expression is TRUE if at least one entry matches; grepl() returns
# FALSE for NA entries, so missing values are not flagged by this check.
any(grepl("[[:punct:]]", dat$CDR3.beta.aa))
any(grepl("[[:punct:]]", dat$Epitope.peptide))

In [6]:
# Check for space characters in the CDR3 and epitope sequences
# (a literal " " is not covered by the [[:punct:]] check).
any(grepl(" ", dat$CDR3.beta.aa))
any(grepl(" ", dat$Epitope.peptide))

In [7]:
# Drop rows whose CDR3 beta sequence is unusable: missing, containing
# punctuation, or containing lowercase letters. A row is removed as soon as
# any one of the three conditions holds.
dat <- dat[!(is.na(dat$CDR3.beta.aa) |
             grepl("[[:punct:]]", dat$CDR3.beta.aa) |
             grepl('[a-z]', dat$CDR3.beta.aa)), ]
# Table size after the CDR3 clean-up
dim(dat)

In [8]:
# List the distinct Species labels present before filtering on species
unique(dat$Species)

In [9]:
# Discard rows with a missing Species or Species equal to 'Mouse', and keep
# only the six columns used downstream. Any other Species label is retained.
dat <- dat[
  !(is.na(dat$Species) | dat$Species == 'Mouse'),
  c('CDR3.beta.aa', 'Species', 'Category', 'Pathology', 'Epitope.peptide', 'MHC')
]

In [10]:
# Table size after the species filter and column selection
dim(dat)

In [11]:
# Preview the first rows of the reduced table
head(dat)

Unnamed: 0_level_0,CDR3.beta.aa,Species,Category,Pathology,Epitope.peptide,MHC
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
75,AAGETSGVSYNEQ,Human,Cancer,Melanoma,,HLA-A*02
76,ASRPTITVPYSNQPQH,Human,Cancer,Melanoma,,HLA-A*01
77,ASRPTITVPYSNQPQH,Human,Cancer,Melanoma,,HLA-A*01
78,ASSLVVWDRGGNOPQH,Human,Cancer,Melanoma,,HLA-A*02
79,ASSQDLLSWDEQ,Human,Cancer,Melanoma,,HLA-A*02
93,ASSLGNEQ,Human,Cancer,Melanoma,EAAGIGILTV,HLA-A*02


In [12]:
# Drop rows whose epitope is missing or contains punctuation, then collapse
# exact duplicate rows.
dat <- dat[!(is.na(dat$Epitope.peptide) |
             grepl("[[:punct:]]", dat$Epitope.peptide)), ]
dat <- unique(dat)
# Table size after the epitope clean-up
dim(dat)

In [13]:
# Preview the first rows after the epitope filter
head(dat)

Unnamed: 0_level_0,CDR3.beta.aa,Species,Category,Pathology,Epitope.peptide,MHC
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
93,ASSLGNEQ,Human,Cancer,Melanoma,EAAGIGILTV,HLA-A*02
94,ASSLGVATGEL,Human,Cancer,Melanoma,EAAGIGILTV,HLA-A*02
95,ASSQEEGGGSWGNTIY,Human,Cancer,Melanoma,EAAGIGILTV,HLA-A*02
96,ASSQEGLAGASQY,Human,Cancer,Melanoma,EAAGIGILTV,HLA-A*02
97,ASSQETDIVFNOPQH,Human,Cancer,Melanoma,EAAGIGILTV,HLA-A*02
98,ASSQGQLTDTQY,Human,Cancer,Melanoma,EAAGIGILTV,HLA-A*02


In [14]:
# Write the cleaned table as comma-separated text, overwriting any existing
# file (append = FALSE), without row names.
# TRUE/FALSE are spelled out: `T`/`F` are ordinary variables that can be
# reassigned.
# NOTE(review): with quote = FALSE a field that itself contains a comma would
# shift columns in the output - confirm that no Category/Pathology value does.
write.table(dat, 'McPAS-loaded.csv',
            quote = FALSE, row.names = FALSE, sep = ',', append = FALSE)

# Analysis

In [15]:
# List the distinct Pathology labels in the cleaned table
unique(dat$Pathology)

In [16]:
# List the distinct Category labels in the cleaned table
unique(dat$Category)

In [17]:
# Preview the table at the start of the analysis
head(dat)

Unnamed: 0_level_0,CDR3.beta.aa,Species,Category,Pathology,Epitope.peptide,MHC
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
93,ASSLGNEQ,Human,Cancer,Melanoma,EAAGIGILTV,HLA-A*02
94,ASSLGVATGEL,Human,Cancer,Melanoma,EAAGIGILTV,HLA-A*02
95,ASSQEEGGGSWGNTIY,Human,Cancer,Melanoma,EAAGIGILTV,HLA-A*02
96,ASSQEGLAGASQY,Human,Cancer,Melanoma,EAAGIGILTV,HLA-A*02
97,ASSQETDIVFNOPQH,Human,Cancer,Melanoma,EAAGIGILTV,HLA-A*02
98,ASSQGQLTDTQY,Human,Cancer,Melanoma,EAAGIGILTV,HLA-A*02


In [18]:
# Narrow the table to the four columns needed for the analysis (Species and
# Category are dropped), then remove rows that became exact duplicates.
dat <- dat[, c('CDR3.beta.aa', 'MHC', 'Epitope.peptide', 'Pathology')]
dat <- unique(dat)
# Table size after de-duplication
dim(dat)

In [19]:
# Number of distinct MHC labels, followed by the labels in sorted order.
# unique() keeps NA as a value while sort() drops NA by default, so the count
# can be one larger than the sorted listing if MHC contains missing values.
length(unique(dat$MHC))
sort(unique(dat$MHC))

In [20]:
# Row count per MHC label, ordered from rarest to most frequent
# (table() leaves NA out by default).
sort(table(dat$MHC))


     DQ8-trans          H-2Kb    HLA-A*02:02    HLA-A*02:03    HLA-A*02:04 
             1              1              1              1              1 
   HLA-A*02:05    HLA-A*02:06    HLA-A*02:07    HLA-A*02:08    HLA-A*02:09 
             1              1              1              1              1 
   HLA-A*02:10    HLA-A*02:11    HLA-A*02:12    HLA-A*02:16    HLA-A*02:17 
             1              1              1              1              1 
      HLA-B*42       HLA-A*01    HLA-A*02:13    HLA-A*02:14    HLA-A*02:15 
             1              2              2              2              2 
   HLA-B*35:02    HLA-B*44:05      DR3*02:02        HLA-B*8  HLA-Cw* 16:01 
             2              2              3              3              4 
      HLA-B*15     DRB1*04:01     DRB1*15:03     HLA-A*2:01      HLA-DQ2.5 
             5              7              7              7              7 
       HLA-DQ8 HLA-DRB1*04:01 HLA-DPB1*02:01       HLA-DRB1      HLA-A*011 
           

In [21]:
# HLA alleles are not needed for CDR3-epitope prediction, so this table is
# taken here, before any MHC-based filtering: rows with a missing or unclear
# MHC annotation are kept. (Rows with a missing or punctuated epitope were
# already removed earlier.) Only the distinct CDR3/epitope pairs are retained.
dat_wohla <- unique(dat[, c('CDR3.beta.aa', 'Epitope.peptide')])
dim(dat_wohla)

In [22]:
# Preview the CDR3/epitope pairs, then write them as comma-separated text,
# overwriting any existing file (append = FALSE), without row names.
# TRUE/FALSE are spelled out: `T`/`F` are ordinary variables that can be
# reassigned.
head(dat_wohla)
write.table(dat_wohla, 'McPAS-without-HLA.csv',
            quote = FALSE, row.names = FALSE, append = FALSE, sep = ',')

Unnamed: 0_level_0,CDR3.beta.aa,Epitope.peptide
Unnamed: 0_level_1,<chr>,<chr>
93,ASSLGNEQ,EAAGIGILTV
94,ASSLGVATGEL,EAAGIGILTV
95,ASSQEEGGGSWGNTIY,EAAGIGILTV
96,ASSQEGLAGASQY,EAAGIGILTV
97,ASSQETDIVFNOPQH,EAAGIGILTV
98,ASSQGQLTDTQY,EAAGIGILTV


In [23]:
# Discard rows whose MHC annotation is missing, is the placeholder '-', or
# contains a '/'.
dat <- dat[!(is.na(dat$MHC) | dat$MHC == '-' | grepl('[/]', dat$MHC)), ]
# Row count per remaining MHC label
table(dat$MHC)


     DQ8-trans      DR3*02:02     DRB1*04-01     DRB1*04:01     DRB1*15:03 
             1              3            364              7              7 
         H-2Kb       HLA-A*01      HLA-A*011    HLA-A*01:01       HLA-A*02 
             1              2             15             52            391 
   HLA-A*02:01    HLA-A*02:02    HLA-A*02:03    HLA-A*02:04    HLA-A*02:05 
          1284              1              1              1              1 
   HLA-A*02:06    HLA-A*02:07    HLA-A*02:08    HLA-A*02:09    HLA-A*02:10 
             1              1              1              1              1 
   HLA-A*02:11    HLA-A*02:12    HLA-A*02:13    HLA-A*02:14    HLA-A*02:15 
             1              1              2              2              2 
   HLA-A*02:16    HLA-A*02:17    HLA-A*11:01    HLA-A*24:02     HLA-A*2:01 
             1              1             15            203              7 
        HLA-A1         HLA-A2      HLA-A2:01    HLA-B*07:02       HLA-B*08 
           

In [24]:
# Give every MHC label exactly one 'HLA-' prefix: remove every 'HLA-' already
# in the label, then prepend a fresh one. Labels that had no prefix
# (e.g. DRB1*04:01) gain one.
# NOTE(review): this also prefixes non-HLA labels still present at this point
# (the preceding table lists H-2Kb) - confirm they are handled afterwards.
dat$MHC <- paste0('HLA-', gsub('HLA[-]', '', dat$MHC))

In [25]:
# Row count per MHC label after the 'HLA-' prefix normalization
table(dat$MHC)


      HLA-A*01      HLA-A*011    HLA-A*01:01       HLA-A*02    HLA-A*02:01 
             2             15             52            391           1284 
   HLA-A*02:02    HLA-A*02:03    HLA-A*02:04    HLA-A*02:05    HLA-A*02:06 
             1              1              1              1              1 
   HLA-A*02:07    HLA-A*02:08    HLA-A*02:09    HLA-A*02:10    HLA-A*02:11 
             1              1              1              1              1 
   HLA-A*02:12    HLA-A*02:13    HLA-A*02:14    HLA-A*02:15    HLA-A*02:16 
             1              2              2              2              1 
   HLA-A*02:17    HLA-A*11:01    HLA-A*24:02     HLA-A*2:01         HLA-A1 
             1             15            203              7            297 
        HLA-A2      HLA-A2:01    HLA-B*07:02       HLA-B*08    HLA-B*08:01 
          2756            191            466             46            208 
      HLA-B*15       HLA-B*27    HLA-B*27:05    HLA-B*35:01    HLA-B*35:02 
           

In [32]:
# Keep only the CDR3 / MHC / epitope triplet (Pathology is dropped) and
# remove rows that became exact duplicates.
dat <- dat[, c('CDR3.beta.aa', 'MHC', 'Epitope.peptide')]
dat <- unique(dat)
# Size of the final table
dim(dat)

In [33]:
# Write the final CDR3 / MHC / epitope table as comma-separated text,
# overwriting any existing file (append = FALSE), without row names.
# TRUE/FALSE are spelled out: `T`/`F` are ordinary variables that can be
# reassigned.
write.table(dat, 'McPAS-analyzed.csv',
            quote = FALSE, append = FALSE, row.names = FALSE, sep = ',')