In [9]:
library(openxlsx)
library(readxl)
library(tidyr)
library(dplyr)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




In [10]:
dir.create('./data/in', recursive = T, showWarnings = F)
dir.create('./data/outs', recursive = T, showWarnings = F)

# Reading

# single sheets
Huang, D., Zhu, X., Ye, S. et al. Tumour circular RNAs elicit anti-tumour immunity by encoding cryptic peptides. Nature 625, 593–602 (2024). https://doi.org/10.1038/s41586-023-06834-7

In [12]:
canonical_antigens <- read_xlsx('./data/in/canonical_antigens_breast_cancer.xlsx', skip = 1, trim_ws = T)
head(canonical_antigens, 2)
tail(canonical_antigens, 2)

Patient No.,Sequence1,Source2,Gene type3,Peptide intensity4,Predicted percentile rank5
<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>
P2,LLEGEETR,NEFL,mRNA,1713818.8,19
,YEEEVLSR,NEFL,mRNA,490323.8,21


Patient No.,Sequence1,Source2,Gene type3,Peptide intensity4,Predicted percentile rank5
<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>
,SGVSLAALKK,H1-3,mRNA,10385236.0,7.9
"1 The peptide sequences detected in HLA-I peptidome, determined by HLA immunoprecipitation followed by mass spectrometry (MS). 2The MS results were matched with the UniProt database, the whole-exome sequencing (WES) as well as the RNA sequencing (RNA-seq) data of the breast tumour and their adjacent normal breast samples, respectively. The criteria were defined below: (1) Higher expression in breast tumour samples than in the paired normal breast tissues (read count > 5, fold change ³ 2). (2) Somatic single-nucleotide variant (SNV) and Indel were discovered according to Genome Analysis Toolkit (GATK, https://gatk.broadinstitute.org/hc/en-us) workflow. 3 The peptide intensities of HLA-I peptides determined by MS indicate the abundance of peptides presented on HLA-I.4 The MHC-I binding predictions were made using the IEDB analysis resource NetMHCpan (ver. 4.1) tool and the threshold for binding was set as rank < 2% to include weak binders.",,,,,


In [4]:
dim(canonical_antigens)

In [13]:
# clean the dat
canonical_antigens <- canonical_antigens[1:240, , drop = F]
tail(canonical_antigens, 2)

Patient No.,Sequence1,Source2,Gene type3,Peptide intensity4,Predicted percentile rank5
<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>
,CCTESLVNR,ALB,mRNA,906300.3,1.5
,SGVSLAALKK,H1-3,mRNA,10385236.0,7.9


In [14]:
# Change the na to its previous patient No.
canonical_antigens <- canonical_antigens %>% fill('Patient No.', .direction = 'down')
head(canonical_antigens, 2)
tail(canonical_antigens, 2)

Patient No.,Sequence1,Source2,Gene type3,Peptide intensity4,Predicted percentile rank5
<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>
P2,LLEGEETR,NEFL,mRNA,1713818.8,19
P2,YEEEVLSR,NEFL,mRNA,490323.8,21


Patient No.,Sequence1,Source2,Gene type3,Peptide intensity4,Predicted percentile rank5
<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>
P10,CCTESLVNR,ALB,mRNA,906300.3,1.5
P10,SGVSLAALKK,H1-3,mRNA,10385236.0,7.9


In [19]:
# cryptic peptides from rna-depleted rna-seq data
cryptic_peptides <- read_excel('./data/in/cryptic_antigen.xlsx', trim_ws = T, skip = 1)
head(cryptic_peptides, 2)
tail(cryptic_peptides, 2)
dim(cryptic_peptides)

Patient No.,Sequence1,Source2,Gene type,Peptide intensity3,Predicted percentile rank4
<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>
P1,RTAHYGTGR,circFAM53B,circRNA,18200004.5,0.19
,LSQNNFALGYK,circVDAC3,circRNA,721479.4,1.5


Patient No.,Sequence1,Source2,Gene type,Peptide intensity3,Predicted percentile rank4
<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>
P12,RQIGSGAEK,circPRRC2B,circRNA,66511858.0,4.0
"1 The peptide sequences detected in HLA-I peptidome, determined by HLA immunoprecipitation followed by mass spectrometry (MS). 2 The MS results were matched with the circRNA profiles obtained from ribosomal RNA-depleted RNA-sequencing (rRNA-depleted-RNA-Seq), as defined below: (1) CircRNA read count in the tumour samples > 5. (2) Higher expression in breast tumour samples than in the paired normal breast tissues (log2fold change ³ 1). (3) CircRNAs have conserved ORF and the length of ORF < the length of circRNA. (4) The neo-peptides formed by the ORF of circRNAs were distinct from the linear counterparts (neo-ORF). 3 The peptide intensities of HLA-I peptides determined by MS indicate the abundance of peptides presented on HLA-I. 4 The MHC-I binding predictions were made using the IEDB analysis resource NetMHCpan (ver. 4.1) tool and the threshold for binding was set as rank < 2% to include weak binders.",,,,,


In [20]:
cryptic_peptides <- cryptic_peptides[1:8, , drop = F]
cryptic_peptides <- cryptic_peptides %>% fill('Patient No.', .direction = 'down')
head(cryptic_peptides, 2)
tail(cryptic_peptides, 2)

Patient No.,Sequence1,Source2,Gene type,Peptide intensity3,Predicted percentile rank4
<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>
P1,RTAHYGTGR,circFAM53B,circRNA,18200004.5,0.19
P1,LSQNNFALGYK,circVDAC3,circRNA,721479.4,1.5


Patient No.,Sequence1,Source2,Gene type,Peptide intensity3,Predicted percentile rank4
<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>
P11,IPVSQVNPRS,circCTTN,circRNA,596410.6,9.2
P12,RQIGSGAEK,circPRRC2B,circRNA,66511858.3,4.0


In [21]:
# saving them 
all_antigens <- list(canonical = canonical_antigens, cryptic = cryptic_peptides)
glimpse(all_antigens)

List of 2
 $ canonical: tibble [240 × 6] (S3: tbl_df/tbl/data.frame)
  ..$ Patient No.               : chr [1:240] "P2" "P2" "P2" "P2" ...
  ..$ Sequence1                 : chr [1:240] "LLEGEETR" "YEEEVLSR" "AQLQDLNDR" "FTVLTESAAK" ...
  ..$ Source2                   : chr [1:240] "NEFL" "NEFL" "NEFL" "NEFL" ...
  ..$ Gene type3                : chr [1:240] "mRNA" "mRNA" "mRNA" "mRNA" ...
  ..$ Peptide intensity4        : num [1:240] 1713819 490324 3245532 1095394 228702 ...
  ..$ Predicted percentile rank5: num [1:240] 19 21 3.6 1.2 5.4 1.3 2.1 1.5 5.2 3.8 ...
 $ cryptic  : tibble [8 × 6] (S3: tbl_df/tbl/data.frame)
  ..$ Patient No.               : chr [1:8] "P1" "P1" "P1" "P3" ...
  ..$ Sequence1                 : chr [1:8] "RTAHYGTGR" "LSQNNFALGYK" "GELLEAIKR" "KLDVTMEMGV" ...
  ..$ Source2                   : chr [1:8] "circFAM53B" "circVDAC3" "circSOD2" "circITFG2" ...
  ..$ Gene type                 : chr [1:8] "circRNA" "circRNA" "circRNA" "circRNA" ...
  ..$ Peptide intensity3

In [22]:
write.xlsx(all_antigens, file = './data/outs/all_antigens.xlsx')

# Writing

In [38]:
data('attitude', 'sleep', 'trees')

In [39]:
all_data <- list(attitude = attitude, 
                student_slee = sleep,
                volcano = trees)

In [40]:
library(dplyr)
glimpse(all_data)

List of 3
 $ attitude    :'data.frame':	30 obs. of  7 variables:
  ..$ rating    : num [1:30] 43 63 71 61 81 43 58 71 72 67 ...
  ..$ complaints: num [1:30] 51 64 70 63 78 55 67 75 82 61 ...
  ..$ privileges: num [1:30] 30 51 68 45 56 49 42 50 72 45 ...
  ..$ learning  : num [1:30] 39 54 69 47 66 44 56 55 67 47 ...
  ..$ raises    : num [1:30] 61 63 76 54 71 54 66 70 71 62 ...
  ..$ critical  : num [1:30] 92 73 86 84 83 49 68 66 83 80 ...
  ..$ advance   : num [1:30] 45 47 48 35 47 34 35 41 31 41 ...
 $ student_slee:'data.frame':	20 obs. of  3 variables:
  ..$ extra: num [1:20] 0.7 -1.6 -0.2 -1.2 -0.1 3.4 3.7 0.8 0 2 ...
  ..$ group: Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 1 1 ...
  ..$ ID   : Factor w/ 10 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ volcano     :'data.frame':	31 obs. of  3 variables:
  ..$ Girth : num [1:31] 8.3 8.6 8.8 10.5 10.7 10.8 11 11 11.1 11.2 ...
  ..$ Height: num [1:31] 70 65 63 72 81 83 66 75 80 75 ...
  ..$ Volume: num [1:31] 10.3 10.3 10.2 16

In [41]:
write.xlsx(all_data, './data/outs/combined_atttitude_sleep_trees_rdata.xlsx')

In [None]:
# create workbook # alternatively
wb <- createWorkbook()
addWorksheet(wb, 'Sheet1')
addWorksheet(wb, 'Sheet2')
# Write data to Sheet1
writeData(wb, sheet = "Sheet1", x = data1)

# Write data to Sheet2, starting at row 2, column 2
writeData(wb, sheet = "Sheet2", x = data2, startRow = 2, startCol = 2)

# save
saveWorkbook(wb, "my_excel_file.xlsx", overwrite = TRUE)

# reading an excel sheets with multiple tabs

In [28]:
sheet_names <- excel_sheets('./data/outs/all_antigens.xlsx')
sheet_names

In [42]:
# creating a list of df

loading_all_antigens <- setNames(lapply(X = sheet_names, FUN = function(sheet) {
     read_excel(path = './data/outs/all_antigens.xlsx', sheet = sheet)
}), sheet_names)
glimpse(loading_all_antigens)

List of 2
 $ canonical: tibble [240 × 6] (S3: tbl_df/tbl/data.frame)
  ..$ Patient No.               : chr [1:240] "P2" "P2" "P2" "P2" ...
  ..$ Sequence1                 : chr [1:240] "LLEGEETR" "YEEEVLSR" "AQLQDLNDR" "FTVLTESAAK" ...
  ..$ Source2                   : chr [1:240] "NEFL" "NEFL" "NEFL" "NEFL" ...
  ..$ Gene type3                : chr [1:240] "mRNA" "mRNA" "mRNA" "mRNA" ...
  ..$ Peptide intensity4        : num [1:240] 1713819 490324 3245532 1095394 228702 ...
  ..$ Predicted percentile rank5: num [1:240] 19 21 3.6 1.2 5.4 1.3 2.1 1.5 5.2 3.8 ...
 $ cryptic  : tibble [8 × 6] (S3: tbl_df/tbl/data.frame)
  ..$ Patient No.               : chr [1:8] "P1" "P1" "P1" "P3" ...
  ..$ Sequence1                 : chr [1:8] "RTAHYGTGR" "LSQNNFALGYK" "GELLEAIKR" "KLDVTMEMGV" ...
  ..$ Source2                   : chr [1:8] "circFAM53B" "circVDAC3" "circSOD2" "circITFG2" ...
  ..$ Gene type                 : chr [1:8] "circRNA" "circRNA" "circRNA" "circRNA" ...
  ..$ Peptide intensity3

In [31]:
cryptic_loading <- read_excel(path = './data/outs/all_antigens.xlsx', sheet = 'cryptic')
head(cryptic_loading, 2)

Patient No.,Sequence1,Source2,Gene type,Peptide intensity3,Predicted percentile rank4
<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>
P1,RTAHYGTGR,circFAM53B,circRNA,18200004.5,0.19
P1,LSQNNFALGYK,circVDAC3,circRNA,721479.4,1.5


In [44]:
cryptic_loading <- loading_all_antigens$cryptic
head(cryptic_loading, 2)

Patient No.,Sequence1,Source2,Gene type,Peptide intensity3,Predicted percentile rank4
<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>
P1,RTAHYGTGR,circFAM53B,circRNA,18200004.5,0.19
P1,LSQNNFALGYK,circVDAC3,circRNA,721479.4,1.5
