# Create custom training set

In [None]:
# ---- Run configuration ----

# Seed used for the random number generator and embedded in the output name
seed <- 42

# How many samples to draw, and whether the draw is weighted or purely random
weighted_sampling <- TRUE
num_samples <- 20000

# Entities to annotate (one switch per NER entity)
train_ner_version <- FALSE
train_ner_product <- TRUE
train_ner_vendor  <- TRUE

# File and folder names, relative to the notebook location
filename_dfcpes <- "df_cpes.rds"
folder_trainsets <- "trainsets"
folder_dataraw <- "rawdata"
folder_datasets <- "../datasets"

# Location of the data frame with parsed CPEs (shown as the cell output)
path_dfcpes <- file.path(folder_datasets, folder_dataraw, filename_dfcpes)
path_dfcpes

In [None]:
# Short code describing which entities are annotated; it becomes part of the
# training-set file name. The lookup key is the vendor/product/version
# switches written as 0/1 digits, e.g. "110" = vendor + product.
ner_flags <- as.logical(c(train_ner_vendor,
                          train_ner_product,
                          train_ner_version))
stopifnot(length(ner_flags) == 3L, !anyNA(ner_flags))

str_bio <- switch(
  paste(as.integer(ner_flags), collapse = ""),
  "111" = "vpv",
  "110" = "vp",
  "011" = "pv",
  "101" = "vv",
  "100" = "vend",
  "010" = "prod",
  "001" = "vers",
  "BAD"
)
if (identical(str_bio, "BAD")) {
  # Keep the historical "BAD" tag, but make the misconfiguration visible.
  warning("No NER entity selected; the training set name is tagged \"BAD\".",
          call. = FALSE)
}

# Sample-size label: thousands are abbreviated and rounded ("20k"), smaller
# sizes are written out in full. Plain if/else because the test is a scalar.
size_label <- if (num_samples >= 1000) {
  paste0(round(num_samples / 1000, 0), "k")
} else {
  as.character(num_samples)
}
sampling_label <- if (weighted_sampling) "wgh" else "rnd"

# NOTE(review): the seed is appended directly after the sampling label with no
# separator (e.g. "..._wgh42.csv.gz"); kept as is so existing file names
# still match.
filename_trainset <- paste0(
  paste("train_cpener", str_bio, size_label, sampling_label, sep = "_"),
  seed,
  ".csv.gz"
)
path_trainset <- file.path(folder_datasets, folder_trainsets, filename_trainset)
path_trainset

# Requirements

In [None]:
# Attach the nist package, whose functions are called with the nist:: prefix
# in the cells below.
library(nist)
# Attach dplyr (used for %>%, select and sample_n) without printing its
# startup messages.
suppressPackageStartupMessages(library(dplyr))

# Seed the random number generator with the configured seed so that the
# random draws made after this point are repeatable.
set.seed(seed)

# Load official CPE dataset

In [None]:
# Load the parsed CPE data frame from its cached .rds file when it exists;
# otherwise build it through the nist package and cache it for the next run.
if (file.exists(path_dfcpes)) {
  df_cpes <- readRDS(path_dfcpes)
} else {
  # One recursive call creates the datasets folder and the raw-data subfolder
  # together, including any missing parent directories.
  dir_rawdata <- file.path(folder_datasets, folder_dataraw)
  if (!dir.exists(dir_rawdata)) {
    dir.create(dir_rawdata, recursive = TRUE)
  }
  # presumably getLatestdata() fetches the official CPE file into the given
  # folder and returns its path - TODO confirm against the nist package
  cpes_file <- nist::getLatestdata(dir_rawdata)
  df_cpes <- nist::cpes_etl(cpes_file)
  saveRDS(df_cpes, path_dfcpes)
}


In [None]:
# Preview five randomly chosen rows of the CPE data frame.
# Note: sample_n() draws from the session RNG, so every run of this line
# advances the random state.
View(df_cpes %>% sample_n(5))

# Annotate

In [None]:
# Run the nist NER annotation over the CPE data frame, forwarding the three
# entity switches from the configuration as the vendor/product/version
# arguments.
df_cpe_tags <- nist::cpeNERannotate(cpes = df_cpes,
                                    vendor = train_ner_vendor,
                                    product = train_ner_product,
                                    version = train_ner_version)
# Preview five randomly chosen annotated rows (this draws from the session
# RNG).
View(df_cpe_tags %>% sample_n(5))

# Sampling

In [None]:
# Re-seed immediately before sampling. The preview cells call sample_n(),
# which advances the RNG, so without this the sampled training set would
# depend on which previews were run (and how many times) rather than on
# `seed` alone.
# NOTE(review): assumes getCPEsample() draws from R's session RNG - confirm.
set.seed(seed)
df_train <- nist::getCPEsample(df_cpe_tags, num_samples, weighted_sampling)

# Drop the columns of the entities that were not selected for annotation.
# all_of() keeps the original strictness: a missing column is an error.
unused_cols <- c("vendor", "product", "version")[
  !c(train_ner_vendor, train_ner_product, train_ner_version)
]
if (length(unused_cols) > 0) {
  df_train <- df_train %>% select(-all_of(unused_cols))
}

# Preview five randomly chosen rows of the training set.
View(df_train %>% sample_n(5))

# Save trainset

In [None]:
# Write the training set as a gzip-compressed CSV, overwriting any existing
# file at the same path. The output folder is created first when missing;
# recursive = TRUE also creates any missing parent directories.
dir_trainset <- dirname(path_trainset)
if (!dir.exists(dir_trainset)) {
  dir.create(dir_trainset, recursive = TRUE)
}
write.csv(df_train, file = gzfile(path_trainset), row.names = FALSE)

# Show where the file was written (cell output).
path_trainset