In [200]:
# Set path to articles folder
# here("articles") builds the path to the "articles" folder via the here
# package (resolved relative to the project root that here detects).
library(here)
folder_path <- here("articles")
# NOTE(review): setwd() changes the working directory for the rest of the
# session. The later list.files() call (which passes no path) and the
# readLines() call on bare file names both resolve against this directory.
setwd(folder_path)

In [201]:
# Collect the names of the .txt files in the current working directory;
# these are the articles the corpus is built from
file_list <- list.files(path = ".", pattern = "\\.txt$")

In [229]:
# Read a text file and return its whole content as a single string.
#
# file: path of the file to read (anything readLines() accepts).
# Returns a length-one character vector: the file's lines joined by a single
# space, so every line break in the file becomes a space.
read_file_content <- function(file) {
    # warn = FALSE silences only readLines()'s own warnings about a missing
    # final newline or embedded nuls. Any other warning stays visible, which a
    # blanket suppressWarnings() would have hidden.
    lines <- readLines(file, warn = FALSE)
    paste(lines, collapse = " ")
}

In [234]:
# Pull the article title out of the raw file text.
#
# The title is whatever precedes the "@BODY=" marker, with the "@TITLE=" tag
# removed and its whitespace normalised.
extract_title <- function(file_content) {
    header <- sub("@BODY=.*", "", file_content)
    untagged <- sub("@TITLE=", "", header)
    trimmed <- gsub("(^\\s+|\\s+$)", "", untagged) # strip leading/trailing whitespace
    gsub("\\s+", " ", trimmed) # collapse each whitespace run to one space
}

In [236]:
# Return the article body: the text that follows the "@BODY=" marker.
#
# file: the text to search (the script passes the file's content here, not a
#       path). Text without the marker is returned unchanged.
extract_body_text <- function(file) {
    sub(".*@BODY=", "", file)
}

In [241]:
# Remove punctuation/special characters, a.k.a., ONLY KEEP LETTERS (and spaces)
#
# body: character vector of text to clean.
# Returns the text with hyphens turned into spaces, every character that is
# neither a letter nor whitespace removed, and whitespace runs collapsed to a
# single space. Leading/trailing spaces are not trimmed and case is unchanged.
keep_letters <- function(body) {
    # Work on the argument itself; the previous version read a global variable
    # (`body_lowercase`) and ignored whatever the caller passed in.
    letters_only <- gsub("-", " ", body) # "-" becomes " " so hyphenated words split
    letters_only <- gsub("[^[:alpha:][:space:]]", "", letters_only)
    gsub("\\s+", " ", letters_only) # remove multiple sequential spaces
}

In [244]:
# Extract the words (tokenize)
#
# body_cleaned: cleaned text whose words are separated by single spaces.
# Returns a character vector with one element per space-separated piece.
# Because of how strsplit() works, a leading space in the input produces an
# empty string as the first token.
tokenize <- function(body_cleaned) {
    # Split the argument itself; the previous version read a global variable
    # (`body_letters_only`) and ignored whatever the caller passed in.
    unlist(strsplit(body_cleaned, " "))
}

In [245]:
# Remove stop words from body
#
# body_tokenized: character vector of word tokens.
# Returns the tokens that are not in stopwords("english"), in their original
# order.
# NOTE(review): stopwords() is not defined or loaded in the visible code;
# presumably it comes from a package attached elsewhere (e.g. tm) - confirm.
remove_stop_words <- function(body_tokenized) {
    stop_list <- stopwords("english")
    # Filter the argument itself; the previous version read a global variable
    # (`body_words`) and ignored whatever the caller passed in.
    body_tokenized[!body_tokenized %in% stop_list]
}

In [248]:
# Display the first file name in the list as a quick sanity check
file_list[1]

In [247]:
# Get data from a file
# Runs the whole extraction pipeline on one file. Every result is stored in a
# top-level variable; `ingest` is read again by the later issue/date cells.

i <- 1 # index into file_list of the file to process
ingest <- file_list[i] # file name, resolved against the working directory

# Whole file as one string (lines joined by spaces)
file_content <- read_file_content(ingest)

# Get title
title <- extract_title(file_content)

# Get body
# Each step below feeds the next one, so the order must be kept
body <- extract_body_text(file_content)
body_lowercase <- tolower(body) # Make the body text lowercase
body_cleaned <- keep_letters(body_lowercase)
body_tokenized <- tokenize(body_cleaned)
body_stop_words_removed <- remove_stop_words(body_tokenized)

In [209]:
# Extract the issue from a 3 digit format, e.g. 096 for issue 96, 112 for 112, etc.
# The issue is the first "_"-separated field of the file name and is kept as a
# character string.
issue <- strsplit(ingest, "_")[[1]][1]
# Strip every leading zero, not just the first, so "007" becomes "7" rather
# than "07". The lookahead keeps the final character, so "000" becomes "0"
# instead of an empty string.
issue <- sub("^0+(?=.)", "", issue, perl = TRUE)

In [216]:
# Parse the publication date from the second "_"-separated field of the file
# name, which is written as mm-dd-yyyy
date <- as.Date(
  strsplit(ingest, "_")[[1]][2],
  format = "%m-%d-%Y"
)