In [1]:
# Set path to articles folder
library(here)
folder_path <- here("articles") # articles is the name of the folder containing the .txt's
# NOTE: setwd() changes the working directory for every later cell. The bare
# file names gathered by list.files() below are only readable while this
# working directory stays in effect.
setwd(folder_path) # set working directory of this notebook

here() starts at /home/mdaugher/DCS-2500/vietnam



In [2]:
# Collect the names of all .txt files in the current working directory; the
# corpus is built from these files.
file_list <- list.files(path = ".", pattern = "\\.txt$")

In [3]:
# Filename: extract the ARTICLE ISSUE number from the zero-padded first field
# of the file name, e.g. "096_..." -> "96", "112_..." -> "112".
#
# file: a single file name whose first "_"-separated field is the issue.
# Returns the issue as a character string with ALL leading zeros removed
# ("005" -> "5"). A field made only of zeros collapses to "0", never to "".
get_issue <- function(file) {
    issue <- strsplit(file, "_", fixed = TRUE)[[1]][1]
    # The lookahead keeps at least one character, so "000" becomes "0".
    sub("^0+(?=.)", "", issue, perl = TRUE)
}

In [4]:
# Filename: parse the ARTICLE PUBLICATION DATE, written as mm-dd-yyyy in the
# second "_"-separated field of the file name.
#
# file: a single file name such as "096_04-26-1962_title.txt".
# Returns a Date (NA when the second field is missing or does not parse).
get_date <- function(file) {
    name_fields <- strsplit(file, "_")[[1]]
    as.Date(name_fields[2], format = "%m-%d-%Y")
}

In [5]:
# File Content: read a file and join its lines into ONE string, putting a
# single space between consecutive lines.
#
# file: path to the text file.
# Returns a length-one character vector. Warnings raised by readLines() are
# silenced.
read_file_content <- function(file) {
    lines_read <- suppressWarnings(readLines(file))
    paste(lines_read, collapse = " ")
}

In [6]:
# File Content: extract the ARTICLE TITLE, i.e. the text before the "@BODY="
# marker with the "@TITLE=" tag removed.
#
# file_content: the full text of one article as a single string.
# Returns the title with leading/trailing whitespace trimmed and every run of
# whitespace collapsed to one space.
extract_title <- function(file_content) {
    before_body <- sub("@BODY=.*", "", file_content)
    without_tag <- sub("@TITLE=", "", before_body)
    trimmed <- gsub("(^\\s+|\\s+$)", "", without_tag)
    gsub("\\s+", " ", trimmed)
}

In [7]:
# File Content: extract BODY TEXT, i.e. everything after the "@BODY=" marker.
#
# file: despite its name, this is the article's text (a string), not a path.
# Returns the text following the last "@BODY=" marker (the match is greedy);
# the input is returned unchanged when the marker is absent.
extract_body_text <- function(file) {
    sub(pattern = ".*@BODY=", replacement = "", x = file)
}

In [8]:
# File Content > Body Text: ONLY KEEP LETTERS and whitespace.
#
# body: the body text as a string.
# Hyphens become spaces first so hyphenated words split into separate words;
# every other non-letter, non-whitespace character is deleted, and each run
# of whitespace is then collapsed to a single space.
keep_letters <- function(body) {
    dehyphenated <- gsub("-", " ", body)
    letters_and_spaces <- gsub("[^[:alpha:][:space:]]", "", dehyphenated)
    gsub("\\s+", " ", letters_and_spaces)
}

In [9]:
# File Content > Body Text: tokenize by word.
#
# body_cleaned: cleaned body text whose words are separated by single spaces.
# Returns a character vector of words. Empty tokens are dropped: strsplit()
# emits "" when the text starts with a space, and that "" would otherwise
# pass through stop-word removal and end up as a blank word in the corpus.
tokenize <- function(body_cleaned) {
    body_words <- unlist(strsplit(body_cleaned, " ", fixed = TRUE))
    body_words[nzchar(body_words)]
}

In [10]:
# Load stop words from stopwords.txt (located via here()); each non-empty
# line of the file becomes one entry.
stopword_lines <- readLines(here("stopwords.txt"))
stopwords <- stopword_lines[nzchar(stopword_lines)] # drop blank lines

In [11]:
# File Content > Body Text > Tokens: remove stop words.
#
# body_tokenized: character vector of tokens.
# Returns the tokens that do not appear in the global `stopwords` vector,
# in their original order.
remove_stop_words <- function(body_tokenized) {
    is_stop_word <- body_tokenized %in% stopwords
    body_tokenized[!is_stop_word]
}

In [12]:
# Turn an article's full text into body tokens: take the body text,
# lowercase it, keep only letters and spaces, then split it into words.
# Stop words are NOT removed here.
#
# file_content: the full text of one article as a single string.
get_body <- function(file_content) {
    lowered <- tolower(extract_body_text(file_content))
    tokenize(keep_letters(lowered))
}

In [39]:
# Parse files to create corpus
library(tidyverse)
library(tidytext)

# Build the corpus: one row per remaining (non-stop-word) token, carrying the
# issue, date, title and raw body of the article the token came from.
#
# file_list holds bare file names, so the files are read relative to the
# current working directory.

# Read every article once; all content-derived columns reuse these strings.
file_contents <- vapply(
    file_list, read_file_content, character(1), USE.NAMES = FALSE
)

# Metadata parsed from the file names. get_issue() returns strings, so the
# issue column is character.
issues <- vapply(file_list, get_issue, character(1), USE.NAMES = FALSE)

# vapply() cannot return Date objects, so dates pass through as day counts
# and are converted back to Date afterwards.
dates <- as.Date(
    vapply(
        file_list,
        function(f) as.numeric(get_date(f)),
        numeric(1),
        USE.NAMES = FALSE
    ),
    origin = "1970-01-01"
)

# Columns derived from the article text.
titles <- vapply(file_contents, extract_title, character(1), USE.NAMES = FALSE)
body_texts <- vapply(
    file_contents, extract_body_text, character(1), USE.NAMES = FALSE
)

# One character vector of stop-word-free tokens per article.
body_words <- lapply(
    file_contents,
    function(content) remove_stop_words(get_body(content))
)

corpus <- tibble(
    issue = issues,
    date = dates,
    title = titles,
    body_processed = body_words,
    body_raw = body_texts
)

# Expand the list column so that each token gets its own row.
corpus <- corpus %>% unnest(body_processed)

In [45]:
# Display the first four columns of the corpus (body_raw, the fifth, is omitted).
corpus[1:4]

issue,date,title,body_processed
<chr>,<date>,<chr>,<chr>
92,1962-04-26,Excerpts Develop Burnham's Views On World Situation,following
92,1962-04-26,Excerpts Develop Burnham's Views On World Situation,excerpts
92,1962-04-26,Excerpts Develop Burnham's Views On World Situation,articles
92,1962-04-26,Excerpts Develop Burnham's Views On World Situation,mr
92,1962-04-26,Excerpts Develop Burnham's Views On World Situation,burnham
92,1962-04-26,Excerpts Develop Burnham's Views On World Situation,appeared
92,1962-04-26,Excerpts Develop Burnham's Views On World Situation,national
92,1962-04-26,Excerpts Develop Burnham's Views On World Situation,review
92,1962-04-26,Excerpts Develop Burnham's Views On World Situation,reprinted
92,1962-04-26,Excerpts Develop Burnham's Views On World Situation,kind
