In [1]:
# Set folder path: relative path of the directory that is searched for the
# corpus .txt files
folder_path <- "corpus/"

In [2]:
# List the paths of all .txt documents in the corpus folder (the file contents
# are read later, one file at a time, by kwic())
corpus_files <- list.files(path = folder_path, pattern = "\\.txt$", full.names = TRUE)

# list.files() silently returns character(0) for a missing or empty folder,
# which makes the KWIC search further down collapse to NULL without any hint.
if (length(corpus_files) == 0) {
  warning("No .txt files found in '", folder_path, "'", call. = FALSE)
}

In [3]:
# Define regular expression patterns
# Every backslash has to be doubled inside an R string literal: a single "\b"
# is the backspace character, not the regex word boundary "\\b".
patterns <- c(
  "\\bUS\\b",           # "US"
  "\\bUSA\\b",          # "USA"
  # "U.S." -- a trailing "\\b" cannot be used here: there is no word boundary
  # after the final period unless a word character follows, so "\\.\\b" would
  # match "U.S.A" but never "U.S." itself. Require a non-word character or the
  # end of the string instead.
  "\\bU\\.S\\.(\\W|$)",
  "\\bU\\.S\\.A\\b",    # "U.S.A" (also matches the start of "U.S.A.")
  "\\bU\\.S\\.A\\."     # "U.S.A."
)

# DOESN'T WORK "[@#^*|\\\\{}\\[\\]<>]" matches any of the characters @, #, ^, *, |, \, {, }, [, ], <, or >

# KWIC multiple contexts/document?

In [4]:
# KWIC (keyword in context) function
#
# Reads a text file line by line, splits every line on whitespace and searches
# the resulting words for each regular expression in `patterns`. For every
# matching word, the word plus up to `window` words on either side (taken from
# the same line only) is recorded.
#
# Arguments:
#   filename  Path of the text file to search.
#   patterns  Character vector of regular expressions, each tested against the
#             individual whitespace-separated words.
#   window    Number of context words before and after the match to display.
#
# Returns a character matrix with one row per match and the columns
# "Article Filename" (base name of the file), "Context" (the context words
# joined by single spaces) and "Keyword" (the pattern that matched), or NULL
# when nothing matched.
kwic <- function(filename, patterns, window = 5) {
  # warn = FALSE silences only the "incomplete final line" / embedded-nul
  # warnings instead of hiding every warning readLines() might raise
  text <- readLines(filename, warn = FALSE)
  article <- basename(filename)
  kwic_rows <- list() # One named character vector per match

  for (line in text) {
    # Split the line into individual words
    words <- strsplit(line, "\\s+")[[1]]

    # Loop through each keyword pattern to search for matches
    for (pattern in patterns) {
      # grep() returns the positions of the matching words; an empty result
      # simply skips the inner loop
      for (match in grep(pattern, words)) {
        start <- max(1, match - window)
        end <- min(length(words), match + window)

        # Every row uses the same column names. Naming the context column
        # after the pattern would be lost in rbind(), which keeps only the
        # names of the first row, so the keyword gets its own column.
        kwic_rows[[length(kwic_rows) + 1]] <- c(
          "Article Filename" = article,
          "Context" = paste(words[start:end], collapse = " "),
          "Keyword" = pattern
        )
      }
    }
  }

  # No matches at all: signal that with NULL so callers can rbind() safely
  if (length(kwic_rows) == 0) {
    return(NULL)
  }

  # Stack the per-match rows into a single character matrix
  do.call(rbind, kwic_rows)
}

In [5]:
# Turn KWIC results into a data frame and append metadata parsed from the
# "Article Filename" column.
#
# The regular expressions below pick apart names shaped like
# "<digits>_<digits>-<digits>-<digits>[_]<title>.txt". A name that does not fit
# a given expression is passed through unchanged by gsub().
#
# Arguments:
#   df  Object coercible with as.data.frame() that has an "Article Filename"
#       column (e.g. the matrix returned by kwic()).
#
# Returns the data frame with the extra columns Issue, Date and Title appended
# on the right.
process_dataframe <- function(df) {
  df <- as.data.frame(df, stringsAsFactors = FALSE)
  filenames <- df$`Article Filename`

  # Leading run of digits before the first underscore
  issue_rx <- "^([0-9]+)_.*"
  # Three dash-separated digit groups following the issue number
  date_rx <- "^[0-9]+_([0-9]+-[0-9]+-[0-9]+).*"
  # Whatever sits between the date (plus optional underscore) and ".txt"
  title_rx <- "^[0-9]+_[0-9]+-[0-9]+-[0-9]+_?(.*)\\.txt$"

  raw_title <- gsub(title_rx, "\\1", filenames)

  df$Issue <- gsub(issue_rx, "\\1", filenames)
  df$Date <- gsub(date_rx, "\\1", filenames)
  # Periods inside the title are replaced by spaces
  df$Title <- gsub("\\.", " ", raw_title)

  df
}

In [6]:
# Run kwic() on every corpus file with the shared pattern set, then stack the
# per-file results row-wise (files without matches contribute NULL, which
# rbind() drops)
kwic_results <- do.call(
  what = rbind,
  args = lapply(X = corpus_files, FUN = kwic, patterns = patterns)
)

In [7]:
# Display the combined KWIC results (a bare name at top level is auto-printed)
kwic_results

NULL