merge_algorithm.R

#############################################################
## Project: Internal and External Data Merge Model/Algorithm
## Script purpose: Merge native and external data sets
## Timeline: 6/1/2021 - 8/27/2021
## Author: Rakan AlZagha '21 Intern
#############################################################


########
#
# Results:
#
#
########


## Section 0: Install libraries
##################################################

# list necessary packages for the merge
package_list <- c("stringr", 'bit64',"data.table", "dplyr", "foreach", "stringdist", "doParallel", "openxlsx", "comparator", "compare", "rvest", "httr", 'ggplot2', 'doSNOW', 'parallel', 'progress', 'svMisc', 'fastmatch')

# install all libraries prior to loading them all in
lapply(package_list, install.packages, character.only = TRUE)

# load all libraries into the R session
lapply(package_list, library, character.only = TRUE)

## Section 1: Set Global Options
##################################################

# set to TRUE if you would like to save object using save_object() function
toggle <- TRUE

# number of cores to utilize for parallelization of algorithm (auto-generated by system command)
## will be set to maximum number of threads on machine - 1 (leaving 1 for computer operations)
### NOTE: More cores/threads in use means more computational overhead (takes longer to start algorithm, but usually shorter run time once started)
nthread <- getOption("sd_num_thread")

# path you would like to load objects into R from
load_directory <- ""

# path you would like to save objects from R to
save_directory <- ""

## Section 2: Function Declarations
##################################################

####
# Function Name: save_object()
#
# Purpose: save objects to .CSV or .XLSX format
#
# Parameters: 
# @toggle    (type: logical)   - set globally to TRUE or FALSE
# @directory (type: string)    - directory to save object to (see save_directory global option)
# @object    (type: R object)  - any R object (data.table, data.frame, matrix, etc...)
# @filename  (type: string)    - name of file
# @filetype  (type: string)    - either "CSV" or "XLSX" file type
#
# Return: NA
#
# Examples: 
# @save_object(toggle, save_directory, random_R_table, "final_table", "CSV")
#
####

save_object <- function(toggle, directory, object, filename, filetype){
  # case where global save option is set to TRUE
  if(toggle == TRUE){
    # save as XLSX
    if(filetype == "XLSX"){
      full_file_name <- paste0(directory, filename, ".xlsx") 
      write.xlsx(x = object, file = full_file_name)  
    }
    # save as CSV
    if(filetype == "CSV"){
      full_file_name <- paste0(directory, filename, ".csv") 
      write.csv(x = object, file = full_file_name, row.names = FALSE)
    }
  }
  # case where global save option is set to FALSE
  else{
    print("Toggle set to FALSE!")
  }
}


####
# Function Name: is_dup_data()
#
# Purpose: remove duplicated rows from a table
#
# Parameters: 
# @raw_data (type: R object) - any raw data object (data.table, data.frame)
#
# Return: unique raw data with no duplicates
#
# Example: 
# @is_dup_data(raw_internal_data)
#
####

is_dup_data <- function(raw_data){
  # find total of rows and unique ones, subset if not equal
  total_rows <- count(raw_data)
  unique_rows <- count(unique(raw_data))
  
  # mismatch between total rows and unique number
  if(total_rows != unique_rows){
    raw_data <- raw_data[!duplicated(raw_data)]
  }
  
  return(raw_data)
}

####
# Function Name: clean_name_data()
#
# Purpose: remove entity identifiers, non-essential characters, and inconsistent string grammar...
# to be utilized on both data sets that are to be merged for consistency. Add to gsub() per data needs.
#
# Parameters: 
# @raw_data       (type: R object) - any raw data object (data.table, data.frame)
# @formatted_name (type: R vector) - name vector from data
#
# Return: raw_data with a cleaned name column 'formatted_name'
#
# Examples:
# @clean_name_data(internal_data, internal_data$company_name)
#
####

clean_name_data <- function(raw_data, formatted_name){
  # convert all names to uppercase (eliminate case-sensitivity)
  raw_data <- raw_data[, formatted_name := toupper(formatted_name)]
  
  # further clean the data to boost merge (remove entity identifiers, irregularities in strings)
  raw_data[, formatted_name := gsub(" CORP.*", "", raw_data$formatted_name)]
  raw_data[, formatted_name := gsub(" INC..*", "", raw_data$formatted_name)]
  raw_data[, formatted_name := gsub(" INC.*", "", raw_data$formatted_name)]
  raw_data[, formatted_name := gsub(" INCORPORATED..*", "", raw_data$formatted_name)]
  raw_data[, formatted_name := gsub(" LTD.*", "", raw_data$formatted_name)]
  raw_data[, formatted_name := gsub(" LLC.*", "", raw_data$formatted_name)]
  raw_data[, formatted_name := gsub(" LLP*", "", raw_data$formatted_name)]
  raw_data[, formatted_name := gsub(" LP.*", "", raw_data$formatted_name)]
  raw_data[, formatted_name := gsub(" L P.*", "", raw_data$formatted_name)]
  raw_data[, formatted_name := gsub(" L.P..*", "", raw_data$formatted_name)]
  raw_data[, formatted_name := gsub(" & CO$.*", " & COMPANY", raw_data$formatted_name)]
  raw_data[, formatted_name := gsub(" & CO.\\$.*", " & COMPANY", raw_data$formatted_name)]
  raw_data[, formatted_name := gsub(" AND CO.\\$.*", " & COMPANY", raw_data$formatted_name)]
  raw_data[, formatted_name := gsub(" AND CO\\.", " & COMPANY", raw_data$formatted_name)]
  raw_data[, formatted_name := gsub(" PLC.*", "", raw_data$formatted_name)]
  raw_data[, formatted_name := gsub(" & ", " AND ", raw_data$formatted_name)]
  raw_data[, formatted_name := gsub(",.*", "", raw_data$formatted_name)]
  raw_data[, formatted_name := gsub(" CO$.*", "", raw_data$formatted_name)]
  raw_data[, formatted_name := gsub(" CO.$.*", "", raw_data$formatted_name)]
  raw_data[, formatted_name := gsub("\\(THE).*", "", raw_data$formatted_name)]
  raw_data[, formatted_name := gsub("^THE ", "", raw_data$formatted_name)]
  raw_data[, formatted_name := gsub("-", "", raw_data$formatted_name)]
  raw_data[, formatted_name := gsub("\\.", "", raw_data$formatted_name)]
  
  return(raw_data)
}

####
# Function Name: clean_domain_name_data()
#
# Purpose: convert domain name to lowercase and add 'http://' to beginning of domain name
# please edit function to the cleaning needs of your data, ultimate goal is for all domains to be
# 'http://www.random_website.com' format
#
# Parameters: 
# @raw_data         (type: R object) - any raw data object (data.table, data.frame)
# @is_external      (type: logical)  - is this the external or internal data? 
# @formatted_domain (type: R vector) - domain name vector from data
#
# Return: raw_data with a cleaned domain name column 'formatted_domain_name'
#
# Examples:
# @clean_domain_name_data(internal_data, is_external = FALSE, internal_data$in_domain)
#
####

clean_domain_name_data <- function(raw_data, is_external, formatted_domain){
  # convert domain name to lowercase (eliminate case-sensitivity)
  raw_data <- raw_data[, formatted_domain := tolower(formatted_domain)]
  
  # add 'http://' to beginning of domain name
  raw_data <- raw_data[, formatted_domain := ifelse(is.na(formatted_domain), formatted_domain, paste0("http://", formatted_domain))]
  
  if(is_external == TRUE){
    setnames(raw_data, "ex_domain", "ex_domain_unformatted")
    setnames(raw_data, "formatted_domain", "ex_domain")
  }
  
  else{
    setnames(raw_data, "in_domain", "in_domain_unformatted")
    setnames(raw_data, "formatted_domain", "in_domain")
  }
  
  return(raw_data)
}


####
# Function Name: match_algorithm_domain_name()
#
# Purpose: match data sets via domain name
#
# Parameters: 
# @nthread           (type: numeric)   - set from getOption("sd_num_thread"), will be max(cores_available)-1
# @iterations        (type: numeric)   - number of iterations to match (full dataset pass in nrow(external_data))
# @internal_data     (type: R object)  - internal data to match to the external data
# @external_data     (type: R object)  - external data to match to the internal data
# @in_domain_vector  (type: R object)  - vector of internal domain names
# @ex_domain_vector  (type: R object)  - vector of external domain names
#
# Return: merged data set between external and internal domain names
#
# Examples: 
# @match_algorithm_domain_name(nthread, iterations = nrow(external_data), 
# internal_data, external_data, internal_data$in_domain, external_data$ex_domain)
#
####

match_algorithm_domain_name <- function(nthread, iterations, internal_data, external_data, in_domain_vector, ex_domain_vector){
  print("Domain name matching starting...")
  
  # fast-match all domain names from external dataset to internal dataset
  output_domain_name <- fmatch(ex_domain_vector[1:iterations], in_domain_vector)
  
  # format the matched data
  output_domain_name <- format_domain_name_output(output_domain_name, internal_data, external_data)
  
  print("Domain name matching complete!")
  
  return(output_domain_name)
}

####
# Function Name: format_domain_name_output()
#
# Purpose: format domain name algorithm output
#
# Parameters: 
# @output         (type: R object)  - match_algorithm_domain_name results
# @internal_data  (type: R object)  - internal data set
# @external_data  (type: R object)  - external data set
#
# Return: formatted table with linked domain name matches
#
# Examples: 
# @format_domain_name_output(output_domain_name, internal_data, external_data)
#
####

format_domain_name_output <- function(output, internal_data, external_data){
  # convert vectorized output from html algorithm to data.table format
  output <- as.data.table(output)
  
  # set the column name to internal_index_domain_name
  colnames(output) <- "internal_index_domain_name"
  
  # set type to numeric for proceeding merge
  output <- output[, internal_index_domain_name := as.numeric(internal_index_domain_name)]
  
  # set exact matches to TRUE and no matches to false
  output <- output[, is_match := ifelse(internal_index_domain_name == "NA" | is.na(internal_index_domain_name), FALSE, TRUE)]
  
  # any match is a definite domain name match in this case (1:1)
  output <- output[, html_def_match := is_match]
  
  # re-assign index count (same as initial but will be utilized for merge, 
  ## more general than specific non-transferable codes)
  output$external_index <- seq.int(nrow(output))
  
  return(output)
}

####
# Function Name: match_algorithm_name()
#
# Purpose: match data sets based on Jaro-Winkler Similarity score (0 being an exact match, closer to 1 less match)
#
# Parameters: 
# @nthread        (type: numeric)   - set from getOption("sd_num_thread"), will be max(cores_available)-1
# @iterations     (type: numeric)   - number of iterations to match (full dataset pass in nrow(external_data))
# @internal_data  (type: R object)  - internal data to match to the external data
# @external_data  (type: R object)  - external data to match to the internal data
# @in_name_vector (type: R object)  - vector of internal names
# @ex_name_vector (type: R object)  - vector of external names
#
# Return: merged data set between external and internal names
#
# Examples: 
# @match_algorithm_name(nthread, iterations = nrow(external_data), 
# internal_data, external_data, internal_data$formatted_name, external_data$formatted_name)
#
####

match_algorithm_name <- function(nthread, iterations, internal_data, external_data, in_name_vector, ex_name_vector){
  print("Jaro-Winkler string-matching starting...")

  # set total number of cores for job and register them to the system
  clusters <- makeCluster(nthread)
  registerDoSNOW(clusters)
  
  # create progress bar for match algorithm
  pb <- progress_bar$new(
    format = " String-matching progress [:bar] :percent eta: :eta elapsed_full :elapsedfull",
    total = iterations,    # 100 
    width = 100
  )
  
  # generate progress indicator
  ## token reported in progress bar
  progress_indicator <- rep(1, 100)  
  
  # allowing progress bar to be used in for-each function options
  progress <- function(n){
    pb$tick(tokens = list(prog_count = progress_indicator[n]))
  } 
  
  # listing options for progress
  opts <- list(progress = progress)
  
  # iterate through names using cores set above
  output_name_match <- foreach(name = ex_name_vector[1:iterations], .packages = c('data.table', 'stringdist', 'fastmatch'), .combine = 'c', .options.snow = opts) %dopar% {
    # if there is an exact match present, match it then move to the next name
    quick_def_match <- as.vector(fmatch(name, in_name_vector))
    
    # case when quick match returns a positive definite match
    if(!(is.na(quick_def_match))){
      match <- paste0(name, " --> ", quick_def_match[1], " --> ", "0")
    }
    
    # case when there is not a definite match, resort to Jaro-Winkler and take best approximate match
    else{
      # subset to only names that have the same beginning as the current name being matched
      closest_match <- as.vector(in_name_vector[grep(pattern = substr(name, 1, 4), in_name_vector, fixed = TRUE, useBytes = TRUE)])
      
      # generate a Jaro-Winkler score for all of the possible matches (based on closes match)
      jw_match <- stringdist(name, closest_match, method = 'jw', p = .1, nthread = getOption("sd_num_thread"), useBytes = TRUE)
      
      # case when there are approximate matches
      if(length(jw_match) > 0){
        # take the lowest Jaro-Winkler score index
        min_index_jw <- as.numeric(which.min(jw_match))
        
        # re-match it back to the original name index (in OG data set, not closest_match subset)
        internal_index <- fmatch(closest_match[min_index_jw], in_name_vector)
        
        match <- paste0(name, " --> ", internal_index, " --> ", jw_match[min_index_jw])
      }

      # case when there are NO approximate matches
      else{
        match <- paste0(name, " --> ", NA, " --> ", NA)
      }
    }
  }
  
  # free any zombie processes
  on.exit(stopCluster(clusters))
  
  # format the matched data
  output_name_match <- format_name_match_output(output_name_match, internal_data, external_data)
  
  print("Jaro-Winkler string-matching complete!")
  
  return(output_name_match)
}


####
# Function Name: format_name_match_output()
#
# Purpose: format Jaro-Winkler name matching algorithm output
#
# Parameters: 
# @output        (type: R object)  - match_algorithm_name results
# @internal_data (type: R object)  - internal data set
# @external_data (type: R object)  - external data set
#
# Return: formatted table with linked definite and approximate name matches
#
# Examples: 
# @format_name_match_output(output_name_match, internal_data, external_data)
#
####

format_name_match_output <- function(output, internal_data, external_data){
  # convert vectorized output from html algorithm to data.table format
  output <- as.data.table(output)
  
  # set the column name to ex_firm_name
  colnames(output) <- "ex_firm_name"
  
  # split each vector element into the domain and index
  output <- as.data.table(str_split_fixed(output$ex_firm_name, " --> ", 3))
  
  # rename columns to now include index
  colnames(output) <- c("ex_firm_name", "internal_index_name_match", "jw_match")
  
  # convert the index into a numeric type
  output <- output[, internal_index_name_match := as.numeric(internal_index_name_match)]
  
  # set exact matches to TRUE and no matches to false
  output <- output[, is_match := ifelse(is.na(jw_match), FALSE, TRUE)]
  
  # any match is a definite string match in this case (1:1)
  output <- output[, string_def_match := ifelse(jw_match == 0, TRUE, FALSE)]
  
  # re-assign index count (same as initial but will be utilized for merge, 
  ## more general than specific non-transferable codes)
  output$external_index <- seq.int(nrow(output))
  
  return(output)
}

####
# Function Name: match_algorithm_html_title()
#
# Purpose: match data sets based on web-scrapping HTML algorithm that retrieves HTML-title
#
# Parameters: 
# @nthread             (type: numeric)  - set from getOption("sd_num_thread"), will be max(cores_available)-1
# @internal_iterations (type: numeric)  - number of iterations internally to match to (full dataset pass in nrow(internal_data))
# @external_iterations (type: numeric)  - number of iterations externally to match up (full dataset pass in nrow(external_data))
# @internal_data       (type: R object) - internal data to match to the external data
# @external_data       (type: R object) - external data to match to the internal data
# @in_domain_vector    (type: R object) - vector of internal domain names (NEED TO BE 'http://www.random_website.com' format!)
# @ex_domain_vector    (type: R object) - vector of external domain_names (NEED TO BE 'http://www.random_website.com' format!)
#
# Return: merged data set between external and internal html titles
#
# Examples: 
# @match_algorithm_html_title(nthread, internal_iterations = nrow(internal_data), 
# external_iterations = nrow(external_data), internal_data, external_data, internal_data$in_domain, external_data$ex_domain)
#
####

match_algorithm_html_title <- function(nthread, internal_iterations, external_iterations, internal_data, external_data, in_domain_vector, ex_domain_vector){
  print("HTML title matching starting...")
  
  # extract html-title from each webpage in
  internal_title_check <- html_title_algorithm(nthread, length(in_domain_vector[1:internal_iterations]), FALSE, internal_data, external_data, in_domain_vector)
  external_title_check <- html_title_algorithm(nthread, length(ex_domain_vector[1:external_iterations]), TRUE, internal_data, external_data, ex_domain_vector)
  
  # isolate two identifying columns
  internal_title_check <- internal_title_check[, c("internal_index", "html_title")]
  external_title_check <- external_title_check[, c("external_index", "html_title")]
  
  # merge data by html_title output
  merged_title_checks <- merge(internal_title_check, external_title_check, by = "html_title", all.y = TRUE)
  
  # remove any NA's in the data
  merged_title_checks <- merged_title_checks[!(is.na(internal_index))]
  
  # remove any duplicated rows
  merged_title_checks <- merged_title_checks[!(duplicated(merged_title_checks$external_index)), ]
  
  # set any successful match to a definite match
  merged_title_checks <- merged_title_checks[, html_title_def_match := TRUE]
  
  print("HTML title matching complete!")
  
  return(merged_title_checks)
}


####
# Function Name: html_title_check()
#
# Purpose: extract the html-title from a webpage
#
# Parameters: 
# @domain (type: string) - pass in the domain name in the format "http://www.random_website.com"
#
# Return: html title for a given domain name
#
# Examples:
# @html_title_check("http://www.microsoft.com")
#
####

html_title_check <- function(domain){
  try(
    # feed domain into html algorithm
    domain %>%
      # instate a timer between requests (as to not overload web requests)
      GET(., timeout(10)) %>%
      as.character() %>%
      read_html() %>%
      # extract only the title from the HTML page & retrieve as text
      html_nodes('title') %>%
      html_text()
  )
}

####
# Function Name: html_title_algorithm()
#
# Purpose: retrieve and consolidate html_title algorithm results
#
# Parameters: 
# @nthread       (type: numeric)   - set from getOption("sd_num_thread"), will be max(cores_available)-1
# @iterations    (type: numeric)   - number of iterations to match (will be passed in from calling function)
# @is_external   (type: logical)   - is this the external or internal data? 
# @internal_data (type: R object)  - internal data to match to the external data
# @external_data (type: R object)  - external data to match to the internal data
# @domain_vector (type: R object)  - vector of domain names (NEED TO BE 'http://www.random_website.com' format!)
#
# Return: table of html-title matches
#
# Examples: 
# @html_title_algorithm(nthread, iterations = nrow(external_data), is_external = TRUE, internal_data, external_data, external_data$ex_domain)
#
####

html_title_algorithm <- function(nthread, iterations, is_external, internal_data, external_data, domain_vector){
  # set total number of cores for job and register them
  clusters <- makeCluster(nthread)
  registerDoSNOW(clusters)
  
  # create progress bar for the match (external data)
  if(is_external == TRUE){
    pb <- progress_bar$new(
      format = " External HTML-title extraction progress [:bar] :percent eta: :eta elapsed_full :elapsedfull",
      total = iterations,    # 100 
      width = 100
    ) 
  }
  
  # create progress bar for the match (internal data)
  else{
    pb <- progress_bar$new(
      format = " Internal HTML-title extraction progress [:bar] :percent eta: :eta elapsed_full :elapsedfull",
      total = iterations,    # 100 
      width = 100
    )
  }
  
  # generate progress indicator
  ## token reported in progress bar
  progress_indicator <- rep(1, 100)  
  
  # allowing progress bar to be used in for-each function options
  progress <- function(n){
    pb$tick(tokens = list(prog_count = progress_indicator[n]))
  } 
  
  # listing options for progress
  opts <- list(progress = progress)
  
  # iterate through domains using cores set above
  output_html_title <- foreach(domain = domain_vector[1:iterations], .packages = c('data.table', 'stringdist', 'httr', 'dplyr', 'rvest'), .combine = 'c', .export = "html_title_check", .options.snow = opts) %dopar% {
    # sleep for 5 seconds to not overload internet requests (varies by browser)
    Sys.sleep(5)
    
    # extract title from both domains
    html_title <- as.character(html_title_check(domain))
    
    # ignore errors from false HTMLs
    tryCatch(
      # set FALSE or TRUE based on equal titles
      match <- paste0(domain, " --> ", html_title),
      error = function(e){NA}
    )
  }
  
  # free any zombie processes
  on.exit(stopCluster(clusters))
  
  # format data
  output_html_title <- format_html_title(output_html_title, is_external, internal_data, external_data)
  
  return(output_html_title)
}

####
# Function Name: format_html_title()
#
# Purpose: format html-title matching algorithm output
#
# Parameters: 
# @output        (type: R object) - html_check_algorithm results
# @is_external   (type: logical)  - is this the external or internal data? 
# @internal_data (type: R object) - internal data set
# @external_data (type: R object) - external data set
#
# Return: formatted table with linked html-titles
#
# Examples: 
# @format_html_title(output_html_title_match, is_external = TRUE, internal_data, external_data)
#
####

format_html_title <- function(output, is_external, internal_data, external_data){
  # convert vecotrized output to a data.table
  output <- as.data.table(output)
  
  # case when the data passed in is external
  if(is_external == TRUE){
    # rename column name to 'ex_domain'
    colnames(output) <- "ex_domain"
    
    # split each vector element into the domain and title
    output <- as.data.table(str_split_fixed(output$ex_domain, " --> ", 2))
    
    # rename columns to now include title
    colnames(output) <- c("ex_domain", "html_title")
    
    # remove any duplicates domains
    output <- output[!duplicated(output$ex_domain), ]
    
    # remove any false/non-unique results
    output <- output[!(ex_domain == "" | ex_domain == " " | is.na(ex_domain))]
    
    # remove any entries with error messages and any title < 5 characters
    output <- output[!(html_title %like% "Error in curl" | html_title %like% "Error : 'NA'" | html_title %like% "Error " | html_title %like% "40" | nchar(html_title) < 5 | html_title %like% "Not Found" | html_title %like% "No Such Website" | html_title %like% "DNS resolution error" | html_title %like% "Cloudflare")]
    
    # merge data back with external data
    output <- merge(output, external_data, by = "ex_domain")
  }
  
  # case when the data passed in is internal
  else{
    # rename column name to 'in_domain'
    colnames(output) <- "in_domain"
    
    # split each vector element into the domain and title
    output <- as.data.table(str_split_fixed(output$in_domain, " --> ", 2))
    
    # rename columns to now include title
    colnames(output) <- c("in_domain", "html_title")
    
    # remove any duplicates domains
    output <- output[!duplicated(output$in_domain), ]
    
    # remove any false/non-unique results
    output <- output[!(in_domain == "" | in_domain == " " | is.na(in_domain))]
    
    # remove any entries with error messages and any title < 5 characters
    output <- output[!(html_title %like% "Error in curl" | html_title %like% "Error : 'NA'" | html_title %like% "Error " | html_title %like% "40" | nchar(html_title) < 5 | html_title %like% "Not Found" | html_title %like% "No Such Website" | html_title %like% "DNS resolution error" | html_title %like% "Cloudflare")]
    
    # merge data back with external data
    output <- merge(output, internal_data, by = "in_domain")
  }
  return(output)
}


####
# Function Name: algorithm_selection()
#
# Purpose: menu of options for the three matching algorithms, can choose 1, 2, or all 3 algorithms
#
# Parameters: 
# @nthread             (type: numeric)    - set from getOption("sd_num_thread"), will be max(cores_available)-1
# @iterations          (type: numeric)    - number of iterations internally to match to (full dataset pass in nrow(internal_data))
# @internal_data       (type: R object)   - internal data to match to the external data
# @external_data       (type: R object)   - external data to match to the internal data
# @domain_name_match   (type: logical)    - toggle to TRUE for domain name matching
# @name_match          (type: logical)    - toggle to TRUE for Jaro-Winkler name matching
# @html_title_match    (type: logical)    - toggle to TRUE for html-title matching
# @in_domain_vector    (type: R object)   - vector of internal domain names (NEED TO BE 'http://www.random_website.com' format!)
# @ex_domain_vector    (type: R object)   - vector of external domain_names (NEED TO BE 'http://www.random_website.com' format!)
# @in_name_vector      (type: R object)   - vector of internal names
# @ex_name_vector      (type: R object)   - vector of external names
# @internal_iterations (type: numeric)    - number of iterations internally to match to (full dataset pass in nrow(internal_data))
# @external_iterations (type: numeric)    - number of iterations externally to match up (full dataset pass in nrow(external_data))
#
# Return: matched table according to algorithm configuration
#
# Examples:
# @algorithm_selection(nthread, iterations = nrow(external_data), internal_data, external_data,
# domain_name_match = TRUE, in_domain_vector = internal_data$in_domain, ex_domain_vector = external_data$ex_domain)
#
# @algorithm_selection(nthread, iterations = nrow(external_data), internal_data, external_data,
# name_match = TRUE, in_name_vector = internal_data$formatted_name, ex_name_vector = external_data$formatted_name)
#
# @algorithm_selection(nthread, iterations = nrow(external_data), internal_data, external_data,
# html_title_match = TRUE, in_domain_vector = internal_data$in_domain, ex_domain_vector = external_data$ex_domain)
#
####

algorithm_selection <- function(nthread, iterations, internal_data, external_data, 
                                domain_name_match = FALSE, name_match = FALSE, html_title_match = FALSE, 
                                in_domain_vector = NULL, ex_domain_vector = NULL, 
                                in_name_vector = NULL, ex_name_vector = NULL,
                                internal_iterations = nrow(internal_data), external_iterations = nrow(external_data))
{
  # case for domain name matching
  if(domain_name_match == TRUE & name_match == FALSE & html_title_match == FALSE){
    print("Algorithm(s) chosen -> domain_name_match")
    
    # domain name match algorithm call
    domain_name_table <- match_algorithm_domain_name(nthread, iterations, internal_data, external_data, in_domain_vector, ex_domain_vector)
    
    # subset to only unique columns
    domain_name_table <- domain_name_table[, c("external_index", "internal_index_domain_name", "is_match", "html_def_match")]
    
    # merge external data with domain name table
    domain_name_ex_merge <- merge(external_data, domain_name_table, by = "external_index")
    
    # set the final internal index based on the matched index
    domain_name_ex_merge <- domain_name_ex_merge[, internal_index := internal_index_domain_name]
    
    # merge the internal data with the externally matched table
    domain_name_all_merge <- merge(internal_data, domain_name_ex_merge, by = "internal_index")
    
    # declare final table
    final_merge <- domain_name_all_merge
  }
  
  # case for name matching
  else if(domain_name_match == FALSE & name_match == TRUE & html_title_match == FALSE){
    print("Algorithm(s) chosen -> name_match")
    
    # Jaro-Winkler name match algorithm call
    name_match_table <- match_algorithm_name(nthread, iterations, internal_data, external_data, in_name_vector, ex_name_vector)
    
    # subset to only unique columns
    name_match_table <- name_match_table[, c("external_index", "internal_index_name_match", "jw_match", "is_match", "string_def_match")]
    
    # merge external data with matched name table
    name_match_ex_merge <- merge(external_data, name_match_table, by = "external_index")
    
    # set the final internal index based on the matched index
    name_match_ex_merge <- name_match_ex_merge[, internal_index := internal_index_name_match]
    
    # merge the internal data with the externally matched table
    name_match_all_merge <- merge(internal_data, name_match_ex_merge, by = "internal_index")
    
    # declare final table
    final_merge <- name_match_all_merge
  }
  
  # case for html-title matching
  else if(domain_name_match == FALSE & name_match == FALSE & html_title_match == TRUE){
    print("Algorithm(s) chosen -> html_title_match")
    
    # html-title matching algorithm call
    html_title_table <- match_algorithm_html_title(nthread, internal_iterations, external_iterations, internal_data, external_data, in_domain_vector, ex_domain_vector)
    
    # subset to only unique columns
    html_title_table <- html_title_table[, c("external_index", "internal_index", "html_title", "html_title_def_match")]
    
    # merge external data with matched html-title table
    html_title_ex_merge <- merge(external_data, html_title_table, by = "external_index")
    
    # merge the internal data with the externally matched table
    html_title_all_merge <- merge(internal_data, html_title_ex_merge, by = "internal_index")
    
    # declare final table
    final_merge <- html_title_all_merge
  }
  
  # case for domain_name_match and name_match
  else if(domain_name_match == TRUE & name_match == TRUE & html_title_match == FALSE){
    print("Algorithm(s) chosen -> domain_name_match & name_match")
    
    # domain name matching algorithm call
    domain_name_table <- match_algorithm_domain_name(nthread, iterations, internal_data, external_data, in_domain_vector, ex_domain_vector)
    
    # Jaro-Winkler name match algorithm call
    name_match_table <- match_algorithm_name(nthread, iterations, internal_data, external_data, in_name_vector, ex_name_vector)
    
    # merge domain name and Jaro-Winkler tables together
    domain_and_name_merge <- merge(domain_name_table, name_match_table, by = "external_index")
    
    # set match if result returns for match
    domain_and_name_merge[, is_match := ifelse(is_match.x == TRUE | is_match.y == TRUE, TRUE, FALSE)]
    
    # set definitive matches if either definite match returns TRUE
    domain_and_name_merge[, def_match := ifelse(html_def_match == TRUE | string_def_match == TRUE, TRUE, FALSE)]
    
    # set base internal index
    domain_and_name_merge[, internal_index := internal_index_name_match]
    
    # prioritize domain internal index (more accurate) 
    domain_and_name_merge[, internal_index := ifelse(!is.na(internal_index_domain_name) & (internal_index_domain_name != internal_index), internal_index_domain_name, internal_index)]
    
    # remove is_match columns
    domain_and_name_merge <- domain_and_name_merge[, !c("is_match.x", "is_match.y")]
    
    # set Jaro-Winkler to 0 for any definite matches
    domain_and_name_merge <- domain_and_name_merge[, jw_match := ifelse(def_match == TRUE, 0, jw_match)]
    
    # subset to unique columns
    domain_and_name_merge <- domain_and_name_merge[, c("external_index", "internal_index", "internal_index_domain_name", "internal_index_name_match", "jw_match", "is_match", "html_def_match", "string_def_match", "def_match")]
    
    # merge external data with matched table
    domain_and_name_ex_merge <- merge(external_data, domain_and_name_merge, by = "external_index")
    
    # merge internal data with matched table
    domain_and_name_all_merge <- merge(internal_data, domain_and_name_ex_merge, by = "internal_index")
    
    # declare final table
    final_merge <- domain_and_name_all_merge
  }
  
  # case for domain_name_match, name_match, & html_title_match (FROM GIVEN MATCHES, NOT SEPERATE)
  else if(domain_name_match == TRUE & name_match == TRUE & html_title_match == TRUE){
    print("Algorithm(s) chosen -> domain_name_match, name_match, & html_title_match")
    
    # domain name matching algorithm call
    domain_name_table <- match_algorithm_domain_name(nthread, iterations, internal_data, external_data, in_domain_vector, ex_domain_vector)
    
    # Jaro-Winkler name match algorithm call
    name_match_table <- match_algorithm_name(nthread, iterations, internal_data, external_data, in_name_vector, ex_name_vector)
    
    # merge domain name and Jaro-Winkler tables together
    domain_and_name_merge <- merge(domain_name_table, name_match_table, by = "external_index")
    
    # set match if result returns for match
    domain_and_name_merge[, is_match := ifelse(is_match.x == TRUE | is_match.y == TRUE, TRUE, FALSE)]
    
    # set definitive matches if either definite match returns TRUE
    domain_and_name_merge[, def_match := ifelse(html_def_match == TRUE | string_def_match == TRUE, TRUE, FALSE)]
    
    # set base internal index
    domain_and_name_merge[, internal_index := internal_index_name_match]
    
    # prioritize domain internal index (more accurate) 
    domain_and_name_merge[, internal_index := ifelse(!is.na(internal_index_domain_name) & (internal_index_domain_name != internal_index), internal_index_domain_name, internal_index)]
    
    # remove is_match columns
    domain_and_name_merge <- domain_and_name_merge[, !c("is_match.x", "is_match.y")]
    
    # set Jaro-Winkler to 0 for any definite matches
    domain_and_name_merge <- domain_and_name_merge[, jw_match := ifelse(def_match == TRUE, 0, jw_match)]
    
    # subset to unique columns
    domain_and_name_merge <- domain_and_name_merge[, c("external_index", "internal_index", "internal_index_domain_name", "internal_index_name_match", "jw_match", "is_match", "html_def_match", "string_def_match", "def_match")]
    
    # merge external data with matched table
    domain_and_name_ex_merge <- merge(external_data, domain_and_name_merge, by = "external_index")
    
    # merge internal data with matched table
    domain_and_name_all_merge <- merge(internal_data, domain_and_name_ex_merge, by = "internal_index")
    
    # html-title matching algorithm call
    html_title_table <- match_algorithm_html_title(nthread, iterations, internal_data, external_data, in_domain_vector, ex_domain_vector)
    
    # remove internal index (already exists)
    html_title_table <- html_title_table[, !("internal_index")]
    
    # merge domain name, Jaro-Winkler name, and html-title tables together
    all_merge <- merge(domain_and_name_all_merge, html_title_table, by = "external_index", all.x = TRUE)
    
    # set NAs to FALSE
    all_merge[, html_title_def_match := ifelse(is.na(html_title), FALSE, html_title_def_match)]
    
    # set definite matches based on html-title matches
    all_merge[, def_match := ifelse(html_title_def_match == TRUE, TRUE, def_match)]
    
    # declare final table
    final_merge <- all_merge
    
  }
  
  # case when no valid merge configuration is selected
  else{
    print("No valid merge combination chosen!")
  }
  
  return(final_merge)
}

####
# Function Name: match_thresholds()
#
# Purpose: create a thresholds column in final match table to break up approximate matches into confidence groups (100%, 75%, 50%, 25%)
#
# Parameters: 
# @final_merge          (type: R object) - final merged data table
# @has_city             (type: logical)  - toggle to TRUE if city data is present in both datasets (columns should be 'ex_city' and 'in_city')
# @has_industry         (type: logical)  - toggle to TRUE if industry data is present in both datasets (columns should be 'ex_industry' and 'in_industry')
# @full_partial_match   (type: numeric)  - Jaro-Winkler threshold for full to partial matches
# @partial_little_match (type: numeric)  - Jaro-Winkler threshold for partial to little matches
#
# Return: table with confidence thresholds
#
# Examples:
# @match_thresholds(final_merge, has_city = TRUE, has_industry = FALSE, full_partial_match = .12, partial_little_match = .18)
#
####

match_thresholds <- function(final_merge, has_city = FALSE, has_industry = FALSE, full_partial_match = .12, partial_little_match = .18){
  # case when only city data is present
  if(has_city == TRUE & has_industry == FALSE){
    
    # set definite match
    final_merge[, def_match := ifelse(jw_match < fill_partial_match & (ex_city == in_city), TRUE, def_match)]
    
    # create thresholds based on JW and city identifier
    final_merge[, threshold_confidence := ifelse(def_match == TRUE, "100%", 
                                                    ifelse((jw_match > full_partial_match & (ex_city == in_city)) | jw_match <= full_partial_match, "75%", 
                                                           ifelse(jw_match > full_partial_match & jw_match <= partial_little_match & !(ex_city == in_city), "50%",
                                                                  ifelse(jw_match > partial_little_match, "25-0%", "NA"))))]
  }
  
  # case when only industry data is present
  else if(has_city == FALSE & has_industry == TRUE){
    
    # set definite match
    final_merge[, def_match := ifelse(jw_match < fill_partial_match & (ex_industry == in_industry), TRUE, def_match)]
    
    # create thresholds based on JW and industry identifier
    final_merge[, threshold_confidence := ifelse(def_match == TRUE, "100%", 
                                                 ifelse((jw_match > full_partial_match & (ex_industry == in_industry)) | jw_match <= full_partial_match, "75%", 
                                                        ifelse(jw_match > full_partial_match & jw_match <= partial_little_match & !(ex_industry == in_industry), "50%",
                                                               ifelse(jw_match > partial_little_match, "25-0%", "NA"))))]
  }
  
  # case when city and industry data is present
  else if(has_city == TRUE & has_industry == TRUE){
    
    # set definite match
    final_merge[, def_match := ifelse(jw_match < fill_partial_match & (ex_city == in_city | ex_industry == in_industry), TRUE, def_match)]
    
    # create thresholds based on JW, city, and industry identifier
    final_merge[, threshold_confidence := ifelse(def_match == TRUE, "100%", 
                                                 ifelse((jw_match > full_partial_match & (ex_city == in_city | ex_industry == in_industry)) | jw_match <= full_partial_match, "75%", 
                                                        ifelse(jw_match > full_partial_match & jw_match <= partial_little_match & !(ex_city == in_city & ex_industry == in_industry), "50%",
                                                               ifelse(jw_match > partial_little_match, "25-0%", "NA"))))]
  }
  
  # case when no city or industry data is available
  else{
    # create thresholds based only on JW
    final_merge[, threshold_confidence := ifelse(def_match == TRUE, "100%", 
                                                 ifelse((jw_match > full_partial_match) | jw_match <= full_partial_match, "75%", 
                                                        ifelse(jw_match > full_partial_match & jw_match <= partial_little_match, "50%",
                                                               ifelse(jw_match > partial_little_match, "25-0%", "NA"))))]
  }
  
  return(final_merge)
}

####
# Function Name: results_output()
#
# Purpose: create custom graphs based on final merge output
#
# Parameters: 
# @final_table                   (type: R object) - final data table
# @plot_match_rate_general       (type: R object) - plot of definite vs. approximate matches percentages
# @plot_threshold_rate_general   (type: R object) - plot of threshold breakdown
# @plot_match_rate_decile        (type: R object) - plot of match rates by decile
# @rank_column                   (type: R object) - column to organize deciles by (employment count, index number, etc...)
# @table_match_rates             (type: R object) - general data table with summary statistics
#
# Return: plot or table with end results
#
# Examples:
# @results_output(final_table, plot_match_rate_decile = TRUE, rank_column = final_table$external_index)
#
####

results_output <- function(final_table, plot_match_rate_general = FALSE, plot_threshold_rate_general = FALSE, plot_match_rate_decile = FALSE, rank_column = NULL, table_match_rates = FALSE){
  # case for general match rate plot
  if(plot_match_rate_general == TRUE){
    # table definite matches
    general_match <- as.data.table(table(test$def_match))
    
    # create ggplot
    final_plot <- ggplot(general_match, aes(x = V1, y = N/sum(general_match$N)*100)) +
      geom_bar(stat = "identity", fill = "steelblue") +
      theme_minimal() +
      scale_y_continuous(breaks = c(0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100)) + 
      xlab("Definite Match")+
      ylab("Percentage of Definite Matches") + 
      geom_text(aes(label = scales::percent(N/sum(general_match$N), accuracy = 2)), position = position_stack(.5)) +
      ggtitle("Approximate vs. Definite Matches Percentages", ) +
      theme(plot.title = element_text(hjust = 0.5))
  }
  
  # case for threshold match rate plot
  else if(plot_threshold_rate_general == TRUE){
    # table threshold breakdown
    general_match <- as.data.table(table(final_table$threshold_confidence))
    
    # create ggplot
    final_plot <- ggplot(general_match, aes(x = V1, y = N/sum(general_match$N)*100)) +
      geom_bar(stat = "identity", fill = "steelblue") +
      theme_minimal() +
      scale_y_continuous(breaks = c(0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100)) + 
      coord_flip()+
      xlab("Thresholds (100% being a definite match)")+
      ylab("Percentage of Data in Threshold") + 
      geom_text(aes(label = scales::percent(N/sum(general_match$N), accuracy = 2)), position = position_stack(.5)) +
      ggtitle("Threshold Confidence Percentage", ) +
      theme(plot.title = element_text(hjust = 0.5))
  }
  
  # case for decile match rate plot
  else if(plot_match_rate_decile == TRUE){
    # rank by column passed in
    final_table <- final_table[order(rank_column, decreasing = TRUE)]
    
    # seperate data into 10 equal deciles
    final_table <- mutate(final_table, decile_rank = ntile(rank_column,10))
    
    # table how many rows per decile
    decile_count <- as.data.table(table(final_table$decile_rank))
    table_decile <- as.data.table(table(final_table$def_match, by = final_table$decile_rank))
    
    # merge data together
    setnames(x = decile_count, old = "V1", "by")
    merged <- merge(table_decile, decile_count, by = "by")
    merged[, percent := (N.x/N.y)]
    merged <- merged[, by := as.numeric(by)]
    merged <- merged[, V2 := as.character(V2)]
    
    # reorder
    merged <- merged[order(merged$by)]
    merged[, V2 := ifelse(V2 == "TRUE", "Definite Match", "Approximate Match")]
    
    # create ggplot
    final_plot <- ggplot(merged, aes(x = by, y = percent*100)) + 
      geom_bar(stat = "identity", fill = "steelblue") +
      theme_minimal() + 
      scale_x_continuous(breaks = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)) + 
      scale_y_continuous(breaks = c(0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100)) + 
      coord_flip()+
      xlab("Deciles (10 being largest)")+
      ylab("Percentage") + 
      geom_text(aes(label = scales::percent(percent, accuracy = 2)), position = position_stack(.5)) +
      ggtitle("Match Rates for Merge Table", ) +
      theme(plot.title = element_text(hjust = 0.5)) +
      facet_grid(~V2) 
  }
  
  # case for general match rate table
  else if(table_match_rates == TRUE){
    # break up data into sections
    top_half <- final_table[1:(nrow(final_table)/2)]
    botton_half <- final_table[(nrow(final_table)/2):nrow(final_table)]
    
    # create data frame with summary stats
    final_plot <- data.frame(
      result_name = c("TOTAL MATCH RATE", "DEFINITE MATCH RATE", "APPROXIMATE MATCH RATE", "TOP HALF DEFINITE MATCHES", "BOTTOM HALF DEFINITE MATCHES"),
      results = c(100*round(nrow(final_table[is_match == TRUE])/nrow(final_table), 3), 100*round(nrow(final_table[def_match == TRUE])/nrow(final_table), 3), 100*round(nrow(final_table[def_match == FALSE])/nrow(final_table), 3), 100*round(nrow(top_half[def_match == TRUE])/nrow(top_half), 3),  100*round(nrow(botton_half[def_match == TRUE])/nrow(botton_half), 5))
    )
  }
  
  # case when no option is selected
  else{
    print("No option selected.")
  }
  
  return(final_plot)
}

# Section 3: Load Data to R
##################################################

# internal data
raw_internal_data <- fread("")
internal_data      <- as.data.table(copy(raw_internal_data))

# external data 
raw_external_data  <- fread("")
external_data      <- as.data.table(copy(raw_external_data))

# Section 4: Format Data for Algorithm
##################################################

# internal data indexing for merging
internal_data$internal_index <- as.numeric(seq.int(nrow(internal_data)))

# external data indexing for merging
external_data$external_index <- as.numeric(seq.int(nrow(external_data)))

# DATA CLEANING NOTES: 
 # if domain, city, or industry columns exist in datasets
  # 1) rename internal domain column to 'in_domain'
  # 2) rename external domain column to 'ex_domain'
  # 3) rename internal city column to 'in_city'
  # 4) rename external city column to 'ex_city'
  # 5) rename internal industry column to 'in_industry'
  # 6) rename external industry column to 'ex_industry'
 # domain name should be 'http://www.random_website.com" format
  # 1) use/modify clean_domain_name_data() function
 # names should be cleaned and properly formatted before running algorithm
 ## using the clean_name_data() function

# setnames(internal_data, "", "")
# setnames(external_data, "", "")

# clean_domain_name_data()

# clean_name_data()

## Section 5: Run Algorithm
##################################################

# call algorithm selection function here with desired configuration here
# algorithm_selection()

## Section 6: Analyze/Modify Algorithm Results
##################################################

# add confidence threshold for approximate matches using match_thresholds function
# match_thresholds()

## Section 7: Generate Results
##################################################

# curate results via the results_output function
# results_output()


# Section 8: Save Results
##################################################

# save results via the save_object function
# save_object()