# Merge Datasets

<h2>Packages</h2>

In [5]:
library(tidyverse)

<h2>Functions</h2>

In [6]:
# Convert every factor column of a data frame to character.
#
# Args:
#   df: A data frame.
# Returns:
#   `df` with each factor column (ordered factors included) replaced by its
#   labels as a character vector. All other columns are left untouched.
unfactorize <- function(df) {
  # Test columns with is.factor() rather than comparing class() strings:
  # ordered factors have the two-element class c("ordered", "factor"), so a
  # string comparison skips them. vapply() also guarantees exactly one
  # logical per column, whatever the column classes are.
  is_fct <- vapply(df, is.factor, logical(1))
  for (j in which(is_fct)) {
    df[[j]] <- as.character(df[[j]])
  }
  df
}

In [7]:
# Convert every character column of a data frame to a factor.
#
# Args:
#   df: A data frame.
# Returns:
#   `df` with each column of class "character" replaced by as.factor() of
#   that column. All other columns are left untouched.
factorize <- function(df) {
  # vapply() always returns one logical per column. Comparing the result of
  # sapply(df, class) against a string can instead operate on a list or a
  # matrix when columns carry multi-element class vectors, which makes the
  # positions returned by which() unreliable as column indices.
  is_chr <- vapply(
    df,
    function(col) inherits(col, "character"),
    logical(1)
  )
  for (j in which(is_chr)) {
    df[[j]] <- as.factor(df[[j]])
  }
  df
}

<h2>Data</h2>

In [8]:
# Read the author/document metadata CSV and turn any factor columns into
# character columns
dfMeta <- read.csv("20240405_PhD_NaildohSubset.csv") %>%
  unfactorize()
# glimpse(dfMeta)

In [9]:
# List the column names of the metadata table
colnames(dfMeta)

In [11]:
# Read the narrative data (the sentiment-letter CSV) and turn any factor
# columns into character columns
dfNarrative <- unfactorize(read.csv("20240405_PhD_SentimentLetter.csv"))
# Drop the first column (presumably a row index written when the CSV was
# saved -- TODO confirm)
dfNarrative <- dfNarrative[-1]
# head(dfNarrative)

In [12]:
# Strip the ".txt" file extension from the docids in dfNarrative.
# The dot is escaped and the pattern is anchored with "$" so that only a
# literal trailing ".txt" is removed. An unescaped "." matches any character,
# so the pattern ".txt" would also delete sequences such as "atxt" anywhere
# inside an id.
dfNarrative$docid <- sub("\\.txt$", "", dfNarrative$docid)

In [13]:
# Check whether both tables hold exactly the same docid vector (same values,
# same order, same type); the result is only printed, not acted upon
identical(dfNarrative$docid, dfMeta$docid)

In [14]:
# Merge the two tables on docid; a right join keeps every row of dfNarrative
# and adds the matching metadata columns from dfMeta
df <- dfMeta %>%
  right_join(dfNarrative, by = "docid")
# glimpse(df)
# df[1, c("docid", "text")]

In [15]:
# Columns to store as factors; factorize() converts only those that are
# currently character columns
vars <- c(
  "docauthorid", "docauthorname", "docid",
  "authorgender", "nationalOrigin", "authorLocation"
)
df[vars] <- factorize(df[vars])

In [16]:
# Inspect the column types of the merged data
# lapply(df, class)
# summary(df)
dplyr::glimpse(df)

Rows: 492
Columns: 24
$ docauthorid      <fct> per0001043, per0001043, per0001043, per0001043, per00…
$ docauthorname    <fct> "Segale, Sister Blandina, 1850-1941", "Segale, Sister…
$ docid            <fct> S1019-D002, S1019-D004, S1019-D005, S1019-D006, S1019…
$ docyear          <int> 1872, 1872, 1872, 1872, 1873, 1873, 1873, 1874, 1874,…
$ docmonth         <int> 11, 12, 12, 12, 3, 7, 9, 6, 11, 6, 9, 12, 1, 3, 3, 6,…
$ authorgender     <fct> F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,…
$ agewriting       <int> 22, 22, 22, 22, 23, 23, 23, 24, 24, 26, 26, 26, 27, 2…
$ agedeath         <int> 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 9…
$ relMin           <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,…
$ nationalOrigin   <fct> Italian, Italian, Italian, Italian, Italian

In [17]:
# Save the merged data frame as CSV. With write.csv's default settings the
# row names are written as an extra, unnamed first column.
write.csv(df, file = "20240405_PhD_Data4TopicModel-Letter.csv")