# Merge Datasets

<h2>Packages</h2>

In [2]:
library(tidyverse)

<h2>Functions</h2>

In [3]:
# Converts every factor column of a data frame to character class.
#
# Columns are detected with is.factor(), so ordered factors (whose class is
# c("ordered", "factor")) are converted as well. Non-factor columns are
# returned untouched.
#
# Args:
#   df: a data frame (or list of columns).
# Returns:
#   df with each factor column replaced by as.character() of that column.
unfactorize <- function(df) {
  # vapply() always yields exactly one logical per column. Comparing
  # sapply(df, class) against "factor" breaks down as soon as a column
  # carries more than one class (list or matrix result instead of a vector).
  is_fct <- vapply(df, is.factor, logical(1))
  for (i in which(is_fct)) {
    df[[i]] <- as.character(df[[i]])
  }
  df
}

In [4]:
# Converts every character column of a data frame to factor class.
#
# Columns are detected with is.character(), which is evaluated once per
# column and therefore is unaffected by other columns that carry several
# classes (e.g. POSIXct). Non-character columns are returned untouched.
#
# Args:
#   df: a data frame (or list of columns).
# Returns:
#   df with each character column replaced by as.factor() of that column.
factorize <- function(df) {
  # vapply() guarantees a logical vector with one entry per column, whereas
  # sapply(df, class) may return a list or a matrix depending on the input.
  is_chr <- vapply(df, is.character, logical(1))
  for (i in which(is_chr)) {
    df[[i]] <- as.factor(df[[i]])
  }
  df
}

<h2>Data</h2>

In [5]:
# Read the author/document metadata table from CSV and pass it through
# unfactorize() so that any factor columns become plain character.
dfMeta <- read.csv("20240405_PhD_NaildohSubset.csv") %>%
  unfactorize()
# glimpse(dfMeta)

In [6]:
# List the column names of the metadata table
colnames(dfMeta)

In [7]:
# Read the letter-chunk table from CSV and make sure any factor columns are
# plain character.
dfNarrative <- read.csv("20240405_PhD_SentimentLetters-Chunks.csv") %>%
  unfactorize()
# Drop the first column.
# NOTE(review): presumably a row-index column left over from an earlier
# write.csv() call -- confirm against the file.
dfNarrative <- dfNarrative[-1]
# head(dfNarrative)
# Number of rows loaded
nrow(dfNarrative)

In [8]:
# Do both tables reference exactly the same set of docid values?
# setequal() ignores row order and duplicates, so the answer reflects key
# coverage for the join below rather than how each file happens to be sorted
# (identical() on unique() values is FALSE when only the order differs).
base::setequal(dfMeta$docid, dfNarrative$docid)

In [9]:
# Merge the two tables on docid. right_join() keeps every row of
# dfNarrative and attaches the matching metadata columns from dfMeta.
df <- dfMeta %>%
  right_join(dfNarrative, by = "docid")
# glimpse(df)

In [10]:
# Identifier and categorical columns that should be stored as factors
vars <- c(
  "docauthorid", "docauthorname", "docid",
  "authorgender", "nationalOrigin", "authorLocation"
)
# factorize() only touches character columns, so this converts the listed
# columns in place and leaves the rest of df unchanged.
df[vars] <- df[vars] %>% factorize()

In [11]:
# Inspect column classes after conversion
# (alternatives kept for reference: lapply(df, class), summary(df))
df %>% glimpse()

Rows: 2,270
Columns: 27
$ docauthorid      <fct> per0001043, per0001043, per0001043, per0001043, per00…
$ docauthorname    <fct> "Segale, Sister Blandina, 1850-1941", "Segale, Sister…
$ docid            <fct> S1019-D002, S1019-D002, S1019-D002, S1019-D002, S1019…
$ docyear          <int> 1872, 1872, 1872, 1872, 1872, 1872, 1872, 1872, 1872,…
$ docmonth         <int> 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 1…
$ authorgender     <fct> F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,…
$ agewriting       <int> 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 2…
$ agedeath         <int> 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 9…
$ relMin           <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,…
$ nationalOrigin   <fct> Italian, Italian, Italian, Italian, Itali

In [12]:
# Save the merged table for the topic-model step. write.csv() writes row
# names by default, so the file starts with an unnamed index column.
write.csv(x = df, file = "20240405_PhD_Data4TopicModel-LetterChunk.csv")