# Merge Datasets

<h2>Packages</h2>

In [2]:
library(tidyverse)

<h2>Functions</h2>

In [3]:
# Converts every factor column of a data frame to character class.
#
# Args:
#   df: A data frame (or list of columns).
# Returns:
#   `df` with each factor column (ordered factors included) replaced by its
#   character labels; all other columns are left untouched.
unfactorize <- function(df) {
  # is.factor() yields exactly one TRUE/FALSE per column. Comparing class()
  # strings does not: an ordered factor has class c("ordered", "factor"), so
  # it was skipped or produced out-of-range column indices.
  is_fct <- vapply(df, is.factor, logical(1))
  for (i in which(is_fct)) {
    df[[i]] <- as.character(df[[i]])
  }
  df
}

In [4]:
# Converts every character column of a data frame to factor class.
#
# Args:
#   df: A data frame (or list of columns).
# Returns:
#   `df` with each character column replaced by as.factor() of that column;
#   all other columns are left untouched.
factorize <- function(df) {
  # is.character() yields exactly one TRUE/FALSE per column. Comparing class()
  # strings does not: columns with more than one class make sapply() return a
  # list or matrix, so columns were skipped or indexed out of range.
  is_chr <- vapply(df, is.character, logical(1))
  for (i in which(is_chr)) {
    df[[i]] <- as.factor(df[[i]])
  }
  df
}

<h2>Data</h2>

In [5]:
# Read the author/document metadata and make sure no column is left as a factor
dfMeta <- read.csv("20240606_PhD_Letters.csv") %>%
  unfactorize()
# Sort the rows into ascending docid order
dfMeta <- dfMeta[order(dfMeta$docid), ]
dfMeta %>% glimpse()

Rows: 676
Columns: 18
$ docid          [3m[90m<chr>[39m[23m "20910", "21062", "21324", "21334", "21354", "21470", "…
$ docyear        [3m[90m<int>[39m[23m 1891, 1871, 1892, 1891, 1890, 1800, 1839, 1838, 1844, 1…
$ docmonth       [3m[90m<int>[39m[23m 7, 11, 5, 10, 2, 3, 1, 9, 12, 4, 10, 1, 5, 6, 10, 10, 1…
$ authorName     [3m[90m<chr>[39m[23m "Isabella Weir Moore", "E. Rothwell", "Isabella Weir Mo…
$ docauthorid    [3m[90m<chr>[39m[23m "IED0107", "IED0179", "IED0107", "IED0621", "IED0958", …
$ authorLocation [3m[90m<chr>[39m[23m "USA", "Canada", "USA", "USA", "USA", "USA", "USA", "US…
$ authorGender   [3m[90m<chr>[39m[23m "F", "F", "F", "F", "M", "F", "F", "F", "F", "F", "F", …
$ nationalOrigin [3m[90m<chr>[39m[23m "Irish", "Irish", "Irish", "Irish", "Irish", "Irish", "…
$ irish          [3m[90m<lgl>[39m[23m TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, T…
$ otherUK        [3m[90m<lgl>[39m[23m FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL

In [6]:
# Column names
#names(dfMeta)

In [9]:
# Load the chunk-level letter text and sentiment scores. The first column of
# the CSV is dropped (presumably a row-index column written on export --
# TODO confirm).
dfNarrative <- unfactorize(
  read.csv("20240607_PhD_SentimentLetters-Chunks.csv")
)[-1]
# Strip the literal ".txt" from docids in dfNarrative. fixed = TRUE is required:
# as a regular expression, "." matches any character, so the pattern ".txt"
# would also delete text such as "atxt".
dfNarrative$docid <- gsub(".txt", "", dfNarrative$docid, fixed = TRUE)
glimpse(dfNarrative)

Rows: 2,392
Columns: 12
$ text             [3m[90m<chr>[39m[23m "July 18 1891 Dear Sister I have waited until I could…
$ docid            [3m[90m<chr>[39m[23m "20910", "21062", "21062", "21324", "21334", "21334",…
$ sequence         [3m[90m<int>[39m[23m 1, 1, 2, 1, 1, 2, 1, 2, 1, 2, 3, 1, 2, 3, 4, 5, 6, 7,…
$ totalTokens      [3m[90m<int>[39m[23m 182, 269, 122, 210, 287, 220, 290, 225, 274, 279, 129…
$ uniqueTokens     [3m[90m<int>[39m[23m 119, 163, 97, 139, 174, 140, 155, 145, 166, 163, 87, …
$ lexicalDiversity [3m[90m<dbl>[39m[23m 0.6538462, 0.6059480, 0.7950820, 0.6619048, 0.6062718…
$ scoreNeg         [3m[90m<dbl>[39m[23m 0.000000000, 0.041000000, 0.040750000, 0.046000000, 0…
$ scoreNeu         [3m[90m<dbl>[39m[23m 0.8550000, 0.7486667, 0.6670000, 0.7990000, 0.8192222…
$ scorePos         [3m[90m<dbl>[39m[23m 0.14500000, 0.21050000, 0.29225000, 0.15500000, 0.135…
$ scoreCom         [3m[90m<dbl>[39m[23m 0.51510000, 0.27973333, 0.08157500, 0.942

In [10]:
# Do both datasets contain the same set of docids? setequal() ignores row order
# and duplicates, so a difference in sort order alone cannot produce FALSE.
setequal(dfMeta$docid, dfNarrative$docid)

In [11]:
# Merge datasets: keep every row of dfNarrative and attach the dfMeta columns
# that share its docid
df <- dfMeta %>%
  right_join(dfNarrative, by = "docid")
# glimpse(df)

In [12]:
# Columns that should be stored as factors
vars <- c(
  "docauthorid", "authorName", "docid",
  "authorGender", "nationalOrigin", "authorLocation"
)
# factorize() turns the character columns among them into factors
df[vars] <- factorize(df[vars])

In [13]:
# Inspect the column classes of the merged data
# lapply(df, class)
# summary(df)
df %>% glimpse()

Rows: 2,392
Columns: 29
$ docid            [3m[90m<fct>[39m[23m 20910, 21062, 21062, 21324, 21334, 21334, 21354, 2135…
$ docyear          [3m[90m<int>[39m[23m 1891, 1871, 1871, 1892, 1891, 1891, 1890, 1890, 1800,…
$ docmonth         [3m[90m<int>[39m[23m 7, 11, 11, 5, 10, 10, 2, 2, 3, 3, 3, 1, 1, 1, 1, 1, 1…
$ authorName       [3m[90m<fct>[39m[23m Isabella Weir Moore, E. Rothwell, E. Rothwell, Isabel…
$ docauthorid      [3m[90m<fct>[39m[23m IED0107, IED0179, IED0179, IED0107, IED0621, IED0621,…
$ authorLocation   [3m[90m<fct>[39m[23m USA, Canada, Canada, USA, USA, USA, USA, USA, USA, US…
$ authorGender     [3m[90m<fct>[39m[23m F, F, F, F, F, F, M, M, F, F, F, F, F, F, F, F, F, F,…
$ nationalOrigin   [3m[90m<fct>[39m[23m Irish, Irish, Irish, Irish, Irish, Irish, Irish, Iris…
$ irish            [3m[90m<lgl>[39m[23m TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,…
$ otherUK          [3m[90m<lgl>[39m[23m FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,

In [14]:
# Save the merged data. By default write.csv() also writes the row names as a
# leading, unnamed column.
write.csv(x = df, file = "20240608_PhD_Data4TopicModel-LetterChunk.csv")