# Merge Datasets

<h2>Packages</h2>

In [2]:
library(tidyverse)

<h2>Functions</h2>

In [3]:
# Converts all factor columns of a data frame to character class.
#
# Args:
#   df: A data frame.
# Returns:
#   `df` with every factor column (ordered factors included) replaced by its
#   labels as a character vector. All other columns are returned unchanged.
unfactorize <- function(df) {
  # vapply() always yields exactly one logical per column, and is.factor() is
  # TRUE for ordered factors too. Comparing sapply(df, class) with "factor"
  # skipped ordered factors and mis-indexed columns whenever every column
  # carried a multi-element class vector.
  factor_cols <- which(vapply(df, is.factor, logical(1)))
  for (i in factor_cols) {
    df[[i]] <- as.character(df[[i]])
  }
  df
}

In [4]:
# Converts plain character columns of a data frame to factor class.
#
# Args:
#   df: A data frame.
# Returns:
#   `df` with every column whose class is exactly "character" replaced by
#   as.factor() of that column. Columns of any other class, including
#   character vectors wrapped in I(), are returned unchanged.
factorize <- function(df) {
  # vapply() guarantees one logical per column; sapply(df, class) could return
  # a list or a matrix depending on the columns' class vectors.
  is_plain_chr <- vapply(
    df,
    function(col) identical(class(col), "character"),
    logical(1)
  )
  for (i in which(is_plain_chr)) {
    df[[i]] <- as.factor(df[[i]])
  }
  df
}

<h2>Data</h2>

In [33]:
# Read the author/document metadata, making sure no column is left as a factor
dfMeta <- read.csv("20240606_PhD_Letters.csv")
dfMeta <- unfactorize(dfMeta)
# Sort the rows by docid, ascending
dfMeta <- dfMeta[order(dfMeta[["docid"]]), ]
glimpse(dfMeta)

Rows: 676
Columns: 18
$ docid          <chr> "20910", "21062", "21324", "21334", "21354", "21470", "…
$ docyear        <int> 1891, 1871, 1892, 1891, 1890, 1800, 1839, 1838, 1844, 1…
$ docmonth       <int> 7, 11, 5, 10, 2, 3, 1, 9, 12, 4, 10, 1, 5, 6, 10, 10, 1…
$ authorName     <chr> "Isabella Weir Moore", "E. Rothwell", "Isabella Weir Mo…
$ docauthorid    <chr> "IED0107", "IED0179", "IED0107", "IED0621", "IED0958", …
$ authorLocation <chr> "USA", "Canada", "USA", "USA", "USA", "USA", "USA", "US…
$ authorGender   <chr> "F", "F", "F", "F", "M", "F", "F", "F", "F", "F", "F", …
$ nationalOrigin <chr> "Irish", "Irish", "Irish", "Irish", "Irish", "Irish", "…
$ irish          <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, T…
$ otherUK        <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL

In [34]:
# Column names
#names(dfMeta)

In [35]:
# Load the per-letter narrative data (letter text, token counts and sentiment
# scores). The first column of the CSV is dropped with [-1].
# NOTE(review): presumably that column is the row-name index added by
# write.csv() -- confirm against the file.
dfNarrative <- unfactorize(read.csv("20240607_PhD_SentimentLetters.csv"))[-1]
# Strip the ".txt" extension from the docids in dfNarrative. The dot is escaped
# and the pattern anchored to the end of the id: an unescaped ".txt" is a regex
# that matches any character followed by "txt", anywhere in the string.
dfNarrative$docid <- sub("\\.txt$", "", dfNarrative$docid)
glimpse(dfNarrative)

Rows: 676
Columns: 9
$ text             <chr> "July 18 1891 Dear Sister I have waited until I could…
$ docid            <chr> "20910", "21062", "21324", "21334", "21354", "21470",…
$ totalTokens      <int> 182, 303, 210, 403, 414, 489, 1353, 994, 899, 265, 12…
$ uniqueTokens     <int> 119, 184, 139, 222, 210, 250, 567, 466, 361, 151, 507…
$ lexicalDiversity <dbl> 0.6538462, 0.6072607, 0.6619048, 0.5508685, 0.5072464…
$ scoreNeg         <dbl> 0.00000000, 0.03512500, 0.04600000, 0.04511111, 0.028…
$ scoreNeu         <dbl> 0.8550000, 0.7101250, 0.7990000, 0.8120000, 0.8460000…
$ scorePos         <dbl> 0.14500000, 0.25475000, 0.15500000, 0.14288889, 0.125…
$ scoreCom         <dbl> 0.51510000, 0.25835000, 0.94230000, 0.14520000, 0.887…


In [36]:
# Do both tables list exactly the same docids, in the same order?
identical(dfMeta[["docid"]], dfNarrative[["docid"]])

In [37]:
# Merge the metadata and narrative tables on docid, keeping every row of
# dfNarrative (right join)
df <- dfMeta %>%
  right_join(dfNarrative, by = "docid")
# glimpse(df)
# df[1, c("docid", "text")]

In [38]:
# Turn the identifier and categorical columns of the merged data into factors
vars <- c(
  "docauthorid", "authorName", "docid",
  "authorGender", "nationalOrigin", "authorLocation"
)
df[vars] <- factorize(df[vars])

In [39]:
# Inspect the merged data: column classes and a preview of the values
# lapply(df, class)
# summary(df)
df %>% glimpse()

Rows: 676
Columns: 26
$ docid            <fct> 20910, 21062, 21324, 21334, 21354, 21470, 21549, 2156…
$ docyear          <int> 1891, 1871, 1892, 1891, 1890, 1800, 1839, 1838, 1844,…
$ docmonth         <int> 7, 11, 5, 10, 2, 3, 1, 9, 12, 4, 10, 1, 5, 6, 10, 10,…
$ authorName       <fct> Isabella Weir Moore, E. Rothwell, Isabella Weir Moore…
$ docauthorid      <fct> IED0107, IED0179, IED0107, IED0621, IED0958, IED0099,…
$ authorLocation   <fct> USA, Canada, USA, USA, USA, USA, USA, USA, USA, USA, …
$ authorGender     <fct> F, F, F, F, M, F, F, F, F, F, F, F, F, M, M, F, F, F,…
$ nationalOrigin   <fct> Irish, Irish, Irish, Irish, Irish, Irish, Irish, Iris…
$ irish            <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,…
$ otherUK          <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, F

In [40]:
# Save the merged data set to CSV for the topic-model step
write.csv(x = df, file = "20240608_PhD_Data4TopicModel-Letter.csv")