# Merge Datasets

<h2>Packages</h2>

In [4]:
library(tidyverse)

<h2>Functions</h2>

In [5]:
# Converts every factor column of a data frame to character.
#
# Args:
#   df: A data frame (or list of columns).
# Returns:
#   `df` with each factor column (ordered factors included) replaced by its
#   character labels; all other columns are left untouched.
unfactorize <- function(df) {
  # is.factor() gives exactly one logical per column. Comparing class()
  # strings is unreliable: for multi-class columns (e.g. ordered factors)
  # sapply() returns a list or a matrix, so the resulting indices are wrong.
  factor_cols <- which(vapply(df, is.factor, logical(1)))
  for (i in factor_cols) {
    df[[i]] <- as.character(df[[i]])
  }
  df
}

In [6]:
# Converts every character column of a data frame to factor.
#
# Args:
#   df: A data frame (or list of columns).
# Returns:
#   `df` with each character column replaced by as.factor() of that column;
#   all other columns are left untouched.
factorize <- function(df) {
  # is.character() gives exactly one logical per column. Comparing class()
  # strings is unreliable: when a column carries more than one class,
  # sapply() returns a list or a matrix and columns are skipped or mis-indexed.
  character_cols <- which(vapply(df, is.character, logical(1)))
  for (i in character_cols) {
    df[[i]] <- as.factor(df[[i]])
  }
  df
}

<h2>Data</h2>

In [11]:
# Read the author/document metadata, making sure no column is left as a factor
dfMeta <- read.csv("20240628_PhD_Diaries.csv") %>%
  unfactorize()
# Sort the rows by docid (ascending, the default direction of order())
dfMeta <- dfMeta[order(dfMeta$docid), ]
dfMeta %>% glimpse()

“incomplete final line found by readTableHeader on '20240628_PhD_Diaries.csv'”


Rows: 4
Columns: 18
$ docid          <chr> "D0002", "D0003", "D0007", "D0009"
$ docyear        <int> 1883, 1858, 1865, 1871
$ docmonth       <lgl> NA, NA, NA, NA
$ authorName     <chr> "Anne F. Richards", "Henry H. Adams", "John Hart", "Edi…
$ docauthorid    <chr> "D0002", "D0003", "D0007", "D0009"
$ authorLocation <chr> "Australia", "Australia", "Australia", "Australia"
$ authorGender   <chr> "F", "M", "M", "F"
$ nationalOrigin <chr> "English", NA, "English", "English"
$ irish          <lgl> FALSE, NA, FALSE, FALSE
$ otherUK        <lgl> TRUE, NA, TRUE, TRUE
$ relMin         <lgl> FALSE, FALSE, FALSE, FALSE
$ catholic       <lgl> FALSE, FALSE, FALSE, FALSE
$ otherChristian <lgl> TRUE, TRUE, TRUE, TRUE
$ U              <lgl>

In [12]:
# Column names
#names(dfMeta)

In [14]:
# Load the diary text with its token counts and sentiment scores.
# The first column of the CSV is dropped; presumably it is the row-name
# column added by an earlier write.csv() call (TODO confirm against the file).
dfNarrative <- unfactorize(read.csv("20240628_PhD_SentimentDiaries.csv"))[-1]
# Strip the ".txt" extension from the docids in dfNarrative.
# The dot is escaped and the pattern anchored to the end of the string, so
# only a literal trailing ".txt" is removed; an unescaped "." would match any
# character and could eat part of a docid that merely contains "txt".
dfNarrative$docid <- sub("\\.txt$", "", dfNarrative$docid)
glimpse(dfNarrative)

Rows: 4
Columns: 9
$ text             <chr> "Charra Wednesday Nov 4th 1883 A nice cool day Mrs Ro…
$ docid            <chr> "D0002", "D0003", "D0007", "D0009"
$ totalTokens      <int> 81551, 19635, 54919, 28146
$ uniqueTokens     <int> 6885, 1939, 6332, 5644
$ lexicalDiversity <dbl> 0.08442570, 0.09875223, 0.11529707, 0.20052583
$ scoreNeg         <dbl> 0.05703855, 0.02814008, 0.05047711, 0.03900746
$ scoreNeu         <dbl> 0.8936699, 0.9166144, 0.8967719, 0.9102920
$ scorePos         <dbl> 0.04931566, 0.05386269, 0.05274381, 0.05068750
$ scoreCom         <dbl> -0.17875711, 0.07031789, -0.01703646, 0.02562369


In [15]:
# Do both tables list exactly the same docids, in the same order?
identical(x = dfMeta$docid, y = dfNarrative$docid)

In [16]:
# Merge the two tables on docid, keeping every row of dfNarrative
df <- dfMeta %>%
  right_join(dfNarrative, by = "docid")
# glimpse(df)
# df[1, c("docid", "text")]

In [17]:
# Identifier and categorical columns that should be stored as factors
vars <- c(
  "docauthorid", "authorName", "docid",
  "authorGender", "nationalOrigin", "authorLocation"
)
# Only the character columns among these are converted by factorize()
df[vars] <- factorize(df[vars])

In [18]:
# Inspect the column types of the merged table
# (alternative checks, kept for reference)
# lapply(df, class)
# summary(df)
df %>% glimpse()

Rows: 4
Columns: 26
$ docid            <fct> D0002, D0003, D0007, D0009
$ docyear          <int> 1883, 1858, 1865, 1871
$ docmonth         <lgl> NA, NA, NA, NA
$ authorName       <fct> Anne F. Richards, Henry H. Adams, John Hart, Edith C.…
$ docauthorid      <fct> D0002, D0003, D0007, D0009
$ authorLocation   <fct> Australia, Australia, Australia, Australia
$ authorGender     <fct> F, M, M, F
$ nationalOrigin   <fct> English, NA, English, English
$ irish            <lgl> FALSE, NA, FALSE, FALSE
$ otherUK          <lgl> TRUE, NA, TRUE, TRUE
$ relMin           <lgl> FALSE, FALSE, FALSE, FALSE
$ catholic         <lgl> FALSE, FALSE, FALSE, FALSE
$ otherChristian   <lgl> TRUE, TRUE, TRUE, TRUE
$ U                <lgl> FALSE, FALS

In [19]:
# Save the merged table; write.csv() also writes the row names as an
# unnamed first column by default.
write.csv(x = df, file = "20240628_PhD_Data4TopicModel-Diary.csv")