# Merge Datasets

<h2>Packages</h2>

In [3]:
library(tidyverse)

<h2>Functions</h2>

In [4]:
# Converts every factor column of a data frame to character.
#
# Args:
#   df: A data frame.
# Returns:
#   `df` with each factor column (ordered factors included) replaced by its
#   labels as a character vector; all other columns are returned unchanged.
unfactorize <- function(df) {
  # is.factor() is TRUE for anything inheriting from "factor", so columns
  # carrying several classes (e.g. c("ordered", "factor")) are detected too.
  # vapply() always yields exactly one logical per column, whereas comparing
  # sapply(df, class) to a string breaks once a column has more than one class.
  is_fct <- vapply(df, is.factor, logical(1))
  for (i in which(is_fct)) {
    df[[i]] <- as.character(df[[i]])
  }
  df
}

In [5]:
# Converts every character column of a data frame to factor.
#
# Args:
#   df: A data frame.
# Returns:
#   `df` with each character column replaced by as.factor() of that column;
#   all other columns are returned unchanged.
factorize <- function(df) {
  # vapply() with is.character() yields exactly one logical per column, so the
  # column positions stay correct even when other columns carry several
  # classes (e.g. POSIXct columns), unlike comparing sapply(df, class) to a
  # string.
  is_chr <- vapply(df, is.character, logical(1))
  for (i in which(is_chr)) {
    df[[i]] <- as.factor(df[[i]])
  }
  df
}

<h2>Data</h2>

In [6]:
# Read the author/document metadata and make sure no column is a factor
dfMeta <- read.csv("20240628_PhD_Diaries.csv")
dfMeta <- unfactorize(dfMeta)
# Sort the rows by ascending docid
dfMeta <- dfMeta[order(dfMeta$docid), ]
glimpse(dfMeta)

“incomplete final line found by readTableHeader on '20240628_PhD_Diaries.csv'”


Rows: 4
Columns: 18
$ docid          <chr> "D0002", "D0003", "D0007", "D0009"
$ docyear        <int> 1883, 1858, 1865, 1871
$ docmonth       <lgl> NA, NA, NA, NA
$ authorName     <chr> "Anne F. Richards", "Henry H. Adams", "John Hart", "Edi…
$ docauthorid    <chr> "D0002", "D0003", "D0007", "D0009"
$ authorLocation <chr> "Australia", "Australia", "Australia", "Australia"
$ authorGender   <chr> "F", "M", "M", "F"
$ nationalOrigin <chr> "English", NA, "English", "English"
$ irish          <lgl> FALSE, NA, FALSE, FALSE
$ otherUK        <lgl> TRUE, NA, TRUE, TRUE
$ relMin         <lgl> FALSE, FALSE, FALSE, FALSE
$ catholic       <lgl> FALSE, FALSE, FALSE, FALSE
$ otherChristian <lgl> TRUE, TRUE, TRUE, TRUE
$ U              <lgl>

In [6]:
# Column names
#names(dfMeta)

In [7]:
# Load the chunk-level narrative data (text plus token and sentiment scores).
# The first column of the CSV is dropped by position -- presumably an exported
# row index; verify against the file.
dfNarrative <- unfactorize(read.csv("20240701_PhD_SentimentDiary-Chunks.csv"))[-1]
# Strip a trailing ".txt" extension from the docids in dfNarrative. The dot is
# escaped and the pattern anchored so only a literal ".txt" suffix is removed
# (an unescaped "." matches any character, anywhere in the string).
dfNarrative$docid <- sub("\\.txt$", "", dfNarrative$docid)
glimpse(dfNarrative)

Rows: 1,023
Columns: 12
$ text             <chr> "Charra Wednesday Nov 4th 1883 A nice cool day Mrs Ro…
$ docid            <chr> "D0002", "D0002", "D0002", "D0002", "D0002", "D0002",…
$ sequence         <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16…
$ totalTokens      <int> 277, 281, 276, 270, 282, 279, 280, 286, 282, 269, 269…
$ uniqueTokens     <int> 171, 170, 170, 163, 147, 164, 169, 181, 169, 161, 165…
$ lexicalDiversity <dbl> 0.6173285, 0.6049822, 0.6159420, 0.6037037, 0.5212766…
$ scoreNeg         <dbl> 0.070000000, 0.044333333, 0.074750000, 0.045600000, 0…
$ scoreNeu         <dbl> 0.8910000, 0.9076667, 0.8845000, 0.9256000, 0.8922857…
$ scorePos         <dbl> 0.039000000, 0.048000000, 0.041000000, 0.028600000, 0…
$ scoreCom         <dbl> -0.37976667, -0.05883333, -0.13772500, -0

In [8]:
# Do both tables cover exactly the same set of docids? setequal() ignores
# order and duplicates, so a different row order in the two tables (dfMeta is
# sorted by docid, dfNarrative is in file order) cannot cause a false mismatch.
setequal(dfMeta$docid, dfNarrative$docid)

In [9]:
# Merge datasets: attach the metadata columns to the narrative rows, matching
# on docid (a right join keeps every row of dfNarrative)
df <- dfMeta %>%
  right_join(dfNarrative, by = "docid")
# glimpse(df)

In [10]:
# Identifier and categorical columns that should be stored as factors
vars <- c(
  "docauthorid", "authorName", "docid",
  "authorGender", "nationalOrigin", "authorLocation"
)
# Replace just those columns with their factorized versions
df[vars] <- factorize(df[vars])

In [11]:
# Inspect column classes of the merged data (alternatives kept for reference)
# lapply(df, class)
# summary(df)
dplyr::glimpse(df)

Rows: 1,023
Columns: 29
$ docid            <fct> D0002, D0002, D0002, D0002, D0002, D0002, D0002, D000…
$ docyear          <int> 1883, 1883, 1883, 1883, 1883, 1883, 1883, 1883, 1883,…
$ docmonth         <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ authorName       <fct> Anne F. Richards, Anne F. Richards, Anne F. Richards,…
$ docauthorid      <fct> D0002, D0002, D0002, D0002, D0002, D0002, D0002, D000…
$ authorLocation   <fct> Australia, Australia, Australia, Australia, Australia…
$ authorGender     <fct> F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,…
$ nationalOrigin   <fct> English, English, English, English, English, English,…
$ irish            <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALS…
$ otherUK          <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,

In [12]:
# Save the merged table as CSV. With write.csv()'s defaults the row names are
# written as an unnamed leading column.
write.csv(x = df, file = "20240701_PhD_Data4TopicModel-DiaryChunk.csv")