# Merge Datasets

In [4]:
# Resources
library(tidyverse)

In [2]:
# Converts all factor columns (ordered factors included) to character class.
#
# df: a data frame (or tibble).
# Returns df with every factor column replaced by the character labels of its
# values; every other column is left untouched.
unfactorize <- function(df) {
  # Test each column with is.factor() through vapply() so the result is always
  # one logical per column. Comparing sapply(df, class) to "factor" skips
  # ordered factors (class c("ordered", "factor")) and, when every column has
  # several classes, indexes into a class matrix instead of the columns.
  factor_cols <- which(vapply(df, is.factor, logical(1)))
  for (i in factor_cols) {
    df[[i]] <- as.character(df[[i]])
  }
  df
}
# Adapted from code by user "By0" at https://stackoverflow.com/questions/2851015/convert-data-frame-columns-from-factors-to-characters (line 14)

In [3]:
# Converts all character columns to factor class.
#
# df: a data frame (or tibble).
# Returns df with every character column replaced by as.factor() of that
# column; every other column is left untouched.
factorize <- function(df) {
  # is.character() via vapply() always yields one logical per column, whereas
  # sapply(df, class) == "character" can return a matrix or list and then
  # select the wrong columns (or none) when columns carry extra classes.
  character_cols <- which(vapply(df, is.character, logical(1)))
  for (i in character_cols) {
    df[[i]] <- as.factor(df[[i]])
  }
  df
}

In [5]:
# Read the author/document metadata, then turn any factor columns into
# character columns
metaData <- read.csv("20201219_AM_Meta2Merge.csv")
metaData <- unfactorize(metaData)
glimpse(metaData)

Rows: 915
Columns: 48
$ X                       [3m[38;5;246m<int>[39m[23m 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14…
$ docsequence             [3m[38;5;246m<int>[39m[23m 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 3…
$ docid                   [3m[38;5;246m<chr>[39m[23m "S10003-D023", "S10003-D024", "S10003-D025", …
$ docyear                 [3m[38;5;246m<int>[39m[23m 1836, 1836, 1837, 1837, 1838, 1838, 1838, 183…
$ doctype                 [3m[38;5;246m<chr>[39m[23m "Letter", "Letter", "Letter", "Letter", "Lett…
$ allsubject              [3m[38;5;246m<chr>[39m[23m "Childbirth; Church attendance; Cities; Farms…
$ broadsubj               [3m[38;5;246m<chr>[39m[23m "Health; Religion; Communities; Relationships…
$ personalevent           [3m[38;5;246m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, "Physical ill…
$ wwritten                [3m[38;5;246m<chr>[39m[23m "Baltimore, MD; Maryland; United States; Mid-…
$ docauthorid             [3m[38;5;246m<

In [6]:
# Load the sentiment data (latentData), then turn any factor columns into
# character columns
latentData <- "20210101_AM_Sentiment.csv" %>%
  read.csv() %>%
  unfactorize()
glimpse(latentData)

Rows: 37,608
Columns: 9
$ X            [3m[38;5;246m<int>[39m[23m 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16…
$ Sentence     [3m[38;5;246m<chr>[39m[23m " baltimore 20 september 1836 dear heinrich: friday even…
$ fileid       [3m[38;5;246m<chr>[39m[23m "S10003-D023.txt", "S10003-D023.txt", "S10003-D023.txt",…
$ Sequence     [3m[38;5;246m<int>[39m[23m 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1…
$ Sentiment    [3m[38;5;246m<dbl>[39m[23m 0.7263, 0.0000, 0.8777, 0.2263, 0.0000, 0.0000, 0.5267, …
$ Sentences    [3m[38;5;246m<int>[39m[23m 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, …
$ Position     [3m[38;5;246m<dbl>[39m[23m 0.01851852, 0.03703704, 0.05555556, 0.07407407, 0.092592…
$ Last         [3m[38;5;246m<chr>[39m[23m "False", "False", "False", "False", "False", "False", "F…
$ SentimentLTR [3m[38;5;246m<dbl>[39m[23m 0.1710519, 0.1710519, 0.1710519, 0.1710519, 0.1710519, 0…


In [7]:
# Drop the index variable "X" (the first column in the glimpse output above).
# The column is removed by name rather than by position so the cell is safe to
# re-run: a second run of metaData[-1] would drop docsequence as well.
metaData <- metaData[setdiff(names(metaData), "X")]
names(metaData)

In [8]:
# Drop the index variable "X" (the first column in the glimpse output above).
# The column is removed by name rather than by position so the cell is safe to
# re-run: a second run of latentData[-1] would drop Sentence as well.
latentData <- latentData[setdiff(names(latentData), "X")]
names(latentData)

In [9]:
# Drop the ".txt" extension from fileid (latentData) so that it will match
# docid (metaData).
# The dot is escaped and the pattern is anchored to the end of the string: the
# bare pattern ".txt" is a regular expression in which "." matches any
# character, so it would also strip e.g. "-txt" from the middle of an id.
latentData$fileid <- str_remove(latentData$fileid, "\\.txt$")
# Show the first ten cleaned ids (R vectors are 1-indexed, so 0:10 only worked
# because index 0 is ignored).
head(latentData$fileid, 10)

In [10]:
# Rename the fileid column to docid, the column name used in metaData
names(latentData) <- replace(
  names(latentData),
  names(latentData) == "fileid",
  "docid"
)
names(latentData)

In [11]:
# Verify that both datasets cover exactly the same set of docids

# Distinct docids of each dataset, in sorted order
docidMeta <- metaData$docid %>%
  unique() %>%
  sort()
latentMeta <- latentData$docid %>%
  unique() %>%
  sort()

# TRUE only when the two sorted id vectors match exactly
identical(docidMeta, latentMeta)

In [12]:
# Merge datasets: keep every row of latentData and attach the metaData
# columns of the matching docid
df <- metaData %>%
  right_join(latentData, by = "docid")
glimpse(df)

Rows: 37,608
Columns: 54
$ docsequence             [3m[38;5;246m<int>[39m[23m 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 2…
$ docid                   [3m[38;5;246m<chr>[39m[23m "S10003-D023", "S10003-D023", "S10003-D023", …
$ docyear                 [3m[38;5;246m<int>[39m[23m 1836, 1836, 1836, 1836, 1836, 1836, 1836, 183…
$ doctype                 [3m[38;5;246m<chr>[39m[23m "Letter", "Letter", "Letter", "Letter", "Lett…
$ allsubject              [3m[38;5;246m<chr>[39m[23m "Childbirth; Church attendance; Cities; Farms…
$ broadsubj               [3m[38;5;246m<chr>[39m[23m "Health; Religion; Communities; Relationships…
$ personalevent           [3m[38;5;246m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ wwritten                [3m[38;5;246m<chr>[39m[23m "Baltimore, MD; Maryland; United States; Mid-…
$ docauthorid             [3m[38;5;246m<chr>[39m[23m "per0022938", "per0022938", "per0022938", "pe…
$ docauthorname           [3m[38;5;24

In [13]:
# Convert the character class variables of the merged data to factor class,
# then print a per-column summary
df <- df %>%
  factorize()
summary(df)

  docsequence            docid          docyear       doctype     
 Min.   :  2.00   S8552-D008:  693   Min.   :1804   Letter:37608  
 1st Qu.: 16.00   S9912-D003:  450   1st Qu.:1850                 
 Median : 36.00   S9912-D004:  443   Median :1863                 
 Mean   : 54.66   S8552-D007:  385   Mean   :1863                 
 3rd Qu.: 83.00   S9912-D002:  357   3rd Qu.:1880                 
 Max.   :239.00   S1019-D013:  275   Max.   :1913                 
                  (Other)   :35005   NA's   :194                  
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                

In [14]:
# Preview the first six rows of the merged data
head(df, n = 6L)

docsequence,docid,docyear,doctype,allsubject,broadsubj,personalevent,wwritten,docauthorid,docauthorname,⋯,Social,Government,Other,Sentence,Sequence,Sentiment,Sentences,Position,Last,SentimentLTR
23,S10003-D023,1836,Letter,"Childbirth; Church attendance; Cities; Farms; Homesickness; Motion sickness; Neighbors; Ocean voyages; Steamboats; Travelers; Wagon travel; Health; Religion; Communities; Relationships; Transportation; Entertainment and recreation; Westphalia, MO; Missouri; United States; West North Central States; Midwest States; Mississippi Basin States; North America",Health; Religion; Communities; Relationships; Transportation; Entertainment and recreation,,"Baltimore, MD; Maryland; United States; Mid-Atlantic States; Northeast States; East Coast States; North America",per0022938,"Bruns, Jette, 1813-1899",⋯,True,False,True,"baltimore 20 september 1836 dear heinrich: friday evening, 16 september, anchor was dropped and we had safely arrived in the harbor!",1,0.7263,54,0.01851852,False,0.1710519
23,S10003-D023,1836,Letter,"Childbirth; Church attendance; Cities; Farms; Homesickness; Motion sickness; Neighbors; Ocean voyages; Steamboats; Travelers; Wagon travel; Health; Religion; Communities; Relationships; Transportation; Entertainment and recreation; Westphalia, MO; Missouri; United States; West North Central States; Midwest States; Mississippi Basin States; North America",Health; Religion; Communities; Relationships; Transportation; Entertainment and recreation,,"Baltimore, MD; Maryland; United States; Mid-Atlantic States; Northeast States; East Coast States; North America",per0022938,"Bruns, Jette, 1813-1899",⋯,True,False,True,it was a long and arduous voyage!,2,0.0,54,0.03703704,False,0.1710519
23,S10003-D023,1836,Letter,"Childbirth; Church attendance; Cities; Farms; Homesickness; Motion sickness; Neighbors; Ocean voyages; Steamboats; Travelers; Wagon travel; Health; Religion; Communities; Relationships; Transportation; Entertainment and recreation; Westphalia, MO; Missouri; United States; West North Central States; Midwest States; Mississippi Basin States; North America",Health; Religion; Communities; Relationships; Transportation; Entertainment and recreation,,"Baltimore, MD; Maryland; United States; Mid-Atlantic States; Northeast States; East Coast States; North America",per0022938,"Bruns, Jette, 1813-1899",⋯,True,False,True,however we were and are all well and thank the father in heaven that he has protected us so far!,3,0.8777,54,0.05555556,False,0.1710519
23,S10003-D023,1836,Letter,"Childbirth; Church attendance; Cities; Farms; Homesickness; Motion sickness; Neighbors; Ocean voyages; Steamboats; Travelers; Wagon travel; Health; Religion; Communities; Relationships; Transportation; Entertainment and recreation; Westphalia, MO; Missouri; United States; West North Central States; Midwest States; Mississippi Basin States; North America",Health; Religion; Communities; Relationships; Transportation; Entertainment and recreation,,"Baltimore, MD; Maryland; United States; Mid-Atlantic States; Northeast States; East Coast States; North America",per0022938,"Bruns, Jette, 1813-1899",⋯,True,False,True,"i had written down the events of the voyage for you, but to my great annoyance i am now missing the whole notebook; perhaps i will find it later.",4,0.2263,54,0.07407407,False,0.1710519
23,S10003-D023,1836,Letter,"Childbirth; Church attendance; Cities; Farms; Homesickness; Motion sickness; Neighbors; Ocean voyages; Steamboats; Travelers; Wagon travel; Health; Religion; Communities; Relationships; Transportation; Entertainment and recreation; Westphalia, MO; Missouri; United States; West North Central States; Midwest States; Mississippi Basin States; North America",Health; Religion; Communities; Relationships; Transportation; Entertainment and recreation,,"Baltimore, MD; Maryland; United States; Mid-Atlantic States; Northeast States; East Coast States; North America",per0022938,"Bruns, Jette, 1813-1899",⋯,True,False,True,"as you know, we set sail on 12 july.",5,0.0,54,0.09259259,False,0.1710519
23,S10003-D023,1836,Letter,"Childbirth; Church attendance; Cities; Farms; Homesickness; Motion sickness; Neighbors; Ocean voyages; Steamboats; Travelers; Wagon travel; Health; Religion; Communities; Relationships; Transportation; Entertainment and recreation; Westphalia, MO; Missouri; United States; West North Central States; Midwest States; Mississippi Basin States; North America",Health; Religion; Communities; Relationships; Transportation; Entertainment and recreation,,"Baltimore, MD; Maryland; United States; Mid-Atlantic States; Northeast States; East Coast States; North America",per0022938,"Bruns, Jette, 1813-1899",⋯,True,False,True,"by the afternoon, when the pilot left us, one after the other began to hold his head overboard; bernhard, mrs schwarze, the children, and others.",6,0.0,54,0.11111111,False,0.1710519


In [15]:
# Save the merged data; row names are written as well (write.csv default)
write.csv(x = df, file = "20210101_AM_Data4TopicModel.csv")