# Merge Datasets

In [1]:
# Resources
library(tidyverse)

── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 3.1.0     ✔ purrr   0.2.5
✔ tibble  2.0.1     ✔ dplyr   0.7.6
✔ tidyr   0.8.1     ✔ stringr 1.3.1
✔ readr   1.1.1     ✔ forcats 0.3.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()


In [2]:
# Converts every factor column of a data frame to character class.
#
# df: a data frame.
# Returns df with each factor column (ordered factors included) replaced by
# its labels as a character vector; all other columns are left untouched.
unfactorize <- function(df) {
  # vapply() with is.factor() always yields exactly one logical per column.
  # Comparing sapply(df, class) against "factor" is not reliable: columns with
  # several classes (e.g. ordered factors) are skipped, and when sapply()
  # simplifies the class vectors to a matrix the resulting positions no longer
  # correspond to column indices.
  is_factor_col <- vapply(df, is.factor, logical(1))
  for (i in which(is_factor_col)) {
    df[[i]] <- as.character(df[[i]])
  }
  df
}
# Adapted from code by user "By0" at https://stackoverflow.com/questions/2851015/convert-data-frame-columns-from-factors-to-characters (line 14)

In [3]:
# Converts every character column of a data frame to factor class.
#
# df: a data frame.
# Returns df with each column whose class vector includes "character"
# replaced by as.factor() of that column; all other columns are left
# untouched.
factorize <- function(df) {
  # vapply() with inherits() always yields exactly one logical per column,
  # whereas sapply(df, class) changes shape (vector, list or matrix) as soon
  # as some column carries more than one class.
  is_char_col <- vapply(df, inherits, logical(1), what = "character")
  for (i in which(is_char_col)) {
    df[[i]] <- as.factor(df[[i]])
  }
  df
}

In [8]:
# Read the author/document metadata and turn any factor columns into
# character columns before inspecting the result
metaData <- read.csv("20201125_AM_Meta2Merge.csv") %>%
  unfactorize()
metaData %>% glimpse()

Observations: 925
Variables: 33
$ X.1                     <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14…
$ X                       <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14…
$ docsequence             <int> 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 3…
$ docid                   <chr> "S10003-D023", "S10003-D024", "S10003-D025", …
$ docyear                 <int> 1836, 1836, 1837, 1837, 1838, 1838, 1838, 183…
$ doctype                 <chr> "Letter", "Letter", "Letter", "Letter", "Lett…
$ allsubject              <chr> "Childbirth; Church attendance; Cities; Farms…
$ broadsubj               <chr> "Health; Religion; Communities; Relationships…
$ personalevent           <chr> NA, NA, NA, NA, NA, NA, NA, NA, "Physical ill…
$ wwritten                <chr> "Baltimore, MD; Maryland; United States; Mid-…
$ docauthorid             <chr> "per0022938", "per0022938", "per0022938", "pe…
$ docauthorname           <chr> "Bruns, Jette, 1813-1899", "Bruns, Jette, 181…
$ language          

In [9]:
# Read the sentence-level sentiment data and turn any factor columns into
# character columns before inspecting the result
latentData <- read.csv("20201124_AM_Sentiment.csv") %>%
  unfactorize()
latentData %>% glimpse()

Observations: 38,074
Variables: 9
$ X            <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16…
$ Sentence     <chr> " baltimore 20 september 1836 dear heinrich: friday even…
$ fileid       <chr> "S10003-D023.txt", "S10003-D023.txt", "S10003-D023.txt",…
$ Sequence     <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1…
$ Sentiment    <dbl> 0.7263, 0.0000, 0.8777, 0.2263, 0.0000, 0.0000, 0.5267, …
$ Position     <dbl> 0.01851852, 0.03703704, 0.05555556, 0.07407407, 0.092592…
$ Sentences    <int> 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, …
$ Last         <chr> "False", "False", "False", "False", "False", "False", "F…
$ SentimentLTR <dbl> 0.1710519, 0.1710519, 0.1710519, 0.1710519, 0.1710519, 0…


In [10]:
# Drop the index variables "X.1" and "X".
# They are removed by name rather than by position so that re-running this
# cell cannot silently drop real variables.
metaData <- metaData[setdiff(names(metaData), c("X.1", "X"))]
names(metaData)

In [11]:
# Drop the index variable "X".
# It is removed by name rather than by position so that re-running this cell
# cannot silently drop a real variable.
latentData <- latentData[setdiff(names(latentData), "X")]
names(latentData)

In [12]:
# Drop the trailing ".txt" extension from fileid (latentData) so that it will
# match docid (metaData).
# The dot is escaped and the pattern anchored to the end of the string: in a
# bare ".txt" regex the "." matches any character, and removing all matches
# could also delete text from the middle of an id.
latentData$fileid <- str_remove(latentData$fileid, "\\.txt$")
# Show the first ten ids (R vectors are 1-indexed, so head() replaces 0:10)
head(latentData$fileid, 10)

In [13]:
# Rename the fileid column to docid so both datasets share the same key name
names(latentData) <- replace(
  names(latentData),
  names(latentData) == "fileid",
  "docid"
)
names(latentData)

In [14]:
# Verify that both datasets cover the same set of docids

# Distinct docids of each dataset, in sorted order
docidMeta <- metaData$docid %>% unique() %>% sort()
latentMeta <- latentData$docid %>% unique() %>% sort()

# TRUE only when the two sorted vectors match exactly
identical(docidMeta, latentMeta)

In [15]:
# Merge datasets: keep every row of latentData and attach the metaData
# columns of the matching docid
df <- metaData %>%
  right_join(latentData, by = "docid")
df %>% glimpse()

Observations: 38,074
Variables: 38
$ docsequence             <int> 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 2…
$ docid                   <chr> "S10003-D023", "S10003-D023", "S10003-D023", …
$ docyear                 <int> 1836, 1836, 1836, 1836, 1836, 1836, 1836, 183…
$ doctype                 <chr> "Letter", "Letter", "Letter", "Letter", "Lett…
$ allsubject              <chr> "Childbirth; Church attendance; Cities; Farms…
$ broadsubj               <chr> "Health; Religion; Communities; Relationships…
$ personalevent           <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ wwritten                <chr> "Baltimore, MD; Maryland; United States; Mid-…
$ docauthorid             <chr> "per0022938", "per0022938", "per0022938", "pe…
$ docauthorname           <chr> "Bruns, Jette, 1813-1899", "Bruns, Jette, 181…
$ language                <chr> "English; German", "English; German", "Englis…
$ editor                  <chr> "Schroeder, Adoplh E., tr.; Geisberg, Carla S…
$ briefname      

In [16]:
# Turn every character class variable of the merged data into a factor,
# then print per-variable summaries
df <- df %>% factorize()
df %>% summary()

  docsequence            docid          docyear       doctype     
 Min.   :  2.00   S8552-D008:  693   Min.   :1804   Letter:38074  
 1st Qu.: 15.00   S9912-D003:  450   1st Qu.:1850                 
 Median : 36.00   S9912-D004:  443   Median :1862                 
 Mean   : 54.17   S8552-D007:  385   Mean   :1863                 
 3rd Qu.: 82.00   S9912-D002:  357   3rd Qu.:1880                 
 Max.   :239.00   S1019-D013:  275   Max.   :1913                 
                  (Other)   :35471   NA's   :194                  
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                

In [17]:
# Preview the first six rows of the merged data
head(df, n = 6L)

docsequence,docid,docyear,doctype,allsubject,broadsubj,personalevent,wwritten,docauthorid,docauthorname,⋯,culturalheritage,religion,northamericanoccupation,Sentence,Sequence,Sentiment,Position,Sentences,Last,SentimentLTR
23,S10003-D023,1836,Letter,"Childbirth; Church attendance; Cities; Farms; Homesickness; Motion sickness; Neighbors; Ocean voyages; Steamboats; Travelers; Wagon travel; Health; Religion; Communities; Relationships; Transportation; Entertainment and recreation; Westphalia, MO; Missouri; United States; West North Central States; Midwest States; Mississippi Basin States; North America",Health; Religion; Communities; Relationships; Transportation; Entertainment and recreation,,"Baltimore, MD; Maryland; United States; Mid-Atlantic States; Northeast States; East Coast States; North America",per0022938,"Bruns, Jette, 1813-1899",⋯,German; European,Catholic; Christian,Homemaker; Physician,"baltimore 20 september 1836 dear heinrich: friday evening, 16 september, anchor was dropped and we had safely arrived in the harbor!",1,0.7263,0.01851852,54,False,0.1710519
23,S10003-D023,1836,Letter,"Childbirth; Church attendance; Cities; Farms; Homesickness; Motion sickness; Neighbors; Ocean voyages; Steamboats; Travelers; Wagon travel; Health; Religion; Communities; Relationships; Transportation; Entertainment and recreation; Westphalia, MO; Missouri; United States; West North Central States; Midwest States; Mississippi Basin States; North America",Health; Religion; Communities; Relationships; Transportation; Entertainment and recreation,,"Baltimore, MD; Maryland; United States; Mid-Atlantic States; Northeast States; East Coast States; North America",per0022938,"Bruns, Jette, 1813-1899",⋯,German; European,Catholic; Christian,Homemaker; Physician,it was a long and arduous voyage!,2,0.0,0.03703704,54,False,0.1710519
23,S10003-D023,1836,Letter,"Childbirth; Church attendance; Cities; Farms; Homesickness; Motion sickness; Neighbors; Ocean voyages; Steamboats; Travelers; Wagon travel; Health; Religion; Communities; Relationships; Transportation; Entertainment and recreation; Westphalia, MO; Missouri; United States; West North Central States; Midwest States; Mississippi Basin States; North America",Health; Religion; Communities; Relationships; Transportation; Entertainment and recreation,,"Baltimore, MD; Maryland; United States; Mid-Atlantic States; Northeast States; East Coast States; North America",per0022938,"Bruns, Jette, 1813-1899",⋯,German; European,Catholic; Christian,Homemaker; Physician,however we were and are all well and thank the father in heaven that he has protected us so far!,3,0.8777,0.05555556,54,False,0.1710519
23,S10003-D023,1836,Letter,"Childbirth; Church attendance; Cities; Farms; Homesickness; Motion sickness; Neighbors; Ocean voyages; Steamboats; Travelers; Wagon travel; Health; Religion; Communities; Relationships; Transportation; Entertainment and recreation; Westphalia, MO; Missouri; United States; West North Central States; Midwest States; Mississippi Basin States; North America",Health; Religion; Communities; Relationships; Transportation; Entertainment and recreation,,"Baltimore, MD; Maryland; United States; Mid-Atlantic States; Northeast States; East Coast States; North America",per0022938,"Bruns, Jette, 1813-1899",⋯,German; European,Catholic; Christian,Homemaker; Physician,"i had written down the events of the voyage for you, but to my great annoyance i am now missing the whole notebook; perhaps i will find it later.",4,0.2263,0.07407407,54,False,0.1710519
23,S10003-D023,1836,Letter,"Childbirth; Church attendance; Cities; Farms; Homesickness; Motion sickness; Neighbors; Ocean voyages; Steamboats; Travelers; Wagon travel; Health; Religion; Communities; Relationships; Transportation; Entertainment and recreation; Westphalia, MO; Missouri; United States; West North Central States; Midwest States; Mississippi Basin States; North America",Health; Religion; Communities; Relationships; Transportation; Entertainment and recreation,,"Baltimore, MD; Maryland; United States; Mid-Atlantic States; Northeast States; East Coast States; North America",per0022938,"Bruns, Jette, 1813-1899",⋯,German; European,Catholic; Christian,Homemaker; Physician,"as you know, we set sail on 12 july.",5,0.0,0.09259259,54,False,0.1710519
23,S10003-D023,1836,Letter,"Childbirth; Church attendance; Cities; Farms; Homesickness; Motion sickness; Neighbors; Ocean voyages; Steamboats; Travelers; Wagon travel; Health; Religion; Communities; Relationships; Transportation; Entertainment and recreation; Westphalia, MO; Missouri; United States; West North Central States; Midwest States; Mississippi Basin States; North America",Health; Religion; Communities; Relationships; Transportation; Entertainment and recreation,,"Baltimore, MD; Maryland; United States; Mid-Atlantic States; Northeast States; East Coast States; North America",per0022938,"Bruns, Jette, 1813-1899",⋯,German; European,Catholic; Christian,Homemaker; Physician,"by the afternoon, when the pilot left us, one after the other began to hold his head overboard; bernhard, mrs schwarze, the children, and others.",6,0.0,0.11111111,54,False,0.1710519


In [18]:
# Save the merged data. Row names are written as well (the write.csv default),
# so the file starts with an extra, unnamed index column.
write.csv(df, file = "20201127_AM_Data4TopicModel.csv", row.names = TRUE)