# Merge Datasets

In [2]:
# Resources
library(tidyverse)

In [3]:
# Converts all factors to character class
unfactorize <- function(df){
  for(i in which(sapply(df, class) == "factor")) df[[i]] = as.character(df[[i]])
  return(df)
}
# Code from user "By0" at https://stackoverflow.com/questions/2851015/convert-data-frame-columns-from-factors-to-characters (line 14)

In [4]:
# Converts character to factor class
factorize <- function(df){
  for(i in which(sapply(df, class) == "character")) df[[i]] = as.factor(df[[i]])
  return(df)
}

In [5]:
# Load data related to authors and documents (metadata)
metaData <- unfactorize(read.csv("20210130_AM_Data-Letter.csv"))
glimpse(metaData)

Rows: 915
Columns: 57
$ X                       [3m[38;5;246m<int>[39m[23m 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14…
$ docsequence             [3m[38;5;246m<int>[39m[23m 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 3…
$ docid                   [3m[38;5;246m<chr>[39m[23m "S10003-D023", "S10003-D024", "S10003-D025", …
$ docyear                 [3m[38;5;246m<int>[39m[23m 1836, 1836, 1837, 1837, 1838, 1838, 1838, 183…
$ doctype                 [3m[38;5;246m<chr>[39m[23m "Letter", "Letter", "Letter", "Letter", "Lett…
$ allsubject              [3m[38;5;246m<chr>[39m[23m "Childbirth; Church attendance; Cities; Farms…
$ broadsubj               [3m[38;5;246m<chr>[39m[23m "Health; Religion; Communities; Relationships…
$ personalevent           [3m[38;5;246m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, "Physical ill…
$ wwritten                [3m[38;5;246m<chr>[39m[23m "Baltimore, MD; Maryland; United States; Mid-…
$ docauthorid             [3m[38;5;246m<

In [6]:
# Drop index variables
metaData  <- metaData[-1]
names(metaData)

In [7]:
# Load data related to authors and documents (metadata)
sentiment <- unfactorize(read.csv("20210119_AM_Data4TopicModel.csv"))
glimpse(sentiment)

Rows: 37,608
Columns: 62
$ X                       [3m[38;5;246m<int>[39m[23m 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14…
$ docsequence             [3m[38;5;246m<int>[39m[23m 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 2…
$ docid                   [3m[38;5;246m<chr>[39m[23m "S10003-D023", "S10003-D023", "S10003-D023", …
$ docyear                 [3m[38;5;246m<int>[39m[23m 1836, 1836, 1836, 1836, 1836, 1836, 1836, 183…
$ doctype                 [3m[38;5;246m<chr>[39m[23m "Letter", "Letter", "Letter", "Letter", "Lett…
$ allsubject              [3m[38;5;246m<chr>[39m[23m "Childbirth; Church attendance; Cities; Farms…
$ broadsubj               [3m[38;5;246m<chr>[39m[23m "Health; Religion; Communities; Relationships…
$ personalevent           [3m[38;5;246m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ wwritten                [3m[38;5;246m<chr>[39m[23m "Baltimore, MD; Maryland; United States; Mid-…
$ docauthorid             [3m[38;5;24

In [8]:
# Drop index variable
sentiment  <- sentiment[-1]
names(sentiment)

In [9]:
letterSentiments  <- sentiment %>%
group_by(docid) %>%
select(docid, SentimentLTR)%>%
unique()

In [10]:
nrow(letterSentiments)

In [11]:
# Check to make sure the datasets contain the same docids and that they are in the same order
identical(metaData$docid,letterSentiments$docid)

In [12]:
sum(is.na(letterSentiments$docid))

In [13]:
#Merge datasets
dfNew <- bind_cols(metaData, letterSentiments)
glimpse(dfNew)

New names:
* docid -> docid...2
* docid -> docid...57



Rows: 915
Columns: 58
$ docsequence             [3m[38;5;246m<int>[39m[23m 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 3…
$ docid...2               [3m[38;5;246m<chr>[39m[23m "S10003-D023", "S10003-D024", "S10003-D025", …
$ docyear                 [3m[38;5;246m<int>[39m[23m 1836, 1836, 1837, 1837, 1838, 1838, 1838, 183…
$ doctype                 [3m[38;5;246m<chr>[39m[23m "Letter", "Letter", "Letter", "Letter", "Lett…
$ allsubject              [3m[38;5;246m<chr>[39m[23m "Childbirth; Church attendance; Cities; Farms…
$ broadsubj               [3m[38;5;246m<chr>[39m[23m "Health; Religion; Communities; Relationships…
$ personalevent           [3m[38;5;246m<chr>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, "Physical ill…
$ wwritten                [3m[38;5;246m<chr>[39m[23m "Baltimore, MD; Maryland; United States; Mid-…
$ docauthorid             [3m[38;5;246m<chr>[39m[23m "per0022938", "per0022938", "per0022938", "pe…
$ docauthorname           [3m[38;5;246m<

In [14]:
# Test to make sure all the docids line up
test  <- dfNew$docid...2  == dfNew$docid...57
table(test)

test
TRUE 
 915 

In [15]:
#Drop redundant docid column (I checked to make sure the values match)
dfNew  <- dfNew[-57]
names(dfNew)

In [16]:
names(dfNew)[names(dfNew)=="docid...2"] <- "docid"
names(dfNew)

In [17]:
# Apply function to turn character class variables to factor class.
 df  <- factorize(df)
summary(dfNew)

  docsequence        docid              docyear       doctype         
 Min.   :  2.00   Length:915         Min.   :1804   Length:915        
 1st Qu.: 25.00   Class :character   1st Qu.:1856   Class :character  
 Median : 53.00   Mode  :character   Median :1863   Mode  :character  
 Mean   : 72.19                      Mean   :1865                     
 3rd Qu.:110.00                      3rd Qu.:1880                     
 Max.   :239.00                      Max.   :1913                     
                                     NA's   :8                        
  allsubject         broadsubj         personalevent        wwritten        
 Length:915         Length:915         Length:915         Length:915        
 Class :character   Class :character   Class :character   Class :character  
 Mode  :character   Mode  :character   Mode  :character   Mode  :character  
                                                                            
                                               

In [18]:
head(dfNew)

docsequence,docid,docyear,doctype,allsubject,broadsubj,personalevent,wwritten,docauthorid,docauthorname,⋯,docday,docMonth,docDay,docdate,docDate,letterOrphan,letterLast,topicNumber,pyLDAvisTopic,SentimentLTR
23,S10003-D023,1836,Letter,"Childbirth; Church attendance; Cities; Farms; Homesickness; Motion sickness; Neighbors; Ocean voyages; Steamboats; Travelers; Wagon travel; Health; Religion; Communities; Relationships; Transportation; Entertainment and recreation; Westphalia, MO; Missouri; United States; West North Central States; Midwest States; Mississippi Basin States; North America",Health; Religion; Communities; Relationships; Transportation; Entertainment and recreation,,"Baltimore, MD; Maryland; United States; Mid-Atlantic States; Northeast States; East Coast States; North America",per0022938,"Bruns, Jette, 1813-1899",⋯,20.0,9,20,1836-09-20,1836-09-20,False,False,0,1,0.17105185
24,S10003-D024,1836,Letter,Church services; Farming; Letters and mail; Log cabins; Neighbors; Weather; Religion; Life Styles; Politics; Communities; Relationships; Environment,Religion; Life Styles; Politics; Communities; Relationships; Environment,,"Westphalia, MO; Missouri; United States; West North Central States; Midwest States; Mississippi Basin States; North America",per0022938,"Bruns, Jette, 1813-1899",⋯,14.0,11,14,1836-11-14,1836-11-14,False,False,17,18,0.23648803
25,S10003-D025,1837,Letter,Correspondence; Crops; English language; Living arrangements; Sons; Intellectual life; Agriculture; Domestic life; Relationships,Intellectual life; Agriculture; Domestic life; Relationships,,"Westphalia, MO; Missouri; United States; West North Central States; Midwest States; Mississippi Basin States; North America",per0022938,"Bruns, Jette, 1813-1899",⋯,,8,1,1837-08-01,1837-08-01,False,False,17,18,0.11368736
26,S10003-D026,1837,Letter,American Indians; Construction; Fevers; Floods; Horseback riding; Houses; Neighbors; Pianos; Ethnic groups; Economics; Health; Environment; Entertainment and recreation; Domestic life; Relationships; Intellectual life,Ethnic groups; Economics; Health; Environment; Entertainment and recreation; Domestic life; Relationships; Intellectual life,,"Westphalia, MO; Missouri; United States; West North Central States; Midwest States; Mississippi Basin States; North America",per0022938,"Bruns, Jette, 1813-1899",⋯,7.0,9,7,1837-09-07,1837-09-07,False,False,17,18,0.04470548
27,S10003-D027,1838,Letter,Brothers; Deaths; Farms; Fires; Letters and mail; Neighbors; Physical illnesses; Relationships; Health; Communities; Environment; Politics,Relationships; Health; Communities; Environment; Politics,,"Westphalia, MO; Missouri; United States; West North Central States; Midwest States; Mississippi Basin States; North America",per0022938,"Bruns, Jette, 1813-1899",⋯,1.0,3,1,1838-03-01,1838-03-01,False,False,17,18,0.14126174
28,S10003-D028,1838,Letter,Aunts; Family arguments; Fevers; Parenting; Sisters; Uncles; Relationships; Health,Relationships; Health,,"Westphalia, MO; Missouri; United States; West North Central States; Midwest States; Mississippi Basin States; North America",per0022938,"Bruns, Jette, 1813-1899",⋯,23.0,9,23,1838-09-23,1838-09-23,False,False,17,18,0.06461556


In [19]:
write.csv(dfNew, "20210130_AM_Data-Letter.csv")