# Merge Datasets

In [2]:
# Resources
library(tidyverse)

In [3]:
# Converts every factor column of a data frame to character.
#
# df: a data frame (or tibble).
# Returns: `df` with each factor column (ordered factors included) replaced by
#   its character labels; all other columns are left untouched.
unfactorize <- function(df) {
  # vapply() + is.factor() always yields one TRUE/FALSE per column. Comparing
  # sapply(df, class) to "factor" breaks when a column has several classes
  # (e.g. c("ordered", "factor")): the result becomes a list or a matrix and
  # the column indices derived from it are wrong.
  is_factor_col <- vapply(df, is.factor, logical(1))
  for (i in which(is_factor_col)) {
    df[[i]] <- as.character(df[[i]])
  }
  df
}
# Code from user "By0" at https://stackoverflow.com/questions/2851015/convert-data-frame-columns-from-factors-to-characters (line 14)

In [4]:
# Converts every character column of a data frame to a factor.
#
# df: a data frame (or tibble).
# Returns: `df` with each character column replaced by as.factor() of that
#   column; all other columns are left untouched.
factorize <- function(df) {
  # vapply() guarantees a logical vector with one entry per column. Comparing
  # sapply(df, class) to "character" can instead produce a list or a matrix
  # when columns carry more than one class, giving wrong column indices.
  is_character_col <- vapply(df, is.character, logical(1))
  for (i in which(is_character_col)) {
    df[[i]] <- as.factor(df[[i]])
  }
  df
}

In [15]:
# Read the author/document metadata file, force any factor columns back to
# character, and preview the column types
metaData <- read.csv("20210118_AM_Meta2Merge.csv") %>%
  unfactorize()
glimpse(metaData)

Rows: 915
Columns: 55
$ X                       <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14…
$ docsequence             <int> 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 3…
$ docid                   <chr> "S10003-D023", "S10003-D024", "S10003-D025", …
$ docyear                 <int> 1836, 1836, 1837, 1837, 1838, 1838, 1838, 183…
$ doctype                 <chr> "Letter", "Letter", "Letter", "Letter", "Lett…
$ allsubject              <chr> "Childbirth; Church attendance; Cities; Farms…
$ broadsubj               <chr> "Health; Religion; Communities; Relationships…
$ personalevent           <chr> NA, NA, NA, NA, NA, NA, NA, NA, "Physical ill…
$ wwritten                <chr> "Baltimore, MD; Maryland; United States; Mid-…
$ docauthorid             <

In [16]:
# Remove the leading index column (position 1) and list the remaining names
metaData <- select(metaData, -1)
names(metaData)

In [17]:
# Read the per-letter topic assignments (docid plus topicNumber), force any
# factor columns back to character, and preview the column types
latentData <- read.csv("20210129_AM_Topics-Letter.csv") %>%
  unfactorize()
glimpse(latentData)

Rows: 915
Columns: 3
$ X           <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,…
$ docid       <chr> "S10003-D023", "S10003-D024", "S10003-D025", "S10003-D026…
$ topicNumber <int> 0, 17, 17, 17, 17, 17, 17, 17, 17, 17, 18, 17, 17, 18, 17…


In [18]:
# Remove the leading index column (position 1) and list the remaining names
latentData <- select(latentData, -1)
names(latentData)

In [19]:
# Per-column summary of the topic table (range of topicNumber, docid count)
latentData %>% summary()

    docid            topicNumber   
 Length:915         Min.   : 0.00  
 Class :character   1st Qu.: 5.00  
 Mode  :character   Median :11.00  
                    Mean   :10.18  
                    3rd Qu.:15.00  
                    Max.   :20.00  

In [20]:
# Add a second topic id equal to topicNumber shifted up by one
# NOTE(review): presumably this matches pyLDAvis's 1-based topic labels — confirm
latentData <- mutate(latentData, pyLDAvisTopic = topicNumber + 1)
summary(latentData)

    docid            topicNumber    pyLDAvisTopic  
 Length:915         Min.   : 0.00   Min.   : 1.00  
 Class :character   1st Qu.: 5.00   1st Qu.: 6.00  
 Mode  :character   Median :11.00   Median :12.00  
                    Mean   :10.18   Mean   :11.18  
                    3rd Qu.:15.00   3rd Qu.:16.00  
                    Max.   :20.00   Max.   :21.00  

In [21]:
# Order the metadata rows by document id
metaData <- metaData %>%
  arrange(docid)

In [22]:
# Order the topic rows by document id
latentData <- latentData %>%
  arrange(docid)

In [23]:
# Confirm both tables hold exactly the same docids in exactly the same order;
# the result (TRUE/FALSE) is only displayed, not enforced
identical(metaData[["docid"]], latentData[["docid"]])

In [24]:
# Combine the two tables column-wise. Rows are paired purely by position, so
# both inputs must already be in the same docid order.
df <- metaData %>%
  bind_cols(latentData)
glimpse(df)

New names:
* docid -> docid...2
* docid -> docid...55



Rows: 915
Columns: 57
$ docsequence             <int> 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 3…
$ docid...2               <chr> "S10003-D023", "S10003-D024", "S10003-D025", …
$ docyear                 <int> 1836, 1836, 1837, 1837, 1838, 1838, 1838, 183…
$ doctype                 <chr> "Letter", "Letter", "Letter", "Letter", "Lett…
$ allsubject              <chr> "Childbirth; Church attendance; Cities; Farms…
$ broadsubj               <chr> "Health; Religion; Communities; Relationships…
$ personalevent           <chr> NA, NA, NA, NA, NA, NA, NA, NA, "Physical ill…
$ wwritten                <chr> "Baltimore, MD; Maryland; United States; Mid-…
$ docauthorid             <chr> "per0022938", "per0022938", "per0022938", "pe…
$ docauthorname           <

In [25]:
# Remove the duplicate docid that bind_cols() placed at position 55
# (its values were checked against the first docid column beforehand)
df <- select(df, -55)
names(df)

In [28]:
# Re-encode every character column as a factor, then summarise each column
df <- df %>%
  factorize()
summary(df)

  docsequence             docid        docyear       doctype   
 Min.   :  2.00   S10003-D023:  1   Min.   :1804   Letter:915  
 1st Qu.: 25.00   S10003-D024:  1   1st Qu.:1856               
 Median : 53.00   S10003-D025:  1   Median :1863               
 Mean   : 72.19   S10003-D026:  1   Mean   :1865               
 3rd Qu.:110.00   S10003-D027:  1   3rd Qu.:1880               
 Max.   :239.00   S10003-D028:  1   Max.   :1913               
                  (Other)    :909   NA's   :8                  
                                                                          allsubject 
 Business; Sons; Economics; Relationships                                      :  4  
 Merchants; Economics                                                          :  4  
 Children; Physical illnesses; Relationships; Health; Physical illness of child:  3  
 Children; Relationships                                                       :  3  
 Correspondence; Intellectual life                        

In [29]:
# Restore the plain "docid" name for the column bind_cols() renamed
names(df) <- replace(names(df), names(df) == "docid...2", "docid")
names(df)

In [30]:
# Show the first six rows of the merged table
head(df, n = 6L)

docsequence,docid,docyear,doctype,allsubject,broadsubj,personalevent,wwritten,docauthorid,docauthorname,⋯,docmonth,docday,docMonth,docDay,docdate,docDate,letterOrphan,letterLast,topicNumber,pyLDAvisTopic
23,S10003-D023,1836,Letter,"Childbirth; Church attendance; Cities; Farms; Homesickness; Motion sickness; Neighbors; Ocean voyages; Steamboats; Travelers; Wagon travel; Health; Religion; Communities; Relationships; Transportation; Entertainment and recreation; Westphalia, MO; Missouri; United States; West North Central States; Midwest States; Mississippi Basin States; North America",Health; Religion; Communities; Relationships; Transportation; Entertainment and recreation,,"Baltimore, MD; Maryland; United States; Mid-Atlantic States; Northeast States; East Coast States; North America",per0022938,"Bruns, Jette, 1813-1899",⋯,9,20.0,9,20,1836-09-20,1836-09-20,False,False,0,1
24,S10003-D024,1836,Letter,Church services; Farming; Letters and mail; Log cabins; Neighbors; Weather; Religion; Life Styles; Politics; Communities; Relationships; Environment,Religion; Life Styles; Politics; Communities; Relationships; Environment,,"Westphalia, MO; Missouri; United States; West North Central States; Midwest States; Mississippi Basin States; North America",per0022938,"Bruns, Jette, 1813-1899",⋯,11,14.0,11,14,1836-11-14,1836-11-14,False,False,17,18
25,S10003-D025,1837,Letter,Correspondence; Crops; English language; Living arrangements; Sons; Intellectual life; Agriculture; Domestic life; Relationships,Intellectual life; Agriculture; Domestic life; Relationships,,"Westphalia, MO; Missouri; United States; West North Central States; Midwest States; Mississippi Basin States; North America",per0022938,"Bruns, Jette, 1813-1899",⋯,8,,8,1,1837-08-01,1837-08-01,False,False,17,18
26,S10003-D026,1837,Letter,American Indians; Construction; Fevers; Floods; Horseback riding; Houses; Neighbors; Pianos; Ethnic groups; Economics; Health; Environment; Entertainment and recreation; Domestic life; Relationships; Intellectual life,Ethnic groups; Economics; Health; Environment; Entertainment and recreation; Domestic life; Relationships; Intellectual life,,"Westphalia, MO; Missouri; United States; West North Central States; Midwest States; Mississippi Basin States; North America",per0022938,"Bruns, Jette, 1813-1899",⋯,9,7.0,9,7,1837-09-07,1837-09-07,False,False,17,18
27,S10003-D027,1838,Letter,Brothers; Deaths; Farms; Fires; Letters and mail; Neighbors; Physical illnesses; Relationships; Health; Communities; Environment; Politics,Relationships; Health; Communities; Environment; Politics,,"Westphalia, MO; Missouri; United States; West North Central States; Midwest States; Mississippi Basin States; North America",per0022938,"Bruns, Jette, 1813-1899",⋯,3,1.0,3,1,1838-03-01,1838-03-01,False,False,17,18
28,S10003-D028,1838,Letter,Aunts; Family arguments; Fevers; Parenting; Sisters; Uncles; Relationships; Health,Relationships; Health,,"Westphalia, MO; Missouri; United States; West North Central States; Midwest States; Mississippi Basin States; North America",per0022938,"Bruns, Jette, 1813-1899",⋯,9,23.0,9,23,1838-09-23,1838-09-23,False,False,17,18


In [31]:
# Save the merged table. write.csv() also writes row names by default, so the
# file gains an unnamed leading index column.
write.csv(df, file = "20210130_AM_Data-Letter.csv")