In [1]:
#Import Library
library(tidyverse)

── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 3.1.0     ✔ purrr   0.2.5
✔ tibble  2.0.1     ✔ dplyr   0.7.6
✔ tidyr   0.8.1     ✔ stringr 1.3.1
✔ readr   1.1.1     ✔ forcats 0.3.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()


In [2]:
# Converts all factors to character class
unfactorize <- function(df){
  for(i in which(sapply(df, class) == "factor")) df[[i]] = as.character(df[[i]])
  return(df)
}

In [3]:
#Load and unfactor data
letters <- unfactorize(read.csv("20201005_AM_FinalMeta.csv"))
#Show data
glimpse(letters)

Observations: 1,222
Variables: 51
$ X                           <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13…
$ docsequence                 <int> 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 3…
$ docid                       <chr> "S10003-D023", "S10003-D024", "S10003-D02…
$ docyear                     <int> 1836, 1836, 1837, 1837, 1838, 1838, 1838,…
$ doctype                     <chr> "Letter", "Letter", "Letter", "Letter", "…
$ allsubject                  <chr> "Childbirth; Church attendance; Cities; F…
$ broadsubj                   <chr> "Health; Religion; Communities; Relations…
$ personalevent               <chr> NA, NA, NA, NA, NA, NA, NA, NA, "Physical…
$ wwritten                    <chr> "Baltimore, MD; Maryland; United States; …
$ docauthorid                 <chr> "per0022938", "per0022938", "per0022938",…
$ docauthorname.x             <chr> "Bruns, Jette, 1813-1899", "Bruns, Jette,…
$ docauthorname.y             <chr> "Bruns, Jette, 1813-1899", "Bruns, Jette,…
$ language        

In [4]:
# Drop index variable
letters <- select(letters,-c(1))
names(letters)

In [5]:
# Cound the number of cases with S9908 in docid
nrow(subset(letters, grepl('S9908',docid)))

Remove these cases. They were identified during sentiment analysis to be much larger files than the other letter files. Upon closer inspection, they appear to be journal entries, or perhaps letters of a prisoner, which is a special case. 

In [6]:
letters <- letters[- grep("S9908", letters$docid),]
nrow(letters)

## Check variables with both x and y versions.

In [7]:
# Are there differences for docauthorname
letters[which(letters$docauthorname.x != letters$docauthorname.y), c('docauthorname.x', 'docauthorname.y')]

Unnamed: 0,docauthorname.x,docauthorname.y
216,"Anonymous Russian Jewish Mother, fl. 1910","Anonymous Russian Jewish Mother, 1910"
220,"Anonymous Russian Jewish Male, V. A., fl. 1911","Anonymous Russian Jewish Male, V. A."
947,"Pietersen, Jacobus, fl. 1876","Pietersen, Jacobus, 1876"
981,"Pietersen, Jacobus, fl. 1876","Pietersen, Jacobus, 1876"


In [8]:
# Keeping docauthorname.x because more nuanced.
# Drop docauthorname.y.
letters <- select(letters,-c(docauthorname.y))

# Change variable name
names(letters)[names(letters)=="docauthorname.x"] <- "docauthorname"

# Check df
names(letters)
nrow(letters)

In [9]:
# Are there differences for briefname.x
letters[which(letters$briefname.x != letters$briefname.y), c('briefname.x', 'briefname.y')]

Unnamed: 0,briefname.x,briefname.y
216,Anonymous Russian Jewish Mother,"Anonymous Russian Jewish Mother, 1910"
947,Jacobus Pietersen,"Jacobus Pietersen, 1876"
981,Jacobus Pietersen,"Jacobus Pietersen, 1876"


In [10]:
sample(letters$briefname.x, 10)

In [11]:
# Keeping briefname.x because the other values for that variable do not include date.
letters <- select(letters,-c(briefname.y))

# Change variable name
names(letters)[names(letters)=="briefname.x"] <- "briefname"

# Check df
names(letters)
nrow(letters)

In [12]:
# Check to race category counts
summary(as.factor(letters$authrace.x))
summary(as.factor(letters$authrace.y))

In [13]:
# Confirm that the Asian letters have one author
summary(as.factor(letters$docauthorname[letters$authrace.x=="Asian"]))

Race data is not very informative because there is only one Black authored letter and all the Asian authored letters are by the same person.  

In [14]:
# Drop both versions of the race variable.
letters <- select(letters,-c(authrace.x, authrace.y))

# Check df
names(letters)
nrow(letters)

In [15]:
# Are there differences for nationality
sum(is.na(letters$nationality.x))
sum(is.na(letters$nationality.y))

In [16]:
# Omit y version of the nationality variable because more NAs
letters <- select(letters,-c(nationality.y))

# Change variable name
names(letters)[names(letters)=="nationality.x"] <- "nationality"

# Check df
names(letters)
nrow(letters)

In [17]:
# Making sure the record for Hedy Lamar has been corrected or filtered.
nrow(letters$briefname[which(letters$docauthorid == "per0021755")])

NULL

In [18]:
# Omit x and y versions of the religion variable because a composite one is already in df.
letters <- select(letters,-c(religion.x, religion.y))

# Check df
names(letters)
nrow(letters)

In [19]:
# Are there differences for birthyear
letters[which(letters$birthyear.x != letters$birthyear.y), c('birthyear.x', 'birthyear.y')]

birthyear.x,birthyear.y


In [20]:
# Omit y version of birthyear
letters <- select(letters,-c(birthyear.y))

# Change variable name
names(letters)[names(letters)=="birthyear.x"] <- "birthyear"

# Check df
names(letters)
nrow(letters)

In [21]:
# Omit y version of birthyear
letters <- select(letters,-c(deathyear.y))

# Change variable name
names(letters)[names(letters)=="deathyear.x"] <- "deathyear"

# Check df
names(letters)
nrow(letters)

In [22]:
# Are there diferences for birthplace?
sum(is.na(letters$birthplace.x))
sum(is.na(letters$birthplace.y))

In [23]:
# Omit y version of birthplace
letters <- select(letters,-c(birthplace.y))

# Change variable name
names(letters)[names(letters)=="birthplace.x"] <- "birthplace"

# Check df
names(letters)
nrow(letters)

In [24]:
# Are there diferences for deathplace?
sum(is.na(letters$deathplace.x))
sum(is.na(letters$deathplace.y))

In [25]:
# Omit y version of deathplace
letters <- select(letters,-c(deathplace.y))

# Change variable name
names(letters)[names(letters)=="deathplace.x"] <- "deathplace"

# Check df
names(letters)
nrow(letters)

In [26]:
# Drop x and y versions of authorgender variable 
letters <- select(letters,-c(authorgender.x, authorgender.y))

# Check df
names(letters)
nrow(letters)

In [27]:
sum(is.na(letters$native_occupation.x))
sum(is.na(letters$native_occupation.y))

In [28]:
# Omit y version of native_occupation
letters <- select(letters,-c(native_occupation.y))

# Change variable name
names(letters)[names(letters)=="native_occupation.x"] <- "native_occupation"

# Check df
names(letters)
nrow(letters)

In [29]:
# Drop x and y versions of north_american_occupation because a composite variables already exists.
letters <- select(letters,-c(north_american_occupation.x, north_american_occupation.y))

# Check df
names(letters)
nrow(letters)

In [30]:
# Are there differences for year_immigration
letters[which(letters$year_immigration.x != letters$year_immigration.y), c('year_immigration.x', 'year_immigration.y')]

year_immigration.x,year_immigration.y


In [31]:
# Omit y version of year_immigration
letters <- select(letters,-c(year_immigration.y))

# Change variable name
names(letters)[names(letters)=="year_immigration.x"] <- "year_immigration"

# Check df
names(letters)
nrow(letters)

In [32]:
# Are there differences for year_immigration
letters[which(letters$cultural_heritage.x != letters$cultural_heritage), c('cultural_heritage.x', 'cultural_heritage')]

cultural_heritage.x,cultural_heritage


In [33]:
# Drop x and y versions of cultural_heritage because a composite variables already exists.
letters <- select(letters,-c(cultural_heritage.x, cultural_heritage.y))

# Check df
names(letters)
nrow(letters)

In [34]:
# Are there differences for year_immigration
letters[which(letters$stayed_north_america.x != letters$stayed_north_america.y), c('stayed_north_america.x', 'stayed_north_america.y')]

Unnamed: 0,stayed_north_america.x,stayed_north_america.y
1,Stayed,No
2,Stayed,No
3,Stayed,No
4,Stayed,No
5,Stayed,No
6,Stayed,No
7,Stayed,No
8,Stayed,No
9,Stayed,No
10,Stayed,No


In [35]:
# Drop y version of stayed_north_america because of errors (too many nos).
letters <- select(letters,-c(stayed_north_america.y))

# Change variable name
names(letters)[names(letters)=="stayed_north_america.x"] <- "stayed_north_america"

# Check df
names(letters)
nrow(letters)

In [36]:
# Converts character to factor class
factorize <- function(df){
  for(i in which(sapply(df, class) == "character")) df[[i]] = as.factor(df[[i]])
  return(df)
}

In [37]:
# Turn characters into factors and get summary
letters  <- factorize(letters)
summary(letters)

  docsequence             docid         docyear       doctype    
 Min.   :  2.00   S10003-D023:   1   Min.   :1804   Letter:1212  
 1st Qu.: 25.00   S10003-D024:   1   1st Qu.:1858                
 Median : 55.00   S10003-D025:   1   Median :1869                
 Mean   : 72.78   S10003-D026:   1   Mean   :1882                
 3rd Qu.:110.25   S10003-D027:   1   3rd Qu.:1909                
 Max.   :239.00   S10003-D028:   1   Max.   :1979                
                  (Other)    :1206   NA's   :20                  
                                                         allsubject  
 Correspondence; Intellectual life                            :  10  
 Internment camps; Politics                                   :   8  
 Winter; Environment                                          :   6  
 Correspondence; Internment camps; Intellectual life; Politics:   5  
 Photographs; Intellectual life                               :   5  
 Business; Sons; Economics; Relationships           

In [38]:
# Remove underscores from variable names
names(letters)<-gsub("\\_","",names(letters))
names(letters)

In [39]:
glimpse(letters)

Observations: 1,212
Variables: 30
$ docsequence             <int> 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 3…
$ docid                   <fct> S10003-D023, S10003-D024, S10003-D025, S10003…
$ docyear                 <int> 1836, 1836, 1837, 1837, 1838, 1838, 1838, 183…
$ doctype                 <fct> Letter, Letter, Letter, Letter, Letter, Lette…
$ allsubject              <fct> "Childbirth; Church attendance; Cities; Farms…
$ broadsubj               <fct> Health; Religion; Communities; Relationships;…
$ personalevent           <fct> NA, NA, NA, NA, NA, NA, NA, NA, Physical illn…
$ wwritten                <fct> "Baltimore, MD; Maryland; United States; Mid-…
$ docauthorid             <fct> per0022938, per0022938, per0022938, per002293…
$ docauthorname           <fct> "Bruns, Jette, 1813-1899", "Bruns, Jette, 181…
$ language                <fct> English; German, English; German, English; Ge…
$ editor                  <fct> "Schroeder, Adoplh E., tr.; Geisberg, Carla S…
$ briefname       

In [40]:
write.csv(letters, "20201119_AM_Meta2Merge.csv")