In [229]:
#Import Library
library(tidyverse)

In [230]:
# Convert every factor column of a data frame to character.
#
# Args:
#   df: a data frame (may have zero columns).
# Returns:
#   `df` with each factor column (ordered factors included) replaced by its
#   character representation; all other columns are left untouched.
unfactorize <- function(df) {
  # is.factor() is evaluated per column and always yields one logical, so this
  # works even when a column carries more than one class (ordered factors,
  # date-times), where comparing class() against a single string does not.
  factor_cols <- which(vapply(df, is.factor, logical(1)))
  for (col in factor_cols) {
    df[[col]] <- as.character(df[[col]])
  }
  df
}
# Adapted from code by user "By0" at https://stackoverflow.com/questions/2851015/convert-data-frame-columns-from-factors-to-characters (line 14)

In [231]:
# Load the latest metadata file, make sure text columns are character rather
# than factor, and preview the structure.
# Note: the name `letters` masks base R's built-in `letters` vector from here on.
letters <- read.csv("20201125_AM_Meta2Merge.csv") %>%
  unfactorize()
glimpse(letters)

Observations: 925
Variables: 33
$ X.1                     <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14…
$ X                       <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14…
$ docsequence             <int> 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 3…
$ docid                   <chr> "S10003-D023", "S10003-D024", "S10003-D025", …
$ docyear                 <int> 1836, 1836, 1837, 1837, 1838, 1838, 1838, 183…
$ doctype                 <chr> "Letter", "Letter", "Letter", "Letter", "Lett…
$ allsubject              <chr> "Childbirth; Church attendance; Cities; Farms…
$ broadsubj               <chr> "Health; Religion; Communities; Relationships…
$ personalevent           <chr> NA, NA, NA, NA, NA, NA, NA, NA, "Physical ill…
$ wwritten                <chr> "Baltimore, MD; Maryland; United States; Mid-…
$ docauthorid             <chr> "per0022938", "per0022938", "per0022938", "pe…
$ docauthorname           <chr> "Bruns, Jette, 1813-1899", "Bruns, Jette, 181…
$ language          

In [232]:
# List each distinct combination of cultural heritage and religion
unique(letters[, c("culturalheritage", "religion")])

Unnamed: 0,culturalheritage,religion
1,German; European,Catholic; Christian
137,Italian; European,Catholic; Christian
193,Jewish; Russian; European,Jewish
194,Jewish,Jewish
198,Russian; European,Jewish
200,Jewish; Spanish; European,Jewish
205,Jewish; Polish; European,Jewish
214,Jewish; Lithuanian; European,Jewish
222,Polish; European,Jewish
225,Spanish; European,Jewish


In [233]:
# Create a new variable called "religionNew" containing the values from religion
letters$religionNew <- letters$religion

In [234]:
# Recode religions into broad groups. The rules run in the order listed; any
# value that contains a rule's pattern is replaced entirely by that rule's
# label (so "Catholic; Christian" becomes "Catholic"). Values matching no rule,
# and missing values, are left as they are.
letters$religionNew <- Reduce(
  function(values, rule) {
    values[grep(rule[["pattern"]], values)] <- rule[["label"]]
    values
  },
  list(
    c(pattern = "Episcopalian", label = "Protestant"),
    c(pattern = "Anglican", label = "Protestant"),
    c(pattern = "Methodist", label = "Protestant"),
    c(pattern = "Lutheran", label = "Protestant"),
    c(pattern = "Presbyterian", label = "Protestant"),
    c(pattern = "Dutch Reformed", label = "Protestant"),
    c(pattern = "Protestant", label = "Protestant"),
    c(pattern = "Mormon", label = "Mormon"),
    c(pattern = "Catholic", label = "Catholic")
  ),
  letters$religionNew
)
unique(letters[, c("culturalheritage", "religionNew")])

Unnamed: 0,culturalheritage,religionNew
1,German; European,Catholic
137,Italian; European,Catholic
193,Jewish; Russian; European,Jewish
194,Jewish,Jewish
198,Russian; European,Jewish
200,Jewish; Spanish; European,Jewish
205,Jewish; Polish; European,Jewish
214,Jewish; Lithuanian; European,Jewish
222,Polish; European,Jewish
225,Spanish; European,Jewish


In [235]:
# Create a new variable called "nationalOrigin" containing the values from culturalheritage
letters$nationalOrigin <- letters$culturalheritage

In [236]:
# Strip the continent qualifiers: drop the first "; European" and the first
# "; African" from each value, then review the remaining combinations.
letters$nationalOrigin <- letters$nationalOrigin %>%
  str_remove("; European") %>%
  str_remove("; African")
unique(letters[, c("nationalOrigin", "religionNew")])

Unnamed: 0,nationalOrigin,religionNew
1,German,Catholic
137,Italian,Catholic
193,Jewish; Russian,Jewish
194,Jewish,Jewish
198,Russian,Jewish
200,Jewish; Spanish,Jewish
205,Jewish; Polish,Jewish
214,Jewish; Lithuanian,Jewish
222,Polish,Jewish
225,Spanish,Jewish


In [237]:
# Everywhere that Jewish appears in culture, it also appears in religion, so
# strip it from the national origin. First drop a leading "Jewish; " prefix;
# any value that still contains "Jewish" afterwards is set to NA.
letters$nationalOrigin <- letters$nationalOrigin %>%
  str_remove("Jewish; ") %>%
  str_replace("Jewish", replacement = NA_character_)
unique(letters[, c("nationalOrigin", "religionNew")])

Unnamed: 0,nationalOrigin,religionNew
1,German,Catholic
137,Italian,Catholic
193,Russian,Jewish
194,,Jewish
200,Spanish,Jewish
205,Polish,Jewish
214,Lithuanian,Jewish
226,Italian,
229,English,Protestant
330,Czech,Catholic


In [238]:
# Start the combined "culture" label from the cleaned national origin and
# tabulate how many letters carry each value.
letters$culture <- letters$nationalOrigin
table(letters[["culture"]])


           Czech   Dushane; Xhosa            Dutch          English 
               6                3               35              367 
        European          Finnish French; Prussian           German 
               1               12                7              145 
           Irish  Irish; Scottish          Italian       Lithuanian 
              26               17               59                1 
       Norwegian           Polish         Prussian          Russian 
             105                3               34               16 
        Scottish          Spanish 
              72                2 

In [239]:
# Append the recoded religion to the national origin, separated by a space.
# paste() turns missing values into the literal text "NA", which is cleaned up
# in a later step.
letters$culture <- paste(letters$culture, letters$religionNew, sep = " ")
table(letters[["culture"]])


            Czech Catholic          Dushane; Xhosa NA 
                         6                          3 
                  Dutch NA           Dutch Protestant 
                        34                          1 
         English Christian             English Mormon 
                       231                          1 
                English NA         English Protestant 
                        30                        105 
               European NA         Finnish Protestant 
                         1                         12 
       French; Prussian NA            German Catholic 
                         7                        136 
                 German NA          German Protestant 
                         1                          8 
            Irish Catholic                   Irish NA 
                         1                         25 
Irish; Scottish Protestant           Italian Catholic 
                        17                         56 
         

In [240]:
# Tidy the pasted labels: rows where both parts were missing become "Unknown",
# a leftover "NA" from either side is dropped, and the first "; " separator
# inside a compound origin is turned into a hyphen.
letters$culture[letters$culture %in% "NA NA"] <- "Unknown"
letters$culture <- letters$culture %>%
  str_remove(" NA") %>%
  str_remove("NA ") %>%
  str_replace("; ", "-")
table(letters[["culture"]])


           Czech Catholic             Dushane-Xhosa                     Dutch 
                        6                         3                        34 
         Dutch Protestant                   English         English Christian 
                        1                        30                       231 
           English Mormon        English Protestant                  European 
                        1                       105                         1 
       Finnish Protestant           French-Prussian                    German 
                       12                         7                         1 
          German Catholic         German Protestant                     Irish 
                      136                         8                        25 
           Irish Catholic Irish-Scottish Protestant                   Italian 
                        1                        17                         3 
         Italian Catholic                    Jewish

In [241]:
# What is the culture of the authors?
# Keep one row per author (the first row for each docauthorid) so that authors
# with many letters are counted once, then count authors per culture label.
letters %>%
  distinct(docauthorid, .keep_all = TRUE) %>%
  count(culture, sort = TRUE)

culture,n
Norwegian,61
Scottish,27
Dutch,24
English,24
Russian Jewish,16
Jewish,13
English Christian,9
Norwegian Christian,6
Norwegian Protestant,6
Irish,5


In [242]:
# What are the national origins of the authors?
# Keep one row per author (the first row for each docauthorid), then count
# authors per national origin, most common first.
letters %>%
  distinct(docauthorid, .keep_all = TRUE) %>%
  count(nationalOrigin, sort = TRUE)

nationalOrigin,n
Norwegian,73
English,36
Scottish,30
Dutch,25
Russian,16
,14
Irish,6
German,3
Polish,3
Czech,2


In [243]:
# What are the religions of the authors?
# Keep one row per author (the first row for each docauthorid), then count
# authors per recoded religion, most common first.
letters %>%
  distinct(docauthorid, .keep_all = TRUE) %>%
  count(religionNew, sort = TRUE)

religionNew,n
,148
Jewish,34
Christian,16
Protestant,16
Catholic,5
Mormon,1


In [244]:
# Convert every character column of a data frame to a factor.
#
# Args:
#   df: a data frame (may have zero columns).
# Returns:
#   `df` with each character column replaced by as.factor() of it; all other
#   columns are left untouched.
factorize <- function(df) {
  # is.character() yields exactly one logical per column, unlike comparing
  # class() to a string, which breaks down once any column has several classes.
  character_cols <- which(vapply(df, is.character, logical(1)))
  for (col in character_cols) {
    df[[col]] <- as.factor(df[[col]])
  }
  df
}

In [245]:
# Turn the character columns back into factors for the final dataset and
# preview the result.
lettersFinal <- letters %>%
  factorize()
glimpse(lettersFinal)

Observations: 925
Variables: 36
$ X.1                     <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14…
$ X                       <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14…
$ docsequence             <int> 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 3…
$ docid                   <fct> S10003-D023, S10003-D024, S10003-D025, S10003…
$ docyear                 <int> 1836, 1836, 1837, 1837, 1838, 1838, 1838, 183…
$ doctype                 <fct> Letter, Letter, Letter, Letter, Letter, Lette…
$ allsubject              <fct> "Childbirth; Church attendance; Cities; Farms…
$ broadsubj               <fct> Health; Religion; Communities; Relationships;…
$ personalevent           <fct> NA, NA, NA, NA, NA, NA, NA, NA, Physical illn…
$ wwritten                <fct> "Baltimore, MD; Maryland; United States; Mid-…
$ docauthorid             <fct> per0022938, per0022938, per0022938, per002293…
$ docauthorname           <fct> "Bruns, Jette, 1813-1899", "Bruns, Jette, 181…
$ language          

In [246]:
# What is the religion of the authors?
# First: number of letters (rows) per recoded religion, largest first.
sort(summary(lettersFinal$religionNew), decreasing = TRUE)

# Second: number of distinct authors per religion, keeping one row per
# docauthorid before counting.
lettersFinal %>%
  distinct(docauthorid, .keep_all = TRUE) %>%
  count(religionNew, sort = TRUE)

religionNew,n
,148
Jewish,34
Christian,16
Protestant,16
Catholic,5
Mormon,1


In [247]:
# What is the nationality of the authors?
# First: number of letters (rows) per national origin, largest first.
sort(summary(lettersFinal$nationalOrigin), decreasing = TRUE)

# Second: number of distinct authors per national origin, keeping one row per
# docauthorid before counting.
lettersFinal %>%
  distinct(docauthorid, .keep_all = TRUE) %>%
  count(nationalOrigin, sort = TRUE)


nationalOrigin,n
Norwegian,73
English,36
Scottish,30
Dutch,25
Russian,16
,14
Irish,6
German,3
Polish,3
Czech,2


In [248]:
# What cultural groups are represented?
# First: number of letters (rows) per culture label, largest first.
sort(summary(lettersFinal$culture), decreasing = TRUE)

# Second: number of distinct authors per culture label, keeping one row per
# docauthorid before counting.
lettersFinal %>%
  distinct(docauthorid, .keep_all = TRUE) %>%
  count(culture, sort = TRUE)

culture,n
Norwegian,61
Scottish,27
Dutch,24
English,24
Russian Jewish,16
Jewish,13
English Christian,9
Norwegian Christian,6
Norwegian Protestant,6
Irish,5


In [221]:
# Summarise each occupation column and show the counts from largest to smallest
lettersFinal$nativeoccupation %>%
  summary() %>%
  sort(decreasing = TRUE)
lettersFinal$northamericanoccupation %>%
  summary() %>%
  sort(decreasing = TRUE)

In [252]:
# Number of letters per immigration year, then per document year
table(lettersFinal[["yearimmigration"]])
table(lettersFinal[["docyear"]])


1801 1804 1819 1821 1824 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 
   1    2    8    1    1    3    2    1    7    7  109   10   12    7  136    1 
1841 1844 1847 1848 1849 1851 1853 1856 1857 1862 1865 1870 1882 1890 1892 1895 
   3    1   32    4    6    8   34  216    1   13    3    2   12    1    1    3 
1905 1906 1908 1909 1910 1912 1913 1931 1937 
   1    2    4    1    1    1    5    1    2 


1804 1805 1812 1817 1822 1823 1824 1825 1827 1828 1830 1831 1832 1833 1834 1835 
   1    1    1    9    3   17    1    1    3    8    7   11    3   19   17    3 
1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 
   5   10    7    5    3    5    4    2    4    2    3    4    4   11   13    3 
1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 
   5   15   13    6   12   36   40   47   35   28   33   47   50   12   11    6 
1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 
   7    8   11   11   10    4    4    2    8    5   11   11   13   20   16   18 
1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1900 
   8    9   12    8   15    9    5    9    3    6    7    6    5    2    4    3 
1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 
   1    2    1    1    3   13    5    6    7    5    8    2    7 

In [256]:
# Keep only letters whose author immigrated before 1914 or whose immigration
# year is missing, report how many rows remain, then summarise every column.
lettersFinal <- subset(lettersFinal, is.na(yearimmigration) | yearimmigration < 1914)
nrow(lettersFinal)
summary(lettersFinal)

      X.1               X           docsequence             docid    
 Min.   :   1.0   Min.   :   1.0   Min.   :  2.00   S10003-D023:  1  
 1st Qu.: 472.2   1st Qu.: 472.2   1st Qu.: 24.00   S10003-D024:  1  
 Median : 716.5   Median : 716.5   Median : 53.00   S10003-D025:  1  
 Mean   : 659.3   Mean   : 660.4   Mean   : 71.76   S10003-D026:  1  
 3rd Qu.: 965.5   3rd Qu.: 965.5   3rd Qu.:110.00   S10003-D027:  1  
 Max.   :1212.0   Max.   :1222.0   Max.   :239.00   S10003-D028:  1  
                                                    (Other)    :916  
    docyear       doctype   
 Min.   :1804   Letter:922  
 1st Qu.:1856               
 Median :1862               
 Mean   :1865               
 3rd Qu.:1880               
 Max.   :1913               
 NA's   :8                  
                                                                          allsubject 
 Business; Sons; Economics; Relationships                                      :  4  
 Merchants; Economics               

In [258]:
# Work out which document IDs were in the original data but are no longer in
# the filtered data, then show how many there are and list them.
originalIDs <- letters$docid
finalIDs <- lettersFinal$docid
removedIDs <- originalIDs[is.na(match(originalIDs, finalIDs))]
length(removedIDs)
removedIDs

In [259]:
# Build a file name for each removed document by appending ".txt" to its ID
fileids <- paste0(removedIDs, ".txt")
fileids

In [260]:
# Confirm working directory
getwd()

In [261]:
# Folder, relative to the working directory, that holds the letter text files.
# It is addressed through explicit paths rather than setwd(), so the working
# directory is never changed and later relative paths (such as the final
# write.csv) still resolve against the project folder.
lettersDir <- "letters"
stopifnot(dir.exists(lettersDir))

In [262]:
# What are the files in that directory?
list.files(lettersDir)

In [263]:
# How many files are there?
length(list.files(lettersDir))

In [264]:
# Delete the fileids. unlink() is called once per file so the result shows a
# status for each path (0 = success, 1 = failure).
vapply(file.path(lettersDir, fileids), unlink, integer(1))

In [265]:
# Check it worked correctly by getting count.
length(list.files(lettersDir))

In [257]:
# Save the filtered metadata as a new dated .csv.
# NOTE(review): row.names is left at its default, so the row index is written
# as an extra leading column -- presumably the origin of the X / X.1 columns in
# the input file. Consider row.names = FALSE if nothing downstream relies on it.
write.csv(x = lettersFinal, file = "20201130_AM_Meta2Merge.csv")