# NAILDOH Nationality

## Resources

This is the second notebook used in the series to prepare and analyze the NAILDOH dataset.

In [35]:
# Options
options(digits = 1)

In [36]:
# Libraries
library(tidyverse) # for data manipulation

In [37]:
# Functions
factorize <- function(df){ # Create a function
  for(i in which(sapply(df, class) == "character")) # that looks for variables with the character class 
      df[[i]] = as.factor(df[[i]]) # and converts them to factor (i.e., categorical) class
  return(df)
}

unfactorize <- function(df){ # Create a function
  for(i in which(sapply(df, class) == "factor")) # that looks for variables with the character class 
      df[[i]] = as.character(df[[i]]) # and converts them to factor (i.e., categorical) class
  return(df)
}

In [39]:
# Data
letters <- factorize(read.csv("20240314a_PhD_NaildohSubset.csv")) # Put csv into a dataframe called docData

# Obserbations and Variables
glimpse(letters)

Rows: 986
Columns: 72
$ docsequence               [3m[90m<int>[39m[23m 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,…
$ docid                     [3m[90m<fct>[39m[23m S1019-D002, S1019-D004, S1019-D005, S1019-D0…
$ sourceid                  [3m[90m<fct>[39m[23m S1019, S1019, S1019, S1019, S1019, S1019, S1…
$ docauthorid               [3m[90m<fct>[39m[23m per0001043, per0001043, per0001043, per00010…
$ doctitle                  [3m[90m<fct>[39m[23m "Letter from Sister Blandina Segale to Siste…
$ docyear                   [3m[90m<int>[39m[23m 1872, 1872, 1872, 1872, 1873, 1873, 1873, 18…
$ docmonth                  [3m[90m<int>[39m[23m 11, 12, 12, 12, 3, 7, 9, 6, 11, 6, 9, 12, 1,…
$ docday                    [3m[90m<int>[39m[23m 30, 6, 10, 21, 1, NA, NA, 30, 14, NA, NA, 16…
$ docpage                   [3m[90m<fct>[39m[23m "3-10", "13-22", "22-29", "29-37", "37-44", …
$ doctype                   [3m[90m<fct>[39m[23m Letter, Letter, Letter, Letter, Le

# National Origin & Language

In [40]:
# See some example values in the cultural_heritage variable

unique(letters$cultural_heritage)[0:5]

In [41]:
# Create a new variable called "nationalOrigin" 
# Populate it with  the values from cultural_heritage
letters$nationalOrigin <- as.character(letters$cultural_heritage)

# Strip continents
letters['nationalOrigin'] <- str_remove(letters$nationalOrigin, "; European")
letters['nationalOrigin'] <- str_remove(letters$nationalOrigin, "; African")
letters['nationalOrigin'] <- str_remove(letters$nationalOrigin, "; Asian")

# Strip cultural reference (because this info appears in religion)
letters['nationalOrigin'] <- str_remove(letters$nationalOrigin, "Jewish; ")

# Replace blank cells with NA
letters$nationalOrigin[letters$nationalOrigin == ""]  <- NA

# Return to factor variable
letters$nationalOrigin  <- as.factor(letters$nationalOrigin)
summary(letters$nationalOrigin)

In [42]:
# Create a new variable called "britishEmpire_EU" and fill it with FALSE
letters$britishEmpire_EU <- FALSE

# Change NAs for nationalOrigin to NA for britishEmpire_EU
letters$britishEmpire_EU[is.na(letters$nationalOrigin)] <- NA

# Change "European" to NA
letters$britishEmpire_EU[letters$nationalOrigin=="European"] <- NA

# Now enter "TRUE" wherever the nationalOrigin is 
# part of the British Empire.

letters$britishEmpire_EU[letters$nationalOrigin %in% c("English", 
                                                       "Irish; Scottish", 
                                                       "Scottish", 
                                                       "Welsh", 
                                                       "Irish")]  <- TRUE

# See the data
summary(letters$britishEmpire_EU)

# Get the percentage
306/1032

   Mode   FALSE    TRUE    NA's 
logical     302     675       9 

In [43]:
# What are the sources of letters written by people 
# who originated from outside the British Empire
letters %>%
filter(britishEmpire_EU == FALSE | is.na(britishEmpire_EU)) %>%
select(sourcetitle) %>%
unique()

Unnamed: 0_level_0,sourcetitle
Unnamed: 0_level_1,<fct>
1,At the End of the Santa Fe Trail
56,"A Bintel Brief, vol. 1: Sixty Years of Letters from the Lower East Side to the Jewish Daily Forward"
117,"Papers of Diego Delfino, 1912-1929"
120,"Life Anew for Czech Immigrants: The Letters of Marie and Vavrin Stritecky, 1913-1934"
126,From East Prussia to the Golden Gate
133,The Uncorrupted Heart: Journals and Letters of Frederick Julius Gustorf 1800-1845
141,"Life and Letters of Joseph Riipa, 1868-1896"
153,Write Back Soon: Letters From Immigrants in America
212,"Advice to Emigrants, Who Intend to Settle in the United States of America, 2nd Edition, Greatly Enlarged and Improved"
213,Counsel for Emigrants


In [44]:
#Create variable for translated and fill with NAs
letters$translated <- NA

# Wherever writer originated from within British Empire, enter "False" for translation
letters$translated[letters$britishEmpire_EU == TRUE] <- FALSE

# Now go source by source by source and recode as necessary

# Start with sources that were not translated
letters$translated[letters$britishEmpire_EU == FALSE & 
                   letters$sourcetitle=="At the End of the Santa Fe Trail"] <- FALSE
letters$translated[letters$britishEmpire_EU == FALSE & 
                   letters$sourcetitle=="Advice to Emigrants, Who Intend to Settle in the United States of America, 2nd Edition, Greatly Enlarged and Improved"] <- FALSE
letters$translated[letters$britishEmpire_EU == FALSE & 
                   letters$sourcetitle=="Counsel for Emigrants"] <- FALSE
letters$translated[letters$britishEmpire_EU == FALSE & 
                   letters$sourcetitle=="Hints on Emigration to Upper Canada; Especially Addressed to the Middle and Lower Classes in Great Britain and Ireland"] <- FALSE

# Now sources that were translated
letters$translated[letters$britishEmpire_EU == FALSE & 
                   letters$sourcetitle=="A Bintel Brief, vol. 1: Sixty Years of Letters from the Lower East Side to the Jewish Daily Forward"] <- TRUE
letters$translated[letters$britishEmpire_EU == FALSE & 
                   letters$sourcetitle=="Papers of Diego Delfino, 1912-1929"] <- TRUE
letters$translated[letters$britishEmpire_EU == FALSE & 
                   letters$sourcetitle=="Life Anew for Czech Immigrants: The Letters of Marie and Vavrin Stritecky, 1913-1934"] <- TRUE
letters$translated[letters$britishEmpire_EU == FALSE & 
                   letters$sourcetitle=="From East Prussia to the Golden Gate"] <- TRUE
letters$translated[letters$britishEmpire_EU == FALSE & 
                   letters$sourcetitle=="The Uncorrupted Heart: Journals and Letters of Frederick Julius Gustorf 1800-1845"] <- TRUE
letters$translated[letters$britishEmpire_EU == FALSE & 
                   letters$sourcetitle=="Life and Letters of Joseph Riipa, 1868-1896"] <- TRUE
letters$translated[letters$britishEmpire_EU == FALSE & 
                   letters$sourcetitle=="Write Back Soon: Letters From Immigrants in America"] <- TRUE
# https://archives.calvin.edu/?p=collections/findingaid&id=79&q=&rootcontentid=13858
letters$translated[letters$britishEmpire_EU == FALSE & 
                   letters$sourcetitle=="Their Own Saga: Letters From the Norwegian Global Migration"] <- TRUE

#This one requires closer examination
letters %>%
filter(sourcetitle == "America's Immigrants: Adventures in Eyewitness History") %>%
select(nationalOrigin, britishEmpire_EU, translated, docauthorname, docid)

nationalOrigin,britishEmpire_EU,translated,docauthorname,docid
<fct>,<lgl>,<lgl>,<fct>,<fct>
English,True,False,"Thorpe, John, fl. 1828",S9974-D008
English,True,False,"Downe, John, fl. 1830",S9974-D010
Norwegian,False,,"Anonymous Norwegian Immigrant, fl. 1845",S9974-D017
Norwegian,False,,"Endresen, Guri, ?-1881",S9974-D027
Chinese,False,,"Ling, Kwang Chang, fl. 1878",S9974-D029
Russian,False,,"Anonymous Russian Jewish Mother, fl. 1906",S9974-D038
Jewish,False,,"Kislikoff, Raphael, fl. 1906",S9974-D039
Jewish,False,,"Anonymous Jewish Male Seeking Help, fl. 1906",S9974-D040
Russian,False,,"Mednikoff, Youah, fl. 1906",S9974-D041
Polish,False,,"Rembié'nska, Alesksandra, fl. 1911",S9974-D042


In [45]:
# These are all taken from other collections that have been translated. 
letters$translated[letters$docid=="S9974-D017"] <- TRUE
letters$translated[letters$docid=="S9974-D027"] <- TRUE
letters$translated[letters$docid=="S9974-D038"] <- TRUE
letters$translated[letters$docid=="S9974-D039"] <- TRUE
letters$translated[letters$docid=="S9974-D040"] <- TRUE
letters$translated[letters$docid=="S9974-D041"] <- TRUE
letters$translated[letters$docid=="S9974-D042"] <- TRUE

# This was a letter to the editor of an English periodical
letters$translated[letters$docid=="S9974-D029"] <- FALSE
letters$publicLetter[letters$docid=="S9974-D029"] <- NA
letters$publicLetter[letters$docid=="S9974-D029"] <- TRUE

# This author is from either Great Britain or Ireland 
# and hence his letter is not translated.
letters$translated[letters$docid=="S9974-D008"]  <- FALSE
letters$translated[letters$docid=="S9974-D010"]  <- FALSE

# Now check the recoding
letters %>%
filter(sourcetitle == "America's Immigrants: Adventures in Eyewitness History") %>%
select(nationalOrigin, britishEmpire_EU, translated, docauthorname, docid)

nationalOrigin,britishEmpire_EU,translated,docauthorname,docid
<fct>,<lgl>,<lgl>,<fct>,<fct>
English,True,False,"Thorpe, John, fl. 1828",S9974-D008
English,True,False,"Downe, John, fl. 1830",S9974-D010
Norwegian,False,True,"Anonymous Norwegian Immigrant, fl. 1845",S9974-D017
Norwegian,False,True,"Endresen, Guri, ?-1881",S9974-D027
Chinese,False,False,"Ling, Kwang Chang, fl. 1878",S9974-D029
Russian,False,True,"Anonymous Russian Jewish Mother, fl. 1906",S9974-D038
Jewish,False,True,"Kislikoff, Raphael, fl. 1906",S9974-D039
Jewish,False,True,"Anonymous Jewish Male Seeking Help, fl. 1906",S9974-D040
Russian,False,True,"Mednikoff, Youah, fl. 1906",S9974-D041
Polish,False,True,"Rembié'nska, Alesksandra, fl. 1911",S9974-D042


In [46]:
vars <- c("nationalOrigin", "britishEmpire_EU", "translated")
summary(letters[vars])

   nationalOrigin britishEmpire_EU translated     
 English  :366    Mode :logical    Mode :logical  
 Welsh    :188    FALSE:302        FALSE:731      
 Norwegian: 87    TRUE :675        TRUE :246      
 Scottish : 69    NA's :9          NA's :9        
 Italian  : 58                                    
 (Other)  :212                                    
 NA's     :  6                                    

In [47]:
# What are the NAs?
letters %>%
filter(is.na(translated) | is.na(britishEmpire_EU) | is.na(translated)) %>%
select(nationalOrigin, britishEmpire_EU, docauthorname, sourcetitle, docid)

nationalOrigin,britishEmpire_EU,docauthorname,sourcetitle,docid
<fct>,<lgl>,<fct>,<fct>,<fct>
,,"Anonymous Jewish Male, V., fl. 1906","A Bintel Brief, vol. 1: Sixty Years of Letters from the Lower East Side to the Jewish Daily Forward",S11067-D016
,,"Steffens, Nicholas M., fl. 1891",Write Back Soon: Letters From Immigrants in America,S9831-D081
,,"Jennings, Joseph, fl. 1931","Advice to Emigrants, Who Intend to Settle in the United States of America, 2nd Edition, Greatly Enlarged and Improved",S9845-D004
,,"Buchanan, J. C., fl. 1833",Counsel for Emigrants,S9865-D009
,,"Buchanan, Alexander Carlisle, 1786-1840",Counsel for Emigrants,S9865-D010
European,,"Anonymous Government Agent in Upper Canada, fl. 1833",Counsel for Emigrants,S9865-D020
European,,"Aylmer, Matthew, Lord, 1775-1850",Counsel for Emigrants,S9865-D072
European,,"Anonymous Government Agent in Upper Canada, fl. 1833",Counsel for Emigrants,S9865-D073
,,"Prongley, Esau, fl. 1830",Hints on Emigration to Upper Canada; Especially Addressed to the Middle and Lower Classes in Great Britain and Ireland,S9873-D019


In [48]:
# Recoding the NAs for translated variable
letters$translated[letters$docid=="S11067-D016"]  <- TRUE #Yiddish publication
letters$translated[letters$docid=="S9831-D081"]  <- TRUE #Probably in Dutch
letters$translated[letters$docid=="S9845-D004"]  <- FALSE #From an English oriented book
letters$translated[letters$docid=="S9865-D009"]  <- FALSE #Irish born British agent
letters$translated[letters$docid=="S9865-D010"]  <- FALSE #Irish born British agent
letters$translated[letters$docid=="S9865-D020"]  <- FALSE #British official
letters$translated[letters$docid=="S9865-D072"]  <- FALSE #British official
letters$translated[letters$docid=="S9865-D073"]  <- FALSE #British official
letters$translated[letters$docid=="S9873-D019"]  <- FALSE #From an English oriented book

letters$nationalOrigin[letters$docid=="S9865-D072"]  <- "English" # http://www.biographi.ca/en/bio/whitworth_aylmer_matthew_7E.html
letters$britishEmpire_EU[letters$docid=="S9865-D072"]  <- TRUE 

letters$nationalOrigin[letters$docid=="S9865-D009"]  <- "Irish; Scottish" # http://www.ccchs.ca/Newsletters/CCCHS%20Newsletter--Winter-Spring%202019%20.pdf
letters$britishEmpire_EU[letters$docid=="S9865-D009"]  <- TRUE 

letters$nationalOrigin[letters$docid=="S9865-D010"]  <- "Irish; Scottish" # http://biographi.ca/en/bio/buchanan_alexander_carlisle_1786_1840_7E.html
letters$britishEmpire_EU[letters$docid=="S9865-D010"]  <- TRUE 

letters$nationalOrigin[letters$docid=="S9873-D019"]  <- "English" # https://www.findagrave.com/memorial/133281403/esau-prangley
letters$britishEmpire_EU[letters$docid=="S9873-D019"]  <- TRUE

letters$nationalOrigin[letters$docid=="S9831-D081"]  <- "German" # https://en.wikisource.org/wiki/Author:Nicholas_Martin_Steffens
letters$britishEmpire_EU[letters$docid=="S9831-D081"]  <- FALSE

The national origins of the individuals below cannot be resolved so will remain as missing, to be predicted using multiple imputation at a later stage.

In [49]:
# What are the NAs?
letters %>%
filter(is.na(translated) | is.na(britishEmpire_EU) | is.na(translated)) %>%
select(nationalOrigin, britishEmpire_EU, docauthorname, sourcetitle, docid)

nationalOrigin,britishEmpire_EU,docauthorname,sourcetitle,docid
<fct>,<lgl>,<fct>,<fct>,<fct>
,,"Anonymous Jewish Male, V., fl. 1906","A Bintel Brief, vol. 1: Sixty Years of Letters from the Lower East Side to the Jewish Daily Forward",S11067-D016
,,"Jennings, Joseph, fl. 1931","Advice to Emigrants, Who Intend to Settle in the United States of America, 2nd Edition, Greatly Enlarged and Improved",S9845-D004
European,,"Anonymous Government Agent in Upper Canada, fl. 1833",Counsel for Emigrants,S9865-D020
European,,"Anonymous Government Agent in Upper Canada, fl. 1833",Counsel for Emigrants,S9865-D073


Now there is the issue of the Welsh letters, most of which have been translated into English, according to the preface to the source. The notes for the book indicate which letters were originally in English (n = 27). The texts were accidentally deleted from the dataset. To correct this and include only the ones originally in English, the following steps were taken:

<ol>
    <li>Make all values for this source TRUE for translated</li>
    <li>Match items marked "Original in English" in the Notes to items in the CSV</li>
<li>Find those items in the book (https://books.google.ca/books?id=ECxwYKeFWkcC&printsec=frontcover&vq=%22original+in+English%22)</li>
<li>Create new txt file with just the text of this letter</li>
    <li>Give it the same name as indicated in the CSV.</li>
    <li>Put docids for letters originally in English into a vector</li>
    <li>Everywhere a value in that vector appears in docid, code FALSE for translated.</li>
</ol>

In [50]:
# The "bulk" of the letters contained in this book are translated from Welsh into English
# But it is not specified which are translated.
# Also, this source is missing from the dataset
# Make all values for this source TRUE for translated

letters$translated[letters$sourcetitle =="The Welsh in America: Letters from the Immigrants"] <- TRUE

In [51]:
letters  %>% 
filter(sourcetitle == "The Welsh in America: Letters from the Immigrants" & 
       grepl("Pugh", docauthorname)) %>% 
select(docid, docauthorname, docyear, docmonth, docday)  %>% 
arrange(docyear) 

docid,docauthorname,docyear,docmonth,docday
<fct>,<fct>,<int>,<int>,<int>
S316-D061,"Pugh, Margaret",1845,3,2


The date is incorrect on this one so I've recoded. Also, this is an excerpt, so it must be noted that many of the letters in this collection are likely so as well. 
https://www.britishnewspaperarchive.co.uk/viewer/bl/0002971/18470108/021/0003

In [52]:
# Correct the year, month and day
letters$docyear[letters$docid == "S316-D061"]  <- 1846
letters$docmonth[letters$docid == "S316-D061"]  <- 11
letters$docday[letters$docid == "S316-D061"]  <- 15

In [53]:
letters  %>% 
filter(sourcetitle == "The Welsh in America: Letters from the Immigrants" & 
       grepl("Jane", docauthorname)) %>% 
select(docid, docauthorname, docyear, docmonth, docday)  %>% 
arrange(docyear) 

docid,docauthorname,docyear,docmonth,docday
<fct>,<fct>,<dbl>,<dbl>,<dbl>
S316-D205,"Anonymous Welsh Immigrant, Jane, fl. 1862",1862,9,16


The date for S316-D204 is the date the letter was published not written. No date is indicated in the published version, reference only being made to a letter "lately received." I will leave the date but making note of this. https://www.britishnewspaperarchive.co.uk/viewer/bl/0000915/18570321/006/0003

In [54]:
letters  %>% 
filter(docid == "S316-D203") %>% 
#filter(docyear == "1880" & docmonth == "3")  %>% 
select(docid, docauthorname, docyear, docmonth, docday) 

docid,docauthorname,docyear,docmonth,docday
<fct>,<fct>,<dbl>,<dbl>,<dbl>
S316-D203,"Grant, David, fl. 1856",1856,8,31


In [55]:
# Put docids for letters originally in English into a vector
vals  <- c("S316-D004", "S316-D031", "S316-D037", "S316-D040", "S316-D049", "S316-D050", "S316-D052", 
           "S316-D053", "S316-D054", "S316-D055", "S316-D058", "S316-D061", "S316-D064",
          "S316-D071","S316-D072", "S316-D079", "S316-D132", "S316-D137","S316-D163",
          "S316-D180", "S316-D182", "S316-D189", "S316-D193", "S316-D202", "S316-D204", 
           "S316-D205")

In [56]:
# Everywhere a value in that vector appears in docid, code FALSE for translated.
letters$translated[letters$docid %in% vals]  <- FALSE

In [57]:
# See the counts for cultural variables
letters %>%
filter(translated==FALSE) %>%
select(nationalOrigin) %>%
summary()

  nationalOrigin
 English :368   
 Scottish: 69   
 Italian : 55   
 Irish   : 35   
 Welsh   : 26   
 (Other) : 22   
 NA's    :  1   

In [58]:
# See the counts for cultural variables
letters %>%
filter(translated==FALSE) %>%
select(nationalOrigin) %>%
table()

.
         Chinese            Czech   Dushane; Xhosa            Dutch 
               1                0                0                0 
         English         European          Finnish           French 
             368                2                0                0 
French; Prussian           German            Irish  Irish; Scottish 
               0                0               35               19 
         Italian           Jewish       Lithuanian        Norwegian 
              55                0                0                0 
          Polish          Russian  Russian; Polish         Scottish 
               0                0                0               69 
         Spanish            Welsh 
               0               26 

In [59]:
# update dataframe

letters <- letters %>%
filter(translated==FALSE)

# How many observations now
nrow(letters)

In [60]:
letters  %>% 
filter(is.na(nationalOrigin)) %>% 
select(docauthorname, authorgender, north_american_occupation, religion, docid, publicLetter, sourcetitle, translated)

docauthorname,authorgender,north_american_occupation,religion,docid,publicLetter,sourcetitle,translated
<fct>,<fct>,<fct>,<fct>,<fct>,<lgl>,<fct>,<lgl>
"Jennings, Joseph, fl. 1931",M,,Not indicated,S9845-D004,,"Advice to Emigrants, Who Intend to Settle in the United States of America, 2nd Edition, Greatly Enlarged and Improved",False


In [61]:
letters  %>% 
filter(britishEmpire_EU==FALSE) %>% 
select(docauthorname, authorgender, north_american_occupation, religion, publicLetter, sourcetitle, translated) %>% 
unique()

Unnamed: 0_level_0,docauthorname,authorgender,north_american_occupation,religion,publicLetter,sourcetitle,translated
Unnamed: 0_level_1,<fct>,<fct>,<fct>,<fct>,<lgl>,<fct>,<lgl>
1,"Segale, Sister Blandina, 1850-1941",F,Nun; Social worker; Teacher,Catholic; Christian,,At the End of the Santa Fe Trail,False
56,"Ling, Kwang Chang, fl. 1878",M,,Not indicated,True,America's Immigrants: Adventures in Eyewitness History,False


In [62]:
write.csv(letters, 
          "20240314b_PhD_NaildohSubset.csv", 
          row.names=FALSE)

In [63]:
glimpse(letters)

Rows: 576
Columns: 76
$ docsequence               [3m[90m<int>[39m[23m 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,…
$ docid                     [3m[90m<fct>[39m[23m S1019-D002, S1019-D004, S1019-D005, S1019-D0…
$ sourceid                  [3m[90m<fct>[39m[23m S1019, S1019, S1019, S1019, S1019, S1019, S1…
$ docauthorid               [3m[90m<fct>[39m[23m per0001043, per0001043, per0001043, per00010…
$ doctitle                  [3m[90m<fct>[39m[23m "Letter from Sister Blandina Segale to Siste…
$ docyear                   [3m[90m<dbl>[39m[23m 1872, 1872, 1872, 1872, 1873, 1873, 1873, 18…
$ docmonth                  [3m[90m<dbl>[39m[23m 11, 12, 12, 12, 3, 7, 9, 6, 11, 6, 9, 12, 1,…
$ docday                    [3m[90m<dbl>[39m[23m 30, 6, 10, 21, 1, NA, NA, 30, 14, NA, NA, 16…
$ docpage                   [3m[90m<fct>[39m[23m "3-10", "13-22", "22-29", "29-37", "37-44", …
$ doctype                   [3m[90m<fct>[39m[23m Letter, Letter, Letter, Letter, Le

In [64]:
length(unique(letters$docauthorid))