## Prep Dataset

In [3]:
# Install packages, load libraries.
library(tidyverse)
library(stringr)

── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 3.1.0     ✔ purrr   0.2.5
✔ tibble  2.0.1     ✔ dplyr   0.7.6
✔ tidyr   0.8.1     ✔ stringr 1.3.1
✔ readr   1.1.1     ✔ forcats 0.3.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()


Tidyverse is used in this notebook mostly for data manipulation and visualization. Relevant documentation is at https://dplyr.tidyverse.org and https://ggplot2.tidyverse.org.

Stringr used to recode values for some variables (e.g., North American Occupation).

In [19]:
# Converts all factors to character class
unfactorize <- function(df){
  for(i in which(sapply(df, class) == "factor")) df[[i]] = as.character(df[[i]])
  return(df)
}
# Code from user "By0" at https://stackoverflow.com/questions/2851015/convert-data-frame-columns-from-factors-to-characters (line 14)

In [20]:
# Import, unfactorize and view data.
authorData <- unfactorize(read.csv("IMLD_AUTHORS_QA completed.csv", na.strings=c("", "Not indicated")))
docData <- unfactorize(read.csv("IMLD_DOCS_QA completed.csv", na.strings=c("", "Not indicated")))
glimpse(authorData)
glimpse(docData)

Observations: 2,162
Variables: 30
$ sourceids                 <chr> NA, "S10000; S9527", "S10001", "S10002", "S…
$ numdocs                   <int> 502, 23, 19, 23, 172, 2, 1, 3, 1, 1, 24, 1,…
$ docauthorid               <chr> "per0002637", "per0021589", "per0022935", "…
$ docauthorname             <chr> "Editor", "Pilibosian, Khachadoor, 1904-198…
$ alternatenames            <chr> NA, NA, NA, NA, "Giesberg, Henriette Ann El…
$ briefname                 <chr> "Editor", "Khachadoor Pilibosian", "Evelio …
$ authrace                  <chr> "Not applicable", "White", "Black", "White"…
$ nationality               <chr> "Not applicable", NA, "United States", "Uni…
$ religion                  <chr> "Not applicable", "Catholic; Christian", "C…
$ birthyear                 <int> NA, 1904, 1919, 1882, 1813, 1774, 1794, 181…
$ birthmonth                <int> NA, NA, NA, 4, NA, 8, NA, 9, 5, 4, NA, NA, …
$ birthday                  <int> NA, NA, NA, 16, NA, 11, NA, 17, 3, 23, NA, …
$ deathyear       

In [21]:
#Merge datasets
df <- right_join(authorData, docData, by = 'docauthorid')
glimpse(df)

Observations: 8,749
Variables: 100
$ sourceids                   <chr> NA, "S10000; S9527", "S10000; S9527", "S1…
$ numdocs                     <int> 502, 23, 23, 23, 23, 23, 23, 23, 23, 23, …
$ docauthorid                 <chr> "per0002637", "per0021589", "per0021589",…
$ docauthorname.x             <chr> "Editor", "Pilibosian, Khachadoor, 1904-1…
$ alternatenames              <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ briefname.x                 <chr> "Editor", "Khachadoor Pilibosian", "Khach…
$ authrace.x                  <chr> "Not applicable", "White", "White", "Whit…
$ nationality.x               <chr> "Not applicable", NA, NA, NA, NA, NA, NA,…
$ religion.x                  <chr> "Not applicable", "Catholic; Christian", …
$ birthyear.x                 <int> NA, 1904, 1904, 1904, 1904, 1904, 1904, 1…
$ birthmonth.x                <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ birthday.x                  <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ deathyear.x    

In [22]:
# Converts character to factor class
factorize <- function(df){
  for(i in which(sapply(df, class) == "character")) df[[i]] = as.factor(df[[i]])
  return(df)
}

In [23]:
df <- factorize(df)
summary(df)

               sourceids       numdocs           docauthorid  
 S6210              : 232   Min.   :  1.00   per0002637: 460  
 S11073             : 189   1st Qu.: 10.00   per0038009: 190  
 S10003             : 180   Median : 28.00   per0022938: 172  
 S9140; S9435       : 171   Mean   : 69.89   per0018089: 171  
 S2344; S4963; S9924: 150   3rd Qu.: 64.00   per0004772: 150  
 (Other)            :7174   Max.   :502.00   per0007475: 149  
 NA's               : 653   NA's   :193      (Other)   :7457  
                               docauthorname.x
 Editor                                : 460  
 Harris, Sarah Stretch, 1818-1897      : 190  
 Bruns, Jette, 1813-1899               : 172  
 Berenson, Bernard, 1865-1959          : 171  
 Moodie, Susannah Strickland, 1803-1885: 150  
 (Other)                               :7413  
 NA's                                  : 193  
                                                       alternatenames
 Giesberg, Henriette Ann Elisabeth; Bruns, Mrs. Jo

## Select Columns

In [24]:
df  <- select(df,
       docsequence, 
       docid, 
       docyear, 
       doctype, 
       allsubject, 
       broadsubj, 
       personalevent,
       wwritten,
       docauthorid,
       docauthorname.x,
       docauthorname.y,
       language, 
       editor, 
       briefname.x,
       briefname.y, 
       authrace.x,
       authrace.y,
       nationality.x,
       nationality.y,
       religion.x,
       religion.y,
       birthyear.x,
       birthyear.y,
       deathyear.x,
       deathyear.y,
       birthplace.x,
       birthplace.y,
       deathplace.x,
       deathplace.y,
       ageatdeath, 
       authorgender.x,
       authorgender.y,
       native_occupation.x,
       native_occupation.y,
       north_american_occupation.x,
       north_american_occupation.y,
       year_immigration.x,
       year_immigration.y,
       cultural_heritage.x,
       cultural_heritage.y,
       stayed_north_america.x,
       stayed_north_america.y,
       author_generation.x, 
       author_generation.y, 
       agewriting,
       marriagestatus, 
       maternalstatus)
summary(df)

  docsequence             docid         docyear                 doctype    
 Min.   :  0.00   S10000-D001:   1   Min.   :1784   Chapter         :3367  
 1st Qu.:  4.00   S10000-D002:   1   1st Qu.:1880   Letter          :1866  
 Median : 14.00   S10000-D003:   1   Median :1925   Oral history    :1028  
 Mean   : 28.87   S10000-D004:   1   Mean   :1922   Diary           : 750  
 3rd Qu.: 35.00   S10000-D005:   1   3rd Qu.:1961   Emigration guide: 705  
 Max.   :241.00   S10000-D006:   1   Max.   :2004   Editorial       : 499  
                  (Other)    :8743   NA's   :168    (Other)         : 534  
                                                                allsubject  
 Correspondence; Intellectual life                                   :  13  
 Internment camps; Politics                                          :   9  
 Angel Island Immigration Center, San Francisco, CA; Meals; Health   :   7  
 Immigrants and emigrants; Irish people; Domestic life; Ethnic groups:   7  
 Immigr

In [25]:
# Create a new column for resolving author_generation discrepancy
df$author_generation <- df$author_generation.x
summary(df$author_generation)
summary(df$author_generation.x)
summary(df$author_generation.y)

## Compare Subsets

In [26]:
#Subset data to isolate letters written by immigrants in North America
nrow(filter(df, 
       doctype == "Letter" & #Only letters
            grepl("English", language) & #Originally in English or translated into English
            author_generation.x == "First" & #Only 1st generation migrants
            grepl("North America", wwritten))) #Writing only from North America

#Subset data to isolate letters written by immigrants in North America
nrow(filter(df, 
       doctype == "Letter" & #Only letters
            grepl("English", language) & #Originally in English or translated into English
            author_generation.y == "First" & #Only 1st generation migrants
            grepl("North America", wwritten))) #Writing only from North America

In [27]:
# What's the difference?
1272-1225

## Diagnostics

In [28]:
# Went back to check on 19 errors between the two datasets. 
# Ids selected from "Author Generation" output in CrosscheckData notebook.
authorData[authorData$docauthorid == "per0022893",]
unique(docData[docData$docauthorid == "per0022893", "doctype"])

authorData[authorData$docauthorid == "per0003974",]
unique(docData[docData$docauthorid == "per0003974", "doctype"])

Unnamed: 0,sourceids,numdocs,docauthorid,docauthorname,alternatenames,briefname,authrace,nationality,religion,birthyear,⋯,authorgender,native_occupation,north_american_occupation,organization_affiliations,year_immigration,cultural_heritage,point_of_entry,point_of_emigration,stayed_north_america,author_generation
1989,S9955,18,per0022893,"Clayton, William, 1814-1879",,William Clayton,White,United States,Mormon; Christian,1814,⋯,M,,,,,English; European,,,Stayed,First


Unnamed: 0,sourceids,numdocs,docauthorid,docauthorname,alternatenames,briefname,authrace,nationality,religion,birthyear,⋯,authorgender,native_occupation,north_american_occupation,organization_affiliations,year_immigration,cultural_heritage,point_of_entry,point_of_emigration,stayed_north_america,author_generation
1999,S9974,1,per0003974,"Carnegie, Andrew, 1835-1919",,Andrew Carnegie,White,United States,,1835,⋯,M,,,,,Scottish; European,,,Stayed,First


These were not letters, so 19 discrepancies caught by arsenal do not account for the difference found here. 

According to right_join documentation, for IDs in y (docData) but not in x (authorData), an NA is returned. I went back to the CrosscheckData notebook to see how many IDs meeting the subset criteria were in docData but not authorData. There were exactly 47. Thus, it seems that the discrepancy is due to IDs that appeared in docData but not authorData. 

In [29]:
# Create vector of subset IDs in Y but not X.
missingIds  <- read.table('missingIds.txt')

In [30]:
missingIds  <- missingIds$x

In [31]:
# Confirm that these IDs are indeed in docData (y) but not in authorData (x)
summary(subset(docData, grepl(paste(missingIds,collapse="|"), docauthorid), select = c("docauthorid", "author_generation", "doctype")))
nrow(subset(authorData, grepl(paste(missingIds,collapse="|"), docauthorid), select = c("docauthorid", "author_generation")))

 docauthorid        author_generation    doctype         
 Length:48          Length:48          Length:48         
 Class :character   Class :character   Class :character  
 Mode  :character   Mode  :character   Mode  :character  

In [32]:
#There is one too many. Is this because a letter was written from outside North America?
subset(docData, grepl(paste(missingIds,collapse="|"), docauthorid), select = "wwritten")

Unnamed: 0,wwritten
3382,"Ebensburg, PA; Pennsylvania; United States; Mid-Atlantic States; Northeast States; East Coast States; North America"
3383,"Albany, NY; New York; United States; Mid-Atlantic States; Northeast States; East Coast States; North America"
3386,United States; North America
3388,"Racine, WI; Wisconsin; United States; East North Central States; Midwest States; Great Lakes States; North America"
3391,"Cincinnati, OH; Ohio; United States; East North Central States; Midwest States; Mississippi Basin States; North America"
3393,North America
3396,United States; North America
3398,"Granville, OH; Ohio; United States; East North Central States; Midwest States; Mississippi Basin States; North America"
3400,"New York, NY; New York; United States; Mid-Atlantic States; Northeast States; East Coast States; North America"
3402,Indiana; United States; East North Central States; Midwest States; Mississippi Basin States; North America


Yes, one of these letters was written outside the United States. So we now have 47 letters associated with IDs in y but not x. Now, how do these appear after the merge (i.e., as NAs)? 

In [33]:
# This code does not work 
#df[df$docauthorid[missingIds],c("docauthorid", "author_generation.x", "author_generation.y")]

#But this code does seeem to indicate that the 47 letters were coded as NA for author_generation.x
subset(df, grepl(paste(missingIds,collapse="|"), docauthorid), select = c("docauthorid", "author_generation.x", "author_generation.y", "doctype"))
nrow(subset(df, grepl(paste(missingIds,collapse="|"), docauthorid), select = c("docauthorid", "author_generation.x", "author_generation.y", "doctype")))

Unnamed: 0,docauthorid,author_generation.x,author_generation.y,doctype
3382,per0031173,,First,Letter
3383,per0031175,,First,Letter
3386,per0041383,,First,Letter
3388,per0031180,,First,Letter
3391,per0004486,,First,Letter
3393,per0041388,,First,Letter
3396,per0041394,,First,Letter
3398,per0031224,,First,Letter
3400,per0031227,,First,Letter
3402,per0031228,,First,Letter


In [34]:
filter(df, grepl(paste(missingIds,collapse="|"), docauthorid)) %>% select(ends_with("y"))

docauthorname.y,briefname.y,authrace.y,nationality.y,religion.y,birthyear.y,deathyear.y,birthplace.y,deathplace.y,authorgender.y,native_occupation.y,north_american_occupation.y,year_immigration.y,cultural_heritage.y,stayed_north_america.y,author_generation.y
"Roberts, George, fl. 1850",George Roberts,White,American,,,,,,M,,Clergy,1795.0,Welsh; European,No,First
"Harry, David Shone, fl. 1817",David Shone Harry,White,,,,,,,M,,,,Welsh; European,No,First
"Jones, Edward, fl. 1837",Edward Jones,White,,,,,Wales; United Kingdom; Western Europe; Europe,,M,,,1837.0,Welsh; European,No,First
"Cheshire, John, fl. 1847",John Cheshire,White,American,,,,Wales; United Kingdom; Western Europe; Europe,,M,,,1846.0,Welsh; European,No,First
"Roberts, Samuel, fl. 1856-1870",Samuel Roberts,White,Welsh; European,Congregationalist; Christian,,,Wales; United Kingdom; Western Europe; Europe,,M,Clergy; Farmer,,1857.0,Welsh; European,No,First
"Jenkins, William, fl. 1862",William Jenkins,White,,,,,Wales; United Kingdom; Western Europe; Europe,,M,,,,Welsh; European,No,First
"Morddal, fl. 1866",Morddal,White,,,,,Wales; United Kingdom; Western Europe; Europe,,M,,,,Welsh; European,No,First
"Owen, John, fl. 1867",John Owen,White,,,,,,,M,,,1867.0,Welsh; European,No,First
"Lloyd, John, fl. 1868",John Lloyd,White,American,,,,Wales; United Kingdom; Western Europe; Europe,,M,,,1868.0,Welsh; European,No,First
"James, Hiram",Hiram James,White,,,,,,,M,,,,Welsh; European,No,First


These cases have missing data in authorData (x) but not entirely in docData (y).

## Recode the 48 cases above using y data

In [35]:
rows = which(grepl(paste(missingIds,collapse="|"), df$docauthorid))
length(rows)
print(rows)

 [1] 3382 3383 3386 3388 3391 3393 3396 3398 3400 3402 3405 3406 3407 3410 3411
[16] 3414 3416 3418 3422 3440 3441 3442 3448 3451 3454 3456 3459 3460 3464 3469
[31] 3477 3482 3496 3503 3507 3513 3514 3517 3518 3520 3521 3537 3547 3574 3577
[46] 3578 3579 3580


In [36]:
df$author_generation[rows] <- df$author_generation.y[rows]
df$author_generation[rows]

In [37]:
subset(df, grepl(paste(missingIds,collapse="|"), docauthorid), select = c("docauthorid", "author_generation.x", "author_generation.y", "author_generation", "doctype"))

Unnamed: 0,docauthorid,author_generation.x,author_generation.y,author_generation,doctype
3382,per0031173,,First,First,Letter
3383,per0031175,,First,First,Letter
3386,per0041383,,First,First,Letter
3388,per0031180,,First,First,Letter
3391,per0004486,,First,First,Letter
3393,per0041388,,First,First,Letter
3396,per0041394,,First,First,Letter
3398,per0031224,,First,First,Letter
3400,per0031227,,First,First,Letter
3402,per0031228,,First,First,Letter


In [38]:
summary(df$author_generation)
summary(df$author_generation.x)
summary(df$author_generation.y)

In [39]:
#Subset data to isolate letters written by immigrants in North America
nrow(filter(df, 
       doctype == "Letter" & #Only letters
            grepl("English", language) & #Originally in English or translated into English
            author_generation == "First" & #Only 1st generation migrants
            grepl("North America", wwritten))) #Writing only from North America

nrow(filter(df, 
       doctype == "Letter" & #Only letters
            grepl("English", language) & #Originally in English or translated into English
            author_generation.x == "First" & #Only 1st generation migrants
            grepl("North America", wwritten))) #Writing only from North America

#Subset data to isolate letters written by immigrants in North America
nrow(filter(df, 
       doctype == "Letter" & #Only letters
            grepl("English", language) & #Originally in English or translated into English
            author_generation.y == "First" & #Only 1st generation migrants
            grepl("North America", wwritten))) #Writing only from North America

To Do: Do the above for all key author level variables (i.e., gender, religion, cultural heritage, occupation, marriage status, parental status, stayed in North America). 

In [40]:
# Compare gender
summary(df$authorgender.x)
summary(df$authorgender.y)

In [41]:
# Create new variable and populate with authorData (x)
df$authorgender <- df$authorgender.x
summary(df$authorgender)

In [42]:
# Check to make sure the rows with missing IDs are all NAs
subset(df, grepl(paste(missingIds,collapse="|"), docauthorid), select = c("docauthorid", "authorgender", "authorgender.x", "authorgender.y", "doctype"))

Unnamed: 0,docauthorid,authorgender,authorgender.x,authorgender.y,doctype
3382,per0031173,,,M,Letter
3383,per0031175,,,M,Letter
3386,per0041383,,,M,Letter
3388,per0031180,,,M,Letter
3391,per0004486,,,M,Letter
3393,per0041388,,,M,Letter
3396,per0041394,,,M,Letter
3398,per0031224,,,M,Letter
3400,per0031227,,,M,Letter
3402,per0031228,,,M,Letter


In [43]:
# Replace NA in missing data rows with value for docData (y)
df$authorgender[rows] <- df$authorgender.y[rows]
summary(df$authorgender)

In [44]:
summary(subset(df, is.na(authorgender), select = c("docauthorid", "authorgender", "authorgender.x", "authorgender.y", "doctype")))

     docauthorid  authorgender authorgender.x authorgender.y
 per0000339:  4   F   :  0     F   :  0       F: 15         
 per0004491:  3   M   :  0     M   :  0       M:130         
 per0004529:  3   NA's:145     NA's:145                     
 per0004552:  3                                             
 per0031346:  3                                             
 per0031824:  3                                             
 (Other)   :126                                             
             doctype   
 Letter          :143  
 Emigration guide:  2  
 Cartoon         :  0  
 Chapter         :  0  
 Diary           :  0  
 Editorial       :  0  
 (Other)         :  0  

This shows that there are still 145 NAs for authorgender variable. These may be cases that will not be included in the subset.

In [45]:
# Check number of cases for which gender is NOT NA and the other conditions are met ...
# ...to see if they match the case counts from the docauthorid / briefname review.
nrow(filter(df, !is.na(authorgender) &
       doctype == "Letter" & #Only letters
            grepl("English", language) & #Originally in English or translated into English
            author_generation == "First" & #Only 1st generation migrants
            grepl("North America", wwritten))) #Writing only from North America

nrow(filter(df, !is.na(authorgender.x) &
       doctype == "Letter" & #Only letters
            grepl("English", language) & #Originally in English or translated into English
            author_generation == "First" & #Only 1st generation migrants
            grepl("North America", wwritten))) #Writing only from North America

nrow(filter(df, !is.na(authorgender.y) &
       doctype == "Letter" & #Only letters
            grepl("English", language) & #Originally in English or translated into English
            author_generation == "First" & #Only 1st generation migrants
            grepl("North America", wwritten))) #Writing only from North America

They do match. What this shows is that after replacing missing authorData values for gender with docData values, there are no NA cases that meet the conditions for migrant generation, language, format and place written. All appears to be good with this variable. Moving on.

In [46]:
# Compare cultural heritage
#Subset data to isolate letters written by immigrants in North America
nrow(filter(df, is.na(cultural_heritage.x) &
       doctype == "Letter" & #Only letters
            grepl("English", language) & #Originally in English or translated into English
            author_generation == "First" & #Only 1st generation migrants
            grepl("North America", wwritten))) #Writing only from North America

nrow(filter(df, is.na(cultural_heritage.y) &
       doctype == "Letter" & #Only letters
            grepl("English", language) & #Originally in English or translated into English
            author_generation == "First" & #Only 1st generation migrants
            grepl("North America", wwritten))) #Writing only from North America

In [47]:
# Create new variable and populate with authorData (x)
df$cultural_heritage <- df$cultural_heritage.x

# For IDs in docData (y) but not authorData (x), replace x value with y value.
df$cultural_heritage[rows] <- df$cultural_heritage.y[rows]

# Check amount of cultural heritage missing data in the eventual subset.

nrow(filter(df, is.na(cultural_heritage) &
       doctype == "Letter" & #Only letters
            grepl("English", language) & #Originally in English or translated into English
            author_generation == "First" & #Only 1st generation migrants
            grepl("North America", wwritten))) #Writing only from North America

nrow(filter(df, is.na(cultural_heritage.x) &
       doctype == "Letter" & #Only letters
            grepl("English", language) & #Originally in English or translated into English
            author_generation == "First" & #Only 1st generation migrants
            grepl("North America", wwritten))) #Writing only from North America

nrow(filter(df, is.na(cultural_heritage.y) &
       doctype == "Letter" & #Only letters
            grepl("English", language) & #Originally in English or translated into English
            author_generation == "First" & #Only 1st generation migrants
            grepl("North America", wwritten))) #Writing only from North America

In [48]:
glimpse(filter(df, is.na(cultural_heritage) &
       doctype == "Letter" & #Only letters
            grepl("English", language) & #Originally in English or translated into English
            author_generation == "First" & #Only 1st generation migrants
            grepl("North America", wwritten))) #Writing only from North America

Observations: 1
Variables: 50
$ docsequence                 <int> 19
$ docid                       <fct> S9873-D019
$ docyear                     <int> 1830
$ doctype                     <fct> Letter
$ allsubject                  <fct> Crops; Farms; Immigration and emigration;…
$ broadsubj                   <fct> Agriculture; Communities; Domestic life; …
$ personalevent               <fct> NA
$ wwritten                    <fct> Canada; North America
$ docauthorid                 <fct> per0038979
$ docauthorname.x             <fct> "Prongley, Esau, fl. 1830"
$ docauthorname.y             <fct> "Prongley, Esau, fl. 1830"
$ language                    <fct> English
$ editor                      <fct> NA
$ briefname.x                 <fct> Esau Prongley
$ briefname.y                 <fct> Esau Prongley
$ authrace.x                  <fct> White
$ authrace.y                  <fct> White
$ nationality.x               <fct> NA
$ nationality.y               <fct> NA
$ religion.x               

47 out of 48 cases of missing data for cultural heritage are resolved. Unable to confirm cultural heritage of remaining NA by looking at original text or searching online. Will leave as-is. This case will be omitted from some statistical procedures.

In [49]:
# Compare religion
#Subset data to isolate letters written by immigrants in North America
nrow(filter(df, is.na(religion.x) &
       doctype == "Letter" & #Only letters
            grepl("English", language) & #Originally in English or translated into English
            author_generation == "First" & #Only 1st generation migrants
            grepl("North America", wwritten))) #Writing only from North America

nrow(filter(df, is.na(religion.y) &
       doctype == "Letter" & #Only letters
            grepl("English", language) & #Originally in English or translated into English
            author_generation == "First" & #Only 1st generation migrants
            grepl("North America", wwritten))) #Writing only from North America

In [50]:
# Create new variable and populate with authorData (x)
df$religion <- as.character(df$religion.x)
df$religion.y <- as.character(df$religion.y)

In [51]:
data.frame(df$religion[rows], df$religion.y[rows])

df.religion.rows.,df.religion.y.rows.
,
,
,
,
,Congregationalist; Christian
,
,
,
,
,


In [52]:
# For IDs in docData (y) but not authorData (x), replace x value with y value.
df$religion[rows] <- df$religion.y[rows]
data.frame(df$religion[rows], df$religion.y[rows])

df.religion.rows.,df.religion.y.rows.
,
,
,
,
Congregationalist; Christian,Congregationalist; Christian
,
,
,
,
,


In [53]:
df$religion  <- factor(df$religion)
glimpse(df$religion)

 Factor w/ 57 levels "Albanian Orthodox; Christian",..: 43 12 12 12 12 12 12 12 12 12 ...


In [54]:
# Check amount of cultural heritage missing data in the eventual subset.

nrow(filter(df, is.na(religion) &
       doctype == "Letter" & #Only letters
            grepl("English", language) & #Originally in English or translated into English
            author_generation == "First" & #Only 1st generation migrants
            grepl("North America", wwritten))) #Writing only from North America

nrow(filter(df, is.na(religion.x) &
       doctype == "Letter" & #Only letters
            grepl("English", language) & #Originally in English or translated into English
            author_generation == "First" & #Only 1st generation migrants
            grepl("North America", wwritten))) #Writing only from North America

nrow(filter(df, is.na(religion.y) &
       doctype == "Letter" & #Only letters
            grepl("English", language) & #Originally in English or translated into English
            author_generation == "First" & #Only 1st generation migrants
            grepl("North America", wwritten))) #Writing only from North America

In [55]:
# Get the docids for subset using religion.

docids.1 <- filter(df, is.na(religion) &
       doctype == "Letter" & #Only letters
            grepl("English", language) & #Originally in English or translated into English
            author_generation == "First" & #Only 1st generation migrants
            grepl("North America", wwritten)) %>%
select(docid)
docids.1  <- docids.1$docid
length(docids.1)

In [56]:
# Get the docids for subset using religion.x

docids.2 <- filter(df, is.na(religion.x) &
       doctype == "Letter" & #Only letters
            grepl("English", language) & #Originally in English or translated into English
            author_generation == "First" & #Only 1st generation migrants
            grepl("North America", wwritten)) %>%
select(docid)
docids.2  <- docids.2$docid
length(docids.2)

In [57]:
# Get IDs  in the larger set but not in the smaller set.
diffsReligion <- setdiff(docids.2, docids.1)
diffsReligion

In [58]:
filter(df, is.na(religion.x) &
            grepl(paste(diffsReligion,collapse="|"), docid) &
            doctype == "Letter" & #Only letters
            grepl("English", language) & #Originally in English or translated into English
            author_generation == "First" & #Only 1st generation migrants
            grepl("North America", wwritten)) %>% #Writing only from North America
summary()


  docsequence          docid      docyear                 doctype 
 Min.   : 12.0   S316-D012:1   Min.   :1821   Letter          :7  
 1st Qu.: 51.5   S316-D031:1   1st Qu.:1849   Cartoon         :0  
 Median : 90.0   S316-D072:1   Median :1854   Chapter         :0  
 Mean   :114.7   S316-D090:1   Mean   :1852   Diary           :0  
 3rd Qu.:198.5   S316-D198:1   3rd Qu.:1858   Editorial       :0  
 Max.   :201.0   S316-D199:1   Max.   :1873   Emigration guide:0  
                 (Other)  :1                  (Other)         :0  
                                                                                                                                                                                                                                                                                                                                                                                allsubject
 Canals; Churches; Economic conditions; Towns; Transportation; Religion; Economics; C

These documents are missing from the dataset. Ok to exclude them. To do: occupation, marriage status, parental status, stayed in North America.

In [59]:
letters  <- filter(df, 
            !grepl("S316", docid) & #Not the missing source
            doctype == "Letter" & #Only letters
            grepl("English", language) & #Originally in English or translated into English
            author_generation == "First" & #Only 1st generation migrants
            grepl("North America", wwritten)) %>% #Writing only from North America
    select(-c(author_generation.x,author_generation.y))
glimpse(letters)

Observations: 1,222
Variables: 49
$ docsequence                 <int> 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 3…
$ docid                       <fct> S10003-D023, S10003-D024, S10003-D025, S1…
$ docyear                     <int> 1836, 1836, 1837, 1837, 1838, 1838, 1838,…
$ doctype                     <fct> Letter, Letter, Letter, Letter, Letter, L…
$ allsubject                  <fct> "Childbirth; Church attendance; Cities; F…
$ broadsubj                   <fct> Health; Religion; Communities; Relationsh…
$ personalevent               <fct> NA, NA, NA, NA, NA, NA, NA, NA, Physical …
$ wwritten                    <fct> "Baltimore, MD; Maryland; United States; …
$ docauthorid                 <fct> per0022938, per0022938, per0022938, per00…
$ docauthorname.x             <fct> "Bruns, Jette, 1813-1899", "Bruns, Jette,…
$ docauthorname.y             <fct> "Bruns, Jette, 1813-1899", "Bruns, Jette,…
$ language                    <fct> English; German, English; German, English…
$ editor          

## Adjustments to the Subset

Recode values per CrosscheckData findings and decisions (search for term "Decision" in that script)

<b>Nationality</b>: The only correction needed is a chapter and therefore not required.

<b>North American Occupation</b>: Omit "'s wife" from the values.

In [61]:
# To what value does this apply
unique(letters$north_american_occupation.x[which(grepl("wife", letters$north_american_occupation.x))])

In [62]:
letters$north_american_occupation  <-  str_remove_all(letters$north_american_occupation.x, "[:space:]wife")

In [63]:
letters$north_american_occupation  <-  str_remove_all(letters$north_american_occupation, "\'s")

In [64]:
letters$north_american_occupation  <-  str_remove_all(letters$north_american_occupation, "[:space:]personnel")

In [65]:
unique(letters$north_american_occupation.x)
unique(letters$north_american_occupation)

In [67]:
# Convert strings back to factor
letters <- factorize(letters)
glimpse(letters)

Observations: 1,222
Variables: 50
$ docsequence                 <int> 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 3…
$ docid                       <fct> S10003-D023, S10003-D024, S10003-D025, S1…
$ docyear                     <int> 1836, 1836, 1837, 1837, 1838, 1838, 1838,…
$ doctype                     <fct> Letter, Letter, Letter, Letter, Letter, L…
$ allsubject                  <fct> "Childbirth; Church attendance; Cities; F…
$ broadsubj                   <fct> Health; Religion; Communities; Relationsh…
$ personalevent               <fct> NA, NA, NA, NA, NA, NA, NA, NA, Physical …
$ wwritten                    <fct> "Baltimore, MD; Maryland; United States; …
$ docauthorid                 <fct> per0022938, per0022938, per0022938, per00…
$ docauthorname.x             <fct> "Bruns, Jette, 1813-1899", "Bruns, Jette,…
$ docauthorname.y             <fct> "Bruns, Jette, 1813-1899", "Bruns, Jette,…
$ language                    <fct> English; German, English; German, English…
$ editor          

In [68]:
summary(letters)

  docsequence             docid         docyear       doctype    
 Min.   :  2.00   S10003-D023:   1   Min.   :1804   Letter:1222  
 1st Qu.: 25.00   S10003-D024:   1   1st Qu.:1858                
 Median : 54.00   S10003-D025:   1   Median :1868                
 Mean   : 72.47   S10003-D026:   1   Mean   :1881                
 3rd Qu.:110.00   S10003-D027:   1   3rd Qu.:1909                
 Max.   :239.00   S10003-D028:   1   Max.   :1979                
                  (Other)    :1216   NA's   :21                  
                                                         allsubject  
 Correspondence; Intellectual life                            :  10  
 Internment camps; Politics                                   :   8  
 Winter; Environment                                          :   6  
 Correspondence; Internment camps; Intellectual life; Politics:   5  
 Photographs; Intellectual life                               :   5  
 Business; Sons; Economics; Relationships           

In [69]:
write.csv(letters, "20201005_AM_FinalMeta.csv")

## Adjustments to the Subset

In [5]:
letters  <- read.csv("20201005_AM_FinalMeta.csv")
glimpse(letters)

Observations: 1,222
Variables: 51
$ X                           <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13…
$ docsequence                 <int> 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 3…
$ docid                       <fct> S10003-D023, S10003-D024, S10003-D025, S1…
$ docyear                     <int> 1836, 1836, 1837, 1837, 1838, 1838, 1838,…
$ doctype                     <fct> Letter, Letter, Letter, Letter, Letter, L…
$ allsubject                  <fct> "Childbirth; Church attendance; Cities; F…
$ broadsubj                   <fct> Health; Religion; Communities; Relationsh…
$ personalevent               <fct> NA, NA, NA, NA, NA, NA, NA, NA, Physical …
$ wwritten                    <fct> "Baltimore, MD; Maryland; United States; …
$ docauthorid                 <fct> per0022938, per0022938, per0022938, per00…
$ docauthorname.x             <fct> "Bruns, Jette, 1813-1899", "Bruns, Jette,…
$ docauthorname.y             <fct> "Bruns, Jette, 1813-1899", "Bruns, Jette,…
$ language        

In [6]:
letters[letters$docid == "S1019-D002", ]

Unnamed: 0,X,docsequence,docid,docyear,doctype,allsubject,broadsubj,personalevent,wwritten,docauthorid,⋯,stayed_north_america.x,stayed_north_america.y,agewriting,marriagestatus,maternalstatus,author_generation,authorgender,cultural_heritage,religion,north_american_occupation
137,137,2,S1019-D002,1872,Letter,Clergy; Missionaries; Students; Travel; Religion; Education; Entertainment and recreation,Religion; Education; Entertainment and recreation,,Ohio; United States; East North Central States; Midwest States; Mississippi Basin States; North America,per0001043,⋯,Stayed,Yes,22,Single,Childless,First,F,Italian; European,Catholic; Christian,Nun; Social worker; Teacher


In [7]:
letters[letters$docauthorid == "per0001043", ]

Unnamed: 0,X,docsequence,docid,docyear,doctype,allsubject,broadsubj,personalevent,wwritten,docauthorid,⋯,stayed_north_america.x,stayed_north_america.y,agewriting,marriagestatus,maternalstatus,author_generation,authorgender,cultural_heritage,religion,north_american_occupation
137,137,2,S1019-D002,1872,Letter,Clergy; Missionaries; Students; Travel; Religion; Education; Entertainment and recreation,Religion; Education; Entertainment and recreation,,Ohio; United States; East North Central States; Midwest States; Mississippi Basin States; North America,per0001043,⋯,Stayed,Yes,22,Single,Childless,First,F,Italian; European,Catholic; Christian,Nun; Social worker; Teacher
138,138,4,S1019-D004,1872,Letter,Missionaries; Nuns; Railroad trips; Religious festivals; Travel incidents; Travelers; Religion; Entertainment and recreation,Religion; Entertainment and recreation,,"Kansas City, MO; Missouri; United States; West North Central States; Midwest States; Mississippi Basin States; North America",per0001043,⋯,Stayed,Yes,22,Single,Childless,First,F,Italian; European,Catholic; Christian,Nun; Social worker; Teacher
139,139,5,S1019-D005,1872,Letter,Houses; Missionaries; Schools; Stagecoach travel; Travelers; Domestic life; Religion; Education; Transportation; Entertainment and recreation,Domestic life; Religion; Education; Transportation; Entertainment and recreation,,"Trinidad, CO; Colorado; United States; Southwestern States; Western States; Rocky Mountain States; North America",per0001043,⋯,Stayed,Yes,22,Single,Childless,First,F,Italian; European,Catholic; Christian,Nun; Social worker; Teacher
140,140,6,S1019-D006,1872,Letter,Churches; Clubs; Discipline; Educational philosophies; Missionaries; Schools; Students; Religion; Entertainment and recreation; Education,Religion; Entertainment and recreation; Education,,"Trinidad, CO; Colorado; United States; Southwestern States; Western States; Rocky Mountain States; North America",per0001043,⋯,Stayed,Yes,22,Single,Childless,First,F,Italian; European,Catholic; Christian,Nun; Social worker; Teacher
141,141,7,S1019-D007,1873,Letter,Accidents; Missionaries; Murder; Religious beliefs; Students; Travelers; Transportation; Religion; Law; Education; Entertainment and recreation,Transportation; Religion; Law; Education; Entertainment and recreation,,"Trinidad, CO; Colorado; United States; Southwestern States; Western States; Rocky Mountain States; North America",per0001043,⋯,Stayed,Yes,23,Single,Childless,First,F,Italian; European,Catholic; Christian,Nun; Social worker; Teacher
142,142,8,S1019-D008,1873,Letter,Indian raids; Landowners; Missionaries; Murder; Walking; Military; Economics; Religion; Law; Entertainment and recreation,Military; Economics; Religion; Law; Entertainment and recreation,,"Trinidad, CO; Colorado; United States; Southwestern States; Western States; Rocky Mountain States; North America",per0001043,⋯,Stayed,Yes,23,Single,Childless,First,F,Italian; European,Catholic; Christian,Nun; Social worker; Teacher
143,143,9,S1019-D009,1873,Letter,Animal cruelty; Burial services; Church services; Public schools; Renovations; Town life; Law; Religion; Education; Economics; Life Styles,Law; Religion; Education; Economics; Life Styles,,"Trinidad, CO; Colorado; United States; Southwestern States; Western States; Rocky Mountain States; North America",per0001043,⋯,Stayed,Yes,23,Single,Childless,First,F,Italian; European,Catholic; Christian,Nun; Social worker; Teacher
144,144,10,S1019-D010,1874,Letter,Coal mines; Crime; Economics; Law,Economics; Law,,"Trinidad, CO; Colorado; United States; Southwestern States; Western States; Rocky Mountain States; North America",per0001043,⋯,Stayed,Yes,24,Single,Childless,First,F,Italian; European,Catholic; Christian,Nun; Social worker; Teacher
145,145,11,S1019-D011,1874,Letter,Legal system; Law,Law,,"Trinidad, CO; Colorado; United States; Southwestern States; Western States; Rocky Mountain States; North America",per0001043,⋯,Stayed,Yes,24,Single,Childless,First,F,Italian; European,Catholic; Christian,Nun; Social worker; Teacher
146,146,12,S1019-D012,1876,Letter,Construction occupations; Invalids; Public schools; Renovations; Economics; Health; Education,Economics; Health; Education,,"Trinidad, CO; Colorado; United States; Southwestern States; Western States; Rocky Mountain States; North America",per0001043,⋯,Stayed,Yes,26,Single,Childless,First,F,Italian; European,Catholic; Christian,Nun; Social worker; Teacher


This record was throwing errors in getDataNAIL because it had been overwritten by another file. This has been corrected.