In [1]:
#Import Library
library(tidyverse)

── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 3.1.0     ✔ purrr   0.2.5
✔ tibble  2.0.1     ✔ dplyr   0.7.6
✔ tidyr   0.8.1     ✔ stringr 1.3.1
✔ readr   1.1.1     ✔ forcats 0.3.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()


In [2]:
# Convert every factor column of a data frame to character.
#
# Args:
#   df: A data frame (or tibble).
# Returns:
#   `df` with each factor column (ordered factors included) replaced by its
#   character labels; all other columns are returned unchanged.
unfactorize <- function(df) {
  # is.factor() gives exactly one logical per column. Comparing
  # sapply(df, class) with "factor" does not: class() has length > 1 for
  # ordered factors, date-times, etc., which turns the sapply() result into a
  # list or matrix, so columns get skipped or mis-indexed.
  factor_cols <- which(vapply(df, is.factor, logical(1)))
  for (i in factor_cols) {
    df[[i]] <- as.character(df[[i]])
  }
  df
}
# Code from user "By0" at https://stackoverflow.com/questions/2851015/convert-data-frame-columns-from-factors-to-characters (line 14)

In [3]:
# Read the most recent metadata file, turn its factor columns into character
# columns, and inspect the resulting column types.
# Note: the variable name `letters` masks base R's built-in `letters` constant.
letters <- read.csv("20201118_AM_Meta2Merge.csv") %>%
  unfactorize()
glimpse(letters)

Observations: 1,212
Variables: 32
$ X                       <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14…
$ docsequence             <int> 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 3…
$ docid                   <chr> "S10003-D023", "S10003-D024", "S10003-D025", …
$ docyear                 <int> 1836, 1836, 1837, 1837, 1838, 1838, 1838, 183…
$ doctype                 <chr> "Letter", "Letter", "Letter", "Letter", "Lett…
$ allsubject              <chr> "Childbirth; Church attendance; Cities; Farms…
$ broadsubj               <chr> "Health; Religion; Communities; Relationships…
$ personalevent           <chr> NA, NA, NA, NA, NA, NA, NA, NA, "Physical ill…
$ wwritten                <chr> "Baltimore, MD; Maryland; United States; Mid-…
$ docauthorid             <chr> "per0022938", "per0022938", "per0022938", "pe…
$ docauthorname           <chr> "Bruns, Jette, 1813-1899", "Bruns, Jette, 181…
$ language                <chr> "English; German", "English; German", "Englis…
$ editor          

In [5]:
# Number of letters dated before 1914 (rows with a missing docyear are not counted)
sum(letters$docyear < 1914, na.rm = TRUE)

# Number of letters whose docyear is missing
length(which(is.na(letters$docyear)))

In [5]:
# Keep the letters written before 1914 together with those whose docyear is NA,
# then report how many rows remain.
lettersNew <- letters[is.na(letters$docyear) | letters$docyear < 1914, , drop = FALSE]
nrow(lettersNew)

In [6]:
# Tabulate the authors of the undated letters. Where known, an author's life
# or active dates are part of the docauthorname value.
with(lettersNew, table(docauthorname[is.na(docyear)]))


             Beltman, H., fl. 1878        Birkbeck, Morris, 1764-1825 
                                 1                                  1 
Bishop, Robert Hamilton, 1777-1855                     Brinks, Roelof 
                                 1                                  1 
        De Jong, William, fl. 1888          Delfino, Diego, 1874-1926 
                                 2                                  2 
            Dunnink, Mr., fl. 1850                      Johnson, Nils 
                                 1                                  1 
                    Karsten, J. H.                        Koopman, H. 
                                 1                                  1 
                      Kroes, Henry            Lankester, P., fl. 1849 
                                 1                                  2 
                Nywening, U. V. L.                  Posthumus, Wietze 
                                 1                                  1 
     

<b>Decision</b>: Drop Diego and authors without any year associated. Note: <i>fl.</i> means "flourished" and is understood to mark a person's active years. People whose active year is 1850 or earlier will be included; those with later dates will be excluded. Get those docauthorids.

In [7]:
# Collect the author names under consideration for removal in one character
# vector (the variable keeps the name `names`, which later cells read).
names <- c(
  "Beltman, H., fl. 1878",
  "De Jong, William, fl. 1888",
  "Karsten, J. H.",
  "Kroes, Henry",
  "Nywening, U. V. L.",
  "Sneller, Matje",
  "Veldhuis, Zwiertje",
  "Brinks, Roelof",
  "Delfino, Diego, 1874-1926",
  "Johnson, Nils",
  "Koopman, H.",
  "Posthumus, Wietze",
  "Te Selle, Harm, fl. 1865"
)

In [8]:
# Show author, docid and docyear for every letter by a listed author, to see
# whether any of them also have letters with a known docyear.
lettersNew[is.element(lettersNew$docauthorname, names),
           c("docauthorname", "docid", "docyear")]

Unnamed: 0,docauthorname,docid,docyear
413,"Delfino, Diego, 1874-1926",S12296-D002,1912.0
418,"Delfino, Diego, 1874-1926",S12296-D008,
419,"Delfino, Diego, 1874-1926",S12296-D009,
933,"Sneller, Matje",S9831-D008,
935,"Brinks, Roelof",S9831-D010,
936,"Veldhuis, Zwiertje",S9831-D014,
946,"Kroes, Henry",S9831-D026,
950,"Nywening, U. V. L.",S9831-D036,
951,"De Jong, William, fl. 1888",S9831-D037,
952,"Posthumus, Wietze",S9831-D038,


Include the people whose other letters predate 1914.

In [9]:
# Revised exclusion list: the authors whose letters will be dropped.
names <- c(
  "Beltman, H., fl. 1878",
  "De Jong, William, fl. 1888",
  "Karsten, J. H.",
  "Kroes, Henry",
  "Nywening, U. V. L.",
  "Sneller, Matje",
  "Veldhuis, Zwiertje",
  "Brinks, Roelof",
  "Johnson, Nils",
  "Koopman, H.",
  "Posthumus, Wietze"
)

# Drop every letter written by one of the authors listed above.
lettersFinal <- subset(lettersNew, !is.element(docauthorname, names))

# Tabulate the authors of the undated letters that were kept.
with(lettersFinal, table(docauthorname[is.na(docyear)]))


       Birkbeck, Morris, 1764-1825 Bishop, Robert Hamilton, 1777-1855 
                                 1                                  1 
         Delfino, Diego, 1874-1926             Dunnink, Mr., fl. 1850 
                                 2                                  1 
           Lankester, P., fl. 1849           Te Selle, Harm, fl. 1865 
                                 2                                  1 

In [40]:
# Inspect the dimensions and column types of the filtered data
lettersFinal %>% glimpse()

Observations: 925
Variables: 32
$ X                       <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14…
$ docsequence             <int> 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 3…
$ docid                   <chr> "S10003-D023", "S10003-D024", "S10003-D025", …
$ docyear                 <int> 1836, 1836, 1837, 1837, 1838, 1838, 1838, 183…
$ doctype                 <chr> "Letter", "Letter", "Letter", "Letter", "Lett…
$ allsubject              <chr> "Childbirth; Church attendance; Cities; Farms…
$ broadsubj               <chr> "Health; Religion; Communities; Relationships…
$ personalevent           <chr> NA, NA, NA, NA, NA, NA, NA, NA, "Physical ill…
$ wwritten                <chr> "Baltimore, MD; Maryland; United States; Mid-…
$ docauthorid             <chr> "per0022938", "per0022938", "per0022938", "pe…
$ docauthorname           <chr> "Bruns, Jette, 1813-1899", "Bruns, Jette, 181…
$ language                <chr> "English; German", "English; German", "Englis…
$ editor            

In [26]:
# Work out which docids are in the original data but not in the final subset,
# then report how many there are and list them.
originalIDs <- letters$docid
finalIDs <- lettersFinal$docid
removedIDs <- originalIDs[is.na(match(originalIDs, finalIDs))]
length(removedIDs)
removedIDs

This adds up nicely. Now I need to remove these files from the folder.

In [27]:
# Build the file name for each removed docid by appending ".txt"
fileids <- paste0(removedIDs, ".txt")
fileids

In [29]:
# Confirm the current working directory (printed for inspection)
getwd()

In [31]:
# Set the working directory to the "letters" folder (relative to the current
# working directory). The later list.files() calls, the deletion of fileids,
# and the "../" path passed to write.csv() all rely on this change.
setwd("letters")

In [36]:
# List the files in the current working directory
list.files(path = ".")

In [37]:
# Count the files in the current working directory
length(dir())

In [38]:
# Delete the files named in fileids from the current working directory.
# unlink() reports success even for paths that do not exist, so a wrong working
# directory or a misspelled name would pass unnoticed. Check which files are
# present first, warn about any that are absent, and use file.remove(), which
# returns one TRUE/FALSE per file.
local({
  found <- file.exists(fileids)
  if (!all(found)) {
    warning(sum(!found), " of ", length(fileids),
            " files were not found and could not be deleted", call. = FALSE)
  }
  # Name each result after its file so the printed outcome is readable.
  setNames(file.remove(fileids[found]), fileids[found])
})

In [39]:
# Verify the deletion by counting the files left in the working directory
length(dir())

In [42]:
# Convert every character column of a data frame to a factor.
#
# Args:
#   df: A data frame (or tibble).
# Returns:
#   `df` with each character column replaced by as.factor() of that column;
#   all other columns are returned unchanged.
factorize <- function(df) {
  # is.character() gives exactly one logical per column, whereas comparing
  # sapply(df, class) with "character" depends on sapply() simplifying to a
  # plain vector, which it does not do when a column has several classes.
  character_cols <- which(vapply(df, is.character, logical(1)))
  for (i in character_cols) {
    df[[i]] <- as.factor(df[[i]])
  }
  df
}

In [43]:
# Turn the character columns of the filtered data back into factors and
# inspect the result.
lettersFinal <- lettersFinal %>%
  factorize()
glimpse(lettersFinal)

Observations: 925
Variables: 32
$ X                       <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14…
$ docsequence             <int> 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 3…
$ docid                   <fct> S10003-D023, S10003-D024, S10003-D025, S10003…
$ docyear                 <int> 1836, 1836, 1837, 1837, 1838, 1838, 1838, 183…
$ doctype                 <fct> Letter, Letter, Letter, Letter, Letter, Lette…
$ allsubject              <fct> "Childbirth; Church attendance; Cities; Farms…
$ broadsubj               <fct> Health; Religion; Communities; Relationships;…
$ personalevent           <fct> NA, NA, NA, NA, NA, NA, NA, NA, Physical illn…
$ wwritten                <fct> "Baltimore, MD; Maryland; United States; Mid-…
$ docauthorid             <fct> per0022938, per0022938, per0022938, per002293…
$ docauthorname           <fct> "Bruns, Jette, 1813-1899", "Bruns, Jette, 181…
$ language                <fct> English; German, English; German, English; Ge…
$ editor            

In [45]:
# Write the filtered metadata to a new .csv one level above the current
# working directory. row.names = FALSE keeps the data frame's row names from
# being written as an extra unnamed leading column; the data already carries
# an X column, and a second index column would only duplicate it on re-import.
write.csv(lettersFinal, "../20201125_AM_Meta2Merge.csv", row.names = FALSE)