Nick Clifford

# Webscraper for Valley of the Shadow

In [1]:
import re
import os 
import requests 
import pandas as pd
from bs4 import BeautifulSoup
from tqdm.notebook import trange, tqdm

# Document IDs

Each page that hosts a document is listed in the `.../papers/` directory of the Valley of the Shadow website. In order to crawl the website, I must first gather the complete list of document IDs.

In [2]:
# Download the directory listing that links to every document page
url = "https://valley.lib.virginia.edu/papers/"
html = requests.get(url)
soup = BeautifulSoup(html.text, 'html')

# The listing's visible text holds the hyperlink names (document IDs), one per
# line; the first line and the last five lines are sliced off because they
# are not document IDs (presumably page header/footer text -- verify if the
# site layout changes).
listing_lines = soup.get_text().split('\n')
doc_ids = listing_lines[1:-5]

id_summary = "%d total document IDs:\n\n[ %s, %s, ... %s ]\n"
print(id_summary % (len(doc_ids), doc_ids[0], doc_ids[1], doc_ids[-1]))

2746 total document IDs:

[ A0001.html, A0002.html, ... FN090664.html ]



### Indexing the keys

- A#### = Augusta County letters 
- F#### = Franklin County letters
- B#### = Freedmen's Bureau records
- @D#### = Diary entries
- @N#### = Newspaper Editorial
- Br@@@, Em@@ = Other? 

I refrain from pulling data from sources that are not letters from Augusta or Franklin County. Here I update my list of feasible IDs so that it contains only those letters.

In [3]:
# ID prefixes of everything that is NOT an Augusta/Franklin County letter
non_letter_prefixes = ('B','Em','AD','AN','FD','FN')

# Boolean mask that is True for the IDs to keep, i.e. the letters
id_series = pd.Series(doc_ids)
letter_bool = ~id_series.str.startswith(non_letter_prefixes)
letter_ids = id_series[letter_bool].tolist()

letter_summary = "%d total letter IDs:\n\n[ %s, %s, ... %s ]\n"
print(letter_summary % (len(letter_ids), letter_ids[0], letter_ids[1], letter_ids[-1]))

1867 total letter IDs:

[ A0001.html, A0002.html, ... F8582.html ]



# Scraping Letters 

Within each document, I scrape the document text from each `<p>` tag, as well as the author, date, and county, which are located in the `<h4>` tag. The result is a single string that combines a metadata header with the document text.

In [4]:
def remove_html_tags(text):
    """Strip HTML tags from a string, turning line-break tags into newlines.

    Every tag other than a line break is deleted, runs of two or more
    whitespace characters are collapsed to a single space, and finally each
    line-break tag (``<br>``, ``<br/>`` or ``<br />``, in any letter case)
    is replaced with a newline character.

    Args:
        text: HTML markup as a string.

    Returns:
        The text with all tags removed and line breaks rendered as newlines.
    """
    # Delete every tag that is not a line break; the negative lookahead
    # leaves the <br> variants in place so they can become newlines below.
    clean = re.sub(r'<(?!br\s*/?>).*?>', '', text, flags=re.IGNORECASE)
    # Collapse whitespace runs *before* inserting newlines, otherwise the
    # newlines produced from <br> tags could be collapsed away as well.
    clean = re.sub(r'\s{2,}', ' ', clean)
    return re.sub(r'<br\s*/?>', '\n', clean, flags=re.IGNORECASE)

In [5]:
def clean_date(d_list):
    """Normalise the pieces of a date into a ``month day year`` string.

    Each token is stripped of non-word characters. A token that is a full
    English month name (matched case-insensitively) is kept in lower case;
    any other token is reduced to its digits. Tokens that end up empty
    (e.g. ``[unclear]``) are dropped so they do not leave stray spaces in
    the result.

    Args:
        d_list: List of strings containing a month name and/or a numerical
            day and year, in the order they should appear in the output.

    Returns:
        The cleaned tokens joined by single spaces, e.g. ``'january 5 1861'``.
        An empty list yields an empty string.
    """
    month_names = {
        'january', 'february', 'march', 'april', 'may', 'june',
        'july', 'august', 'september', 'october', 'november', 'december',
    }

    cleaned = []
    for token in d_list:
        # remove non word/digit characters (punctuation, brackets, ...)
        word = re.sub(r"\W", "", token).lower()
        if word in month_names:
            cleaned.append(word)
            continue
        # not a month: keep only the digits of a day or year
        digits = re.sub(r"\D", "", word)
        if digits:
            cleaned.append(digits)

    d_str = ' '.join(cleaned)
    # One source document carries an impossible date (April has 30 days);
    # correct it so that later datetime conversion does not fail.
    if d_str == 'april 31 1868':
        d_str = 'april 30 1868'

    return d_str

In [6]:
def get_doc(url):
    """Extract the document author(s)/date/text from a webpage.

    The page's first ``<h4>`` heading is expected to look like
    ``"<county>: <authors> to <recipient>, <month day>, <year>"`` or
    ``"<county>: Letter from <authors>, <month day>, <year>"``. The body
    text is taken from the elements with class ``p10``.

    Args:
        url: Address of the document page to scrape.

    Returns:
        A single string: authors, county and cleaned date on separate lines,
        a ``***START OF TEXT***`` marker, then the document text.

    Raises:
        AttributeError: The page has no ``<h4>`` heading.
        IndexError: The heading has no ``': '`` separating county from the
            rest of the title.
        UnboundLocalError: The heading is in neither of the two supported
            formats (e.g. the page is not a letter).
    """
    html = requests.get(url)
    # NOTE(review): 'html' lets BeautifulSoup choose whichever HTML parser is
    # installed; pin a specific parser if reproducible parsing matters.
    soup = BeautifulSoup(html.text, 'html')

    # Extract metadata
    title = re.sub(r'\s{2,}', ' ', soup.h4.get_text()).strip()
    title_parts = title.split(': ')
    county, other = title_parts[0], title_parts[1]
    if ' to ' in other:
        sender, remainder = other.split(' to ')[:2]
        authors = sender
        # the last two comma-separated fields are "<month day>" and "<year>"
        date = ' '.join(remainder.split(', ')[-2:]).lower()
    elif 'Letter from ' in other:
        prefix = 'Letter from '
        authors = other.split(', ')[0]
        # Remove the literal prefix only. str.lstrip(prefix) would treat the
        # prefix as a *set of characters* and also eat the leading letters
        # of names such as "Lemuel".
        if authors.startswith(prefix):
            authors = authors[len(prefix):]
        date = ' '.join(other.split(', ')[-2:]).lower()
    else:
        # Other document types (e.g. wills, "Will of ...") are deliberately
        # not parsed. Callers skip such pages by catching UnboundLocalError,
        # so that exception type is kept for backward compatibility.
        raise UnboundLocalError('unsupported title format: %r' % title)
    # Remove chars from unclear dates
    date = clean_date(date.split())

    # Extract text:
    # remove first 2 tags, which are summary sections of the webpage
    text_html = soup.find_all(class_='p10')[2:]
    # clean out html and fix whitespace in each paragraph tag
    paragraphs = [remove_html_tags(str(para_html)) for para_html in text_html]
    # place blank lines between the text of different paragraph tags
    text = '\n\n'.join(paragraphs)
    # drop a single leading space/tab left at the start of a line by the html
    text = re.sub(r'(?<=\n)[^\S\r\n]', '', text)
    # clean out html encoded symbols (&)
    text = text.replace('&amp;', '&')

    # Combine metadata with text
    full_doc = '\n'.join([authors, county, str(date), '\n***START OF TEXT***\n[page 1]\n', text])
    return full_doc

### Example

In [7]:
# Scrape a single Augusta County letter to sanity-check the scraper's output
example_url = "https://valley.lib.virginia.edu/papers/A0010.html"
example_doc = get_doc(example_url)
print(example_doc)

Eliza H. Stanton
Augusta County
january 1858

***START OF TEXT***
[page 1]



Staunton Va

Good morning Miss Mag

I have got some news for you this morning I have not heard from you some time, therefore I cannot tell what are your plans for the future, When you rote me last, you wish me to try to get you a place here, I will not say that I have got you a place But will Say come and try; you have a chance on condishions; that is But I will not wright to you the condistions; But will plainly tell you what they are when you come Mag if you want a home why you had better come soon as you get this dont tell any one but come right away when you come to the hospittle dont call for any one but me for i want to see you firt on perticular bisness

yours

E Stanton
Va

Dont you delay or you may be to late



## Download Data

In [8]:
def _strip_html_suffix(doc):
    """Return *doc* without a trailing '.html' (unchanged if it has none)."""
    suffix = '.html'
    return doc[:-len(suffix)] if doc.endswith(suffix) else doc


def _download_variant(ids, out_dir, page_url, desc):
    """Scrape every ID in *ids* and write each document to a .txt file in *out_dir*.

    *page_url* maps a document ID to the URL to scrape and *desc* labels the
    progress bar. IDs whose page cannot be parsed are printed and end up with
    no file in *out_dir*.
    """
    os.makedirs(out_dir, exist_ok=True)
    for doc in tqdm(ids, desc=desc, unit='file'):
        path = os.path.join(out_dir, doc.replace('.html', '.txt'))
        try:
            text = get_doc(page_url(doc))
        # Error when Valley of the Shadow page is empty, not a letter, or
        # doesnt state county of origin
        except (AttributeError, IndexError, UnboundLocalError):
            # Bad IDs must not leave a file behind, including a stale one
            # written by an earlier run.
            if os.path.exists(path):
                os.remove(path)
            print(doc)
        else:
            # Write only after a successful scrape, with an explicit encoding
            # so the result does not depend on the platform default.
            with open(path, 'w', encoding='utf-8') as file:
                file.write(text)


def download_valleydocs(ids, original=True, modern=False,
                        output='/Users/nickclifford/Documents/UVA/Spring 2020/DS 5001 Exploratory Text Analysis/final/data/'):
    """Scrape, clean, and download documents to .txt files.

    Args:
        ids: Iterable of document IDs such as ``'A0001.html'``.
        original: If true, download the original version of each document
            from ``papers/`` into the ``orig`` subdirectory of *output*.
        modern: If true, download the modern-spelling version of each
            document from ``mod/`` into the ``mod`` subdirectory of *output*.
        output: Destination directory; the subdirectories are created if
            they do not exist.

    Returns:
        None. The IDs that could not be scraped are printed under a
        "Bad Document IDs" heading for each downloaded version.
    """
    valley_url = 'https://valley.lib.virginia.edu/'  # Valley of the Shadow url

    if original:
        print("Bad Document IDs (orig):")
        _download_variant(
            ids,
            os.path.join(output, 'orig'),
            lambda doc: valley_url + 'papers/' + doc,
            'Original IDs',
        )

    # Downloads the modern day spelling if specified
    if modern:
        print("\nBad Document IDs (mod):")
        # The modern-spelling URL takes the ID without its extension. Remove
        # the literal suffix: str.strip('.html') strips a *set of characters*
        # from both ends and could also eat characters of the ID itself.
        _download_variant(
            ids,
            os.path.join(output, 'mod'),
            lambda doc: valley_url + 'mod/' + _strip_html_suffix(doc),
            'Modern IDs',
        )
            

In [9]:
# Scrape every letter in both its original and its modern-spelling version
download_valleydocs(ids=letter_ids, original=True, modern=True)

Bad Document IDs (orig):


HBox(children=(FloatProgress(value=0.0, description='Original IDs', max=1867.0, style=ProgressStyle(descriptio…

A0322.html
A2000.html
A3061.html
F0060.html
F0063.html
F3505.html
F3507.html
F3509.html
F3510.html
F3511.html
F3516.html
F3518.html
F3520.html
F3521.html
F3524.html
F3525.html
F3526.html
F3527.html
F3529.html
F3531.html
F3533.html
F3534.html
F3535.html
F3536.html
F3539.html
F3540.html
F6070.html


Bad Document IDs (mod):


HBox(children=(FloatProgress(value=0.0, description='Modern IDs', max=1867.0, style=ProgressStyle(description_…

A3061.html
F0060.html
F0063.html
F3505.html
F3507.html
F3509.html
F3510.html
F3511.html
F3516.html
F3518.html
F3520.html
F3521.html
F3524.html
F3525.html
F3526.html
F3527.html
F3529.html
F3531.html
F3533.html
F3534.html
F3535.html
F3536.html
F3539.html
F3540.html
F6070.html



# Reference

- Valley of the Shadow: Two Communities in the American Civil War, University of Virginia Library (https://valley.lib.virginia.edu/).