This is a scraper built to run on an EC2 instance in support of TESU's open-source materials accessibility initiative. It is the second of three notebooks used to scrape, check, create static sites, and update. As the name indicates, this notebook contains the checking and updating functionality.

In [None]:
# Import necessary packages. If any errors occur because of missing libraries, be sure to install them on
# your EC2 instance.

import pandas as pd
import numpy as np
import boto
import boto3
import re
import os
import requests
from xml.etree.cElementTree import XML
import zipfile
from uuid import uuid4 as uuid
from time import sleep
import html

In [None]:
# Load the summary dataframe that the 01_scraper notebook wrote to disk.
data = pd.read_csv('run1.csv')

### Check 1: Does what we have in S3 match what we have in our .csv summary file?
We retrieve a list of the files stored in our S3 bucket under the `html_content/` and `pdfs/` folders (key prefixes).

In [None]:
# List every object in the S3 bucket and sort the keys into scraped HTML pages
# and scraped PDFs according to their "folder" (key prefix).
from boto3.session import Session

# NOTE(review): these are placeholders. Avoid committing real keys to the
# notebook -- an EC2 instance role or environment variables would be safer;
# confirm what the deployment actually uses.
ACCESS_KEY='your_access_key'
SECRET_KEY='your_secret_key'

bucket_name = 'your_s3_bucket_name' # replace with your bucket name

session = Session(aws_access_key_id=ACCESS_KEY,
                  aws_secret_access_key=SECRET_KEY)
s3 = session.resource('s3')
your_bucket = s3.Bucket(bucket_name)

html_prefix = "html_content/"
pdf_prefix = "pdfs/"

html_list = []
pdf_list = []

# A single pass over the listing fills both lists, so the bucket is only
# enumerated once.
for s3_file in your_bucket.objects.all():
    key = str(s3_file.key)
    # A key that is exactly the prefix is the folder placeholder, not a scraped
    # file, so require at least one character after the prefix. (The previous
    # HTML check compared against 5, the length of "pdfs/", and therefore let
    # the bare "html_content/" key through.)
    if key.startswith(html_prefix) and len(key) > len(html_prefix):
        html_list.append(key)
    elif key.startswith(pdf_prefix) and len(key) > len(pdf_prefix):
        pdf_list.append(key)

In [None]:
# Drop the folder prefix from every key so only the file name remains, then
# merge both kinds of file into one alphabetically sorted list.
html_list = [key[len("html_content/"):] for key in html_list]
pdf_list = [key[len("pdfs/"):] for key in pdf_list]
scraped_list = sorted(html_list + pdf_list)

In [None]:
# All rows of the EC2 CSV except the ones flagged as YouTube videos.
is_youtube = data['comment'] == 'youtube video'
no_youtube = data[~is_youtube]

In [None]:
# Are they equal lengths? Compare the non-YouTube rows of the CSV with the
# files found in S3. (The previous name `all_files` was never defined in this
# notebook; `no_youtube` is the dataframe built in the preceding cell.)
len(no_youtube) == len(scraped_list)

### Check 2: We now know that anything not captured by our scraper is not included in our CSV. Next, check each file for missing links
Find all the documents where there are unequal link counts. This kind of error can happen if the page loads too slowly, or if the page blocks automated requests. We're ultimately going to find any discrepancies, see which ones can be solved by requesting the pages again, and write workarounds where we can.

In [None]:
# `data` already holds what was scraped. Filter out anything that is a YouTube
# video, taking an explicit copy so that the column added below is written to
# an independent dataframe rather than to a slice of `data` (this is what
# triggers pandas' SettingWithCopyWarning).
data2 = data[data['comment'] != "youtube video"].copy()
# link_name is the first 8 characters of idx -- presumably the per-document
# prefix such as "link_001"; confirm against 01_scraper.
data2['link_name'] = data2['idx'].str[0:8]

In [None]:
# Count the links found in each source document and flag every document whose
# link count differs from the number of rows recorded for it in the CSV.
file_list = os.listdir('../ec2docs')
all_links = []
unequal_records = []  # idx prefixes of documents whose link counts disagree

# enumerate numbers the sorted files starting at 0, giving us both a running
# count (file_count) and the file name (file) on every iteration.
for file_count, file in enumerate(sorted(file_list)):
    # Build the document's idx prefix once per file, e.g. "link_001". It used
    # to be assigned inside a loop over the links, so a document without any
    # links was compared against the previous document's idx (or raised a
    # NameError when it was the first file).
    idx = "link_%03d" % (file_count+1)

    # Open the document from the 'ec2docs' folder; the context manager closes
    # the archive even if reading fails.
    pathway = '../ec2docs/'+file
    with zipfile.ZipFile(pathway) as document:
        xml_content = document.read('word/document.xml')
    # Decode the bytes instead of calling str() on them, which would produce
    # the "b'...'" repr with escaped characters.
    # NOTE(review): assumes document.xml is UTF-8 encoded -- confirm.
    xml_str = xml_content.decode('utf-8', errors='replace')

    # Create the link list for this document: every piece of text that starts
    # with '>http' and ends with '<', inclusive.
    link_list = re.findall(r'>http.*?<', xml_str)
    # Shave off the leading '>' and trailing '<', then replace &amp; with &
    # (and any other HTML entities).
    link_list = [html.unescape(x[1:-1]) for x in link_list]

    all_links.extend(link_list)

    # Compare against every CSV row whose idx contains this document's prefix.
    if len(link_list) != len(data[data['idx'].str.contains(idx)]):
        unequal_records.append(idx)

In [None]:
# See any unequal records? Each entry is the idx prefix of a document whose
# link count did not match the CSV.
unequal_records

In [None]:
# Work out which links never made it into S3. The resulting dataframe can also
# be used for updating.
all_links = data2['idx'].tolist()
# The first 12 characters of each scraped file name are compared with idx
# (presumably the idx without its file extension -- verify against 01_scraper).
link_list = [name[:12] for name in scraped_list]
links_that_did_not_pull = sorted(set(all_links).difference(link_list))
missing_mask = data2['idx'].isin(links_that_did_not_pull)
to_scrape = data2[missing_mask]

In [None]:
# Re-scrape only the links where we have a discrepancy, upload whatever comes
# back to S3, and write a summary of this run to "<name_of_run>.csv".
# Errors are printed as they happen. In our case, they were mostly connection errors.
from boto3.session import Session
from requests.exceptions import ConnectionError

# NOTE(review): these are placeholders. Avoid committing real keys to the
# notebook -- confirm how credentials are supplied in the deployment.
ACCESS_KEY='your_access_key'
SECRET_KEY='your_secret_key'

bucket_name = 's3_bucket_name' # replace with your bucket name

session = Session(aws_access_key_id=ACCESS_KEY,
                  aws_secret_access_key=SECRET_KEY)
s3 = session.resource('s3')
your_bucket = s3.Bucket(bucket_name)  # was `bucket name`, a syntax error

name_of_run = "try2"
all_records = []  # one dict per link with keys 'idx', 'docname', 'url', 'comment'

tries = 3             # number of times to retry after a connection error
retry_delay = 10      # seconds to wait before each retry
link_delay = 4        # seconds to wait between links
request_timeout = 60  # seconds before a request is abandoned, so a stalled
                      # server cannot hang the whole run


def _fetch_and_store(link, idx, comment):
    """Download ``link``, upload the result to S3 and return the record comment.

    Links containing "youtube" are requested but nothing is stored. Links
    containing ".pdf" are stored as raw bytes under ``pdfs/<idx>.pdf``. Anything
    else is stored as text under ``html_content/<idx>.html`` and keeps the
    ``comment`` that was passed in. Exceptions raised by the request or the
    upload propagate to the caller.
    """
    r = requests.get(link, headers={"User-agent": str(uuid())},
                     timeout=request_timeout)
    if "youtube" in link:
        return "youtube video"
    if ".pdf" in link:
        your_bucket.put_object(Key="pdfs/" + idx + ".pdf", Body=r.content)
        return "pdf file"
    your_bucket.put_object(Key="html_content/" + idx + ".html", Body=r.text)
    return comment


for x in to_scrape.values.tolist():
    # Values are read by position.
    # NOTE(review): assumes the columns are ordered
    # [saved index, idx, docname, url, comment, ...] -- confirm against run1.csv.
    idx = x[1]
    file = x[2]
    link = x[3]
    original_comment = x[4]
    comment = original_comment

    try:
        comment = _fetch_and_store(link, idx, original_comment)
    except ConnectionError as e1:
        print(idx, e1)
        print('retrying', idx)
        for t in range(tries):
            sleep(retry_delay)
            try:
                # Start again from the original comment so that a successful
                # retry is not left carrying the error text of a failed one.
                comment = _fetch_and_store(link, idx, original_comment)
            except ConnectionError:
                comment = "connection error"
            except Exception:
                comment = "scraping error"
            else:
                break
    except Exception as e2:
        comment = "scraping error"
        print(idx, e2)
    finally:
        # Record the outcome for this link whether or not it succeeded, then
        # pause before moving on to the next one.
        all_records.append({'idx': idx, 'docname': file, 'url':link, 'comment':comment})
        sleep(link_delay)

# Make a pandas dataframe from the records and store it as a CSV.
df_long2 = pd.DataFrame(all_records, columns=['idx', 'docname', 'url', 'comment'])
csv_name = name_of_run+'.csv'
df_long2[['idx', 'docname','url','comment']].to_csv(csv_name)