This is a scraper built to run on an EC2 instance to assist in TESU's open source materials accessibility initiative. It is the first of 3 notebooks used to scrape, check, create static sites, and update.
As the name indicates, this notebook contains the scraping functionality:
1. Retrieve .docx files stored in an S3 bucket
2. Extract XML from those .docx files
3. Extract all URLs contained in the document XML files
    - Assigns a number to each file, and each link in a file.
    - i.e.: link_001_005 is the fifth link in the first document
4. Using requests library, request each of those URLs and store html.
5. Stores pdfs in our S3 bucket folder "pdfs"
6. Stores html content in our S3 bucket "html_content"
7. Creates a Data Frame with the following columns:
    - idx: (i.e.: "link_001_005")
    - docname: Name of the document
    - url: the url
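    - comment: contains "youtube video" if the link is a YouTube video, "pdf file" if it is a pdf, "scraping error" if the request or upload failed, and is empty otherwise.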

In [None]:
# Import necessary packages. If any errors occur because of missing libraries, be sure to install them on
# your EC2 instance.
import os
import requests
from xml.etree.cElementTree import XML
import zipfile
import re
import pandas as pd

#to store items on an AWS instance
import boto
import botocore
import boto3
from boto.s3.connection import S3Connection
from uuid import uuid4 as uuid
from time import sleep
import html
import random

In [None]:
### Build the list of S3 keys (Word documents) to download to the EC2 instance
from boto3.session import Session

ACCESS_KEY = 'your_access_key'
SECRET_KEY = 'your_secret_key'

session = Session(aws_access_key_id=ACCESS_KEY,
                  aws_secret_access_key=SECRET_KEY)
s3 = session.resource('s3')
your_bucket = s3.Bucket('name_of_s3_bucket')

# Keep only keys that start with "docs" and are longer than 5 characters,
# so the bare "docs/" folder entry itself is left out of the list.
file_list = [
    str(obj.key)
    for obj in your_bucket.objects.all()
    if str(obj.key).startswith("docs") and len(str(obj.key)) > 5
]

In [None]:
# Save our S3 Word docs to the EC2 instance.

bucket_name = 'name_of_s3_bucket' # replace with your bucket name

# Absolute destination directory: the same one the scraping cell reads from, so the
# download no longer depends on the notebook's current working directory.
ec2_docs_dir = '/home/ec2-user/ec2docs'
os.makedirs(ec2_docs_dir, exist_ok=True)  # create it on a fresh instance

session = Session(aws_access_key_id=ACCESS_KEY,
                  aws_secret_access_key=SECRET_KEY)
s3 = session.resource('s3')

# Loop through all keys in file_list and download each one to the EC2 instance.
for word_doc in file_list:
    # word_doc is the S3 key as a string; [5:] drops its first five characters
    # (the "docs/" prefix) to get the local file name.
    ec2_file_name = ec2_docs_dir + '/' + word_doc[5:]

    try:
        s3.Bucket(bucket_name).download_file(word_doc, ec2_file_name)
    except botocore.exceptions.ClientError as e:
        # A 404 means the object does not exist (any more): skip it.
        # Every other S3 error is unexpected and is re-raised.
        if e.response['Error']['Code'] != "404":
            raise

In [None]:
# Scrape! For every Word document on the instance: extract its links, request each
# link, store the response in S3, and record one row per link in a CSV.
from boto3.session import Session

bucket_name = 'name_of_s3_bucket' # replace with your bucket name

session = Session(aws_access_key_id=ACCESS_KEY,
                  aws_secret_access_key=SECRET_KEY)
s3 = session.resource('s3')
your_bucket = s3.Bucket(bucket_name)

name_of_run = "run2" #### this names the CSV you create, can be adjusted accordingly
docs_dir = '/home/ec2-user/ec2docs'
request_timeout = 30 # seconds; without a timeout one unresponsive server stalls the whole run

# Matches text that starts with '>http' and ends at the next '<'; the capture group
# keeps only the URL itself (without the surrounding '>' and '<').
link_pattern = re.compile(r'>(http.*?)<')

file_list = os.listdir(docs_dir)
all_records = [] # one dict per link, with keys 'idx', 'docname', 'url', 'comment'

# sorted() puts the files in alphanumeric order so the numbering is reproducible;
# enumerate() numbers them (file_count starts at 0, hence the +1 below).
for file_count, file in enumerate(sorted(file_list)):
    # A .docx file is a zip archive; the document body lives in word/document.xml.
    pathway = docs_dir + '/' + file
    with zipfile.ZipFile(pathway) as document: # closed even if read() raises
        xml_content = document.read('word/document.xml')
    # Decode the bytes instead of calling str() on them: str(bytes) yields the
    # "b'...'" repr, which mangles non-ASCII characters and backslashes in URLs.
    xml_str = xml_content.decode('utf-8', errors='replace')

    # Find the links in the XML and replace &amp; with &, and other html entities.
    link_list = [html.unescape(x) for x in link_pattern.findall(xml_str)]

    for link_count, link in enumerate(link_list):
        # Build the id string, e.g. "link_001_005" is the fifth link in the first document.
        idx = "link_%03d_%03d" % ((file_count+1), (link_count+1))
        comment = ""

        try:
            # A random User-agent is sent with every request.
            r = requests.get(link, headers={"User-agent": str(uuid())},
                             timeout=request_timeout)

            if "youtube" in link:
                # YouTube pages are only flagged, not stored.
                comment = "youtube video"
            elif ".pdf" in link:
                key = "pdfs/" + idx + ".pdf"
                your_bucket.put_object(Key=key, Body=r.content,
                                       ContentType='application/pdf',
                                       ContentDisposition='inline')
                comment = "pdf file"
            else:
                key = "html_content/" + idx + ".html"
                your_bucket.put_object(Key=key, Body=r.text, ContentType='text/html')

        except Exception:
            # Best effort: a failed request or upload is recorded and the run goes on.
            # Exception (not a bare except) lets KeyboardInterrupt stop the run.
            comment = "scraping error"

        # Append this link's record; it becomes one row of the data frame.
        all_records.append({'idx': idx, 'docname': file, 'url': link, 'comment': comment})
    # Pause a random 3-8 seconds before moving on to the next document.
    sleep(random.randrange(3, 9))

# Make the pandas data frame and write it as <name_of_run>.csv in the current working directory.
df_long = pd.DataFrame(all_records, columns=['idx', 'docname', 'url', 'comment'])
csv_name = name_of_run+'.csv'
df_long.to_csv(csv_name)