In [1]:
import requests
from bs4 import BeautifulSoup
from bs4 import SoupStrainer
import re
import time
import pickle
from collections import defaultdict
import smtplib
import pandas as pd
from copy import deepcopy
from urllib.parse import urlparse
from pymongo import MongoClient
import pymongo
import os

In [4]:
# Password for the notification Gmail account used by the e-mail notice functions below.
# Read from the environment so it is never stored in the notebook; raises KeyError if unset.
utility_password = os.environ['email_bot_password']

In [41]:
# MongoDB password, spliced into the connection URI when the client is created.
# Read from the environment; raises KeyError if unset.
password = os.environ['password']
# Host of the MongoDB server holding libera_db.
# NOTE(review): hard-coded address -- consider reading it from configuration like the password.
libera_url = "54.164.158.211"

### E-mail notice functions

In [3]:
def requests_error_notice(error_code, current_url):
    """Send an e-mail notification that a GET request for a url failed.

    Args:
        error_code: HTTP status code returned for the url, or 0 when
            requests could not connect at all. Formatted with ``%i``.
        current_url: The url that could not be scraped.

    Raises:
        smtplib.SMTPException, OSError: If connecting to the mail server,
            logging in, or sending the message fails.
    """
    # The blank line after the Subject header marks the start of the body.
    message = (
        'Subject: Get request failed\n'
        '\n'
        'Requests failed to scrape the following url\n'
        'url: %s\n'
        'error code: %i (if 0 requests failed to connect)'
        % (current_url, error_code)
    )
    # The context manager sends QUIT and closes the socket even when login
    # or sendmail raises, so a failed notice no longer leaks the connection.
    with smtplib.SMTP('smtp.gmail.com', 587) as smtp:
        smtp.ehlo()
        smtp.starttls()
        smtp.login('pfblack.utility@gmail.com', utility_password)
        # sendmail() ASCII-encodes str messages; sending bytes keeps a url
        # with non-ASCII characters from raising UnicodeEncodeError.
        smtp.sendmail('pfblack.utility@gmail.com', 'paul.laifu.black@gmail.com',
                      message.encode('utf-8'))

In [4]:
def duplicate_notice(db_type, db, col, url, duplicate_count):
    """Send an e-mail notification that a url is stored more than once.

    Args:
        db_type: Label for the kind of database, inserted into the message.
        db: Database identifier, formatted with ``%s``.
        col: Collection identifier, formatted with ``%s``.
        url: The url that was found multiple times.
        duplicate_count: Number of entries found; formatted with ``%i``.

    Raises:
        smtplib.SMTPException, OSError: If connecting to the mail server,
            logging in, or sending the message fails.
    """
    # The blank line after the Subject header marks the start of the body.
    message = (
        'Subject: Duplicate Found\n'
        '\n'
        'A duplicate entry has been found on your %s database.\n'
        'Database: %s\n'
        'Collection: %s\n'
        'Url: %s\n'
        'Count: %i'
        % (db_type, db, col, url, duplicate_count)
    )
    # The context manager sends QUIT and closes the socket even when login
    # or sendmail raises, so a failed notice no longer leaks the connection.
    with smtplib.SMTP('smtp.gmail.com', 587) as smtp:
        smtp.ehlo()
        smtp.starttls()
        smtp.login('pfblack.utility@gmail.com', utility_password)
        # sendmail() ASCII-encodes str messages; sending bytes keeps a url
        # with non-ASCII characters from raising UnicodeEncodeError.
        smtp.sendmail('pfblack.utility@gmail.com', 'paul.laifu.black@gmail.com',
                      message.encode('utf-8'))

In [5]:
def termination_notice(location):
    """Send an e-mail notification that a function/process has concluded.

    Note: Requires that no unhandled errors were thrown and that the
    function concluded normally.

    Args:
        location: Description of where the crawler was running; inserted
            into the message with ``%s``.

    Raises:
        smtplib.SMTPException, OSError: If connecting to the mail server,
            logging in, or sending the message fails.
    """
    # The blank line after the Subject header marks the start of the body.
    message = (
        'Subject: Progam Stopped Running\n'
        '\n'
        'Your WebCrawler has terminated on %s!' % location
    )
    # The context manager sends QUIT and closes the socket even when login
    # or sendmail raises, so a failed notice no longer leaks the connection.
    with smtplib.SMTP('smtp.gmail.com', 587) as smtp:
        smtp.ehlo()
        smtp.starttls()
        smtp.login('pfblack.utility@gmail.com', utility_password)
        # sendmail() ASCII-encodes str messages; bytes avoid a
        # UnicodeEncodeError on non-ASCII text.
        smtp.sendmail('pfblack.utility@gmail.com', 'paul.laifu.black@gmail.com',
                      message.encode('utf-8'))

In [6]:
# Global Variables

# Shared MongoDB handles used by the crawler functions.
# NOTE(review): the password is concatenated into the URI as-is; if it can
# contain characters such as '@', ':' or '/', it must be percent-escaped
# first -- confirm against the PyMongo connection-string rules.
client = pymongo.MongoClient("mongodb://paul:" + password + "@" + libera_url + "/libera_db")

db = client.libera_db

col = db.scraped_blogs

# Passed as parse_only= so BeautifulSoup builds a tree for the <body> element only.
only_body_tag = SoupStrainer("body")

# Matches the start of an absolute link. The dots are escaped so that "www."
# requires a literal dot instead of matching "www" followed by any character.
http_pattern = re.compile(r'https://www\.|https://|http://www\.|http://|www\.')

# Default description of where the crawler runs (used in the termination e-mail).
location = 'local computer'

In [7]:
# Load the pickled data set. The with-block closes the file handle, which the
# bare open() call left open.
# NOTE(review): pickle.load can execute arbitrary code -- only load files you
# created yourself. `df` is not referenced by any other cell visible here.
with open("mongodb_data.p", 'rb') as pickle_file:
    df = pickle.load(pickle_file)

In [7]:
def run_crawler(max_iterations, location):
    """Crawl outward from ``max_iterations`` seed urls, then e-mail a notice.

    Every pass takes one seed from get_seed(), downloads it with
    make_request() and, if a response came back, parses its <body> and hands
    it to get_links(). Seeds whose download failed are skipped.

    NOTE: an identical definition of run_crawler appears again in a later
    cell; when the notebook is run top to bottom that later one is in effect.

    Args:
        max_iterations: Number of seeds to process.
        location: Passed to termination_notice() once the loop finishes.
    """
    for _iteration in range(max_iterations):
        seed_url = get_seed()
        response = make_request(seed_url)
        if not response:
            # Download failed; make_request has already recorded it.
            continue
        body_soup = BeautifulSoup(response.text, "lxml", parse_only=only_body_tag)
        get_links(body_soup, seed_url)
    termination_notice(location)

In [8]:
def get_seed():
    """Pick an open seed url from the scraped_blogs collection and close it.

    Open seeds are scanned in natural order (the ``$natural`` hint). The
    chosen document's ``open_seed`` field is set to False before its url is
    returned, so it is not handed out again.

    Returns:
        The ``url`` field of the chosen seed document.
    """
    open_seeds = col.find({'open_seed':True}).hint([('$natural', 1)]).limit(3)
    # NOTE(review): index 2 selects the THIRD open seed, not the first, and
    # presumably fails (IndexError) when fewer than three are open -- confirm
    # this skip is intentional.
    seed_url = open_seeds[2]['url']
    col.update_one({'url':seed_url},{'$set':{'open_seed':False}})
    return seed_url

In [9]:
# Used to pull up the seed page AND to request each link found on it
def make_request(current_url, timeout=30):
    """Fetch ``current_url`` and return the requests response, or None on failure.

    A failure is either an exception while requesting the page (recorded with
    error code 0) or any status code other than 200 (recorded with that
    status code). In both cases the url's database entry is flagged with
    ``failed_scrape`` / ``error_code`` and an e-mail notice is sent.

    Args:
        current_url: The url to request.
        timeout: Seconds to wait for the server before giving up, so one
            unresponsive host cannot stall the crawl indefinitely.

    Returns:
        The response object when the status code is 200, otherwise None.
    """
    try:
        source_code = requests.get(current_url, timeout=timeout)
    # "except Exception" keeps the crawl best-effort for any request failure
    # while letting KeyboardInterrupt / SystemExit stop the run. Only the
    # request is inside the try, so database or e-mail errors are no longer
    # misreported as connection failures.
    except Exception:
        _record_failed_request(current_url, 0)
        return None
    # Pause between requests.
    time.sleep(1)
    if source_code.status_code != 200:
        _record_failed_request(current_url, source_code.status_code)
        return None
    return source_code


def _record_failed_request(current_url, error_code):
    """Flag the url's database entry as a failed scrape and e-mail a notice."""
    col.update_one({'url':current_url},{'$set':{'failed_scrape':True, 'error_code':error_code}})
    requests_error_notice(error_code, current_url)
    

Check each link against the database before expanding (requesting) it, to save time.

In [10]:
# Be sure to define current url
def get_links(soup, current_url):
    """Process every link found in current_url's soup.

    Links are skipped when they have no href, do not start like an absolute
    url (http_pattern), or match the avoid pattern.

    For each remaining link:
    - If its href is already in the database, an external link is credited
      with one more inbound link and nothing else happens.
    - Otherwise the link is requested. The response's final url (which
      expands shortened urls) is checked against the database the same way.
    - A url that is still new is scraped and stored via update_blog_entry().

    New links are stored with open_seed: True, meaning they can be used as a
    seed for a later step. This value is set to False when the page is used
    as a seed.

    Args:
        soup: BeautifulSoup object of the page at current_url.
        current_url: Url of the page the links were found on.
    """
    # avoid pictures, files, and facebook pages, etc.
    # Raw strings keep the regex escapes (\. and \?) from being read as
    # invalid string escapes; the compiled pattern is unchanged.
    avoid_pattern = re.compile(
        r'zillow\.com|coursera\.org|yelp\.com|flickr|tumblr|comments$|amazon.com|'
        r'plus.google.com|linkedin.com|youtube.com|jobs.acm.org|vimeo.com|'
        r'http://awards.acm.org|instagram.com|twitter.com|respond$|comment$|'
        r'.pdf$|.png$|.jpg$|.jpeg$|.gif$|.xlsx$|wikipedia.org|facebook|'
        r'\?share=|\?ref=footer_website|\?ref=footer_blog'
    )
    for a_tag in soup.find_all('a'):
        href = a_tag.get('href')
        # Skip missing, non-absolute and unwanted links.
        if href is None or not http_pattern.match(href) or avoid_pattern.search(href):
            continue
        # Checking before requesting saves a download for known links.
        if previously_grabbed(href):
            _record_inbound_link(href, current_url)
            continue
        source_code = make_request(href)
        if not source_code:
            # make_request has already recorded and reported the failure.
            continue
        # failsafe for situations where url has been shortened, expands url
        try:
            url = source_code.url
        except AttributeError:
            requests_error_notice(0, href)
            continue
        # The expanded url may already be stored even though href was not.
        if previously_grabbed(url):
            _record_inbound_link(url, current_url)
            continue
        # scrape_url raises AttributeError when the page has no <body>.
        try:
            blog_text = scrape_url(source_code)
            update_blog_entry(url, blog_text, current_url)
        except AttributeError:
            requests_error_notice(0, href)


def _record_inbound_link(target_url, current_url):
    """Credit target_url with an inbound link from current_url if the link is external."""
    if external_link(target_url, current_url):
        col.update_one({'url':target_url}, {'$inc': {'inbound_link_count':1}, '$push': {'urls_pointed_here':current_url}})

In [11]:
# Helper function for get_links
def external_link(url, current_url):
    """Tell whether ``url`` points outside the site that ``current_url`` is on.

    Used for a mock page-rank system that tallies incoming links from
    external web pages across scraped pages.

    Two urls count as the same site only when both their scheme and their
    network location match.

    Returns:
        False for an internal link, True for an external one.
    """
    link_parts = urlparse(url)
    page_parts = urlparse(current_url)
    same_site = (link_parts.scheme, link_parts.netloc) == (page_parts.scheme, page_parts.netloc)
    return not same_site

In [12]:
# Helper function for get_links
def previously_grabbed(url):
    """Check whether ``url`` is already stored in the database collection.

    If the url is stored more than once something has gone wrong, and an
    e-mail is sent out to notify users.

    Args:
        url: The url to look up.

    Returns:
        True if at least one entry exists for the url, False otherwise.
    """
    # Count once and reuse the result instead of re-running the same query.
    # NOTE(review): Cursor.count() is kept to match the PyMongo version this
    # notebook was written for; newer releases use count_documents().
    match_count = col.find({'url':url}).count()
    if match_count > 1:
        # The original passed an undefined name (db_type) here, so this branch
        # raised NameError. The database and collection names are sent instead
        # of the raw objects so the e-mail stays readable.
        duplicate_notice('MongoDB', db.name, col.name, url, match_count)
    return match_count >= 1

In [13]:
def scrape_url(source_code):
    """Pull the visible text out of a downloaded page's body.

    Only the direct children of <body> are visited. Bare strings and
    script, input, noscript, style and option elements are skipped (to
    avoid JavaScript and form noise).

    Args:
        source_code: Response object for the page; its ``.text`` is parsed.

    Returns:
        The collected text as a single space-joined string.

    Raises:
        AttributeError: If the page has no <body> element (get_links catches
            this and sends an error notice).
    """
    skipped_tags = {'script', 'input', 'noscript', 'style', 'option'}
    soup = BeautifulSoup(source_code.text, "lxml", parse_only=only_body_tag)
    # still grabbing some javascript
    # child.name is None for bare strings between tags.
    # The original also called blog_text.encode('utf-8') and discarded the
    # result; that dead statement is dropped and a str is returned as before.
    return ' '.join(
        child.get_text(' ', strip=True)
        for child in soup.body.children
        if child.name is not None and child.name not in skipped_tags
    )

In [14]:
# Needs to be Reworked! But for now we're turning it off to accept all pages
# Runs inside of update_blog_entry
def quality_check(url, blog_text):
    """Decide whether a scraped page is relevant enough to keep.

    The planned check is a Naive Bayes Classifier trained on categorized
    blog posts from the initial scraping. Until that exists every page is
    accepted.

    Args:
        url: Url of the scraped page (currently unused).
        blog_text: Text scraped from the page (currently unused).

    Returns:
        True if the page meets the relevancy criteria, else False.
        Currently always True.
    """
    # Insert Naive Bayes classification here. Example pseudo-code:
    #     category = nb_classifier.predict(blog_text)
    #     if category == 1:
    #         return True
    #     else:
    #         return False
    return True

In [15]:
def update_blog_entry(url, blog_text, current_url):
    """Insert a newly scraped page into the MongoDB collection.

    Pages that fail the quality check are stored without their text and
    without an open_seed flag. Pages that pass are stored with their text
    and open_seed: True. A page reached through an external link starts with
    inbound_link_count 1 and current_url in urls_pointed_here; one reached
    through an internal link starts with inbound_link_count 0.

    Stored fields:

    hand_reviewed:
    --- True if this entry has been hand reviewed
    --- False if this entry has not been hand reviewed

    quality:
    --- True if this web page passed quality check
    --- False if this web page failed quality check

    Args:
        url: Url of the scraped page.
        blog_text: Text scraped from the page.
        current_url: Url of the page on which the link to ``url`` was found.
    """
    if not quality_check(url, blog_text):
        col.insert_one({'url': url, 'hand_reviewed': False, 'quality': False})
        return

    entry = {
        'url': url,
        'text': blog_text,
        'hand_reviewed': False,
        'quality': True,
        'open_seed': True,
    }
    if external_link(url, current_url):
        entry['inbound_link_count'] = 1
        entry['urls_pointed_here'] = [current_url]
    else:
        entry['inbound_link_count'] = 0
    col.insert_one(entry)

In [117]:
def run_crawler(max_iterations, location):
    """Run the crawl loop for ``max_iterations`` seeds and report completion.

    NOTE: this repeats the run_crawler definition from an earlier cell;
    being defined last, it is the one the launch cell calls.

    For each iteration a seed url is taken with get_seed() and downloaded
    with make_request(). When a response is returned its <body> is parsed
    and passed to get_links(); otherwise the seed is skipped. After the loop
    a termination e-mail is sent.

    Args:
        max_iterations: Number of seeds to process.
        location: Where the crawler is running; passed to termination_notice().
    """
    for _step in range(max_iterations):
        seed = get_seed()
        page = make_request(seed)
        if not page:
            # Nothing to parse for this seed.
            continue
        parsed_body = BeautifulSoup(page.text, "lxml", parse_only=only_body_tag)
        get_links(parsed_body, seed)
    termination_notice(location)

In [None]:
# Launch the crawl: process 225 seeds, then send the termination e-mail
# tagged with 'aws'.
max_iterations = 225
run_crawler(max_iterations=max_iterations, location='aws')