# DS320 Final Project
## The Blaze Data Collection Notebook
### Noah B Johnson

## Import Modules

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import urllib3
import random
import time
import json
from tinydb import TinyDB, Query
import feedparser

## Configure Environment

In [72]:
# Base URL of the site being scraped; getArticles() prefixes relative
# article links with it
baseUrl = "https://www.theblaze.com"
# Name of the text file holding the article URL list (one URL per line)
urlListName = "urls.txt"
# Scraping result json file name
# NOTE(review): not referenced in the visible part of the notebook - confirm use
scrapedDataName = "scraped.json"
# Output json file name
# NOTE(review): not referenced in the visible part of the notebook - confirm use
outputFileName = "out.json"
# TinyDB database (backed by db.json) that scrapeDB() checks and inserts into
db = TinyDB('db.json')

## Build List of Articles to Scrape

In [3]:
def getArticles(response):
    """Collect the article links found in an HTML response.

    An anchor counts as an article link when its ``href`` contains
    ``"/news/"``. Every such link is resolved against ``baseUrl`` so that
    relative (``/news/...``), protocol-relative (``//host/news/...``) and
    absolute links all end up as well-formed absolute URLs. Links that
    resolve to a different host (share buttons, external sites) are dropped
    instead of being glued onto ``baseUrl``.

    Args:
        response: object whose ``text`` attribute holds the page HTML
            (for example the result of ``requests.get``).

    Returns:
        A set of absolute article URL strings on the ``baseUrl`` host.
    """
    # Local import keeps this function self-contained.
    from urllib.parse import urljoin, urlparse

    soup = BeautifulSoup(response.text, 'html.parser')
    siteHost = urlparse(baseUrl).netloc.lower()
    scrapeUrls = set()
    for a in soup.find_all("a"):
        # Anchors without an href (e.g. named anchors) are simply skipped.
        href = a.get("href")
        if not href or "/news/" not in href:
            continue
        absolute = urljoin(baseUrl, href)
        # Only keep links that really live on the site being scraped.
        if urlparse(absolute).netloc.lower() == siteHost:
            scrapeUrls.add(absolute)
    return scrapeUrls

In [30]:
def buildArticleList(daysBack, startDate=None):
    """Build a set of article links from theblaze.com.

    Links are gathered from the front page and from one daily archive page
    (``<baseUrl>/news/<year>/<month>/<day>``) per day, walking backwards in
    time from ``startDate``.

    Real calendar arithmetic is used, so only dates that exist are requested
    and ``daysBack`` corresponds to actual days (no "November 31st").

    Args:
        daysBack: how many days of archive pages to collect.
        startDate: ``datetime.date`` of the most recent archive page to
            fetch. Defaults to 2018-11-16, the date this project used.

    Returns:
        A set of article URL strings.
    """
    import datetime

    if startDate is None:
        startDate = datetime.date(2018, 11, 16)
    oneDay = datetime.timedelta(days=1)

    # Get the articles on the front page
    articleList = getArticles(requests.get(baseUrl + "/"))

    # Go through the archive pages and add article links to the set
    current = startDate
    pages = 0
    while pages < daysBack:
        # Archive urls use un-padded year/month/day path segments
        pageLink = baseUrl + "/news/{}/{}/{}".format(
            current.year, current.month, current.day
        )
        articleList = articleList.union(getArticles(requests.get(pageLink)))

        # Step back one calendar day
        current = current - oneDay
        pages += 1

    return articleList

In [5]:
# Run the link generation function

# Load from file after first run!!
# urlList = buildArticleList(1100)

In [6]:
# Write the urls to a file

# Load from file after first run!!
# with open(urlListName, 'w') as article_urls:
#     for line in urlList:
#         article_urls.write(line)
#         article_urls.write("\n")

In [7]:
# Read the urls from the file

# Open the url list with a context manager so the file handle is closed as
# soon as reading is done, and drop the trailing newline from each url
with open(urlListName, 'r') as urlFile:
    urls = [line.replace("\n", "") for line in urlFile]

## Scrape List of Articles

In [8]:
def parseArticle(response):
    """Scrape one article page for its time, author, title and body.

    The body is the text of every ``p``, ``h3`` and ``ul`` element inside
    the ``entry-content article-styles`` div, one element per line. Elements
    whose text contains ``"H/T: "`` (hat-tip credits) are left out.

    Args:
        response: object with a ``text`` attribute (page HTML) and a ``url``
            attribute, such as a ``requests`` response.

    Returns:
        A dict with the keys ``url``, ``time``, ``author``, ``title``,
        ``body`` and ``site``, or ``None`` when any of the fields cannot be
        found on the page.
    """
    try:
        soup = BeautifulSoup(response.text, 'html.parser')
        # Named `published` so the imported `time` module is not shadowed.
        published = soup.find('time')["datetime"]
        author = soup.find('span', {'class': 'author-name'}).text
        title = soup.find("h1", {"class": "page-title"}).text
        content = soup.find('div', {'class': 'entry-content article-styles'})
        body = ''.join(
            node.text + '\n'
            for node in content.find_all(['p', 'h3', 'ul'])
            if "H/T: " not in node.text
        )
    except Exception:
        # A missing element shows up as TypeError/AttributeError/KeyError.
        # Catching Exception keeps the scrape best-effort while still letting
        # KeyboardInterrupt and SystemExit stop a long run.
        return None
    return {
        'url': response.url,
        'time': published,
        'author': author,
        'title': title,
        'body': body,
        'site': 'theblaze'
    }

In [10]:
def scrapeDB(lines):
    """Scrape a list of article urls into the TinyDB database ``db``.

    Urls already present in the database (matched on the stored ``url``
    field) are skipped, so an interrupted run can simply be restarted.
    Progress is printed as a percentage every 50 urls.

    Urls that are invalid, cannot be fetched, or whose page cannot be parsed
    are skipped without stopping the run.

    Args:
        lines: list of article url strings.
    """
    # query object for checking urls
    q = Query()
    total = len(lines)

    for count, line in enumerate(lines):
        # print progress every 50 articles
        if count % 50 == 0:
            print(str(count / total * 100) + "%")

        # NOTE(review): the stored url is response.url, which may differ from
        # the requested url after a redirect - confirm this check still
        # matches in that case.
        if db.contains(q.url == line):
            continue

        # Sleep only before a real request, to avoid https issues or getting
        # the ip blocked; urls already in the db cost no delay.
        time.sleep(.1)
        try:
            response = requests.get(line)
        except (ValueError, requests.exceptions.RequestException):
            # Invalid url or network failure: skip this article.
            continue

        # parseArticle returns None when the page lacks the expected fields;
        # skip those explicitly instead of letting the insert fail.
        article = parseArticle(response)
        if article is not None:
            db.insert(article)

In [74]:
# Sample 4.5k urls from the 14505 in the list
# scrapeDB(random.sample(urls, 4500))