In [2]:
#! pip install awesome-slugify

### HTML SCRAPING ###

In [19]:
import os # for file paths below
from urllib.parse import urljoin, urlparse # to resolve relative links and compare hosts

import bs4 # beautifulsoup - for text munging
import requests # to download data
from slugify import slugify # used to slugify names - for saving files with unique names

# Remember to install awesome-slugify, NOT slugify

In [20]:
# Seed pages: the links found on each of these are what gets downloaded.
sources = [
    'https://www.nytimes.com/',
]

Getting links from a page to scrape further

In [21]:
# this function takes a url - splits it up to get the domain
# save content of url to html variable
# parses it with bs4.BeautifulSoup and the html5lib parser (alternative is html.parser)
# lxml is also possible but needs separate installation
# finds all the links using the html a identifier
# if one of the links has the same domain (e.g. an ad won't) then save as sub-url
# and download the content of that sub-url
# these are then all saved
# this is just one layer, will need to write additional functions or recursive function to drill down more

def crawl(url, save_dir='../data/html/', timeout=30):
    """Download every same-site page linked from ``url`` and save it to disk.

    The page at ``url`` is fetched and parsed, and each ``<a href=...>`` that
    points at the same site (the host of ``url`` without a leading ``www.``,
    or any sub-domain of it) is downloaded once and written to
    ``<save_dir>/<slugified link text>.html``.  Only one level of links is
    followed.

    Parameters
    ----------
    url : str
        Absolute http(s) URL of the page whose links are harvested.
    save_dir : str, optional
        Directory the pages are written to; created if it does not exist.
    timeout : float, optional
        Seconds to wait on each HTTP request before giving up.

    Returns
    -------
    None

    Raises
    ------
    requests.RequestException
        If the start page itself cannot be fetched.  Failures on individual
        linked pages are reported with ``print`` and skipped.

    Notes
    -----
    Links without plain text (``link.string`` is ``None``) are skipped, and
    two different URLs with the same link text overwrite one another because
    the file name is derived from that text only.
    """
    # Compare real host names: a substring test on the raw URL also matches
    # foreign links that merely mention the domain, and splitting on
    # "//www." yields "https:" for URLs that have no "www." prefix.
    site = urlparse(url).hostname or ''
    if site.startswith('www.'):
        site = site[len('www.'):]

    html = requests.get(url, timeout=timeout).content
    soup = bs4.BeautifulSoup(html, "html5lib")

    # Absolute URL -> link text, in document order, so each page is fetched once.
    targets = {}
    for link in soup.find_all('a', href=True):
        page_name = link.string
        if not page_name:
            # No usable text to name the file after; skip before downloading.
            continue
        try:
            sub_url = urljoin(url, link['href'].strip())  # resolves relative hrefs
            parts = urlparse(sub_url)
            host = parts.hostname or ''
        except ValueError:
            # Malformed href (e.g. a broken IPv6 literal).
            continue
        if parts.scheme not in ('http', 'https'):
            continue  # mailto:, javascript:, tel:, ...
        if host != site and not host.endswith('.' + site):
            continue  # off-site link, e.g. an ad
        targets.setdefault(sub_url, page_name)

    os.makedirs(save_dir, exist_ok=True)
    for sub_url, page_name in targets.items():
        filename = slugify(page_name).lower() + '.html'
        if filename == '.html':
            # The link text slugified to nothing; there is no meaningful name.
            continue
        filepath = os.path.join(save_dir, filename)
        try:
            response = requests.get(sub_url, timeout=timeout)
            response.raise_for_status()  # do not store HTTP error pages
            with open(filepath, 'wb') as f:
                f.write(response.content)
        except (requests.RequestException, OSError) as exc:
            # Best effort: report the failure and carry on with the next link.
            print('Skipping {}: {}'.format(sub_url, exc))

**Single threaded**

In [22]:
# for url in sources:
#     crawl(url)

**Multi threaded**

In [23]:
from multiprocessing.dummy import Pool

def multi_proc_crawl(url_list, processes=2):
    """Run ``crawl`` on every URL in ``url_list`` using a pool of workers.

    ``Pool`` here comes from ``multiprocessing.dummy``, so the workers are
    threads rather than processes.

    Parameters
    ----------
    url_list : iterable of str
        URLs handed to ``crawl`` one at a time.
    processes : int, optional
        Number of workers in the pool.

    Returns
    -------
    None

    Raises
    ------
    Exception
        Whatever ``crawl`` raises in a worker is re-raised here by
        ``pool.map``.
    """
    pool = Pool(processes)
    try:
        pool.map(crawl, url_list)
    finally:
        # Always release the workers, even when a crawl fails.
        pool.close()
        pool.join()

In [24]:
# Crawl every seed URL in `sources`, using a pool of 4 workers.
url_list = sources
multi_proc_crawl(url_list, processes=4)

<hr>

### HTML PROCESSING ###

In [25]:
import bs4

Extract the text of every element whose tag is listed in TAGS below

In [26]:
# Tags whose text is extracted. Standard HTML headings stop at h6; 'h7' is kept as-is.
TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'p', 'li']

def html2text(path):
    """Yield the text of every element in the HTML file whose tag is in TAGS.

    Parameters
    ----------
    path : str
        Path of the HTML file to read.

    Yields
    ------
    str
        ``get_text()`` of each matching element, in the order ``find_all``
        returns them.

    Notes
    -----
    This is a generator: nothing is read or parsed until iteration starts.
    """
    # Read raw bytes and let BeautifulSoup work out the encoding, instead of
    # decoding with the platform default, which can fail on saved pages.
    with open(path, 'rb') as f:
        html = f.read()
    # Parse after the file is closed so the handle is not held open while
    # the generator is suspended between yields.
    soup = bs4.BeautifulSoup(html, 'html5lib')
    for tag in soup.find_all(TAGS):
        yield tag.get_text()

Get a list of all files in the folder whose names end with .html
<br/> 
**re** is a library for regular expressions

In [27]:
import re

save_dir = '../data/html/'
# str.endswith is an exact suffix test; an unanchored regex such as '.html'
# treats "." as a wildcard and matches anywhere in the name.
files_list = [f for f in os.listdir(save_dir) if f.endswith('.html')]
files_list[:10]

['terms-of-sale.html',
 'classifieds.html',
 'listen-to-the-daily.html',
 'unbuttoned-fashions-gossip-addiction.html',
 'n-y-region.html',
 'subscribe.html',
 'how-facebooks-ad-system-works.html',
 'slinky-dresses-meet-slouchy-trousers.html',
 'on-the-market-homes-for-sale-in-new-york-and-connecticut.html',
 'video-opinion.html']

In [28]:
# Preview: print every extracted text block from the first three saved pages.
for f in files_list[:3]:
    path = os.path.join(save_dir, f)
    # html2text is a generator that hands back one text block at a time.
    for paragraph in html2text(path):
        print(paragraph, end='\n\n')

Help

Home Page

Today's Paper

Video

Most Popular






Help


Terms of Sale

Last Updated on February 8, 2017

This page provides the terms of sale and purchase for The New York Times digital products, including digital subscriptions and one-time purchases, offered by The New York Times newspaper and NYTimes.com ("NYTimes" or "we" or "us").

Other useful links:

Home Delivery Web SiteTerms of ServicePrivacy Policy

By using NYTimes.com and NYTimes products, you are agreeing to our Terms of Service.

1. GENERAL TERMS OF SALE     1.1. Definitions     1.2. About Digital Products     1.3. Processing and Payment     1.4. Pricing     1.5. Billing          1.5.1. Billing of Digital Subscriptions          1.5.2. Billing of One-Time Purchases          1.5.3. Currency     1.6. Unpaid Charges     1.7. Promotions     1.8. Software Products     1.9. Third Parties

___________

2. CANCELLATION AND REFUND POLICY

     2.1. Cancellation and Refunds of Digital Subscriptions     2.2. Cancellation Dur


                                        Jobs
                                    


                                        Magazine
                                    


                                        N.Y.C. Events Guide
                                    


                                        Real Estate
                                    


                                        T Magazine
                                    


                                        Travel
                                    


                                        Weddings & Celebrations
                                    

Listings & More


                                        Reader Center
                                    


                                        Classifieds
                                    


                                        Tools & Services
                                    


                                        N.Y.C. Events Guide
 