In [1]:
from IPython.display import display, HTML

# Stretch the notebook's .container element to the full browser width.
# The width value needs a unit ("100%") and the <style> element needs its
# closing ">"; without them the injected CSS is malformed.
display(HTML("<style>.container{width:100% !important;}</style>"))

# Blog Scraping Notebook

In [2]:
pip install textstat

Note: you may need to restart the kernel to use updated packages.


In [3]:
import requests
from bs4 import BeautifulSoup
from dateutil import parser
from textstat.textstat import textstat

In [4]:
# Browser-like request headers sent with every page fetch in this notebook.
headers = {
    'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.106 Safari/537.36',
    # The HTTP request header is spelled "Referer" (single "r" in the middle);
    # a "referrer" key would be sent as an unrelated custom header.
    'Referer': "https://google.com"
}


# Scraping one article

In [5]:
# Sample article used to work out the parsing steps in the following cells.
url = 'https://blog.frame.io/2018/10/01/womans-experience-cutting-blockbusterrs/'
# Fetch the page, sending the browser-like `headers` dict with the request.
r = requests.get(url, headers=headers)


In [6]:
r.status_code


200

In [7]:
html = r.text.strip()
print(html)

<!DOCTYPE html>

<script>
 var fioPageType = ''; 
var fioPageSlug = ''; 
fioPageType = 'article';
 fioPageSlug = 'womans-experience-cutting-blockbusterrs';
 </script>
<html lang="en-US">
	<head>
		<meta charset="UTF-8">
		<meta name="viewport" content="width=device-width, initial-scale=1">

		<link rel="profile" href="http://gmpg.org/xfn/11">

		

		<title>On Gender and Genre: A Woman’s Experience Cutting Action Blockbusters</title>

<!-- This site is optimized with the Yoast SEO plugin v12.8.1 - https://yoast.com/wordpress/plugins/seo/ -->
<meta name="description" content="“Mile 22” Editor Melissa Lawson Cheung shares her experience cutting action blockbusters, and what she brings to the cutting room as a woman."/>
<meta name="robots" content="max-snippet:-1, max-image-preview:large, max-video-preview:-1"/>
<link rel="canonical" href="https://blog.frame.io/2018/10/01/womans-experience-cutting-blockbusterrs/" />
<meta property="og:locale" content="en_US" />
<meta property="

In [8]:
pip install lxml

Note: you may need to restart the kernel to use updated packages.


In [9]:
soup = BeautifulSoup(html, 'lxml')
print(type(soup))

<class 'bs4.BeautifulSoup'>


In [10]:
soup = BeautifulSoup(html, 'lxml')
print(type(soup))

<class 'bs4.BeautifulSoup'>


In [11]:

# Header Content
# Find the element with class 'entry-header', then look inside it for the
# element with class 'post-meta-title' and show its raw markup.
header = soup.find(class_='entry-header')
title_html = header.find(class_='post-meta-title')
print(title_html)

<h1 class="post-meta-title">
								On Gender and Genre: A Woman’s Experience Cutting Action Blockbusters							</h1>


In [12]:
title_html.contents


['\r\n\t\t\t\t\t\t\t\tOn Gender and Genre: A Woman’s Experience Cutting Action Blockbusters\t\t\t\t\t\t\t']

In [13]:
title_str = title_html.contents[0].strip()
print(title_str)

On Gender and Genre: A Woman’s Experience Cutting Action Blockbusters


In [16]:

author_html = header.find(class_='author-name')
author_str = author_html.find('a').contents[0].strip()
print(author_str) 

Lisa McNamara


# Modularize code 

In [17]:
def parse_page(url, timeout=30):
    """Download one blog article and summarise it as a flat dict.

    Args:
        url: Address of the article page.
        timeout: Seconds handed to ``requests.get``. Without a timeout a
            stalled connection would block the whole crawl indefinitely.

    Returns:
        dict with the keys 'reading_time', 'title', 'date', 'month',
        'weekday', 'author', 'categories', 'word_count', 'reading_level',
        'link_count' and 'image_count'.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        ValueError: The page has no 'entry-header' or 'entry-content' element.
    """
    r = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on error responses instead of trying to scrape an error page.
    r.raise_for_status()
    html = r.text.strip()
    soup = BeautifulSoup(html, 'lxml')

    header = soup.find(class_='entry-header')
    content = soup.find(class_='entry-content')
    if header is None or content is None:
        # Report the offending URL rather than failing later on `None.find`.
        raise ValueError(
            'Page is missing entry-header or entry-content: ' + str(url)
        )

    # Header Content
    read_time = extract_read_time(header)
    title = extract_title(header)

    author = extract_author(header)
    categories = extract_categories(header)

    date = extract_date(header)
    # Month and weekday names are derived from the parsed date string.
    dt = parser.parse(date)
    month = dt.strftime("%B")
    weekday = dt.strftime("%A")

    # Body Content
    body_text = content.text  # extracted once, reused for both text metrics
    word_count = len(body_text.split())
    reading_level = textstat.flesch_kincaid_grade(body_text)

    link_count = len(content.find_all("a"))
    image_count = len(content.find_all("img"))

    page_data = {
        'reading_time': read_time,
        'title': title,
        'date': date,
        'month': month,
        'weekday': weekday,
        'author': author,
        'categories': categories,
        'word_count': word_count,
        'reading_level': reading_level,
        'link_count': link_count,
        'image_count': image_count
    }

    return page_data
    
def extract_read_time(header):
    """Return the leading number of the header's 'read-time' label as an int.

    The label is the first child of the element with class 'read-time'; it is
    stripped, lower-cased and split on whitespace, and the first token is
    converted with ``int``.
    """
    badge = header.find(class_='read-time')
    label = badge.contents[0]
    first_token = label.strip().lower().split()[0]
    return int(first_token)

def extract_title(header):
    """Return the article title with surrounding whitespace removed.

    The title is read from the first child of the element with class
    'post-meta-title' inside ``header``.
    """
    heading = header.find(class_='post-meta-title')
    raw_title = heading.contents[0]
    return raw_title.strip()

def extract_date(header):
    """Return the publication date text exactly as shown on the page.

    Takes the first child of the element with class 'single-post-date' and
    strips surrounding whitespace; no date parsing happens here.
    """
    first_child = header.find(class_='single-post-date').contents[0]
    return first_child.strip()

def extract_author(header):
    """Return the author's name taken from the byline link.

    Looks up the element with class 'author-name', then its first <a>
    descendant, and returns that link's first child with whitespace stripped.
    """
    byline = header.find(class_='author-name')
    anchor = byline.find('a')
    return anchor.contents[0].strip()

def extract_categories(header):
    """Return the lower-cased category names linked from the article header.

    Args:
        header: Element containing a child with class 'single-post-cat'.

    Returns:
        list of str, one per <a> inside that child, in document order. Each
        name is the link's first child, stripped and lower-cased. The list is
        empty when the child holds no links.
    """
    container = header.find(class_='single-post-cat')
    # find_all is the supported spelling (and what parse_page already uses);
    # findAll is a deprecated alias kept from Beautiful Soup 3.
    return [
        anchor.contents[0].strip().lower()
        for anchor in container.find_all('a')
    ]

In [19]:

url = 'https://blog.frame.io/2018/10/01/womans-experience-cutting-blockbusterrs/'
wmn_exp = parse_page(url)

In [24]:
print(wmn_exp)

{'reading_time': 13, 'title': 'On Gender and Genre: A Woman’s Experience Cutting Action Blockbusters', 'date': 'October 1, 2018', 'month': 'October', 'weekday': 'Monday', 'author': 'Lisa McNamara', 'categories': ['behind the scenes', 'interviews'], 'word_count': 2639, 'reading_level': 17.1, 'link_count': 6, 'image_count': 15}


# Scraping one category

In [26]:
articles_store = []

In [28]:
def parse_category(url, store=None, timeout=30):
    """Parse every article of a category listing, following 'Next' pages.

    Each listing page is fetched, every element with class 'post-content' is
    treated as one article card, and the linked article is parsed with
    ``parse_page``. Pagination is followed in a loop (not by recursion), so
    the call depth stays constant however many pages a category has.

    Args:
        url: Address of the first listing page of the category.
        store: List that receives one ``parse_page`` dict per article.
            Defaults to the notebook-level ``articles_store``, looked up at
            call time so a freshly re-created list is picked up.
        timeout: Seconds handed to ``requests.get`` for the listing pages
            only; article fetches use ``parse_page``'s own settings.

    Returns:
        None. Results are appended to ``store``; nothing is returned so that
        calling this as the last statement of a cell prints no large value.

    Raises:
        requests.HTTPError: A listing page answered with a 4xx/5xx status.
    """
    if store is None:
        store = articles_store

    page_url = url
    while page_url is not None:
        r = requests.get(page_url, headers=headers, timeout=timeout)
        r.raise_for_status()
        html = r.text.strip()
        soup = BeautifulSoup(html, 'lxml')

        # find_all is the supported spelling of the deprecated findAll alias.
        for article in soup.find_all(class_='post-content'):
            title = article.find(class_='post-meta-title')
            # The title element's first child is the link to the article.
            link = title.contents[0]['href']
            print('Parsing URL:', link)
            store.append(parse_page(link))

        page_url = find_next_link(soup)
        if page_url is not None:
            print('Next page:', page_url)

    return None

def find_next_link(soup_item):
    """Return the URL of the next listing page, or None when there is none.

    The last <a> inside the element with class 'navigation' counts as the
    "next" link only when its first child equals the text 'Next'.

    Args:
        soup_item: Parsed listing page exposing BeautifulSoup's ``find``.

    Returns:
        The link's 'href' value, or None if the page has no navigation
        element, the navigation contains no links, or the last link is not
        labelled 'Next'.
    """
    bottom_nav = soup_item.find(class_='navigation')
    if bottom_nav is None:
        return None

    links = bottom_nav.find_all('a')
    if not links:
        # A navigation block without links has no next page; indexing
        # links[-1] here would raise IndexError.
        return None

    last_link = links[-1]
    # Guard against an empty anchor before looking at its first child.
    if last_link.contents and last_link.contents[0] == 'Next':
        return last_link['href']

    return None

In [29]:
bts = 'https://blog.frame.io/category/behind-the-scenes/'
parse_category(bts)

Parsing URL: https://blog.frame.io/2020/02/10/oscars-2020-workflows/
Parsing URL: https://blog.frame.io/2020/02/06/jinmo-yang-road-to-oscars/
Parsing URL: https://blog.frame.io/2020/01/21/studio-workflow-on-indie-budget/
Parsing URL: https://blog.frame.io/2020/01/13/parasite-design-and-vfx/
Parsing URL: https://blog.frame.io/2019/12/23/john-lewis-christmas-ad/
Parsing URL: https://blog.frame.io/2019/12/16/block-and-tackle-mif-film/
Parsing URL: https://blog.frame.io/2019/11/21/exclusive-interview-joker/
Parsing URL: https://blog.frame.io/2019/11/11/bt-espn-sneakercenter/
Parsing URL: https://blog.frame.io/2019/10/28/ryan-connolly-there-comes-a-knocking/
Parsing URL: https://blog.frame.io/2019/10/23/loom-made-in-frame/
Parsing URL: https://blog.frame.io/2019/09/16/marvelous-mrs-maisel-editors/
Parsing URL: https://blog.frame.io/2019/08/05/made-in-frame-pixcom/
Next page: https://blog.frame.io/category/behind-the-scenes/page/2/
Parsing URL: https://blog.frame.io/2019/07/08/homecoming-edi

In [32]:
print(len(articles_store))
print(articles_store[0])


47
{'reading_time': 35, 'title': 'Workflow Breakdown of Every 2020 Oscars Best Picture and Editing Nominee', 'date': 'February 10, 2020', 'month': 'February', 'weekday': 'Monday', 'author': 'Alexander Huls', 'categories': ['behind the scenes', 'workflow process'], 'word_count': 6771, 'reading_level': 13.4, 'link_count': 53, 'image_count': 37}


# Scraping all categories

In [33]:
articles_store=[]

In [34]:

categories = ['post-production', 'color-correction', 'business', 'workflow', 'behind-the-scenes', 'production', 'announcement']

In [35]:
# Crawl every category listing in turn. parse_category appends each parsed
# article to articles_store without de-duplicating, so an article listed under
# several categories is stored once per category.
# NOTE: this loop rebinds the notebook-level name `url`.
for category in categories:
    url = 'https://blog.frame.io/category/' + category + '/'
    print('Parsing category', category)
    parse_category(url)

Parsing category post-production
Parsing URL: https://blog.frame.io/2020/02/17/vfx-workflow-best-practices/
Parsing URL: https://blog.frame.io/2020/02/06/jinmo-yang-road-to-oscars/
Parsing URL: https://blog.frame.io/2020/01/27/post-production-supervisor/
Parsing URL: https://blog.frame.io/2019/12/02/5-most-underrated-resolve-tools/
Parsing URL: https://blog.frame.io/2019/11/21/exclusive-interview-joker/
Parsing URL: https://blog.frame.io/2019/11/18/diagnosing-premiere-pro-problems/
Parsing URL: https://blog.frame.io/2019/11/11/bt-espn-sneakercenter/
Parsing URL: https://blog.frame.io/2019/11/07/premiere-frameio-remote-proxies/
Parsing URL: https://blog.frame.io/2019/11/04/premiere-resolve-roundtrip/
Parsing URL: https://blog.frame.io/2019/10/31/mixed-rates-resolve-part-5/
Parsing URL: https://blog.frame.io/2019/10/24/mixed-frame-rates-part-4/
Parsing URL: https://blog.frame.io/2019/10/21/emulating-film-look/
Next page: https://blog.frame.io/category/post-production/page/2/
Parsing URL:

Parsing URL: https://blog.frame.io/2016/08/16/last-best-picture-without-visual-effects/
Parsing URL: https://blog.frame.io/2016/08/05/12-steps-to-make-after-effects-faster/
Parsing URL: https://blog.frame.io/2016/04/11/frameio-for-premiere/
Parsing category color-correction
Parsing URL: https://blog.frame.io/2020/02/17/vfx-workflow-best-practices/
Parsing category business
Parsing URL: https://blog.frame.io/2020/01/27/post-production-supervisor/
Parsing URL: https://blog.frame.io/2019/09/03/boost-your-productivity-and-creativity/
Parsing URL: https://blog.frame.io/2019/08/19/avid-in-hollywood/
Parsing URL: https://blog.frame.io/2019/07/01/staff-vs-freelance-editing/
Parsing URL: https://blog.frame.io/2019/03/25/women-in-post-production/
Parsing URL: https://blog.frame.io/2019/01/22/the-most-direct-path-to-lead-editor-may-be-a-detour/
Parsing URL: https://blog.frame.io/2019/01/14/hustle-and-workflow/
Parsing URL: https://blog.frame.io/2019/01/07/5-reasons-to-move-to-la-ny/
Parsing URL: 

Parsing URL: https://blog.frame.io/2019/10/23/loom-made-in-frame/
Parsing URL: https://blog.frame.io/2019/09/16/marvelous-mrs-maisel-editors/
Parsing URL: https://blog.frame.io/2019/08/05/made-in-frame-pixcom/
Next page: https://blog.frame.io/category/behind-the-scenes/page/2/
Parsing URL: https://blog.frame.io/2019/07/08/homecoming-editors/
Parsing URL: https://blog.frame.io/2019/06/28/made-in-frame-gopro-million-dollar-challenge/
Parsing URL: https://blog.frame.io/2019/06/10/frameio-vice/
Parsing URL: https://blog.frame.io/2019/06/03/how-buzzfeed-harnessed-frameio-api/
Parsing URL: https://blog.frame.io/2019/05/13/avengers-endgame-workflow/
Parsing URL: https://blog.frame.io/2019/04/29/michel-aller-shazam-editor/
Parsing URL: https://blog.frame.io/2019/03/28/captain-marvel-editor-debbie-berman/
Parsing URL: https://blog.frame.io/2019/03/18/nicholas-monsour-us/
Parsing URL: https://blog.frame.io/2019/02/18/tom-cross-editing-first-man/
Parsing URL: https://blog.frame.io/2019/02/14/edit

In [37]:
len(articles_store)

290

In [39]:
articles_store[0]

{'reading_time': 23,
 'title': 'The Essential Guide to Building a Successful VFX Workflow',
 'date': 'February 17, 2020',
 'month': 'February',
 'weekday': 'Monday',
 'author': 'Dan Swierenga',
 'categories': ['vfx and mograph', 'workflow'],
 'word_count': 4612,
 'reading_level': 10.5,
 'link_count': 14,
 'image_count': 3}

In [43]:
import json
from pathlib import Path

# Persist the scraped article dicts as a single JSON array.
out_path = Path('my_project_env/data/articles.json')
# Create the target directory first; opening the file for writing would
# otherwise raise FileNotFoundError on a fresh checkout.
out_path.parent.mkdir(parents=True, exist_ok=True)

# Explicit encoding so the file does not depend on the platform default.
with out_path.open('w', encoding='utf-8') as f:
    json.dump(articles_store, f)