In [118]:
import requests
import pandas as pd
import numpy as np
import json
from bs4 import BeautifulSoup
import config
import logging
import importlib
import time
import random
from selenium import webdriver
from selenium.webdriver.common.by import By 
from selenium.webdriver.support.ui import WebDriverWait 
from selenium.webdriver.support import expected_conditions as EC 
from selenium.common.exceptions import TimeoutException
import validators
import pickle
import time
import random
from calendar import monthrange
import re

In [2]:
# Set logging parameters
logging.basicConfig(level=logging.INFO, filename='ft_scraping.log', filemode='w+', \
                    format='%(asctime)-15s:  %(levelname)s - %(message)s')

## Helper function to configure and format a json dict for the FT API

In [7]:
def format_query_dict(date_str):
    time_suffix = 'T00:00:00Z'
    qry_fmt_str = 'World AND lastPublishDateTime:>{}'
    full_date = date_str + time_suffix
    full_qry_str = qry_fmt_str.format(full_date)
    return  {
        "queryString": full_qry_str,
        "resultContext": {
            "aspects": ["title", "lifecycle", "location", "summary", "editorial"]
        },
        "queryContext":{
            "curations":["ARTICLES"]
        }
    }

## Code to pull out FT dicts containing article URLs and other meta-data

In [8]:
def get_ft_url_dicts(data):
    ft_url_dicts = []
    for doc in data:
        lastPubDateTime = doc['lifecycle']['lastPublishDateTime']
        lastPubDate = lastPubDateTime[0:lastPubDateTime.find('T')]
        
        url = doc['location']['uri']
        summary = ''
        if 'excerpt' in (doc['summary'].keys()):
            summary = doc['summary']['excerpt']
        
        dict_entry = dict(date=lastPubDate,
                          summary=summary,
                         url=url)
        ft_url_dicts.append(dict_entry)
    
    return ft_url_dicts

## Main driver code to get FT articles per month

In [17]:


# set up a sequence of sleep times
sequence = [x/10 for x in range(8, 14)]


ft_archive_dict_list = []

post_header = {"X-Api-Key": config.ft_api_key, "Content-Type": "application/json"}
ft_post_url = 'http://api.ft.com/content/search/v1?'


query_dict = {"queryString": "World AND lastPublishDateTime:<2019-09-03T00:00:00Z",
              "resultContext": {
                  "aspects": ["title", "lifecycle", "location", "summary", "editorial"]
              },
              "queryContext":{
                  "curations":["ARTICLES"]
              }
             }

try:
    logging.info(f"Querying for: {query_dict['queryString']}")
    response = requests.post(ft_post_url, json=query_dict, headers=post_header)

    # If the response was successful, no Exception will be raised
    response.raise_for_status()
except requests.exceptions.HTTPError as http_err:
    logging.exception(f'HTTP error occurred: {http_err}')  
except Exception as err:
    logging.exception(f'Other error occurred: {err}')  
else:
    # get json dict
    json_dict = response.json()
    f =  open('ft_json_dict.json', 'w')
    f.write(json.dumps(json_dict))
    f.close()

if 'results' in (json_dict.keys()):
    if 'results' in (json_dict['results'][0].keys()):    
        # pull out url dictionary
        data = response.json()['results'][0]['results']
        ft_archive_dict_list = get_ft_url_dicts(data)



In [20]:
for i in ft_archive_dict_list:
    print(i['date'])

2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-02
2019-09-01
2019-09-01
2019-09-01
2019-09-01
2019-09-01
2019-09-01
2019-09-01
2019-09-01
2019-09-01
2019-09-01
2019-09-01
2019-09-01

In [11]:
def ft_get_month_archive(month, year):
    
    # set up a sequence of sleep times
    sequence = [x/10 for x in range(8, 14)]

    # workaround: only go to Oct 11
    start = 0
    end = 0
    if month == 10:
        start = 1
        end = 11
    else:
        month_days = monthrange(int(year), int(month))
        start = month_days[0]
        end = month_days[1]
        
    ft_archive_dict_list = []
    
    post_header = {"X-Api-Key": config.ft_api_key, "Content-Type": "application/json"}
    ft_post_url = 'http://api.ft.com/content/search/v1?'
    
    for day in range(start, end):
        day_str = ''
        
        if day < 10:
            day_str = '0' + str(day)
        else:
            day_str = str(day)

        if int(month) < 10:
            month_str = '0' + month    
        
        query_dict = format_query_dict(f'{year}-{month_str}-{day_str}')
        
        try:
            logging.info(f"Querying for: {query_dict['queryString']}")
            response = requests.post(ft_post_url, json=query_dict, headers=post_header)
        
            # If the response was successful, no Exception will be raised
            response.raise_for_status()
        except requests.exceptions.HTTPError as http_err:
            logging.exception(f'HTTP error occurred: {http_err}')  
            continue
        except Exception as err:
            logging.exception(f'Other error occurred: {err}')  
            continue
        else:
            # get json dict
            json_dict = response.json()
            f =  open('ft_json_dict.json', 'w')
            f.write(json.dumps(json_dict))
            f.close()
            
            if 'results' in (json_dict.keys()):
                if 'results' in (json_dict['results'][0].keys()):    
                    # pull out url dictionary
                    data = response.json()['results'][0]['results']
                    ft_archive_dict_list.extend(get_ft_url_dicts(data))
            
            # sleep a random period before invoking API again
        
    return ft_archive_dict_list

## Manually invoke the code to fetch a single month's set of URLs

In [12]:
ft_archive_dict_list = ft_get_month_archive('9', '2019')

In [13]:
len(ft_archive_dict_list)

2400

## Let's save this URL list for safety

In [46]:
with open('ft_sep_archive_urls.pkl', 'wb') as f:
    pickle.dump(ft_archive_dict_list, f)

In [63]:
with open('ft_sep_archive_urls.pkl', 'rb') as f:
    ft_archive_dict_list = pickle.load(f)

In [65]:
ft_archive_dict_list[0]

{'date': '2019-10-12',
 'summary': 'Nigel Dodds, deputy leader of the Democratic Unionist Party, has voiced doubts about the idea of a “double customs” plan...',
 'url': 'https://www.ft.com/content/6cb5ea32-ed1a-11e9-ad1e-4367d8281195'}

## SELENIUM!

In [None]:
## 
## Set of methods for reference
##
# These return the first instance
# driver.find_element_by_id()
# driver.find_element_by_name()
# driver.find_element_by_xpath()
# driver.find_element_by_link_text()
# driver.find_element_by_partial_link_text()
# driver.find_element_by_tag_name()
# driver.find_element_by_class_name()
# driver.find_element_by_css_selector()
# # These return a list of all instances
# driver.find_elements_by_name()
# driver.find_elements_by_xpath()
# driver.find_elements_by_link_text()
# driver.find_elements_by_partial_link_text()
# driver.find_elements_by_tag_name()
# driver.find_elements_by_class_name()
# driver.find_elements_by_css_selector()

In [22]:
def ft_selenium_login(driver, url):
    SUBMIT_BUTTON = '//*[@id="login-form"]/div[3]/button'
    
    driver.get(url)
    username = driver.find_element_by_id("enter-email")

    next_button   = driver.find_element_by_id("enter-email-next")

    username.send_keys(config.FT_USERNAME)

    next_button.click()
    
    password = driver.find_element_by_id('enter-password')
    password.send_keys(config.FT_PASS)
    
    pwd_submit = driver.find_element_by_xpath(SUBMIT_BUTTON)
    pwd_submit.click()


## Manually do the FT login via Selenium in order to control and supervise.

In [23]:
FT_LOGIN_URL = 'https://accounts.ft.com/login'

# Get Driver
driver = webdriver.Chrome('/Users/markbrennan/Desktop/chromedriver')

# Login first (will need manual supervision):
ft_selenium_login(driver, FT_LOGIN_URL)

In [21]:
'class="o-teaser__heading"'

'class="o-teaser__heading"'

In [24]:
driver.get('https://www.ft.com/world?page=101')

In [63]:
refs = driver.find_elements_by_class_name('o-teaser__heading')

In [64]:
t = refs[0]

In [None]:
#site-content > div > div.stream__right-hand > div > div.recommended-stories--stream.o-teaser-collection.js-track-scroll-event > div > div:nth-child(9) > div > div.o-teaser__content > div.o-teaser__heading > a

In [65]:
t. find_element_by_css_selector('a').get_attribute('href')

'https://www.ft.com/content/4a9d4d4c-cd8e-11e9-b018-ca4456540ea6'

In [None]:
<a href="/content/4a4505f8-9808-11e9-8cfb-30c211dcd229" data-trackable="heading-link" class="js-teaser-heading-link">A year in pictures: how LGBT+ rights have changed</a>

In [None]:
<div class="o-teaser__heading"><a href="/content/ee1db498-ce47-11e9-99a4-b5ded7a7fe3f" data-trackable="heading-link" class="js-teaser-heading-link">Polish opposition picks candidate for prime minister</a></div>


In [None]:
//*[@id="stream"]/div[1]/ul/li[1]/div[2]/div/div/div[1]/div[2]/a

In [None]:
//*[@id="stream"]/div[1]/ul/li[2]/div[2]/div/div/div[1]/div[2]/a

In [54]:
elems = driver.find_elements_by_xpath('//*[@id="stream"]/div[1]/ul/li[2]/div[2]/div/div/div[1]/div[2]/a')

In [56]:
u = elems.pop()

In [57]:
u.

'Polish opposition picks candidate for prime minister'

In [None]:
ref.find_

In [28]:
xpath = '//*[@id="stream"]/div[1]/ul/li[2]/div[2]/div/div/div[1]/div[2]/a'

In [29]:
driver.find_elements_by_xpath(xpath)

[<selenium.webdriver.remote.webelement.WebElement (session="2c7ca6974a06e55a6346fba4308f6fdc", element="c7300799-5eed-478c-a631-901fa3670028")>]

In [30]:
urls = driver.find_elements_by_xpath(xpath)

In [32]:
urls.pop()

<selenium.webdriver.remote.webelement.WebElement (session="2c7ca6974a06e55a6346fba4308f6fdc", element="c7300799-5eed-478c-a631-901fa3670028")>

## New code to scrape World news URLs from the website

In [71]:
with open('ft_url_list.txt', 'a') as ft_url_file:
    sleep_sequence = [x/10 for x in range(6, 19)]    
    ft_url_list = []
    ft_base_url = 'https://www.ft.com/world?page={}'
    for page_num in range(2, 250):  
        url = ft_base_url.format(page_num)
        logging.info(f'Scraping url: {url}')
        try:
            driver.get(url)
            article_refs = driver.find_elements_by_class_name('o-teaser__heading')
            for ref in article_refs:
                article_url = ref.find_element_by_css_selector('a').get_attribute('href')

                ft_url_list.append(article_url)
                ft_url_file.write(article_url)
                ft_url_file.write('\n')
        except Exception as err:
            logging.exception(f'Other error occurred: {err}')
            continue
            
        time.sleep(random.choice(sleep_sequence))
    

In [72]:
len(ft_url_list)

7079

In [73]:
ad = 'adclick.g.doubleclick.net'
ig = 'ig.ft.com'
alphaville = 'ftalphaville.ft.com'
cash_trails = 'cash-trails'
podcast = 'podcast'
video = 'video'
property_listing = 'propertylistings.ft.com'


In [81]:
bad_urls = [ad,
ig,
alphaville,
cash_trails,
podcast,
video,
property_listing]

In [82]:
bad_urls

['adclick.g.doubleclick.net',
 'ig.ft.com',
 'ftalphaville.ft.com',
 'cash-trails',
 'podcast',
 'video',
 'propertylistings.ft.com']

In [74]:
t = 'https://adclick.g.doubleclick.net/pcs/click?xai=AKAOjsuAfMp-SoE9pKoWKNGlnRmC4LrY-EBrWNBbwTXBVivxaHr2HoNZKbVZfbHKlNHS_PtCs8mqnHGcX_Q30aDBjbOg32la4eWpzJLTl     yGNILQt0xwiBF_FjTNB4RotFAOlco9eorpLtlAxfHd8UYGVf5Jj9kYllfSLYdSoVOvbPo9DXoPhrLyATvkK24D4uN-fO6rrEM5hJa6AbAvhjst-wMK9SCddx1XW_6waccLJZZ1AjQU-hcLbEQ&sig=Cg0     ArKJSzPTYVqj9RFMjEAE&urlfix=1&adurl=https://www.ft.com/content/82fdd5fa-be73-11e9-9381-78bab8a70848'

In [75]:
t

'https://adclick.g.doubleclick.net/pcs/click?xai=AKAOjsuAfMp-SoE9pKoWKNGlnRmC4LrY-EBrWNBbwTXBVivxaHr2HoNZKbVZfbHKlNHS_PtCs8mqnHGcX_Q30aDBjbOg32la4eWpzJLTl     yGNILQt0xwiBF_FjTNB4RotFAOlco9eorpLtlAxfHd8UYGVf5Jj9kYllfSLYdSoVOvbPo9DXoPhrLyATvkK24D4uN-fO6rrEM5hJa6AbAvhjst-wMK9SCddx1XW_6waccLJZZ1AjQU-hcLbEQ&sig=Cg0     ArKJSzPTYVqj9RFMjEAE&urlfix=1&adurl=https://www.ft.com/content/82fdd5fa-be73-11e9-9381-78bab8a70848'

In [76]:
t.find(ad)

8

In [77]:
t = 'http://ftalphaville.ft.com/2019/10/01/1569910148000/Further-reading'

In [78]:
t.find(alphaville)

7

In [79]:
t = 'https://ig.ft.com/trump-china-tariffs'

In [80]:
t.find(ig)

8

In [88]:
new_ft_url_list = []
for url in ft_url_list:
    good = True
    for bad in bad_urls:
        if url.find(bad) > -1:
            good = False
            break
    if good:
        new_ft_url_list.append(url)


In [89]:
len(new_ft_url_list)

6146

In [92]:
with open('ft_good_url_list.pkl', 'wb') as f:
    pickle.dump(new_ft_url_list, f)

In [93]:
test_url = new_ft_url_list[0]

In [94]:
test_url

'https://www.ft.com/content/59e751c0-e8f6-11e9-a240-3b065ef5fc55'

In [95]:
new_ft_url_list[1]

'https://www.ft.com/content/d0fe931e-d946-11e9-8f9b-77216ebe1f17'

In [None]:
<span class="article-classifier__gap">Iran: hardliners’ anti-graft drive masks wider goals</span>

In [96]:
driver.get('https://www.ft.com/content/d0fe931e-d946-11e9-8f9b-77216ebe1f17')

In [97]:
article = driver.find_elements_by_class_name('article__content')[0].text

In [117]:
article

"Najmeh Bozorgmehr in Tehran 21 HOURS AGO\nPrint this page\n26\nIt wasn’t his “luxury” house that made Iranians sit up and notice. It wasn’t even the top of the range Mercedes-Benz, described by one prosecutor as payment for Ammar Salehi’s alleged role in facilitating a $26m bank fraud. What really surprised people, inured to corruption at the highest levels of power in Iran, was that Mr Salehi had been arrested at all.\nMany assumed the 39-year-old son of a former army commander was beyond the reach of the state’s anti-corruption investigators. But it seems that is no longer the case. Mr Salehi is among hundreds of people with links to senior figures who are facing charges in an anti-corruption drive targeting the country’s political and military elite. With the US tightening sanctions, some in the regime believe the crackdown is essential to the Islamic republic’s survival.\nCharged with “collaboration in disrupting the economic system”, Mr Salehi is accused of misusing his links to 

In [102]:
driver.find_elements_by_class_name('article-classifier__gap')[0].text

'Iran: hardliners’ anti-graft drive masks wider goals'

In [112]:
driver .get('https://www.ft.com/content/59e751c0-e8f6-11e9-a240-3b065ef5fc55')

In [104]:
a2 = driver.find_elements_by_class_name('article__content')[0].text

In [106]:
driver.find_elements_by_class_name('article-classifier__gap')[0].text

'Argentina’s economic woes spell doom for Macri’s election prospects'

In [None]:
<time class="article-info__timestamp o-date" data-o-component="o-date" datetime="2019-07-02T19:32:36Z" data-o-date-js="" title="July 2 2019 3:32 pm" aria-label="July 2 2019">July 2 2019</time>

In [109]:
driver.get('https://www.ft.com/content/fdfbf65c-9370-11e9-b7ea-60e35ef678d2')

In [110]:
driver.find_elements_by_class_name('article-info__timestamp o-date')

[]

In [None]:
datetime

In [113]:
driver.find_element_by_css_selector('time').get_attribute('datetime')

'2019-10-13T09:01:16Z'

In [114]:
d = '2019-10-13T09:01:16Z'

In [115]:
d.find('T')

10

In [116]:
d[0:10]

'2019-10-13'

## New code to scrape FT pages from URL list

In [119]:
test = 'Najmeh Bozorgmehr in Tehran 21 HOURS AGO\nPrint this page\n26\nIt wasn’t his “luxury” house that made Iranians sit'

In [140]:
pat2 = r'^.*HOURS AGO\\nPrint this page'

In [160]:
pat = r'\w+.*\\nPrint this page'

In [149]:
p = re.compile(pat)

In [150]:
p.sub('', test)

'Najmeh Bozorgmehr in Tehran 21 HOURS AGO\nPrint this page\n26\nIt wasn’t his “luxury” house that made Iranians sit'

In [157]:
re.search('\w+.*\\nPrint this page', test)

<re.Match object; span=(0, 56), match='Najmeh Bozorgmehr in Tehran 21 HOURS AGO\nPrint t>

In [162]:
m = re.match(pat, test)

In [172]:
re.sub('\w+.*\\nPrint this page', '', test)

'\n26\nIt wasn’t his “luxury” house that made Iranians sit'

In [173]:
pat = '\w+.*\\nPrint this page'

In [174]:
p = re.compile(pat)

In [175]:
p.sub('', test)

'\n26\nIt wasn’t his “luxury” house that made Iranians sit'

In [169]:
re.sub(pat, '', test)

'Najmeh Bozorgmehr in Tehran 21 HOURS AGO\nPrint this page\n26\nIt wasn’t his “luxury” house that made Iranians sit'

In [182]:
t = "21\nArgentina\u2019s president Mauricio Macri sounded almost apologetic as     he addressed a crowd in the heart of the country\u2019s Malbec wine-growing region, to make his case for another term.\n\u201cWe all know that recent tim    es have been difficult, especially the last year and a half\u2009.\u2009.\u2009.\u2009but I want to tell you that I have listened to you and I have unders    tood, I have taken note and I have comprehended.\u201d\n\u201cNow something different is coming,\u201d he added, promising a changed approach.\nBut everyt    hing suggests that the \u201csomething different\u201d in store for Argentina is Mr Macri\u2019s main opponent; Alberto Fern\u00e1ndez, a leftwing Peronis    t running on a ticket with ex-president Cristina Fern\u00e1ndez de Kirchner. He is the out-and-out favourite to win the election on October 27.\nA second     consecutive year of recession, a sharp devaluation of the peso, a record-breaking $57bn IMF bailout, rising poverty and worsening unemployment would be di    re for any candidate seeking re-election. But for a scion of one of the country\u2019s wealthiest families, they are especially toxic. Hence Mr Macri\u201    9s contrition.\nThe middle-class crowd of a few thousand gathered in a square in the city of Mendoza cheered dutifully and waved banners with the campaign     slogan \u201cYes we can\u201d as the president worked his way through a stump speech that lasted barely 20 minutes, his voice failing at times. His wife     Juliana Awada, clad in designer black, laid a comforting hand on his shoulder, exuding the effortless millionaire elegance that won her a Vogue \u201cbest     dressed\u201d acclamation. A drone hovered overhead gathering images for use on social media. The warm-up act came courtesy of the son of a soy baron.\nR    ecommended\nWeekend long reads\nPoverty, priests and politics: why Peronism"

SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 1175-1179: truncated \uXXXX escape (<ipython-input-182-c5d46e2bbb3f>, line 1)

In [None]:
t

In [142]:
p2 = re.compile(pat2)

In [143]:
t2 = p2.sub('', test)

In [144]:
t2

'Najmeh Bozorgmehr in Tehran 21 HOURS AGO\nPrint this page\n26\nIt wasn’t his “luxury” house that made Iranians sit'

In [139]:
test.strip()

'Najmeh Bozorgmehr in Tehran 21 HOURS AGO\nPrint this page\n26\nIt wasn’t his “luxury” house that made Iranians sit'

In [176]:
def ft_scrape_pages(url_list, driver):
    sleep_sequence = [x/10 for x in range(6, 19)]

    section = 'World'

    ft_data_file = 'ft_data.json'
    
    pat = '\w+.*\\nPrint this page'
    p = re.compile(pat)

    with open(ft_data_file, 'a') as ft_data:
        for url in url_list:
            logging.info(f'Getting page to scrape: {url}')

            if validators.url(url) != True: 
                logging.exception(f'URL is invalid!  URL: {url}')
                continue

            try:
                # Use Selenium to scrape
                driver.get(url)
                
                # get article
                article = p.sub('', driver.find_elements_by_class_name('article__content')[0].text)
                
                # get date:
                full_dt = driver.find_element_by_css_selector('time').get_attribute('datetime')
                # remove timestamp portion following 'T': "2019-10-13T09:01:16Z"
                dt = full_dt[0:full_dt.find('T')]
                
                # get headline
                headline = driver.find_elements_by_class_name('article-classifier__gap')[0].text
                
                text_entry = dict(paper='FT',
                                  date=dt,
                                  section=section,
                                  url=url,
                                  headline=headline,
                                  text=article.strip())

                logging.info(f"date: {dt} | section: {section} | url: {url} | headline: {headline} | text: {article[0:20]}")

                ft_data.write(json.dumps(text_entry))
                ft_data.write('\n')

                # now sleep a random period of time
                time.sleep(random.choice(sleep_sequence))

            except Exception as err:
                logging.exception(f'Other error occurred: {err}')
                continue


In [177]:
len(new_ft_url_list)

6146

In [178]:
test_url_list = new_ft_url_list[0:10]

In [183]:
ft_scrape_pages(test_url_list, driver)

In [184]:
ft_url_list = new_ft_url_list[11:2239]

In [185]:
ft_scrape_pages(ft_url_list, driver)

## Main FT Article Scraping Driver Function

In [51]:
def ft_scrape(month_list, driver): 
    sleep_sequence = [x/10 for x in range(6, 19)]

    year = '2019'
    section = 'World'

    ft_archive_file = 'ft_url_archive.json'
    ft_data_file = 'ft_data.json'

    with open(ft_archive_file, 'a') as ft_archive, open(ft_data_file, 'a') as ft_data:
        for month in month_list:
            logging.info(f'Calling API for month: {month}')

            # we already have October's url archive, so simply stub it to use
            # it first for testing
            ft_archive_dict_list = T_ft_get_month_archive(month, year)

            ft_archive.write(json.dumps(ft_archive_dict_list))
            ft_archive.write('\n')

            logging.info(f'Fetched {len(ft_archive_dict_list)} article URL entries')
            logging.info('Fetching each article...')


            for item in ft_archive_dict_list:
                url = item['url']
                logging.info(f'Getting page to scrape: {url}')

                if validators.url(url) != True: 
                    logging.exception(f'URL is invalid!  URL: {url}')
                    continue

                try:
                    # Use Selenium to scrape
                    driver.get(url)
                    article = driver.find_elements_by_class_name('article__content')[0].text
                    dt = item['date']
                    headline = item['summary']
                    text_entry = dict(paper='FT',
                                      date=dt,
                                      section=section,
                                      url=url,
                                      headline=headline,
                                      text=article.strip())

                    logging.info(f"date: {dt} | section: {section} | url: {url} | headline: {headline} | text: {article[0:20]}")

                    ft_data.write(json.dumps(text_entry))
                    ft_data.write('\n')

                    # now sleep a random period of time
                    time.sleep(random.choice(sleep_sequence))

                except Exception as err:
                    logging.exception(f'Other error occurred: {err}')
                    continue


In [52]:
ft_scrape(['9'], driver)

KeyboardInterrupt: 

In [55]:
driver.quit()