In [None]:
import datetime
import os
import re
import time

import numpy as np
import pandas as pd
import urllib.request

from bs4 import BeautifulSoup

In [245]:
class BingHeadlineGetter:
    """Collect news headlines from Bing's infinite-scroll news endpoint.

    The class is used purely through its class/static methods; it holds no
    instance state. ``get_headlines`` is the public entry point.
    """

    # delay in seconds for each http request sent
    LOWER_REQUEST_DELAY = 0.5
    UPPER_REQUEST_DELAY = 1.5

    # bing constant for the infinite scroller
    SCROLL_SIZE = 10

    # captures the number that follows ``first=`` in a search url
    SCROLL_INDEX_REGEX = re.compile(r'.*first=(\d+).*')

    # have the replaces so that we can get it all on one line
    # (note: this attribute is the bound ``str.format`` of the template, so it
    # is called like a function: ``SEARCH_URL(term=...)``)
    SEARCH_URL = '''https://www.bing.com/news/infinitescrollajax?qs=n&form=NWRFSH&sp=-1&
        ghc=1&pq={term:s}&sc=8-6&sk=&cvid=FE95A940E6DD4103AD0FC2A09728BD5B&
        InfiniteScroll=1&q={term:s}&first=11&IG=B131BA2E33924919A80CAF9FBCB2EFB4&IID=NEWS.302&SFX=1&PCW=794'''\
        .replace('\n', '').replace('\t', '').replace(' ', '').format

    @classmethod
    def _get_delay(cls):
        """Return a random pause, in seconds, drawn uniformly from the
        ``[LOWER_REQUEST_DELAY, UPPER_REQUEST_DELAY)`` interval."""
        return np.random.uniform(low=cls.LOWER_REQUEST_DELAY, high=cls.UPPER_REQUEST_DELAY)

    @classmethod
    def get_url(cls, search_term):
        """Build the url of the first result page for ``search_term``.

        The term is percent-encoded (spaces become ``+``) so that characters
        such as ``&``, ``=``, ``#`` or non-ASCII letters cannot break or alter
        the query string.
        """
        # local import keeps the block self-contained
        from urllib.parse import quote_plus

        return cls.SEARCH_URL(term=quote_plus(search_term))

    @staticmethod
    def _get_headlines(soup):
        """Return the text of every ``<a>`` tag in ``soup`` whose class
        attribute is exactly ``title``."""
        return [
            anchor.text
            for anchor in soup.find_all('a')
            if anchor.get('class') == ['title']
        ]

    @classmethod
    def get_headlines(cls, search_term, num_pages):
        """Fetch ``num_pages`` consecutive result pages and return their headlines.

        Parameters
        ----------
        search_term : str
            Text to search for.
        num_pages : int
            Number of pages to request; must be positive. One http request is
            sent per page.

        Returns
        -------
        list of str
            Headlines in the order they were encountered (duplicates are kept).

        Raises
        ------
        AssertionError
            If ``num_pages`` is not positive.
        """
        assert num_pages > 0

        url = cls.get_url(search_term)
        scroll_index = int(cls.SCROLL_INDEX_REGEX.match(url).group(1))

        headlines = list()

        for page_number in range(num_pages):
            if page_number > 0:
                # let's try not to get noticed... (only needed between
                # requests, so there is no pause after the final page)
                time.sleep(cls._get_delay())

            with urllib.request.urlopen(url) as response:
                html = response.read()

            soup = BeautifulSoup(html, 'lxml')
            headlines.extend(cls._get_headlines(soup))

            # The tracked index must advance by the same amount as the url;
            # otherwise the next replace() finds nothing to substitute and the
            # same page is downloaded over and over.
            next_index = scroll_index + cls.SCROLL_SIZE
            url = url.replace('first={:d}'.format(scroll_index), 'first={:d}'.format(next_index))
            scroll_index = next_index

        return headlines

In [266]:
# What to search for, and how many result pages (one http request each) to pull.
search_term = 'brexit'
num_pages = 200

headlines = BingHeadlineGetter.get_headlines(
    search_term=search_term,
    num_pages=num_pages,
)

In [264]:
def store_headlines(headlines, filename,
                    directory='C:/Users/daonw/Documents/SentimentAnalysisData'):
    """Pickle ``headlines`` as a pandas Series in a date-stamped file.

    Parameters
    ----------
    headlines : sequence
        Values to store; they are wrapped in a ``pd.Series``.
    filename : str
        Base name of the file. The current date and the ``.pkl`` extension are
        appended, giving ``<filename>-YYYY-MM-DD.pkl``.
    directory : str, optional
        Folder the file is written to. Defaults to the location this function
        has always used, so existing calls keep working; pass another path to
        run it on a different machine.

    Notes
    -----
    Saving twice on the same day with the same ``filename`` overwrites the
    earlier file.
    """
    # create the folder on first use so that to_pickle does not fail
    os.makedirs(directory, exist_ok=True)

    dated_name = '{:s}-{:%Y-%m-%d}.pkl'.format(filename, datetime.datetime.now())
    location = os.path.join(directory, dated_name)

    pd.Series(headlines).to_pickle(location)

In [271]:
# Persist the scraped headlines under a date-stamped 'brexit_headlines' file.
store_headlines(headlines=headlines, filename='brexit_headlines')

In [270]:
# Show the full list of scraped headlines as the cell output.
headlines

 "Anti-Brexit campaigner seeks to 'end the chaos'",
 'Yellowhammer: the Brexit bird with a story to tell about the EU',
 'EU Brexit negotiator reveals ‘divorce deal’ with Britain will go through in the next few weeks',
 "Former British foreign secretary Boris Johnson condemned for Brexit 'suicide vest' comment",
 "Brexit deal 'realistic within 6-8 weeks'",
 'Brexit is a disaster and I would love to reverse it, says actress Celia Imrie',
 'Brexit will strike a discordant note for Britain’s musical relationship with Europe',
 'Tories ban anti-Brexit group Best For Britain from their party conference',
 '2m Brits need to renew their passports immediately or risk being barred from Europe if we fail to get a Brexit d…',
 'JPMorgan and Citi say just hundreds of jobs will leave London due to Brexit — not thousands',
 'May must walk tall in the mountains and sell her Brexit plan to EU in Salzburg showdown',
 'Brexit rebels in UK PM May’s party discuss ousting her',
 'No-deal Brexit could see U