In [31]:
import os
import re
import time
import json
import requests

from bs4 import BeautifulSoup

In [16]:
# Directory that receives the URL list and the scraped article files.
news_fdir = '../data/raw/news/bbc/'
# Text file holding one article URL per line.
url_fpath = '%surls.txt' % news_fdir

# Request headers: present the scraper to the server as a desktop Chrome browser.
agent = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
    ),
}

# Root sitemap index from which the article URLs are collected.
sitemap_url = 'https://www.bbc.com/sitemaps/https-index-com-archive.xml'

In [18]:
def find_urls(page_url, timeout=30):
    """Return the text of every ``<loc>`` element of the XML document at ``page_url``.

    Args:
        page_url: Address of the XML document (a sitemap or sitemap index).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection, freezing the whole crawl.

    Returns:
        list[str]: One entry per ``<loc>`` element found in the response.

    Raises:
        requests.exceptions.RequestException: If the request fails or times out.
    """
    page = requests.get(page_url, headers=agent, timeout=timeout)
    loc_tags = BeautifulSoup(page.content, features='xml').find_all('loc')
    return [tag.text for tag in loc_tags]

In [29]:
def get_content(url, timeout=30):
    """Download a news article page and extract its date, title and body text.

    Args:
        url: Address of the article page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection, which would freeze a long scraping run.

    Returns:
        dict: ``{'date': ..., 'title': ..., 'text': ...}``. ``text`` is the
        content of every ``div`` whose ``data-component`` is ``text-block``,
        joined by spaces, with each run of whitespace collapsed to one space.

    Raises:
        AttributeError: If the page has no ``<time data-testid="timestamp">``
            or no ``<h1 id="main-heading">`` element (``soup.find`` returns
            ``None`` in that case).
        requests.exceptions.RequestException: If the request fails or times out.
    """
    page = requests.get(url, headers=agent, timeout=timeout)
    soup = BeautifulSoup(page.content, 'html.parser')

    date = soup.find('time', {'data-testid': 'timestamp'}).text
    title = soup.find('h1', id='main-heading').text

    text_blocks = soup.find_all('div', {'data-component': 'text-block'})
    text = ' '.join(block.text for block in text_blocks)
    # Normalise newlines, tabs and repeated blanks to single spaces.
    text = re.sub(r'[\s]+', ' ', text)

    return {'date': date,
            'title': title,
            'text': text}

In [20]:
def scrape_sitemap(sitemap_url, re_scrape=False):
    """Return the list of news article URLs, from the cached file or the sitemap.

    When ``urls.txt`` already exists under ``news_fdir`` and ``re_scrape`` is
    false, the list is read from that file. Otherwise every page listed in the
    root sitemap is fetched with ``find_urls``, the URLs starting with
    ``https://www.bbc.com/news/`` are kept, and the result is written to
    ``urls.txt`` (one URL per line) before being returned.

    Args:
        sitemap_url: Address of the root sitemap index.
        re_scrape: If true, ignore an existing ``urls.txt`` and crawl again.

    Returns:
        list[str]: The article URLs.
    """
    # Built from ``news_fdir`` here rather than read from the module-level
    # ``url_fpath``.
    url_fpath = news_fdir + 'urls.txt'
    if os.path.exists(url_fpath) and not re_scrape:
        print('List of URLs already exists. Loading the URL list...')

        with open(url_fpath, 'r') as f:
            # Drop empty entries: an empty file or a trailing newline would
            # otherwise produce '' items that are not URLs.
            url_list = [line for line in f.read().split('\n') if line]
        print('Loading URL list: Done\n')

    else:
        print('Scraping sitemap starts from the scratch.')
        page_list = find_urls(sitemap_url)

        url_list = []
        for idx, page in enumerate(page_list):
            print('Scraping %d-th page from the root sitemap, out of total %d pages...' %
                  (idx+1, len(page_list)), end='\r')

            # Keep only the links that point at the news section.
            url_list.extend(url for url in find_urls(page)
                            if url.startswith('https://www.bbc.com/news/'))
        print('\nScraping sitemap: Done\n')

        print('Saving URLs list at: %s' % url_fpath)
        os.makedirs(news_fdir, exist_ok=True)
        with open(url_fpath, 'w') as f:
            f.write('\n'.join(url_list))
        print('Saving URLs list: Done\n')

    return url_list

In [25]:
def _last_scraped_index(file_names):
    """Return the highest index among ``<digits>.json`` names, or -1 if there is none."""
    indices = [int(name[:-5]) for name in file_names
               if name.endswith('.json') and name[:-5].isdecimal()]
    return max(indices, default=-1)


def scrape_urls(url_list):
    """Download every URL in ``url_list`` and save each article as a JSON file.

    The article at position ``idx`` is written to ``news_fdir`` as
    ``'%.8d.json' % idx``. Scraping resumes after the highest-numbered JSON
    file already present in ``news_fdir``. URLs whose download or parsing
    raises an exception are skipped.

    Args:
        url_list: Article URLs; a URL's position determines its file name.

    Returns:
        None
    """
    print('Scarping total %d URLs starts.' % len(url_list))
    # The directory may not exist yet when the URL list comes from elsewhere.
    os.makedirs(news_fdir, exist_ok=True)

    # Look only at '<digits>.json' names, so that urls.txt or any stray file
    # in the directory cannot be mistaken for the last scraped article.
    last_idx = _last_scraped_index(os.listdir(news_fdir))
    if last_idx < 0:
        print('Scraping URLs starts from the scratch.\n')
    else:
        print('Scraping URLs continues from %d\n' % last_idx)

    n_failed = 0
    for idx, url in enumerate(url_list):
        if idx <= last_idx:
            continue

        print('Scraping contents from %d-th URL out of %d URLs...' %
              (idx+1, len(url_list)), end='\r')
        try:
            content = get_content(url)

            fname = news_fdir + '%.8d.json' % idx
            with open(fname, 'w') as f:
                json.dump(content, f, indent=4)

        except Exception:
            # Best effort: one page that cannot be fetched or parsed must not
            # stop the run, but keep count so the loss is visible at the end.
            n_failed += 1

        # Pause after every request, failed ones included, so that a streak
        # of errors does not turn into back-to-back requests to the server.
        time.sleep(0.1)
    print('\nScraping contents: Done\n')
    if n_failed:
        print('%d URLs could not be scraped and were skipped.' % n_failed)

In [22]:
# Load the article URLs; re_scrape=False reuses urls.txt when it already exists.
url_list = scrape_sitemap(sitemap_url, re_scrape=False)

List of URLs already exists. Loading the URL list...
Loading URL list: Done



In [34]:
# Download the articles behind the collected URLs into news_fdir.
scrape_urls(url_list)

Scarping total 1340806 URLs starts.
Scraping URLs continues from 20

Scraping contents from 42-th URL out of 1340806 URLs...

KeyboardInterrupt: 

In [32]:
# Try the extractor on a single article.
# NOTE(review): no cell above this one assigns `url`; the call relies on a
# value left in the kernel by a cell that appears further down.
get_content(url)

{'date': '6 May 2010',
 'title': "Durham cricket club passes to India media moguls' hands",
 'text': 'On the pitch Durham are going about a spirited defence of their status as county champions. At the club\'s Riverside ground Liam Plunkett is running in with some hostility to his bowling. Ian Blackwell is bringing new meaning to the word "yeomanlike" with his batting and bowling, and in spite of the north-eastern chill crowds are being counted in their hundreds. But behind the playing scene there are big financial changes going on at the game\'s newest first-class county. Quietly and without publicity the club has passed into foreign hands. Durham is now more than 90% owned by the Indian media figure Gautam Radia and his brother Hiren. Their stake cost them just short of £2.5m. According to Durham\'s chief executive David Harker the two are associates of the club\'s chairman Clive Leach - a former media man himself - and apparently little more than sleeping investors. "They tend not to

In [4]:
def find_urls(page_url, timeout=30):
    """Collect the text of all ``<loc>`` tags in the XML served at ``page_url``.

    Args:
        page_url: Address of the XML document to download.
        timeout: Seconds to wait for the server; prevents ``requests.get``
            from blocking indefinitely on a stalled connection.

    Returns:
        list[str]: The text of each ``<loc>`` tag found.

    Raises:
        requests.exceptions.RequestException: If the request fails or times out.
    """
    page = requests.get(page_url, headers=agent, timeout=timeout)
    tags = BeautifulSoup(page.content, features='xml').find_all('loc')
    urls = [tag.text for tag in tags]
    return urls

In [6]:
# Entries of the root sitemap index (the <loc> values it lists).
page_list = find_urls(sitemap_url)

In [11]:
# Pick one entry of the sitemap index to inspect by hand.
page = page_list[50]

In [12]:
# Links of the selected sitemap page that belong to the news section.
list(filter(lambda link: link.startswith('https://www.bbc.com/news/'), find_urls(page)))

['https://www.bbc.com/news/uk-politics-eu-referendum-36017170',
 'https://www.bbc.com/news/world-europe-36086855',
 'https://www.bbc.com/news/world-asia-35994375',
 'https://www.bbc.com/news/technology-36159146',
 'https://www.bbc.com/news/business-36024413',
 'https://www.bbc.com/news/uk-england-essex-36161503',
 'https://www.bbc.com/news/uk-northern-ireland-35969816',
 'https://www.bbc.com/news/uk-wales-36016333',
 'https://www.bbc.com/news/world-europe-36112451',
 'https://www.bbc.com/news/world-asia-36181638',
 'https://www.bbc.com/news/world-europe-36122928',
 'https://www.bbc.com/news/blogs-trending-36045747',
 'https://www.bbc.com/news/world-europe-36128489',
 'https://www.bbc.com/news/world-asia-36127024',
 'https://www.bbc.com/news/world-asia-36176772',
 'https://www.bbc.com/news/world-asia-36253681',
 'https://www.bbc.com/news/world-africa-36255476',
 'https://www.bbc.com/news/magazine-36249697',
 'https://www.bbc.com/news/world-middle-east-36231986',
 'https://www.bbc.com/ne

In [None]:
# Check whether the first link of the first sitemap page is a news URL.
# NOTE(review): `post_urls` is not assigned in any cell above this one.
find_urls(post_urls[0])[0].startswith('https://www.bbc.com/news/')

In [None]:
# A single sitemap page, used to try out the URL filter in the next cell.
post_url = 'https://www.bbc.com/sitemaps/https-sitemap-com-archive-97.xml'

In [None]:
# News-section links found in the sitemap page `post_url`.
[link for link in find_urls(post_url) if link.startswith('https://www.bbc.com/news/')]

In [None]:
# Walk every sitemap page and accumulate the news-section links in `urls`.
# NOTE(review): `post_urls` is not assigned in any cell above this one.
urls = []
for idx, post_url in enumerate(post_urls):
    # end='\r' rewrites the same output line, so progress does not flood the cell.
    print('Scraping urls from %d-th page out of %d pages...' % (idx+1, len(post_urls)), end='\r')
    urls += list(filter(lambda x: x.startswith('https://www.bbc.com/news/'), find_urls(post_url)))
print('\nScraping urls: Done')

In [None]:
# One article address for trying the content extraction by hand.
url = 'https://www.bbc.com/news/10096909'

In [None]:
# NOTE(review): this rebinds `url_fpath`, which the configuration cell set to
# '../data/raw/news/bbc/urls.txt'; the cells below read and write the mb path.
url_fpath = '../data/news/mb/urls.txt'

In [None]:
# Write the collected URLs to `url_fpath`, one per line (no trailing newline).
with open(url_fpath, 'w') as f:
    f.write('\n'.join(urls))
print('Saving urls at: %s' % url_fpath)

In [None]:
# Read the URL list back from `url_fpath`, one entry per line.
with open(url_fpath, 'r') as f:
    urls = f.read().split('\n')

In [None]:
def get_content(url, timeout=30):
    """Download an article page and return its date, title and text.

    Args:
        url: Address of the article page.
        timeout: Seconds to wait for the server; prevents ``requests.get``
            from blocking indefinitely on a stalled connection.

    Returns:
        dict: ``{'date': ..., 'title': ..., 'text': ...}``.

    Raises:
        AttributeError: If any of the three elements is missing from the page
            (``soup.find`` returns ``None`` in that case).
        requests.exceptions.RequestException: If the request fails or times out.
    """
    page = requests.get(url, headers=agent, timeout=timeout)
    soup = BeautifulSoup(page.content, 'html.parser')
    # 'timestamp' (not 'timesp') is the data-testid of the <time> element; the
    # misspelled value never matched, so every call raised AttributeError.
    content_dict = {'date': soup.find('time', {'data-testid': 'timestamp'}).text,
                    'title': soup.find('h1', id='main-heading').text,
                    # 'author': soup.find('em').text,
                    # NOTE(review): the other get_content in this notebook reads
                    # the body from 'text-block' divs instead - confirm this
                    # <section> selector matches the pages being scraped.
                    'text': soup.find('section', class_='article-content').text}
    return content_dict

In [None]:
# Step-by-step version of get_content for the single article in `url`.
page = requests.get(url, headers=agent)
soup = BeautifulSoup(page.content, 'html.parser')
# The date comes from the first <time> element of the page (no attribute filter).
content_dict = {'date': soup.find('time').text,
                'title': soup.find('h1', id='main-heading').text,
#                     'author': soup.find('em').text,
                'text': soup.find('section', class_='article-content').text}

In [None]:
# Body text rebuilt from the 'text-block' divs of the parsed page in `soup`.
' '.join([element.text for element in soup.find_all('div', {'data-component': 'text-block'})])

In [None]:
# Download every URL in `urls`; article number idx is saved as '<8-digit idx>.json'.
for idx, url in enumerate(urls):
    print('Scraping contents from %d-th url out of %d urls...' % (idx+1, len(urls)), end='\r')
    try:
        content = get_content(url)
        fname = '../data/news/mb/%.8d.json' % idx
        with open(fname, 'w') as f:
            json.dump(content, f, indent=4)
    except Exception:
        # Best effort: skip pages that cannot be fetched or parsed. Catching
        # Exception rather than using a bare `except:` lets KeyboardInterrupt
        # through, so the loop can still be stopped by hand.
        pass
    # Pause after every request, failed ones included, to avoid sending
    # back-to-back requests during a streak of errors.
    time.sleep(0.1)
print('\nScraping contents: Done')

In [None]:
# Highest article index already saved, or -1 when nothing has been saved yet.
# Only '<digits>.json' names are considered, so urls.txt, an empty directory
# or any stray file can no longer cause an IndexError/ValueError.
last_idx = max(
    (int(name[:-5]) for name in os.listdir('../data/news/mb/')
     if name.endswith('.json') and name[:-5].isdecimal()),
    default=-1,
)

In [None]:
# Is a variable named `last_idx` defined? globals() is keyed by name, so the
# name must be passed as a string; the unquoted form raised NameError when the
# variable was missing and otherwise looked up its integer value as a key.
'last_idx' in globals()

In [None]:
# Resume the download: skip every index up to `last_idx`, then save each
# remaining article as '<8-digit idx>.json'.
for idx, url in enumerate(urls):
    if idx <= last_idx:
        continue

    print('Scraping contents from %d-th url out of %d urls...' % (idx+1, len(urls)), end='\r')
    try:
        content = get_content(url)
        fname = '../data/news/mb/%.8d.json' % idx
        with open(fname, 'w') as f:
            json.dump(content, f, indent=4)
    except Exception:
        # Best effort: skip pages that cannot be fetched or parsed. Catching
        # Exception rather than using a bare `except:` lets KeyboardInterrupt
        # through, so the loop can still be stopped by hand.
        pass

    # Pause after every request, failed ones included, to avoid sending
    # back-to-back requests during a streak of errors.
    time.sleep(0.1)
print('\nScraping contents: Done')

In [None]:
# Index right after the last saved one.
last_idx + 1

In [None]:
# Preview of the resume message used by the script version below.
print('List of URLs already exists. Scraping continues from %d-th page.' % (last_idx + 1))

In [None]:
#!/usr/bin/env python
# coding: utf-8

import os
import time
import json
import requests
import pandas as pd
from bs4 import BeautifulSoup

def find_urls(page_url, timeout=30):
    """Fetch the XML at ``page_url`` and return the text of its ``<loc>`` tags.

    Args:
        page_url: Address of the XML document to download.
        timeout: Seconds to wait for the server; without it ``requests.get``
            can block indefinitely and stall the script.

    Returns:
        list[str]: The text of each ``<loc>`` tag found.

    Raises:
        requests.exceptions.RequestException: If the request fails or times out.
    """
    # `agent` is assigned further down in this script, before the first call.
    page = requests.get(page_url, headers=agent, timeout=timeout)
    tags = BeautifulSoup(page.content, features='xml').find_all('loc')
    return [tag.text for tag in tags]

def get_content(url, timeout=30):
    """Download an article page and return its date, title and text.

    Args:
        url: Address of the article page.
        timeout: Seconds to wait for the server; without it ``requests.get``
            can block indefinitely and stall the script.

    Returns:
        dict: ``{'date': ..., 'title': ..., 'text': ...}`` taken from the
        ``<p class="published">``, ``<h2 class="title">`` and
        ``<section class="article-content">`` elements.

    Raises:
        AttributeError: If any of the three elements is missing from the page
            (``soup.find`` returns ``None`` in that case).
        requests.exceptions.RequestException: If the request fails or times out.
    """
    page = requests.get(url, headers=agent, timeout=timeout)
    soup = BeautifulSoup(page.content, 'html.parser')
    content_dict = {'date': soup.find('p', class_='published').text,
                    'title': soup.find('h2', class_='title').text,
                    # 'author': soup.find('em').text,
                    'text': soup.find('section', class_='article-content').text}
    return content_dict

# Output locations: the directory for the article files and the URL list inside it.
news_fdir = '../data/news/mb/'
url_fpath = '%surls.txt' % news_fdir

# Request headers presenting the scraper as a desktop Chrome browser.
agent = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
    ),
}
sitemap_url = 'https://mb.com.ph/wp-sitemap.xml'

# Entries of the root sitemap whose address contains 'posts-post'.
post_urls = list(filter(lambda link: 'posts-post' in link, find_urls(sitemap_url)))

# Scraping urls from the sitemap
if os.path.exists(url_fpath):
    with open(url_fpath, 'r') as f:
        # Drop empty entries: an empty file or a trailing newline would
        # otherwise produce '' items that are not URLs.
        urls = [line for line in f.read().split('\n') if line]
    # Resume after the highest-numbered '<digits>.json' already saved, or from
    # the start (-1) when there is none. Looking only at those names avoids the
    # IndexError raised when the directory holds nothing but urls.txt.
    last_idx = max(
        (int(name[:-5]) for name in os.listdir('../data/news/mb/')
         if name.endswith('.json') and name[:-5].isdecimal()),
        default=-1,
    )
    print('List of URLs already exists. Scraping continues from %d-th page.' % (last_idx + 1))
else:
    urls = []
    for idx, post_url in enumerate(post_urls):
        print('Scraping urls from %d-th page out of %d pages...' % (idx+1, len(post_urls)), end='\r')
        urls += find_urls(post_url)
    print('\nScraping %d urls: Done' % len(urls))

    # The output directory may not exist on a first run.
    os.makedirs(news_fdir, exist_ok=True)
    with open(url_fpath, 'w') as f:
        f.write('\n'.join(urls))
    print('Saving urls at: %s' % url_fpath)

    last_idx = -1

# Scraping contents from the urls
for idx, url in enumerate(urls):
    # Everything up to `last_idx` was handled by a previous run.
    if idx <= last_idx:
        continue

    print('Scraping contents from %d-th url out of %d urls...' % (idx+1, len(urls)), end='\r')
    try:
        content = get_content(url)
        fname = '../data/news/mb/%.8d.json' % idx
        with open(fname, 'w') as f:
            json.dump(content, f, indent=4)
    except Exception:
        # Best effort: skip pages that cannot be fetched or parsed. Catching
        # Exception rather than using a bare `except:` lets KeyboardInterrupt
        # and SystemExit through, so the script can still be stopped.
        pass

    # Pause after every request, failed ones included, to avoid sending
    # back-to-back requests during a streak of errors.
    time.sleep(0.1)
print('\nScraping contents: Done')
