### Imports

In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import time
import os
from google.cloud import bigquery
from google.oauth2 import service_account

### Helper functions

In [2]:
def get_html_from_page(url):
    '''
    Returns a BeautifulSoup object given an URL in string format.

    :param url: string

    :return: BeautifulSoup object
    '''
    # The context manager closes the connection as soon as the body is read,
    # instead of leaving it open until the response object is garbage collected
    with urlopen(url) as response:
        html = response.read()

    return BeautifulSoup(html, 'html.parser')

In [3]:
def find_trash_list(raw_urls):
    '''
    Collects the URLs of `raw_urls` that will be discarded.
    A URL is discarded when it does not contain `/news/`.

    :param raw_urls: list

    :return: list
    '''
    return [link for link in raw_urls if '/news/' not in link]

In [4]:
def remove_items(raw_urls, trash_list):
    '''
    Builds a copy of list `raw_urls` without the elements of list `trash_list`.
    Each element of `trash_list` removes one occurrence (the first one still
    present) from the copy; `raw_urls` itself is left untouched.

    :param raw_urls: list
    :param trash_list: list

    :return: list
    '''
    remaining = list(raw_urls)

    for unwanted in trash_list:
        # `index` raises ValueError when the element is not in the list
        del remaining[remaining.index(unwanted)]

    return remaining

### Crawling the main page to collect the URLs of all articles

In [5]:
site_url = 'https://www.bbc.com'

# Getting the HTML from main page
soup = get_html_from_page(site_url)

raw_urls = []

# Fetch the tag where the links are
bs_news = soup.find_all('h3', {'class': 'media__title'})

for item in bs_news:
    # A title without a link (or with an empty `href`) has nothing to crawl,
    # so it is skipped instead of raising an error
    href = item.a.get('href') if item.a is not None else None
    if not href:
        continue

    # Links starting with `/` are relative: the site address is prefixed to them
    if href.startswith('/'):
        raw_urls.append(site_url + href)
    else:
        raw_urls.append(href)

In [6]:
# Keeping only the URLs that contain `/news/`:
# `trash_list` receives every URL without `/news/`, then `urls` receives a copy
# of `raw_urls` without those entries
trash_list = find_trash_list(raw_urls)
urls = remove_items(raw_urls, trash_list)

### Crawling all selected pages to fetch:
- Headlines
- Authors
- Publication dates and times
- Article texts
- URL after the redirect

In [7]:
def _extract_authors(soup_page):
    '''
    Returns the author names found in the byline of an article page.
    When no byline is found, the list holds only `Author not identified`.

    :param soup_page: BeautifulSoup object

    :return: list
    '''
    try:
        by_line_block = soup_page.find('article').find('div', {'data-component': 'byline-block'})
        tag_div_author = by_line_block.find('div', {'class': 'ssrcss-68pt20-Text-TextContributorName e8mq1e96'})
        complete_by_line = tag_div_author.get_text().split('By ')[1].split('&')

    # Some pages have no author on the front-end (pages with videos or photos only):
    # a missing tag raises AttributeError, a byline without `By ` raises IndexError
    except (AttributeError, IndexError):
        return ['Author not identified']

    # Sometimes there are more than one author (separated by `&`).
    # For each one, only the text before ` and `, ` in ` and `BBC` is kept;
    # the final `strip` removes the space left before `BBC`
    return [
        author.split(' and ')[0].split(' in ')[0].strip().split('BBC')[0].strip()
        for author in complete_by_line
    ]


def _extract_publication_datetime(soup_page):
    '''
    Returns the publication date and time read from the first `time` tag of the page.
    Returns None (and prints a message) when it is missing or cannot be parsed.

    :param soup_page: BeautifulSoup object

    :return: datetime object or None
    '''
    try:
        str_format = soup_page.find_all('time')[0].get('datetime')

        # Transforming the format: only the first 19 characters (`YYYY-MM-DDTHH:MM:SS`)
        # are parsed, so whatever follows the seconds (fraction, `Z`, offset) is ignored
        return datetime.strptime(str_format[:19], '%Y-%m-%dT%H:%M:%S')

    # No `time` tag (IndexError), no `datetime` attribute (TypeError) or unexpected format (ValueError)
    except (IndexError, TypeError, ValueError):
        print('Error in datetime')
        return None


def _extract_article_text(soup_page):
    '''
    Returns the text of an article page, with its parts separated by `\\n`.
    Returns an empty string (and prints a message) when no text is found.

    :param soup_page: BeautifulSoup object

    :return: string
    '''
    text_block = soup_page.find_all('div', {'data-component': 'text-block'})

    # For pages with only videos, I get the video descriptions
    if not text_block:
        try:
            tag_article = soup_page.find('article').find('div', {'data-testid': 'reveal-text-wrapper'})
            paragraphs_list = tag_article.find_all('p')
        except AttributeError:
            print('Error in text')
            return ''

        # The video descriptions are separated in more than one tag `p`
        return '\n'.join(p.get_text().strip() for p in paragraphs_list)

    # For pages with text: the article text is separated in more than one tag `div`
    paragraph = []
    for block in text_block:
        paragraph.append(block.get_text().strip())

        try:
            # Some article texts have subheadings in the middle and without them the text could be meaningless
            if block.next_sibling['data-component'] == 'subheadline-block':
                paragraph.append(block.next_sibling.get_text())
        except (TypeError, KeyError):
            # The next sibling is missing, is plain text, or has no `data-component`: nothing to add
            pass

    return '\n'.join(paragraph)


redirected_urls = []
headlines = []
authors = []
publications_date_time = []
article_texts = []

# Every list receives exactly one entry per URL (a placeholder when the
# information cannot be extracted), so all of them stay aligned with `urls`
for url in urls:

    # Getting the HTML from each page, waiting one second after each request
    soup_page = get_html_from_page(url)
    time.sleep(1)

    # URLs redirected
    try:
        redirected = soup_page.find('head').find('meta', {'property': 'og:url'}).get('content')
    except AttributeError:
        print('Error in redirected url')
        redirected = ''
    redirected_urls.append(redirected)

    # Headlines
    try:
        title = soup_page.find('article').h1.get_text()
    except AttributeError:
        print('Error in headline')
        title = ''
    headlines.append(title)

    # Authors
    authors.append(_extract_authors(soup_page))

    # Publication dates and times
    publications_date_time.append(_extract_publication_datetime(soup_page))

    # Article texts
    article_texts.append(_extract_article_text(soup_page))

### Checking how many items each list has

In [8]:
# Each label is paired with the list whose size is reported next to it
list_sizes = (
    ('Urls ------------------------>', urls),
    ('Authors --------------------->', authors),
    ('Headlines ------------------->', headlines),
    ('Articles texts -------------->', article_texts),
    ('Redirected urls ------------->', redirected_urls),
    ('Publications date and time -->', publications_date_time),
)

for label, collected in list_sizes:
    print(f'{label} {len(collected)}')

Urls ------------------------> 36
Authors ---------------------> 36
Headlines -------------------> 36
Articles texts --------------> 36
Redirected urls -------------> 36
Publications date and time --> 36


### Preparing output

In [9]:
# Column name -> list filled during the crawl
data_dict = {
    'Headline': headlines,
    'Authors': authors,
    'Publication_datetime': publications_date_time,
    'Article': article_texts,
    'URL': urls,
    'Redirected_URL': redirected_urls,
}

data = pd.DataFrame(data_dict)

# Every column except the publication date is converted to text
text_columns = {col: str for col in data.columns if col != 'Publication_datetime'}
data = data.astype(text_columns)

data.head()

Unnamed: 0,Headline,Authors,Publication_datetime,Article,URL,Redirected_URL
0,"Ukraine names unarmed, smoking soldier shot by...",['Author not identified'],2023-03-07 12:31:01,Ukraine has vowed to find the Russian soldiers...,https://www.bbc.com/news/world-europe-64872623,https://www.bbc.com/news/world-europe-64872623
1,'Granby Girl' identified as Patricia Ann Tucke...,['Author not identified'],2023-03-07 12:58:50,A woman found shot to death in the state of Ma...,https://www.bbc.com/news/world-us-canada-64874502,https://www.bbc.com/news/world-us-canada-64874502
2,Kidnapped Americans were in Mexico for tummy tuck,['Author not identified'],2023-03-07 14:25:19,Four Americans kidnapped by heavily armed men ...,https://www.bbc.com/news/world-us-canada-64875131,https://www.bbc.com/news/world-us-canada-64875131
3,Junk fees: US names and shames airlines over f...,['Chloe Kim'],2023-03-06 20:18:57,US airlines are being called out for their fam...,https://www.bbc.com/news/world-us-canada-64870070,https://www.bbc.com/news/world-us-canada-64870070
4,Japan forced to destroy flagship H3 rocket in ...,['Joel Guinto'],2023-03-07 09:25:23,Japan was forced to blow up its new rocket dur...,https://www.bbc.com/news/world-asia-64871603,https://www.bbc.com/news/world-asia-64871603


### Upload data to BigQuery

In [10]:
# Identifiers of the destination: project, dataset and table
project_id = 'vini-project-379618'
dataset_id = 'coding_test_lima_consulting'
table_id = 'bbc_news'

# The destination table is written as `<dataset>.<table>`
destination_table = '.'.join([dataset_id, table_id])

In [11]:
# One `{'name': ..., 'type': ...}` entry per column, in the order of the columns
table_schema = [
    {'name': column, 'type': column_type}
    for column, column_type in (
        ('Headline', 'STRING'),
        ('Authors', 'STRING'),
        ('Publication_datetime', 'DATETIME'),
        ('Article', 'STRING'),
        ('URL', 'STRING'),
        ('Redirected_URL', 'STRING'),
    )
]

In [13]:
# Path of the service account key file
key_path = '/content/GBQ.json'

# Scopes requested for the credentials
scopes = [
    'https://www.googleapis.com/auth/bigquery',
    'https://www.googleapis.com/auth/cloud-platform',
]

# Building the credentials from the key file
credentials = service_account.Credentials.from_service_account_file(filename=key_path, scopes=scopes)

In [14]:
# Writing the DataFrame to the destination table; an existing table is replaced
data.to_gbq(
    destination_table,
    project_id=project_id,
    if_exists='replace',
    table_schema=table_schema,
    credentials=credentials,
)

100%|██████████| 1/1 [00:00<00:00, 3858.61it/s]
