In [None]:

# Transform the fetched articles with the news_etl module's transform_data,
# passing the search query along as a second argument.
# NOTE(review): in the visible part of this notebook `news_etl`, `articles` and
# `search_query` are only defined in later cells, so this cell depends on
# out-of-order execution — confirm the intended cell order.
transformed_articles = news_etl.transform_data(articles, search_query)

In [None]:
# Load the transformed rows into table "news" of dataset "soccer_player"
# through the news_etl module's loader.
news_etl.load_data_to_bigquery(transformed_articles, "soccer_player", "news")

In [11]:
from newsapi import NewsApiClient
from google.cloud import bigquery
import os

# Read the API key from a local file so it is not hard-coded in the notebook.
# The `with` block closes the file when it exits, so no explicit close() call
# is needed afterwards.
with open("newsapi_key.txt", "r") as file:
    api_key = file.read().strip()

# establish a newsapi client with an api key
newsapi = NewsApiClient(api_key=api_key)
search_query = "L. Messi"
# One un-paginated request; `articles` holds whatever get_everything returns.
articles = newsapi.get_everything(q=search_query)

# Set the credentials path in the environment before constructing the
# BigQuery client, which is created without explicit credentials.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = 'cloud_service_account.json'
client = bigquery.Client()

In [12]:

# Step 3: Extract data from the News API
def extract_data(query, language="en", page_size=3, max_pages=2):
    """Collect articles matching ``query`` from the News API, page by page.

    Uses the module-level ``newsapi`` client. Pages are requested in order,
    starting at page 1, until ``max_pages`` pages have been fetched or a page
    comes back with no articles.

    Args:
        query: Search phrase, passed as ``q`` to ``newsapi.get_everything``.
        language: Language code passed through to the API call.
        page_size: Number of articles requested per page.
        max_pages: Maximum number of pages to request. Values below 1 result
            in no request and an empty list.

    Returns:
        list: The ``'articles'`` entries of every fetched page, in page order.

    Raises:
        Any exception raised by ``newsapi.get_everything`` propagates
        unchanged; nothing is caught here.
    """
    all_articles = []
    # Each fetched page is stored before deciding whether to continue, so every
    # request that is spent contributes its articles to the result and up to
    # `max_pages` pages are returned.
    for page in range(1, max_pages + 1):
        response = newsapi.get_everything(q=query, language=language, page_size=page_size, page=page)
        page_articles = response['articles']
        if not page_articles:
            # An empty page means there is nothing further to fetch.
            break
        all_articles.extend(page_articles)
    return all_articles

# Fetch articles for the current search_query with extract_data's defaults
# (language="en", page_size=3, max_pages=2). This rebinds `articles` to the
# list returned by extract_data.
articles = extract_data(search_query)

In [14]:
import news_etl
# Switch the query and fetch through the news_etl module's extract_data
# (not the extract_data function defined inside this notebook).
# NOTE(review): the output saved below this cell is a NewsAPIException with
# code 'rateLimited', so `articles` may not have been rebound by this run.
search_query = "Cristiano Ronaldo"
articles = news_etl.extract_data(search_query)

NewsAPIException: {'status': 'error', 'code': 'rateLimited', 'message': 'You have made too many requests recently. Developer accounts are limited to 100 requests over a 24 hour period (50 requests available every 12 hours). Please upgrade to a paid plan if you need more requests.'}

In [30]:
import pandas as pd
# Print the whole `articles` object as plain text for inspection.
print(articles)

[{'source': {'id': None, 'name': 'The Guardian'}, 'author': 'Rob Smyth', 'title': 'Manchester City v Bayern Munich: Champions League quarter-final, first leg – live', 'description': '<ul><li>Champions League news from the 8pm BST game</li><li>Get in touch! Send Rob an email with your thoughts</li></ul>Hello and welcome to live, minute-by-minute coverage of Manchester City v Bayern Munich in the Champions League quarter-final. This is the …', 'url': 'https://www.theguardian.com/football/live/2023/apr/11/manchester-city-bayern-munich-champions-league-quarter-final-first-leg-live', 'urlToImage': 'https://i.guim.co.uk/img/media/161b15d61056a8d53203f34803bf2c6ec0fa3045/0_93_5501_3302/master/5501.jpg?width=1200&height=630&quality=85&auto=format&fit=crop&overlay-align=bottom%2Cleft&overlay-width=100p&overlay-base64=L2ltZy9zdGF0aWMvb3ZlcmxheXMvdGctbGl2ZS5wbmc&enable=upscale&s=527fe1441f42f9871701a6eab14afcbc', 'publishedAt': '2023-04-11T17:36:45Z', 'content': 'Team news: Bernardo starts, Cance

In [22]:

# Step 4: Transform the data into a suitable format
def transform_data(articles):
    """Flatten article records into flat row dictionaries.

    Every input article must provide a nested ``source`` mapping with ``id``
    and ``name`` as well as the keys ``author``, ``title``, ``description``,
    ``url``, ``urlToImage``, ``publishedAt`` and ``content``. A missing key
    raises ``KeyError``. Any other keys on the article are ignored.

    Args:
        articles: Iterable of article dictionaries.

    Returns:
        list[dict]: One new dictionary per article, in input order, with the
        snake_case keys ``source_id``, ``source_name``, ``author``, ``title``,
        ``description``, ``url``, ``url_to_image``, ``published_at`` and
        ``content``.
    """

    def _as_row(entry):
        # Pull the nested source mapping out once, then copy fields in the
        # fixed output order.
        origin = entry["source"]
        return dict(
            source_id=origin["id"],
            source_name=origin["name"],
            author=entry["author"],
            title=entry["title"],
            description=entry["description"],
            url=entry["url"],
            url_to_image=entry["urlToImage"],
            published_at=entry["publishedAt"],
            content=entry["content"],
        )

    return [_as_row(entry) for entry in articles]

# Flatten the current `articles` into row dictionaries with the notebook-local
# transform_data.
transformed_articles = transform_data(articles)

In [23]:
def load_data_to_bigquery(transformed_articles, dataset_id, table_id):
    """Insert row dictionaries into an existing BigQuery table.

    Uses the module-level ``client``. The table is looked up with
    ``client.get_table`` and the rows are sent with
    ``client.insert_rows_json``.

    Args:
        transformed_articles: Sequence of row dictionaries to insert. When it
            is empty (or ``None``) the function returns without contacting
            BigQuery.
        dataset_id: ID of the dataset that contains the table.
        table_id: ID of the target table inside that dataset.

    Returns:
        None.

    Raises:
        RuntimeError: If ``insert_rows_json`` returns a non-empty error list.
        Exceptions raised by the client calls themselves propagate unchanged.
    """
    # Nothing to insert (for example, extraction produced no articles): skip
    # the table lookup and the insert call entirely.
    if not transformed_articles:
        return

    # Load the data into the table
    # NOTE(review): Client.dataset() appears to be deprecated in newer
    # google-cloud-bigquery releases — confirm against the installed version.
    table_ref = client.dataset(dataset_id).table(table_id)
    table = client.get_table(table_ref)

    errors = client.insert_rows_json(table, transformed_articles)
    if errors:
        raise RuntimeError("Failed to load data to BigQuery: {}".format(errors))

# Insert the transformed rows into table "news" of dataset "soccer_player";
# a RuntimeError is raised if the insert reports row errors.
load_data_to_bigquery(transformed_articles, "soccer_player", "news")

In [None]:
""" from google.api_core.exceptions import NotFound
# Step 5: Load the transformed data into the BigQuery table
def create_table_if_not_exists(dataset_id, table_id, schema):
    dataset_ref = client.dataset(dataset_id)
    table_ref = dataset_ref.table(table_id)
    try:
        client.get_table(table_ref)
    except NotFound:
        table = bigquery.Table(table_ref, schema=schema)
        client.create_table(table) """