# Web scraping more headlines for my database

In [1]:
import requests
from bs4 import BeautifulSoup
import json

## Getting the headlines using BeautifulSoup

## Getting some negative news:

### Web scraping: BBC News


In [2]:
# URL of the BBC News homepage
url = "https://www.bbc.com/news"

# Fetch the page. requests applies no timeout by default, so without one a
# stalled connection would block this cell (and the kernel) indefinitely.
response = requests.get(url, timeout=10)

if response.status_code == 200:
    # Parse the HTML content of the page
    soup = BeautifulSoup(response.content, 'html.parser')

    # Every <h2> element on the page is printed as a headline candidate
    h2_tags = soup.find_all('h2')
    for tag in h2_tags:
        print(tag.get_text(strip=True))
else:
    print(f"Failed to retrieve the page. Status code: {response.status_code}")

Netanyahu denounces bid to arrest him over Gaza war
Iran declares five days of mourning for president
Decades after training, 90-year-old finally goes to space
ChatGPT to lose voice over Johansson similarity
Assange wins right to challenge US extradition
Iran declares five days of mourning for president
What next for Iran after President Raisi's death?
Drama at Trump trial as judge reprimands witness and clears court
Ship that hit Baltimore bridge on the move again
Decades after training, 90-year-old finally goes to space
Electric pulses may ease paralysis after broken neck
Scottie Scheffler court date pushed back after arrest
'Stop threatening us', Taiwan's new president tells China
Three Americans detained in alleged DR Congo coup attempt
How should countries deal with falling birth rates?
Iranian president killed
Ebrahim Raisi: What we know about deadly Iran helicopter crash
How Iranians reacted to president's helicopter crash
Who is in charge of Iran?
President Ebrahim Raisi's mixe

## Returning the result as JSON

GPT prompt: 

"now, edit the json file to transform it in a dictionary like this one: 

{
    "label": 1,
    "text": "Simone Biles wins Core Hydration Classic on road to Paris 2024 Olympics"
    },


leave the label empty, but add to the "text"  for  each headline in the json file you gave me. I will fill the "label" myself."

In [19]:
def get_bbc_news_titles(url, timeout=10):
    """Collect the text of every <h2> element on a page as labelled records.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` raises,
            so a stalled connection cannot hang the notebook.

    Returns:
        On HTTP 200, a list of ``{"label": 0, "text": <h2 text>}`` dicts, one
        per <h2> tag in the order ``find_all`` yields them. On any other
        status code, a dict with a single ``"error"`` key describing the
        failure.
    """
    response = requests.get(url, timeout=timeout)

    # Report a non-200 response with the same error-dict shape callers check for
    if response.status_code != 200:
        return {"error": f"Failed to retrieve the page. Status code: {response.status_code}"}

    soup = BeautifulSoup(response.content, 'html.parser')

    # Every record is created with label 0; text is whitespace-stripped
    return [
        {"label": 0, "text": tag.get_text(strip=True)}
        for tag in soup.find_all('h2')
    ]

# URL of the BBC News homepage
url = "https://www.bbc.com/news"

# Get the titles as a list of dictionaries
titles = get_bbc_news_titles(url)

# A failed request comes back as a dict with an "error" key, a successful one
# as a list of records. Check the type explicitly: `"error" in titles` on a
# list is an element-membership test and only gave the right answer by accident.
if isinstance(titles, dict) and "error" in titles:
    print(titles["error"])
else:
    # Save the result to a JSON file
    output_file = "bbc_news_titles.json"
    with open(output_file, "w") as f:
        json.dump(titles, f, indent=4)

    print(f"Titles saved successfully to {output_file}")


Titles saved successfully to bbc_news_titles.json


In [32]:
# Load the saved BBC records once; the same list feeds both the size check
# and the per-label tally below (the file was previously parsed twice).
with open("bbc_news_titles.json", "r") as file:
    data = json.load(file)

# Check the number of objects
num_objects_bbc = len(data)

print("Number of objects in the BBC JSON file:", num_objects_bbc)

# Count the occurrences of labels. Keys are the strings "0" and "1"; records
# whose label is neither 0 nor 1 are not counted.
label_counts = {
    "0": sum(1 for item in data if item["label"] == 0),
    "1": sum(1 for item in data if item["label"] == 1),
}

# Print the label counts
print("Number of labels with value 0:", label_counts["0"])
print("Number of labels with value 1:", label_counts["1"])


Number of objects in the BBC JSON file: 67
Number of labels with value 0: 67
Number of labels with value 1: 0


### Web scraping: The Guardian

In [21]:
def get_guardian_aria_labels(url, timeout=10):
    """Collect the ``aria-label`` values found on a page as a JSON string.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` raises,
            so a stalled connection cannot hang the notebook.

    Returns:
        A JSON string (indent 4). On HTTP 200 it encodes a list of
        ``{"label": 0, "text": <aria-label value>}`` objects, one per element
        carrying an ``aria-label`` attribute, minus the excluded values
        below. On any other status code it encodes an object with a single
        ``"error"`` key.
    """
    # aria-label values to drop (presumably menu controls rather than headlines)
    unwanted_labels = (
        "Toggle main menu",
        "Toggle News",
        "Toggle Opinion",
        "Toggle Sport",
        "Toggle Culture",
        "Toggle Lifestyle",
    )

    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        return json.dumps({"error": f"Failed to retrieve the page. Status code: {response.status_code}"}, indent=4)

    soup = BeautifulSoup(response.content, 'html.parser')

    # attrs={"aria-label": True} matches any element that has the attribute
    aria_labels = [
        {"label": 0, "text": element["aria-label"]}
        for element in soup.find_all(attrs={"aria-label": True})
        if element["aria-label"] not in unwanted_labels
    ]

    return json.dumps(aria_labels, indent=4)

# URL of the Guardian homepage
url = "https://www.theguardian.com/international"

# Get the "aria-label" content as a JSON string
aria_labels_json = get_guardian_aria_labels(url)

output_file = "the_guardian.json"

# A failed request is reported as a JSON object with an "error" key. Writing
# that to disk would replace the dataset file with an error payload while
# still announcing success, so check for it before saving.
aria_labels = json.loads(aria_labels_json)
if isinstance(aria_labels, dict) and "error" in aria_labels:
    print(aria_labels["error"])
else:
    # Save the result to a JSON file
    with open(output_file, "w") as f:
        f.write(aria_labels_json)

    # Print a message indicating that the data has been saved
    print(f"Aria-labels saved successfully to {output_file}")


Aria-labels saved successfully to the_guardian.json


In [33]:
# Load the saved Guardian records once; the same list feeds both the size
# check and the per-label tally below (the file was previously parsed twice).
with open("the_guardian.json", "r") as file:
    data = json.load(file)

# Check the number of objects
num_objects_guardian = len(data)

print("Number of objects in the GUARDIAN JSON file:", num_objects_guardian)

# Count the occurrences of labels. Keys are the strings "0" and "1"; records
# whose label is neither 0 nor 1 are not counted.
label_counts = {
    "0": sum(1 for item in data if item["label"] == 0),
    "1": sum(1 for item in data if item["label"] == 1),
}

# Print the label counts
print("Number of labels with value 0:", label_counts["0"])
print("Number of labels with value 1:", label_counts["1"])


Number of objects in the GUARDIAN JSON file: 114
Number of labels with value 0: 114
Number of labels with value 1: 0


## Getting some positive news:

### Web scraping: Good News Agency

In [23]:

def scrape_good_news(url, timeout=10):
    """Collect the text of every <strong> element on a page as a JSON string.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` raises,
            so a stalled connection cannot hang the notebook.

    Returns:
        A JSON string (indent 4). On HTTP 200 it encodes a list of
        ``{"label": 1, "text": <strong text>}`` objects, one per <strong>
        tag. On any other status code it encodes an object with a single
        ``"error"`` key.
    """
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        return json.dumps({"error": f"Failed to retrieve the page. Status code: {response.status_code}"}, indent=4)

    soup = BeautifulSoup(response.content, 'html.parser')

    # Every record is created with label 1; text is whitespace-stripped
    data = [
        {"label": 1, "text": tag.get_text(strip=True)}
        for tag in soup.find_all('strong')
    ]

    return json.dumps(data, indent=4)

# URL of the webpage
url = "http://www.goodnewsagency.org/m/issue.php?number=324&lang=en"

# Get the content as a JSON string
content_json = scrape_good_news(url)

output_file = "good_news.json"

# A failed request is reported as a JSON object with an "error" key. Writing
# that to disk would replace the dataset file with an error payload while
# still announcing success, so check for it before saving.
content = json.loads(content_json)
if isinstance(content, dict) and "error" in content:
    print(content["error"])
else:
    # Save the result to a JSON file
    with open(output_file, "w") as f:
        f.write(content_json)

    # Print a message indicating that the data has been saved
    print(f"Content saved successfully to {output_file}")


Content saved successfully to good_news.json


In [34]:
import json

# Load the saved Good News Agency records once; the same list feeds both the
# size check and the per-label tally below (the file was previously parsed twice).
with open("good_news.json", "r") as file:
    data = json.load(file)

# Check the number of objects
num_objects_good_news = len(data)

print("Number of objects in the GOOD NEWS JSON file:", num_objects_good_news)

# Count the occurrences of labels. Keys are the strings "0" and "1"; records
# whose label is neither 0 nor 1 are not counted.
label_counts = {
    "0": sum(1 for item in data if item["label"] == 0),
    "1": sum(1 for item in data if item["label"] == 1),
}

# Print the label counts
print("Number of labels with value 0:", label_counts["0"])
print("Number of labels with value 1:", label_counts["1"])


Number of objects in the GOOD NEWS JSON file: 111
Number of labels with value 0: 0
Number of labels with value 1: 111


In [9]:
def extract_titles(url, timeout=10):
    """Collect the ``title`` attribute of bookmark links on a page as JSON.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` raises,
            so a stalled connection cannot hang the notebook.

    Returns:
        A JSON string (indent 4). On HTTP 200 it encodes a list of
        ``{"label": 1, "text": <title attribute>}`` objects, one per
        ``<a rel="bookmark">`` tag; a tag without a ``title`` attribute
        contributes an empty string. On any other status code it encodes an
        object with a single ``"error"`` key.
    """
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        return json.dumps({"error": f"Failed to retrieve the page. Status code: {response.status_code}"}, indent=4)

    soup = BeautifulSoup(response.content, 'html.parser')

    # Only anchors marked rel="bookmark" are considered
    titles = [
        {"label": 1, "text": tag.get('title', '')}
        for tag in soup.find_all('a', attrs={'rel': 'bookmark'})
    ]

    return json.dumps(titles, indent=4)

# URL of the webpage
url = "https://www.goodnewsnetwork.org/category/news/world/"

# Get the titles as a JSON string
titles_json = extract_titles(url)

# A failed request is reported as a JSON object with an "error" key. Writing
# that to disk would replace the dataset file with an error payload while
# still announcing success, so check for it before saving.
parsed_titles = json.loads(titles_json)
if isinstance(parsed_titles, dict) and "error" in parsed_titles:
    print(parsed_titles["error"])
else:
    # Save the titles to a JSON file
    with open("good_news_network_titles.json", "w") as file:
        file.write(titles_json)

    # Print a success message
    print("Titles extracted and saved to 'good_news_network_titles.json'.")


Titles extracted and saved to 'good_news_network_titles.json'.


In [35]:
# Load the saved Good News Network records once; the same list feeds both the
# size check and the per-label tally below (the file was previously parsed twice).
with open("good_news_network_titles.json", "r") as file:
    data = json.load(file)

# Check the number of objects
num_objects_goodnews_network = len(data)

print("Number of objects in the good_news_network_titles.json file:", num_objects_goodnews_network)

# Count the occurrences of labels. Keys are the strings "0" and "1"; records
# whose label is neither 0 nor 1 are not counted.
label_counts = {
    "0": sum(1 for item in data if item["label"] == 0),
    "1": sum(1 for item in data if item["label"] == 1),
}

# Print the label counts
print("Number of labels with value 0:", label_counts["0"])
print("Number of labels with value 1:", label_counts["1"])


Number of objects in the good_news_network_titles.json file: 34
Number of labels with value 0: 0
Number of labels with value 1: 34


In [11]:
# NOTE(review): this repeats the extract_titles definition from the earlier
# Good News Network cell; it is kept here so this cell still runs on its own.
def extract_titles(url, timeout=10):
    """Collect the ``title`` attribute of bookmark links on a page as JSON.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` raises,
            so a stalled connection cannot hang the notebook.

    Returns:
        A JSON string (indent 4). On HTTP 200 it encodes a list of
        ``{"label": 1, "text": <title attribute>}`` objects, one per
        ``<a rel="bookmark">`` tag; a tag without a ``title`` attribute
        contributes an empty string. On any other status code it encodes an
        object with a single ``"error"`` key.
    """
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        return json.dumps({"error": f"Failed to retrieve the page. Status code: {response.status_code}"}, indent=4)

    soup = BeautifulSoup(response.content, 'html.parser')

    # Only anchors marked rel="bookmark" are considered
    titles = [
        {"label": 1, "text": tag.get('title', '')}
        for tag in soup.find_all('a', attrs={'rel': 'bookmark'})
    ]

    return json.dumps(titles, indent=4)

# URL of the webpage
url = "https://www.goodnewsnetwork.org/category/news/inspiring/"

# Get the titles as a JSON string
titles_json = extract_titles(url)

# A failed request is reported as a JSON object with an "error" key. Writing
# that to disk would replace the dataset file with an error payload while
# still announcing success, so check for it before saving.
parsed_titles = json.loads(titles_json)
if isinstance(parsed_titles, dict) and "error" in parsed_titles:
    print(parsed_titles["error"])
else:
    # Save the titles to a JSON file
    with open("good_news_inspiring.json", "w") as file:
        file.write(titles_json)

    # Print a success message
    print("Titles extracted and saved to 'good_news_inspiring.json'.")


Titles extracted and saved to 'good_news_inspiring.json'.


In [36]:
# Load the saved "inspiring" records once; the same list feeds both the size
# check and the per-label tally below (the file was previously parsed twice).
with open("good_news_inspiring.json", "r") as file:
    data = json.load(file)

# Check the number of objects
num_objects_goodnews_inspiring = len(data)

print("Number of objects in the good_news_inspiring.json file:", num_objects_goodnews_inspiring)

# Count the occurrences of labels. Keys are the strings "0" and "1"; records
# whose label is neither 0 nor 1 are not counted.
label_counts = {
    "0": sum(1 for item in data if item["label"] == 0),
    "1": sum(1 for item in data if item["label"] == 1),
}

# Print the label counts
print("Number of labels with value 0:", label_counts["0"])
print("Number of labels with value 1:", label_counts["1"])

Number of objects in the good_news_inspiring.json file: 35
Number of labels with value 0: 0
Number of labels with value 1: 35


In [37]:
# Headline totals per class, built from the per-file counts computed in the
# cells above. The literals 13 and 6 are added on top of the scraped counts —
# presumably headlines collected by hand; TODO confirm where they come from.
total_positive = sum((
    num_objects_goodnews_network,
    num_objects_good_news,
    13,
    num_objects_goodnews_inspiring,
))
print("total positive news: ", total_positive)

total_negative = sum((num_objects_bbc, num_objects_guardian, 6))
print("total negative news: ", total_negative)

total positive news:  193
total negative news:  187


In [43]:
# Load the combined dataset once (it was previously parsed twice, with a block
# of commented-out code left in between).
with open("dataset.json", "r") as file:
    data = json.load(file)

# Count the occurrences of labels. Keys are the strings "0" and "1"; records
# whose label is neither 0 nor 1 are not counted.
label_counts = {
    "0": sum(1 for item in data if item["label"] == 0),
    "1": sum(1 for item in data if item["label"] == 1),
}

# Print the label counts
print("Number of labels with value 0:", label_counts["0"])
print("Number of labels with value 1:", label_counts["1"])


Number of labels with value 0: 120
Number of labels with value 1: 120
