In [23]:
import os
from urllib.parse import urljoin

import nltk
import pandas as pd
import requests
from bs4 import BeautifulSoup
from openai import OpenAI

# Fetch the NLTK corpora up front (no-op when they are already installed).
nltk.download('stopwords')
nltk.download('wordnet')

# The openai>=1.0 client takes its key as a constructor argument; assigning
# `OpenAI.api_key` on the class is never read by the client.  Passing None
# (variable unset) lets the client fall back to its own environment lookup.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Front pages that the scrapers read headlines from.
URL_BBC = "https://www.bbc.com/news"
URL_CNN = "https://www.cnn.com/world"

# Shared accumulator: each entry is a dict with the keys
# "source", "title", "link" and "content".
news_data = []

def scrape_bbc():
    """Scrape the first headline from the BBC News front page.

    Fetches ``URL_BBC``, looks up headline ``<h2>`` elements by CSS class and,
    for the first match, appends a dict with the keys ``source``, ``title``,
    ``link`` and ``content`` to the module-level ``news_data`` list.

    Nothing is appended when the request fails, the response status is not
    200, or no headline element matches.
    """
    try:
        # Without a timeout an unresponsive server would block the run forever.
        response = requests.get(URL_BBC, timeout=10)
    except requests.RequestException as e:
        # Same best-effort policy as scrape_content: report and carry on.
        print(f"⚠️ Error fetching {URL_BBC}: {e}")
        return
    if response.status_code != 200:
        return

    soup = BeautifulSoup(response.text, "html.parser")
    # NOTE(review): this looks like a build-generated class name and is
    # presumably brittle -- verify it still matches the live page.
    articles = soup.find_all("h2", class_="sc-87075214-3 eywmDE")

    for article in articles[:1]:
        # Fall back for empty *and* whitespace-only headings, as scrape_cnn does.
        title = article.text.strip() or "No title"
        parent = article.find_parent("a")
        href = parent.get("href") if parent else None
        # urljoin copes with relative paths and already-absolute URLs alike,
        # where plain concatenation would produce a broken link.
        link = urljoin("https://www.bbc.com", href) if href else "No link available"
        content = scrape_content(link)
        news_data.append({"source": "BBC", "title": title, "link": link, "content": content})

def scrape_cnn():
    """Scrape the first headline from the CNN World front page.

    Fetches ``URL_CNN``, looks up ``<a class="container__link">`` elements and,
    for the first match, appends a dict with the keys ``source``, ``title``,
    ``link`` and ``content`` to the module-level ``news_data`` list.

    Nothing is appended when the request fails, the response status is not
    200, or no link element matches.
    """
    try:
        # Without a timeout an unresponsive server would block the run forever.
        response = requests.get(URL_CNN, timeout=10)
    except requests.RequestException as e:
        # Same best-effort policy as scrape_content: report and carry on.
        print(f"⚠️ Error fetching {URL_CNN}: {e}")
        return
    if response.status_code != 200:
        return

    soup = BeautifulSoup(response.text, "html.parser")
    articles = soup.find_all("a", class_="container__link")

    for article in articles[:1]:
        title_tag = article.find("span", class_="container__headline-text")
        # Missing headline span, or one containing only whitespace -> placeholder.
        title = (title_tag.text.strip() if title_tag else "") or "No title"
        href = article.get("href")
        # urljoin copes with relative paths and already-absolute URLs alike,
        # where plain concatenation would produce a broken link.
        link = urljoin("https://www.cnn.com", href) if href else "No link available"
        content = scrape_content(link)
        news_data.append({"source": "CNN", "title": title, "link": link, "content": content})

def scrape_content(link, timeout=10):
    """Download an article page and return the text of its ``<p>`` elements.

    Args:
        link: Article URL, or the sentinel ``"No link available"``.
        timeout: Seconds to wait for the server before giving up; without a
            limit a stalled connection would hang the whole run.

    Returns:
        The paragraph texts joined by single spaces with newlines replaced by
        spaces; ``"No content available"`` for the sentinel link; or
        ``"Error extracting content"`` when the request raises or the
        response status is not 200.
    """
    if link == "No link available":
        return "No content available"
    try:
        news_response = requests.get(link, timeout=timeout)
        if news_response.status_code == 200:
            news_soup = BeautifulSoup(news_response.text, "html.parser")
            content = " ".join(p.text.strip() for p in news_soup.find_all("p"))
            return content.replace("\n", " ").strip()
    except Exception as e:
        # Deliberately broad: scraping is best-effort and one bad page must
        # not abort the run.
        print(f"⚠️ Error extracting content from {link}: {e}")
    return "Error extracting content"

def classify_news(title, content, link):
    """Ask the OpenAI chat model whether a news story is real or fake.

    Args:
        title: Headline of the story.
        content: Body text of the story.
        link: URL the story claims to come from.

    Returns:
        The model's answer lower-cased, with surrounding whitespace, quotes
        and sentence punctuation removed (the prompt asks for ``"real"`` or
        ``"fake"``), or ``"error"`` if the API call or response handling
        raises.
    """
    prompt = f"""
    Analyze the following news story and determine whether it is real or fake based on its content, style, structure, the credibility of the source, and the validity of the link.

    Title: {title}
    Link: {link}
    Content: {content}

    Consider the following when making your decision:
    - If the news comes from a reputable source like BBC or CNN, it is more likely to be "real."
    - If the content contains exaggerated or sensational language, seems misleading, or lacks credible details, it is likely "fake."
    - If the link is from a questionable or unknown source, or if the link doesn't lead to a valid or functioning webpage, it raises suspicion that the news is "fake."
    - Pay attention to whether the link corresponds to a known media outlet or points to a potentially unreliable or non-existent source.
    - If there are references to fact-checked claims, studies, or experts, the news is more likely "real."
    - If the link is broken or returns a 404 error, this strongly indicates the news is "fake."

    Respond only with "real" or "fake". You don't have to explain your answer.
    """
    try:
        response = client.chat.completions.create(
            model="gpt-4o-search-preview",
            web_search_options={},
            messages=[
                {"role": "system", "content": "You're a news sorter. Answer only with 'real' or 'fake'. You don't have to explain your answer."},
                {"role": "user", "content": prompt}
            ]
        )
        result = response.choices[0].message.content.strip()
        # Models sometimes answer '"Fake."' or 'Real!'; drop the decoration so
        # the stored label is a bare word.
        return result.strip("\"'.! ").lower()
    except Exception as e:
        # Best-effort: a failed API call marks this story as "error" rather
        # than aborting the whole batch.
        print(f"⚠️ Error classifying the news: {e}")
        return "error"

# Add hand-written fake stories that are hard to tell apart from real ones.
# Each record uses the same keys as the scraped entries.
fake_news = dict(
    source="BBC",
    title="Scientists discover a new form of life in the depths of the ocean",
    link="http://skynews.com/lifeinocean",
    content=(
        "An international team of scientists has made an astonishing discovery in the depths of the ocean. "
        "Using advanced technology, they managed to discover a new life form that could change our understanding of biology. "
        "This organism, which lives more than 10,000 meters below sea level, has the ability to survive without oxygen and could open new doors for research into extraterrestrial life. "
        "The details of this discovery will be revealed at a global conference next month."
    ),
)
fake_news_2 = dict(
    source="CNN",
    title="NASA Confirms Evidence of Ancient Aliens Found on Mars",
    link="http://marsaliensnews.com/nasa",
    content=(
        "NASA has confirmed the discovery of ancient alien artifacts on the surface of Mars. "
        "Using state-of-the-art robotic technology, researchers found what appear to be carvings and structures that suggest extraterrestrial life once existed on the red planet. "
        "This groundbreaking finding is expected to change the course of space exploration and human history. "
        "A press conference will be held next week to reveal the full details of this historic discovery."
    ),
)

# Two further samples, disabled by wrapping them in a bare string literal:
# the string is evaluated and discarded, so neither name is defined.
"""fake_news_3 = {
    "source": "BBC",
    "title": "Cure for Cancer Discovered in a Remote Amazon Rainforest",
    "link": "http://newsbrazil.com/cureforcancer",
    "content": "In a stunning breakthrough, researchers have discovered a potential cure for cancer deep within the Amazon rainforest. After years of research, a team of scientists has found a plant-based compound that shows promising results in killing cancer cells. The plant, previously unknown to science, could revolutionize cancer treatment and save millions of lives worldwide. Experts are now racing to conduct further studies on this miraculous discovery."
}

fake_news_4 = {
    "source": "CNN",
    "title": "World's First Human Cloning Experiment Successfully Completed",
    "link": "http://cloningnews.com/humanclone",
    "content": "In a controversial and historic development, scientists have successfully cloned a human for the first time. This groundbreaking achievement has the potential to revolutionize medicine, offering solutions for organ transplantation, aging, and genetic disorders. The clone, a fully developed human with identical DNA to the donor, has been declared healthy and is expected to live a normal life. The ethical implications of this discovery are being debated worldwide."
}"""

# Queue the enabled samples ahead of anything the scrapers add later.
news_data += [fake_news, fake_news_2]


def scrape_and_classify():
    """Run both scrapers, classify every collected story and save a CSV.

    Every entry in the module-level ``news_data`` list has its title, content
    and link echoed to stdout and gains a ``"classification"`` key holding
    the result of ``classify_news``.  The full list is then written to
    ``news_with_classification.csv`` without the index column.
    """
    for scraper in (scrape_bbc, scrape_cnn):
        scraper()

    for entry in news_data:
        # Echo the fields in title, content, link order before classifying.
        for field in ("title", "content", "link"):
            print(entry[field])
        entry["classification"] = classify_news(
            entry["title"], entry["content"], entry["link"]
        )

    pd.DataFrame(news_data).to_csv("news_with_classification.csv", index=False)
    print("✅ Data saved in 'news_with_classification.csv'")

# Run the whole pipeline (scrape, classify, write the CSV) when this cell or
# script is executed.
scrape_and_classify()


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pedroalonso/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/pedroalonso/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Scientists discover a new form of life in the depths of the ocean
An international team of scientists has made an astonishing discovery in the depths of the ocean. Using advanced technology, they managed to discover a new life form that could change our understanding of biology. This organism, which lives more than 10,000 meters below sea level, has the ability to survive without oxygen and could open new doors for research into extraterrestrial life. The details of this discovery will be revealed at a global conference next month.
http://skynews.com/lifeinocean
NASA Confirms Evidence of Ancient Aliens Found on Mars
NASA has confirmed the discovery of ancient alien artifacts on the surface of Mars. Using state-of-the-art robotic technology, researchers found what appear to be carvings and structures that suggest extraterrestrial life once existed on the red planet. This groundbreaking finding is expected to change the course of space exploration and human history. A press conference wi