In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
from tqdm import tqdm

def is_relevant(paragraph):
    """Decide whether a scraped paragraph looks like article body text.

    A paragraph is kept when, after trimming, it is longer than 40 characters,
    contains no URL marker ("http" or "www"), and contains none of the
    boilerplate phrases in the blacklist. Every substring check is
    case-insensitive.

    Args:
        paragraph (str): Text of a single paragraph.

    Returns:
        bool: True if the paragraph should be kept, False otherwise.
    """
    blacklist = (
        "copyright", "all rights reserved", "newsletter", "podcast", "bbc sounds",
        "read about our approach", "promotional content", "subscribe", "terms of use",
        "privacy policy", "twitter", "instagram", "facebook",
    )
    if len(paragraph.strip()) <= 40:
        return False

    text = paragraph.lower()
    # Match URL markers on the lowercased text so "HTTP://" and "WWW." are
    # rejected just like their lowercase forms.
    if "http" in text or "www" in text:
        return False
    return not any(bad in text for bad in blacklist)

def scrape_article(url):
    """Download an article page and extract its headline and body text.

    The page is fetched with a browser-like User-Agent and a 15 second
    timeout. The headline is the first <h1>; the body is every <p> that
    passes is_relevant, joined with single spaces.

    Args:
        url (str): Address of the article page.

    Returns:
        dict: {"title": str, "text": str}. Both values are empty strings when
        the request or the parsing fails; the failure is printed, not raised.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    }
    try:
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()
        # Use the encoding detected from the body rather than the header default.
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, "html.parser")

        # get_text(strip=True) joins text nodes with no separator, which glues
        # words around inline tags (links, bold). Normalising whitespace with
        # split/join keeps the original word boundaries and collapses any run
        # of whitespace to one space.
        heading = soup.find("h1")
        title = " ".join(heading.get_text().split()) if heading is not None else ""

        para_list = []
        for p in soup.find_all("p"):
            paragraph = " ".join(p.get_text().split())
            if is_relevant(paragraph):
                para_list.append(paragraph)

        return {
            "title": title,
            "text": " ".join(para_list)
        }
    except Exception as exc:
        # Best effort: keep the batch running, but report why this URL failed.
        print(f"Falha ao coletar {url}: {exc!r}")
        return {
            "title": "",
            "text": ""
        }

# News article URLs to scrape.
urls = [
    "https://www.nbcnews.com/news/us-news/jury-reaches-verdict-sean-diddy-combs-sex-trafficking-trial-rcna214785",
    "https://www.abc.net.au/news/2025-07-14/sean-combs-p-diddy-conviction-what-happens-now/105522300",
    "https://www.cbsnews.com/news/sean-diddy-combs-trial-verdict-jury/",
    "https://www.cnn.com/2025/07/02/entertainment/recap-diddy-trial-verdict-charges",
    "https://www.bbc.com/news/articles/c0qz32wzeego",
    "https://www.latimes.com/california/story/2025-07-02/diddy-trial-jury-verdict",
    "https://abcnews.go.com/US/Culture/sean-combs-guilty-2-5-counts-acquitted-racketeering/story?id=123019412",
    "https://www.aljazeera.com/news/2025/7/2/what-is-the-partial-verdict-in-the-sean-diddy-combs"
]

# Scrape every URL in order, collecting one result dict per URL.
data = []
for url in tqdm(urls, desc="Scraping articles"):
    result = scrape_article(url)
    data.append(result)
    if result["title"]:
        print(f"Coletado: {result['title']}")
    else:
        # Name the URL so the failing source can be identified.
        print(f"Erro ao coletar artigo: {url}")

# Write all rows (including those with empty values) to a UTF-8 CSV file.
df = pd.DataFrame(data)
df.to_csv("diddy_articles.csv", sep=',', index=False, encoding="utf-8")
print("Dados salvos em diddy_articles.csv")

Scraping articles:  12%|█▎        | 1/8 [00:00<00:03,  1.77it/s]

Coletado: Sean 'Diddy' Combs found not guilty of racketeering and sex trafficking but convicted of lesser charges


Scraping articles:  25%|██▌       | 2/8 [00:00<00:02,  2.31it/s]

Coletado: Sean 'Diddy' Combs's criminal conviction may not end his career


Scraping articles:  38%|███▊      | 3/8 [00:01<00:02,  2.01it/s]

Coletado: Sean "Diddy" Combs acquitted of sex trafficking and racketeering, convicted on prostitution-related counts


Scraping articles:  50%|█████     | 4/8 [00:02<00:02,  1.52it/s]

Coletado: Takeaways from the verdict in Sean ‘Diddy’ Combs’ federal sex trafficking trial


Scraping articles:  62%|██████▎   | 5/8 [00:02<00:01,  1.64it/s]

Coletado: What has Sean 'Diddy' Combs been convicted of?


Scraping articles:  75%|███████▌  | 6/8 [00:03<00:01,  1.92it/s]

Coletado: Sean ‘Diddy’ Combs not guilty of most serious charges but will remain in custody until sentencing


Scraping articles:  88%|████████▊ | 7/8 [00:03<00:00,  1.78it/s]

Coletado: Hip-hop mogul Sean 'Diddy' Combs acquitted on most serious charges in historic racketeering case


Scraping articles: 100%|██████████| 8/8 [00:04<00:00,  1.89it/s]

Coletado: Sean ‘Diddy’ Combs verdict: What was he found guilty of – and what’s next?
Dados salvos em diddy_articles.csv





In [6]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm

def is_relevant(paragraph):
    """Decide whether a scraped paragraph looks like article body text.

    A paragraph is kept when, after trimming, it is longer than 40 characters,
    contains no URL marker ("http" or "www"), and contains none of the
    boilerplate phrases in the blacklist. Every substring check is
    case-insensitive.

    Args:
        paragraph (str): Text of a single paragraph.

    Returns:
        bool: True if the paragraph should be kept, False otherwise.
    """
    blacklist = (
        "copyright", "all rights reserved", "newsletter", "podcast", "bbc sounds",
        "read about our approach", "promotional content", "subscribe", "terms of use",
        "privacy policy", "twitter", "instagram", "facebook",
    )
    if len(paragraph.strip()) <= 40:
        return False

    text = paragraph.lower()
    # Match URL markers on the lowercased text so "HTTP://" and "WWW." are
    # rejected just like their lowercase forms.
    if "http" in text or "www" in text:
        return False
    return not any(bad in text for bad in blacklist)

def get_source(url):
    """Derive a short upper-case source label from a URL's host name.

    The label is the first dot-separated part of the host after removing a
    leading "www." (e.g. "https://www.bbc.com/news" -> "BBC"). Userinfo and
    port are ignored and the host is treated case-insensitively.

    Args:
        url (str): Absolute URL of the article.

    Returns:
        str: The upper-case label, or "OUTRO" when no host can be extracted.
    """
    # Local import keeps this block self-contained within the notebook cell.
    from urllib.parse import urlparse

    try:
        # .hostname is lowercased and excludes "user@" and ":port".
        host = urlparse(url).hostname
        if not host:
            return "OUTRO"
        # Strip only a leading "www."; a replace() would also hit mid-host matches.
        if host.startswith("www."):
            host = host[len("www."):]
        label = host.split(".")[0]
        return label.upper() if label else "OUTRO"
    except Exception:
        # Best effort: any malformed or non-string input maps to the fallback.
        return "OUTRO"

def scrape_article(url):
    """Download an article page and extract its body text and source label.

    The page is fetched with a browser-like User-Agent and a 15 second
    timeout. The body is every <p> that passes is_relevant, joined with
    single spaces. The source label comes from get_source(url).

    Args:
        url (str): Address of the article page.

    Returns:
        dict: {"fonte": str, "text": str}. "text" is an empty string when the
        request or the parsing fails; the failure is printed, not raised.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    }
    # The label depends only on the URL, so compute it once for both outcomes.
    fonte = get_source(url)
    try:
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()
        # Use the encoding detected from the body rather than the header default.
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, "html.parser")

        # get_text(strip=True) joins text nodes with no separator, which glues
        # words around inline tags (links, bold). Normalising whitespace with
        # split/join keeps the original word boundaries and collapses any run
        # of whitespace to one space.
        para_list = []
        for p in soup.find_all("p"):
            paragraph = " ".join(p.get_text().split())
            if is_relevant(paragraph):
                para_list.append(paragraph)

        return {
            "fonte": fonte,
            "text": " ".join(para_list)
        }
    except Exception as exc:
        # Best effort: keep the batch running, but report why this URL failed.
        print(f"Falha ao coletar {url}: {exc!r}")
        return {
            "fonte": fonte,
            "text": ""
        }

# News article URLs to scrape, one per outlet.
urls = [
    "https://www.nbcnews.com/news/us-news/jury-reaches-verdict-sean-diddy-combs-sex-trafficking-trial-rcna214785",
    "https://www.abc.net.au/news/2025-07-14/sean-combs-p-diddy-conviction-what-happens-now/105522300",
    "https://www.cbsnews.com/news/sean-diddy-combs-trial-verdict-jury/",
    "https://www.cnn.com/2025/07/02/entertainment/recap-diddy-trial-verdict-charges",
    "https://www.bbc.com/news/articles/c0qz32wzeego",
    "https://www.latimes.com/california/story/2025-07-02/diddy-trial-jury-verdict",
    "https://abcnews.go.com/US/Culture/sean-combs-guilty-2-5-counts-acquitted-racketeering/story?id=123019412",
    "https://www.aljazeera.com/news/2025/7/2/what-is-the-partial-verdict-in-the-sean-diddy-combs",
]

# Scrape each URL in turn and report, per source, whether any text came back.
data = []
for url in tqdm(urls, desc="Scraping articles"):
    result = scrape_article(url)
    data.append(result)
    print(
        f"Coletado: {result['fonte']}"
        if result["text"]
        else f"Erro ao coletar artigo de {result['fonte']}."
    )

# One row per URL (columns "fonte" and "text"); the CSV file is written
# in UTF-8 without the index, using the default comma separator.
df = pd.DataFrame(data)
df.to_csv("diddy_articles.csv", index=False, encoding="utf-8")
print("Dados salvos em diddy_articles.csv")

Scraping articles:  12%|█▎        | 1/8 [00:01<00:09,  1.38s/it]

Coletado: NBCNEWS


Scraping articles:  25%|██▌       | 2/8 [00:02<00:07,  1.19s/it]

Coletado: ABC


Scraping articles:  38%|███▊      | 3/8 [00:03<00:04,  1.04it/s]

Coletado: CBSNEWS


Scraping articles:  50%|█████     | 4/8 [00:04<00:04,  1.03s/it]

Coletado: CNN


Scraping articles:  62%|██████▎   | 5/8 [00:04<00:02,  1.18it/s]

Coletado: BBC


Scraping articles:  75%|███████▌  | 6/8 [00:05<00:01,  1.30it/s]

Coletado: LATIMES


Scraping articles:  88%|████████▊ | 7/8 [00:06<00:00,  1.23it/s]

Coletado: ABCNEWS


Scraping articles: 100%|██████████| 8/8 [00:07<00:00,  1.05it/s]

Coletado: ALJAZEERA
Dados salvos em diddy_articles.csv



