In [1]:
import os
import pandas as pd
import requests as re
from bs4 import BeautifulSoup
from datetime import datetime, timezone
from urllib.parse import urlparse

In [2]:

SOURCE = "The Tab"
BASE_URL = "https://thetab.com"
DOMAIN = urlparse(BASE_URL).netloc
SECTIONS = ["news", "entertainment", "trends", "gaming", "politics", "opinion", "guides"]
CSV_PATH = "the_tab_articles.csv"

# Load previous data if exists
if os.path.exists(CSV_PATH):
    old_df = pd.read_csv(CSV_PATH)
    seen_urls = set(old_df['url'].tolist())
else:
    old_df = pd.DataFrame()
    seen_urls = set()

new_articles = []

for section in SECTIONS:
    section_url = f"{BASE_URL}/{section}"
    try:
        print(f"Scraping section: {section_url}")
        res = re.get(section_url, timeout=10)
        soup = BeautifulSoup(res.content, "html.parser")

        links = soup.select("a[href*='/202']")  # Find all 202x article links

        for link in links:
            article_url = link["href"]
            if not article_url.startswith("http"):
                article_url = BASE_URL + article_url

            if article_url in seen_urls:
                continue

            try:
                art_res = re.get(article_url, timeout=10)
                art_soup = BeautifulSoup(art_res.content, "html.parser")

                headline_tag = art_soup.find("h1")
                headline = headline_tag.get_text(strip=True) if headline_tag else None
                if not headline:
                    continue

                paragraphs = art_soup.find_all("p")
                article_text = " ".join(p.get_text(strip=True) for p in paragraphs)
                word_count = len(article_text.split())

                # Count internal and external links
                all_links = art_soup.find_all("a", href=True)
                internal_links = 0
                external_links = 0
                for a in all_links:
                    href = a['href']
                    parsed_href = urlparse(href)
                    if parsed_href.netloc == "" or DOMAIN in parsed_href.netloc:
                        internal_links += 1
                    else:
                        external_links += 1

                meta_date = art_soup.find("meta", {"property": "article:published_time"})
                pub_date = meta_date["content"] if meta_date else None

                new_articles.append({
                    "source": SOURCE,
                    "url": article_url,
                    "section": section,
                    "pub_date": pub_date,
                    "headline": headline,
                    "headline_len": len(headline.split()),
                    "word_count": word_count,
                    "internal_links": internal_links,
                    "external_links": external_links,
                    "article_text": article_text,
                    "scrape_date": datetime.now(timezone.utc).isoformat()
                })

            except Exception as e:
                print(f"Error parsing article: {article_url} | {e}")

    except Exception as e:
        print(f"Failed to fetch section {section_url} | {e}")


Scraping section: https://thetab.com/news
Scraping section: https://thetab.com/entertainment
Scraping section: https://thetab.com/trends
Scraping section: https://thetab.com/gaming
Scraping section: https://thetab.com/politics
Scraping section: https://thetab.com/opinion
Scraping section: https://thetab.com/guides


In [3]:
# Save combined data
new_df = pd.DataFrame(new_articles)
combined_df = pd.concat([old_df, new_df], ignore_index=True)
combined_df.to_csv(CSV_PATH, index=False)

print(f"Added {len(new_df)} new articles. Total saved: {len(combined_df)}.")


Added 54 new articles. Total saved: 166.
