In [6]:

#!/usr/bin/env python3

import os
import time
import json
import pyperclip
import pandas as pd
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from datetime import datetime, timezone


# Configurations
MASTER_INDEX_PATH = './data/master_index.csv'
SCRAPED_LOG_PATH = './data/scraped_links.jsonl'
SLEEP_TIME = 5  # seconds
PAGE_TIMEOUT = 15  # seconds


In [7]:

def load_scraped_index_ids(scraped_log_path):
    """Load already scraped index_ids from JSONL."""
    if not os.path.exists(scraped_log_path):
        return set()
    scraped_ids = set()
    with open(scraped_log_path, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                record = json.loads(line)
                scraped_ids.add(record['index_id'])
            except Exception as e:
                print(f"⚠️ Error parsing line: {e}")
    return scraped_ids

def scrape_article(row):
    """Launch browser, scrape article text via Ctrl+A Ctrl+C, and return enriched record."""
    options = Options()
    options.add_argument("--start-maximized")
    # options.add_argument("--headless")  # Uncomment to run without UI

    driver = None
    result = None
    try:
        driver = webdriver.Chrome(options=options)
        driver.set_page_load_timeout(PAGE_TIMEOUT)

        driver.get(row['Link'])
        time.sleep(SLEEP_TIME)  # Let dynamic content load

        body = driver.find_element(By.TAG_NAME, "body")
        body.send_keys(Keys.CONTROL, 'a')
        body.send_keys(Keys.CONTROL, 'c')
        time.sleep(1)  # Clipboard delay

        scraped_html = pyperclip.paste()

        result = row.to_dict()
        result['scraped_data'] = scraped_html
        print(f"✅ Fetched data from: {row['Link']}")

    except Exception as e:
        print(f"⚠️ Error scraping URL: {row['Link']}\n{e}")

    finally:
        if driver:
            driver.quit()

    return result

def append_scraped_records(scraped_records, scraped_log_path):
    """Append new scraped records to the JSONL log."""
    with open(scraped_log_path, 'a', encoding='utf-8') as f:
        for record in scraped_records:
            json.dump(record, f, ensure_ascii=False)
            f.write('\n')

def main(day=None, hour=None):
    # Load master index
    if not os.path.exists(MASTER_INDEX_PATH):
        print(f"❌ Master index file not found at {MASTER_INDEX_PATH}. Exiting.")
        return

    master_df = pd.read_csv(MASTER_INDEX_PATH)

    if master_df.empty or 'Published' not in master_df.columns:
        print("⚠️ Master index is empty or missing 'Published' column. Exiting.")
        return

    master_df['Published'] = pd.to_datetime(master_df['Published'], errors='coerce')

    # Filter by day/hour
    if day is None:
        # Default to today in UTC
        now_utc = datetime.now(timezone.utc)
        day = now_utc.strftime('%Y-%m-%d')
        hour = now_utc.hour

    filter_str = f"{day}"
    if hour is not None:
        filter_str += f" hour {hour}"

    print(f"🔎 Filtering articles published on: {filter_str}")

    filtered_df = master_df[master_df['Published'].dt.strftime('%Y-%m-%d') == day]
    if hour is not None:
        filtered_df = filtered_df[filtered_df['Published'].dt.hour == hour]

    print(f"🔎 Found {len(filtered_df)} articles matching time filter.")

    # Load already scraped index_ids
    scraped_ids = load_scraped_index_ids(SCRAPED_LOG_PATH)
    print(f"🔎 Found {len(scraped_ids)} already scraped articles.")

    # Filter unscraped articles
    unscraped_df = filtered_df[~filtered_df['index_id'].isin(scraped_ids)]
    print(f"🚀 Found {len(unscraped_df)} articles to scrape.")

    scraped_results = []

    for _, row in tqdm(unscraped_df.iterrows(), total=len(unscraped_df)):
        result = scrape_article(row)
        if result:
            scraped_results.append(result)

    # Append new results to log
    if scraped_results:
        append_scraped_records(scraped_results, SCRAPED_LOG_PATH)
        print(f"✅ Appended {len(scraped_results)} new articles to {SCRAPED_LOG_PATH}.")
    else:
        print("⚠️ No new articles were scraped this run.")    # Load master index
    if not os.path.exists(MASTER_INDEX_PATH):
        print(f"❌ Master index file not found at {MASTER_INDEX_PATH}. Exiting.")
        return

    master_df = pd.read_csv(MASTER_INDEX_PATH)

    # Load already scraped index_ids
    scraped_ids = load_scraped_index_ids(SCRAPED_LOG_PATH)
    print(f"🔎 Found {len(scraped_ids)} already scraped articles.")

    # Filter unscraped articles
    unscraped_df = master_df[~master_df['index_id'].isin(scraped_ids)]
    print(f"🚀 Found {len(unscraped_df)} articles to scrape.")

    scraped_results = []

    for _, row in tqdm(unscraped_df.iterrows(), total=len(unscraped_df)):
        result = scrape_article(row)
        if result:
            scraped_results.append(result)

    # Append new results to log
    if scraped_results:
        append_scraped_records(scraped_results, SCRAPED_LOG_PATH)
        print(f"✅ Appended {len(scraped_results)} new articles to {SCRAPED_LOG_PATH}.")
    else:
        print("⚠️ No new articles were scraped this run.")


In [10]:
if __name__ == "__main__":
    try:
        get_ipython
        IN_JUPYTER = True
    except NameError:
        IN_JUPYTER = False

    if IN_JUPYTER:
        # Running inside a notebook
        print("⚠️ Detected Jupyter environment. Running with default settings.")
        main()
    else:
        # Running as a standalone script
        import argparse
        parser = argparse.ArgumentParser(description="Scrape articles from master index using Selenium.")
        parser.add_argument('--day', type=str, help="Filter articles by day (YYYY-MM-DD). Defaults to today UTC.")
        parser.add_argument('--hour', type=int, help="Filter articles by hour (0-23). Defaults to current UTC hour.")
        args = parser.parse_args()

        if args.hour is not None and not (0 <= args.hour <= 23):
            print("⚠️ Hour must be between 0 and 23. Exiting.")
        else:
            main(day=args.day, hour=args.hour)


⚠️ Detected Jupyter environment. Running with default settings.
🔎 Filtering articles published on: 2025-06-11 hour 17
🔎 Found 0 articles matching time filter.
🔎 Found 0 already scraped articles.
🚀 Found 0 articles to scrape.


0it [00:00, ?it/s]

⚠️ No new articles were scraped this run.





🔎 Found 0 already scraped articles.
🚀 Found 6350 articles to scrape.


  0%|          | 0/6350 [00:00<?, ?it/s]

: 

: 


## 🔍 Key Features

✅ Reads master index and logs.
✅ Detects unscraped articles efficiently.
✅ Keeps all relevant metadata from master index (redundant but practical).
✅ Appends results incrementally — safe for repeated runs.
✅ Easily pluggable into a scheduler or cron.

---

## 🛠️ Optional Improvements

🔸 **Rate Limiting**: Add `time.sleep(2)` between scrapes to avoid getting flagged by servers.
🔸 **Structured Logging**: Save logs or errors for debugging.
🔸 **Retry Mechanism**: Keep track of failed scrapes for future attempts.
🔸 **Scraping with driver.page\_source**: More reliable than Ctrl+C on some pages.

In [None]:
xx

In [11]:
import os
import glob
import pandas as pd

# Directory path
directory = '/home/matias/Documents/media_monitor/data/rss_slices/'

# Find all CSV files in the directory
file_pattern = os.path.join(directory, '*.csv')
csv_files = glob.glob(file_pattern)

print(f"Found {len(csv_files)} CSV files.")


Found 134 CSV files.


In [12]:
# List to hold DataFrames
dfs = []

for file in csv_files:
    df = pd.read_csv(file)
    dfs.append(df)

# Concatenate all DataFrames
combined_df = pd.concat(dfs, ignore_index=True)


In [13]:
# Drop duplicates on all columns
deduped_df = combined_df.drop_duplicates()

# OR drop duplicates based on specific columns
deduped_df = combined_df.drop_duplicates(subset=['Title', 'Source'])


In [14]:
deduped_df.shape

(6350, 7)

In [15]:
deduped_df['day'] = pd.to_datetime(deduped_df['Published'], format='mixed').dt.strftime('%Y-%m-%d')

deduped_df.groupby('day').size().tail(30)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  deduped_df['day'] = pd.to_datetime(deduped_df['Published'], format='mixed').dt.strftime('%Y-%m-%d')


day
2025-05-13      1
2025-05-14      7
2025-05-15      3
2025-05-16      2
2025-05-17      1
2025-05-18      4
2025-05-19      1
2025-05-20      6
2025-05-21      5
2025-05-22      1
2025-05-23      9
2025-05-24      9
2025-05-25     18
2025-05-26     92
2025-05-27     94
2025-05-28    246
2025-05-29    452
2025-05-30    473
2025-05-31    258
2025-06-01    222
2025-06-02    466
2025-06-03    498
2025-06-04    546
2025-06-05    550
2025-06-06    499
2025-06-07    277
2025-06-08    204
2025-06-09    483
2025-06-10    483
2025-06-11    340
dtype: int64

In [16]:
links = deduped_df.sort_values('day').tail(20)['Link'].values
links = list(links)


In [None]:
import os
import time
import json
import pyperclip
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# links = [
#     "https://example.com/page1",
#     "https://example.com/page2",
#     # ...
# ]

output_file = 'scraped_links.jsonl'
sleep_time = 5  # seconds
page_timeout = 15  # seconds

scraped_results = []

for idx, url in enumerate(tqdm(links)):
    options = Options()
    options.add_argument("--start-maximized")
    # options.add_argument("--headless")  # Uncomment to run without UI

    driver = None
    try:
        driver = webdriver.Chrome(options=options)
        driver.set_page_load_timeout(page_timeout)

        driver.get(url)
        time.sleep(sleep_time)  # Let dynamic content load

        body = driver.find_element(By.TAG_NAME, "body")
        body.send_keys(Keys.CONTROL, 'a')
        body.send_keys(Keys.CONTROL, 'c')
        time.sleep(1)  # Clipboard delay

        scraped_html = pyperclip.paste()

        scraped_results.append({
            'index': idx,
            'url': url,
            'scraped_data': scraped_html
        })
        print(f"✅ Fetched data from: {url}")

    except Exception as e:
        print(f"⚠️ Error scraping URL: {url}\n{e}")

    finally:
        if driver:
            driver.quit()

# Save results
with open(output_file, 'w', encoding='utf-8') as f:
    for record in scraped_results:
        json.dump(record, f, ensure_ascii=False)
        f.write('\n')

print(f"Done! {len(scraped_results)} pages saved to {output_file}.")


  5%|▌         | 1/20 [00:09<03:06,  9.83s/it]

✅ Fetched data from: https://news.google.com/rss/articles/CBMi3gFBVV95cUxPZVhqQW1JV1BkM3MwaUs0SzJnZjFxWG1iRHliTl9HRV9qVHlEUUM0RW1KbXpQWlV1cnAzc2oxNHdEcmZGOUZTMWMyR000V2hYOHNQUlczTVJ3Ty1GaF9XZ0daaGI4ZGQ1THhwTzdmTndoNzY4SnNXNzZDUDdfZHhqdUFESFJPRjNYcFhTaFdRajJaYnhRbC1ma2QwejB0RlhIcVdRdXltWmMyYUhMMHdNQ0hBS2hxZE5SamR0MTVGTm1BUjE3bkFrbkNma1FFZnZ4cjlDbkVJSkJNX3FnUGfSAd4BQVVfeXFMT2VYakFtSVdQZDNzMGlLNEsyZ2YxcVhtYkR5Yk5fR0VfalR5RFFDNEVtSm16UFpVdXJwM3NqMTR3RHJmRjlGUzFjMkdNNFdoWDhzUFJXM01Sd08tRmhfV2dHWmhiOGRkNUx4cE83Zk53aDc2OEpzVzc2Q1A3X2R4anVBREhST0YzWHBYU2hXUWoyWmJ4UWwtZmtkMHowdEZYSHFXUXV5bVpjMmFITDB3TUNIQUtocWROUmpkdDE1Rk5tQVIxN25Ba25DZmtRRWZ2eHI5Q25FSUpCTV9xZ1Bn?oc=5
✅ Fetched data from: https://news.google.com/rss/articles/CBMiwwFBVV95cUxQS0hCQ1J1UWl5SzN2QjQyel9XczMzTTg5TmhOeUVZLXFHT0o4NGlEclYtb0k2d0RoajVfMXk4VmwtTDFnTm1hSVJWVW1kdDh2b25EdjlLWkRtcW9pYkFnbU9SMFVNN2hUX1lWZGRPcEJZN21sMDJvRmZqMTkzNTY5TTVLWWhvU1p6THl1Qks0ZWdqbUg4Y3NTaFF4ZjVxN1pUX2JXUGJDSVdDcXNFck1lRVpxYXdsQmYzWUQ5Wk1CUGc3d1nSAcgBQV

 10%|█         | 2/20 [00:35<05:41, 18.98s/it]