# Test if URLs are still live

In [None]:
import os

import pandas as pd
import requests
from dotenv import load_dotenv
from sqlalchemy import create_engine

## Make an HTTP request to the URL to check if it is still live

In [None]:
def is_website_online(url: str) -> tuple[bool, str]:
    """
    Check if a website is online by sending an HTTP GET request.

    The URL is first tried exactly as given. If it does not already start
    with ``http://`` or ``https://``, it is retried with ``https://`` and
    then ``http://`` prepended.

    Args:
        url: The URL (with or without a scheme) to probe.

    Returns:
        A ``(is_online, tried_url)`` tuple. ``is_online`` is True when the
        first request that got a response has a status code in [200, 400).
        ``tried_url`` is the URL that produced that response, or the
        original ``url`` when every attempt raised a request error.
    """
    if url.lower().startswith(("http://", "https://")):
        # Prefixing another scheme would only build a malformed URL such as
        # "https://https://..." and waste time on a request that cannot work.
        candidates = (url,)
    else:
        candidates = (url, "https://" + url, "http://" + url)

    for full_url in candidates:
        try:
            # stream=True avoids downloading the body: only the status line is
            # needed. The context manager releases the connection afterwards.
            with requests.get(full_url, timeout=5, stream=True) as response:
                return 200 <= response.status_code < 400, full_url
        except requests.exceptions.RequestException:
            continue  # Try the next candidate
    return False, url

## Load the URLs from the database

In [None]:
# Load variables from a .env file (if any) into the process environment.
load_dotenv()

DATABASE_URL = os.getenv("DATABASE_URL")
if not DATABASE_URL:
    # Fail early with an actionable message instead of a less obvious error
    # raised from inside create_engine when it is handed None.
    raise RuntimeError(
        "DATABASE_URL is not set; define it in the environment or in a .env file"
    )

# The "-c timezone=utc" option is forwarded to the database driver.
# NOTE(review): presumably a PostgreSQL session setting — confirm against the driver in use.
engine = create_engine(DATABASE_URL, connect_args={"options": "-c timezone=utc"})

# Read the whole "url" table, indexed by its "id" column, parsing both
# timestamp columns as datetimes.
df = pd.read_sql_table("url", engine, index_col="id", parse_dates=["created_at", "updated_at"])

In [None]:
# Peek at the first rows to sanity-check the columns that were loaded.
df.head()

Unnamed: 0_level_0,source_id,url,is_phishing,is_online,created_at,updated_at
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,https://www.southbankmosaics.com,False,False,2024-11-15 12:44:55.549064,2024-11-15 12:44:55.549064
2,1,https://www.uni-mainz.de,False,False,2024-11-15 12:44:55.549064,2024-11-15 12:44:55.549064
3,1,https://www.voicefmradio.co.uk,False,False,2024-11-15 12:44:55.549064,2024-11-15 12:44:55.549064
4,1,https://www.sfnmjournal.com,False,False,2024-11-15 12:44:55.549064,2024-11-15 12:44:55.549064
5,1,https://www.rewildingargentina.org,False,False,2024-11-15 12:44:55.549064,2024-11-15 12:44:55.549064


## Apply the function to the URLs

In [None]:
def apply_is_website_online(row: pd.Series) -> pd.Series:
    """
    Row-wise callback for ``DataFrame.apply(..., axis=1)``.

    Probes the row's ``url`` with ``is_website_online`` and returns a copy of
    the row with ``is_online`` and ``url`` replaced by the probe result and
    ``updated_at`` set to the current time.

    Args:
        row: One row of the URL DataFrame; must contain a ``url`` entry.

    Returns:
        A new Series with the refreshed ``is_online``, ``url`` and
        ``updated_at`` values; the input row is left untouched.
    """
    # Work on a copy: pandas does not support functions that mutate the
    # object handed to them by apply().
    updated = row.copy()
    updated["is_online"], updated["url"] = is_website_online(row["url"])
    # NOTE(review): this is a naive local timestamp, while the engine session
    # is configured with timezone=utc — confirm which one the column expects.
    updated["updated_at"] = pd.Timestamp.now()
    return updated

In [None]:
# Split the DataFrame into small chunks, both to limit memory use and to avoid
# losing progress if the process is interrupted.
chunk_size = 10
chunk_starts = range(0, len(df), chunk_size)
chunks = [df.iloc[start:start + chunk_size] for start in chunk_starts]

In [None]:
# Number of chunks the DataFrame was split into.
len(chunks)

83332

In [None]:
# Index of the last chunk that has already been processed; the processing
# loop skips every chunk up to and including this index.
last_processed_chunk_index = 3999
# Processed chunks accumulated since the last CSV checkpoint.
processed_chunks = []

In [None]:
# Resume right after the last chunk that was already processed.
resume_from = max(last_processed_chunk_index + 1, 0)
for i, chunk in enumerate(chunks[resume_from:], start=resume_from):
    processed_chunk = chunk.apply(apply_is_website_online, axis=1)
    processed_chunks.append(processed_chunk)
    last_processed_chunk_index = i
    print(f"Processed chunk {i}")

    # Only every thousandth chunk triggers a CSV checkpoint.
    if (i + 1) % 1000 != 0:
        continue

    checkpoint = pd.concat(processed_chunks)
    checkpoint.to_csv(f'../data/processed_urls_{last_processed_chunk_index}.csv', index=True)
    processed_chunks = []  # Start a fresh buffer for the next checkpoint


KeyboardInterrupt: 

In [None]:
# Show the index of the last fully processed chunk, i.e. where a later run should resume.
print(last_processed_chunk_index)

4099


In [None]:
# Flush whatever was processed since the last checkpoint, then empty the buffer.
if processed_chunks:
    remainder = pd.concat(processed_chunks)
    remainder.to_csv(f'../data/processed_urls_{last_processed_chunk_index}.csv', index=True)
    processed_chunks = []

In [None]:
# Sanity check: the buffer should be empty once the remainder has been written.
len(processed_chunks)

0