In [1]:
#pip install python-dotenv

First, mount Google Drive and import the libraries.

In [2]:
# First, mount Google Drive
# BASE_FOLDER in the settings cell lives under /content/drive, so the mount
# has to be in place before any snapshot folder is created or read.
from google.colab import drive
drive.mount('/content/drive')

import requests
from bs4 import BeautifulSoup
import smtplib
from email.mime.text import MIMEText
import os
import hashlib
import pandas as pd
import datetime
import shutil
import re
from urllib.parse import urlparse

Mounted at /content/drive


Settings


In [None]:
# Path settings
BASE_FOLDER = "/content/drive/MyDrive/Colab_Notebooks/job_checker" # Change this to your desired path
STORAGE_FOLDER = f"{BASE_FOLDER}/career_pages"  # current page snapshots (.txt files)
ARCHIVE_FOLDER = f"{STORAGE_FOLDER}/archive"    # older snapshots are moved here

# Ensure all folders exist
for folder in [STORAGE_FOLDER, ARCHIVE_FOLDER]:
    if not os.path.exists(folder):
        # exist_ok=True tolerates the folder appearing between the check above
        # and the creation call.
        os.makedirs(folder, exist_ok=True)
        print(f"Created directory: {folder}")

# Source sheet and e-mail settings.
# Each value can be provided through an environment variable so that the
# address and app password do not have to be typed into (and saved with) the
# notebook. When a variable is not set, the placeholder below is used, so
# editing the placeholders directly still works.
# NOTE(review): the sheet/tab name is part of a URL - a tab name containing
# spaces presumably needs to be URL-encoded (%20); confirm with your sheet.
GSHEET_CSV_URL = os.environ.get(
    "JOB_CHECKER_GSHEET_CSV_URL",
    "https://docs.google.com/spreadsheets/d/ALPHANUMERIC CODE OF YOUR GSHEET /gviz/tq?tqx=out:csv&sheet=NAME OF YOUR TAB",
)
YOUR_EMAIL = os.environ.get("JOB_CHECKER_EMAIL", "YOUR EMAIL")
YOUR_EMAIL_PASSWORD = os.environ.get(
    "JOB_CHECKER_EMAIL_PASSWORD",
    "APP PASSWORD OF YOUR EMAIL (NOT YOUR NORMAL EMAIL PASSWORD)",
)
SEND_TO_EMAIL = os.environ.get("JOB_CHECKER_SEND_TO_EMAIL", "YOUR EMAIL")



In [None]:

def get_domain_from_url(url):
    """Turn the host part of ``url`` into a short, filename-safe label.

    Steps: take the network location of the URL, drop every occurrence of
    ``"www."``, replace each character that is not an ASCII letter or digit
    with ``"_"``, and keep at most the first 25 characters.

    Returns:
        str: the sanitised label (empty when the URL has no network location).
    """
    host = urlparse(url).netloc
    host_without_www = host.replace("www.", "")
    safe_label = re.sub(r'[^a-zA-Z0-9]', '_', host_without_www)
    # Slicing is a no-op for labels that are already 25 characters or fewer.
    return safe_label[:25]


# This function creates unique, consistent filenames no matter how long or complex the URL is:
# special URL characters (like '/', '?', '&') never reach the filename, and its length stays bounded.
def get_filename_for_url(url, date_str=None):
    """Build the snapshot path for ``url`` on a given day.

    The file name has the form ``<domain>_<hash>_<date>.txt`` where
    ``<domain>`` comes from ``get_domain_from_url`` and ``<hash>`` is the
    first 8 hex digits of the MD5 of the full URL.

    Args:
        url: page address the snapshot belongs to.
        date_str: date tag to embed; today's date (``YYYY-MM-DD``) when None.

    Returns:
        str: path of the snapshot file inside ``STORAGE_FOLDER``.
    """
    stamp = date_str
    if stamp is None:
        stamp = datetime.datetime.now().strftime("%Y-%m-%d")

    readable_part = get_domain_from_url(url)
    # Short digest keeps names compact while still separating URLs on the same host.
    short_digest = hashlib.md5(url.encode()).hexdigest()[:8]

    basename = f"{readable_part}_{short_digest}_{stamp}.txt"
    return os.path.join(STORAGE_FOLDER, basename)



def get_yesterday_file(url):
    """Locate the snapshot to compare against for ``url``.

    Yesterday's snapshot is preferred. When it does not exist, the snapshot
    in ``STORAGE_FOLDER`` whose name sorts last among those sharing this
    URL's ``<domain>_<hash>_`` prefix is used instead (this may be a file
    dated today). Only ``STORAGE_FOLDER`` itself is searched.

    Returns:
        str | None: path of the chosen snapshot, or None when none exists.
    """
    one_day_ago = datetime.datetime.now() - datetime.timedelta(days=1)
    day_tag = one_day_ago.strftime("%Y-%m-%d")
    label = get_domain_from_url(url)
    digest = hashlib.md5(url.encode()).hexdigest()[:8]
    prefix = f"{label}_{digest}_"

    # Preferred: the snapshot taken exactly one day ago.
    exact_path = os.path.join(STORAGE_FOLDER, f"{prefix}{day_tag}.txt")
    if os.path.exists(exact_path):
        return exact_path

    # Fallback: any snapshot of the same URL; names end in an ISO date, so
    # the lexicographically largest name is the most recent one.
    candidates = [
        name
        for name in os.listdir(STORAGE_FOLDER)
        if name.startswith(prefix) and name.endswith(".txt")
    ]
    if not candidates:
        return None
    return os.path.join(STORAGE_FOLDER, max(candidates))


def archive_old_files(days_threshold=4):
    """Move old snapshots from ``STORAGE_FOLDER`` to ``ARCHIVE_FOLDER``.

    A snapshot is archived when its date tag is at least ``days_threshold``
    days old AND a newer snapshot with the same ``<domain>_<hash>`` prefix is
    still present. The newest snapshot of every URL is always kept in place:
    ``get_yesterday_file`` only searches ``STORAGE_FOLDER``, so archiving the
    last remaining snapshot of a page that has not changed for a few days
    would make the next run treat that page as a first-time check and report
    it as updated.

    Files that do not end in ``_YYYY-MM-DD.txt`` are ignored.

    Args:
        days_threshold: minimum age, in days, for a snapshot to be archived.
    """
    today = datetime.datetime.now()

    # Pass 1: parse the date tag of every snapshot and remember the newest
    # date seen for each URL prefix.
    dated_snapshots = []
    newest_date_for = {}
    for filename in os.listdir(STORAGE_FOLDER):
        if not filename.endswith(".txt"):
            continue
        url_key, _, date_str = filename[:-len(".txt")].rpartition("_")
        try:
            file_date = datetime.datetime.strptime(date_str, "%Y-%m-%d")
        except ValueError:
            # Skip files that don't match our naming pattern
            continue
        dated_snapshots.append((filename, url_key, file_date))
        if url_key not in newest_date_for or file_date > newest_date_for[url_key]:
            newest_date_for[url_key] = file_date

    # Pass 2: archive everything that is old enough, except each URL's
    # newest snapshot (the comparison baseline for the next run).
    archived_count = 0
    for filename, url_key, file_date in dated_snapshots:
        if file_date >= newest_date_for[url_key]:
            continue
        if (today - file_date).days >= days_threshold:
            os.makedirs(ARCHIVE_FOLDER, exist_ok=True)
            shutil.move(
                os.path.join(STORAGE_FOLDER, filename),
                os.path.join(ARCHIVE_FOLDER, filename),
            )
            archived_count += 1

    if archived_count > 0:
        print(f"Archived {archived_count} files older than {days_threshold} days")

def load_urls_from_gsheet_csv(gsheet_csv_url):
    """Load the list of URLs to check from the first column of a CSV.

    The first CSV row is treated as the header by ``pandas.read_csv`` and is
    therefore not part of the result. Empty cells are dropped, surrounding
    whitespace is stripped, and repeated URLs are returned only once (first
    occurrence wins) so a page listed twice is not checked or reported twice.

    Args:
        gsheet_csv_url: anything ``pandas.read_csv`` accepts (URL, path or
            file-like object).

    Returns:
        list[str]: cleaned URLs in sheet order; ``[]`` if loading fails.
    """
    try:
        df = pd.read_csv(gsheet_csv_url)
        first_column = df.iloc[:, 0].dropna()  # Only column A
    except Exception as e:
        # Best effort: the caller treats an empty list as "nothing to do".
        print(f"[ERROR] Could not load data from Google Sheets: {e}")
        return []

    urls = []
    seen = set()
    for cell in first_column:
        url = str(cell).strip()
        if not url or url in seen:
            continue
        seen.add(url)
        urls.append(url)
    return urls


def fetch_page_text(url):
    """Download ``url`` and return the text content of its HTML.

    The request is sent with a desktop-browser User-Agent and a 10 second
    timeout. Any failure (request error, HTTP error status, parsing error)
    is printed and reported to the caller as None.

    Returns:
        str | None: text extracted by BeautifulSoup, or None on failure.
    """
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        page = requests.get(url, timeout=10, headers=browser_headers)
        page.raise_for_status()  # 4xx/5xx responses count as failures
        return BeautifulSoup(page.text, 'html.parser').get_text()
    except Exception as err:
        print(f"[ERROR] Could not fetch {url}: {err}")
    return None

def send_email(subject, body, is_html=False):
    """Send one e-mail from ``YOUR_EMAIL`` to ``SEND_TO_EMAIL``.

    The message is delivered through ``smtp.gmail.com`` on port 465 (SSL),
    logging in with ``YOUR_EMAIL`` / ``YOUR_EMAIL_PASSWORD``.

    Args:
        subject: subject line.
        body: message text.
        is_html: send ``body`` as HTML instead of plain text.

    Returns:
        None. Failures are printed, never raised.
    """
    try:
        # 'plain' is MIMEText's default subtype.
        message = MIMEText(body, 'html' if is_html else 'plain')
        message["Subject"] = subject
        message["From"] = YOUR_EMAIL
        message["To"] = SEND_TO_EMAIL

        with smtplib.SMTP_SSL("smtp.gmail.com", 465) as smtp:
            smtp.login(YOUR_EMAIL, YOUR_EMAIL_PASSWORD)
            smtp.send_message(message)
        print("[EMAIL SENT] 🚀")
    except Exception as err:
        print(f"[EMAIL ERROR] {err}")

def _load_snapshot(path):
    """Read a stored snapshot back exactly as it was written."""
    # newline="" turns off universal-newline translation. Without it "\r\n"
    # and "\r" in the file come back as "\n", so a page whose text contains
    # carriage returns would never compare equal to its own snapshot.
    with open(path, "r", encoding="utf-8", errors="replace", newline="") as f:
        return f.read()


def _save_snapshot(path, text):
    """Write ``text`` to ``path`` as UTF-8 without newline translation."""
    with open(path, "w", encoding="utf-8", newline="") as f:
        f.write(text)


def main():
    """Run one full check: archive, fetch every page, compare, e-mail.

    For every URL from the Google Sheet the current page text is compared
    with the snapshot returned by ``get_yesterday_file``. A page counts as
    changed when the texts differ or when no earlier snapshot exists; only
    then is a snapshot for today written. Pages that cannot be fetched are
    skipped. One e-mail is sent at the end, either listing the changed URLs
    or saying that nothing changed.
    """
    # First, archive old files
    archive_old_files(days_threshold=4)

    # Then load URLs and check for changes
    career_urls = load_urls_from_gsheet_csv(GSHEET_CSV_URL)
    if not career_urls:
        print("[FATAL] No URLs loaded from Google Sheet. Exiting.")
        return

    # Track changes for email report
    changed_urls = []

    today = datetime.datetime.now().strftime("%Y-%m-%d")

    for url in career_urls:
        print(f"\n Checking: {url}")
        new_content = fetch_page_text(url)
        if new_content is None:
            continue

        # Get today's filename
        current_filename = get_filename_for_url(url, today)

        # Find yesterday's or most recent previous file
        previous_file = get_yesterday_file(url)

        if previous_file and os.path.exists(previous_file):
            if new_content == _load_snapshot(previous_file):
                print("🤷‍♀️ No change.")
                continue
            print("✅ Change detected!")
        else:
            print("First-time check or no recent file found.")

        # Reached for changed pages and first-time checks alike.
        changed_urls.append(url)
        _save_snapshot(current_filename, new_content)

    # Send email report
    if changed_urls:
        # Create super simple email with just the URLs
        body = "These career pages have updates:\n\n"
        body += "".join(f"{url}\n" for url in changed_urls)
        body += "\n\nCopy the list in gsheet and open them all at once"

        subject = f"🌸 {len(changed_urls)} Career Page{'s' if len(changed_urls) > 1 else ''} Updated!"
    else:
        body = "Nothing changed"
        subject = "All the same "
    send_email(subject=subject, body=body)

# Run the check when this cell/script is executed directly; importing the
# code as a module does not trigger a run.
if __name__ == "__main__":
    main()


 Checking: https://jobs.orbisk.com/vacancy
🤷‍♀️ No change.

 Checking: https://eden-projects.rippling-ats.com/
🤷‍♀️ No change.

 Checking: https://careers.southpole.com/jobs
🤷‍♀️ No change.

 Checking: https://nadara.wd3.myworkdayjobs.com/External
🤷‍♀️ No change.

 Checking: https://www.solarpowereurope.org/about/careers
🤷‍♀️ No change.

 Checking: https://eit-culture-creativity.eu/about-us/career-opportunities/
🤷‍♀️ No change.

 Checking: https://careers.landlifecompany.com/?_gl=1*gwgcls*_gcl_au*OTM1NDQ4MzU4LjE3NDU5Mjg4NzA.
🤷‍♀️ No change.

 Checking: https://recruiting.ultipro.com/RAI1015FORES/JobBoard/7a1c3d86-f0fa-4e0e-a501-dcfedd4f7d8c/?q=&o=postedDateDesc
🤷‍♀️ No change.

 Checking: https://ats.rippling.com/en-GB/trees/jobs?searchQuery=&workplaceType=&country=&state=&city=&page=0
🤷‍♀️ No change.

 Checking: https://everwave.de/en/team/
🤷‍♀️ No change.

 Checking: https://careers.samara.energy/jobs?location_id=690304
🤷‍♀️ No change.

 Checking: https://climatekic.recruitee.com/
🤷

Functions