In [1]:
import json
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re
import time

def remove_non_ascii(text):
    """Return *text* with every run of non-ASCII characters deleted.

    Characters in the range U+0000..U+007F are kept as they are; anything
    else is dropped (not replaced), so surrounding whitespace is preserved.
    """
    non_ascii_run = re.compile(r'[^\x00-\x7F]+')
    return non_ascii_run.sub('', text)

def clean_text(text):
    """Flatten newlines and drop parenthesised asides from *text*.

    Each newline becomes a single space, every ``(...)`` group (up to the
    first closing parenthesis) is removed, and leading/trailing whitespace
    is stripped. Interior runs of spaces are not collapsed.

    The previous version applied a second pattern, ``\\((^))*\\)``, which has
    an unbalanced closing parenthesis and made ``re`` raise ``re.error`` on
    every call. It duplicated the intent of the substitution below, so it
    has been removed.
    """
    # Newlines -> spaces so the text ends up on a single line.
    text = re.sub(r'\n', ' ', text)
    # Remove "(...)" groups; [^)]* stops at the first ')'.
    text = re.sub(r'\([^)]*\)', '', text)
    return text.strip()

def clean_content(content):
    """Strip references, numbers and punctuation noise from scraped text.

    The substitutions run in a fixed order, each deleting what it matches:

    1. Bible references of the form ``<Book> <chapter>:<verse>[-<verse>]``.
    2. Any remaining digits.
    3. Text inside square brackets.
    4. Text inside parentheses.
    5. The characters ``:``, ``;``, ``,`` and ``-``.
    6. Everything from the start of the string through the first
       "Summary" (case-insensitive). ``.`` does not cross newlines here,
       so this only fires when "Summary" is on the first line.
    7. Whitespace runs are collapsed to one space and the ends trimmed.

    Returns the cleaned string.
    """
    bible_books = "Genesis|Exodus|Leviticus|Numbers|Deuteronomy|Joshua|Judges|Ruth|1 Samuel|2 Samuel|1 Kings|2 Kings|1 Chronicles|2 Chronicles|Ezra|Nehemiah|Esther|Job|Psalms|Proverbs|Ecclesiastes|Song of Solomon|Isaiah|Jeremiah|Lamentations|Ezekiel|Daniel|Hosea|Joel|Amos|Obadiah|Jonah|Micah|Nahum|Habakkuk|Zephaniah|Haggai|Zechariah|Malachi|Matthew|Mark|Luke|John|Acts|Romans|1 Corinthians|2 Corinthians|Galatians|Ephesians|Philippians|Colossians|1 Thessalonians|2 Thessalonians|1 Timothy|2 Timothy|Titus|Philemon|Hebrews|James|1 Peter|2 Peter|1 John|2 John|3 John|Jude|Revelation"

    # Order matters: references must go before the bare-digit pass, and the
    # bracket/parenthesis passes must run before their contents lose
    # punctuation.
    deletion_passes = (
        r'\b(' + bible_books + r')\s+\d+[:]\d+(-\d+)?',  # Bible references
        r'\d+',                                           # all numbers
        r'\[.*?\]',                                       # [bracketed] text
        r'\(.*?\)',                                       # (parenthesised) text
        r'[:;,\-]',                                       # stray punctuation
    )
    for pattern in deletion_passes:
        content = re.sub(pattern, '', content)

    # Drop the introductory lead-in up to and including "Summary".
    content = re.sub(r'^(.*?\bSummary\b)', '', content, flags=re.IGNORECASE)

    # Normalise whitespace last so gaps left by the deletions collapse too.
    return re.sub(r'\s+', ' ', content).strip()

def final_clean(content):
    """Cut *content* at the first "Summary"/"Summarized" and trim it.

    The match is case-insensitive and, because ``.`` is allowed to span
    newlines, everything from that word to the end of the string is
    discarded. Text without such a word is only stripped of surrounding
    whitespace.
    """
    trailing_summary = re.compile(
        r'\b(Summary|Summarized).*$', flags=re.IGNORECASE | re.DOTALL
    )
    truncated = trailing_summary.sub('', content)
    return truncated.strip()

# Chrome WebDriver instance shared by scrape_category() and the script below.
driver = webdriver.Chrome()
# Home page of the site; the category links are read from this page.
base_url = "https://www.insightfulsermons.com/"
# Scraped posts keyed by post title; populated by scrape_category().
articles = {}

def scrape_category(url, category_name):
    """Load one category page and record every blog post found on it.

    Navigates the shared ``driver`` to *url*, waits up to 10 seconds for
    elements with class ``blog-post`` to be present, and for each post
    stores its cleaned text, link and *category_name* in the module-level
    ``articles`` dict under the post's (ASCII-only) title. A post whose
    title was already recorded is overwritten by the newer entry.
    """
    driver.get(url)
    posts_present = EC.presence_of_all_elements_located((By.CLASS_NAME, "blog-post"))
    for entry in WebDriverWait(driver, 10).until(posts_present):
        # The title link carries both the headline text and the post URL.
        link = entry.find_element(By.CSS_SELECTOR, ".blog-title-link.blog-link")
        heading = remove_non_ascii(link.text)
        target = link.get_attribute('href')

        # Body text is the space-joined text of the post's "paragraph" elements.
        raw_body = " ".join(
            [remove_non_ascii(block.text)
             for block in entry.find_elements(By.CLASS_NAME, "paragraph")]
        )

        articles[heading] = {
            "content": final_clean(clean_content(raw_body)),
            "url": target,
            "category": category_name
        }

# Scrape every category linked from the home page, then close the browser.
# The try/finally guarantees driver.quit() runs even if the initial
# navigation or the wait for the category list fails.
try:
    # Navigate to home page
    driver.get(base_url)
    wait = WebDriverWait(driver, 10)

    # Find category links
    category_links = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".blog-category-list a.blog-link")))

    # Snapshot URL and label while the home page is still loaded: the element
    # handles go stale once scrape_category() navigates away, and re-locating
    # them by index on a different page could pair the wrong link or run out
    # of range.
    categories = [
        (link.get_attribute('href'), link.text) for link in category_links
    ]

    # Scrape all categories; a failure in one is reported and skipped so the
    # remaining categories are still processed.
    for i, (category_url, category_name) in enumerate(categories):
        try:
            print(f"Scraping category: {category_name}")
            scrape_category(category_url, category_name)
            # Brief pause between categories to avoid hammering the site.
            time.sleep(2)
        except Exception as e:
            print(f"Error scraping category {i}: {str(e)}")
finally:
    driver.quit()

# Persist the raw scrape results (title -> article dict) as JSON
with open('sermon_data.json', 'w', encoding='utf-8') as json_out:
    json.dump(articles, json_out, ensure_ascii=False, indent=4)

# Write a human-readable digest: one record per article with a 200-character
# content preview, followed by an 80-dash separator line
with open('sermon_data.txt', 'w', encoding='utf-8') as text_out:
    for title, article in articles.items():
        text_out.writelines([
            f"Title: {title}\n",
            f"Category: {article['category']}\n",
            f"URL: {article['url']}\n",
            f"Content preview: {article['content'][:200]}...\n",
            "-" * 80 + "\n\n",
        ])

# Reshape each article into a {page_content, metadata} document
documents = [
    {
        "page_content": article['content'],
        "metadata": {"source": "sermon"}
    }
    for article in articles.values()
]

# Store the formatted documents as JSON
with open('documents_formatted.json', 'w', encoding='utf-8') as docs_out:
    json.dump(documents, docs_out, ensure_ascii=False, indent=4)

print(f"Total articles scraped: {len(articles)}")
print("Data saved to sermon_data.json, sermon_data.txt, and documents_formatted.json")


Scraping category: All
Scraping category: Anger & Forgiveness
Scraping category: Baptism & Communion
Scraping category: Christian Community
Scraping category: Confession
Scraping category: Coping With Suffering
Scraping category: Courage
Scraping category: Divisive Culture
Scraping category: Experience Change
Scraping category: Faith
Scraping category: Family & Friends
Scraping category: Grace
Scraping category: Holiness
Scraping category: Holy Spirit
Scraping category: Hope
Scraping category: Humble
Scraping category: Identity
Scraping category: Joy
Scraping category: Leadership
Scraping category: Love
Scraping category: Marriage
Scraping category: New Peace
Scraping category: New Purpose
Scraping category: Perseverance
Scraping category: Pray
Scraping category: Questions On Christianity
Scraping category: Sermon On The Mount
Scraping category: Sexuality
Scraping category: Stewardship
Scraping category: Strong Foundation
Scraping category: Submission
Scraping category: The Cross
Scrap