In [2]:
!pip install requests beautifulsoup4 PyPDF2



Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [None]:
import re
import time
import unicodedata
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

def get_soup(url, timeout=30):
    """Fetch ``url`` with a GET request and parse the response body as HTML.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block the scrape forever.

    Returns:
        A BeautifulSoup tree built with the "html5lib" parser, or None when the
        request fails (connection error, timeout, or an HTTP error status).
    """
    try:
        r = requests.get(url, allow_redirects=True, timeout=timeout)
        # Turn 4xx/5xx responses into exceptions so they take the error path below.
        r.raise_for_status()
        print(f"Successfully fetched {r.url}")
        return BeautifulSoup(r.content, "html5lib")
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

def is_decade_page(url):
    """Tell whether ``url`` points at a decade selection page.

    A decade page is recognised by eight consecutive digits directly after
    ``/study/general-conference/`` (for example ``.../20102019``).
    """
    decade_match = re.search(r"/study/general-conference/\d{4}\d{4}", url)
    return decade_match is not None

def scrape_conference_pages(main_page_url):
    """Retrieve a list of URLs for each conference (year/month) from the main page.

    Links on the main page that point at a decade selection page are followed
    one level down and replaced by the conference links found on that page.

    Args:
        main_page_url: URL of the general-conference landing page.

    Returns:
        A list of absolute conference URLs, without duplicates and in the order
        they were discovered. Empty when the main page cannot be fetched.
    """
    base_url = "https://www.churchofjesuschrist.org"
    # Both patterns are anchored at the end of the path (optional "/", then
    # "?", "#" or end of string) so that deeper URLs such as individual talks
    # (".../2024/04/<talk>") are not mistaken for conference index pages.
    index_pattern = re.compile(
        r"/study/general-conference/(\d{4}/(04|10)|\d{4}\d{4})/?(?:[?#]|$)"
    )
    conference_pattern = re.compile(
        r"/study/general-conference/\d{4}/(04|10)/?(?:[?#]|$)"
    )

    soup = get_soup(main_page_url)
    if soup is None:
        print(f"Failed to fetch content from {main_page_url}")
        return []

    # urljoin handles both site-relative and already-absolute hrefs; plain
    # concatenation would produce a malformed URL for the latter.
    links = [
        urljoin(base_url, a["href"])
        for a in soup.find_all("a", href=True)
        if index_pattern.search(a["href"])
    ]

    all_conference_links = []
    for link in links:
        if not is_decade_page(link):
            all_conference_links.append(link)
            continue

        # Decade page: collect the individual conference links it lists.
        decade_soup = get_soup(link)
        if decade_soup is None:
            continue
        all_conference_links.extend(
            urljoin(base_url, a["href"])
            for a in decade_soup.find_all("a", href=True)
            if conference_pattern.search(a["href"])
        )

    # Drop repeated links while keeping first-seen order, so that no
    # conference is scraped more than once.
    all_conference_links = list(dict.fromkeys(all_conference_links))

    print(f"Total conference links found: {len(all_conference_links)}")
    print("Sample conference links:", all_conference_links[:5])
    return all_conference_links

def scrape_talk_urls(conference_url):
    """Retrieve a list of URLs for each talk in a specific conference.

    Args:
        conference_url: URL of one conference (year/month) index page.

    Returns:
        A list of absolute talk URLs in the order they appear on the page,
        without duplicates and without links ending in "session?lang=eng".
        Empty when the conference page cannot be fetched.
    """
    base_url = "https://www.churchofjesuschrist.org"
    talk_pattern = re.compile(r"/study/general-conference/\d{4}/(04|10)/.*")

    soup = get_soup(conference_url)
    if soup is None:
        return []

    # urljoin copes with both site-relative and already-absolute hrefs.
    candidates = (
        urljoin(base_url, a["href"])
        for a in soup.find_all("a", href=True)
        if talk_pattern.search(a["href"])
    )

    # dict.fromkeys removes duplicates but, unlike set(), keeps page order, so
    # the result (and the files built from it) is the same on every run.
    talk_links = [
        link
        for link in dict.fromkeys(candidates)
        if not link.endswith("session?lang=eng")
    ]

    print(f"Found {len(talk_links)} talk links in {conference_url}")
    if talk_links:
        print("Sample talk links:", talk_links[:3])
    return talk_links

def scrape_talk_data(url):
    """Collect the fields of a single talk page into a dictionary.

    The result holds the keys "title", "speaker", "calling", "conference",
    "year", "season", "url", "talk" and "footnotes". Any element missing from
    the page is replaced by a "No ... Found" placeholder string. An empty dict
    is returned when the page cannot be fetched or any step raises.
    """
    try:
        soup = get_soup(url)
        if soup is None:
            return {}

        def text_or(tag, fallback):
            # Stripped text of the tag, or the placeholder when nothing was found.
            return tag.text.strip() if tag else fallback

        title = text_or(soup.find("h1", {"id": "title1"}), "No Title Found")
        conference = text_or(soup.find("p", {"class": "subtitle"}), "No Conference Found")
        speaker = text_or(soup.find("p", {"class": "author-name"}), "No Speaker Found")
        calling = text_or(soup.find("p", {"class": "author-role"}), "No Calling Found")

        # Talk body: every paragraph inside the body block, blank-line separated.
        body = soup.find("div", {"class": "body-block"})
        if body:
            content = "\n\n".join(p.text.strip() for p in body.find_all("p"))
        else:
            content = "No Content Found"

        # Footnotes are numbered from 1 in document order.
        notes = soup.find_all("li", {"class": "study-note"})
        if notes:
            footnotes = "\n".join(
                f"{idx}. {note.text.strip()}" for idx, note in enumerate(notes, start=1)
            )
        else:
            footnotes = "No Footnotes Found"

        # Year is the first four-digit path segment; anything that is not an
        # "/04/" URL is labelled October.
        year = re.search(r'/(\d{4})/', url).group(1)
        if "/04/" in url:
            season = "April"
        else:
            season = "October"

        record = {
            "title": title,
            "speaker": speaker,
            "calling": calling,
            "conference": conference,
            "year": year,
            "season": season,
            "url": url,
            "talk": content,
            "footnotes": footnotes,
        }
        return record
    except Exception as e:
        print(f"Failed to scrape {url}: {e}")
        return {}

def main_scrape_process():
    """Run the full scrape and write the results to disk.

    Discovers every conference page, gathers the talk URLs of each one,
    scrapes every talk, cleans the string values, and saves the table as
    'conference_talks.csv' and 'conference_talks.json'.
    """
    main_url = "https://www.churchofjesuschrist.org/study/general-conference?lang=eng"

    # Stage 1: conference pages -> flat list of talk URLs.
    talk_urls = []
    for conference_url in scrape_conference_pages(main_url):
        talk_urls += scrape_talk_urls(conference_url)

    total = len(talk_urls)
    print(f"Total talks found: {total}")

    # Stage 2: scrape each talk, keeping only non-empty results.
    records = []
    for position, url in enumerate(talk_urls, start=1):
        print(f"Scraping talk {position}/{total}: {url}")
        record = scrape_talk_data(url)
        if record:
            records.append(record)

    conference_df = pd.DataFrame(records)

    def clean(value):
        # Strings are NFD-normalised and stripped of tab characters;
        # every other value passes through untouched.
        if not isinstance(value, str):
            return value
        return unicodedata.normalize("NFD", value).replace("\t", "")

    for col in conference_df.columns:
        conference_df[col] = conference_df[col].apply(clean)

    # Stage 3: persist in both formats.
    conference_df.to_csv("conference_talks.csv", index=False)
    print("Scraping complete. Data saved to 'conference_talks.csv'.")

    conference_df.to_json("conference_talks.json", orient="records", indent=4)
    print("Data also saved to 'conference_talks.json'.")

# Time the whole scrape. perf_counter() is a monotonic clock meant for
# measuring intervals, so the result is not distorted if the system clock is
# adjusted during this long-running, network-bound job.
start = time.perf_counter()
main_scrape_process()
end = time.perf_counter()
print(f"Total time taken: {end - start} seconds")

Successfully fetched https://www.churchofjesuschrist.org/study/general-conference?lang=eng
Successfully fetched https://www.churchofjesuschrist.org/study/general-conference/20102019?lang=eng
Successfully fetched https://www.churchofjesuschrist.org/study/general-conference/20002009?lang=eng
Successfully fetched https://www.churchofjesuschrist.org/study/general-conference/19901999?lang=eng
Successfully fetched https://www.churchofjesuschrist.org/study/general-conference/19801989?lang=eng
Successfully fetched https://www.churchofjesuschrist.org/study/general-conference/19711979?lang=eng
Total conference links found: 107
Sample conference links: ['https://www.churchofjesuschrist.org/study/general-conference/2024/04?lang=eng', 'https://www.churchofjesuschrist.org/study/general-conference/2023/10?lang=eng', 'https://www.churchofjesuschrist.org/study/general-conference/2023/04?lang=eng', 'https://www.churchofjesuschrist.org/study/general-conference/2022/10?lang=eng', 'https://www.churchofjesu

In [None]:
# google.colab only exists inside Google Colab, so the import stays in this
# cell and is guarded: elsewhere the cell reports where the file is instead of
# raising and breaking a full "Run All".
try:
    from google.colab import files
except ImportError:
    print("Not running in Google Colab; 'conference_talks.csv' is in the working directory.")
else:
    # Ask the browser to download the CSV written by main_scrape_process().
    files.download('conference_talks.csv')
