In [1]:
# Import necessary libraries
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import time
import pandas as pd
from webdriver_manager.chrome import ChromeDriverManager
import re
import datetime

class DiscordScraper:
    """
    Selenium-driven scraper for Discord's web client.

    Logs in with e-mail/password, opens a channel and collects the visible
    message parts while repeatedly scrolling the message list upwards.

    The selectors below are hashed class names copied from the page markup;
    if lookups start timing out, inspect the page and update them here.

    The scraper can be used as a context manager so the browser is closed
    even when scraping raises::

        with DiscordScraper() as scraper:
            scraper.login(email, password)
            df = scraper.scrape_channel(url, max_scrolls=20)
    """

    # Selectors used to locate elements in the channel view.
    MESSAGE_ITEM_CLASS = "messageListItem__5126c"
    USERNAME_CLASS = "username_c19a55"
    TIMESTAMP_CLASS = "timestamp_c19a55"
    # Two classes on one element: written as a CSS selector because
    # By.CLASS_NAME is intended for a single class name.
    CONTENT_SELECTOR = ".markup__75297.messageContent_c19a55"
    SCROLLER_CLASS = "scroller__36d07"

    # Number of pixels scrolled upwards per scroll operation.
    SCROLL_STEP_PX = 2000

    def __init__(self, headless=False):
        """
        Initialize the Discord scraper with browser configuration

        Args:
            headless (bool): Whether to run Chrome in headless mode
        """
        # Set up Chrome options
        self.options = Options()
        if headless:
            self.options.add_argument("--headless")
        self.options.add_argument("--window-size=1920,1080")
        self.options.add_argument("--disable-notifications")
        self.options.add_argument("--disable-infobars")
        self.options.add_argument("--mute-audio")

        # Initialize the Chrome driver (webdriver_manager resolves the driver binary)
        self.driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=self.options)

    def __enter__(self):
        """Return the scraper itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the browser on exit; exceptions from the block are not suppressed."""
        self.close()

    def login(self, email, password, login_timeout=30):
        """
        Log in to Discord

        Args:
            email (str): Discord email
            password (str): Discord password
            login_timeout (int): Seconds to wait for the browser to leave the
                login page after the form is submitted.

        Returns:
            bool: True if the browser left the login page within
            ``login_timeout`` seconds, False otherwise (a warning is printed
            and no exception is raised).

        Raises:
            TimeoutException: If the login form does not appear within 20 seconds.
        """
        # Navigate to Discord login page
        self.driver.get("https://discord.com/login")

        # Wait for the login form to load; the wait returns the located field
        email_field = WebDriverWait(self.driver, 20).until(
            EC.presence_of_element_located((By.NAME, "email"))
        )
        password_field = self.driver.find_element(By.NAME, "password")

        # Enter login credentials and submit the form
        email_field.send_keys(email)
        password_field.send_keys(password)
        password_field.submit()

        # Instead of sleeping a fixed time and assuming success, wait until the
        # URL no longer points at the login page.
        try:
            WebDriverWait(self.driver, login_timeout).until(
                lambda driver: "/login" not in driver.current_url
            )
        except TimeoutException:
            print(
                f"Still on the login page after {login_timeout} seconds. "
                "Check the credentials or complete any captcha / two-factor prompt."
            )
            return False

        print("Successfully logged in to Discord")
        return True

    def navigate_to_channel(self, channel_url):
        """
        Navigate to a specific Discord channel

        Args:
            channel_url (str): URL of the Discord channel

        Raises:
            TimeoutException: If no message list item appears within 30 seconds.
        """
        print(f"Navigating to {channel_url}")
        self.driver.get(channel_url)

        # Wait for at least one message list item to be present
        try:
            WebDriverWait(self.driver, 30).until(
                EC.presence_of_element_located((By.CLASS_NAME, self.MESSAGE_ITEM_CLASS))
            )
            print(f"Successfully navigated to {channel_url} and found message list items.")
        except TimeoutException:
            print(f"Timed out waiting for message list items on {channel_url}. Check if the channel loaded correctly or if class names changed.")
            raise

    def _read_message_block(self, item_element):
        """Return ``(author, timestamp, content_elements)`` for one message list item."""
        author = "Unknown"
        timestamp = None

        try:
            author = item_element.find_element(By.CLASS_NAME, self.USERNAME_CLASS).text
        except NoSuchElementException:
            # No username element in this block: keep author as "Unknown"
            pass

        try:
            time_element = item_element.find_element(
                By.CLASS_NAME, self.TIMESTAMP_CLASS
            ).find_element(By.TAG_NAME, "time")
            # Prefer the `datetime` attribute; fall back to the visible text if it is empty
            timestamp = time_element.get_attribute('datetime') or time_element.text
        except NoSuchElementException:
            # No timestamp element in this block: keep timestamp as None
            pass

        content_elements = item_element.find_elements(By.CSS_SELECTOR, self.CONTENT_SELECTOR)
        return author, timestamp, content_elements

    def _record_block(self, item_element, scraped_data, processed_message_tuples, max_messages):
        """Append one record per new, non-empty content part of a message list item."""
        author, timestamp, content_elements = self._read_message_block(item_element)

        # Blocks without content elements contribute nothing
        for content_element in content_elements:
            try:
                content = content_element.text
                if not content.strip():  # Skip empty content parts
                    continue

                # (author, timestamp, content) identifies a part across overlapping views
                message_tuple = (author, timestamp, content)
                if message_tuple in processed_message_tuples:
                    continue

                scraped_data.append({
                    "author": author,
                    "content": content,
                    "timestamp": timestamp
                })
                processed_message_tuples.add(message_tuple)

                # Print progress
                if len(scraped_data) % 20 == 0:
                    print(f"Scraped {len(scraped_data)} total unique message parts...")

                # Stop as soon as the requested number of records is reached
                if max_messages is not None and len(scraped_data) >= max_messages:
                    print(f"Reached maximum message limit ({max_messages}).")
                    break

            except Exception as e_inner:
                print(f"Error processing a content element: {e_inner}")
                # Keep a (de-duplicated) error record so the failure is visible in the output
                error_content = f"Error extracting content: {e_inner}"
                error_tuple = (author, timestamp, error_content)
                if error_tuple not in processed_message_tuples:
                    scraped_data.append({
                        "author": author,
                        "content": error_content,
                        "timestamp": timestamp
                    })
                    processed_message_tuples.add(error_tuple)

    def scrape_channel(self, channel_url, max_messages=None, max_scrolls=None, max_stale_scrolls=5):
        """
        Scrape messages from a Discord channel, iterating by message block.

        Each pass reads every message list item currently in the DOM, records
        the content parts not seen before, then scrolls the message list up by
        ``SCROLL_STEP_PX`` pixels. The view loaded by the last permitted scroll
        is scraped too.

        Args:
            channel_url (str): URL of the Discord channel
            max_messages (int, optional): Maximum number of unique message content parts to scrape.
            max_scrolls (int, optional): Maximum number of scroll operations.
            max_stale_scrolls (int, optional): Stop after this many consecutive
                views that produced no new records. This is what ends a run
                without limits (or with limits larger than the channel).
                Pass None to disable the check.

        Returns:
            pd.DataFrame: DataFrame containing scraped messages
            (columns ``author``, ``content``, ``timestamp``; empty if nothing was found).

        Raises:
            TimeoutException: If the channel's message list does not load.
        """
        # Navigate to the channel
        print(f"Navigating to channel: {channel_url}")
        self.navigate_to_channel(channel_url)

        scraped_data = []
        # Tuples (author, timestamp, content) already recorded, to avoid duplicates
        processed_message_tuples = set()
        scroll_count = 0
        stale_views = 0

        def limit_reached():
            return max_messages is not None and len(scraped_data) >= max_messages

        # Main scraping loop
        while not limit_reached():
            time.sleep(1)  # Allow time for content to potentially load after scroll

            # Find all message list item elements (the parent blocks)
            message_list_items = self.driver.find_elements(By.CLASS_NAME, self.MESSAGE_ITEM_CLASS)
            print(f"Found {len(message_list_items)} message list items on this view.")

            if not message_list_items and scroll_count > 0:
                print("No more message list items found, potentially reached the top.")

            records_before_view = len(scraped_data)
            for item_element in message_list_items:
                try:
                    self._record_block(item_element, scraped_data, processed_message_tuples, max_messages)
                except Exception as e_outer:
                    # A failing block (e.g. an element that went stale) must not abort the run
                    print(f"Error processing a message list item block: {e_outer}")

                if limit_reached():
                    break
            new_messages_found_in_scroll = len(scraped_data) - records_before_view

            if limit_reached():
                break

            # Track consecutive views without anything new; without this the loop
            # would keep scrolling forever once the top of the channel is reached.
            stale_views = 0 if new_messages_found_in_scroll else stale_views + 1
            if max_stale_scrolls is not None and stale_views >= max_stale_scrolls:
                print("No new messages found after scrolling. Possibly reached the beginning or end.")
                break

            if max_scrolls is not None and scroll_count >= max_scrolls:
                print("Reached maximum scroll limit.")
                break

            # Scroll up to load older messages
            try:
                scrollable_area = WebDriverWait(self.driver, 10).until(
                    EC.presence_of_element_located((By.CLASS_NAME, self.SCROLLER_CLASS))
                )
                self.driver.execute_script(
                    "arguments[0].scrollTop = arguments[0].scrollTop - arguments[1];",
                    scrollable_area,
                    self.SCROLL_STEP_PX,
                )
                scroll_count += 1
                print(f"Scrolling up (Scroll {scroll_count}). Found {new_messages_found_in_scroll} new message parts in last view.")
                time.sleep(2)  # Crucial: Wait for content to load after scrolling
            except TimeoutException:
                print("Could not find the scrollable area. Stopping scroll.")
                break
            except Exception as e:
                print(f"Error during scrolling: {e}")
                break

        print(f"Finished scraping. Total unique message parts collected: {len(scraped_data)}")
        # Convert to DataFrame and return
        return pd.DataFrame(scraped_data)

    def close(self):
        """Close the WebDriver"""
        self.driver.quit()

# Function to save messages to CSV
def save_to_csv(df, filename="discord_messages.csv"):
    """
    Write the scraped messages to a CSV file.

    Nothing is written when the DataFrame is empty. Any error raised while
    writing is reported on stdout instead of being propagated.

    Args:
        df (pd.DataFrame): DataFrame containing scraped messages
        filename (str): Output filename
    """
    if not df.empty:
        try:
            # utf-8-sig prepends a BOM to the UTF-8 output
            df.to_csv(filename, index=False, encoding='utf-8-sig')
        except Exception as write_error:
            print(f"Error saving DataFrame to CSV: {write_error}")
        else:
            print(f"Saved {len(df)} messages to {filename}")
        return

    # Empty input: report it and leave the filesystem untouched
    print("DataFrame is empty. No CSV file will be saved.")




In [None]:
from dotenv import load_dotenv
import os

# Load environment variables from .env file (values are then read via os.getenv)
load_dotenv()

# Retrieve the Discord EMAIL and PASSWORD from environment variables
EMAIL = os.getenv("EMAIL")
PASSWORD = os.getenv("PASSWORD")

# Fail fast, naming exactly which credential is unset or empty
missing_vars = [
    name for name, value in (("EMAIL", EMAIL), ("PASSWORD", PASSWORD)) if not value
]
if missing_vars:
    raise ValueError(
        f"{' and '.join(missing_vars)} not found in .env file or environment"
    )

In [None]:

# Usage example
# Replace with your Discord channel URL
channel_url = "https://discord.com/channels/966683863541751818/1200151598987489422"

# Initialize the scraper
scraper = DiscordScraper(headless=False)  # Set headless=True for background operation

# try/finally guarantees the browser is shut down even when login or scraping
# raises (e.g. the window is closed by hand while the scraper is running).
try:
    # Log in to Discord
    scraper.login(email=EMAIL, password=PASSWORD)

    # Option 1: Scrape with a message limit
    messages_df = scraper.scrape_channel(channel_url, max_messages=5000)

    # Option 2: Scrape with a scroll limit (useful for testing)
    # messages_df = scraper.scrape_channel(channel_url, max_scrolls=20)

    # Option 3: Scrape without limits
    # messages_df = scraper.scrape_channel(channel_url)

    # Save the results
    save_to_csv(messages_df, "discord_channel_messages.csv")
finally:
    # Close the browser when done
    scraper.close()

Successfully logged in to Discord
Navigating to channel: https://discord.com/channels/966683863541751818/1200151598987489422
Navigating to https://discord.com/channels/966683863541751818/1200151598987489422


NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=134.0.6998.178)
Stacktrace:
	GetHandleVerifier [0x00EEC7F3+24435]
	(No symbol) [0x00E72074]
	(No symbol) [0x00D406E3]
	(No symbol) [0x00D1F83E]
	(No symbol) [0x00DB455E]
	(No symbol) [0x00DCEB19]
	(No symbol) [0x00DAD5B6]
	(No symbol) [0x00D7C54F]
	(No symbol) [0x00D7D894]
	GetHandleVerifier [0x011F70A3+3213347]
	GetHandleVerifier [0x0120B0C9+3295305]
	GetHandleVerifier [0x0120558C+3271948]
	GetHandleVerifier [0x00F87360+658144]
	(No symbol) [0x00E7B27D]
	(No symbol) [0x00E78208]
	(No symbol) [0x00E783A9]
	(No symbol) [0x00E6AAC0]
	BaseThreadInitThunk [0x75F25D49+25]
	RtlInitializeExceptionChain [0x7750CE3B+107]
	RtlGetAppContainerNamedObjectPath [0x7750CDC1+561]
