In [1]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException


import time
from datetime import datetime
import json

options = Options()
options.headless = True  # Running in headless mode
options.add_experimental_option("prefs", {
    # disable images for speed
    "profile.managed_default_content_settings.images": 2,  
    # disable video features
    "profile.default_content_setting_values.media_stream": 2,
    "profile.default_content_setting_values.media_stream_mic": 2,
    "profile.default_content_setting_values.media_stream_camera": 2,
    "profile.default_content_setting_values.durable_storage": 2
})
options.page_load_strategy = 'eager'  # Set page load strategy to 'eager'


In [2]:
service = Service()
driver = webdriver.Chrome(service=service, options=options)

In [3]:
driver.get("https://www.twitch.tv/clix")  # Replace with the desired URL

In [4]:
processed_messages = set()
exception_count = 0

while True:
    try:
        # XPath for usernames and messages
        username_xpath = ".//span[contains(@class, 'chat-author__display-name')]"
        chat_message_xpath = "//span[contains(@class, 'text-fragment') and contains(@data-a-target, 'chat-message-text')]"

        # Find all usernames and chat messages
        usernames = driver.find_elements(By.XPATH, username_xpath)
        chat_messages = driver.find_elements(By.XPATH, chat_message_xpath)

        for username_element, chat_message_element in zip(usernames, chat_messages):
            message_id = hash(username_element.get_attribute("data-a-user") + chat_message_element.text)

            # Check if the message has been processed
            if message_id not in processed_messages:
                processed_messages.add(message_id)

                # Extract username and message
                username = username_element.get_attribute("data-a-user")
                chat_message = chat_message_element.text

                # Adding timestamp
                timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

                # Creating JSON object
                comment_data = {
                    "comment_id": message_id,
                    "timestamp": timestamp,
                    "username": username,
                    "message": chat_message
                }

                print(json.dumps(comment_data))
                exception_count = 0

        # Wait before next iteration
        time.sleep(2)

    except NoSuchElementException:
        print("No Element Found")
        exception_count += 1
        if exception_count >= 2:
            # If multiple exceptions in a row, wait longer before next attempt
            time.sleep(5)
        else:
            time.sleep(2)

    except StaleElementReferenceException:
        print(f"Stale element reference for message, re-locating...")
        # Re-locate the element and retry or skip to the next one
        continue

    except Exception as e:
        print(f"Something went wrong: {e}")
        break

{"comment_id": -1874609976999911916, "timestamp": "2023-12-07 19:47:56", "username": "aiai566", "message": "/"}
{"comment_id": -1342829118640795405, "timestamp": "2023-12-07 19:48:01", "username": "callmerompe", "message": "Xset x Dr3amin Jersey LIVE NOW! USE CODE CLIX20 FOR 20%"}
{"comment_id": 4896049823466342812, "timestamp": "2023-12-07 19:48:01", "username": "twitchsheild", "message": "they're actually a good duo"}
{"comment_id": -7095249502399629913, "timestamp": "2023-12-07 19:48:01", "username": "swxrv3_", "message": "where?"}
{"comment_id": -1008673907958215713, "timestamp": "2023-12-07 19:48:01", "username": "cayman1790", "message": "!SONG"}
{"comment_id": -7977167928822509818, "timestamp": "2023-12-07 19:48:01", "username": "nightbot", "message": "ron is so ass dude"}
{"comment_id": 5553749863510630940, "timestamp": "2023-12-07 19:48:01", "username": "aydenonytlol", "message": "!song"}
{"comment_id": 3031007694187398424, "timestamp": "2023-12-07 19:48:04", "username": "cgl22

KeyboardInterrupt: 

#### Websocketing

In [None]:
import asyncio
import websockets
import threading
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException
import json
from datetime import datetime


drivers = {}
clients = set()


# Chat scraping function
async def scrape_chat(url):
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    driver = webdriver.Chrome(options=options)
    drivers[url] = driver

    driver.get(url)
    processed_messages = set()

    while url in drivers:
        try:
            # XPath for usernames and messages
            username_xpath = ".//span[contains(@class, 'chat-author__display-name')]"
            chat_message_xpath = "//span[contains(@class, 'text-fragment') and contains(@data-a-target, 'chat-message-text')]"

            # Find all usernames and chat messages
            usernames = driver.find_elements(By.XPATH, username_xpath)
            chat_messages = driver.find_elements(By.XPATH, chat_message_xpath)

            for username_element, chat_message_element in zip(usernames, chat_messages):
                message_id = hash(username_element.get_attribute("data-a-user") + chat_message_element.text)

                # Check if the message has been processed
                if message_id not in processed_messages:
                    processed_messages.add(message_id)

                    # Extract username and message
                    username = username_element.get_attribute("data-a-user")
                    chat_message = chat_message_element.text

                    # Adding timestamp
                    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

                    # Creating JSON object
                    comment_data = {
                        "comment_id": message_id,
                        "timestamp": timestamp,
                        "username": username,
                        "message": chat_message
                    }

                    print(json.dumps(comment_data))
                    exception_count = 0

            # Wait before next iteration
            time.sleep(2)

        except NoSuchElementException:
            print("No Element Found")
            exception_count += 1
            if exception_count >= 2:
                # If multiple exceptions in a row, wait longer before next attempt
                time.sleep(5)
            else:
                time.sleep(2)

        except StaleElementReferenceException:
            print(f"Stale element reference for message, re-locating...")
            # Re-locate the element and retry or skip to the next one
            continue

        except Exception as e:
            print(f"Something went wrong: {e}")
            break

    # Cleanup
    driver.quit()
    del drivers[url]

# WebSocket server handlers
async def register(websocket):
    clients.add(websocket)

async def unregister(websocket):
    clients.remove(websocket)

async def broadcast_message(message):
    if clients:
        await asyncio.wait([client.send(message) for client in clients])

async def websocket_server(websocket, path):
    await register(websocket)
    try:
        async for message in websocket:
            if message not in drivers:
                asyncio.create_task(scrape_chat(message))  # Start scraping for new URL
            else:
                # Logic to handle existing URL or other commands
                pass
    finally:
        await unregister(websocket)

# Run the WebSocket server
def start_websocket_loop():
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)

    server = websockets.serve(websocket_server, "localhost", 6789)
    loop.run_until_complete(server)
    loop.run_forever()

# Main execution
if __name__ == "__main__":
    threading.Thread(target=start_websocket_loop, daemon=True).start()

    # Keep the main thread alive or perform other tasks
    while True:
        # Perform any main-thread-specific tasks or simply keep it alive
        asyncio.sleep(10)

  asyncio.sleep(10)
Exception in thread Thread-15:
Traceback (most recent call last):
  File "c:\Users\Danil\anaconda3\lib\threading.py", line 932, in _bootstrap_inner
    self.run()
  File "c:\Users\Danil\anaconda3\lib\threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "C:\Users\Danil\AppData\Local\Temp\ipykernel_17960\1583155084.py", line 113, in start_websocket_loop
  File "c:\Users\Danil\anaconda3\lib\asyncio\base_events.py", line 616, in run_until_complete
    return future.result()
  File "c:\Users\Danil\anaconda3\lib\asyncio\tasks.py", line 684, in _wrap_awaitable
    return (yield from awaitable.__await__())
  File "c:\Users\Danil\anaconda3\lib\site-packages\websockets\legacy\server.py", line 1127, in __await_impl__
    server = await self._create_server()
  File "c:\Users\Danil\anaconda3\lib\asyncio\base_events.py", line 1463, in create_server
    raise OSError(err.errno, 'error while attempting '
OSError: [Errno 10048] error while attempting 