In [None]:
import time
import json
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from kafka import KafkaProducer, KafkaConsumer
from s3fs import S3FileSystem

# Configuration
# Path to the ChromeDriver binary, relative to the working directory.
CHROMEDRIVER_PATH = "chromedriver.exe"
# Kafka topic the crawled entries are published to.
KAFKA_TOPIC = "demo_testing2"
# Placeholder: replace "{public IP address}" with the host of your Kafka
# broker(s) before running.
BOOTSTRAP_SERVERS = ["{public IP address}:9092"]
# Placeholder: name of the S3 bucket that consume_and_store writes into.
S3_BUCKET_NAME = "bucket name"

def crawl_topics(top_n=10):
    """
    Crawl the first `top_n` trending topics and their entries.

    Opens a Chrome session through Selenium, reads the topic links from the
    eksisozluk agenda page, then visits each topic page and collects the text
    of every element with the class name ``content``.

    :param top_n: Number of trending topics to crawl.
    :return: List of dictionaries, one per entry, each with the keys
        ``topic_name``, ``topic_url`` and ``text``.
    """
    # Use the module-level CHROMEDRIVER_PATH so the driver location is
    # configured in exactly one place.
    service = Service(CHROMEDRIVER_PATH)
    browser = webdriver.Chrome(service=service)

    entries_data = []  # One dictionary per entry, across all topics

    # try/finally guarantees the Chrome process is shut down even when a
    # page load or element lookup raises.
    try:
        browser.minimize_window()
        browser.get("https://eksisozluk.com/basliklar/gundem")
        time.sleep(0.5)

        # Absolute XPath: tied to the page layout at the time of writing.
        topic_elements = browser.find_elements(
            By.XPATH, '/html/body/div[2]/div[1]/nav/ul/li/a'
        )

        # Extract names and URLs immediately: the element handles become
        # stale as soon as the browser navigates to another page.
        trending_topics = [
            {
                "topic_name": topic.text.split("\n")[0],
                "topic_url": topic.get_attribute("href"),
            }
            for topic in topic_elements[:top_n]
        ]

        for topic in trending_topics:
            topic_name = topic["topic_name"]
            topic_url = topic["topic_url"]
            if not topic_url:
                # A link without an href cannot be visited; skip it rather
                # than passing None to browser.get().
                continue
            print(f"Crawling topic: {topic_name} - {topic_url}")

            # Navigate to the topic's page and fetch entries
            browser.get(topic_url)
            time.sleep(1)

            for element in browser.find_elements(By.CLASS_NAME, "content"):
                # Create a separate JSON-serialisable object for each entry
                entries_data.append(
                    {
                        "topic_name": topic_name,
                        "topic_url": topic_url,
                        "text": element.text,
                    }
                )
    finally:
        browser.quit()

    return entries_data





def send_to_kafka(producer, topic, data, delay=1):
    """
    Send every record in `data` to Kafka as a separate message.

    :param producer: Kafka producer instance (must provide ``send`` and
        ``flush``).
    :param topic: Kafka topic name.
    :param data: Iterable of entry dictionaries. Each record must contain a
        ``"topic_name"`` key, which is used for the progress message.
    :param delay: Seconds to pause after each send. Defaults to 1, the pacing
        this function has always used; a value of 0 or less disables it.
    """
    for record in data:
        producer.send(topic, value=record)
        if delay > 0:
            time.sleep(delay)
        print(f"Sent to Kafka: {record['topic_name']}")

    # send() is asynchronous and only queues the record; flush() blocks until
    # everything queued so far has actually been handed to the broker.
    producer.flush()

def consume_and_store(consumer, s3, max_messages=1):
    """
    Consume data from Kafka and store it in S3.

    Each consumed message value is written as its own JSON file under
    ``s3://<S3_BUCKET_NAME>/``. The file name combines the current Unix
    timestamp with the message's partition and offset, so two messages
    handled within the same second do not overwrite each other.

    :param consumer: Iterable of Kafka messages exposing ``value``,
        ``partition`` and ``offset``. ``value`` is expected to already be a
        dictionary with a ``"topic_name"`` key (i.e. the consumer
        deserialises JSON) -- NOTE(review): confirm against the consumer
        setup, which is not part of this file.
    :param s3: File-system object providing ``open(path, mode)``.
    :param max_messages: Stop after storing this many messages. Defaults to
        1 (the previous behaviour); pass ``None`` to consume continuously.
    """
    if max_messages is not None and max_messages <= 0:
        return

    stored = 0
    for message in consumer:
        data = message.value
        print(f"Consumed from Kafka: {data['topic_name']}")

        # Save to S3. Partition + offset make the key unique per message.
        file_path = (
            f"s3://{S3_BUCKET_NAME}/eksisozluk_{int(time.time())}"
            f"_{message.partition}_{message.offset}.json"
        )
        with s3.open(file_path, 'w') as file:
            json.dump(data, file)
        print(f"Saved to S3: {file_path}")

        stored += 1
        if max_messages is not None and stored >= max_messages:
            break

        # Brief pause between messages when consuming more than one.
        time.sleep(0.3)

def main():
    """
    Run the crawl-and-publish loop.

    Creates a JSON-serialising Kafka producer, then repeats forever: crawl the
    top 10 trending topics, publish every crawled entry to ``KAFKA_TOPIC``,
    and sleep for one hour. When a crawl returns nothing, publishing is
    skipped and the next attempt also happens after one hour.

    The producer is closed when the loop is left through an exception
    (including Ctrl-C / KeyboardInterrupt).
    """
    # Kafka setup
    producer = KafkaProducer(
        bootstrap_servers=BOOTSTRAP_SERVERS,
        value_serializer=lambda x: json.dumps(x).encode('utf-8')
    )

    try:
        while True:
            # Step 1: Crawl data for the top 10 topics
            topics_data = crawl_topics(top_n=10)

            if topics_data:
                # Step 2: Send data to Kafka
                send_to_kafka(producer, KAFKA_TOPIC, topics_data)
                print("Waiting for 1 hour before the next run...")
            else:
                print("No data crawled, retrying in 1 hour...")

            # NOTE: consume_and_store is not called here; no consumer or S3
            # file system is created in this function.

            time.sleep(3600)  # 1 hour
    finally:
        # Release the producer's network resources on the way out.
        producer.close()

# Start the crawl/publish loop only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


Crawling topic: yeşil pasaport sahiplerine ön vize şartı - https://eksisozluk.com/yesil-pasaport-sahiplerine-on-vize-sarti--7922825?a=popular
Crawling topic: tanju özcan'ın hukuksuz işlem yaptım itirafı - https://eksisozluk.com/tanju-ozcanin-hukuksuz-islem-yaptim-itirafi--7922900?a=popular
Crawling topic: tabutun etrafında semah dönen aleviler - https://eksisozluk.com/tabutun-etrafinda-semah-donen-aleviler--7922775?a=popular
Crawling topic: alman ekonomisinin dibe vurması - https://eksisozluk.com/alman-ekonomisinin-dibe-vurmasi--6496573?a=popular
Crawling topic: sigara içenler kokuyor yalanı - https://eksisozluk.com/sigara-icenler-kokuyor-yalani--6307389?a=popular
Crawling topic: kötü bir üniversiteye gideceğinize gitmeyin - https://eksisozluk.com/kotu-bir-universiteye-gideceginize-gitmeyin--7922746?a=popular
Crawling topic: müşteriye abi abla diye hitap eden kasiyer - https://eksisozluk.com/musteriye-abi-abla-diye-hitap-eden-kasiyer--7276341?a=popular
Crawling topic: troy karta geçiyo