In [1]:
import re
import os
import json
import time
import requests
import pandas as pd
from selenium import webdriver
from urllib.parse import urlparse
from datetime import datetime, timedelta
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.support.ui import WebDriverWait

In [2]:
class WebDriverManager:
    """
    Manages WebDriver initialization, configuration, and browser sessions.

    Attributes:
        driver: The Selenium Chrome WebDriver, or None until
            initialize_driver() has been called (and again after close()).
        auth_token: Value stored in the ``auth_token`` cookie, or None to
            browse without setting it.
        headless: Whether Chrome is started without a visible window.
    """

    def __init__(self, auth_token=None, headless=True):
        self.driver = None
        self.auth_token = auth_token
        self.headless = headless

    def initialize_driver(self):
        """
        Initialize and configure the Selenium WebDriver.

        Starts Chrome, opens https://x.com and, when an auth token was
        supplied, installs it as a cookie.

        Returns:
            The started WebDriver (also stored on ``self.driver``).
        """
        options = webdriver.ChromeOptions()
        options.add_argument("--verbose")
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-gpu")
        # No space after the comma: Chrome expects "WIDTH,HEIGHT".
        options.add_argument("--window-size=1920,1200")
        options.add_argument("--disable-dev-shm-usage")
        # The `Options.headless` property was deprecated and later removed in
        # Selenium 4; assigning to it on current releases is a silent no-op.
        # Passing the Chrome flag works regardless of the Selenium version.
        if self.headless:
            options.add_argument("--headless=new")

        self.driver = webdriver.Chrome(options=options)
        try:
            # A cookie can only be set for the domain of the loaded document,
            # so the site must be opened before the token is installed.
            self.driver.get("https://x.com")
            if self.auth_token:
                self._set_auth_token()
        except Exception:
            # Do not leave an orphaned Chrome process behind on failure.
            self.close()
            raise

        return self.driver

    def _set_auth_token(self):
        """
        Set authentication token as a cookie.

        The cookie is valid for 14 days and applies to the domain of the
        page currently loaded in the driver.

        Raises:
            ValueError: If no auth token was configured.
            RuntimeError: If the driver has not been initialized yet.
        """
        if not self.auth_token:
            raise ValueError("Access token is missing. Please configure it properly.")
        if self.driver is None:
            raise RuntimeError("WebDriver is not initialized. Call initialize_driver() first.")

        expiry = int((datetime.now() + timedelta(days=14)).timestamp())
        # Use the WebDriver cookie API rather than building a JavaScript
        # string: the token is passed as data (no quoting/injection issues)
        # and the expiry is an unambiguous Unix timestamp.
        self.driver.add_cookie({
            "name": "auth_token",
            "value": self.auth_token,
            "path": "/",
            "expiry": expiry,
        })

    def get_current_tweet(self):
        """
        Retrieve the currently visible tweet on the page.

        Returns:
            The first ``<article role="article">`` element, or None when the
            driver is not initialized or the lookup fails.
        """
        if self.driver is None:
            return None
        try:
            return self.driver.find_element(By.XPATH, "//article[@role='article']")
        except Exception as e:
            # Best-effort lookup: report the problem and let the caller
            # handle the missing tweet.
            print(f"Error while fetching the current tweet: {str(e)}")
            return None

    def close(self):
        """
        Close the WebDriver and end the session.

        Safe to call more than once; the driver reference is cleared so a
        finished session is never reused.
        """
        if self.driver:
            try:
                self.driver.quit()
            finally:
                self.driver = None

In [3]:
class TwitterScraper:
    """
    Handles tweet extraction, processing, and analysis.

    Tweets are read one at a time from the top of the loaded page; each
    handled tweet is removed from the DOM so the next one becomes "first".
    """

    def __init__(self, driver_manager=None):
        """
        Args:
            driver_manager: Object providing ``driver`` and
                ``initialize_driver()``; a default WebDriverManager is
                created when omitted.
        """
        self.driver_manager = driver_manager or WebDriverManager()
        self.driver = self.driver_manager.driver

    def _get_first_tweet(self, timeout=10, max_retries=5, retry_delay=2):
        """
        Retrieve the first tweet element from the page.

        Args:
            timeout: Seconds to wait for a tweet on each attempt.
            max_retries: Number of attempts before giving up.
            retry_delay: Seconds to sleep between attempts.

        Returns:
            The first tweet ``<article>`` element, or None if none appeared.
        """
        for _ in range(max_retries):
            try:
                # `until` returns the lambda's truthy value (the non-empty
                # element list), so no second DOM lookup is needed.
                tweet_elements = WebDriverWait(self.driver, timeout).until(
                    lambda d: d.find_elements(By.XPATH, "//article[@data-testid='tweet']")
                )
                return tweet_elements[0]
            except TimeoutException:
                time.sleep(retry_delay)

        return None

    def fetch_tweets_list(self, url, start_date, end_date, time_threshold_minutes=2):
        """
        Fetch tweets from a given tweet list URL within a specified date range and assign thread numbers.

        Scanning stops at the first non-reposted tweet older than
        ``start_date`` or when no more tweets can be found. Reposts older
        than ``start_date`` are skipped without stopping the scan. Dates are
        compared against the tweet's ``datetime`` attribute as parsed by
        ``_process_tweet``.

        Args:
            url: Page to load and scan.
            start_date: Inclusive lower bound, "YYYY-MM-DD".
            end_date: Inclusive upper bound, "YYYY-MM-DD".
            time_threshold_minutes: Maximum gap between consecutive tweets
                of the same author for them to share a thread number.

        Returns:
            A DataFrame with one row per in-range tweet, sorted by author
            and time, with a ``thread_number`` column; an empty DataFrame
            when nothing matched.

        Raises:
            ValueError: If ``start_date`` or ``end_date`` is malformed.
        """
        # Validate the range before doing any browser work.
        start_date_obj = datetime.strptime(start_date, "%Y-%m-%d")
        end_date_obj = datetime.strptime(end_date, "%Y-%m-%d")

        if not self.driver:
            # Keep the returned driver: self.driver was captured in __init__
            # and would otherwise still be None here.
            self.driver = self.driver_manager.initialize_driver()

        self.driver.get(url)

        tweets = []
        while True:
            tweet_element = self._get_first_tweet()
            if tweet_element is None:
                # No tweet showed up after all retries: nothing left to scan.
                break

            tweet_data = self._process_tweet(tweet_element)
            if not tweet_data:
                # Unparseable entry: remove it, otherwise the same element
                # would be fetched again forever.
                self._delete_first_tweet()
                continue

            tweet_date = datetime.strptime(tweet_data["date"], "%Y-%m-%d")
            if tweet_date < start_date_obj and not tweet_data["is_reposted"]:
                # A reposted tweet can be older than its neighbours, so only
                # a non-reposted old tweet ends the scan.
                break
            if start_date_obj <= tweet_date <= end_date_obj:
                tweets.append(tweet_data)

            self._delete_first_tweet()

        return self._assign_thread_numbers(pd.DataFrame(tweets), time_threshold_minutes)

    @staticmethod
    def _assign_thread_numbers(df, time_threshold_minutes):
        """
        Sort tweets by author and time and number consecutive bursts.

        A new thread starts at an author's first tweet and whenever the gap
        to that author's previous tweet exceeds ``time_threshold_minutes``.
        Numbers start at 1 and increase across authors in sorted order.
        """
        if df.empty:
            return df

        df = df.assign(datetime=pd.to_datetime(df["date"] + " " + df["time"]))
        df = df.sort_values(by=["author_name", "datetime"])

        # Per-author gap to the previous tweet; NaT marks an author's first.
        gap = df.groupby("author_name")["datetime"].diff()
        starts_thread = gap.isna() | (gap > pd.Timedelta(minutes=time_threshold_minutes))
        df["thread_number"] = starts_thread.cumsum()

        return df.drop(columns=["datetime"])

    def _delete_first_tweet(self):
        """
        Remove the first tweet from the page
        """
        try:
            # Parenthesised so [1] selects the first match in the document
            # rather than every article that is first within its parent.
            tweet_element = self.driver.find_element(By.XPATH, "(//article[@data-testid='tweet'])[1]")
            self.driver.execute_script("arguments[0].remove();", tweet_element)
        except NoSuchElementException:
            print("No tweet to delete.")

    def get_element_text(self, parent, xpath):
        """Return the text of the first XPath match under ``parent``, or "" if absent."""
        try:
            return parent.find_element(By.XPATH, xpath).text
        except NoSuchElementException:
            return ""

    def get_element_attribute(self, parent, selector, attribute):
        """Return ``attribute`` of the first CSS match under ``parent``, or "" if absent."""
        try:
            return parent.find_element(By.CSS_SELECTOR, selector).get_attribute(attribute)
        except NoSuchElementException:
            return ""

    def get_tweet_url(self, tweet_element):
        """Return the href of the first link containing '/status/', or "" if none."""
        try:
            link_element = tweet_element.find_element(By.XPATH, ".//a[contains(@href, '/status/')]")
            return link_element.get_attribute("href")
        except NoSuchElementException:
            return ""

    def get_mentioned_urls(self, tweet_element):
        """Return the hrefs of all links in the tweet whose href contains 'http'."""
        # find_elements returns an empty list when nothing matches; it does
        # not raise NoSuchElementException.
        link_elements = tweet_element.find_elements(By.XPATH, ".//a[contains(@href, 'http')]")
        return [elem.get_attribute("href") for elem in link_elements]

    def is_retweet(self, tweet_element):
        """Return True when the first ``<span>`` in the tweet contains 'reposted'."""
        try:
            return 'reposted' in tweet_element.find_element(By.XPATH, './/span').text
        except NoSuchElementException:
            return False

    def get_media_type(self, tweet_element):
        """Return "Video", "Image" or "No media"; video takes precedence."""
        if tweet_element.find_elements(By.CSS_SELECTOR, "div[data-testid='videoPlayer']"):
            return "Video"
        if tweet_element.find_elements(By.CSS_SELECTOR, "div[data-testid='tweetPhoto']"):
            return "Image"
        return "No media"

    def get_images_urls(self, tweet_element):
        """Return the ``src`` of every image inside the tweet's photo containers."""
        image_elements = tweet_element.find_elements(By.XPATH, ".//div[@data-testid='tweetPhoto']//img")
        return [img.get_attribute("src") for img in image_elements]

    def _process_tweet(self, tweet_element):
        """
        Process a single tweet element and extract relevant information.

        Returns:
            A dict with keys text, author_name, author_handle, date
            ("YYYY-MM-DD"), time ("HH:MM:SS"), lang, tweet_url,
            mentioned_urls, is_reposted, media_type and image_urls, or None
            when the element cannot be parsed (for example when it has no
            usable timestamp).
        """
        try:
            author_details = self.get_element_text(tweet_element, ".//div[@data-testid='User-Name']")
            parts = author_details.split("\n")
            author_name = parts[0]
            author_handle = parts[1] if len(parts) > 1 else ""

            tweet_datetime_str = self.get_element_attribute(tweet_element, "time", "datetime")
            tweet_datetime = datetime.strptime(tweet_datetime_str, "%Y-%m-%dT%H:%M:%S.000Z")

            return {
                "text": self.get_element_text(tweet_element, ".//div[@data-testid='tweetText']"),
                "author_name": author_name,
                "author_handle": author_handle,
                "date": tweet_datetime.strftime('%Y-%m-%d'),
                "time": tweet_datetime.strftime('%H:%M:%S'),
                "lang": self.get_element_attribute(tweet_element, "div[data-testid='tweetText']", "lang"),
                "tweet_url": self.get_tweet_url(tweet_element),
                "mentioned_urls": self.get_mentioned_urls(tweet_element),
                "is_reposted": self.is_retweet(tweet_element),
                "media_type": self.get_media_type(tweet_element),
                "image_urls": self.get_images_urls(tweet_element)
            }
        except Exception as e:
            # Best-effort: one malformed entry must not abort the whole scan.
            print(f"Error processing tweet: {str(e)}")
            return None

    @staticmethod
    def _split_tweet_url(tweet_url):
        """
        Extract ``(account_name, tweet_id)`` from a tweet URL.

        The id is the path segment following 'status' when present (so a
        trailing '/photo/1' is ignored), otherwise the last segment.
        Returns None when the URL does not contain both parts.
        """
        if not isinstance(tweet_url, str):
            return None
        segments = [segment for segment in urlparse(tweet_url).path.split('/') if segment]
        if len(segments) < 2:
            return None
        if 'status' in segments[1:-1]:
            tweet_id = segments[segments.index('status', 1) + 1]
        else:
            tweet_id = segments[-1]
        return segments[0], tweet_id

    @staticmethod
    def _download_file(image_url, image_path, timeout, chunk_size):
        """
        Stream ``image_url`` into ``image_path``.

        Data is written to a temporary '.part' file and renamed on success,
        so an interrupted download never leaves a truncated image behind.
        """
        partial_path = image_path + ".part"
        try:
            # The context manager releases the streamed connection.
            with requests.get(image_url, stream=True, timeout=timeout) as response:
                response.raise_for_status()
                with open(partial_path, 'wb') as file:
                    for chunk in response.iter_content(chunk_size=chunk_size):
                        file.write(chunk)
            os.replace(partial_path, image_path)
        except BaseException:
            if os.path.exists(partial_path):
                os.remove(partial_path)
            raise

    def download_images(self, df, output_dir='downloaded_images', timeout=10, chunk_size=8192, delay=1):
        """
        Download images from tweets and save them in a given folder

        Files are named ``<account>__<tweet_id>_<n>.jpg``; existing files are
        not downloaded again. Failures are reported and skipped.

        Args:
            df: DataFrame with ``tweet_url`` and ``image_urls`` columns.
            output_dir: Destination folder (created if missing).
            timeout: Per-request timeout in seconds.
            chunk_size: Number of bytes written per chunk.
            delay: Seconds to sleep after each successful download.
        """
        os.makedirs(output_dir, exist_ok=True)

        for _, row in df.iterrows():
            image_urls = row.get('image_urls')
            if not image_urls:
                continue  # Skip rows without image URLs

            # Parse tweet details for naming files
            tweet_url = row.get('tweet_url')
            url_parts = self._split_tweet_url(tweet_url)
            if url_parts is None:
                print(f"Skipping images: could not parse tweet URL {tweet_url!r}")
                continue
            twitter_name, tweet_id = url_parts

            for i, image_url in enumerate(image_urls, start=1):
                image_id = f"{twitter_name}__{tweet_id}_{i}"
                image_path = os.path.join(output_dir, f"{image_id}.jpg")

                # Skip download if the file already exists
                if os.path.exists(image_path):
                    print(f"Image already exists: {image_id}")
                    continue

                try:
                    self._download_file(image_url, image_path, timeout, chunk_size)
                except (requests.exceptions.RequestException, OSError) as e:
                    print(f"Error downloading image: {image_id}")
                    print(f"Error message: {str(e)}")
                    continue

                print(f"Successfully downloaded image: {image_id}")

                # Delay to prevent rate-limiting
                time.sleep(delay)

        print("Image download completed.")

In [4]:
# The auth token is a session credential and must not be committed with the
# notebook: read it from the environment instead. Any token that was ever
# stored in this file should be treated as leaked and revoked.
auth_token = os.environ.get("X_AUTH_TOKEN")
if not auth_token:
    raise RuntimeError(
        "Set the X_AUTH_TOKEN environment variable to the value of your x.com auth_token cookie."
    )
driver_manager = WebDriverManager(auth_token=auth_token)
driver_manager.initialize_driver()

<selenium.webdriver.chrome.webdriver.WebDriver (session="9680dfd914cb6a17af1a02331dabfea4")>

In [5]:
# Bind the scraper to the driver manager created in the previous cell.
twitter_scraper = TwitterScraper(driver_manager=driver_manager)

In [20]:
# Scrape the list page for the requested date range (dates are "YYYY-MM-DD").
tweets = twitter_scraper.fetch_tweets_list(
    url="https://x.com/i/lists/1866834968594317670",
    start_date="2024-12-08",
    end_date="2024-12-15",
)

In [21]:
# Display the DataFrame returned by fetch_tweets_list in the previous cell.
tweets

Unnamed: 0,text,author_name,author_handle,date,time,lang,tweet_url,mentioned_urls,is_retweet,media_type,image_urls,thread_number
13,Explore On-Prem Agents with \n@LangChainAI\n'...,AI Makerspace,@AIMakerspace,2024-12-14,18:30:00,en,https://x.com/AIMakerspace/status/186800050479...,"[https://t.co/u5FfbNm3fU, https://t.co/u5FfbNm...",False,No media,[],1
32,New release from Meta FAIR — Meta Motivo is a...,AI at Meta,@AIatMeta,2024-12-13,20:15:37,en,https://x.com/AIatMeta/status/1867664693319586289,[],False,Video,[],2
33,"More details, including links to the research ...",AI at Meta,@AIatMeta,2024-12-13,20:15:37,en,https://x.com/AIatMeta/status/1867664695072895489,"[https://t.co/2aaLeTmU5T, https://t.co/2aaLeTm...",False,No media,[],2
37,Comments from \n@Klarna\n CEO \n@klarnaseb\n o...,Alec Coughlin,@Alec_Coughlin,2024-12-13,18:57:21,en,https://x.com/Alec_Coughlin/status/18676449984...,[],False,Image,[https://pbs.twimg.com/media/Ges1F0QXwAAIXxb?f...,3
47,Great conversation with \n@LangChainAI\n CEO H...,Charu Sharma,@charu1603,2024-12-13,16:54:45,en,https://x.com/charu1603/status/186761414562727...,[https://t.co/51QxgUpJgN],False,Image,[https://pbs.twimg.com/media/GesZBYxaUAAmcL1?f...,4
48,Agentic Travel Planner\n\nLangGraph (\n@langc...,CopilotKit,@CopilotKit,2024-12-13,17:09:48,en,https://x.com/CopilotKit/status/18676179316196...,[https://t.co/T7YMvNOuVS],False,Video,[https://pbs.twimg.com/media/GeqzrCZbsAEWiMH?f...,5
11,I remember last week when I read about this in...,Daniel Torres,@danieltorres_c,2024-12-14,19:48:44,en,https://x.com/danieltorres_c/status/1868020317...,[],False,No media,[],6
52,"In case you missed it, earlier this week we la...",DeepLearning.AI,@DeepLearningAI,2024-12-13,15:34:24,en,https://x.com/DeepLearningAI/status/1867593926...,[https://t.co/7hWI3uJXSb],False,Video,[https://pbs.twimg.com/ext_tw_video_thumb/1867...,7
39,OpenAI's o1 model and its pro mode enhance per...,DeepLearning.AI,@DeepLearningAI,2024-12-13,18:59:59,en,https://x.com/DeepLearningAI/status/1867645660...,"[https://t.co/VRUKvaYSpL, https://t.co/VRUKvaY...",False,No media,[],8
24,"Fascinated by AI, Matt Struble gained the skil...",DeepLearning.AI,@DeepLearningAI,2024-12-13,23:00:06,en,https://x.com/DeepLearningAI/status/1867706089...,[https://t.co/uiVlDN5Wyo],False,Video,[],9


In [None]:
# Download the images listed in `tweets` into the default 'downloaded_images' folder.
twitter_scraper.download_images(tweets)

Successfully downloaded image: Alec_Coughlin__1867644998495199360_1
Successfully downloaded image: charu1603__1867614145627271311_1
Successfully downloaded image: CopilotKit__1867617931619697121_1
Successfully downloaded image: DeepLearningAI__1867593926682481106_1
Successfully downloaded image: HardKothari__1868004115114336762_1
Successfully downloaded image: jerryjliu0__1868109665772646724_1
Successfully downloaded image: jerryjliu0__1868109665772646724_2
Successfully downloaded image: LangChainAI__1867611691061850271_1
