In [17]:
import re
import json
import time
import pandas as pd
from selenium import webdriver
from datetime import datetime, timedelta
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from tenacity import retry, stop_after_attempt, wait_fixed, retry_if_exception_type

In [18]:
auth_token = 'd5f6fb7cebaa271fdd881c395d3c9c0d3f32db7f'

In [19]:
def initialize_driver(headless=True):
    options = webdriver.ChromeOptions()
    options.add_argument("--verbose")
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-gpu')
    options.add_argument("--window-size=1920, 1200")
    options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(options=options)
    options.headless = headless
    driver.get("https://x.com")
    return driver

In [20]:
def set_auth_token(driver, auth_token):
    if not auth_token:
        raise ValueError("Access token is missing. Please configure it properly.")
    expiration = (datetime.now() + timedelta(days=14)).strftime("%Y-%m-%d")
    cookie_script = f"document.cookie = 'auth_token={auth_token}; expires={expiration}; path=/';"
    driver.execute_script(cookie_script)

In [21]:
driver = initialize_driver(True)
set_auth_token(driver,auth_token)

In [22]:
def get_first_tweet(driver, timeout=10, max_retries=5, retry_delay=2):
    retries = 0
    while retries < max_retries:
        try:
            WebDriverWait(driver, timeout).until(lambda d: d.find_elements(By.XPATH, "//article[@data-testid='tweet']"))
            return driver.find_element(By.XPATH, "//article[@data-testid='tweet']")
        except TimeoutException:
            retries += 1
            time.sleep(retry_delay)
        except NoSuchElementException:
            print("No tweet found.")
            raise
    print("Exceeded maximum retries waiting for the first tweet.")
    raise TimeoutException("Failed to retrieve the first tweet after multiple attempts.")

In [24]:
def extract_author_details(tweet_element):
    author_details = get_element_text(tweet_element, ".//div[@data-testid='User-Name']")
    parts = author_details.split("\n")
    return (parts[0], parts[1] if len(parts) > 1 else "")

In [25]:
def get_element_text(parent, xpath):
    try:
        return parent.find_element(By.XPATH, xpath).text
    except NoSuchElementException:
        return ""
        
def get_element_attribute(parent, selector, attribute):
    try:
        return parent.find_element(By.CSS_SELECTOR, selector).get_attribute(attribute)
    except NoSuchElementException:
        return ""

In [26]:
def get_tweet_url(tweet_element):
    try:
        link_element = tweet_element.find_element(By.XPATH, ".//a[contains(@href, '/status/')]")
        return link_element.get_attribute("href")
    except NoSuchElementException:
        return ""

def get_mentioned_urls(tweet_element):
    try:
        link_elements = tweet_element.find_elements(By.XPATH, ".//a[contains(@href, 'http')]")
        return [elem.get_attribute("href") for elem in link_elements]
    except NoSuchElementException:
        return []

def is_retweet(tweet_element):
    try:
        return tweet_element.find_elements(By.XPATH, ".//div[contains(text(), 'Retweeted')]")
    except NoSuchElementException:
        return False

def get_media_type(tweet_element):
    if tweet_element.find_elements(By.CSS_SELECTOR, "div[data-testid='videoPlayer']"):
        return "Video"
    if tweet_element.find_elements(By.CSS_SELECTOR, "div[data-testid='tweetPhoto']"):
        return "Image"
    return "No media"

def get_images_urls(tweet_element):
    image_elements = tweet_element.find_elements(By.XPATH, ".//div[@data-testid='tweetPhoto']//img")
    return [img.get_attribute("src") for img in image_elements]

In [27]:
def delete_first_tweet(driver):
    try:
        tweet_element = driver.find_element(By.XPATH, "//article[@data-testid='tweet'][1]")
        driver.execute_script("arguments[0].remove();", tweet_element)
    except NoSuchElementException:
        logger.info("No tweet to delete.")

In [28]:
def process_tweet(tweet_element):
    try:
        author_name, author_handle = extract_author_details(tweet_element)
        tweet_data = {
            "text": get_element_text(tweet_element, ".//div[@data-testid='tweetText']"),
            "author_name": author_name,
            "author_handle": author_handle,
            "date": get_element_attribute(tweet_element, "time", "datetime")[:10],
            "lang": get_element_attribute(tweet_element, "div[data-testid='tweetText']", "lang"),
            "url": get_tweet_url(tweet_element),
            "mentioned_urls": get_mentioned_urls(tweet_element),
            "is_retweet": is_retweet(tweet_element),
            "media_type": get_media_type(tweet_element),
            "images_urls": get_images_urls(tweet_element) if get_media_type(tweet_element) == "Image" else None,
        }
        return tweet_data
    except Exception as e:
       print(str(e))

In [55]:
def fetch_tweets(driver, url, start_date, end_date):
    driver.get(url)
    file_prefix = f"data/tweets_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}"
    start_date_obj = datetime.strptime(start_date, "%Y-%m-%d")
    end_date_obj = datetime.strptime(end_date, "%Y-%m-%d")
    
    tweets = []
    while True:
        tweet_element = get_first_tweet(driver)
        if not tweet_element:
            continue
        tweet_data = process_tweet(tweet_element)
        
        tweet_date = datetime.strptime(tweet_data["date"], "%Y-%m-%d")
        tweets.append(tweet_data)
        if tweet_date < start_date_obj:
            break
        elif tweet_date > end_date_obj:
            delete_first_tweet(driver)
            continue
        delete_first_tweet(driver)
    tweet_df = pd.DataFrame(tweets)
    return tweet_df
      

    

In [56]:
tweets = fetch_tweets(driver,
        "https://x.com/i/lists/1866134598868160880",
        start_date="2024-12-01",
        end_date="2024-12-10",
    ) 

In [57]:
tweets

Unnamed: 0,text,author_name,author_handle,date,lang,url,mentioned_urls,is_retweet,media_type,images_urls
0,"canvas is now available to all chatgpt users, ...",Sam Altman,@sama,2024-12-10,en,https://x.com/sama/status/1866555731149045990,[],[],No media,
1,a few months ago robinhood sent me a gold cred...,Sam Altman,@sama,2024-12-10,en,https://x.com/sama/status/1866517449589621082,[],[],Image,[https://pbs.twimg.com/media/GeczudUaMAAxo6S?f...
2,we significantly underestimated demand for sor...,Sam Altman,@sama,2024-12-10,en,https://x.com/sama/status/1866332878499623098,[],[],No media,
3,big congrats!!,Sam Altman,@sama,2024-12-09,en,https://x.com/sama/status/1866210243992269271,[],[],No media,
4,demand higher than expected; signups will be d...,Sam Altman,@sama,2024-12-09,en,https://x.com/sama/status/1866208762975146476,[],[],No media,
5,amazing work bill!,Sam Altman,@sama,2024-12-09,en,https://x.com/sama/status/1866202531090792586,[],[],Video,
6,aditya (\n@model_mechanic\n) is a legend and v...,Sam Altman,@sama,2024-12-09,en,https://x.com/sama/status/1866194899051422077,"[https://t.co/sBn2oF9a0o, https://t.co/sBn2oF9...",[],No media,
7,"we are launching sora today, and we made a new...",Sam Altman,@sama,2024-12-09,en,https://x.com/sama/status/1866187525821538436,[https://t.co/VZBcJFqChS],[],No media,
8,excited to see what you make :),Sam Altman,@sama,2024-12-09,en,https://x.com/sama/status/1866187528438677674,[],[],No media,
9,"details:\n\nwith an openai plus account, you g...",Sam Altman,@sama,2024-12-09,en,https://x.com/sama/status/1866187529650917618,[],[],No media,


In [58]:
tweets.loc[1]

text              a few months ago robinhood sent me a gold cred...
author_name                                              Sam Altman
author_handle                                                 @sama
date                                                     2024-12-10
lang                                                             en
url                   https://x.com/sama/status/1866517449589621082
mentioned_urls                                                   []
is_retweet                                                       []
media_type                                                    Image
images_urls       [https://pbs.twimg.com/media/GeczudUaMAAxo6S?f...
Name: 1, dtype: object

In [59]:
tweets.text.to_list()

['canvas is now available to all chatgpt users, and can execute code!\n\nmore importantly it can also still emojify your writing.',
 'a few months ago robinhood sent me a gold credit card with extremely high-quality details. \n\ni thought it was a ridiculous marketing stunt at the time but now it’s an example i give when talking about great design.',
 'we significantly underestimated demand for sora; it is going to take awhile to get everyone access.\n\ntrying to figure out how to do it as fast as possible!',
 'big congrats!!',
 'demand higher than expected; signups will be disabled on and off and generations will be slow for awhile.\n\ndoing our best!',
 'amazing work bill!',
 'aditya (\n@model_mechanic\n) is a legend and visionary in the field, and runs a very special team.',
 'we are launching sora today, and we made a new product to go with it.\n\nif you have an openai plus or pro account, you can generate videos. anyone can view them.\n\nit will take some time to roll out, but by 

In [60]:
import os
import requests
import time
from urllib.parse import urlparse

def download_images(df, output_dir='downloaded_images', timeout=10, chunk_size=8192, delay=1):
    """
    Downloads images from a DataFrame containing tweet metadata.

    Args:
        df (pd.DataFrame): DataFrame with columns `url` and `images_urls`.
        output_dir (str): Directory where images will be saved.
        timeout (int): Timeout for image download requests in seconds.
        chunk_size (int): Chunk size for streaming image content.
        delay (int): Delay between downloading consecutive images to prevent bans.

    Returns:
        None
    """
    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    for _, row in df.iterrows():
        if not row.get('images_urls'):
            continue  # Skip rows without image URLs

        # Parse tweet details for naming files
        url_parts = urlparse(row['url'])
        twitter_name = url_parts.path.split('/')[1]
        tweet_id = url_parts.path.split('/')[-1]

        for i, image_url in enumerate(row['images_urls'], start=1):
            image_id = f"{twitter_name}__{tweet_id}_{i}"
            image_path = os.path.join(output_dir, f"{image_id}.jpg")

            # Skip download if the file already exists
            if os.path.exists(image_path):
                print(f"Image already exists: {image_id}")
                continue

            try:
                # Download image
                response = requests.get(image_url, stream=True, timeout=timeout)
                response.raise_for_status()

                with open(image_path, 'wb') as file:
                    for chunk in response.iter_content(chunk_size=chunk_size):
                        file.write(chunk)

                print(f"Successfully downloaded image: {image_id}")

                # Delay to prevent rate-limiting
                time.sleep(delay)

            except requests.exceptions.RequestException as e:
                print(f"Error downloading image: {image_id}")
                print(f"Error message: {str(e)}")

    print("Image download completed.")


In [61]:
download_images(tweets)

Image already exists: sama__1866517449589621082_1
Image already exists: karpathy__1865981888848130329_1
Image already exists: karpathy__1864023344435380613_1
Image already exists: karpathy__1863284668159980007_1
Image already exists: sama__1863015440123121798_1
Image download completed.


In [86]:
def fetch_threads(driver, url):
    driver.get(url)
    
    tweets = []
    initial_author = None

    while True:
        tweet_element = get_first_tweet(driver)
        if not tweet_element:
            continue

        tweet_data = process_tweet(tweet_element)

        if initial_author is None:
            initial_author = tweet_data["author_handle"]
        elif tweet_data["author_handle"] != initial_author:
            print("Author changed, stopping thread fetch.")
            break

        tweets.append(tweet_data)
        delete_first_tweet(driver)

    tweet_df = pd.DataFrame(tweets)
    return tweet_df

In [87]:
karpathy_thread = fetch_threads(driver, "https://x.com/karpathy/status/1864023344435380613" ) 

Author changed, stopping thread fetch.


In [88]:
karpathy_thread.text.to_list()

['The (true) story of development and inspiration behind the "attention" operator, the one in "Attention is All you Need" that introduced the Transformer. From personal email correspondence with the author \n@DBahdanau\n ~2 years ago, published here and now (with permission) following some fake news about how it was developed that circulated here over the last few days.\n\nAttention is a brilliant (data-dependent) weighted average operation. It is a form of global pooling, a reduction, communication. It is a way to aggregate relevant information from multiple nodes (tokens, image patches, or etc.). It is expressive, powerful, has plenty of parallelism, and is efficiently optimizable. Even the Multilayer Perceptron (MLP) can actually be almost re-written as Attention over data-indepedent weights (1st layer weights are the queries, 2nd layer weights are the values, the keys are just input, and softmax becomes elementwise, deleting the normalization). TLDR Attention is awesome and a *majo

In [93]:
karpathy_thread

Unnamed: 0,text,author_name,author_handle,date,lang,url,mentioned_urls,is_retweet,media_type,images_urls
0,The (true) story of development and inspiratio...,Andrej Karpathy,@karpathy,2024-12-03,en,https://x.com/karpathy/status/1864023344435380...,[],[],Image,[https://pbs.twimg.com/media/Gd5WAejaQAAVHYh?f...
1,"""Links in the reply followup"" (not a huge fan ...",Andrej Karpathy,@karpathy,2024-12-03,en,https://x.com/karpathy/status/1864028921664319735,"[https://t.co/Geg2YCzyj9, https://t.co/df3wrVg...",[],No media,
2,"Ty to a reply, text version for those on mobil...",Andrej Karpathy,@karpathy,2024-12-03,en,https://x.com/karpathy/status/1864030016457375916,[],[],No media,
3,Oh and bleh I forgot to mention for those outs...,Andrej Karpathy,@karpathy,2024-12-03,en,https://x.com/karpathy/status/1864033537479135369,[],[],No media,


In [89]:
sama_thread = fetch_threads(driver, "https://x.com/sama/status/1866187525821538436" ) 

Author changed, stopping thread fetch.


In [91]:
sama_thread.text.to_list()

['we are launching sora today, and we made a new product to go with it.\n\nif you have an openai plus or pro account, you can generate videos. anyone can view them.\n\nit will take some time to roll out, but by the end of the day it should be available at http://sora.com',
 'one of the most exciting things to me about this product is how easy it is to co-create with others; it feels like an interesting new thing!\n\nthis is early--think of it like GPT-1 for video--but i already think the feed is so compelling.',
 'excited to see what you make :)',
 'details:\n\nwith an openai plus account, you get 50 generations per month.\n\nwith a pro account, you get 500 fast generations (or fewer at high resolution) and unlimited in a slower generation mode.\n\navailable in many countries, but not most of europe and the UK for now.']

In [92]:
sama_thread

Unnamed: 0,text,author_name,author_handle,date,lang,url,mentioned_urls,is_retweet,media_type,images_urls
0,"we are launching sora today, and we made a new...",Sam Altman,@sama,2024-12-09,en,https://x.com/sama/status/1866187525821538436,[https://t.co/VZBcJFqChS],[],No media,
1,one of the most exciting things to me about th...,Sam Altman,@sama,2024-12-09,en,https://x.com/sama/status/1866187527184667113,[],[],No media,
2,excited to see what you make :),Sam Altman,@sama,2024-12-09,en,https://x.com/sama/status/1866187528438677674,[],[],No media,
3,"details:\n\nwith an openai plus account, you g...",Sam Altman,@sama,2024-12-09,en,https://x.com/sama/status/1866187529650917618,[],[],No media,
