In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
from openai import OpenAI

class TwitterScraper:
    """Scrape tweet text from an X/Twitter profile and analyze it with OpenAI.

    A Chrome WebDriver session and an OpenAI client are created on
    construction. Typical use is ``login()`` followed by
    ``get_tweets_and_reposts()`` (which closes the browser) and then
    ``analyze_tweets_with_gpt()``.
    """

    # Analysis modes understood by analyze_tweets_with_gpt.
    _ANALYSIS_TYPES = ("main_topics", "aggressive_language")

    def __init__(self, url, username, openai_api_key, tweet_limit=50, scroll_pause_time=2, max_scroll_attempts=10):
        """Store the scraping settings and open the browser and API client.

        Args:
            url: Profile URL to scrape.
            username: Main profile username (without '@'); used in prompts.
            openai_api_key: API key passed to the OpenAI client.
            tweet_limit: Maximum number of tweet texts to collect.
            scroll_pause_time: Seconds to sleep after each scroll.
            max_scroll_attempts: Consecutive scrolls without new tweets
                after which collection stops.
        """
        self.url = url
        self.username = username  # Main profile username (without '@')
        self.tweet_limit = tweet_limit
        self.scroll_pause_time = scroll_pause_time
        self.max_scroll_attempts = max_scroll_attempts
        self.driver = webdriver.Chrome()
        self.client = OpenAI(api_key=openai_api_key)

    def login(self, twitter_username, twitter_password):
        """Log in to Twitter to access full tweets for the profile.

        Each step waits up to 10 seconds for the element it needs; the
        WebDriverWait raises if the element does not appear in time.
        """
        self.driver.get("https://twitter.com/login")
        wait = WebDriverWait(self.driver, 10)

        # Username step: the field is located by name "text".
        username_field = wait.until(
            EC.presence_of_element_located((By.NAME, "text"))
        )
        username_field.send_keys(twitter_username)
        username_field.send_keys(Keys.RETURN)

        # Password step.
        password_field = wait.until(
            EC.presence_of_element_located((By.NAME, "password"))
        )
        password_field.send_keys(twitter_password)
        password_field.send_keys(Keys.RETURN)

        # The Home tab link being present is used as the "logged in" signal.
        wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "[data-testid='AppTabBar_Home_Link']"))
        )

    def get_tweets_and_reposts(self):
        """Collect tweet and repost texts from the profile page.

        Tweets and reposts are not distinguished. The page is parsed,
        scrolled to the bottom, and parsed again until ``tweet_limit``
        texts are collected, ``max_scroll_attempts`` consecutive passes
        find nothing new, or the page height stops growing.

        Returns:
            A list of unique, non-empty tweet texts in the order they were
            first seen, containing at most ``tweet_limit`` entries.

        The browser is always closed when this method exits, including when
        an exception is raised.
        """
        try:
            self.driver.get(self.url)
            time.sleep(5)  # Wait for the page to fully load

            # A dict is used as an insertion-ordered set: duplicates are
            # dropped while the timeline order is kept.
            collected = {}
            last_height = self.driver.execute_script("return document.body.scrollHeight")
            scroll_attempts = 0  # Consecutive passes that found no new tweets

            while len(collected) < self.tweet_limit and scroll_attempts < self.max_scroll_attempts:
                # Parse the page source with BeautifulSoup after each scroll
                soup = BeautifulSoup(self.driver.page_source, 'html.parser')

                count_before = len(collected)
                for element in soup.find_all('div', {'data-testid': 'tweetText'}):
                    if len(collected) >= self.tweet_limit:
                        break  # Never return more than the requested limit
                    tweet_text = element.get_text(strip=True)
                    if tweet_text:  # Skip elements with no visible text
                        collected.setdefault(tweet_text, None)

                if len(collected) >= self.tweet_limit:
                    break  # Limit reached; no need to scroll again

                if len(collected) == count_before:
                    scroll_attempts += 1
                else:
                    scroll_attempts = 0  # Reset if new tweets are found

                # Scroll down to load more tweets and pause to allow loading
                self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(self.scroll_pause_time)

                # An unchanged page height means no more content was loaded
                new_height = self.driver.execute_script("return document.body.scrollHeight")
                if new_height == last_height:
                    break
                last_height = new_height

            return list(collected)
        finally:
            self.driver.quit()

    def analyze_tweets_with_gpt(self, tweets, model="gpt-4-turbo", response_language="English", analyze_type="main_topics"):
        """Ask an OpenAI assistant to analyze the given tweets.

        Args:
            tweets: Iterable of tweet text strings.
            model: Model name used when creating the assistant.
            response_language: Language the answer is requested in.
            analyze_type: ``"main_topics"`` or ``"aggressive_language"``.

        Returns:
            The assistant's reply text, ``"No response from assistant"`` if
            the completed run produced no assistant message,
            ``"Run status: <status>"`` if the run did not complete, or
            ``"No tweets to analyze"`` when ``tweets`` is empty.

        Raises:
            ValueError: If ``analyze_type`` is not a supported mode. This is
                checked before any API call is made.
        """
        if analyze_type not in self._ANALYSIS_TYPES:
            raise ValueError(
                f"Unknown analyze_type {analyze_type!r}; expected one of {self._ANALYSIS_TYPES}"
            )

        tweet_content = "\n".join(tweets)
        if not tweet_content:
            # Nothing to send; avoid a pointless (billed) API round trip.
            return "No tweets to analyze"

        if analyze_type == "main_topics":
            prompt = (f"This person, {self.username}, usually tweets about the following topics:\n"
                      f"{tweet_content}\n\n"
                      f"What are the main topics this person tweets about? Please respond in {response_language}.")
            instructions = f"Analyze the topics discussed in the tweets and respond in {response_language}."
        else:  # "aggressive_language"
            prompt = (f"Here is a list of tweets by {self.username}:\n"
                      f"{tweet_content}\n\n"
                      f"Please analyze if this person tends to use aggressive or offensive language in their tweets. "
                      f"Respond with your analysis in {response_language}.")
            instructions = f"Check if the tweets contain any aggressive or offensive language and respond in {response_language}."

        beta = self.client.beta
        assistant = beta.assistants.create(name="TwitterTopicAnalyzer", instructions="Analyze user's tweets", model=model)
        try:
            thread = beta.threads.create()
            try:
                beta.threads.messages.create(thread_id=thread.id, role="user", content=prompt)
                run = beta.threads.runs.create_and_poll(thread_id=thread.id, assistant_id=assistant.id, instructions=instructions)

                if run.status != 'completed':
                    return f"Run status: {run.status}"

                messages = beta.threads.messages.list(thread_id=thread.id)
                return next(
                    (msg.content[0].text.value for msg in messages if msg.role == 'assistant'),
                    "No response from assistant",
                )
            finally:
                # The thread id is never exposed to callers, so remove it.
                beta.threads.delete(thread.id)
        finally:
            # A fresh assistant is created per call; delete it so they do
            # not pile up on the account.
            beta.assistants.delete(assistant.id)

    def show_tweets(self, tweets):
        """Print each tweet with a 1-based "Tweet N:" header."""
        for idx, tweet_text in enumerate(tweets, 1):
            print(f"Tweet {idx}:\n{tweet_text}\n")

# Usage: configure the run, scrape the profile, then request both analyses.
username = "solemnplayer"  # Profile to scrape (without '@')
url = f"https://x.com/{username}"  # Profile URL derived from the username
openai_api_key = "sk-..."  # Your OpenAI API key
twitter_username = "..."  # Account used to log in (a throwaway account works)
twitter_password = "..."  # Password for that account
response_language = "Turkish"  # Language the analysis should be written in
model = "gpt-4-turbo"  # One of: gpt-3.5-turbo, gpt-4, gpt-4-turbo, gpt-4o, gpt-4o-mini

# Build the scraper (this opens the browser).
scraper = TwitterScraper(
    url=url,
    username=username,
    openai_api_key=openai_api_key,
    tweet_limit=300,
    max_scroll_attempts=50,
    scroll_pause_time=2,
)

# Authenticate so the full timeline is visible.
scraper.login(twitter_username, twitter_password)

# Collect the tweets and print them.
tweets = scraper.get_tweets_and_reposts()
scraper.show_tweets(tweets)

# Run each supported analysis with the chosen model and response language.
for analyze_type in ("main_topics", "aggressive_language"):
    analysis = scraper.analyze_tweets_with_gpt(
        tweets,
        model=model,
        response_language=response_language,
        analyze_type=analyze_type,
    )
    print(f"{analyze_type} Analysis ({model}):", analysis)
    print("\n")

Tweet 1:
müge anlı ''okumak isteyene kimse engel olamaz 14 yaşında anneniz babanız sizi birilerine veriyosa sağda solda milletle geziyosunuzdur adınız çıkmasın diye başınızı bağlıyolar'' dedi az önce bunu söyledi 14 yaşında istismara uğramış 3 çocuk doğurmuş

Tweet 2:
Dikkat ettiyseniz bu yıl kimse bitsin bu sene demedi ya çok güzel bir yıl geçirdi herkes ya da meselenin senelerle alakası olmadığını anladı x.com/bpthaber/statu…

Tweet 3:
keske hic durmadan alisveris yapsam bir saniye bile durmasam alsam da alsam

Tweet 4:
Gözaltı| Yalova'da kızları tehdit edip sosyal medyada cinsel sapkınlık yapan kişi dün ihbarlar kapsamında gözaltına alındı.
Yalovada görüştügüm 2 yetkili ihbarları takip ettiklerini bildirdi. Yalova Valisi durumu doğruladı.

Tweet 5:
Gece uykudan 2-3 defa uyandığım için harap bir haldeyim

Tweet 6:
kalorifere yapisik yatan taraf=ben

Tweet 7:
fanfasy bra da yapmamissiniz bu sene ne yaptiniz aq

Tweet 8:
bu tweeti görüyorsan bir annenin yardım çığlıklarını görmezden ge

In [2]:
# Testing
# TODO: Improve the scraper so it captures every tweet and retweet reliably. Inspect the page contents printed by this cell carefully.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from bs4 import BeautifulSoup

class TwitterScraperTest:
    """Debug helper that logs in to Twitter and dumps a profile page's HTML."""

    def __init__(self, url):
        """Remember the profile URL and open a Chrome WebDriver session."""
        self.url = url
        self.driver = webdriver.Chrome()

    def login(self, twitter_username, twitter_password):
        """Log in to Twitter to access full tweets for the profile.

        Each step waits up to 10 seconds for the element it needs; the
        WebDriverWait raises if the element does not appear in time.
        """
        self.driver.get("https://twitter.com/login")
        wait = WebDriverWait(self.driver, 10)

        # Username step: the field is located by name "text".
        username_field = wait.until(
            EC.presence_of_element_located((By.NAME, "text"))
        )
        username_field.send_keys(twitter_username)
        username_field.send_keys(Keys.RETURN)

        # Password step.
        password_field = wait.until(
            EC.presence_of_element_located((By.NAME, "password"))
        )
        password_field.send_keys(twitter_password)
        password_field.send_keys(Keys.RETURN)

        # The Home tab link being present is used as the "logged in" signal.
        wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "[data-testid='AppTabBar_Home_Link']"))
        )

    def view_page_content(self):
        """Open the target profile and print its HTML for analysis.

        Prints the full page source, then the list of ``<span>`` elements
        with class ``css-1jxf684`` found in it.

        Returns:
            The list of matched span elements, so the caller can inspect
            them after the browser has been closed.

        The browser is always closed when this method exits, including when
        an exception is raised.
        """
        try:
            # Go to the user profile
            self.driver.get(self.url)
            time.sleep(5)  # Wait for the page to fully load

            page_source = self.driver.page_source
            print(page_source)

            soup = BeautifulSoup(page_source, 'html.parser')
            tweets = soup.find_all('span', {'class': 'css-1jxf684'})
            print(tweets)
            return tweets
        finally:
            self.driver.quit()

# Usage
import os
from getpass import getpass

username = "solemnplayer"  # Other profiles tried: "ess_more", "iyi_psikoloji"
url = f"https://x.com/{username}"  # Profile URL derived from the username

# Credentials must not be committed to source. They are read from the
# TWITTER_USERNAME / TWITTER_PASSWORD environment variables, with an
# interactive prompt as a fallback (the password prompt does not echo).
twitter_username = os.environ.get("TWITTER_USERNAME") or input("Twitter username: ")
twitter_password = os.environ.get("TWITTER_PASSWORD") or getpass("Twitter password: ")

# Initialize the test scraper (this opens the browser)
scraper = TwitterScraperTest(url)

# Log in to Twitter and view page content
scraper.login(twitter_username, twitter_password)
scraper.view_page_content()


<html dir="ltr" lang="en" style="overflow-y: scroll; overscroll-behavior-y: none; font-size: 15px; color-scheme: dark;"><head><style>input::placeholder { user-select: none; -webkit-user-select: none; } iframe { color-scheme: auto; }button { text-align: inherit; font-size: inherit; font-family: inherit; color: inherit }</style><style>@font-face {
  font-family: TwitterChirp;
  src: url(https://abs.twimg.com/responsive-web/client-web/Chirp-Light.3a18e64a.woff2) format('woff2'), url(https://abs.twimg.com/responsive-web/client-web/Chirp-Light.7a5673aa.woff) format('woff');
  font-weight: 300;
  font-style: 'normal';
  font-display: 'swap';
}
@font-face {
  font-family: TwitterChirp;
  src: url(https://abs.twimg.com/responsive-web/client-web/Chirp-Regular.80fda27a.woff2) format('woff2'), url(https://abs.twimg.com/responsive-web/client-web/Chirp-Regular.60b215ba.woff) format('woff');
  font-weight: 400;
  font-style: 'normal';
  font-display: 'swap';
}
@font-face {
  font-family: TwitterChir

In [3]:
tweets[0]

NameError: name 'tweets' is not defined

In [8]:
tweets[1]

'To view keyboard shortcuts, press question mark View keyboard shortcuts    19   funda funda funda    Follow Follow See new posts    Follow Follow funda funda'

In [9]:
tweets[2]

'anitsayac.com Joined January 2020 Joined January 2020 '