In [None]:
# Install the third-party packages used by the cells below (IPython shell magics).
# twikit: client used to log in to X.com and search tweets.
!pip install twikit
# nltk: word tokenization, plus WordNet lemmatization in the first version.
!pip install nltk
# spacy: lemmatization in the second version.
# NOTE(review): that version loads the 'en_core_web_sm' model, which may need a separate download - confirm.
!pip install spacy



With lemmatization (NLTK WordNet lemmatizer)

In [None]:
import csv
import asyncio
from configparser import ConfigParser
from datetime import datetime
from random import randint
from twikit import Client, TooManyRequests
import nltk
import re

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK data packages this cell relies on.
for _nltk_resource in ('punkt_tab', 'wordnet', 'omw-1.4'):
    nltk.download(_nltk_resource)

# One shared WordNet lemmatizer, reused for every tweet.
lemmatizer = WordNetLemmatizer()

# Paging for a query stops once at least this many tweets were collected.
MINIMUM_TWEETS = 100

# Search terms, processed one after another by main().
QUERIES = [
    'elon musk',
    'artificial intelligence',
    'climate change',
    'space exploration',
    'bitcoin',
    'electric vehicles',
    'machine learning',
    'global economy',
    'sports',
    'healthcare',
]


async def get_tweets(client, tweets, query):
    """Return one page of search results for ``query``.

    With ``tweets`` set to None the first page is requested through
    ``client.search_tweet`` (product ``'Latest'``). Otherwise the coroutine
    sleeps for a random 5-10 seconds and then asks the previous page,
    ``tweets``, for its successor via ``tweets.next()``.
    """
    if tweets is not None:
        # Random pause between page requests.
        delay = randint(5, 10)
        print(f'{datetime.now()} - Getting next tweets for "{query}" after {delay} seconds...')
        await asyncio.sleep(delay)
        return await tweets.next()

    # First request for this query.
    print(f'{datetime.now()} - Getting tweets for "{query}"...')
    return await client.search_tweet(query, product='Latest')


def _clean_tweet_text(text):
    """Normalise raw tweet text before tokenizing.

    Drops every character that is not an ASCII letter, digit or whitespace,
    lower-cases the result, then removes any run of non-space characters that
    begins with ``http`` or ``www``.
    """
    # Punctuation is stripped first, so by the time the URL pattern runs a
    # link has already collapsed into a single "http..." / "www..." run.
    cleaned = re.sub(r'[^a-zA-Z0-9\s]', '', text or '')
    cleaned = cleaned.lower()
    return re.sub(r'http\S+|www\S+', '', cleaned)


async def _authenticate(client, cookies_path='cookies.json', config_path='config.ini'):
    """Authenticate ``client`` to X.com, reusing saved cookies when available.

    If ``cookies_path`` exists its cookies are loaded and no login request is
    made (delete the file to force a fresh login). Otherwise the username,
    email and password are read from the ``[X]`` section of ``config_path``,
    a login is performed and the cookies are saved to ``cookies_path``.

    Raises:
        FileNotFoundError: neither the cookie file nor the config file exists.
        KeyError: the config file lacks the ``[X]`` section or one of its keys.
    """
    import os  # local import so no new file-level import is required

    if os.path.exists(cookies_path):
        client.load_cookies(cookies_path)
        return

    config = ConfigParser()
    # ConfigParser.read() silently skips missing files and returns the list
    # of files it parsed; an empty result means there are no credentials.
    if not config.read(config_path):
        raise FileNotFoundError(
            f'{cookies_path} does not exist and {config_path} could not be read; '
            'cannot log in to X.com'
        )
    try:
        section = config['X']
        username = section['username']
        email = section['email']
        password = section['password']
    except KeyError as err:
        raise KeyError(
            f'{config_path} must contain an [X] section with username, email '
            f'and password (missing: {err})'
        ) from err

    await client.login(auth_info_1=username, auth_info_2=email, password=password)
    client.save_cookies(cookies_path)


async def main():
    """Collect English tweets for every entry in QUERIES into tweets.csv.

    For each query, pages of results are fetched with ``get_tweets`` until at
    least MINIMUM_TWEETS English tweets were written or no more results are
    returned. Each row holds the query, a per-query counter, the author name,
    the cleaned text, timestamp, retweet and like counts, the NLTK tokens and
    the WordNet lemmas of those tokens.

    Raises:
        FileNotFoundError, KeyError: credentials are needed but unavailable
            (see ``_authenticate``).
    """
    # Authenticate before touching the CSV so that a failed login does not
    # truncate the results of an earlier run.
    client = Client(language='en-US')
    await _authenticate(client)

    # Wait used when the rate-limit error carries no reset time.
    # NOTE(review): 15 minutes is assumed to match X's rate-limit window - confirm.
    fallback_wait = 15 * 60

    # The file is opened once for the whole run instead of once per tweet.
    with open('tweets.csv', 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['Query', 'Tweet_count', 'Username', 'Text', 'Created At', 'Retweets', 'Likes', 'Tokenized text', 'Lemmatized text'])

        for query in QUERIES:
            tweet_count = 0
            tweets = None

            while tweet_count < MINIMUM_TWEETS:
                try:
                    tweets = await get_tweets(client, tweets, query)
                except TooManyRequests as e:
                    # NOTE(review): rate_limit_reset is believed to be None when the
                    # response has no reset header - confirm against twikit.
                    reset_at = getattr(e, 'rate_limit_reset', None)
                    if reset_at is None:
                        wait_time = fallback_wait
                        print(f'{datetime.now()} - Rate limit reached. Waiting {wait_time} seconds')
                    else:
                        rate_limit_reset = datetime.fromtimestamp(reset_at)
                        print(f'{datetime.now()} - Rate limit reached. Waiting until {rate_limit_reset}')
                        # Never pass a negative delay if the reset time already passed.
                        wait_time = max(0, (rate_limit_reset - datetime.now()).total_seconds())
                    await asyncio.sleep(wait_time)
                    continue  # retry the same page

                if not tweets:
                    print(f'{datetime.now()} - No more tweets found for "{query}"')
                    break

                for tweet in tweets:
                    # Only English tweets are counted and stored.
                    if tweet.lang != 'en':
                        continue

                    tweet_count += 1

                    clean_text = _clean_tweet_text(tweet.text)
                    tokenized_text = word_tokenize(clean_text)
                    lemmatized_text = [lemmatizer.lemmatize(word) for word in tokenized_text]

                    writer.writerow([
                        query,  # Query name
                        tweet_count,  # Sequential tweet count within this query
                        getattr(tweet.user, 'name', 'Unknown'),  # 'Unknown' if the user has no name attribute
                        clean_text,
                        tweet.created_at or '',  # Empty string if missing
                        tweet.retweet_count or 0,  # 0 if missing
                        tweet.favorite_count or 0,  # 0 if missing
                        tokenized_text,
                        lemmatized_text,
                    ])

                # Push the finished page to disk so partial results survive an interruption.
                file.flush()
                print(f'{datetime.now()} - Got {tweet_count} tweets for "{query}"')

            print(f'{datetime.now()} - Done! Got {tweet_count} tweets for "{query}"')

    print(f'{datetime.now()} - Completed fetching tweets for all queries!')


# Run the scraper. Top-level `await` is only valid where an event loop is
# already running, such as an IPython/Jupyter cell.
# NOTE(review): as a plain script this would need asyncio.run(main()) - confirm how the file is run.
if __name__ == "__main__":
    await main()


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In order to protect your account from suspicious activity, we've sent a confirmation code to is*********@s******.***. Enter it below to sign in.	
>>> dhk32ixw
2025-01-25 13:51:02.027881 - Getting tweets for "elon musk"...
2025-01-25 13:51:06.583281 - Got 9 tweets for "elon musk"
2025-01-25 13:51:06.583385 - Getting next tweets for "elon musk" after 6 seconds...
2025-01-25 13:51:13.032683 - Got 18 tweets for "elon musk"
2025-01-25 13:51:13.032783 - Getting next tweets for "elon musk" after 10 seconds...
2025-01-25 13:51:23.582634 - Got 29 tweets for "elon musk"
2025-01-25 13:51:23.582740 - Getting next tweets for "elon musk" after 7 seconds...
2025-01-25 13:51:31.333595 - Got 33 tweets for "elon musk"
2025-01-25 13:51:31.333705 - Getting next tweets for "elon musk" after 5 seconds...
2025-01-25 13:51:36.753463 - Got 43 tweets for "elon musk"
2025-01-25 13:51:36.753567 - Getting next tweets for "elon musk" after 6 seconds...
2025-01-25 13:51:43.227641 - Got 52 tweets for "elon musk"
2025

Improved lemmatization using spaCy (hopefully better than the NLTK WordNet lemmatizer)

In [None]:
import csv
import asyncio
from configparser import ConfigParser
from datetime import datetime
from random import randint
from twikit import Client, TooManyRequests
import nltk
import re
import spacy

from nltk.tokenize import word_tokenize
# from nltk.stem import WordNetLemmatizer
# spaCy English pipeline used by spacy_lemmatize().
nlp = spacy.load('en_core_web_sm')

# Fetch the NLTK data packages this cell downloads.
for _nltk_resource in ('punkt_tab', 'wordnet', 'omw-1.4'):
    nltk.download(_nltk_resource)

# Paging for a query stops once at least this many tweets were collected.
MINIMUM_TWEETS = 90

# Search terms, processed one after another by main().
QUERIES = [
    'elon musk',
    'artificial intelligence',
    'climate change',
    'space exploration',
    'bitcoin',
    'electric vehicles',
    'machine learning',
    'global economy',
    'sports',
    'healthcare',
]


async def get_tweets(client, tweets, query):
    """Return one page of search results for ``query``.

    With ``tweets`` set to None the first page is requested through
    ``client.search_tweet`` (product ``'Latest'``). Otherwise the coroutine
    sleeps for a random 5-10 seconds and then asks the previous page,
    ``tweets``, for its successor via ``tweets.next()``.
    """
    first_request = tweets is None

    if first_request:
        print(f'{datetime.now()} - Getting tweets for "{query}"...')
        page = await client.search_tweet(query, product='Latest')
        return page

    # Random pause between page requests.
    pause = randint(5, 10)
    print(f'{datetime.now()} - Getting next tweets for "{query}" after {pause} seconds...')
    await asyncio.sleep(pause)
    page = await tweets.next()
    return page

def spacy_lemmatize(text):
    """Return the spaCy lemma of each token in ``text``.

    The text is run through the module-level ``nlp`` pipeline; tokens flagged
    as punctuation or whitespace are left out of the returned list.
    """
    lemmas = []
    for token in nlp(text):
        if token.is_punct or token.is_space:
            continue
        lemmas.append(token.lemma_)
    return lemmas


def _clean_tweet_text(text):
    """Normalise raw tweet text before tokenizing.

    Drops every character that is not an ASCII letter, digit or whitespace,
    lower-cases the result, then removes any run of non-space characters that
    begins with ``http`` or ``www``.
    """
    # Punctuation is stripped first, so by the time the URL pattern runs a
    # link has already collapsed into a single "http..." / "www..." run.
    cleaned = re.sub(r'[^a-zA-Z0-9\s]', '', text or '')
    cleaned = cleaned.lower()
    return re.sub(r'http\S+|www\S+', '', cleaned)


async def _authenticate(client, cookies_path='cookies.json', config_path='config.ini'):
    """Authenticate ``client`` to X.com, reusing saved cookies when available.

    If ``cookies_path`` exists its cookies are loaded and no login request is
    made (delete the file to force a fresh login). Otherwise the username,
    email and password are read from the ``[X]`` section of ``config_path``,
    a login is performed and the cookies are saved to ``cookies_path``.

    Raises:
        FileNotFoundError: neither the cookie file nor the config file exists.
        KeyError: the config file lacks the ``[X]`` section or one of its keys.
    """
    import os  # local import so no new file-level import is required

    if os.path.exists(cookies_path):
        client.load_cookies(cookies_path)
        return

    config = ConfigParser()
    # ConfigParser.read() silently skips missing files and returns the list
    # of files it parsed; an empty result means there are no credentials.
    if not config.read(config_path):
        raise FileNotFoundError(
            f'{cookies_path} does not exist and {config_path} could not be read; '
            'cannot log in to X.com'
        )
    try:
        section = config['X']
        username = section['username']
        email = section['email']
        password = section['password']
    except KeyError as err:
        raise KeyError(
            f'{config_path} must contain an [X] section with username, email '
            f'and password (missing: {err})'
        ) from err

    await client.login(auth_info_1=username, auth_info_2=email, password=password)
    client.save_cookies(cookies_path)


async def main():
    """Collect English tweets for every entry in QUERIES into tweets.csv.

    For each query, pages of results are fetched with ``get_tweets`` until at
    least MINIMUM_TWEETS English tweets were written or no more results are
    returned. Each row holds the query, a per-query counter, the author name,
    the cleaned text, timestamp, retweet and like counts, the NLTK tokens and
    the spaCy lemmas of the cleaned text.

    Raises:
        FileNotFoundError, KeyError: credentials are needed but unavailable
            (see ``_authenticate``).
    """
    # Authenticate before touching the CSV so that a failed login does not
    # truncate the results of an earlier run.
    client = Client(language='en-US')
    await _authenticate(client)

    # Wait used when the rate-limit error carries no reset time.
    # NOTE(review): 15 minutes is assumed to match X's rate-limit window - confirm.
    fallback_wait = 15 * 60

    # The file is opened once for the whole run instead of once per tweet.
    with open('tweets.csv', 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['Query', 'Tweet_count', 'Username', 'Text', 'Created At', 'Retweets', 'Likes', 'Tokenized text', 'Lemmatized text'])

        for query in QUERIES:
            tweet_count = 0
            tweets = None

            while tweet_count < MINIMUM_TWEETS:
                try:
                    tweets = await get_tweets(client, tweets, query)
                except TooManyRequests as e:
                    # NOTE(review): rate_limit_reset is believed to be None when the
                    # response has no reset header - confirm against twikit.
                    reset_at = getattr(e, 'rate_limit_reset', None)
                    if reset_at is None:
                        wait_time = fallback_wait
                        print(f'{datetime.now()} - Rate limit reached. Waiting {wait_time} seconds')
                    else:
                        rate_limit_reset = datetime.fromtimestamp(reset_at)
                        print(f'{datetime.now()} - Rate limit reached. Waiting until {rate_limit_reset}')
                        # Never pass a negative delay if the reset time already passed.
                        wait_time = max(0, (rate_limit_reset - datetime.now()).total_seconds())
                    await asyncio.sleep(wait_time)
                    continue  # retry the same page

                if not tweets:
                    print(f'{datetime.now()} - No more tweets found for "{query}"')
                    break

                for tweet in tweets:
                    # Only English tweets are counted and stored.
                    if tweet.lang != 'en':
                        continue

                    tweet_count += 1

                    clean_text = _clean_tweet_text(tweet.text)
                    tokenized_text = word_tokenize(clean_text)
                    lemmatized_text = spacy_lemmatize(clean_text)

                    writer.writerow([
                        query,  # Query name
                        tweet_count,  # Sequential tweet count within this query
                        getattr(tweet.user, 'name', 'Unknown'),  # 'Unknown' if the user has no name attribute
                        clean_text,
                        tweet.created_at or '',  # Empty string if missing
                        tweet.retweet_count or 0,  # 0 if missing
                        tweet.favorite_count or 0,  # 0 if missing
                        tokenized_text,
                        lemmatized_text,
                    ])

                # Push the finished page to disk so partial results survive an interruption.
                file.flush()
                print(f'{datetime.now()} - Got {tweet_count} tweets for "{query}"')

            print(f'{datetime.now()} - Done! Got {tweet_count} tweets for "{query}"')

    print(f'{datetime.now()} - Completed fetching tweets for all queries!')


# Run the scraper. Top-level `await` is only valid where an event loop is
# already running, such as an IPython/Jupyter cell.
# NOTE(review): as a plain script this would need asyncio.run(main()) - confirm how the file is run.
if __name__ == "__main__":
    await main()


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


KeyError: 'X'

TODO: Explain how tokenization works.
TODO: Explain the difference between NLTK lemmatization and spaCy lemmatization, and why spaCy was used.

Original version (single query, tokenization only)

In [None]:
import csv
import asyncio
from configparser import ConfigParser
from datetime import datetime
from random import randint
from twikit import Client, TooManyRequests
import nltk

from nltk.tokenize import word_tokenize

# Use NLTK to tokenize the tweet text; 'punkt_tab' is the tokenizer data package fetched here.
nltk.download('punkt_tab')

# main() keeps requesting pages until at least this many tweets were collected.
MINIMUM_TWEETS = 100
QUERY = 'elon musk'  # Search term passed to client.search_tweet; change it to search for something else.


async def get_tweets(client, tweets):
    """Return one page of search results for the module-level QUERY.

    With ``tweets`` set to None the first page is requested through
    ``client.search_tweet`` (product ``'Latest'``). Otherwise the coroutine
    sleeps for a random 5-10 seconds and then asks the previous page,
    ``tweets``, for its successor via ``tweets.next()``.
    """
    if tweets is not None:
        # Random pause between page requests.
        delay = randint(5, 10)
        print(f'{datetime.now()} - Getting next tweets after {delay} seconds...')
        await asyncio.sleep(delay)
        return await tweets.next()

    # First request.
    print(f'{datetime.now()} - Getting tweets...')
    return await client.search_tweet(QUERY, product='Latest')


async def _authenticate(client, cookies_path='cookies.json', config_path='config.ini'):
    """Authenticate ``client`` to X.com, reusing saved cookies when available.

    If ``cookies_path`` exists its cookies are loaded and no login request is
    made (delete the file to force a fresh login). Otherwise the username,
    email and password are read from the ``[X]`` section of ``config_path``,
    a login is performed and the cookies are saved to ``cookies_path``.

    Raises:
        FileNotFoundError: neither the cookie file nor the config file exists.
        KeyError: the config file lacks the ``[X]`` section or one of its keys.
    """
    import os  # local import so no new file-level import is required

    if os.path.exists(cookies_path):
        client.load_cookies(cookies_path)
        return

    config = ConfigParser()
    # ConfigParser.read() silently skips missing files and returns the list
    # of files it parsed; an empty result means there are no credentials.
    if not config.read(config_path):
        raise FileNotFoundError(
            f'{cookies_path} does not exist and {config_path} could not be read; '
            'cannot log in to X.com'
        )
    try:
        section = config['X']
        username = section['username']
        email = section['email']
        password = section['password']
    except KeyError as err:
        raise KeyError(
            f'{config_path} must contain an [X] section with username, email '
            f'and password (missing: {err})'
        ) from err

    await client.login(auth_info_1=username, auth_info_2=email, password=password)
    client.save_cookies(cookies_path)


async def main():
    """Collect tweets matching QUERY into tweets.csv.

    Pages of results are fetched with ``get_tweets`` until at least
    MINIMUM_TWEETS tweets were written or no more results are returned. Each
    row holds a running counter, the author name, the raw text, timestamp,
    retweet and like counts, and the NLTK tokens of the raw text.

    Raises:
        FileNotFoundError, KeyError: credentials are needed but unavailable
            (see ``_authenticate``).
    """
    # Authenticate before touching the CSV so that a failed login does not
    # truncate the results of an earlier run.
    client = Client(language='en-US')
    await _authenticate(client)

    # Wait used when the rate-limit error carries no reset time.
    # NOTE(review): 15 minutes is assumed to match X's rate-limit window - confirm.
    fallback_wait = 15 * 60

    tweet_count = 0
    tweets = None

    # The file is opened once for the whole run instead of once per tweet.
    with open('tweets.csv', 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['Tweet_count', 'Username', 'Text', 'Created At', 'Retweets', 'Likes', 'Tokenized text'])

        while tweet_count < MINIMUM_TWEETS:
            try:
                tweets = await get_tweets(client, tweets)
            except TooManyRequests as e:
                # NOTE(review): rate_limit_reset is believed to be None when the
                # response has no reset header - confirm against twikit.
                reset_at = getattr(e, 'rate_limit_reset', None)
                if reset_at is None:
                    wait_time = fallback_wait
                    print(f'{datetime.now()} - Rate limit reached. Waiting {wait_time} seconds')
                else:
                    rate_limit_reset = datetime.fromtimestamp(reset_at)
                    print(f'{datetime.now()} - Rate limit reached. Waiting until {rate_limit_reset}')
                    # Never pass a negative delay if the reset time already passed.
                    wait_time = max(0, (rate_limit_reset - datetime.now()).total_seconds())
                await asyncio.sleep(wait_time)
                continue  # retry the same page

            if not tweets:
                print(f'{datetime.now()} - No more tweets found')
                break

            for tweet in tweets:
                tweet_count += 1
                writer.writerow([
                    tweet_count,
                    tweet.user.name,
                    tweet.text,
                    tweet.created_at,
                    tweet.retweet_count,
                    tweet.favorite_count,
                    word_tokenize(tweet.text),
                ])

            # Push the finished page to disk so partial results survive an interruption.
            file.flush()
            print(f'{datetime.now()} - Got {tweet_count} tweets')

    print(f'{datetime.now()} - Done! Got {tweet_count} tweets')


# Run the scraper. Top-level `await` is only valid where an event loop is
# already running, such as an IPython/Jupyter cell.
# NOTE(review): as a plain script this would need asyncio.run(main()) - confirm how the file is run.
if __name__ == "__main__":
    await main()


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


2025-01-18 13:22:17.926374 - Getting tweets...
2025-01-18 13:22:18.626343 - Got 17 tweets
2025-01-18 13:22:18.626442 - Getting next tweets after 6 seconds...
2025-01-18 13:22:25.181804 - Got 37 tweets
2025-01-18 13:22:25.181925 - Getting next tweets after 10 seconds...
2025-01-18 13:22:35.628589 - Got 55 tweets
2025-01-18 13:22:35.628694 - Getting next tweets after 7 seconds...
2025-01-18 13:22:43.104944 - Got 71 tweets
2025-01-18 13:22:43.105052 - Getting next tweets after 8 seconds...
2025-01-18 13:22:51.559334 - Got 87 tweets
2025-01-18 13:22:51.559450 - Getting next tweets after 10 seconds...
2025-01-18 13:23:01.953901 - Got 103 tweets
2025-01-18 13:23:01.954002 - Done! Got 103 tweets
