In [1]:
# CELL 1: IMPORTS AND SETUP

import tweepy
import pandas as pd
import re
from textblob import TextBlob
from deep_translator import GoogleTranslator
from dotenv import load_dotenv
import os
import time # <-- IMPORTED FOR ERROR HANDLING
import ast 
import matplotlib.pyplot as plt
import seaborn as sns

# Load environment variables from .env file
load_dotenv()

print("Libraries imported and environment loaded.")

Libraries imported and environment loaded.


In [2]:
# CELL 2: CONFIGURATION AND CONSTANTS

# --- API and Data Files ---
BEARER_TOKEN = os.getenv("BEARER_TOKEN")
RAW_TWEETS_CSV = 'sri_lanka_election_tweets_raw.csv'
FINAL_PROCESSED_CSV = 'sri_lanka_election_tweets_final_processed.csv'


# --- Search and Attribution ---
politician_keywords = {
    'Anura Kumara Dissanayake': ['anura kumara', 'akd', '@anuradissanayake', 'anuradissanayake'],
    'Sajith Premadasa': ['sajith premadasa', 'sajith', '@sajithpremadasa'],
    'Ranil Wickremesinghe': ['ranil wickremesinghe', 'ranil', '@RW_UNP'],
    'Namal Rajapaksa': ['namal rajapaksa', 'namal', '@RajapaksaNamal']
}
SEARCH_QUERY = '("Anura Kumara" OR AKD OR "Sajith" OR "Ranil" OR "Namal Rajapaksa") (lang:en OR lang:si) -is:retweet'
TWEET_LIMIT = 2500 # Increased limit for a more robust dataset


# --- Sentiment Dictionaries ---
emoticons_happy = set([':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}', ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D', '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P', 'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)', '<3'])
emoticons_sad = set([':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<', ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c', ':c', ':{', '>:\\', ';('])
singlish_happy = set(['hondai', 'honday', 'hondaiy', 'hondaii', 'niyamai', 'niyamay', 'supiri', 'supiriyak', 'supiriii', 'patta', 'maru', 'shok', 'shoi', 'ela', 'elakiri', 'elaa', 'jayawewa', 'jaya wewa', 'lassanai', 'lassanay', 'gammak', 'gammac', 'sira', 'siraa', 'ow', 'owu', 'ov', 'hari', 'aththa', 'aththac', 'subapathum', 'suba pathum', 'pissu kora', 'thanks', 'thankz', 'thnx', 'tnx'])
singlish_sad = set(['narakai', 'narakay', 'boru', 'boruwak', 'boruu', 'weradi', 'waradi', 'varadi', 'weradii', 'chater', 'chaater', 'chaa', 'epaa', 'epa', 'hora', 'horu', 'horakam', 'pissu', 'pisso', 'gon', 'gonn', 'gon haraka', 'pal horu', 'kalakanni', 'pala', 'palayan', 'aiyo', 'aiyoo', 'ane', 'apoi', 'ammapa', 'na', 'naa', 'ne', 'naha', 'nathuwa', 'nathi', 'neti', 'nathe'])

print("Configuration loaded.")

Configuration loaded.


In [3]:
# CELL 3: HELPER FUNCTIONS

def get_target_politicians(text, keywords_dict):
    """Identifies which politician(s) are mentioned in a tweet."""
    mentioned = []
    text_lower = text.lower()
    for politician, keywords in keywords_dict.items():
        if any(keyword in text_lower for keyword in keywords):
            mentioned.append(politician)
    # If no specific politician is found, you might want to label it as 'General' or skip
    return mentioned if mentioned else []

def clean_text_for_blob(tweet_text):
    """Minimal cleaning for TextBlob."""
    tweet_text = re.sub(r'https?:\/\/\S+', '', tweet_text)
    tweet_text = re.sub(r'@[A-Za-z0-9_]+', '', tweet_text)
    tweet_text = re.sub(r'#', '', tweet_text)
    return tweet_text

def calculate_custom_polarity(tweet_text):
    """Calculates custom polarity using Singlish words and emoticons as modifiers."""
    polarity = 0.0
    text_lower = tweet_text.lower()
    for word in text_lower.split():
        if word in singlish_happy: polarity += 0.1
        elif word in singlish_sad: polarity -= 0.1
    for emoticon in emoticons_happy:
        if emoticon in text_lower: polarity += 0.1
    for emoticon in emoticons_sad:
        if emoticon in text_lower: polarity -= 0.1
    return max(min(polarity, 1.0), -1.0)

def get_hybrid_sentiment(text_for_blob, custom_polarity):
    """Combines TextBlob and custom polarities, then clamps the result."""
    try:
        analysis = TextBlob(text_for_blob)
        textblob_polarity = analysis.sentiment.polarity
        subjectivity = analysis.sentiment.subjectivity
    except Exception:
        textblob_polarity = 0.0
        subjectivity = 0.0
    hybrid_polarity = textblob_polarity + custom_polarity
    hybrid_polarity = max(min(hybrid_polarity, 1.0), -1.0)
    if hybrid_polarity > 0.05: sentiment = 'Positive'
    elif hybrid_polarity < -0.05: sentiment = 'Negative'
    else: sentiment = 'Neutral'
    return sentiment, hybrid_polarity, subjectivity

def translate_text(text, target_lang='en'):
    """Translates text, returning original on failure."""
    try:
        if not text or not isinstance(text, str): return text
        return GoogleTranslator(source='auto', target=target_lang).translate(text) or text
    except Exception:
        return text

print("Helper functions defined.")

Helper functions defined.


In [4]:
# CELL 4: DATA EXTRACTION WITH ROBUST ERROR HANDLING

# Check if the raw data file already exists to avoid re-running
if os.path.exists(RAW_TWEETS_CSV):
    print(f"Raw data file '{RAW_TWEETS_CSV}' found. Loading from disk.")
    df_raw = pd.read_csv(RAW_TWEETS_CSV)
else:
    print(f"Raw data file not found. Starting fresh extraction from Twitter...")
    if not BEARER_TOKEN:
        raise Exception("BEARER_TOKEN not found in environment. Please check your .env file.")

    client = tweepy.Client(bearer_token=BEARER_TOKEN, wait_on_rate_limit=True)
    raw_tweet_data = []
    
    # --- NEW: Robust retry loop with exponential backoff ---
    backoff_counter = 1
    while True:
        try:
            print("Attempting to fetch tweets...")
            # Using tweepy's Paginator to handle response pages automatically
            for i, tweet in enumerate(tweepy.Paginator(client.search_recent_tweets,
                                                     query=SEARCH_QUERY,
                                                     tweet_fields=["id", "text", "created_at", "source", "lang", "public_metrics", "possibly_sensitive", "author_id"],
                                                     expansions=["author_id"],
                                                     max_results=100).flatten(limit=TWEET_LIMIT)):
                if (i + 1) % 100 == 0:
                    print(f"...{i + 1} tweets fetched")

                raw_tweet_data.append({
                    'id': tweet.id,
                    'created_at': tweet.created_at,
                    'original_text': tweet.text,
                    'lang': tweet.lang,
                    'source': tweet.source,
                    'favorite_count': tweet.public_metrics.get('like_count', 0),
                    'retweet_count': tweet.public_metrics.get('retweet_count', 0)
                })

            print("Successfully completed tweet fetching.")
            break  # Exit the while loop on success

        # Catching the specific Tweepy exception for network errors, rate limits, etc.
        except tweepy.errors.TweepyException as e:
            sleep_duration = 60 * backoff_counter
            print(f"--- AN ERROR OCCURRED: {e} ---")
            print(f"Waiting for {sleep_duration} seconds before retrying...")
            time.sleep(sleep_duration)
            backoff_counter += 1  # Increase wait time for the next potential error
            continue # Retry the fetch

    # --- Save the raw data so we don't have to do this again ---
    df_raw = pd.DataFrame(raw_tweet_data)
    df_raw.to_csv(RAW_TWEETS_CSV, index=False, encoding='utf-8')
    print(f"\nExtraction complete. {len(df_raw)} tweets saved to '{RAW_TWEETS_CSV}'")

print("\nRaw DataFrame shape:", df_raw.shape)
df_raw.head()

Raw data file not found. Starting fresh extraction from Twitter...
Attempting to fetch tweets...
--- AN ERROR OCCURRED: 401 Unauthorized
Unauthorized ---
Waiting for 60 seconds before retrying...
Attempting to fetch tweets...
--- AN ERROR OCCURRED: 401 Unauthorized
Unauthorized ---
Waiting for 120 seconds before retrying...
