In [None]:
!pip install pandas \
matplotlib \
seaborn \
textblob \
deep-translator \
ipywidgets \
scikit-learn \
xgboost \
tensorflow=='2.16.1' \
transformers \
notebook \
bertopic \
tf-keras \
polars \
emoji \
pyarrow

In [None]:
import pickle
import re

import emoji
import matplotlib.pyplot as plt
import numpy as np
import polars as pl
import seaborn as sns
from textblob import TextBlob

# Scikit-learn for ML pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Models
import xgboost as xgb
import tensorflow as tf
from transformers import AutoTokenizer, BertTokenizer, TFAutoModel, TFBertModel

# Set random seeds for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

print(f"Polars version: {pl.__version__}")
print(f"TensorFlow version: {tf.__version__}")

In [None]:
# Load the dataset using Polars
DATA_PATH = '/kaggle/input/tweet-politicians/sri_lanka_election_tweets_analysis_ready.csv'

try:
    df = pl.read_csv(DATA_PATH)
except FileNotFoundError as err:
    # Re-raise with the path that was actually requested. Calling exit() inside a
    # notebook shuts the kernel down instead of showing a useful error in the cell.
    raise FileNotFoundError(
        f"Dataset not found at '{DATA_PATH}'. Attach the dataset or update DATA_PATH."
    ) from err


print("Data shape:", df.shape)
print("Data head:")
print(df.head())

In [None]:
# 1. Domain-Specific Lexicons (Singlish/Colloquial)
# Each entry maps a lower-case term to a signed weight; positive terms live in
# singlish_happy, negative terms in singlish_sad.
singlish_happy = {
    'patta': 2,
    'maru': 2,
    'niyamai': 2,
    'ela': 2,
    'supiri': 2,
    'gathi': 2,
    'hodai': 1,
    'lassanai': 1,
    'shok': 1,
    'jayawewa': 2,
    'good': 1,
    'love': 2,
}
singlish_sad = {
    'apalai': -2,
    'chaa': -2,
    'anthimai': -2,
    'weda na': -2,
    'boring': -1,
    'boru': -1,
    'harak': -2,
    'gon': -2,
    'pissu': -2,
    'sad': -1,
    'bad': -1,
}

# 2. Text Cleaning Function
def clean_text(text):
    """Normalise a raw tweet into lower-case, space-separated alphabetic words.

    Steps, in order: drop URLs and @mentions, strip the '#' from hashtags while
    keeping the tag text, spell emojis out as words, remove every remaining
    non-alphabetic character, collapse whitespace and lower-case the result.

    Args:
        text: The raw tweet text, or None.

    Returns:
        The cleaned string; "" when `text` is None.
    """
    if text is None:
        return ""
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    text = re.sub(r'#(\w+)', r'\1', text)  # Remove the hashtag symbol but keep the text
    # Use spaces as emoji delimiters and split the emoji name on underscores.
    # With the default ':' delimiters the punctuation filter below would fuse the
    # name into its neighbours (e.g. "love❤️" -> "loveredheart").
    text = emoji.demojize(text, delimiters=(" ", " ")).replace("_", " ")
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphabetic characters except whitespace
    text = re.sub(r'\s+', ' ', text)  # Collapse runs of whitespace left behind by the removals
    return text.lower().strip()

# 3. Custom Polarity Calculation Function
def calculate_custom_polarity(text, happy_lexicon=None, sad_lexicon=None):
    """Sum the lexicon weights of every term found in `text`.

    Single words and multi-word phrases (e.g. 'weda na') are both matched: the
    text is split on whitespace and every run of 1..N consecutive tokens is
    looked up, where N is the longest phrase present in the lexicons.

    Args:
        text: Cleaned, lower-case text.
        happy_lexicon: Mapping of positive term -> weight. Defaults to the
            module-level `singlish_happy`.
        sad_lexicon: Mapping of negative term -> weight. Defaults to the
            module-level `singlish_sad`.

    Returns:
        The summed weight of all matches; 0 for empty text.
    """
    if happy_lexicon is None:
        happy_lexicon = singlish_happy
    if sad_lexicon is None:
        sad_lexicon = singlish_sad
    if not text:
        return 0

    tokens = text.split()
    # Longest phrase (in words) across both lexicons; a per-token lookup alone
    # can never hit keys that contain a space.
    longest_phrase = max(
        (len(term.split()) for lexicon in (happy_lexicon, sad_lexicon) for term in lexicon),
        default=1,
    )

    score = 0
    for size in range(1, longest_phrase + 1):
        for start in range(len(tokens) - size + 1):
            phrase = " ".join(tokens[start:start + size])
            score += happy_lexicon.get(phrase, 0)
            score += sad_lexicon.get(phrase, 0)
    return score

# 4. Hybrid Sentiment Calculation Function
def get_hybrid_sentiment(text, custom_score):
    """
    Blend TextBlob's general-purpose polarity with the custom lexicon score.

    The lexicon score carries the larger weight (0.6 versus 0.4 for TextBlob),
    and the blended value is limited to the range [-1.0, 1.0].
    """
    general_polarity = TextBlob(text).sentiment.polarity
    # Weighted blend that favours the domain-specific lexicon
    blended = (general_polarity * 0.4) + (custom_score * 0.6)
    # Cap from above first, then from below
    capped_above = min(1.0, blended)
    return max(-1.0, capped_above)

print("✅ Helper functions defined.")

In [None]:
# This cell applies all the feature engineering steps with Polars.
# The numeric columns are native Polars expressions. The text steps go through
# map_elements, which calls a Python function once per row, so they are not vectorised.

df_featured = df.with_columns([
    # Step 1: Clean the tweet text. Nulls are replaced with "" first because
    # map_elements skips null inputs by default, which would leave null text rows
    # for the vectorizer/tokenizer further down.
    pl.col("original_text").fill_null("").map_elements(clean_text, return_dtype=pl.String).alias("clean_text"),

    # Step 2: Calculate engagement score (using a log transform to handle large variations)
    # We add 1 to avoid log(0). Retweets are weighted 1.5x relative to likes.
    (pl.col("favorite_count") + (pl.col("retweet_count") * 1.5) + 1).log().alias("engagement_score")
]).with_columns([
    # Step 3: Calculate custom and TextBlob polarity on the cleaned text
    pl.col("clean_text").map_elements(calculate_custom_polarity, return_dtype=pl.Float64).alias("custom_polarity"),
    pl.col("clean_text").map_elements(lambda x: TextBlob(x).sentiment.polarity, return_dtype=pl.Float64).alias("textblob_polarity")
]).with_columns([
    # Step 4: Combine into a hybrid score, clamped to [-1, 1] exactly as
    # get_hybrid_sentiment does. Lexicon weights are +/-1 or +/-2 per term, so the
    # unclamped blend can leave the polarity range after a single match.
    ((pl.col("textblob_polarity") * 0.4) + (pl.col("custom_polarity") * 0.6))
    .clip(-1.0, 1.0)
    .alias("hybrid_polarity")
]).with_columns([
    # Step 5: This is our TARGET variable! The final weighted impact score.
    (pl.col("hybrid_polarity") * pl.col("engagement_score")).alias("weighted_polarity")
])

print("Feature engineering complete. Final DataFrame schema:")
print(df_featured.schema)
print("\nSample of the engineered data:")
print(df_featured.select([
    "clean_text", "hybrid_polarity", "engagement_score", "weighted_polarity", "target_politicians"
]).head())

In [None]:
# Define our features (X) and target (y)
# Features: the cleaned text, the raw engagement numbers and the hybrid polarity
# Target: the weighted_polarity score we just created
#
# NOTE(review): weighted_polarity is computed as
#   hybrid_polarity * log(favorite_count + 1.5 * retweet_count + 1),
# and all three of those inputs are also passed to the model as features. The
# target is therefore a deterministic function of the features, so the reported
# scores mostly measure how well the model re-learns that formula.
MODEL_INPUT_COLUMNS = ['clean_text', 'favorite_count', 'retweet_count', 'hybrid_polarity']
TARGET_COLUMN = 'weighted_polarity'

# Note: We convert to pandas here because scikit-learn's train_test_split
# and ColumnTransformer have better integration with pandas DataFrames.
pandas_df_all = df_featured.to_pandas()

# Drop rows where a model input or the target is missing: a null count gives a
# null target, and null text cannot be vectorised or tokenised.
pandas_df = pandas_df_all.dropna(subset=MODEL_INPUT_COLUMNS + [TARGET_COLUMN])
print(f"Dropped {len(pandas_df_all) - len(pandas_df)} rows with missing inputs/target")

# 'target_politicians' is not a model input; it is carried along so predictions
# can be grouped per politician later.
X = pandas_df[MODEL_INPUT_COLUMNS + ['target_politicians']]
y = pandas_df[TARGET_COLUMN]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training set size: {X_train.shape[0]}")
print(f"Testing set size: {X_test.shape[0]}")

In [None]:
# Define the preprocessing steps for different column types.
# Columns not listed here (e.g. 'target_politicians') are dropped by the ColumnTransformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('text', TfidfVectorizer(max_features=5000, ngram_range=(1, 2)), 'clean_text'),
        ('numeric', StandardScaler(), ['favorite_count', 'retweet_count', 'hybrid_polarity'])
    ],
)

# Create the full pipeline with the preprocessor and the XGBoost Regressor
xgb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', xgb.XGBRegressor(
        objective='reg:squarederror',
        n_estimators=1000,          # Upper bound on trees; early stopping picks the real count
        learning_rate=0.05,         # Lower learning rate
        max_depth=5,                # Maximum tree depth
        subsample=0.8,              # Use 80% of data per tree
        colsample_bytree=0.8,       # Use 80% of features per tree
        random_state=42,
        n_jobs=-1,                  # Use all available CPU cores
        early_stopping_rounds=50    # Stop training if validation score doesn't improve
    ))
])

# Hold out part of the TRAINING data for early stopping. The test set must not be
# used here, otherwise the final evaluation is no longer an unseen-data estimate.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42
)

# Pipeline.fit does not transform fit parameters, so eval_set has to be passed to
# XGBoost already transformed. The preprocessor is fitted on X_fit first; the
# pipeline refits it on the same X_fit below, which yields the same vocabulary and
# scaling, so the validation matrix lines up with the training matrix column by column.
fitted_preprocessor = xgb_pipeline.named_steps['preprocessor'].fit(X_fit)
eval_set = [(fitted_preprocessor.transform(X_val), y_val)]
xgb_pipeline.fit(X_fit, y_fit, regressor__eval_set=eval_set, regressor__verbose=False)

# Persist the fitted pipeline (preprocessor + regressor) for later reuse
with open('xgb_pipeline_model.pkl', 'wb') as f:
    pickle.dump(xgb_pipeline, f)

print("✅ XGBoost model training complete.")

In [None]:
# Score the held-out test set with the fitted pipeline
y_pred_xgb = xgb_pipeline.predict(X_test)

# Regression metrics: absolute error, squared error and explained variance
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)

print("--- XGBoost Model Evaluation ---")
print(f"Mean Absolute Error (MAE): {mae_xgb:.4f}")
print(f"Mean Squared Error (MSE): {mse_xgb:.4f}")
print(f"R-squared (R²): {r2_xgb:.4f}")

# True-vs-predicted scatter with the y = x reference line drawn in dashed red
fig, ax = plt.subplots(figsize=(8, 8))
sns.scatterplot(x=y_test, y=y_pred_xgb, alpha=0.6, ax=ax)
diagonal = [y_test.min(), y_test.max()]
ax.plot(diagonal, diagonal, '--r', linewidth=2)
ax.set_title('XGBoost: True vs. Predicted Impact Score')
ax.set_xlabel('True Weighted Polarity')
ax.set_ylabel('Predicted Weighted Polarity')
ax.grid(True)
plt.show()

In [None]:
# NN-specific data preparation
MODEL_NAME = 'distilbert-base-uncased'
# AutoTokenizer resolves the tokenizer class that belongs to the checkpoint, so it
# stays correct if MODEL_NAME is changed (BertTokenizer is the wrong class for a
# DistilBERT checkpoint).
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Shared tokenizer settings: every sequence is truncated/padded to exactly 128
# tokens, matching the Input(shape=(128,)) layers of the model.
TOKENIZER_KWARGS = dict(
    max_length=128,
    truncation=True,
    padding='max_length',
    return_tensors='tf',
)

# Tokenize the text data
X_train_tokens = tokenizer(X_train['clean_text'].tolist(), **TOKENIZER_KWARGS)
X_test_tokens = tokenizer(X_test['clean_text'].tolist(), **TOKENIZER_KWARGS)

# Scale numeric features for the NN (statistics are learned from the training split only)
NUMERIC_COLUMNS = ['favorite_count', 'retweet_count', 'hybrid_polarity']
numeric_scaler = StandardScaler()
X_train_numeric = numeric_scaler.fit_transform(X_train[NUMERIC_COLUMNS])
X_test_numeric = numeric_scaler.transform(X_test[NUMERIC_COLUMNS])

# Function to create the hybrid model
def create_hybrid_regressor():
    """Build and compile the text + numeric regression network.

    The text branch encodes 128-token sequences with the pretrained transformer
    named by MODEL_NAME and mean-pools the token embeddings. The numeric branch
    takes the scaled numeric features as-is. Both are concatenated and passed
    through two dense layers to a single linear output.

    Reads the module-level MODEL_NAME and X_train_numeric (for the numeric width).

    Returns:
        A compiled tf.keras.Model with inputs 'input_ids', 'attention_mask' and
        'numeric_input', trained with MSE loss and reporting MAE.
    """
    # NOTE(review): transformers' TF models are built on Keras 2 (tf-keras). With
    # TensorFlow 2.16, `tf.keras` resolves to Keras 3 unless TF_USE_LEGACY_KERAS=1
    # is set before TensorFlow is imported — confirm the runtime is configured so.

    # Transformer part for text. TFAutoModel picks the architecture that matches the
    # checkpoint; loading a DistilBERT checkpoint through TFBertModel does not restore
    # the pretrained weights correctly. The encoder is fine-tuned (not frozen).
    encoder = TFAutoModel.from_pretrained(MODEL_NAME, trainable=True)
    input_ids = tf.keras.layers.Input(shape=(128,), dtype=tf.int32, name='input_ids')
    attention_mask = tf.keras.layers.Input(shape=(128,), dtype=tf.int32, name='attention_mask')
    # Index 0 is the per-token hidden-state sequence (batch, 128, hidden)
    token_embeddings = encoder(input_ids, attention_mask=attention_mask)[0]
    # Mean-pool over real tokens only. Inputs are padded to 128 positions, so an
    # unmasked average would be dominated by padding for short tweets.
    text_features = tf.keras.layers.GlobalAveragePooling1D()(token_embeddings, mask=attention_mask)

    # Numeric part
    numeric_input = tf.keras.layers.Input(shape=(X_train_numeric.shape[1],), name='numeric_input')

    # Concatenate text and numeric features
    concatenated = tf.keras.layers.Concatenate()([text_features, numeric_input])

    # Dense layers for regression
    x = tf.keras.layers.Dense(64, activation='relu')(concatenated)
    x = tf.keras.layers.Dropout(0.2)(x)
    x = tf.keras.layers.Dense(32, activation='relu')(x)
    output = tf.keras.layers.Dense(1, activation='linear', name='output')(x) # Linear activation for regression

    model = tf.keras.Model(inputs=[input_ids, attention_mask, numeric_input], outputs=output)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
                  loss='mean_squared_error', # Regression loss
                  metrics=['mae'])
    return model


nn_model = create_hybrid_regressor()

In [None]:
# Assemble the named inputs expected by the model's three Input layers
X_train_nn = dict(
    input_ids=X_train_tokens['input_ids'],
    attention_mask=X_train_tokens['attention_mask'],
    numeric_input=X_train_numeric,
)
X_test_nn = dict(
    input_ids=X_test_tokens['input_ids'],
    attention_mask=X_test_tokens['attention_mask'],
    numeric_input=X_test_numeric,
)

# Stop once the monitored validation metric has not improved for 3 epochs and
# roll back to the best weights seen.
early_stop = tf.keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True)

# Train the model; 10% of the training inputs are held out for validation
history = nn_model.fit(
    x=X_train_nn,
    y=y_train,
    validation_split=0.1,
    epochs=10,  # Upper bound on epochs; early stopping may end training sooner
    batch_size=32,
    callbacks=[early_stop],
)

print("\n✅ Neural Network model training complete.")

# Predict on the test inputs and flatten the (n, 1) output to a 1-D vector
nn_raw_predictions = nn_model.predict(X_test_nn)
y_pred_nn = nn_raw_predictions.flatten()

# Same regression metrics as for the XGBoost model
mae_nn = mean_absolute_error(y_test, y_pred_nn)
mse_nn = mean_squared_error(y_test, y_pred_nn)
r2_nn = r2_score(y_test, y_pred_nn)

print("\n--- Neural Network Model Evaluation ---")
print(f"Mean Absolute Error (MAE): {mae_nn:.4f}")
print(f"Mean Squared Error (MSE): {mse_nn:.4f}")
print(f"R-squared (R²): {r2_nn:.4f}")

# True-vs-predicted scatter with the y = x reference line drawn in dashed red
fig, ax = plt.subplots(figsize=(8, 8))
sns.scatterplot(x=y_test, y=y_pred_nn, alpha=0.6, ax=ax)
diagonal = [y_test.min(), y_test.max()]
ax.plot(diagonal, diagonal, '--r', linewidth=2)
ax.set_title('Neural Network: True vs. Predicted Impact Score')
ax.set_xlabel('True Weighted Polarity')
ax.set_ylabel('Predicted Weighted Polarity')
ax.grid(True)
plt.show()

In [None]:
# Pull the fitted steps back out of the pipeline
preprocessor = xgb_pipeline.named_steps['preprocessor']
regressor = xgb_pipeline.named_steps['regressor']

# Feature names, in the same order the ColumnTransformer emits its columns:
# TF-IDF terms first, then the scaled numeric columns.
fitted_transformers = preprocessor.named_transformers_
text_features = fitted_transformers['text'].get_feature_names_out()
numeric_features = fitted_transformers['numeric'].get_feature_names_out()
all_features = np.concatenate([text_features, numeric_features])

# Importance value per input column from the trained XGBoost model
importances = regressor.feature_importances_

# Rank features by importance in a Polars DataFrame
importance_df = pl.DataFrame(
    {'feature': all_features, 'importance': importances}
).sort('importance', descending=True)

print("Top 20 Most Important Features:")
print(importance_df.head(20))

# Horizontal bar chart of the 20 highest-ranked features (seaborn takes a pandas frame)
top_20 = importance_df.head(20).to_pandas()
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(x='importance', y='feature', data=top_20, palette='viridis', ax=ax)
ax.set_title('Top 20 Feature Importances for XGBoost Model')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
fig.tight_layout()
plt.show()

=================================== option 2 =====================================

In [None]:
import polars as pl
import pandas as pd
import numpy as np
import re
import emoji
from textblob import TextBlob
import matplotlib.pyplot as plt
import seaborn as sns

# Scikit-learn for ML pipeline and evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Hugging Face Transformers and TensorFlow
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

# Set random seeds for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

print(f"TensorFlow version: {tf.__version__}")

In [None]:
# --- This section re-uses the feature engineering from Option 1 to get the polarity score ---

# Define Lexicons and Helper Functions (same as before)
# Positive terms and their weights
singlish_happy = {
    'patta': 2,
    'maru': 2,
    'niyamai': 2,
    'ela': 2,
    'supiri': 2,
    'gathi': 2,
    'hodai': 1,
    'lassanai': 1,
    'shok': 1,
    'jayawewa': 2,
    'good': 1,
    'love': 2,
}
# Negative terms and their weights
singlish_sad = {
    'apalai': -2,
    'chaa': -2,
    'anthimai': -2,
    'weda na': -2,
    'boring': -1,
    'boru': -1,
    'harak': -2,
    'gon': -2,
    'pissu': -2,
    'sad': -1,
    'bad': -1,
}

def clean_text(text):
    """Normalise a raw tweet into lower-case, space-separated alphabetic words.

    Steps, in order: drop URLs and @mentions, strip the '#' from hashtags while
    keeping the tag text, spell emojis out as words, remove every remaining
    non-alphabetic character, collapse whitespace and lower-case the result.

    Args:
        text: The raw tweet text, or None.

    Returns:
        The cleaned string; "" when `text` is None.
    """
    if text is None:
        return ""
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    text = re.sub(r'#(\w+)', r'\1', text)  # Remove the hashtag symbol but keep the text
    # Use spaces as emoji delimiters and split the emoji name on underscores.
    # With the default ':' delimiters the punctuation filter below would fuse the
    # name into its neighbours (e.g. "love❤️" -> "loveredheart").
    text = emoji.demojize(text, delimiters=(" ", " ")).replace("_", " ")
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphabetic characters except whitespace
    text = re.sub(r'\s+', ' ', text)  # Collapse runs of whitespace left behind by the removals
    return text.lower().strip()

def calculate_custom_polarity(text, happy_lexicon=None, sad_lexicon=None):
    """Sum the lexicon weights of every term found in `text`.

    Single words and multi-word phrases (e.g. 'weda na') are both matched: the
    text is split on whitespace and every run of 1..N consecutive tokens is
    looked up, where N is the longest phrase present in the lexicons.

    Args:
        text: Cleaned, lower-case text.
        happy_lexicon: Mapping of positive term -> weight. Defaults to the
            module-level `singlish_happy`.
        sad_lexicon: Mapping of negative term -> weight. Defaults to the
            module-level `singlish_sad`.

    Returns:
        The summed weight of all matches; 0 for empty text.
    """
    if happy_lexicon is None:
        happy_lexicon = singlish_happy
    if sad_lexicon is None:
        sad_lexicon = singlish_sad
    if not text:
        return 0

    tokens = text.split()
    # Longest phrase (in words) across both lexicons; a per-token lookup alone
    # can never hit keys that contain a space.
    longest_phrase = max(
        (len(term.split()) for lexicon in (happy_lexicon, sad_lexicon) for term in lexicon),
        default=1,
    )

    score = 0
    for size in range(1, longest_phrase + 1):
        for start in range(len(tokens) - size + 1):
            phrase = " ".join(tokens[start:start + size])
            score += happy_lexicon.get(phrase, 0)
            score += sad_lexicon.get(phrase, 0)
    return score

# Load data
df = pl.read_csv('/kaggle/input/tweet-politicians/sri_lanka_election_tweets_analysis_ready.csv')

# Engineer the hybrid_polarity score in three passes, each depending on the previous:
#   1. cleaned text, 2. lexicon + TextBlob polarity, 3. weighted blend (0.4 / 0.6).
clean_text_expr = (
    pl.col("original_text")
    .map_elements(clean_text, return_dtype=pl.String)
    .alias("clean_text")
)
custom_polarity_expr = (
    pl.col("clean_text")
    .map_elements(calculate_custom_polarity, return_dtype=pl.Float64)
    .alias("custom_polarity")
)
textblob_polarity_expr = (
    pl.col("clean_text")
    .map_elements(lambda x: TextBlob(x).sentiment.polarity, return_dtype=pl.Float64)
    .alias("textblob_polarity")
)
hybrid_polarity_expr = (
    (pl.col("textblob_polarity") * 0.4) + (pl.col("custom_polarity") * 0.6)
).alias("hybrid_polarity")

df_featured = (
    df.with_columns(clean_text_expr)
    .with_columns(custom_polarity_expr, textblob_polarity_expr)
    .with_columns(hybrid_polarity_expr)
)

# --- NEW PART: Create categorical labels ---
# We define thresholds to convert the continuous polarity score into discrete labels.
def label_sentiment(polarity):
    """Bucket a continuous polarity score into an integer sentiment class.

    Returns 2 ('positive') above 0.1, 0 ('negative') below -0.1, and
    1 ('neutral') for everything in between, including both thresholds.
    """
    if polarity > 0.1:
        return 2  # positive
    if polarity < -0.1:
        return 0  # negative
    return 1  # neutral

# Attach the integer label, keep only the model columns, and discard incomplete rows
label_expr = (
    pl.col("hybrid_polarity")
    .map_elements(label_sentiment, return_dtype=pl.Int64)
    .alias("label")
)
df_labeled = (
    df_featured
    .with_columns(label_expr)
    .select(["clean_text", "label"])
    .drop_nulls()
)


# Class balance of the derived labels
print("Label Distribution:")
print(df_labeled['label'].value_counts())

# scikit-learn and TensorFlow take pandas objects from here on
pandas_df = df_labeled.to_pandas()

# Features (X) are the cleaned texts; target (y) is the integer class
X = pandas_df['clean_text']
y = pandas_df['label']

# 80/20 split that preserves the class proportions in both parts
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"\nTraining set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")

In [None]:
# Checkpoint to fine-tune
MODEL_NAME = 'cardiffnlp/twitter-roberta-base-sentiment-latest'

# Tokenizer and sequence-classification model are both resolved from the checkpoint name.
# num_labels=3 requests a three-way classification head.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = TFAutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=3,
)

# Class-id -> class-name mapping stored in the model config.
# label_sentiment assumes 0 -> negative, 1 -> neutral, 2 -> positive; compare the
# printed mapping against that assumption.
LABEL_MAP = model.config.id2label
print(f"Model's label mapping: {LABEL_MAP}")

In [None]:
# Tokenize both splits with identical settings (truncate at 128 tokens, pad each
# split to its own longest sequence)
TOKENIZE_OPTIONS = dict(truncation=True, padding=True, max_length=128)
train_encodings = tokenizer(X_train.tolist(), **TOKENIZE_OPTIONS)
test_encodings = tokenizer(X_test.tolist(), **TOKENIZE_OPTIONS)


def _as_tf_dataset(encodings, labels):
    """Pair the tokenizer output with its labels as an unbatched tf.data.Dataset."""
    return tf.data.Dataset.from_tensor_slices((dict(encodings), labels.tolist()))


# Batch and prefetch; only the training stream is shuffled
BATCH_SIZE = 16 # Use 8 or 16 for fine-tuning, depending on GPU memory
train_dataset = (
    _as_tf_dataset(train_encodings, y_train)
    .shuffle(1000)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)
test_dataset = (
    _as_tf_dataset(test_encodings, y_test)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

print("\n✅ Datasets are ready for training.")

In [None]:
# Use the AdamW optimizer, recommended for Transformers
optimizer = tf.keras.optimizers.AdamW(learning_rate=2e-5)

# We must compile the model with from_logits=True because the model outputs raw scores (logits),
# not probabilities (which a softmax function would produce).
model.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)

# Carve a validation set out of the TRAINING data. Early stopping with
# restore_best_weights selects the checkpoint by its validation score, so validating
# on the test set would make the final test metrics optimistically biased.
# The split is rebuilt from the tokenized arrays rather than from `train_dataset`,
# because that dataset reshuffles every epoch and cannot be partitioned reliably.
VAL_FRACTION = 0.1
train_feature_arrays = {
    name: np.asarray(values, dtype=np.int32)
    for name, values in dict(train_encodings).items()
}
train_label_array = np.asarray(y_train, dtype=np.int32)
n_val = max(1, int(len(train_label_array) * VAL_FRACTION))

# X_train is already in random order (train_test_split shuffles), so the first
# n_val rows form a random validation sample.
val_dataset = tf.data.Dataset.from_tensor_slices((
    {name: values[:n_val] for name, values in train_feature_arrays.items()},
    train_label_array[:n_val]
)).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

fit_dataset = tf.data.Dataset.from_tensor_slices((
    {name: values[n_val:] for name, values in train_feature_arrays.items()},
    train_label_array[n_val:]
)).shuffle(max(1, len(train_label_array) - n_val)).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

# Define a callback for early stopping
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    patience=2,
    restore_best_weights=True
)

# Fine-tune the model
history = model.fit(
    fit_dataset,
    epochs=5, # Fine-tuning usually requires fewer epochs
    validation_data=val_dataset,
    callbacks=[early_stopping]
)

print("\n✅ Model fine-tuning complete.")

In [None]:
# First, get the model's raw predictions (logits) on the test set
test_logits = model.predict(test_dataset).logits

# Convert the logits to class predictions by taking the argmax
y_pred = np.argmax(test_logits, axis=1)

# Fix the class order explicitly. Names are looked up by class id instead of
# relying on the iteration order of LABEL_MAP, and passing `labels` keeps the
# report and matrix at full size even if a class is missing from y_test or y_pred.
CLASS_IDS = sorted(LABEL_MAP)
CLASS_NAMES = [LABEL_MAP[class_id] for class_id in CLASS_IDS]

# Generate the classification report
print("--- Classification Report ---")
print(classification_report(y_test, y_pred, labels=CLASS_IDS, target_names=CLASS_NAMES))

# Generate and plot the confusion matrix (rows = actual class, columns = predicted class)
cm = confusion_matrix(y_test, y_pred, labels=CLASS_IDS)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=CLASS_NAMES,
            yticklabels=CLASS_NAMES)
plt.title('Confusion Matrix')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

In [None]:
# --- Add this as a new cell at the end of your notebook ---

import polars as pl
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Goal: Aggregate the predicted scores for each politician in the test set.
#
# This cell belongs to the Option 1 (regression) workflow. It needs Option 1's
# X_test (a DataFrame with a 'target_politicians' column), y_test and y_pred_xgb.
# The Option 2 cells reuse the names X_test / y_test for a text Series and class
# labels, so fail early with a clear message if those are what is in memory.
if not isinstance(X_test, pd.DataFrame) or 'target_politicians' not in X_test.columns:
    raise RuntimeError(
        "X_test is not the Option 1 feature DataFrame (missing 'target_politicians'). "
        "Re-run the Option 1 split and XGBoost cells before running this cell."
    )
if len(y_pred_xgb) != len(X_test):
    raise RuntimeError(
        f"y_pred_xgb has {len(y_pred_xgb)} rows but X_test has {len(X_test)}; "
        "re-run the XGBoost prediction cell so both come from the same split."
    )

# Step 1: Create a consolidated DataFrame with all necessary information.
# y_test is a Series and is aligned to X_test by index; y_pred_xgb is a numpy
# array and is assigned by position (same row order as X_test).
results_df = X_test.copy()
results_df['true_impact_score'] = y_test
results_df['predicted_impact_score'] = y_pred_xgb

print("--- Sample of the Results DataFrame ---")
display(results_df.head())


# Step 2: Use groupby() to aggregate the scores for each politician.
# We will calculate the total impact, the average impact per tweet, and the number of tweets.
politician_summary = results_df.groupby('target_politicians').agg(
    total_predicted_impact=('predicted_impact_score', 'sum'),
    average_predicted_impact=('predicted_impact_score', 'mean'),
    tweet_count=('target_politicians', 'count')
).sort_values('total_predicted_impact', ascending=False) # Sort by who has the most impact

print("\n--- Aggregated Impact Summary per Politician ---")
display(politician_summary)


# Step 3: Visualize the results for a clear comparison.
# A bar chart is perfect for showing the total predicted impact.
plt.figure(figsize=(12, 7))
sns.barplot(
    x=politician_summary.index,
    y=politician_summary['total_predicted_impact'],
    palette='viridis'
)
plt.title('Total Predicted Social Media Impact per Politician (on Test Set)', fontsize=16)
plt.xlabel('Politician', fontsize=12)
plt.ylabel('Total Predicted Impact Score', fontsize=12)
plt.xticks(rotation=45, ha='right') # Rotate labels for better readability
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()