In [1]:
# trump_author_attribution.py – *Extended* solution for Assignment 3
# --------------------------------------------------------------
# Supports **four** algorithms selectable via --algo <name>
#   1. baseline   – TF‑IDF  + Logistic Regression (as before)
#   2. svm        – TF‑IDF  + Linear SVM
#   3. mlp        – TF‑IDF  + 2‑layer feed‑forward neural net
#   4. transformer – DistilBERT fine‑tuning (requires GPU & 🤗)
# Produces <algo>_preds.txt in the required single‑line format.
# --------------------------------------------------------------
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from __future__ import annotations
import os, re, string, pickle, argparse, sys
from pathlib import Path
from typing import List, Tuple

import numpy as np
import pandas as pd

# ─────────────────────── scikit‑learn components ────────────────────────
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from datetime import datetime
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# Add these imports at the top
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
import torch.nn.functional as F
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
import numpy as np
# Add these imports at the top
from transformers import (
    DistilBertTokenizer,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
import torch
from torch.utils.data import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [2]:
import os
os.environ["WANDB_DISABLED"] = "true"
os.environ["HF_HUB_DISABLE_TELEMETRY"]="1"

## Data Collection

In [3]:
class TweetDataset(Dataset):
    """Custom Dataset for tweet classification with DistilBERT"""

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        # Tokenize the text
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

## Preprocessing data

In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import string

# Download required NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')
nltk.download('omw-1.4')

# Initialize preprocessing tools
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """
    Enhanced preprocessing for tweet text with common tweet-specific issues
    """
    if pd.isna(text) or text == '':
        return ''

    # Convert to string and lowercase
    text = str(text).lower()

    # Handle HTML entities and special characters
    text = re.sub(r'&amp;', 'and', text)
    text = re.sub(r'&lt;', '<', text)
    text = re.sub(r'&gt;', '>', text)
    text = re.sub(r'&quot;', '"', text)
    text = re.sub(r'&apos;', "'", text)

    # Remove URLs (more comprehensive)
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
    text = re.sub(r'www\.(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
    text = re.sub(r't\.co/\w+', '', text)  # Twitter's URL shortener

    # Remove user mentions (@username) but keep the context
    text = re.sub(r'@\w+', '', text)

    # Handle hashtags - remove # but keep the word
    text = re.sub(r'#(\w+)', r'\1', text)

    # Handle retweets
    text = re.sub(r'^rt\s+', '', text)  # Remove RT at beginning
    text = re.sub(r'\brt\b', '', text)  # Remove standalone RT

    # Handle contractions and common abbreviations
    contractions = {
        "won't": "will not", "can't": "cannot", "n't": " not",
        "'re": " are", "'ve": " have", "'ll": " will", "'d": " would",
        "'m": " am", "thx": "thanks", "u": "you", "ur": "your",
        "ppl": "people", "govt": "government", "b4": "before"
    }
    for contraction, expansion in contractions.items():
        text = re.sub(contraction, expansion, text)

    # Handle repeated characters (e.g., "sooooo" -> "so")
    text = re.sub(r'(.)\1{2,}', r'\1', text)

    # Remove excessive whitespace and newlines
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\n+', ' ', text)

    # Remove punctuation but preserve emoticons/emoji patterns first
    # Simple emoticon preservation
    emoticon_pattern = r'[;:=8][\-o\*\']?[\)\]\(\[dDpP/\\OpP]'
    emoticons = re.findall(emoticon_pattern, text)

    # Remove punctuation and numbers
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\d+', '', text)

    # Tokenize
    try:
        tokens = word_tokenize(text)
    except:
        tokens = text.split()

    # Remove stopwords, short words, and lemmatize
    tokens = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words
        and len(token) > 2
        and token.isalpha()  # Only keep alphabetic tokens
    ]

    # Add back emoticons as tokens
    tokens.extend(emoticons)

    # Remove empty tokens and duplicates while preserving order
    seen = set()
    clean_tokens = []
    for token in tokens:
        if token and token not in seen:
            clean_tokens.append(token)
            seen.add(token)

    result = ' '.join(clean_tokens)
    return result if result.strip() else 'empty_tweet'

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [5]:
def preprocess_train_data(train_fn: pd.DataFrame) -> pd.DataFrame:
    # Load and preprocess data
    cols = ['tweet_id','user_handle','tweet','timestamp','device']
    train_df = pd.read_csv(train_fn, sep="\t", header=None, names=cols, quoting=3, index_col=False)
    # print(f"First value in the 'user_handle' column: {train_df['user_handle'].iloc[0]}")
    # print(train_df.head())

    train_df = train_df[train_df['user_handle'] == 'realDonaldTrump']

    print(f"Shape of the original dataset: {train_df.shape}")



    # Filter for Android and iPhone only
    # device_mapping = {'android': 'android', 'iphone': 'iphone'}
    # train_clean = train_df[train_df['device'].isin(device_mapping.keys())].copy()
    train_df['device'] = train_df['device'].apply(lambda x: 'iphone' if x != 'android' else x)

    train_df['processed_tweet'] = train_df['tweet'].apply(preprocess_text)

    return train_df

## Feature extraction
### We've extracted time features from the timestamp column

In [6]:


def extract_timestamp_features(df):
    """
    Extract temporal features from timestamp column
    """
    # Convert timestamp to datetime
    df['datetime'] = pd.to_datetime(df['timestamp'], errors='coerce')
    # print(f"first 5 rows of the 'datetime' column:\n{df['datetime'].head()}")
    # # Drop rows with invalid datetime
    # df = df.dropna(subset=['datetime'])

    # Extract temporal features
    df['hour_of_day'] = df['datetime'].dt.hour
    df['day_of_week'] = df['datetime'].dt.dayofweek  # Monday=0, Sunday=6
    df['day_of_month'] = df['datetime'].dt.day
    df['month'] = df['datetime'].dt.month
    df['year'] = df['datetime'].dt.year

    # Weekend indicator (Saturday=5, Sunday=6)
    df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)

    # Season (Northern Hemisphere)
    def get_season(month):
        if month in [12, 1, 2]:
            return 0  # Winter
        elif month in [3, 4, 5]:
            return 1  # Spring
        elif month in [6, 7, 8]:
            return 2  # Summer
        else:
            return 3  # Fall

    df['season'] = df['month'].apply(get_season)

    # Time of day categories
    def get_time_period(hour):
        if 6 <= hour < 12:
            return 0  # Morning
        elif 12 <= hour < 18:
            return 1  # Afternoon
        elif 18 <= hour < 22:
            return 2  # Evening
        else:
            return 3  # Night

    df['time_period'] = df['hour_of_day'].apply(get_time_period)

    # Business hours indicator (9 AM to 5 PM, weekdays)
    df['is_business_hours'] = ((df['hour_of_day'] >= 9) &
                               (df['hour_of_day'] < 17) &
                               (df['day_of_week'] < 5)).astype(int)

    # Late night indicator (11 PM to 6 AM)
    df['is_late_night'] = ((df['hour_of_day'] >= 23) |
                           (df['hour_of_day'] < 6)).astype(int)

    # for any null values in one of the column created, fill with mod value
    for col in ['hour_of_day', 'day_of_week', 'day_of_month', 'month', 'year',
                'is_weekend', 'season', 'time_period', 'is_business_hours',
                'is_late_night']:
        if df[col].isnull().any():
            try:
                mod_value = df[col].mode()[0]  # Get the most common value
            except:
                print(df[col])
            # mod_value = df[col].mode()[0]
            df[col].fillna(mod_value, inplace=True)

    return df


## Training Pipeline

In [7]:
# Define the neural network architecture
class FFNNClassifier(nn.Module):
    def __init__(self, input_size, hidden_sizes=[512, 256, 128], dropout_rate=0.3):
        super(FFNNClassifier, self).__init__()

        # Create layers dynamically
        layers = []
        prev_size = input_size

        for hidden_size in hidden_sizes:
            layers.extend([
                nn.Linear(prev_size, hidden_size),
                nn.ReLU(),
                nn.BatchNorm1d(hidden_size),
                nn.Dropout(dropout_rate)
            ])
            prev_size = hidden_size

        # Output layer
        layers.append(nn.Linear(prev_size, 2))  # 2 classes: android/iphone

        self.network = nn.Sequential(*layers)

    def forward(self, x):
        return self.network(x)

In [8]:
def extract_rich_features(df):
    """
    Extract comprehensive stylometric and linguistic features
    """
    features = pd.DataFrame()

    # Basic text statistics
    features['text_length'] = df['tweet'].str.len()
    features['word_count'] = df['tweet'].str.split().str.len()
    features['avg_word_length'] = df['tweet'].apply(lambda x: np.mean([len(word) for word in str(x).split()]) if str(x).split() else 0)
    features['sentence_count'] = df['tweet'].str.count(r'[.!?]+') + 1

    # Character-level features
    features['uppercase_ratio'] = df['tweet'].apply(lambda x: sum(1 for c in str(x) if c.isupper()) / max(len(str(x)), 1))
    features['digit_ratio'] = df['tweet'].apply(lambda x: sum(1 for c in str(x) if c.isdigit()) / max(len(str(x)), 1))
    features['space_ratio'] = df['tweet'].apply(lambda x: sum(1 for c in str(x) if c.isspace()) / max(len(str(x)), 1))
    features['punctuation_ratio'] = df['tweet'].apply(lambda x: sum(1 for c in str(x) if c in string.punctuation) / max(len(str(x)), 1))

    # Specific punctuation counts
    features['exclamation_count'] = df['tweet'].str.count('!')
    features['question_count'] = df['tweet'].str.count(r'\?')
    features['comma_count'] = df['tweet'].str.count(',')
    features['period_count'] = df['tweet'].str.count(r'\.')
    features['ellipsis_count'] = df['tweet'].str.count(r'\.{2,}')
    features['dash_count'] = df['tweet'].str.count('-')
    features['quote_count'] = df['tweet'].str.count('"') + df['tweet'].str.count("'")

    # Twitter-specific features
    features['mention_count'] = df['tweet'].str.count('@')
    features['hashtag_count'] = df['tweet'].str.count('#')
    features['url_count'] = df['tweet'].str.count('http') + df['tweet'].str.count('www')
    features['retweet_indicator'] = df['tweet'].str.lower().str.startswith('rt').astype(int)

    # Capitalization patterns
    features['all_caps_words'] = df['tweet'].apply(lambda x: sum(1 for word in str(x).split() if word.isupper() and len(word) > 1))
    features['title_case_words'] = df['tweet'].apply(lambda x: sum(1 for word in str(x).split() if word.istitle()))
    features['starts_with_capital'] = df['tweet'].apply(lambda x: 1 if str(x) and str(x)[0].isupper() else 0)

    # Repetition patterns
    features['repeated_chars'] = df['tweet'].apply(lambda x: len(re.findall(r'(.)\1{2,}', str(x))))
    features['repeated_words'] = df['tweet'].apply(lambda x: len(str(x).split()) - len(set(str(x).lower().split())))

    # Vocabulary richness
    features['unique_word_ratio'] = df['tweet'].apply(lambda x: len(set(str(x).lower().split())) / max(len(str(x).split()), 1))

    # Emotional indicators
    features['positive_emoji'] = df['tweet'].str.count(r'[😀😃😄😁😆😅😂🤣😊😇🙂😉😌😍🥰😘😗😙😚😋😛😝😜🤪🤨🧐🤓😎🤩🥳😏😒😞😔😟😕🙁☹️😣😖😫😩🥺😢😭😤😠😡🤬🤯😳🥵🥶😱😨😰😥😓🤗🤔🤭🤫🤥😶😐😑😬🙄😯😦😧😮😲🥱😴🤤😪😵🤐🥴🤢🤮🤧😷🤒🤕🤑🤠😈👿👹👺🤡💩👻💀☠️👽👾🤖🎃😺😸😹😻😼😽🙀😿😾]')
    features['negative_emoji'] = df['tweet'].str.count(r'[😞😔😟😕🙁☹️😣😖😫😩🥺😢😭😤😠😡🤬]')

    # Linguistic style
    features['first_person_pronouns'] = df['tweet'].str.lower().str.count(r'\b(i|me|my|mine|myself)\b')
    features['second_person_pronouns'] = df['tweet'].str.lower().str.count(r'\b(you|your|yours|yourself)\b')
    features['third_person_pronouns'] = df['tweet'].str.lower().str.count(r'\b(he|she|it|they|him|her|them|his|hers|its|their|theirs)\b')

    # Common Trump-specific patterns
    features['trump_phrases'] = df['tweet'].str.lower().str.count(r'\b(great|tremendous|amazing|fantastic|incredible|believe me|folks|sad|fake|winner|loser)\b')
    features['superlatives'] = df['tweet'].str.lower().str.count(r'\b(best|worst|greatest|biggest|smallest|most|least)\b')
    features['absolutist_words'] = df['tweet'].str.lower().str.count(r'\b(always|never|all|none|every|nothing|everything|everyone|nobody)\b')

    # Fill NaN values
    features = features.fillna(0)

    return features

In [9]:
#### Functions for Bert

# Use ORIGINAL tweet text for transformer (not heavily preprocessed)
# Light preprocessing for transformers
def light_preprocess_for_transformer(text):
    """Light preprocessing that preserves important stylistic features"""
    if pd.isna(text) or text == '':
        return ''

    text = str(text)

    # Only remove excessive whitespace and newlines
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\n+', ' ', text)

    # Handle some HTML entities
    text = re.sub(r'&amp;', '&', text)
    text = re.sub(r'&lt;', '<', text)
    text = re.sub(r'&gt;', '>', text)
    text = re.sub(r'&quot;', '"', text)

    return text.strip()

# Define compute metrics function
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')
    accuracy = accuracy_score(labels, predictions)
    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

## FFNN helper functions

In [10]:
from sklearn.metrics import f1_score

def create_ffnn_param_grid():
    """Create parameter grid for FFNN"""
    return {
        'hidden_sizes': [[128], [256, 128], [256, 128]],
        'dropout_rate': [0.2, 0.5],
        'learning_rate': [0.001, 0.01],
        'batch_size': [32],
        'weight_decay': [1e-4]
    }

def train_ffnn_model(X_tensor, y_tensor, params, input_size):
    """Train a single FFNN model with given parameters"""
    # Create data loaders
    dataset = TensorDataset(X_tensor, y_tensor)
    dataloader = DataLoader(dataset, batch_size=params['batch_size'], shuffle=True)

    # Initialize model
    model = FFNNClassifier(
        input_size,
        hidden_sizes=params['hidden_sizes'],
        dropout_rate=params['dropout_rate']
    )

    # Setup training
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(
        model.parameters(),
        lr=params['learning_rate'],
        weight_decay=params['weight_decay']
    )

    # Training loop with early stopping
    model.train()
    best_loss = float('inf')
    patience_counter = 0

    for epoch in range(30):  # Reduced epochs for grid search
        total_loss = 0
        for batch_X, batch_y in dataloader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_loss = total_loss / len(dataloader)

        # Early stopping
        if avg_loss < best_loss:
            best_loss = avg_loss
            patience_counter = 0
        else:
            patience_counter += 1

        if patience_counter >= 5:  # Reduced patience
            break

    return model, best_loss

def evaluate_ffnn_model(model, X_val, y_val):
    """Evaluate FFNN model and return F1-score"""
    model.eval()
    with torch.no_grad():
        outputs = model(X_val)
        _, predicted = torch.max(outputs, dim=1)
    y_true = y_val.cpu().numpy()
    y_pred = predicted.cpu().numpy()
    return f1_score(y_true, y_pred, average='macro')

def ffnn_grid_search(X_tensor, y_tensor, input_size):
    """Perform grid search for FFNN hyperparameters"""
    param_grid = create_ffnn_param_grid()

    # Split data for validation
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import f1_score
    X_train, X_val, y_train, y_val = train_test_split(
        X_tensor, y_tensor, test_size=0.2, random_state=42, stratify=y_tensor
    )

    best_score = 0
    best_params = {}
    best_model = None

    print("Starting FFNN Grid Search...")

    # Generate all parameter combinations
    combinations = []
    for hidden_sizes in param_grid['hidden_sizes']:
        for dropout_rate in param_grid['dropout_rate']:
            for learning_rate in param_grid['learning_rate']:
                for batch_size in param_grid['batch_size']:
                    for weight_decay in param_grid['weight_decay']:
                        combinations.append({
                            'hidden_sizes': hidden_sizes,
                            'dropout_rate': dropout_rate,
                            'learning_rate': learning_rate,
                            'batch_size': batch_size,
                            'weight_decay': weight_decay
                        })

    print(f"Testing {len(combinations)} combinations...")

    for i, params in enumerate(combinations, 1):
        try:
            print(f"Combination {i}/{len(combinations)}: {params}")

            # Train model
            model, _ = train_ffnn_model(X_train, y_train, params, input_size)

            # Evaluate model
            score = evaluate_ffnn_model(model, X_val, y_val)

            print(f"  Validation F1: {score:.4f}")

            # Update best if necessary
            if score > best_score:
                best_score = score
                best_params = params.copy()
                best_model = model.state_dict().copy()
                print(f"  New best! Score: {best_score:.4f}")

        except Exception as e:
            print(f"  Error: {e}")
            continue

    return best_params, best_score, best_model

## Distillbert helper functions

In [None]:
def create_distilbert_param_grid():
    """Create parameter grid for DistilBERT"""
    return {
        'num_train_epochs': [3],
        'per_device_train_batch_size': [16, 32],
        'learning_rate': [2e-5, 5e-5],
        'weight_decay': [0.01, 0.001],
        'warmup_steps': [100, 200]
    }

def train_distilbert_model(train_dataset, val_dataset, tokenizer, params):
    """Train a single DistilBERT model with given parameters"""
    # Load model
    model = DistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-uncased',
        num_labels=2,
        problem_type="single_label_classification"
    )

    # Training arguments
    training_args = TrainingArguments(
        output_dir='./trump_distilbert_temp',
        num_train_epochs=params['num_train_epochs'],
        per_device_train_batch_size=params['per_device_train_batch_size'],
        per_device_eval_batch_size=params['per_device_train_batch_size'],
        learning_rate=params['learning_rate'],
        weight_decay=params['weight_decay'],
        warmup_steps=params['warmup_steps'],
        logging_steps=50,
        eval_strategy="steps",
        eval_steps=100,
        save_strategy="no",  # Don't save during grid search
        load_best_model_at_end=False,
        remove_unused_columns=False,
        push_to_hub=False,
        report_to=None,
        metric_for_best_model="f1",
        greater_is_better=True
    )

    # Initialize trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]  # Reduced patience
    )

    # Train model
    trainer.train()

    # Evaluate and return score
    eval_results = trainer.evaluate()
    return model, eval_results['eval_f1']

def distilbert_grid_search(texts, labels, tokenizer):
    """Perform grid search for DistilBERT hyperparameters"""
    param_grid = create_distilbert_param_grid()

    # Split data for validation
    from sklearn.model_selection import train_test_split
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        texts, labels, test_size=0.2, random_state=42, stratify=labels
    )

    # Create datasets
    train_dataset = TweetDataset(train_texts, train_labels, tokenizer)
    val_dataset = TweetDataset(val_texts, val_labels, tokenizer)

    best_score = 0
    best_params = {}
    best_model = None

    print("Starting DistilBERT Grid Search...")

    # Generate parameter combinations (simplified)
    combinations = []
    for epochs in param_grid['num_train_epochs']:
        for batch_size in param_grid['per_device_train_batch_size']:
            for lr in param_grid['learning_rate']:
                for wd in param_grid['weight_decay']:
                    for warmup in param_grid['warmup_steps']:
                        combinations.append({
                            'num_train_epochs': epochs,
                            'per_device_train_batch_size': batch_size,
                            'learning_rate': lr,
                            'weight_decay': wd,
                            'warmup_steps': warmup
                        })

    # Limit combinations for computational efficiency
    combinations = combinations[:3]  # Test only first 3 combinations
    print(f"Testing {len(combinations)} combinations...")

    for i, params in enumerate(combinations, 1):
        try:
            print(f"Combination {i}/{len(combinations)}: {params}")

            # Train model
            model, score = train_distilbert_model(train_dataset, val_dataset, tokenizer, params)

            print(f"  F1 Score: {score:.4f}")

            # Update best if necessary
            if score > best_score:
                best_score = score
                best_params = params.copy()
                best_model = model
                print(f"  New best! Score: {best_score:.4f}")

        except Exception as e:
            print(f"  Error: {e}")
            continue

    return best_params, best_score, best_model

In [11]:
def training_pipeline(alg, train_fn):
    """Returns a trained model given the specific task and algorithm."""

    train_clean = preprocess_train_data(train_fn)
    # print(train_clean)

    # Extract features
    train_clean = extract_timestamp_features(train_clean)
    train_clean['label'] = train_clean['device'].map({'android': 0, 'iphone': 1})

    # Define feature columns
    temporal_features = ['hour_of_day', 'day_of_week', 'day_of_month', 'month',
                'is_weekend', 'season', 'time_period', 'is_business_hours', 'is_late_night']

    if alg == 1:  # Logistic Regression with Grid Search

        # Prepare features
        X_text = train_clean['processed_tweet']
        X_temporal = train_clean[temporal_features]
        y = train_clean['label']

        # Create and fit TF-IDF vectorizer
        tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), min_df=2, max_df=0.95)
        X_text_tfidf = tfidf.fit_transform(X_text)

        # Scale temporal features
        scaler = StandardScaler()
        X_temp_scaled = scaler.fit_transform(X_temporal)

        # Combine features
        from scipy.sparse import hstack
        X_combined = hstack([X_text_tfidf, X_temp_scaled])

        # Grid search for Logistic Regression hyperparameters
        param_grid = {
            'C': [1.0, 2.0],
            'max_iter': [10000],
            'solver': ['liblinear', 'lbfgs'],
            'penalty': ['l2']
        }

        # Create LogisticRegression instance
        lr = LogisticRegression(random_state=42)

        # Perform grid search with cross-validation
        print("Starting Grid Search for Logistic Regression...")
        grid_search = GridSearchCV(
            estimator=lr,
            param_grid=param_grid,
            cv=3,  # 3-fold cross-validation
            scoring='f1',
            n_jobs=-1,  # Use all available cores
            verbose=1   # Print progress
        )

        # Fit the grid search
        grid_search.fit(X_combined, y)

        # Print results
        print(f"Best parameters: {grid_search.best_params_}")
        print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

        # Get the best classifier
        best_classifier = grid_search.best_estimator_

        # Return model components
        return {
            'algo_num': 1,
            'classifier': best_classifier,
            'tfidf': tfidf,
            'scaler': scaler,
            'temporal_features': temporal_features,
            'best_params': grid_search.best_params_,
            'cv_score': grid_search.best_score_,
            'grid_search': grid_search  # Store the full grid search object if needed
        }

    elif alg == 2:  # SVM (both linear and nonlinear)

        # Prepare features
        X_text = train_clean['processed_tweet']
        X_temporal = train_clean[temporal_features]
        # print(X_temporal)
        y = train_clean['label']


        # Create and fit TF-IDF vectorizer
        tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), min_df=10, max_df=0.95)
        # print(X_text)
        X_text_tfidf = tfidf.fit_transform(X_text)

        # Scale temporal features
        scaler = StandardScaler()
        X_temp_scaled = scaler.fit_transform(X_temporal)

        # Combine features
        from scipy.sparse import hstack
        X_combined = hstack([X_text_tfidf, X_temp_scaled])

        # Grid search for best SVM kernel and parameters
        param_grid = [
            # Linear kernel
            {
                'kernel': ['linear'],
                'C': [0.1, 1.0, 10.0]
            },
            # RBF (nonlinear) kernel
            {
                'kernel': ['rbf'],
                'C': [0.1, 1.0, 10.0],
                'gamma': ['scale', 'auto', 0.001, 0.01]
            },
            # Polynomial (nonlinear) kernel
            {
                'kernel': ['poly'],
                'C': [0.1, 1.0, 10.0],
                'degree': [2, 3],
                'gamma': ['scale', 'auto']
            }
        ]

        # Use GridSearchCV to find best parameters
        svm = SVC(random_state=42, probability=True)  # probability=True for predict_proba
        grid_search = GridSearchCV(
            svm,
            param_grid,
            cv=3,  # 3-fold CV to save time
            scoring='f1',
            n_jobs=-1,
            verbose=1
        )

        print("Training SVM with grid search...")
        grid_search.fit(X_combined, y)

        print(f"Best SVM parameters: {grid_search.best_params_}")
        print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

        # Return model components
        return {
            'algo_num': 2,
            'classifier': grid_search.best_estimator_,
            'tfidf': tfidf,
            'scaler': scaler,
            'temporal_features': temporal_features,
            'best_params': grid_search.best_params_,
            'cv_score': grid_search.best_score_
        }

    # Add other algorithms here (alg 3-5)
    elif alg == 3:  # Feed-Forward Neural Network (FFNN) with Grid Search

        # Prepare features (same as before)
        X_text = train_clean['processed_tweet']
        X_temporal = train_clean[temporal_features]
        y = train_clean['label']

        # TF-IDF setup (same as before)
        valid_texts = [text for text in X_text if text and text.strip() and text != 'empty_tweet']

        if len(valid_texts) < 2:
            print(f"Warning: Only {len(valid_texts)} valid texts found. Using minimal TF-IDF settings.")
            tfidf = TfidfVectorizer(max_features=1000, ngram_range=(1, 1), min_df=1, max_df=1.0)
        else:
            min_df = max(1, min(2, len(valid_texts) // 10))
            tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), min_df=min_df, max_df=0.95)

        try:
            X_text_tfidf = tfidf.fit_transform(X_text)
        except ValueError as e:
            print(f"TF-IDF failed with error: {e}")
            print("Falling back to basic word count...")
            tfidf = TfidfVectorizer(max_features=100, ngram_range=(1, 1), min_df=1, max_df=1.0)
            X_text_tfidf = tfidf.fit_transform(X_text)

        # Scale temporal features
        scaler = StandardScaler()
        X_temp_scaled = scaler.fit_transform(X_temporal)

        # Combine features and convert to tensors
        from scipy.sparse import hstack
        X_combined = hstack([X_text_tfidf, X_temp_scaled]).toarray()
        X_tensor = torch.FloatTensor(X_combined)
        y_tensor = torch.LongTensor(y.values)
        input_size = X_combined.shape[1]

        # Perform grid search
        best_params, best_score, best_model_state = ffnn_grid_search(X_tensor, y_tensor, input_size)

        print(f"\nGrid Search Complete!")
        print(f"Best parameters: {best_params}")
        print(f"Best validation score: {best_score:.4f}")

        # Train final model with best parameters on full dataset
        print("Training final model...")
        dataset = TensorDataset(X_tensor, y_tensor)
        dataloader = DataLoader(dataset, batch_size=best_params['batch_size'], shuffle=True)

        final_model = FFNNClassifier(
            input_size,
            hidden_sizes=best_params['hidden_sizes'],
            dropout_rate=best_params['dropout_rate']
        )

        # Load best weights if available
        if best_model_state is not None:
            try:
                final_model.load_state_dict(best_model_state)
            except:
                print("Could not load best weights, training from scratch...")

        # Final training (same training loop as before but with best params)
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(
            final_model.parameters(),
            lr=best_params['learning_rate'],
            weight_decay=best_params['weight_decay']
        )

        final_model.train()
        best_loss = float('inf')
        patience_counter = 0

        print("Final training...")
        for epoch in range(100):
            total_loss = 0
            for batch_X, batch_y in dataloader:
                optimizer.zero_grad()
                outputs = final_model(batch_X)
                loss = criterion(outputs, batch_y)
                loss.backward()
                optimizer.step()
                total_loss += loss.item()

            avg_loss = total_loss / len(dataloader)

            if avg_loss < best_loss:
                best_loss = avg_loss
                patience_counter = 0
            else:
                patience_counter += 1

            if patience_counter >= 10:
                print(f"Early stopping at epoch {epoch+1}")
                break

            if (epoch + 1) % 20 == 0:
                print(f"Epoch [{epoch+1}/100], Loss: {avg_loss:.4f}")

        final_model.eval()

        # Return model components
        return {
            'algo_num': 3,
            'model': final_model,
            'tfidf': tfidf,
            'scaler': scaler,
            'temporal_features': temporal_features,
            'best_params': best_params,
            'cv_score': best_score
        }

    # Add other algorithms here (alg 4-5)
    elif alg == 4:  # Random Forest with Rich Feature Engineering

        # Extract rich stylometric features
        rich_features = extract_rich_features(train_clean)

        # Prepare text features (simplified TF-IDF for speed)
        X_text = train_clean['processed_tweet']

        # Adaptive TF-IDF settings
        valid_texts = [text for text in X_text if text and text.strip() and text != 'empty_tweet']
        min_df = max(1, min(3, len(valid_texts) // 20))

        try:
            tfidf = TfidfVectorizer(max_features=2000, ngram_range=(1, 2), min_df=min_df, max_df=0.9)
            X_text_tfidf = tfidf.fit_transform(X_text)
        except ValueError:
            tfidf = TfidfVectorizer(max_features=500, ngram_range=(1, 1), min_df=1, max_df=1.0)
            X_text_tfidf = tfidf.fit_transform(X_text)

        # Prepare temporal and rich features
        X_temporal = train_clean[temporal_features]
        X_rich = rich_features

        # Scale features
        scaler_temporal = StandardScaler()
        scaler_rich = StandardScaler()

        X_temp_scaled = scaler_temporal.fit_transform(X_temporal)
        X_rich_scaled = scaler_rich.fit_transform(X_rich)

        # Combine all features
        from scipy.sparse import hstack, csr_matrix
        X_combined = hstack([
            X_text_tfidf,
            csr_matrix(X_temp_scaled),
            csr_matrix(X_rich_scaled)
        ])

        y = train_clean['label']

        # Grid search for Random Forest
        param_grid = {
            'n_estimators': [100, 200],
            'max_depth': [10, 20, None],
            'min_samples_split': [2],
            'min_samples_leaf': [1, 2],
            'max_features': ['sqrt', None]
            # 'n_estimators': [100],
            # 'max_depth': [10],
            # 'min_samples_split': [5],
            # 'min_samples_leaf': [2],
            # 'max_features': ['sqrt', 'log2', None]
        }

        rf = RandomForestClassifier(random_state=42, n_jobs=-1)
        grid_search = GridSearchCV(
            rf,
            param_grid,
            cv=3,
            scoring='f1',
            n_jobs=-1,
            verbose=1
        )

        print("Training Random Forest with rich features...")
        grid_search.fit(X_combined, y)

        print(f"Best RF parameters: {grid_search.best_params_}")
        print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

        # Get feature importance
        feature_names = (
            [f'tfidf_{i}' for i in range(X_text_tfidf.shape[1])] +
            temporal_features +
            list(rich_features.columns)
        )

        feature_importance = pd.DataFrame({
            'feature': feature_names,
            'importance': grid_search.best_estimator_.feature_importances_
        }).sort_values('importance', ascending=False)

        print("\nTop 20 Most Important Features:")
        print(feature_importance.head(20))

        # Return model components
        return {
            'algo_num': 4,
            'classifier': grid_search.best_estimator_,
            'tfidf': tfidf,
            'scaler_temporal': scaler_temporal,
            'scaler_rich': scaler_rich,
            'temporal_features': temporal_features,
            'rich_feature_names': list(rich_features.columns),
            'best_params': grid_search.best_params_,
            'cv_score': grid_search.best_score_,
            'feature_importance': feature_importance,
            'extract_rich_features': extract_rich_features  # Store the function for prediction
        }

    elif alg == 5:  # DistilBERT Transformer with Grid Search

        print("Loading DistilBERT tokenizer...")
        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

        # Prepare texts with light preprocessing
        texts = train_clean['tweet'].apply(light_preprocess_for_transformer).tolist()
        labels = train_clean['label'].tolist()

        # Perform grid search
        best_params, best_score, best_model = distilbert_grid_search(texts, labels, tokenizer)

        print(f"\nGrid Search Complete!")
        print(f"Best parameters: {best_params}")
        print(f"Best validation score: {best_score:.4f}")

        # Train final model with best parameters on full dataset
        print("Training final model with best parameters...")

        # Split data for final training
        from sklearn.model_selection import train_test_split
        train_texts, val_texts, train_labels, val_labels = train_test_split(
            texts, labels, test_size=0.2, random_state=42, stratify=labels
        )

        # Create datasets
        train_dataset = TweetDataset(train_texts, train_labels, tokenizer)
        val_dataset = TweetDataset(val_texts, val_labels, tokenizer)

        # Final model with best parameters
        final_model = DistilBertForSequenceClassification.from_pretrained(
            'distilbert-base-uncased',
            num_labels=2,
            problem_type="single_label_classification"
        )

        # Final training arguments
        final_training_args = TrainingArguments(
            output_dir='./trump_distilbert_final',
            num_train_epochs=best_params['num_train_epochs'],
            per_device_train_batch_size=best_params['per_device_train_batch_size'],
            per_device_eval_batch_size=best_params['per_device_train_batch_size'],
            learning_rate=best_params['learning_rate'],
            weight_decay=best_params['weight_decay'],
            warmup_steps=best_params['warmup_steps'],
            logging_dir='./logs',
            logging_steps=10,
            eval_strategy="steps",
            eval_steps=50,
            save_strategy="steps",
            save_steps=50,
            load_best_model_at_end=True,
            metric_for_best_model="f1",
            greater_is_better=True,
            save_total_limit=2,
            remove_unused_columns=False,
            push_to_hub=False,
            report_to=None,
        )

        # Final trainer
        final_trainer = Trainer(
            model=final_model,
            args=final_training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
        )

        print("Final training...")
        final_trainer.train()

        # Final evaluation
        eval_results = final_trainer.evaluate()
        print(f"Final DistilBERT Results: {eval_results}")

        # Return model components
        return {
            'algo_num': 5,
            'model': final_model,
            'tokenizer': tokenizer,
            'trainer': final_trainer,
            'light_preprocess': light_preprocess_for_transformer,
            'best_params': best_params,
            'cv_score': best_score,
            'eval_results': eval_results
        }
    else:
        raise NotImplementedError(f"Algorithm {alg} not implemented yet")

In [None]:
print(f"\nTraining pipeline for algorithm: logistic regression")
lr_model = training_pipeline(1, 'trump_train.tsv')



Training pipeline for algorithm: logistic regression
Shape of the original dataset: (3515, 5)
Starting Grid Search for Logistic Regression...
Fitting 3 folds for each of 4 candidates, totalling 12 fits


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(mod_value, inplace=True)


Best parameters: {'C': 2.0, 'max_iter': 10000, 'penalty': 'l2', 'solver': 'lbfgs'}
Best cross-validation score: 0.6855


In [None]:
print(f"\nTraining pipeline for algorithm: SVM")
svm_model = training_pipeline(2, 'trump_train.tsv')


Training pipeline for algorithm: SVM
Shape of the original dataset: (3515, 5)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(mod_value, inplace=True)


Training SVM with grid search...
Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best SVM parameters: {'C': 1.0, 'kernel': 'linear'}
Best cross-validation score: 0.6879


In [13]:
print(f"\nTraining pipeline for algorithm: FFNN")
FFNN_model = training_pipeline(3, 'trump_train.tsv')


Training pipeline for algorithm: FFNN
Shape of the original dataset: (3515, 5)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(mod_value, inplace=True)


Starting FFNN Grid Search...
Testing 12 combinations...
Combination 1/12: {'hidden_sizes': [128], 'dropout_rate': 0.2, 'learning_rate': 0.001, 'batch_size': 32, 'weight_decay': 0.0001}
  Validation F1: 0.7731
  New best! Score: 0.7731
Combination 2/12: {'hidden_sizes': [128], 'dropout_rate': 0.2, 'learning_rate': 0.01, 'batch_size': 32, 'weight_decay': 0.0001}
  Validation F1: 0.7691
Combination 3/12: {'hidden_sizes': [128], 'dropout_rate': 0.5, 'learning_rate': 0.001, 'batch_size': 32, 'weight_decay': 0.0001}
  Validation F1: 0.7849
  New best! Score: 0.7849
Combination 4/12: {'hidden_sizes': [128], 'dropout_rate': 0.5, 'learning_rate': 0.01, 'batch_size': 32, 'weight_decay': 0.0001}
  Validation F1: 0.7827
Combination 5/12: {'hidden_sizes': [256, 128], 'dropout_rate': 0.2, 'learning_rate': 0.001, 'batch_size': 32, 'weight_decay': 0.0001}
  Validation F1: 0.7722
Combination 6/12: {'hidden_sizes': [256, 128], 'dropout_rate': 0.2, 'learning_rate': 0.01, 'batch_size': 32, 'weight_decay':

In [None]:
print(f"\nTraining pipeline for algorithm: Random Forest with Rich Features")
rf_model = training_pipeline(4, 'trump_train.tsv')


Training pipeline for algorithm: Random Forest with Rich Features
Shape of the original dataset: (3515, 5)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(mod_value, inplace=True)


Training Random Forest with rich features...
Fitting 3 folds for each of 24 candidates, totalling 72 fits
Best RF parameters: {'max_depth': None, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
Best cross-validation score: 0.8093

Top 20 Most Important Features:
                  feature  importance
2026            url_count    0.352875
2023          quote_count    0.119116
2000          hour_of_day    0.079986
2024        mention_count    0.045704
2009          text_length    0.035609
2030  starts_with_capital    0.028316
2002         day_of_month    0.016055
2001          day_of_week    0.015354
2013      uppercase_ratio    0.014983
2016    punctuation_ratio    0.010591
2003                month    0.009519
2025        hashtag_count    0.009412
2015          space_ratio    0.007871
2022           dash_count    0.007293
2011      avg_word_length    0.007171
2029     title_case_words    0.007005
2010           word_count    0.006879
2006       

In [None]:
import warnings

warnings.filterwarnings("ignore")

print(f"\nTraining pipeline for algorithm: Bert")
Bert_model = training_pipeline(5, 'trump_train.tsv')


Training pipeline for algorithm: Bert
Shape of the original dataset: (3515, 5)
Loading DistilBERT tokenizer...


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Starting DistilBERT Grid Search...
Testing 3 combinations...
Combination 1/3: {'num_train_epochs': 3, 'per_device_train_batch_size': 16, 'learning_rate': 2e-05, 'weight_decay': 0.01, 'warmup_steps': 100}


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
100,0.3994,0.345648,0.871977,0.785714,0.982143,0.654762
200,0.2847,0.280947,0.887624,0.820862,0.957672,0.718254
300,0.243,0.290146,0.887624,0.816705,0.98324,0.698413
400,0.1549,0.266533,0.897582,0.84141,0.945545,0.757937
500,0.166,0.251703,0.899004,0.846652,0.92891,0.777778


  F1 Score: 0.8498
  New best! Score: 0.8498
Combination 2/3: {'num_train_epochs': 3, 'per_device_train_batch_size': 16, 'learning_rate': 2e-05, 'weight_decay': 0.01, 'warmup_steps': 200}


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
100,0.5011,0.396815,0.85633,0.753056,0.980892,0.611111
200,0.3079,0.310123,0.871977,0.78972,0.960227,0.670635
300,0.2548,0.32394,0.886202,0.812207,0.994253,0.686508
400,0.1711,0.26533,0.900427,0.848485,0.933333,0.777778
500,0.1666,0.254367,0.904694,0.857143,0.926267,0.797619


  F1 Score: 0.8620
  New best! Score: 0.8620
Combination 3/3: {'num_train_epochs': 3, 'per_device_train_batch_size': 16, 'learning_rate': 2e-05, 'weight_decay': 0.001, 'warmup_steps': 100}


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
100,0.4115,0.350422,0.876245,0.792363,0.994012,0.65873
200,0.2856,0.29555,0.881935,0.808314,0.966851,0.694444
300,0.2436,0.307157,0.889047,0.819444,0.983333,0.702381
400,0.1588,0.278361,0.899004,0.846652,0.92891,0.777778
500,0.169,0.279831,0.897582,0.842795,0.936893,0.765873


  F1 Score: 0.8405

Grid Search Complete!
Best parameters: {'num_train_epochs': 3, 'per_device_train_batch_size': 16, 'learning_rate': 2e-05, 'weight_decay': 0.01, 'warmup_steps': 200}
Best validation score: 0.8620
Training final model with best parameters...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Final training...


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
50,0.6218,0.612004,0.647226,0.03125,1.0,0.015873
100,0.3485,0.396815,0.85633,0.753056,0.980892,0.611111
150,0.3519,0.328484,0.86771,0.781176,0.959538,0.65873
200,0.4099,0.310123,0.871977,0.78972,0.960227,0.670635
250,0.3255,0.273718,0.889047,0.828194,0.930693,0.746032
300,0.1354,0.32394,0.886202,0.812207,0.994253,0.686508
350,0.2098,0.286025,0.889047,0.828947,0.926471,0.75
400,0.2208,0.26533,0.900427,0.848485,0.933333,0.777778
450,0.1716,0.253557,0.894737,0.847107,0.883621,0.813492
500,0.1352,0.254367,0.904694,0.857143,0.926267,0.797619


Final DistilBERT Results: {'eval_loss': 0.25436684489250183, 'eval_accuracy': 0.9046941678520626, 'eval_f1': 0.8571428571428571, 'eval_precision': 0.9262672811059908, 'eval_recall': 0.7976190476190477, 'eval_runtime': 1.3365, 'eval_samples_per_second': 526.016, 'eval_steps_per_second': 32.923, 'epoch': 3.0}


In [None]:
def retrain_best_model(train_fn):
    """
    Retrain DistilBERT model with the best hyperparameters found from grid search.
    Trains on the full dataset without validation split or evaluation.

    Args:
        train_fn: Path to training file

    Returns:
        Dictionary with trained model components
    """

    # Hardcoded best parameters from grid search
    best_params = {
        'num_train_epochs': 3,
        'per_device_train_batch_size': 16,
        'learning_rate': 2e-05,
        'weight_decay': 0.01,
        'warmup_steps': 200
    }

    print("Starting DistilBERT retraining with best parameters...")
    print(f"Best parameters: {best_params}")

    # Preprocess training data
    train_clean = preprocess_train_data(train_fn)
    train_clean = extract_timestamp_features(train_clean)
    train_clean['label'] = train_clean['device'].map({'android': 0, 'iphone': 1})

    # Load tokenizer
    print("Loading DistilBERT tokenizer...")
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

    # Prepare texts with light preprocessing
    texts = train_clean['tweet'].apply(light_preprocess_for_transformer).tolist()
    labels = train_clean['label'].tolist()

    # Create dataset for full training data
    print("Creating training dataset...")
    train_dataset = TweetDataset(texts, labels, tokenizer)

    # Initialize model with best parameters
    print("Initializing DistilBERT model...")
    model = DistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-uncased',
        num_labels=2,
        problem_type="single_label_classification"
    )

    # Training arguments with best parameters
    training_args = TrainingArguments(
        output_dir='./trump_distilbert_retrained',
        num_train_epochs=best_params['num_train_epochs'],
        per_device_train_batch_size=best_params['per_device_train_batch_size'],
        per_device_eval_batch_size=best_params['per_device_train_batch_size'],
        learning_rate=best_params['learning_rate'],
        weight_decay=best_params['weight_decay'],
        warmup_steps=best_params['warmup_steps'],
        logging_dir='./logs_retrained',
        logging_steps=10,
        save_strategy="no",  # Disable saving to avoid version conflicts
        remove_unused_columns=False,
        push_to_hub=False,
        report_to=None,
    )

    # Initialize trainer (no validation dataset)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        # No eval_dataset, compute_metrics, or callbacks
    )

    # Train the model
    print("Training DistilBERT with best parameters...")
    trainer.train()

    print("Training completed!")

    # Return model components
    return {
        'algo_num': 5,
        'model': model,
        'tokenizer': tokenizer,
        'trainer': trainer,
        'light_preprocess': light_preprocess_for_transformer,
        'best_params': best_params
    }

In [None]:
best_model = retrain_best_model('trump_train.tsv')

Starting DistilBERT retraining with best parameters...
Best parameters: {'num_train_epochs': 3, 'per_device_train_batch_size': 16, 'learning_rate': 2e-05, 'weight_decay': 0.01, 'warmup_steps': 200}
Shape of the original dataset: (3515, 5)
Loading DistilBERT tokenizer...
Creating training dataset...
Initializing DistilBERT model...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Training DistilBERT with best parameters...


Step,Training Loss
10,0.6869
20,0.6852
30,0.6625
40,0.6639
50,0.6312
60,0.6499
70,0.5334
80,0.5012
90,0.4804
100,0.4009


Training completed!


## Prediction Pipeline

In [None]:
from scipy.sparse import hstack

def predict(m, fn):
    """ Returns a list of 0s and 1s, corresponding to the lines in the specified file.

        Args:
        m: the trained model to be used.
        fn: the full path to a file in the same format as the test set we have proveded.
    """
    # Load test data
    cols = ['user_handle','tweet','timestamp']
    test_df = pd.read_csv(fn, sep="\t", header=None, names=cols, quoting=3,index_col=False)

    # Extract timestamp features
    test_clean = extract_timestamp_features(test_df)

    # Preprocess text
    test_clean['processed_tweet'] = test_clean['tweet'].apply(preprocess_text)

    # Prepare features
    X_text_test = test_clean['processed_tweet']
        # do nothing if temporal features are not present

    # print(test_df.head())

    if m['algo_num'] == 1:
        X_temporal_test = test_clean[m['temporal_features']]

        # Transform features using fitted transformers
        X_text_tfidf_test = m['tfidf'].transform(X_text_test)
        X_temp_scaled_test = m['scaler'].transform(X_temporal_test)

        # Combine features
        from scipy.sparse import hstack
        X_combined_test = hstack([X_text_tfidf_test, X_temp_scaled_test])

        # Make predictions
        predictions = m['classifier'].predict(X_combined_test)

        return predictions.tolist()
    elif m['algo_num'] == 2:
        X_temporal_test = test_clean[m['temporal_features']]

        # Transform features using fitted transformers
        X_text_tfidf_test = m['tfidf'].transform(X_text_test)
        X_temp_scaled_test = m['scaler'].transform(X_temporal_test)

        # Combine features
        from scipy.sparse import hstack
        X_combined_test = hstack([X_text_tfidf_test, X_temp_scaled_test])

        # Make predictions
        predictions = m['classifier'].predict(X_combined_test)

        return predictions.tolist()
    elif m['algo_num'] == 3:  # FFNN
        import torch
        X_temporal_test = test_clean[m['temporal_features']]


        # Transform features using fitted transformers
        X_text_tfidf_test = m['tfidf'].transform(X_text_test)
        X_temp_scaled_test = m['scaler'].transform(X_temporal_test)

        # Combine features and convert to dense
        X_combined_test = hstack([X_text_tfidf_test, X_temp_scaled_test]).toarray()

        # Convert to PyTorch tensor
        X_tensor_test = torch.FloatTensor(X_combined_test)

        # Set model to evaluation mode
        m['model'].eval()

        # Make predictions
        with torch.no_grad():
            outputs = m['model'](X_tensor_test)
            _, predictions = torch.max(outputs, 1)

        return predictions.numpy().tolist()
    elif m['algo_num'] == 4:  # Random Forest
        # Transform features using fitted transformers
        X_temporal_test = test_clean[m['temporal_features']]

        X_text_tfidf_test = m['tfidf'].transform(X_text_test)
        X_temp_scaled_test = m['scaler_temporal'].transform(X_temporal_test)

        # Extract rich features for test data
        rich_features_test = m['extract_rich_features'](test_clean)
        X_rich_scaled_test = m['scaler_rich'].transform(rich_features_test)

        # Combine all features
        from scipy.sparse import hstack, csr_matrix
        X_combined_test = hstack([
            X_text_tfidf_test,
            csr_matrix(X_temp_scaled_test),
            csr_matrix(X_rich_scaled_test)
        ])

        # Make predictions
        predictions = m['classifier'].predict(X_combined_test)
        return predictions.tolist()
    elif m['algo_num'] == 5:  # DistilBERT
        import torch

        # Select device
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Load model and tokenizer
        model = m['model'].to(device)
        tokenizer = m['tokenizer']

        # Preprocess raw tweets for transformer
        test_texts = test_df['tweet'].apply(m['light_preprocess']).tolist()

        model.eval()
        predictions = []
        batch_size = 16  # adjust as needed

        with torch.no_grad():
            for i in range(0, len(test_texts), batch_size):
                batch_texts = test_texts[i:i + batch_size]

                # Tokenize and move inputs to device
                encodings = tokenizer(
                    batch_texts,
                    truncation=True,
                    padding=True,
                    max_length=128,
                    return_tensors='pt'
                )
                encodings = {k: v.to(device) for k, v in encodings.items()}

                # Forward pass
                outputs = model(**encodings)
                preds = torch.argmax(outputs.logits, dim=-1)

                # Move predictions back to CPU and extend list
                predictions.extend(preds.cpu().tolist())

        return predictions
    else:
        raise NotImplementedError(f"Algorithm {m['algo_num']} not implemented in predict function")



In [None]:
predictions = predict(Bert_model, 'trump_tweets_test_a.tsv')

In [None]:
print("Predictions:", predictions)

Predictions: [1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0]


In [None]:
# Save predictions to a file, one label per line
output_path = "predictions.txt"
with open(output_path, "w") as f:
    for label in predictions:
        f.write(f"{label}\n")
print(f"Saved {len(predictions)} predictions to {output_path}")

Saved 200 predictions to predictions.txt


In [None]:
def who_am_i():  # this is not a class method
    """Returns a list of dictionaries, each dictionary with your name, id number and email. keys=['name', 'id','email']
        (If you are submitting solo, the list should contain only one dictionary. If you submit as a team, the list should
        contain a dictionary for each team member.)
        Make sure you return your own info!
    """
    return [{'name': 'Mickael Zeitoun', 'id': '328783105', 'email': 'mickaelz@post.bgu.ac.il'},{'name': 'Dor Meir', 'id': '313254724', 'email': 'dorgalon@post.bgu.ac.il'}]