In [2]:
pip install emoji

Collecting emoji
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.14.1-py3-none-any.whl (590 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/590.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m590.6/590.6 kB[0m [31m33.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.14.1


In [3]:
#Cleaned + Ensemble (Distilbert + Bertweet)

import pandas as pd
import numpy as np
import torch
import re
import random
import emoji
from textblob import TextBlob
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

# Set seeds for full reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Ensure deterministic behavior
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Load dataset
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Preprocessing function
def preprocess_text(text):
    if pd.isna(text):
        return ""
    # Replace emojis with textual descriptions
    text = emoji.replace_emoji(text, replace=lambda chars, data_dict: ' '.join(data_dict['en'].split('_')).strip(':'))
    # Remove URLs
    text = re.sub(r"http\S+|www\S+", "", text)
    # Remove mentions
    text = re.sub(r"@\w+", "", text)
    # Remove hashtags but keep the word (e.g., #earthquake -> earthquake)
    text = re.sub(r"#", "", text)
    # Remove special characters and extra spaces
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    return text.lower().strip()

# Apply preprocessing to the dataset
for df in [train_df, test_df]:
    df['text'] = df['text'].fillna('').apply(preprocess_text)
    df['keyword'] = df['keyword'].fillna('').str.replace('%20', ' ').apply(preprocess_text)
    df['combined'] = df['keyword'] + " " + df['text']

# Feature Engineering: Add sentiment polarity and word count
def extract_sentiment(text):
    return TextBlob(text).sentiment.polarity

for df in [train_df, test_df]:
    df['sentiment'] = df['text'].apply(extract_sentiment)
    df['word_count'] = df['text'].apply(lambda x: len(x.split()))
    df['exclamation_count'] = df['text'].str.count('!')

# Save cleaned data
train_df.to_csv("cleaned_train.csv", index=False)
test_df.to_csv("cleaned_test.csv", index=False)

print("Data cleaning complete!")

# Tokenizers
tokenizer_distilbert = AutoTokenizer.from_pretrained("distilbert-base-uncased")
tokenizer_bertweet = AutoTokenizer.from_pretrained("vinai/bertweet-base")

# Tokenization function
def encode_text(texts, tokenizer, max_length=256):
    return tokenizer(texts.tolist(), padding=True, truncation=True, max_length=max_length, return_tensors='pt')

# Encode text data
X_distilbert = encode_text(train_df['combined'], tokenizer_distilbert)
X_bertweet = encode_text(train_df['combined'], tokenizer_bertweet)
y = torch.tensor(train_df['target'].values)

# Split dataset
X_train_d, X_val_d, y_train, y_val = train_test_split(X_distilbert['input_ids'], y, test_size=0.1, random_state=SEED)
X_train_b, X_val_b, _, _ = train_test_split(X_bertweet['input_ids'], y, test_size=0.1, random_state=SEED)

X_train_mask_d, X_val_mask_d = train_test_split(X_distilbert['attention_mask'], test_size=0.1, random_state=SEED)
X_train_mask_b, X_val_mask_b = train_test_split(X_bertweet['attention_mask'], test_size=0.1, random_state=SEED)

# Use a seeded generator for DataLoader reproducibility
generator = torch.Generator().manual_seed(SEED)

# Create DataLoaders
batch_size = 32
train_loader_d = DataLoader(TensorDataset(X_train_d, X_train_mask_d, y_train), batch_size=batch_size, shuffle=True, generator=generator)
val_loader_d = DataLoader(TensorDataset(X_val_d, X_val_mask_d, y_val), batch_size=batch_size, generator=generator)

train_loader_b = DataLoader(TensorDataset(X_train_b, X_train_mask_b, y_train), batch_size=batch_size, shuffle=True, generator=generator)
val_loader_b = DataLoader(TensorDataset(X_val_b, X_val_mask_b, y_val), batch_size=batch_size, generator=generator)

# Initialize models
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_distilbert = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2).to(device)
model_bertweet = AutoModelForSequenceClassification.from_pretrained("vinai/bertweet-base", num_labels=2).to(device)

optimizer_d = AdamW(model_distilbert.parameters(), lr=1e-5, eps=1e-8)
optimizer_b = AdamW(model_bertweet.parameters(), lr=2e-5, eps=1e-8)

# Training function
def train_model(model, train_loader, val_loader, optimizer, model_name):
    best_acc = 0
    patience = 3
    no_improve_epochs = 0
    epochs = 5

    for epoch in range(epochs):
        model.train()
        total_loss = 0

        for batch in tqdm(train_loader, desc=f"Training {model_name} (Epoch {epoch + 1})"):
            optimizer.zero_grad()
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f"Epoch {epoch + 1} Loss ({model_name}): {total_loss / len(train_loader):.4f}")

        # Validation
        model.eval()
        val_preds, val_true = [], []

        with torch.no_grad():
            for batch in val_loader:
                input_ids, attention_mask, labels = [b.to(device) for b in batch]
                outputs = model(input_ids, attention_mask=attention_mask)
                preds = torch.argmax(outputs.logits, dim=1)
                val_preds.extend(preds.cpu().numpy())
                val_true.extend(labels.cpu().numpy())

        acc = accuracy_score(val_true, val_preds)
        print(f"{model_name} - Validation Accuracy: {acc:.4f}")

        # Early stopping
        if acc > best_acc:
            best_acc = acc
            no_improve_epochs = 0
            torch.save(model.state_dict(), f"{model_name}_best.pth")
        else:
            no_improve_epochs += 1
            if no_improve_epochs >= patience:
                print(f"Early stopping for {model_name}!")
                break

# Train both models
train_model(model_distilbert, train_loader_d, val_loader_d, optimizer_d, "distilbert")
train_model(model_bertweet, train_loader_b, val_loader_b, optimizer_b, "bertweet")

# Load best models
model_distilbert.load_state_dict(torch.load("distilbert_best.pth"))
model_bertweet.load_state_dict(torch.load("bertweet_best.pth"))

# Prepare test data
X_test_d = encode_text(test_df['combined'], tokenizer_distilbert)
X_test_b = encode_text(test_df['combined'], tokenizer_bertweet)

test_loader_d = DataLoader(TensorDataset(X_test_d['input_ids'], X_test_d['attention_mask']), batch_size=batch_size, generator=generator)
test_loader_b = DataLoader(TensorDataset(X_test_b['input_ids'], X_test_b['attention_mask']), batch_size=batch_size, generator=generator)

# Function to get predictions
def get_predictions(model, test_loader):
    model.eval()
    preds = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids, attention_mask = [b.to(device) for b in batch]
            outputs = model(input_ids, attention_mask=attention_mask)
            preds.extend(torch.softmax(outputs.logits, dim=1)[:, 1].cpu().numpy())
    return np.array(preds)

# Get predictions
preds_d = get_predictions(model_distilbert, test_loader_d)
preds_b = get_predictions(model_bertweet, test_loader_b)

# Ensemble and save
final_preds = ((preds_d + preds_b) / 2 >= 0.5).astype(int)
submission_df = pd.read_csv("sample_submission.csv")
submission_df["target"] = final_preds
submission_df.to_csv("ensemble_submission.csv", index=False)
print("Predictions saved to ensemble_submission.csv")


Data cleaning complete!


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/558 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/843k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.91M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


pytorch_model.bin:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training distilbert (Epoch 1):   6%|▌         | 13/215 [00:01<00:16, 12.50it/s]

model.safetensors:   0%|          | 0.00/543M [00:00<?, ?B/s]

Training distilbert (Epoch 1): 100%|██████████| 215/215 [00:14<00:00, 14.62it/s]


Epoch 1 Loss (distilbert): 0.4660
distilbert - Validation Accuracy: 0.8307


Training distilbert (Epoch 2): 100%|██████████| 215/215 [00:13<00:00, 15.47it/s]


Epoch 2 Loss (distilbert): 0.3603
distilbert - Validation Accuracy: 0.8399


Training distilbert (Epoch 3): 100%|██████████| 215/215 [00:13<00:00, 15.44it/s]


Epoch 3 Loss (distilbert): 0.3087
distilbert - Validation Accuracy: 0.8189


Training distilbert (Epoch 4): 100%|██████████| 215/215 [00:13<00:00, 15.37it/s]


Epoch 4 Loss (distilbert): 0.2629
distilbert - Validation Accuracy: 0.8163


Training distilbert (Epoch 5): 100%|██████████| 215/215 [00:14<00:00, 15.33it/s]


Epoch 5 Loss (distilbert): 0.2113
distilbert - Validation Accuracy: 0.7913
Early stopping for distilbert!


Training bertweet (Epoch 1): 100%|██████████| 215/215 [00:25<00:00,  8.45it/s]


Epoch 1 Loss (bertweet): 0.4488
bertweet - Validation Accuracy: 0.8307


Training bertweet (Epoch 2): 100%|██████████| 215/215 [00:25<00:00,  8.38it/s]


Epoch 2 Loss (bertweet): 0.3473
bertweet - Validation Accuracy: 0.8281


Training bertweet (Epoch 3): 100%|██████████| 215/215 [00:25<00:00,  8.35it/s]


Epoch 3 Loss (bertweet): 0.2929
bertweet - Validation Accuracy: 0.8491


Training bertweet (Epoch 4): 100%|██████████| 215/215 [00:25<00:00,  8.36it/s]


Epoch 4 Loss (bertweet): 0.2350
bertweet - Validation Accuracy: 0.8346


Training bertweet (Epoch 5): 100%|██████████| 215/215 [00:25<00:00,  8.35it/s]


Epoch 5 Loss (bertweet): 0.1967
bertweet - Validation Accuracy: 0.8228


  model_distilbert.load_state_dict(torch.load("distilbert_best.pth"))
  model_bertweet.load_state_dict(torch.load("bertweet_best.pth"))


Predictions saved to ensemble_submission.csv
