<a href="https://colab.research.google.com/github/momo4201/AI-ML-projects/blob/main/NLP_with_disaster_tweets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import files

# Bind the return value instead of leaving it as the cell's last expression:
# files.upload() returns {filename: file bytes}, so echoing it would write the
# contents of kaggle.json (username + API key) into the saved notebook output.
uploaded = files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"<REDACTED>","key":"<REDACTED>"}'}

In [None]:
# Copy the uploaded kaggle.json into ~/.kaggle/ (created if missing).
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Mode 600: readable/writable by the owner only.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
!pip install kaggle



In [None]:
# Download the competition archive (nlp-getting-started.zip) into the working directory.
!kaggle competitions download -c nlp-getting-started

Downloading nlp-getting-started.zip to /content
  0% 0.00/593k [00:00<?, ?B/s]
100% 593k/593k [00:00<00:00, 1.02GB/s]


In [None]:
!unzip nlp-getting-started.zip -d data/

Archive:  nlp-getting-started.zip
  inflating: data/sample_submission.csv  
  inflating: data/test.csv           
  inflating: data/train.csv          


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the competition splits extracted into data/.
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

# Single backslash: "\n" emits a blank line before the heading. The doubled
# backslash printed the two characters "\n" literally.
print("\nTarget distribution:")
print(train['target'].value_counts())

\nTarget distribution:
target
0    4342
1    3271
Name: count, dtype: int64


In [None]:
# Preview the first rows of the training frame (id, keyword, location, text, target).
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [None]:
## Explore the dataset: tweet length in characters.
# Series.str.len() is the vectorised form of apply(lambda x: len(x)); it also
# yields NaN for a missing text value instead of raising TypeError.
train['length'] = train['text'].str.len()
test['length'] = test['text'].str.len()

print('train length statistics:')
print(train['length'].describe())
print()

print('test length statistics:')
print(test['length'].describe())


train length statistics:
count    7613.000000
mean      101.037436
std        33.781325
min         7.000000
25%        78.000000
50%       107.000000
75%       133.000000
max       157.000000
Name: length, dtype: float64

test length statistics:
count    3263.000000
mean      102.108183
std        33.972158
min         5.000000
25%        78.000000
50%       109.000000
75%       134.000000
max       151.000000
Name: length, dtype: float64


In [None]:
import re

def clean_text(text):
    """Normalise a tweet for tokenisation.

    Steps, in order: lower-case, expand "what's", drop URLs, drop @mentions,
    drop '#' symbols (keeping the hashtag word), drop remaining punctuation,
    collapse runs of whitespace and strip the ends.

    Args:
        text: the raw tweet as a ``str``.

    Returns:
        The cleaned string (may be empty).
    """
    text = text.lower()
    text = re.sub(r"what's", "what is ", text)

    # In a raw string a regex class is written with ONE backslash (\S, \w, \s).
    # With two, the pattern matches a literal backslash followed by a letter,
    # and r'[^\\w\\s]' would delete every character except '\', 'w' and 's'.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # URLs
    text = re.sub(r'@\w+', '', text)                   # @mentions
    text = re.sub(r'#', '', text)                      # hashtag symbol only

    text = re.sub(r'[^\w\s]', '', text)                # punctuation
    text = re.sub(r'\s+', ' ', text).strip()           # whitespace runs

    return text

# Add a cleaned_text column to both splits using the same cleaning function.
for frame in (train, test):
    frame['cleaned_text'] = frame['text'].apply(clean_text)

## Converting text to tensors using transformers

In [None]:
from transformers import DistilBertTokenizer
import torch
from torch.utils.data import DataLoader, TensorDataset


In [None]:
## Load the pretrained tokenizer for the 'distilbert-base-uncased' checkpoint
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

def tokenize_text(texts, tokenizer, max_length=128):
    """Encode a collection of strings with ``tokenizer``.

    The tokenizer is asked to truncate and pad every sequence to
    ``max_length`` and to return PyTorch tensors.

    Args:
        texts: iterable of strings; materialised with ``list()`` before encoding.
        tokenizer: callable tokenizer (called with the keyword arguments below).
        max_length: target sequence length passed to the tokenizer.

    Returns:
        Tuple ``(input_ids, attention_mask)`` taken from the tokenizer output.
    """
    batch = tokenizer(
        list(texts),
        return_tensors='pt',
        truncation=True,
        padding='max_length',
        max_length=max_length,
    )
    input_ids = batch['input_ids']
    attention_mask = batch['attention_mask']
    return input_ids, attention_mask

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

## Preparing data for training

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled tweets for validation; random_state fixes the split.
train_texts, val_texts, train_targets, val_targets = train_test_split(
    train['cleaned_text'], train['target'], test_size=0.2, random_state=42,
    stratify = train['target']      # keep the 0/1 class ratio the same in the training and validation splits
)
## Process data and create dataloaders
# Tokenize each split with tokenize_text's default max_length.
train_input_ids, train_attention_masks = tokenize_text(train_texts, tokenizer)
val_input_ids, val_attention_masks = tokenize_text(val_texts, tokenizer)

# Replace the label Series with integer (torch.long) tensors.
train_targets = torch.tensor(train_targets.values, dtype= torch.long)
val_targets = torch.tensor(val_targets.values, dtype=torch.long)

# Each dataset item is (input_ids, attention_mask, target), in that order.
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_targets)
val_dataset = TensorDataset(val_input_ids, val_attention_masks, val_targets)

# Only the training loader shuffles; validation keeps dataset order.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

## Fine-tuning a pretrained transformer model

In [None]:
from transformers import DistilBertForSequenceClassification

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pretrained DistilBERT with a 2-label sequence-classification head.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

model = model.to(device)

## Learning-rate scheduling
from transformers import get_linear_schedule_with_warmup

# Define the optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# Number of passes over train_loader. It has to be assigned before total_steps
# is computed; without it this cell raises NameError on a fresh kernel.
epochs = 10

# Scheduler linked to that optimizer: one scheduler step per batch, so the
# schedule spans (batches per epoch) * epochs steps, with no warm-up.
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import collections

# Count how many examples of each class ended up in each split.
train_label_counts = collections.Counter(train_targets.tolist())
val_label_counts = collections.Counter(val_targets.tolist())

print("Train label distribution:", train_label_counts)
print("Validation label distribution:", val_label_counts)

Train label distribution: Counter({0: 3473, 1: 2617})
Validation label distribution: Counter({0: 869, 1: 654})


In [None]:
from transformers import DistilBertForSequenceClassification, get_linear_schedule_with_warmup
from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score, f1_score
import torch

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ---- Load model ----
# A fresh pretrained DistilBERT with a 2-label classification head; this
# replaces any `model` left over from an earlier cell.
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased', num_labels=2
)
model = model.to(device)

# ---- Optimizer & Scheduler ----
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5, weight_decay=0.01)

# scheduler.step() is called once per batch below, so the schedule length is
# (batches per epoch) * epochs; no warm-up steps are used.
epochs = 10
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

# ---- Training + Validation Loop ----
for epoch in range(epochs):
    # -------- TRAINING --------
    model.train()
    # Running totals for this epoch: summed batch loss, correct predictions, examples seen.
    train_loss, correct, total = 0, 0, 0
    all_train_preds, all_train_labels = [], []

    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        # Batches are (input_ids, attention_mask, targets); move each tensor to the device.
        input_ids, attention_mask, targets = [b.to(device) for b in batch]

        optimizer.zero_grad()
        # Passing labels makes the model output carry .loss alongside .logits.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=targets)

        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()

        train_loss += loss.item()

        # Accuracy + F1: predicted class is the arg-max over the logits of each example.
        preds = torch.argmax(outputs.logits, dim=1)
        all_train_preds.extend(preds.cpu().numpy())
        all_train_labels.extend(targets.cpu().numpy())

        correct += (preds == targets).sum().item()
        total += targets.size(0)

        # Running accuracy over the batches seen so far in this epoch.
        acc = correct / total
        loop.set_description(f"Epoch {epoch+1}/{epochs}")
        loop.set_postfix(loss=loss.item(), acc=acc)

    # Epoch-level training metrics, computed from the same forward passes used for the updates.
    train_acc = correct / total
    avg_train_loss = train_loss / len(train_loader)
    train_f1 = f1_score(all_train_labels, all_train_preds, average="weighted")

    # -------- VALIDATION --------
    model.eval()
    val_loss, val_correct, val_total = 0, 0, 0
    all_val_preds, all_val_labels = [], []

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, targets = [b.to(device) for b in batch]

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=targets)
            val_loss += outputs.loss.item()

            preds = torch.argmax(outputs.logits, dim=1)
            all_val_preds.extend(preds.cpu().numpy())
            all_val_labels.extend(targets.cpu().numpy())

            val_correct += (preds == targets).sum().item()
            val_total += targets.size(0)

    # Epoch-level validation metrics; losses are averaged per batch.
    val_acc = val_correct / val_total
    avg_val_loss = val_loss / len(val_loader)
    val_f1 = f1_score(all_val_labels, all_val_preds, average="weighted")

    print(f"Epoch {epoch+1} | "
          f"Train Loss: {avg_train_loss:.4f}, Train Acc: {train_acc:.4f}, Train F1: {train_f1:.4f} | "
          f"Val Loss: {avg_val_loss:.4f}, Val Acc: {val_acc:.4f}, Val F1: {val_f1:.4f}")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/381 [00:00<?, ?it/s]

Epoch 1 | Train Loss: 0.6806, Train Acc: 0.5612, Train F1: 0.4959 | Val Loss: 0.6716, Val Acc: 0.5706, Val F1: 0.5344


  0%|          | 0/381 [00:00<?, ?it/s]

Epoch 2 | Train Loss: 0.6735, Train Acc: 0.5757, Train F1: 0.5284 | Val Loss: 0.6728, Val Acc: 0.5620, Val F1: 0.5463


  0%|          | 0/381 [00:00<?, ?it/s]

Epoch 3 | Train Loss: 0.6710, Train Acc: 0.5631, Train F1: 0.5302 | Val Loss: 0.6717, Val Acc: 0.5798, Val F1: 0.5547


  0%|          | 0/381 [00:00<?, ?it/s]

Epoch 4 | Train Loss: 0.6699, Train Acc: 0.5803, Train F1: 0.5452 | Val Loss: 0.6692, Val Acc: 0.5798, Val F1: 0.5202


  0%|          | 0/381 [00:00<?, ?it/s]

Epoch 5 | Train Loss: 0.6687, Train Acc: 0.5869, Train F1: 0.5470 | Val Loss: 0.6713, Val Acc: 0.5831, Val F1: 0.5504


  0%|          | 0/381 [00:00<?, ?it/s]

Epoch 6 | Train Loss: 0.6677, Train Acc: 0.5854, Train F1: 0.5432 | Val Loss: 0.6697, Val Acc: 0.5778, Val F1: 0.5624


  0%|          | 0/381 [00:00<?, ?it/s]

Epoch 7 | Train Loss: 0.6667, Train Acc: 0.5926, Train F1: 0.5656 | Val Loss: 0.6703, Val Acc: 0.5739, Val F1: 0.5588


  0%|          | 0/381 [00:00<?, ?it/s]

Epoch 8 | Train Loss: 0.6636, Train Acc: 0.5934, Train F1: 0.5696 | Val Loss: 0.6686, Val Acc: 0.5831, Val F1: 0.5731


  0%|          | 0/381 [00:00<?, ?it/s]

Epoch 9 | Train Loss: 0.6615, Train Acc: 0.6057, Train F1: 0.5839 | Val Loss: 0.6686, Val Acc: 0.5923, Val F1: 0.5766


  0%|          | 0/381 [00:00<?, ?it/s]

Epoch 10 | Train Loss: 0.6602, Train Acc: 0.6039, Train F1: 0.5856 | Val Loss: 0.6687, Val Acc: 0.5837, Val F1: 0.5718


In [None]:
from tqdm.notebook import tqdm
import torch

epochs = 10

# A linear schedule reaches 0 at its last step and stays there. Reusing a
# scheduler that has already been stepped through a full run would therefore
# train this loop with a learning rate of 0, so build a new schedule that
# covers exactly the steps taken below (one scheduler step per batch).
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_loader) * epochs,
)

for epoch in range(epochs):
    model.train()
    # Running totals for this epoch.
    train_loss = 0
    correct = 0
    total = 0

    loop = tqdm(train_loader, leave=True)  # wrap train_loader in tqdm
    for batch in loop:
        # Batches are (input_ids, attention_mask, targets); move each tensor to the device.
        input_ids, attention_mask, targets = [b.to(device) for b in batch]

        # Gradients are cleared once per step, right before the forward pass.
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=targets)

        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()

        train_loss += loss.item()

        # ----- Compute accuracy -----
        # HuggingFace returns logits at outputs.logits
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)  # predicted class
        correct += (preds == targets).sum().item()
        total += targets.size(0)

        acc = correct / total  # running accuracy

        # update tqdm bar description
        loop.set_description(f"Epoch {epoch+1}/{epochs}")
        loop.set_postfix(loss=loss.item(), acc=acc)

    # After the last batch the running accuracy equals the epoch accuracy.
    print(f"Epoch {epoch+1} | Avg Loss: {train_loss/len(train_loader):.4f} | Accuracy: {acc:.4f}")

  0%|          | 0/381 [00:00<?, ?it/s]

Epoch 1 | Avg Loss: 0.6668 | Accuracy: 0.5915


  0%|          | 0/381 [00:00<?, ?it/s]

Epoch 2 | Avg Loss: 0.6666 | Accuracy: 0.5908


  0%|          | 0/381 [00:00<?, ?it/s]

Epoch 3 | Avg Loss: 0.6657 | Accuracy: 0.5878


  0%|          | 0/381 [00:00<?, ?it/s]

Epoch 4 | Avg Loss: 0.6657 | Accuracy: 0.5974


  0%|          | 0/381 [00:00<?, ?it/s]

Epoch 5 | Avg Loss: 0.6642 | Accuracy: 0.5916


  0%|          | 0/381 [00:00<?, ?it/s]

Epoch 6 | Avg Loss: 0.6626 | Accuracy: 0.6008


  0%|          | 0/381 [00:00<?, ?it/s]

Epoch 7 | Avg Loss: 0.6609 | Accuracy: 0.6002


  0%|          | 0/381 [00:00<?, ?it/s]

Epoch 8 | Avg Loss: 0.6600 | Accuracy: 0.6051


  0%|          | 0/381 [00:00<?, ?it/s]

Epoch 9 | Avg Loss: 0.6600 | Accuracy: 0.6051


  0%|          | 0/381 [00:00<?, ?it/s]

Epoch 10 | Avg Loss: 0.6581 | Accuracy: 0.6031


In [None]:
# Three more passes over train_loader using the existing model and optimizer.
# NOTE(review): this loop never calls scheduler.step(), so the optimizer keeps
# whatever learning rate it was last left with — confirm that is non-zero here.
epochs = 3

for epoch in range(epochs):
    # Train
    model.train()
    # Summed batch loss for this epoch; it is not reported inside this loop.
    train_loss = 0

    for batch in train_loader:
        # Batches are (input_ids, attention_mask, targets); move each tensor to the device.
        input_ids, attention_mask, targets = [b.to(device) for b in batch]

        optimizer.zero_grad()
        # Passing labels makes the model output carry .loss.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=targets)

        loss = outputs.loss

        loss.backward()
        optimizer.step()

        train_loss += loss.item()

# Evaluate
model.eval()
val_loss = 0
all_preds = []
all_targets = []

with torch.no_grad():
  for batch in val_loader:
            input_ids, attention_mask, targets = [b.to(device) for b in batch]
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=targets)

            _, preds = torch.max(outputs.logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_targets.extend(targets.cpu().numpy())
