In [1]:
import pandas as pd
import torch
import transformers
print(transformers.__file__)

from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
from imblearn.over_sampling import SMOTE
from torch.utils.data import Dataset, DataLoader

  from .autonotebook import tqdm as notebook_tqdm


/opt/anaconda3/envs/nlp/lib/python3.11/site-packages/transformers/__init__.py


In [2]:
print(f"PyTorch Version: {torch.__version__}")
print(f"Transformers Version: {transformers.__version__}")

PyTorch Version: 2.7.0
Transformers Version: 4.31.0


## Load data

In [3]:
df = pd.read_csv('../data/processed/disaster_df.csv')
print(df.shape)
df.head()

(13999, 2)


Unnamed: 0,label,text
0,non_real_time_crisis,#earthquake Magnitude 2.1 occurred 159km NE of...
1,non_real_time_crisis,Retweeted Earthquakes Tsunamis (@NewEarthquake...
2,non_real_time_crisis,I always know I need to go shopping when I've ...
3,non_real_time_crisis,Update: M2.0 #earthquake (#sismo) strikes 1 km...
4,non_real_time_crisis,ã€


In [4]:
df['label'] = df['label'].map({'non_real_time_crisis': 0, 'real_time_crisis': 1})

In [5]:
df.head()

Unnamed: 0,label,text
0,0,#earthquake Magnitude 2.1 occurred 159km NE of...
1,0,Retweeted Earthquakes Tsunamis (@NewEarthquake...
2,0,I always know I need to go shopping when I've ...
3,0,Update: M2.0 #earthquake (#sismo) strikes 1 km...
4,0,ã€


In [6]:
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(), df['label'].tolist(), test_size=0.2, random_state=42
)
print(f"Train size: {len(train_texts)}, Validation size: {len(val_texts)}")

Train size: 11199, Validation size: 2800


In [7]:
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        # Tokenize the text (padding and truncating it to max_length)
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )

        # The tokenizer returns a dictionary of tensors, so we need to extract them
        input_ids = encoding['input_ids'].flatten()
        attention_mask = encoding['attention_mask'].flatten()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'label': torch.tensor(label, dtype=torch.long)
        }


In [8]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# 4. Create Dataset Instances
train_dataset = CustomDataset(train_texts, train_labels, tokenizer)
val_dataset = CustomDataset(val_texts, val_labels, tokenizer)



In [9]:
# 5. Create DataLoader for batching
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=8)

In [10]:
# 6. Load Pre-trained BERT Model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# 7. Define Training Arguments
training_args = TrainingArguments(
    output_dir='../results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    evaluation_strategy="epoch"
)

In [12]:
# 8. Initialize Trainer with Custom Dataset
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=None,
    compute_metrics=lambda p: classification_report(p.predictions.argmax(axis=1), p.label_ids, output_dict=True)
)


In [13]:
# 9. Train the Model
trainer.train()

 12%|█▏        | 500/4200 [1:45:47<3:15:25,  3.17s/it]   

{'loss': 0.4081, 'learning_rate': 5e-05, 'epoch': 0.36}


 24%|██▍       | 1000/4200 [2:12:55<3:03:39,  3.44s/it]

{'loss': 0.3503, 'learning_rate': 4.324324324324325e-05, 'epoch': 0.71}


                                                       


{'eval_loss': 0.2962515652179718, 'eval_0': {'precision': 0.9638495359062041, 'recall': 0.9005020538566865, 'f1-score': 0.9310995752713545, 'support': 2191.0}, 'eval_1': {'precision': 0.7104913678618858, 'recall': 0.8784893267651889, 'f1-score': 0.7856093979441997, 'support': 609.0}, 'eval_accuracy': 0.8957142857142857, 'eval_macro avg': {'precision': 0.837170451884045, 'recall': 0.8894956903109377, 'f1-score': 0.858354486607777, 'support': 2800.0}, 'eval_weighted avg': {'precision': 0.908744134356565, 'recall': 0.8957142857142857, 'f1-score': 0.8994554617026984, 'support': 2800.0}, 'eval_runtime': 406.3325, 'eval_samples_per_second': 6.891, 'eval_steps_per_second': 0.861, 'epoch': 1.0}


 36%|███▌      | 1500/4200 [2:50:07<2:17:24,  3.05s/it]  

{'loss': 0.2995, 'learning_rate': 3.648648648648649e-05, 'epoch': 1.07}


 48%|████▊     | 2000/4200 [4:07:57<45:59:16, 75.25s/it]  

{'loss': 0.2304, 'learning_rate': 2.9729729729729733e-05, 'epoch': 1.43}


 60%|█████▉    | 2500/4200 [4:45:26<1:33:05,  3.29s/it] 

{'loss': 0.224, 'learning_rate': 2.2972972972972976e-05, 'epoch': 1.79}


                                                       


{'eval_loss': 0.3611055016517639, 'eval_0': {'precision': 0.9579872984855886, 'recall': 0.9241281809613572, 'f1-score': 0.9407531782201967, 'support': 2122.0}, 'eval_1': {'precision': 0.7861885790172642, 'recall': 0.8731563421828908, 'f1-score': 0.827393431167016, 'support': 678.0}, 'eval_accuracy': 0.9117857142857143, 'eval_macro avg': {'precision': 0.8720879387514264, 'recall': 0.898642261572124, 'f1-score': 0.8840733046936063, 'support': 2800.0}, 'eval_weighted avg': {'precision': 0.9163874657000444, 'recall': 0.9117857142857143, 'f1-score': 0.9133039251837479, 'support': 2800.0}, 'eval_runtime': 356.1955, 'eval_samples_per_second': 7.861, 'eval_steps_per_second': 0.983, 'epoch': 2.0}


 71%|███████▏  | 3000/4200 [5:18:11<58:47,  2.94s/it]    

{'loss': 0.152, 'learning_rate': 1.6216216216216218e-05, 'epoch': 2.14}


 83%|████████▎ | 3500/4200 [6:27:04<31:44,  2.72s/it]    

{'loss': 0.1117, 'learning_rate': 9.45945945945946e-06, 'epoch': 2.5}


 95%|█████████▌| 4000/4200 [6:54:40<11:25,  3.43s/it]  

{'loss': 0.1069, 'learning_rate': 2.702702702702703e-06, 'epoch': 2.86}


                                                     
100%|██████████| 4200/4200 [7:13:13<00:00,  6.19s/it]

{'eval_loss': 0.42944231629371643, 'eval_0': {'precision': 0.9399120664386907, 'recall': 0.9376218323586745, 'f1-score': 0.9387655525737985, 'support': 2052.0}, 'eval_1': {'precision': 0.8300132802124834, 'recall': 0.8355614973262032, 'f1-score': 0.832778147901399, 'support': 748.0}, 'eval_accuracy': 0.9103571428571429, 'eval_macro avg': {'precision': 0.8849626733255871, 'recall': 0.8865916648424388, 'f1-score': 0.8857718502375987, 'support': 2800.0}, 'eval_weighted avg': {'precision': 0.9105533906896897, 'recall': 0.9103571428571429, 'f1-score': 0.9104517744684574, 'support': 2800.0}, 'eval_runtime': 405.7734, 'eval_samples_per_second': 6.9, 'eval_steps_per_second': 0.863, 'epoch': 3.0}
{'train_runtime': 25993.2571, 'train_samples_per_second': 1.293, 'train_steps_per_second': 0.162, 'train_loss': 0.22783626261211576, 'epoch': 3.0}





TrainOutput(global_step=4200, training_loss=0.22783626261211576, metrics={'train_runtime': 25993.2571, 'train_samples_per_second': 1.293, 'train_steps_per_second': 0.162, 'train_loss': 0.22783626261211576, 'epoch': 3.0})

In [14]:
# 10. Evaluate the Model
eval_results = trainer.evaluate()
print(eval_results)

100%|██████████| 350/350 [06:48<00:00,  1.17s/it]

{'eval_loss': 0.42944231629371643, 'eval_0': {'precision': 0.9399120664386907, 'recall': 0.9376218323586745, 'f1-score': 0.9387655525737985, 'support': 2052.0}, 'eval_1': {'precision': 0.8300132802124834, 'recall': 0.8355614973262032, 'f1-score': 0.832778147901399, 'support': 748.0}, 'eval_accuracy': 0.9103571428571429, 'eval_macro avg': {'precision': 0.8849626733255871, 'recall': 0.8865916648424388, 'f1-score': 0.8857718502375987, 'support': 2800.0}, 'eval_weighted avg': {'precision': 0.9105533906896897, 'recall': 0.9103571428571429, 'f1-score': 0.9104517744684574, 'support': 2800.0}, 'eval_runtime': 409.2196, 'eval_samples_per_second': 6.842, 'eval_steps_per_second': 0.855, 'epoch': 3.0}





In [15]:
def predict_texts(texts, model, tokenizer, max_length=512):
    model.eval()  # Set model to evaluation mode
    predictions = []

    for text in texts:
        # Tokenize the text
        encoding = tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=max_length,
            return_tensors="pt"
        )

        # Move to GPU if available
        input_ids = encoding['input_ids'].to(model.device)
        attention_mask = encoding['attention_mask'].to(model.device)

        # Perform prediction without calculating gradients
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            predicted_class = torch.argmax(logits, dim=1).item()
            predictions.append(predicted_class)
    
    return predictions

In [17]:
# Sample texts for prediction
sample_texts = [
    "A massive earthquake has hit the city, buildings are collapsing!",
    "The weather is nice today, perfect for a picnic.",
    "Emergency services are responding to the flood in the area.",
    "The local sports team won the championship game last night.",
    "A new restaurant has opened downtown, serving delicious food.",
    "There are reports of a wildfire spreading in the forest.",
]

# Use the prediction function
predictions = predict_texts(sample_texts, model, tokenizer)
label_map = {0: "non_real_time_crisis", 1: "real_time_crisis"}

for text, pred in zip(sample_texts, predictions):
    print(f"Text: {text}\nPrediction: {label_map[pred]}\n")

Text: A massive earthquake has hit the city, buildings are collapsing!
Prediction: real_time_crisis

Text: The weather is nice today, perfect for a picnic.
Prediction: real_time_crisis

Text: Emergency services are responding to the flood in the area.
Prediction: real_time_crisis

Text: The local sports team won the championship game last night.
Prediction: non_real_time_crisis

Text: A new restaurant has opened downtown, serving delicious food.
Prediction: non_real_time_crisis

Text: There are reports of a wildfire spreading in the forest.
Prediction: non_real_time_crisis



In [None]:
model.save_pretrained('../saved_model')

# Save the tokenizer
tokenizer.save_pretrained('../saved_model')

('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/vocab.txt',
 './saved_model/added_tokens.json')