In [1]:
import pandas as pd
from transformers import pipeline

In [2]:
# Load the labelled training tweets and keep only the rows flagged as real
# disasters (target == 1).
classified_df = pd.read_csv("train.csv")
is_disaster = classified_df["target"].eq(1)
disaster_tweets = classified_df[is_disaster]

Select only Disaster tweets

In [3]:
# Report how many disaster tweets were selected, then renumber the rows 0..n-1.
# The frame is rebound instead of mutated with inplace=True: `disaster_tweets`
# is a filtered selection of `classified_df`, and rebinding keeps this cell
# idempotent without touching the parent frame.
print(f"Total Disaster Tweets: {len(disaster_tweets)}")
disaster_tweets = disaster_tweets.reset_index(drop=True)

Total Disaster Tweets: 3271


Create a new Dataset with only Disaster Tweets

In [4]:
# Persist the disaster-only subset. `to_csv` returns None when given a path,
# so its result is deliberately not assigned to a variable.
disaster_tweets.to_csv("disaster_tweets_train.csv", index=False)

# Severity Classification Using a Hybrid Approach (VADER + Keyword-Based)

- VADER is fast and lightweight (works without a GPU).
- Works well for short texts like tweets
- Doesn't require training data - pre-built lexicon

In [13]:
import re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [14]:
# Reload the disaster-only subset from disk and preview its first rows.
df_dataset = pd.read_csv('disaster_tweets_train.csv')
df_dataset.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


Define Severity Keywords

In [18]:
# Keyword lexicons for the rule-based severity labeller. `classify_severity`
# lower-cases each tweet and checks the tiers in the order
# severe -> moderate -> mild, so every entry here must be lower-case.

# Lowest tier: everyday weather and minor disruption.
# NOTE(review): "heatwave" and "evacuation alert" sit in the mild tier —
# confirm that is intended.
mild_keywords = [
    "rain", "wind", "flooded roads", "small fire", "light drizzle", "cloudy", "showers", "mist", "puddle",
    "sprinkle", "gentle breeze", "fog", "haze", "overcast", "minor landslide", "light storm", "storm warning",
    "roadblock", "tree fallen", "small accident", "traffic delay", "mild cold", "heatwave", "warm weather",
    "light snow", "ice patches", "muddy roads", "low visibility", "power fluctuation", "communication issue",
    "transportation delay", "evacuation alert", "minor injuries", "isolated incident", "waterlogging"
]

# Middle tier: damage, outages and events that call for a response.
moderate_keywords = [
    "damaged buildings", "collapsed roof", "power outage", "strong winds", "heavy rain", "thunderstorm",
    "landslide", "mudslide", "flash flood", "bridge damage", "highway closure", "road erosion", "hailstorm",
    "tornado warning", "moderate injuries", "partial collapse", "severe traffic disruption", "fire outbreak",
    "electricity cut", "flooded basement", "forced evacuation", "rescue operation", "moderate casualties",
    "hospitalization", "disaster relief", "weather advisory", "tropical storm", "communication blackout",
    "food shortage", "water supply disruption", "tsunami alert", "gas leak", "volcanic ash"
]

# Highest tier: large-scale or life-threatening events. This tier is checked
# first, so e.g. "tornado" here wins over "tornado warning" in the moderate tier.
severe_keywords = [
    "earthquake", "tsunami", "destroyed", "many casualties", "collapsed buildings", "major landslide",
    "massive flood", "hurricane", "tornado", "wildfire", "volcanic eruption", "severe drought", "famine",
    "severe injuries", "mass evacuation", "missing persons", "death toll", "catastrophic event", "total destruction",
    "chemical spill", "radioactive contamination", "oil spill", "airplane crash", "train derailment",
    "terrorist attack", "nuclear disaster", "state of emergency", "international aid", "critical condition",
    "uninhabitable", "displacement", "mass casualties", "collapsed bridge", "citywide blackout",
    "pandemic outbreak", "biological hazard", "civil unrest", "armed conflict", "explosion", "war zone",
    "severe fire", "water crisis", "riots", "looting", "disaster zone", "humanitarian crisis"
]

In [44]:
def classify_severity(text):
    """Classify a tweet's severity by keyword lookup.

    The text is lower-cased and checked against `severe_keywords`,
    `moderate_keywords` and `mild_keywords`, in that order; the first tier
    with a hit decides the label.

    Keywords are matched as whole words or phrases (an optional plural
    "s"/"es" suffix is accepted) rather than as raw substrings. Plain
    substring matching produced false hits such as "rain" inside "train",
    "wind" inside "window" or "riots" inside "patriots".

    Args:
        text: The tweet text. Non-string values (e.g. NaN from a CSV with
            missing text) are labelled "Unknown" instead of raising.

    Returns:
        One of "Severe", "Moderate", "Mild" or "Unknown" (no keyword matched).
    """
    if not isinstance(text, str):
        return "Unknown"
    lowered = text.lower()

    def mentions_any(keywords):
        # \b anchors the keyword at word boundaries; "#earthquake" still
        # matches because "#" is not a word character.
        return any(
            re.search(r"\b" + re.escape(keyword) + r"(?:s|es)?\b", lowered)
            for keyword in keywords
        )

    if mentions_any(severe_keywords):
        return "Severe"
    if mentions_any(moderate_keywords):
        return "Moderate"
    if mentions_any(mild_keywords):
        return "Mild"
    return "Unknown"

# Label every tweet with the keyword-based classifier.
df_dataset["severity_rule_based"] = df_dataset["text"].apply(classify_severity)

Initialize Sentiment Analyzer

In [45]:
# One shared VADER analyzer instance; `sentiment_severity` reads it as a global.
analyzer = SentimentIntensityAnalyzer()

def sentiment_severity(text):
    """Map a text's VADER compound score onto a severity label.

    Thresholds on the compound score:
      * score <= -0.5          -> "Severe"
      * -0.5 < score <= -0.2   -> "Moderate"
      * score > -0.2           -> "Mild"

    "Unknown" is returned only when none of the comparisons hold.
    """
    compound = analyzer.polarity_scores(text)["compound"]
    if compound <= -0.5:
        return "Severe"
    if compound <= -0.2:
        return "Moderate"
    if compound > -0.2:
        return "Mild"
    return "Unknown"

Apply sentiment analysis severity classification

In [46]:
# Label every tweet from its VADER compound sentiment score.
df_dataset["severity_sentiment_based"] = df_dataset["text"].apply(sentiment_severity)

def final_severity(row):
    """Combine the rule-based and sentiment-based labels, prioritising sentiment.

    When the two labels agree, that label is returned. When they disagree,
    the sentiment-based label wins. The rule-based label is used as a
    fallback only when the sentiment label is "Unknown"; previously both
    branches returned the sentiment label, so the rule-based label never
    influenced the result and "Unknown" could leak into the training labels.

    Args:
        row: Mapping-like row exposing "severity_rule_based" and
            "severity_sentiment_based".

    Returns:
        The combined severity label as a string.
    """
    sentiment_label = row["severity_sentiment_based"]
    rule_label = row["severity_rule_based"]
    if sentiment_label == "Unknown":
        return rule_label
    return sentiment_label

Apply final severity classification

In [48]:
# Combine the two heuristic labels row by row, preview the first ten rows
# side by side, then persist the labelled frame for the training stage.
df_dataset["final_severity"] = df_dataset.apply(final_severity, axis=1)
preview_columns = ["text", "severity_rule_based", "severity_sentiment_based", "final_severity"]
print(df_dataset[preview_columns].head(10))
df_dataset.to_csv("disaster_tweets_with_severity_train.csv", index=False)

                                                text severity_rule_based  \
0  Our Deeds are the Reason of this #earthquake M...              Severe   
1             Forest fire near La Ronge Sask. Canada             Unknown   
2  All residents asked to 'shelter in place' are ...             Unknown   
3  13,000 people receive #wildfires evacuation or...              Severe   
4  Just got sent this photo from Ruby #Alaska as ...              Severe   
5  #RockyFire Update => California Hwy. 20 closed...              Severe   
6  #flood #disaster Heavy rain causes flash flood...            Moderate   
7  I'm on top of the hill and I can see a fire in...             Unknown   
8  There's an emergency evacuation happening now ...             Unknown   
9  I'm afraid that the tornado is coming to our a...              Severe   

  severity_sentiment_based final_severity  
0                     Mild           Mild  
1                 Moderate       Moderate  
2                 Moderate     

Manually checking the results printed above shows that the keyword-based + VADER approach is reasonable but not very accurate. We therefore move to a transformer-based model for severity classification.

In [49]:
# Summarise columns, dtypes and non-null counts of the labelled frame.
df_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3271 entries, 0 to 3270
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   id                        3271 non-null   int64 
 1   keyword                   3229 non-null   object
 2   location                  2196 non-null   object
 3   text                      3271 non-null   object
 4   target                    3271 non-null   int64 
 5   severity_sentiment_based  3271 non-null   object
 6   severity_rule_based       3271 non-null   object
 7   final_severity            3271 non-null   object
dtypes: int64(2), object(6)
memory usage: 204.6+ KB


# Severity Classification using DistilBERT

In [27]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F

In [50]:
import pandas as pd
# Reload the heuristically labelled tweets saved by the previous section.
df_dataset = pd.read_csv('disaster_tweets_with_severity_train.csv')

Define the transformer model and tokenizer

In [51]:
# Start from the pretrained uncased DistilBERT checkpoint. num_labels=3 sizes
# the classification head for three output classes; the head is newly
# initialised and has to be trained before its predictions mean anything.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [52]:
# Severity name -> integer class id used as the training target.
severity_labels = {"Mild": 0, "Moderate": 1, "Severe": 2}

Custom Dataset Class

In [53]:
class SeverityDataset(Dataset):
    """Torch dataset that tokenizes one text per item and pairs it with its label.

    Args:
        texts: Indexable sequence of raw strings.
        labels: Indexable sequence of integer class ids, aligned with `texts`.
        tokenizer: Callable accepting (text, truncation, padding, max_length,
            return_tensors) and returning a mapping with "input_ids" and
            "attention_mask" tensors that carry a leading batch dimension of 1.
        max_len: Length every sequence is padded/truncated to.

    Raises:
        ValueError: If `texts` and `labels` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        # Fail early: a mismatch would otherwise surface as an IndexError
        # part-way through an epoch.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized text at `idx` plus its label as a long tensor."""
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
        )
        # squeeze(0) removes only the batch dimension added by
        # return_tensors="pt". A bare squeeze() would also collapse the
        # sequence dimension whenever max_len == 1.
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "label": torch.tensor(self.labels[idx], dtype=torch.long),
        }

In [54]:
# Weak supervision: the targets are the heuristic `final_severity` labels
# produced earlier, not human annotations. Rows whose label has no class id
# in `severity_labels` (e.g. "Unknown") are dropped instead of raising KeyError.
labelled_df = df_dataset[df_dataset["final_severity"].isin(list(severity_labels))]
train_texts = labelled_df["text"].tolist()
train_labels = labelled_df["final_severity"].map(severity_labels).tolist()

train_dataset = SeverityDataset(train_texts, train_labels, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)


Training Loop

In [55]:
# Use the GPU when CUDA is available, otherwise the CPU, and move the model
# there before building the optimizer over its parameters.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

def train_model(model, train_loader, optimizer, epochs=3, device=None):
    """Fine-tune `model` on `train_loader` and print the mean loss per epoch.

    Args:
        model: Module called as model(input_ids, attention_mask=..., labels=...)
            whose output exposes a `.loss` tensor.
        train_loader: Iterable of batches; each batch is a mapping with
            "input_ids", "attention_mask" and "label" tensors.
        optimizer: Optimizer over the model's parameters.
        epochs: Number of passes over `train_loader`.
        device: Device the batches are moved to. Defaults to the device of the
            model's own parameters, so the function no longer depends on a
            notebook-global `device` variable having been defined.

    Raises:
        ValueError: If `train_loader` yields no batches (previously this
            surfaced as a ZeroDivisionError when averaging the loss).
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        num_batches = 0
        for batch in train_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        if num_batches == 0:
            raise ValueError("train_loader yielded no batches; nothing to train on")
        print(f"Epoch {epoch + 1}, Loss: {total_loss / num_batches:.4f}")

# Fine-tune with the default number of epochs.
train_model(model, train_loader, optimizer)

Epoch 1, Loss: 0.8008
Epoch 2, Loss: 0.4626
Epoch 3, Loss: 0.2803


Save the trained model

In [None]:
# Write the fine-tuned weights and the tokenizer files into the same
# directory so both can later be reloaded from that single path.
model.save_pretrained("severity_classifier_model")
tokenizer.save_pretrained("severity_classifier_model")

('severity_classifier_model\\tokenizer_config.json',
 'severity_classifier_model\\special_tokens_map.json',
 'severity_classifier_model\\vocab.txt',
 'severity_classifier_model\\added_tokens.json')

In [57]:
def predict_severity(texts, model, tokenizer, batch_size=32, id2label=None):
    """Predict a severity name for each text with the fine-tuned classifier.

    Texts are tokenized and scored in mini-batches, so a large corpus is not
    pushed through the model in a single forward pass. Inputs are moved to the
    device the model's parameters live on, rather than to a notebook-global
    `device` variable.

    Args:
        texts: List of strings (a single string is treated as one text).
        model: Sequence classifier whose output exposes `.logits`.
        tokenizer: Tokenizer callable returning a mapping of tensors.
        batch_size: Number of texts scored per forward pass; must be positive.
        id2label: Optional mapping class id -> severity name. Defaults to the
            inverse of the module-level `severity_labels` (name -> id), which
            avoids relying on dict key order.

    Returns:
        List of severity names, one per input text, in input order.

    Raises:
        ValueError: If `batch_size` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)
    if id2label is None:
        id2label = {class_id: name for name, class_id in severity_labels.items()}

    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    predicted_names = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            chunk = texts[start:start + batch_size]
            encoded = tokenizer(chunk, truncation=True, padding=True, return_tensors="pt")
            outputs = model(**{key: value.to(target_device) for key, value in encoded.items()})
            # argmax over raw logits equals argmax over softmax probabilities.
            class_ids = torch.argmax(outputs.logits, dim=1).cpu().tolist()
            predicted_names.extend(id2label[class_id] for class_id in class_ids)
    return predicted_names

# Replace the heuristic labels in `final_severity` with the fine-tuned model's
# predictions for every tweet.
# NOTE(review): this overwrites the column the training labels were read from,
# so re-running the training-data cell afterwards would train on the model's
# own predictions — confirm that is intended.
df_dataset["final_severity"] = predict_severity(df_dataset["text"].tolist(), model, tokenizer)

In [58]:
# Save the tweets together with the model-predicted severity.
df_dataset.to_csv("disaster_tweets_with_severity.csv", index=False)

In [60]:
# Show the first 25 tweets next to their predicted severity for a manual check.
print(df_dataset[["text", "final_severity"]].head(25))

                                                 text final_severity
0   Our Deeds are the Reason of this #earthquake M...           Mild
1              Forest fire near La Ronge Sask. Canada       Moderate
2   All residents asked to 'shelter in place' are ...       Moderate
3   13,000 people receive #wildfires evacuation or...           Mild
4   Just got sent this photo from Ruby #Alaska as ...           Mild
5   #RockyFire Update => California Hwy. 20 closed...       Moderate
6   #flood #disaster Heavy rain causes flash flood...         Severe
7   I'm on top of the hill and I can see a fire in...           Mild
8   There's an emergency evacuation happening now ...       Moderate
9   I'm afraid that the tornado is coming to our a...           Mild
10        Three people died from the heat wave so far         Severe
11  Haha South Tampa is getting flooded hah- WAIT ...           Mild
12  #raining #flooding #Florida #TampaBay #Tampa 1...       Moderate
13            #Flood in Bago Myanm

# Testing the model on test_data

In [82]:
# Keep only the test tweets that the disaster classifier predicted as real
# disasters (predicted_label == 1) and store them for severity scoring.
# The accidental duplicated assignment (`df_test = df_test = ...`) is removed.
df_test = pd.read_csv('disaster_classification_results.csv')
df_disaster_tweets = df_test[df_test["predicted_label"] == 1].reset_index(drop=True)
df_disaster_tweets.to_csv("disaster_tweets_test.csv", index=False)

In [83]:
import pandas as pd
# Reload the predicted-disaster test tweets and pull out the raw text as a
# plain list for the tokenizer.
df_test = pd.read_csv('disaster_tweets_test.csv')
test_texts = df_test["text"].tolist()

In [92]:
# Preview the first 25 test rows.
df_test.head(25)

Unnamed: 0,id,keyword,location,text,predicted_label,predicted_severity,disaster_label
0,0,Unknown,Unknown,Just happened a terrible car crash,1,Severe,2
1,2,Unknown,Unknown,"Heard about #earthquake is different cities, s...",1,Mild,0
2,3,Unknown,Unknown,"there is a forest fire at spot pond, geese are...",1,Severe,2
3,9,Unknown,Unknown,Apocalypse lighting. #Spokane #wildfires,1,Mild,0
4,11,Unknown,Unknown,Typhoon Soudelor kills 28 in China and Taiwan,1,Severe,2
5,12,Unknown,Unknown,We're shaking...It's an earthquake,1,Mild,0
6,46,ablaze,London,Birmingham Wholesale Market is ablaze BBC News...,1,Mild,0
7,75,ablaze,India,Rape victim dies as she sets herself ablaze: A...,1,Severe,2
8,99,accident,"Homewood, PA",Accident cleared in #PaTurnpike on PATP EB bet...,1,Moderate,1
9,111,accident,Bexhill,@Traffic_SouthE @roadpol_east Accident on A27 ...,1,Moderate,1


Load the Saved Model & Tokenizer

In [93]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_path = "severity_classifier_model"

# Restore the fine-tuned classifier and switch it straight into evaluation
# mode (eval() returns the module itself).
model = AutoModelForSequenceClassification.from_pretrained(model_path).eval()

# The tokenizer was saved in the same directory as the model.
tokenizer = AutoTokenizer.from_pretrained(model_path)

Tokenize the Test Data

In [95]:
# Tokenize all test tweets in one call, with padding and truncation enabled,
# returning PyTorch tensors ("pt").
test_encodings = tokenizer(test_texts, padding=True, truncation=True, return_tensors="pt")

Run Inference to Predict Severity

In [97]:
# Forward pass without gradient tracking; the class id for each tweet is the
# index of its largest logit.
with torch.no_grad():
    outputs = model(**test_encodings)
logits = outputs.logits
predicted_labels = logits.argmax(dim=1).tolist()

Map Predictions to Severity Labels

In [98]:
# Class id -> severity name, the inverse of the training-time `severity_labels`
# (name -> id). It gets its own name so that `severity_labels` is not
# overwritten: the training-data cell and `predict_severity` both expect the
# name -> id orientation and would break on re-run if it were clobbered here.
id_to_severity = {0: "Mild", 1: "Moderate", 2: "Severe"}
df_test["predicted_severity"] = [id_to_severity[label] for label in predicted_labels]


In [99]:
# "disaster_label" holds the predicted severity class id (0 = Mild,
# 1 = Moderate, 2 = Severe). The column name is kept so the saved CSV schema
# does not change.
df_test["disaster_label"] = predicted_labels

# Select disaster tweets via the disaster classifier's own flag. Filtering on
# `disaster_label == 1` kept only the "Moderate" severity rows, because that
# column contains severity ids rather than a disaster/non-disaster flag.
df_disaster_tweets = df_test[df_test["predicted_label"] == 1].reset_index(drop=True)

In [101]:
print(df_test[["text", "predicted_severity"]].head(25))  # Show first 25 predictions

# Save the test dataset with predicted severity. The path is defined once so
# the confirmation message always names the file that was actually written
# (the message previously printed a different filename).
results_path = "disaster_tweets_with_predicted_severity_results.csv"
df_test.to_csv(results_path, index=False)
print(f"Predictions saved to '{results_path}'.")


                                                 text predicted_severity
0                  Just happened a terrible car crash             Severe
1   Heard about #earthquake is different cities, s...               Mild
2   there is a forest fire at spot pond, geese are...             Severe
3            Apocalypse lighting. #Spokane #wildfires               Mild
4       Typhoon Soudelor kills 28 in China and Taiwan             Severe
5                  We're shaking...It's an earthquake               Mild
6   Birmingham Wholesale Market is ablaze BBC News...               Mild
7   Rape victim dies as she sets herself ablaze: A...             Severe
8   Accident cleared in #PaTurnpike on PATP EB bet...           Moderate
9   @Traffic_SouthE @roadpol_east Accident on A27 ...           Moderate
10  For Legal and Medical Referral Service @1800_I...           Moderate
11  On the #M42 northbound between junctions J3 an...           Moderate
12  ACCIDENT - HIT AND RUN - COLD at 500 BLOCK OF .