# NLP-based classification of Haitian disaster response messages

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import re
import string
import matplotlib.pyplot as plt
from wordcloud import WordCloud

import nltk
import ssl
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_recall_fscore_support,
    classification_report,
    confusion_matrix,
    roc_auc_score,
    roc_curve
)

import warnings
warnings.filterwarnings('ignore')

# Fetch the NLTK resources used below. 'punkt_tab' is the tokenizer data that
# word_tokenize loads on NLTK >= 3.8.2; 'punkt' is kept so that older NLTK
# releases keep working.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource, quiet=True)

In [None]:
# read the raw message table from disk and show its column names
messages = pd.read_csv("disaster_messages.csv")
message_columns = list(messages.columns)
print("Messages columns:", message_columns)

# Amee

In [None]:
# read the category table from disk and show its column names
cat = pd.read_csv("disaster_categories.csv")
category_file_columns = list(cat.columns)
print("Categories columns:", category_file_columns)

# Amee

In [None]:
# inner-join the messages with their category rows on the shared 'id' key
df = pd.merge(messages, cat, on='id', how='inner')
df.head()

# Amee

In [None]:
# split the ';'-separated category string into one column per entry
categories = df["categories"].str.split(';', expand=True)

# column names are the text before the '-' in the first row's entries
first_row = categories.iloc[0]
category_col = first_row.str.split('-').str[0].tolist()
categories.columns = category_col

# keep only the last character of every entry and cast it to a number
categories = categories.apply(lambda col: pd.to_numeric(col.str[-1]))

# swap the raw string column for the parsed numeric columns
df = pd.concat([df.drop(columns=['categories']), categories], axis=1)

# Amee

In [None]:
# keep 'direct' messages plus any row mentioning "Haiti" (case-insensitive)
# in either the 'original' or the 'message' column
is_direct = df['genre'].eq('direct')
haiti_in_original = df['original'].str.contains('Haiti', case=False, na=False)
haiti_in_message = df['message'].str.contains('Haiti', case=False, na=False)
haiti_mask = is_direct | haiti_in_original | haiti_in_message
df = df.loc[haiti_mask].copy()

# disaster-related messages
df = df.loc[df["related"].eq(1)].reset_index(drop=True)

# drop duplicates
df = df.drop_duplicates(subset=['message', 'genre'], ignore_index=True)

# Amee

In [None]:
# binary target: "request" where the request flag equals 1, "info" otherwise
request_flag = df["request"] == 1
df["target"] = np.where(request_flag, "request", "info")

# share of each class, in percent
target_percentages = df["target"].value_counts(normalize=True).mul(100)

# Amee

- conversion to lowercase
- remove URLs, social media mentions, hashtags, punctuation, ...
- tokenization into words
- filtering of English stopwords 
- filtering of non-alphabetic tokens
- lemmatization

In [None]:
def preprocess_text(text):
    """Normalize a raw message into a space-joined string of lemmatized tokens.

    Steps: lowercase, strip URLs / @mentions / #hashtags, strip ASCII
    punctuation, tokenize, drop English stopwords and non-alphabetic tokens,
    lemmatize.

    Args:
        text: Raw message. Anything that is not a ``str`` (e.g. NaN) yields "".

    Returns:
        The surviving tokens joined by single spaces ("" if none survive).
    """
    if not isinstance(text, str):
        return ""

    # Build the stopword set once per call: stopwords.words() returns a fresh
    # list on every invocation, so calling it inside the comprehension would
    # reload and linearly scan that list for each token.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    # lowercase
    text = text.lower()
    # remove URLs, @mentions and #hashtags
    text = re.sub(r'http\S+|www\S+|@\w+|#\w+', '', text)
    # remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # tokenize, filter, lemmatize
    tokens = word_tokenize(text)
    return ' '.join(
        lemmatizer.lemmatize(tok)
        for tok in tokens
        if tok.isalpha() and tok not in stop_words
    )

# clean every raw message element-wise
df['clean_message'] = df['message'].map(preprocess_text)

In [None]:
# tokens per cleaned message and characters per raw message
tokens_per_message = df['clean_message'].str.split()
df['word_count'] = tokens_per_message.str.len()
df['char_count'] = df['message'].str.len()

mean_words = df['word_count'].mean()
shortest, longest = df['char_count'].min(), df['char_count'].max()
print("Word count", mean_words)
print("Message length", shortest, "–", longest)

In [None]:
# stratified 80/20 split: cleaned text as features, target label as response
X, y = df['clean_message'], df['target']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# messages per class, in a fixed request/info order (missing classes count 0)
order = ['request', 'info']
counts = df['target'].value_counts().reindex(order, fill_value=0)

plt.figure(figsize=(6, 4))
ax = sns.barplot(x=counts.index, y=counts.values)
plt.tight_layout()
plt.show()

# Amee

In [None]:
def get_top_words(category, n=10):
    """Return the ``n`` most frequent words among messages of one target class.

    Counts come from a ``CountVectorizer`` (capped at 1000 terms, sklearn
    English stop words) fitted on the ``clean_message`` texts of the global
    ``df`` whose ``target`` equals ``category``.

    Args:
        category: Value of ``df['target']`` to select.
        n: Maximum number of words to return; values <= 0 yield an empty list.

    Returns:
        List of ``(word, count)`` tuples, most frequent first. Empty when
        ``n <= 0`` or when no message has the requested category.
    """
    # Guard first: with n == 0 the slice [-n:] is [0:], i.e. *every* word.
    if n <= 0:
        return []

    subset = df.loc[df['target'] == category, 'clean_message']
    if subset.empty:
        # CountVectorizer raises ValueError when fitted on an empty corpus.
        return []

    vec = CountVectorizer(max_features=1000, stop_words='english')
    term_matrix = vec.fit_transform(subset)
    # column sums of the document-term matrix = corpus-wide count per term
    freq = np.asarray(term_matrix.sum(axis=0)).ravel()
    words = vec.get_feature_names_out()
    # ascending argsort: take the last n indices, then reverse to descending
    top_idx = freq.argsort()[-n:][::-1]
    return [(words[i], freq[i]) for i in top_idx]

def plot_wordcloud(text, title, ax):
    """Draw a word cloud generated from ``text`` on ``ax`` with a bold ``title``.

    The cloud is 400x300 on a white background with the 'viridis' colormap,
    shows at most 100 words, and excludes NLTK's English stopwords. The axes
    frame is hidden.
    """
    cloud_options = dict(
        width=400,
        height=300,
        background_color='white',
        colormap='viridis',
        max_words=100,
        stopwords=set(stopwords.words('english')),
    )
    cloud = WordCloud(**cloud_options).generate(text)

    ax.imshow(cloud, interpolation='bilinear')
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.axis('off')

# one concatenated corpus per class
request_text = ' '.join(df.loc[df['target'] == 'request', 'clean_message'])
info_text = ' '.join(df.loc[df['target'] == 'info', 'clean_message'])

# side-by-side word clouds, requests on the left and info on the right
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(18, 5))
panels = zip(axes, (request_text, info_text), ('Requests', 'Info'))
for panel, corpus, heading in panels:
    plot_wordcloud(corpus, heading, panel)

plt.tight_layout()
plt.show()

# Amee


In [None]:
# pipelines
def make_pipeline(clf):
    """Wrap ``clf`` in a two-step pipeline: TF-IDF features, then the classifier.

    The vectorizer uses 1- to 3-grams, sublinear TF scaling, sklearn's English
    stop words and a 7000-feature cap, and ignores terms that occur in fewer
    than 2 documents or in more than 95% of them.

    Args:
        clf: Estimator placed in the final ``'clf'`` step.

    Returns:
        A ``Pipeline`` with steps ``'tfidf'`` and ``'clf'``.
    """
    vectorizer = TfidfVectorizer(
        stop_words='english',
        ngram_range=(1, 3),
        min_df=2,
        max_df=0.95,
        max_features=7000,
        sublinear_tf=True,
    )
    return Pipeline(steps=[('tfidf', vectorizer), ('clf', clf)])

# candidate classifiers; each one is placed behind the same TF-IDF front end
base_classifiers = {
    'LR': LogisticRegression(class_weight='balanced', random_state=42, max_iter=1000),
    'Naive Bayes': MultinomialNB(),
    'Random Forest': RandomForestClassifier(class_weight='balanced', random_state=42),
}
models = {
    label: make_pipeline(estimator)
    for label, estimator in base_classifiers.items()
}

# train models
for pipe in models.values():
    pipe.fit(X_train, y_train)

In [None]:
# tuning: grid-search the LR pipeline over vocabulary size and the
# regularisation parameter C, scored by macro F1 with 3-fold CV
param_grid = {
    'tfidf__max_features': [5000, 7000],
    'tfidf__ngram_range': [(1, 3)],
    'clf__C': [0.1, 1.0, 10.0]
}

grid_search = GridSearchCV(
    models['LR'],
    param_grid,
    cv=3,
    scoring='f1_macro',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)

# A bare expression in the middle of a cell is not displayed, so print the
# search outcome explicitly.
print("Best params:", grid_search.best_params_)
print(f"Best CV macro F1: {grid_search.best_score_:.5f}")

# best estimator
best_lr = grid_search.best_estimator_

In [None]:
# Evaluate every fitted pipeline, including the tuned LR, on the held-out set.
models = {**models, 'Tuned LR': best_lr}

# Binary ground truth for ROC-AUC (1 = 'request'); identical for every model,
# so it is computed once outside the loop.
y_test_int = (y_test == 'request').astype(int)

results = []

for name, pipe in models.items():
    y_pred = pipe.predict(X_test)
    # Look up the probability column of the 'request' class instead of
    # assuming it is column 1.
    request_col = list(pipe.classes_).index('request')
    y_proba = pipe.predict_proba(X_test)[:, request_col]

    acc = accuracy_score(y_test, y_pred)
    f1_macro = f1_score(y_test, y_pred, average='macro')
    auc = roc_auc_score(y_test_int, y_proba)

    results.append({
        'Model': name,
        'Accuracy': round(acc, 5),
        'Macro F1': round(f1_macro, 5),
        # report the ROC-AUC computed above alongside the other metrics
        'ROC-AUC': round(auc, 5),
    })

results_df = pd.DataFrame(results)
print(results_df.to_string(index=False))

In [None]:
# confusion matrix of the tuned model, rows and columns in request/info order
class_labels = ['request', 'info']
y_pred_tuned = best_lr.predict(X_test)
cm = confusion_matrix(y_test, y_pred_tuned, labels=class_labels)

plt.figure(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.tight_layout()
plt.show()

In [None]:
# class probabilities from the tuned model; column 1 is used as the score
y_proba = best_lr.predict_proba(X_test)
y_proba_positive = y_proba[:, 1]

# encode the labels as 0/1 with 'request' as the positive class
label_map = {'info': 0, 'request': 1}
y_test_int = y_test.map(label_map).values

# ROC-AUC
fpr, tpr, _ = roc_curve(y_test_int, y_proba_positive)
roc_auc = roc_auc_score(y_test_int, y_proba_positive)
print(f"ROC-AUC (binary): {roc_auc:.3f}")

# plot
plt.figure(figsize=(6, 5))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.3f})')
plt.plot([0, 1], [0, 1], color='navy', lw=1, linestyle='--', label='Random classifier')
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.grid(alpha=0.3)
plt.legend(loc="lower right")
plt.tight_layout()
plt.show()