# Natural Language Processing with Disaster Tweets (v2)

ML Sample of NLP.

## Dataset

Natural Language Processing with Disaster Tweets

- Predict which Tweets are about real disasters and which ones are not
  - https://www.kaggle.com/competitions/nlp-getting-started/overview


In [1]:
import pandas as pd
import numpy as np
import re

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, clone

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier


In [2]:
# Methods preparation
def clean_text(text: str) -> str:
    """Clean text with remove hashtag, user name, and URL"""
    text = re.sub(r"#\w+", "", text)
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"@\w+", "", text)
    return text


def fill_missing_keyword_and_location(df: pd.DataFrame) -> None:
    """Complete missing values in the 'keyword' and 'location' columns of a DataFrame"""
    df['keyword'].fillna('unknown_keyword', inplace=True)
    df['location'].fillna('unknown_location', inplace=True)


def train_model(model: BaseEstimator, X_train_data: list, y_train_data: list) -> BaseEstimator:
    """Train a machine learning model
    
    Args:
        model: Machine learning model to be trained.
        X_train_data, y_train_data: Training data and labels.
        
    Returns:
        Trained model
    """
    model_copy = clone(model)
    model_copy.fit(X_train_data, y_train_data)
    return model_copy


def evaluate_trained_model(model: BaseEstimator, X_test_data: list, y_test_data: list, feature_type: str) -> None:
    """Evaluate a trained Machine Learning model using various metrics
    
    Args:
        model: Trained machine learning model.
        X_test_data, y_test_data: Test data and labels.
        feature_type: Description of feature used.
    """
    y_pred = model.predict(X_test_data)

    print(f"Evaluation: {model.__class__.__name__} ({feature_type})\n")  
    print("Accuracy:", accuracy_score(y_test_data, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test_data, y_pred))
    print("Classification Report:\n", classification_report(y_test_data, y_pred))


In [3]:
# Load Train Dataset
df_train = pd.read_csv("./raw_data/train.csv")

df_train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
# Preprocessing: fill NaN
fill_missing_keyword_and_location(df_train)

In [5]:
# Preprocessing: clean text
df_train['text'] = df_train['text'].apply(clean_text)

df_train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,unknown_keyword,unknown_location,Our Deeds are the Reason of this May ALLAH Fo...,1
1,4,unknown_keyword,unknown_location,Forest fire near La Ronge Sask. Canada,1
2,5,unknown_keyword,unknown_location,All residents asked to 'shelter in place' are ...,1
3,6,unknown_keyword,unknown_location,"13,000 people receive evacuation orders in Ca...",1
4,7,unknown_keyword,unknown_location,Just got sent this photo from Ruby as smoke f...,1


In [6]:
# Feature Engineering: TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = vectorizer.fit_transform(df_train['text'])

In [7]:
# Check vectorizer vocabulary
vocabulary = vectorizer.vocabulary_

first_n_pairs = {k: vocabulary[k] for k in list(vocabulary)[:10]}
print("First 10 vocabulary items:", first_n_pairs)

First 10 vocabulary items: {'our': 3172, 'are': 416, 'the': 4398, 'reason': 3563, 'of': 3101, 'this': 4419, 'may': 2809, 'allah': 300, 'us': 4627, 'all': 299}


In [8]:
# Feature Engineering: Label Encoding for 'keyword' and 'location'
label_encoder = LabelEncoder()
df_train['keyword_encoded'] = label_encoder.fit_transform(df_train['keyword'])
df_train['location_encoded'] = label_encoder.fit_transform(df_train['location'])

display(
    "Top 10 unique encoded keyword:",
    df_train['keyword_encoded'].value_counts().head(10),
)
display(
    "Top 10 unique encoded location:",
    df_train['location_encoded'].value_counts().head(10)
)

'Top 10 unique encoded keyword:'

206    61
104    45
63     42
8      42
177    41
57     41
119    41
29     41
95     40
106    40
Name: keyword_encoded, dtype: int64

'Top 10 unique encoded location:'

3268    2533
2643     104
1826      71
2662      50
1506      45
587       29
1860      28
2632      27
1534      26
1262      24
Name: location_encoded, dtype: int64

In [9]:
# Feature Engineering: Combine all features
X_hstack = hstack([
    X_tfidf,
    df_train[['keyword_encoded', 'location_encoded']].values
])

In [10]:
y = df_train['target']

In [11]:
# Model Building: split
# for only TF-IDF Vectorization
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(
    X_tfidf,
    y,
    test_size=0.2,
    random_state=42
)
# for Combine all features
X_train_hstack, X_test_hstack, _, _ = train_test_split(
    X_hstack,
    y,
    test_size=0.2,
    random_state=42
)

In [12]:
# Model Building: Logistic Regression
model_lr = LogisticRegression()

# Train and evaluate model with TF-IDF features only
model_lr_tfidf = train_model(model_lr, X_train_tfidf, y_train)
evaluate_trained_model(
    model_lr_tfidf,
    X_test_tfidf,
    y_test,
    'only TF-IDF Vectorization'
)

# Train and evaluate model with all features
model_lr_all_features = train_model(model_lr, X_train_hstack, y_train)
evaluate_trained_model(
    model_lr_all_features,
    X_test_hstack,
    y_test,
    'Combine all features'
)

Evaluation: LogisticRegression (only TF-IDF Vectorization)

Accuracy: 0.8030203545633617
Confusion Matrix:
 [[773 101]
 [199 450]]
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.88      0.84       874
           1       0.82      0.69      0.75       649

    accuracy                           0.80      1523
   macro avg       0.81      0.79      0.79      1523
weighted avg       0.80      0.80      0.80      1523

Evaluation: LogisticRegression (Combine all features)

Accuracy: 0.8003939592908733
Confusion Matrix:
 [[779  95]
 [209 440]]
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.89      0.84       874
           1       0.82      0.68      0.74       649

    accuracy                           0.80      1523
   macro avg       0.81      0.78      0.79      1523
weighted avg       0.80      0.80      0.80      1523



In [13]:
# Model Building: Multinomial Naive Bayes
model_nb = MultinomialNB()

# Train and evaluate model with TF-IDF features only
model_nb_tfidf = train_model(model_nb, X_train_tfidf, y_train)
evaluate_trained_model(
    model_nb_tfidf,
    X_test_tfidf,
    y_test,
    'only TF-IDF Vectorization'
)

# Train and evaluate model with all features
model_nb_all_features = train_model(model_nb, X_train_hstack, y_train)
evaluate_trained_model(
    model_nb_all_features,
    X_test_hstack,
    y_test,
    'Combine all features'
)

Evaluation: MultinomialNB (only TF-IDF Vectorization)

Accuracy: 0.804333552199606
Confusion Matrix:
 [[795  79]
 [219 430]]
Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.91      0.84       874
           1       0.84      0.66      0.74       649

    accuracy                           0.80      1523
   macro avg       0.81      0.79      0.79      1523
weighted avg       0.81      0.80      0.80      1523

Evaluation: MultinomialNB (Combine all features)

Accuracy: 0.6060407091267236
Confusion Matrix:
 [[568 306]
 [294 355]]
Classification Report:
               precision    recall  f1-score   support

           0       0.66      0.65      0.65       874
           1       0.54      0.55      0.54       649

    accuracy                           0.61      1523
   macro avg       0.60      0.60      0.60      1523
weighted avg       0.61      0.61      0.61      1523



In [14]:
# Model Building: Random Forest
model_rf = RandomForestClassifier()

# Train and evaluate model with TF-IDF features only
model_rf_tfidf = train_model(model_rf, X_train_tfidf, y_train)
evaluate_trained_model(
    model_rf_tfidf,
    X_test_tfidf,
    y_test,
    'only TF-IDF Vectorization'
)

# Train and evaluate model with all features
model_rf_all_features = train_model(model_rf, X_train_hstack, y_train)
evaluate_trained_model(
    model_rf_all_features,
    X_test_hstack,
    y_test,
    'Combine all features'
)

Evaluation: RandomForestClassifier (only TF-IDF Vectorization)

Accuracy: 0.7852921864740644
Confusion Matrix:
 [[779  95]
 [232 417]]
Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.89      0.83       874
           1       0.81      0.64      0.72       649

    accuracy                           0.79      1523
   macro avg       0.79      0.77      0.77      1523
weighted avg       0.79      0.79      0.78      1523

Evaluation: RandomForestClassifier (Combine all features)

Accuracy: 0.7787261982928431
Confusion Matrix:
 [[778  96]
 [241 408]]
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.89      0.82       874
           1       0.81      0.63      0.71       649

    accuracy                           0.78      1523
   macro avg       0.79      0.76      0.76      1523
weighted avg       0.78      0.78      0.77      1523



In [15]:
# Model Building: Support Vector Machine
model_svm = SVC()

# Train and evaluate model with TF-IDF features only
model_svm_tfidf = train_model(model_svm, X_train_tfidf, y_train)
evaluate_trained_model(
    model_svm_tfidf,
    X_test_tfidf,
    y_test,
    'only TF-IDF Vectorization'
)

# Train and evaluate model with all features
model_svm_all_features = train_model(model_svm, X_train_hstack, y_train)
evaluate_trained_model(
    model_svm_all_features,
    X_test_hstack,
    y_test,
    'Combine all features'
)

Evaluation: SVC (only TF-IDF Vectorization)

Accuracy: 0.8030203545633617
Confusion Matrix:
 [[786  88]
 [212 437]]
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.90      0.84       874
           1       0.83      0.67      0.74       649

    accuracy                           0.80      1523
   macro avg       0.81      0.79      0.79      1523
weighted avg       0.81      0.80      0.80      1523

Evaluation: SVC (Combine all features)

Accuracy: 0.5738673670387393
Confusion Matrix:
 [[874   0]
 [649   0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.57      1.00      0.73       874
           1       0.00      0.00      0.00       649

    accuracy                           0.57      1523
   macro avg       0.29      0.50      0.36      1523
weighted avg       0.33      0.57      0.42      1523



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
# Model Building: K_Neighbors
model_knb = KNeighborsClassifier()

# Train and evaluate model with TF-IDF features only
model_knb_tfidf = train_model(model_knb, X_train_tfidf, y_train)
evaluate_trained_model(
    model_knb_tfidf,
    X_test_tfidf,
    y_test,
    'only TF-IDF Vectorization'
)

# Train and evaluate model with all features
model_knb_all_features = train_model(model_knb, X_train_hstack, y_train)
evaluate_trained_model(
    model_knb_all_features,
    X_test_hstack,
    y_test,
    'Combine all features'
)

Evaluation: KNeighborsClassifier (only TF-IDF Vectorization)

Accuracy: 0.6644780039395929
Confusion Matrix:
 [[867   7]
 [504 145]]
Classification Report:
               precision    recall  f1-score   support

           0       0.63      0.99      0.77       874
           1       0.95      0.22      0.36       649

    accuracy                           0.66      1523
   macro avg       0.79      0.61      0.57      1523
weighted avg       0.77      0.66      0.60      1523

Evaluation: KNeighborsClassifier (Combine all features)

Accuracy: 0.6270518713066316
Confusion Matrix:
 [[632 242]
 [326 323]]
Classification Report:
               precision    recall  f1-score   support

           0       0.66      0.72      0.69       874
           1       0.57      0.50      0.53       649

    accuracy                           0.63      1523
   macro avg       0.62      0.61      0.61      1523
weighted avg       0.62      0.63      0.62      1523

