# Natural Language Processing with Disaster Tweets (v2)

A sample machine-learning notebook for an NLP text-classification task.

## Dataset

Kaggle competition: Natural Language Processing with Disaster Tweets

- Task: predict which tweets are about real disasters and which are not.
  - https://www.kaggle.com/competitions/nlp-getting-started/overview


In [1]:
import re

import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [2]:
# Load the training CSV from the local raw_data folder
df_train = pd.read_csv(filepath_or_buffer="./raw_data/train.csv")

# Preview the first rows
df_train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
# Methods preparation
def clean_text(text: str) -> str:
    """Strip hashtags, URLs and @-mentions from a tweet.

    Each matched span is deleted outright; the whitespace around it is left
    untouched, so removed tokens can leave consecutive spaces behind.

    Args:
        text: Raw tweet text.

    Returns:
        The text with every ``#tag``, ``http...`` token and ``@name`` removed.
    """
    # Applied in this order: hashtags, then URLs, then user mentions.
    for pattern in (r"#\w+", r"http\S+", r"@\w+"):
        text = re.sub(pattern, "", text)
    return text


def fill_missing_keyword_and_location(df: pd.DataFrame) -> None:
    """Fill missing 'keyword' and 'location' values with placeholder strings.

    The DataFrame is modified in place: missing entries in 'keyword' become
    'unknown_keyword' and missing entries in 'location' become
    'unknown_location'. Other columns are left untouched.

    Args:
        df: DataFrame that contains 'keyword' and 'location' columns.

    Raises:
        KeyError: If either column is absent from ``df``.
    """
    placeholders = {
        'keyword': 'unknown_keyword',
        'location': 'unknown_location',
    }
    for column, placeholder in placeholders.items():
        # Assign the filled column back instead of calling
        # `df[column].fillna(..., inplace=True)`: that chained in-place call
        # acts on a temporary object under pandas Copy-on-Write (the default
        # from pandas 3.0) and would leave `df` unchanged; pandas 2.x already
        # warns about it.
        df[column] = df[column].fillna(placeholder)


def evaluate_model(y_true_test: np.ndarray, y_pred_test: np.ndarray) -> None:
    """Print accuracy, the confusion matrix and a classification report.

    Args:
        y_true_test: Ground-truth labels of the evaluation samples.
        y_pred_test: Labels predicted by the model for the same samples.
    """
    # (label, metric function) pairs, printed in this order.
    reports = (
        ("Accuracy:", accuracy_score),
        ("Confusion Matrix:\n", confusion_matrix),
        ("Classification Report:\n", classification_report),
    )
    for label, metric in reports:
        print(label, metric(y_true_test, y_pred_test))


In [4]:
# Preprocessing: replace NaN in 'keyword' and 'location' with placeholder strings
fill_missing_keyword_and_location(df=df_train)

In [5]:
# Preprocessing: strip hashtags, URLs and @-mentions from every tweet.
# Note: this overwrites the 'text' column of df_train with the cleaned text.
df_train['text'] = df_train['text'].map(clean_text)

df_train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,unknown_keyword,unknown_location,Our Deeds are the Reason of this May ALLAH Fo...,1
1,4,unknown_keyword,unknown_location,Forest fire near La Ronge Sask. Canada,1
2,5,unknown_keyword,unknown_location,All residents asked to 'shelter in place' are ...,1
3,6,unknown_keyword,unknown_location,"13,000 people receive evacuation orders in Ca...",1
4,7,unknown_keyword,unknown_location,Just got sent this photo from Ruby as smoke f...,1


In [6]:
# Feature Engineering: Label Encoding for 'keyword' and 'location'
# Each column gets its own encoder. Re-fitting one shared LabelEncoder on
# 'location' replaces the classes learned from 'keyword', so the keyword codes
# could no longer be inverse-transformed or applied to another DataFrame.
keyword_encoder = LabelEncoder()
location_encoder = LabelEncoder()
df_train['keyword_encoded'] = keyword_encoder.fit_transform(df_train['keyword'])
df_train['location_encoded'] = location_encoder.fit_transform(df_train['location'])

# Backward-compatible alias: the original cell left `label_encoder` fitted on
# 'location', so keep that binding for any cell that still refers to it.
label_encoder = location_encoder

# Show the ten most frequent codes per column (index = encoded value, value = row count).
display(
    "Top 10 unique encoded keyword:",
    df_train['keyword_encoded'].value_counts().head(10),
)
display(
    "Top 10 unique encoded location:",
    df_train['location_encoded'].value_counts().head(10)
)

'Top 10 unique encoded keyword:'

206    61
104    45
63     42
8      42
177    41
57     41
119    41
29     41
95     40
106    40
Name: keyword_encoded, dtype: int64

'Top 10 unique encoded location:'

3268    2533
2643     104
1826      71
2662      50
1506      45
587       29
1860      28
2632      27
1534      26
1262      24
Name: location_encoded, dtype: int64

In [7]:
# Feature Engineering: TF-IDF vectorization of the tweet text, with the
# vocabulary capped via max_features=5000.
# NOTE(review): the vectorizer is fitted on every row of df_train before the
# train/test split performed in a later cell, so the held-out rows influence
# the vocabulary and IDF weights — confirm this is acceptable.
vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = vectorizer.fit_transform(raw_documents=df_train['text'])

In [8]:
# Feature Engineering: build the target vector and the combined feature matrix.
y = df_train['target']

# Append the two integer-encoded columns to the right of the TF-IDF columns.
X = hstack((
    X_tfidf,
    df_train[['keyword_encoded', 'location_encoded']].values,
))

In [9]:
# Model Building: hold out 20% of the rows as a test set (seed fixed at 42
# so the split is reproducible across runs).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)