# Natural Language Processing with Disaster Tweets (v4)

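A sample machine-learning notebook for natural language processing (NLP): exploratory analysis and text preprocessing for tweet classification.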

## Dataset

Natural Language Processing with Disaster Tweets

- Predict which Tweets are about real disasters and which ones are not
  - https://www.kaggle.com/competitions/nlp-getting-started/overview

In [1]:
import pandas as pd
import re

from nltk.corpus import stopwords
import nltk

# Fetch the NLTK 'stopwords' corpus, which stopwords.words() reads from in a later cell.
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# English stopword list from the NLTK corpus; read by count_stopwords_in_text().
STOPWORDS = stopwords.words('english')

# Show the first 30 entries as a quick sanity check of the list contents.
print(STOPWORDS[:30])

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself']


In [3]:
# Helper functions for EDA and preprocessing
def count_stopwords_in_text(text: str, stopwords=None) -> int:
    """Count how many whitespace-separated tokens of ``text`` are stopwords.

    The text is converted with ``str()``, lower-cased and split on whitespace;
    each resulting token is compared verbatim against the stopword vocabulary
    (no punctuation stripping is performed).

    Args:
        text: The text to inspect. Non-string values are converted with
            ``str()`` before tokenizing.
        stopwords: Optional iterable of lower-case stopwords to match against.
            When omitted, the module-level ``STOPWORDS`` list is used.

    Returns:
        The number of tokens (duplicates included) found in the vocabulary.
    """
    # A set gives constant-time membership tests; scanning the list for every
    # token would cost one pass over the whole vocabulary per word.
    vocabulary = frozenset(STOPWORDS if stopwords is None else stopwords)
    return sum(1 for word in str(text).lower().split() if word in vocabulary)


def clean_text(text: str) -> str:
    """Strip hashtags, URLs and @user mentions from a piece of text.

    Each matched span is replaced with the empty string; surrounding
    whitespace is left untouched.
    """
    # Order matters: hashtags are removed first, then URLs, then mentions.
    removal_patterns = (
        r"#\w+",
        r"http\S+",
        r"@\w+",
    )
    cleaned = text
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, "", cleaned)
    return cleaned


def fill_missing_keyword_and_location(df: pd.DataFrame) -> None:
    """Fill missing 'keyword' and 'location' values with placeholder strings.

    Missing entries in 'keyword' become 'unknown_keyword' and missing entries
    in 'location' become 'unknown_location'. The passed DataFrame is modified
    in place; nothing is returned.

    Args:
        df: DataFrame containing 'keyword' and 'location' columns.

    Raises:
        KeyError: If either column is absent from ``df``.
    """
    placeholders = {
        'keyword': 'unknown_keyword',
        'location': 'unknown_location',
    }
    for column, placeholder in placeholders.items():
        # Assign the filled column back rather than calling
        # fillna(inplace=True) on df[column]: that chained in-place call acts
        # on a temporary selection and does not update df when pandas
        # Copy-on-Write is active.
        df[column] = df[column].fillna(placeholder)

In [4]:
# Load the training dataset from a CSV file.
# NOTE(review): the path is relative to the notebook's working directory — confirm raw_data/train.csv exists there.
df_train = pd.read_csv("./raw_data/train.csv")

# Preview the first three rows.
df_train.head(3)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1


In [5]:
# EDA: count stopwords per row of the 'text' column.
# This runs on the raw text, before clean_text() is applied in a later cell.
df_train['stopwords_count'] = df_train['text'].apply(count_stopwords_in_text)

df_train['stopwords_count']

0        6
1        0
2       11
3        1
4        7
        ..
7608     2
7609     9
7610     2
7611     5
7612     3
Name: stopwords_count, Length: 7613, dtype: int64

In [6]:
# Preprocessing: fill NaN and clean text
# Both steps update df_train itself: the helper fills 'keyword'/'location' on the
# passed frame, and the 'text' column is overwritten with its cleaned version.
fill_missing_keyword_and_location(df_train)
df_train['text'] = df_train['text'].apply(clean_text)

# Preview the first three rows after preprocessing.
df_train.head(3)

Unnamed: 0,id,keyword,location,text,target,stopwords_count
0,1,unknown_keyword,unknown_location,Our Deeds are the Reason of this May ALLAH Fo...,1,6
1,4,unknown_keyword,unknown_location,Forest fire near La Ronge Sask. Canada,1,0
2,5,unknown_keyword,unknown_location,All residents asked to 'shelter in place' are ...,1,11
