<a href="https://colab.research.google.com/github/machiwao/ml-development/blob/main/kaggle_DisasterTweets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Natural Language Processing with Disaster Tweets

Reference: https://www.kaggle.com/competitions/nlp-getting-started/data


In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
import re
import string
import warnings
warnings.filterwarnings('ignore')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from scipy.sparse import hstack

## Data Loading

In [2]:
# Read train.csv and test.csv from the working directory.
df = pd.read_csv(filepath_or_buffer='train.csv')
dt = pd.read_csv(filepath_or_buffer='test.csv')

# Preview the first rows of the training frame.
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
# Summarize column dtypes and non-null counts for the training frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [4]:
# Count rows per value of the `target` column.
df.value_counts('target')

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,4342
1,3271


In [5]:
# Count rows per value of the `keyword` column.
df.value_counts('keyword')

Unnamed: 0_level_0,count
keyword,Unnamed: 1_level_1
fatalities,45
deluge,42
armageddon,42
sinking,41
damage,41
...,...
forest%20fire,19
epicentre,12
threat,11
inundation,10


In [6]:
# Count missing values in each column of the training frame.
df.isna().sum()

Unnamed: 0,0
id,0
keyword,61
location,2533
text,0
target,0


In [7]:
# Drop the `location` column from both frames. Reassigning (instead of
# inplace=True) together with errors='ignore' keeps this cell safe to re-run:
# a second run no longer raises KeyError for the already-removed column.
df = df.drop(columns=['location'], errors='ignore')
dt = dt.drop(columns=['location'], errors='ignore')

# Replace the remaining missing values with the literal string 'None'.
# The test frame gets the same treatment as the training frame so both are
# handled consistently downstream.
df = df.fillna('None')
dt = dt.fillna('None')

In [8]:
# Count rows that exactly duplicate an earlier row across all columns.
# NOTE(review): the comparison includes `id`, so tweets whose text repeats under
# a different id are not counted here — confirm whether
# df.duplicated(subset=['text']) was intended.
df.duplicated().sum()

0

In [13]:
# Re-check missing values per column after the fill step.
# NOTE(review): the recorded output lists a `train_data` column that no cell
# shown here creates — likely leftover kernel state; re-run from a fresh kernel
# to confirm.
df.isna().sum()

Unnamed: 0,0
id,0
keyword,0
text,0
target,0
train_data,0


## Data Preprocessing

In [16]:
# Character length of every tweet, computed with the vectorized string
# accessor instead of a per-row Python lambda.
df["length"] = df["text"].str.len()
dt["length"] = dt["text"].str.len()

# Summary statistics of tweet length in the training frame.
df['length'].describe()

Unnamed: 0,length
count,7613.0
mean,101.037436
std,33.781325
min,7.0
25%,78.0
50%,107.0
75%,133.0
max,157.0


In [14]:
# Model input is the tweet text; the label is the `target` column.
X, y = df['text'], df['target']

In [19]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [20]:
def text_clean(text, method, rm_stop):
    """Normalize a raw tweet into a lowercase, ASCII, punctuation-free string.

    Steps, in order: turn line breaks into spaces, lowercase, strip URLs,
    strip dates and remaining digits, replace non-ASCII characters with
    spaces, drop punctuation, squeeze whitespace, then optionally remove
    English stop words and lemmatize or stem the tokens.

    Parameters
    ----------
    text : str
        Raw text to clean.
    method : str or None
        'L' lemmatizes with WordNetLemmatizer, 'S' stems with PorterStemmer;
        any other value skips both.
    rm_stop : bool
        When truthy, tokens found in NLTK's English stop-word list are removed.

    Returns
    -------
    str
        Cleaned text with tokens separated by single spaces.
    """
    # Line breaks become a space so the words on either side stay separate.
    text = re.sub(r'[\r\n]+', ' ', text)
    text = text.lower()

    # URLs must go before punctuation is stripped: once ':' and '/' are gone
    # the pattern can no longer match. \S+ stops at the end of the link so the
    # rest of the tweet is preserved.
    text = re.sub(r'https?://\S+', '', text)

    # Dates have to be matched while their digits are still present.
    text = re.sub(r'\d+[./-]\d+[./-]\d+', '', text)
    text = re.sub(r'\d+', '', text)   # remaining digits

    text = re.sub(r'[^\x00-\x7f]', ' ', text)   # non-ASCII -> space
    # Punctuation, including currency signs such as '$' and '+'.
    text = re.sub(r'[^\w\s]', '', text)

    # Squeeze runs of whitespace so every code path returns single-spaced text.
    text = ' '.join(text.split())

    # Remove stop words; the set is built once per call, not once per token.
    if rm_stop:
        stop_words = set(stopwords.words('english'))
        text = " ".join(
            token for token in word_tokenize(text) if token not in stop_words
        )

    # Lemmatization: typically preferred over stemming.
    if method == 'L':
        lemmer = WordNetLemmatizer()
        return " ".join(lemmer.lemmatize(token) for token in word_tokenize(text))

    # Stemming.
    if method == 'S':
        porter = PorterStemmer()
        return " ".join(porter.stem(token) for token in word_tokenize(text))

    return text

In [None]:
# text_clean relies on NLTK's tokenizer, stop-word list and WordNet data, and
# no earlier cell installs them; fetch them here so this cell runs on a fresh
# kernel instead of raising LookupError. Packages that are already up to date
# are not downloaded again.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource, quiet=True)

#preprocessed data: Lemm + stopword removal
preprocessed_text_1 = [text_clean(text, 'L', True) for text in X_train]

#preprocessed data: Lemm + no stopword removal
preprocessed_text_2 = [text_clean(text, 'L', False) for text in X_train]



## Data Visualization and Analysis

## Model Training

## Model Testing and Metrics

## Submission File