# Disaster Tweet Identification

In [1]:
import re
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
import re
import nltk
from nltk.corpus import stopwords

import warnings
warnings.filterwarnings("ignore")
from collections import defaultdict

In [2]:
data = pd.read_csv("./datas/disaster_tweets.csv")
data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
data.tail()

Unnamed: 0,id,keyword,location,text,target
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1
7612,10873,,,The Latest: More Homes Razed by Northern Calif...,1


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [5]:
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,7613.0,5441.934848,3137.11609,1.0,2734.0,5408.0,8146.0,10873.0
target,7613.0,0.42966,0.49506,0.0,0.0,0.0,1.0,1.0


In [15]:
data["keyword"].unique()

array([nan, 'ablaze', 'accident', 'aftershock', 'airplane%20accident',
       'ambulance', 'annihilated', 'annihilation', 'apocalypse',
       'armageddon', 'army', 'arson', 'arsonist', 'attack', 'attacked',
       'avalanche', 'battle', 'bioterror', 'bioterrorism', 'blaze',
       'blazing', 'bleeding', 'blew%20up', 'blight', 'blizzard', 'blood',
       'bloody', 'blown%20up', 'body%20bag', 'body%20bagging',
       'body%20bags', 'bomb', 'bombed', 'bombing', 'bridge%20collapse',
       'buildings%20burning', 'buildings%20on%20fire', 'burned',
       'burning', 'burning%20buildings', 'bush%20fires', 'casualties',
       'casualty', 'catastrophe', 'catastrophic', 'chemical%20emergency',
       'cliff%20fall', 'collapse', 'collapsed', 'collide', 'collided',
       'collision', 'crash', 'crashed', 'crush', 'crushed', 'curfew',
       'cyclone', 'damage', 'danger', 'dead', 'death', 'deaths', 'debris',
       'deluge', 'deluged', 'demolish', 'demolished', 'demolition',
       'derail', 'der

## Data Cleaning and Text Processing

In [6]:
nltk.download("stopwords")

URL_PATTERN = '((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*'
stopwords = stopwords.words("english")

[nltk_data] Downloading package stopwords to C:\Users\Melis
[nltk_data]     Nur\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


`sub()` function searches for the pattern in the string and replaces the matched strings with the replacement *(repl)*.if couldn't find a match, it returns the original string.

In [7]:
def clean_text(text):
    
    # remove stopwords
    remove_stopwords = ' '.join([word for word in text.split() if word not in stopwords])
    # remove URL
    remove_url = re.sub(URL_PATTERN,'',remove_stopwords)
    # remove punctuation
    remove_punctuation = re.sub(r'[^\w\s]','',remove_url)
    
    return remove_punctuation.lower()

In [8]:
data['cleaned_text'] = data['text'].apply(lambda x : clean_text(x))

In [9]:
print(f"Before text cleaning: \n{data.text[100]}")
print("\n")
print(f"After cleaning the text: \n{data.cleaned_text[100]}")

Before text cleaning: 
.@NorwayMFA #Bahrain police had previously died in a road accident they were not killed by explosion https://t.co/gFJfgTodad


After cleaning the text: 
norwaymfa bahrain police previously died road accident killed explosion 


### Corpus and Word Frequency Dictionaries

In [10]:
def create_freq_dict(string):
    freqency_dictionary = defaultdict(int)
    
    for word in string.split():
        if word not in freqency_dictionary:
            freqency_dictionary[word] = 1
        else:
            freqency_dictionary[word] += 1
            
    
    return freqency_dictionary

In [11]:
positive_corpus = ' '.join(text for text in data[data["target"] == 1]["cleaned_text"])
negative_corpus = ' '.join(text for text in data[data["target"] == 0]["cleaned_text"])

positive_frequency_dictionary = create_freq_dict(positive_corpus)
negative_frequency_dictionary = create_freq_dict(negative_corpus)

In [12]:
def frequency(freqency_dictionary,text):
    frequency = 0
    
    for word in text.split():
        frequency += freqency_dictionary[word]
        
    return frequency

In [13]:
data["positive_freq"] = data["cleaned_text"].apply(lambda text : frequency(positive_frequency_dictionary,text))
data["negative_freq"] = data["cleaned_text"].apply(lambda text : frequency(negative_frequency_dictionary,text))

data.head()

Unnamed: 0,id,keyword,location,text,target,cleaned_text,positive_freq,negative_freq
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,our deeds reason earthquake may allah forgive us,208,154
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada,307,100
2,5,,,All residents asked to 'shelter in place' are ...,1,all residents asked shelter place notified off...,177,143
3,6,,,"13,000 people receive #wildfires evacuation or...",1,13000 people receive wildfires evacuation orde...,282,109
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,just got sent photo ruby alaska smoke wildfire...,132,226


## Splitting the data to train set and validation set

In [16]:
def split_the_data(features,labels,split_size):
    
    train_size = int(len(features) * split_size)
    
    data = list(zip(features,labels))
    shuffle_data = random.sample(data,len(data))
    
    shuffle_labels = [label for feature,label in shuffle_data]
    shuffle_features = [feature for feature,label in shuffle_data]
    
    X_train = np.array(shuffle_features[:train_size])
    y_train = np.array(shuffle_labels[:train_size]).reshape((len(shuffle_labels[:train_size]),1))
    
    X_test = np.array(shuffle_features[train_size:])
    y_test = np.array(shuffle_labels[train_size:]).reshape((len(shuffle_labels[train_size:]),1))
    
    return X_train,X_test,y_train,y_test