# 1. Import Libraries

In [1]:
import numpy as np 
import pandas as pd 
import nltk
import matplotlib.pyplot as plt
import random
import re
import string
import pickle
from nltk.corpus import stopwords # module for stop words
from nltk.stem import PorterStemmer # module for stemming
from nltk.tokenize import TweetTokenizer # module for tokenizing strings

In [2]:
# fetch the NLTK stop-word corpus needed by the cleaning step below
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# 2. Data Cleaning

## 2.1. Explore the dataset
Compare the number of training samples with label 1 (related to a disaster) against the number with label 0 (not related to a disaster).

In [3]:
# load the training split of the competition data
train_df = pd.read_csv("/kaggle/input/nlpgettingstarted/train.csv")

# class balance: count the rows for each value of the "target" column
print("number of disaster training sample:", int((train_df["target"] == 1).sum()))
print("number of not disaster training sample:", int((train_df["target"] == 0).sum()))

number of disaster training sample: 3271
number of not disaster training sample: 4342


## 2.2. Data Cleaning and Tokenizing
1. Remove the hash sign (`#`) from hashtags, keeping the tagged word
2. Remove hyperlink
3. Remove any word that starts with @
4. Tokenize the text
5. Remove stop words
6. Remove punctuation
7. Stemming

In [4]:
# Tweet-aware tokenizer: splits a string into a list of tokens and,
# because preserve_case is False, lowercases the words
tokenizer = TweetTokenizer(
    preserve_case=False,
    strip_handles=True,
    reduce_len=True,
)

# Porter stemmer used to reduce each kept token to its stem
stemmer = PorterStemmer()

In [5]:
_ENGLISH_STOP_WORDS = None  # filled with a frozenset on first use


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, built on first use."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        # Built lazily so that defining this cell does not require the corpus;
        # a frozenset makes each membership test O(1) instead of a list scan.
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def clean_tokenize(text):
    """Clean a tweet and turn it into a list of stemmed tokens.

    Steps, in order: drop '#' signs, drop http(s) links, drop @mentions,
    tokenize with the notebook-level `tokenizer`, discard stop words and
    punctuation tokens, then stem what is left with the notebook-level
    `stemmer`.

    Input:
        text: the text of the tweet (a string)
    Output:
        clean_token: a list of words containing the processed tweet
    """
    # remove hashtags: only the '#' sign is dropped, the tagged word is kept
    text = re.sub(r'#', '', text)

    # remove hyperlinks
    text = re.sub(r'https?://[^\s\n\r]+', '', text)

    # remove @mentions. Match up to the next whitespace WITHOUT consuming it,
    # so the words on either side are never fused together (the previous
    # pattern '@.*? ' turned "foo@ bar" into "foobar") and a mention followed
    # by a tab, a newline or the end of the text is handled as well.
    text = re.sub(r'@\S*', '', text)

    # Tokenize the text, which will also lowercase the words
    text_token = tokenizer.tokenize(text)

    # Look the stop words up once per call instead of once per token
    stop_words = _english_stop_words()

    clean_token = []
    for word in text_token:
        if word in stop_words:
            continue  # stop word
        # string.punctuation is a str, so this is a substring test: single
        # punctuation characters are dropped, while multi-character tokens
        # such as '...' are kept.
        if word in string.punctuation:
            continue  # punctuation
        clean_token.append(stemmer.stem(word))
    return clean_token
# tokenize every tweet and store the token list next to the raw text
train_df['Remove_Hash_Link_At'] = train_df['text'].apply(clean_tokenize)
# preview a few rows to sanity-check the cleaning
print(train_df.iloc[30:35])

    id keyword                       location  \
30  44     NaN                            NaN   
31  48  ablaze                     Birmingham   
32  49  ablaze  Est. September 2012 - Bristol   
33  50  ablaze                         AFRICA   
34  52  ablaze               Philadelphia, PA   

                                                 text  target  \
30                                           The end!       0   
31  @bbcmtd Wholesale Markets ablaze http://t.co/l...       1   
32  We always try to bring the heavy. #metal #RT h...       0   
33  #AFRICANBAZE: Breaking news:Nigeria flag set a...       1   
34                 Crying out for more! Set me ablaze       0   

                                  Remove_Hash_Link_At  
30                                              [end]  
31                          [wholesal, market, ablaz]  
32              [alway, tri, bring, heavi, metal, rt]  
33  [africanbaz, break, news, nigeria, flag, set, ...  
34                                

# 3. Build Word Dictionary
The word dictionary uses each ($word_i$, $label_i$) pair as a key and the number of times that pair occurs in the training set as its value.

In [6]:
def build_word_dict(label_arr, token_word_arr):
    """Count how often each (word, label) pair occurs.

    Input:
        label_arr: a sequence (list, array or Series) of labels, one per sample
        token_word_arr: a sequence of lists of tokenized words, in the same
            order as label_arr
    Output:
        word_dict: a dictionary mapping each (word, label) pair to its frequency
    Raises:
        ValueError: if label_arr and token_word_arr differ in length
    """
    labels = list(label_arr)
    token_lists = list(token_word_arr)
    if len(labels) != len(token_lists):
        raise ValueError(
            "label_arr and token_word_arr must have the same length "
            "(got %d and %d)" % (len(labels), len(token_lists))
        )

    word_dict = {}
    # Pair the inputs by position. Indexing token_word_arr with 0..n-1 is a
    # label lookup on a pandas Series and breaks (or misaligns) as soon as
    # the Series does not carry the default 0..n-1 index.
    for label, tokens in zip(labels, token_lists):
        for word in tokens:
            key = (word, label)
            word_dict[key] = word_dict.get(key, 0) + 1
    return word_dict
        
    
        
word_dict = build_word_dict(
    label_arr=train_df['target'],
    token_word_arr=train_df['Remove_Hash_Link_At'],
)
    
    

In [7]:
# size of the dictionary, i.e. the number of distinct (word, label) keys
print("number of words:", len(word_dict))
# show the first ten (key, count) entries
print("Output Example:", [entry for entry in word_dict.items()][:10])

number of words: 15893
Output Example: [(('deed', 1), 1), (('reason', 1), 8), (('earthquak', 1), 47), (('may', 1), 50), (('allah', 1), 6), (('forgiv', 1), 1), (('us', 1), 49), (('forest', 1), 50), (('fire', 1), 271), (('near', 1), 49)]


In [8]:
# write the (word, label) frequency dictionary to disk as a pickle
with open('/kaggle/working/word_dict.pickle', mode='wb') as pickle_file:
    pickle.dump(word_dict, pickle_file)

Reference: [Natural Language Processing with Disaster Tweets](https://www.kaggle.com/c/nlp-getting-started/overview)