In [1]:
import pandas as pd
import numpy as np
import random

In [2]:
np.random.seed(180)
random.seed(180)

#### [Natural Language Processing with Disaster Tweets Dataset](https://www.kaggle.com/competitions/nlp-getting-started/overview)

In [3]:
df_train = pd.read_csv("./data/train.csv")
df_test = pd.read_csv("./data/test.csv")

In [4]:
df_train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
df_train["target"].value_counts(normalize=True)

target
0    0.57034
1    0.42966
Name: proportion, dtype: float64

In [6]:
text_id = df_test["id"].tolist()
test_x = df_test["text"].tolist()

In [7]:
df_0 = df_train[df_train["target"] == 0]["text"].tolist()
df_1 = df_train[df_train["target"] == 1]["text"].tolist()

train_x = df_0 + df_1
train_y = [0]*len(df_0) + [1]*len(df_1)

##### **Pre-processing text methods**

1. `process_tweet()`: cleans the text, tokenizes it into separate words, removes stopwords, and converts words to stems.

2. `count_tweets()`: takes a list of tweets as input, `process_tweet()` all of them, and returns a dictionary.
  * The key in the dictionary is a tuple containing the semmed word and its class label, e.g. ("happi",1).
  * The value the number of times this word appears in the given collection of tweets (an integer).

In [8]:
import nltk
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yveem\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
import re
import string
import numpy as np

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

def process_tweet(tweet):
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words("english")
    # remove stock market tickers like $GE
    tweet = re.sub(r"\$\w*", "", tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r"^RT[\s]+", "", tweet)
    # remove hyperlinks
    tweet = re.sub(r"https?:\/\/.*[\r\n]*", "", tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r"#", "", tweet)
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_english and word not in string.punctuation):
            stem_word = stemmer.stem(word)
            tweets_clean.append(stem_word)

    return tweets_clean

def lookup(freqs, word, label):
    n = 0

    pair = (word, label)
    if (pair in freqs):
        n = freqs[pair]

    return n

def count_tweets(tweets, ys):
    result = {}
    for y, tweet in zip(ys, tweets):
        for word in process_tweet(tweet):
            pair = (word,y)
            if pair in result:
                result[pair] += 1
            else:
                result[pair] = 1

    return result

In [10]:
process_tweet("""
For those who have been waiting for this scene...

Now, a mass exodus of settlers from northern occupied Palestine have left their homes burning and are fleeing.
""")

['wait',
 'scene',
 '...',
 'mass',
 'exodu',
 'settler',
 'northern',
 'occupi',
 'palestin',
 'left',
 'home',
 'burn',
 'flee']

In [11]:
tweets = ["i am happy", "i am tricked", "i am sad", "i am tired", "i am tired"]
ys = [1, 0, 0, 0, 0]
count_tweets(tweets, ys)

{('happi', 1): 1, ('trick', 0): 1, ('sad', 0): 1, ('tire', 0): 2}

#### **Naive Bayes**
Naive Bayes is a probabilistic algorithm used to classify documents based on the likelihood of certain features (e.g., words) appearing in positive or negative contexts. Here’s an overview of its key components:

1. **How to classify documents?**

- $P(D_{pos})=\frac{D_{pos}}{D}$: Probability the document describes positive for disaster.

- $P(D_{neg})=\frac{D_{neg}}{D}$: Probability the document describes negative for disaster.

2. **Positive and Negative Probability of a Word**

- To properly compute the probability of a word, the Laplace smoothing is used to overcome the problem of zero probability: ***If query point contains a new observation, which is not yet seen in training data while calculating probabilities.*** [Reference for Additive Smoothing!](https://towardsdatascience.com/laplace-smoothing-in-na%C3%AFve-bayes-algorithm-9c237a8bdece)

$$
P(W_{pos})=\frac{freq_{pos} + 1}{N_{pos}+V}
$$

$$
P(W_{neg})=\frac{freq_{neg} + 1}{N_{neg}+V}
$$

4. **Log Likelihood**

- To compare the probabilities of a word belonging to positive or negative contexts, we compute the **log-likelihood ratio**. This helps quantify how strongly a word is associated with a particular class. The formula is:

$$
loglikelihood = log(\frac{P(W_{pos})}{P(W_{neg})})
$$

- The likelihood function represents the joint probability (or probability density) of observed data, viewed as a function of the model parameters. Here, the log transformation simplifies calculations and provides a clearer measure of relative importance.

In [12]:
def train_naive_bayes(train_x, train_y):
    freqs = count_tweets(train_x, train_y)

    loglikelihood = {}
    logprior = 0

    vocab = set([pair[0] for pair in freqs.keys()])
    V = len(vocab)
    N_pos = N_neg = 0
    for word, category in freqs.keys():
        if category == 1:
            N_pos += freqs[(word, category)]
        else:
            N_neg += freqs[(word, category)]

    D_pos = (len(list(filter(lambda x: x == 1, train_y))))
    D_neg = (len(list(filter(lambda x: x == 0, train_y))))
    logprior = np.log(D_pos) - np.log(D_neg)

    for word in vocab:
        freq_pos = lookup(freqs, word, 1)
        freq_neg = lookup(freqs, word, 0)

        p_w_pos = (freq_pos + 1) / (N_pos + V)
        p_w_neg = (freq_neg + 1) / (N_neg + V)

        loglikelihood[word] = np.log(p_w_pos/p_w_neg)

    return freqs, logprior, loglikelihood

In [13]:
D_pos = (len(list(filter(lambda x: x == 1, train_y))))
D_neg = (len(list(filter(lambda x: x == 0, train_y))))
logprior = np.log(D_pos) - np.log(D_neg)

In [14]:
freqs, logprior, loglikelihood = train_naive_bayes(train_x, train_y)
print(f"Log Prior: {logprior}")
print("\nLog Likelihood:")
for word, value in list(loglikelihood.items())[:5]:
    print(f"  {word}: {value}")

Log Prior: -0.28323932289985443

Log Likelihood:
  elven: -0.5907888505631518
  that: -0.1853237424549875
  oi: 0.7955055105567389
  tape: -0.3031067781113708
  ukrain: 1.2009706186649032


#### **Making predictions with words likelihood!**

##### Given a tweet, what is the likelihood of being a disaster description?

In [15]:
def naive_bayes_predict(tweet, logprior, loglikelihood):
    word_l = process_tweet(tweet)
    p = 0
    p += logprior

    for word in word_l:
        if word in loglikelihood:
            p += loglikelihood[word]

    return p

In [16]:
my_tweet = """
For those who have been waiting for this scene...

Now, a mass exodus of settlers from northern occupied Palestine have left their homes burning and are fleeing.
"""
p = naive_bayes_predict(my_tweet, logprior, loglikelihood)
print("The expected output is", p)

The expected output is 9.17602148915039


In [17]:
my_tweet = """
My life is good now that I found you! I'm happy with u.
"""
p = naive_bayes_predict(my_tweet, logprior, loglikelihood)
print("The expected output is", p)

The expected output is -3.5248688476543677


In [18]:
preds = []
ids = []
for _, row in df_test.iterrows():
  tweet = row["text"]
  ids.append(row["id"])
  if naive_bayes_predict(tweet, logprior, loglikelihood) > 0:
    preds.append(1)
  else:
    preds.append(0)

In [19]:
df_submit = pd.DataFrame({"id": ids, "target": preds})

In [20]:
df_submit.head()

Unnamed: 0,id,prediction
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1


In [21]:
df_submit.to_csv("./results/naive_bayes.csv", index=False)