In [62]:
import math
import pandas as pd
import tweet_utils as twu
from nltk.stem.porter import *
from collections import defaultdict

In [63]:
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

In [64]:
# model class
class Model:

    def __init__(self):
        self.prep = None
        self.probs = {}
        self.stemmer = None
        self.pos_prior = 0  # prior probability of having positive label

    def fit(self, x, y, prep=None, stemmer=None, frequency_threshold=3, stopwords=None):
        """
        Fist the model.
        :param x: iterable of contents;
        :param y: iterable of 0 and 1 labels;
        :param prep: text preprocessing function that takes a string and
                returns its modification (removes numbers, converts to lower
                case, filters some words etc.);
        :param frequency_threshold: number of entries in which a word must
                appear to be included into the vocabulary;
        :param stemmer: stemmer function. No stemming is used if set to None.
        :param stopwords: set of words that are to be ignored
        :return: None
        """

        self.prep = prep
        self.stemmer = stemmer

        for text, label in zip(x, y):
            if label:
                self.pos_prior += 1

            processed = text
            if prep:
                processed = prep(processed)

            words = set(processed.split())

            if stopwords:
                words -= stopwords

            if stemmer:
                words = {stemmer(word) for word in words}

            for word in words:
                freq = self.probs.get(word, (0, 0))
                if label:
                    freq = (freq[0] + 1, freq[1])
                else:
                    freq = (freq[0], freq[1] + 1)
                self.probs[word] = freq

        # calculating the prior
        self.pos_prior /= len(x)

        # filter rare words out
        self.probs = {w: freq for w, freq in self.probs.items() if freq[0] + freq[1] >= frequency_threshold}

        # calculate the total number of times the words from vocab were used
        # in positive and negative texts
        pos_total = sum(f[0] for f in self.probs.values())
        neg_total = sum(f[1] for f in self.probs.values())

        # calculate the probability of a word appearing in a positive/negative text;
        # apply smoothing;
        vocab_size = len(self.probs)
        self.probs = {w: ((f[0] + 1) / (pos_total + vocab_size),
                          (f[1] + 1) / (neg_total + vocab_size)) for w, f in self.probs.items()}

    def predict(self, x, use_prior=True):
        processed = x

        if self.prep:
            processed = self.prep(processed)

        if self.stemmer:
            processed = self.stemmer(processed)

        pos_log_likelihood = math.log(self.pos_prior) if use_prior else 0
        neg_log_likelihood = math.log(1 - self.pos_prior) if use_prior else 0

        for word in set(processed.split()):
            if word in self.probs:
                pos_log_likelihood += math.log(self.probs[word][0])
                neg_log_likelihood += math.log(self.probs[word][1])


        return 1 if pos_log_likelihood > neg_log_likelihood else 0


In [65]:
model = Model()

In [66]:
stemmer = PorterStemmer()
def stem(x):
    return stemmer.stem(x)

model.fit(train_df.text, train_df.target, prep=twu.process_tweet, stemmer=stem)

In [67]:
sorted(model.probs.items())

[('!', (0.0001769246080014154, 0.0003782012030400173)),
 ('&amp', (0.0021230952960169846, 0.003151676692000144)),
 ('&gt', (8.84623040007077e-05, 0.00010805748658286208)),
 ('&gt;&gt', (4.423115200035385e-05, 0.00012606706768000578)),
 ('&lt', (8.84623040007077e-05, 3.601916219428736e-05)),
 ('&lt;3', (4.423115200035385e-05, 0.00010805748658286208)),
 ("'a", (4.423115200035385e-05, 0.00010805748658286208)),
 ("'armageddon", (4.423115200035385e-05, 7.203832438857472e-05)),
 ("'avoid", (0.0001769246080014154, 1.800958109714368e-05)),
 ("'california", (8.84623040007077e-05, 1.800958109714368e-05)),
 ("'caus", (2.2115576000176925e-05, 7.203832438857472e-05)),
 ("'conclus", (0.0005750049760046, 1.800958109714368e-05)),
 ("'death", (4.423115200035385e-05, 5.402874329143104e-05)),
 ("'deton", (2.2115576000176925e-05, 9.00479054857184e-05)),
 ("'food", (0.00030961806400247693, 1.800958109714368e-05)),
 ("'i", (6.634672800053077e-05, 0.0003241724597485863)),
 ("'i'm", (6.634672800053077e-05, 0.

In [68]:
len(model.probs)

3910

In [69]:
model.pos_prior

0.4296597924602653

In [70]:
# check if the distribution is correct
sum(f[0] for f in model.probs.values()), sum(f[1] for f in model.probs.values())

(1.00000000000004, 1.0000000000000113)

In [71]:
min(f[0] for f in model.probs.values()), max(f[0] for f in model.probs.values())

(2.2115576000176925e-05, 0.04792445319238339)

In [72]:
min(f[1] for f in model.probs.values()), max(f[1] for f in model.probs.values())

(1.800958109714368e-05, 0.03238122681266434)

In [73]:
model.predict("Weather is severe, it's getting stormy! The mayor has ordered the evacuation.", use_prior=True)

1

In [74]:
model.predict("The sky is just burning!")

0

In [75]:
model.predict("Just happened a terrible car crash!")

1

In [76]:
# let's see accuracy on the training set
correct = 0
for tweet, label in zip(train_df.text, train_df.target):
    if label == model.predict(tweet, use_prior=True):
        correct+=1

print("Training accuracy", correct / len(train_df))

Training accuracy 0.8163667411007487


In [77]:
print(model.pos_prior)
print(sum(train_df.target) / len(train_df.target))

0.4296597924602653
0.4296597924602653


## Making predictions

In [104]:
predictions_df = pd.DataFrame(columns=['id', 'target'])
for id, tweet in zip(test_df.id, test_df.text):
    predictions_df.loc[len(predictions_df.index)] = [id, model.predict(tweet)]

In [110]:
predictions_df.head(50)

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,1
3,9,1
4,11,1
5,12,1
6,21,0
7,22,0
8,27,0
9,29,0


In [125]:
predictions_df.to_csv('predictions/naive_bayes_first.csv', index=False)