In [17]:
import math
import pandas as pd
import tweet_utils as twu
from nltk.stem.porter import *
from collections import defaultdict

In [18]:
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

In [3]:
# model class
class Model:

    def __init__(self):
        self.prep = None
        self.probs = {}
        self.stemmer = None
        self.pos_prior = 0  # prior probability of having positive label

    def fit(self, x, y, prep=None, stemmer=None, frequency_threshold=3):
        """
        Fist the model.
        :param x: iterable of contents;
        :param y: iterable of 0 and 1 labels;
        :param prep: text preprocessing function that takes a string and
                returns its modification (removes numbers, converts to lower
                case, filters some words etc.);
        :param frequency_threshold: number of entries in which a word must
                appear to be included into the vocabulary;
        :param stemmer: stemmer function. No stemming is used if set to None.
        :return: None
        """

        self.prep = prep
        self.stemmer = stemmer

        for text, label in zip(x, y):
            if label:
                self.pos_prior += 1

            processed = text
            if prep:
                processed = prep(processed)

            words = set(processed.split())

            if stemmer:
                words = {stemmer(word) for word in words}

            for word in words:
                freq = self.probs.get(word, (0, 0))
                if label:
                    freq = (freq[0] + 1, freq[1])
                else:
                    freq = (freq[0], freq[1] + 1)
                self.probs[word] = freq

        # calculating the prior
        self.pos_prior /= len(x)

        # filter rare words out
        self.probs = {w: freq for w, freq in self.probs.items() if freq[0] + freq[1] >= frequency_threshold}

        # calculate the total number of times the words from vocab were used
        # in positive and negative texts
        pos_total = sum(f[0] for f in self.probs.values())
        neg_total = sum(f[1] for f in self.probs.values())

        # calculate the probability of a word appearing in a positive/negative text;
        # apply smoothing;
        vocab_size = len(self.probs)
        self.probs = {w: ((f[0] + 1) / (pos_total + vocab_size),
                          (f[1] + 1) / (neg_total + vocab_size)) for w, f in self.probs.items()}

    def predict(self, x, use_prior=True):
        processed = x

        if self.prep:
            processed = self.prep(processed)

        if self.stemmer:
            processed = self.stemmer(processed)

        pos_log_likelihood = math.log(self.pos_prior) if use_prior else 0
        neg_log_likelihood = math.log(1 - self.pos_prior) if use_prior else 0

        for word in set(processed.split()):
            if word in self.probs:
                pos_log_likelihood += math.log(self.probs[word][0])
                neg_log_likelihood += math.log(self.probs[word][1])


        return 1 if pos_log_likelihood > neg_log_likelihood else 0


In [4]:
model = Model()

In [20]:
stemmer = PorterStemmer()
def stem(x):
    return stemmer.stem(x)

model.fit(train_df.text, train_df.target, prep=twu.process_tweet, stemmer=stem)

In [21]:
sorted(model.probs.items())

[('!', (0.00017692581018278127, 0.0003782030630151413)),
 ('&amp', (0.0021231097221933756, 0.003151692191792844)),
 ('&gt', (8.846290509139064e-05, 0.00010805801800432607)),
 ('&gt;&gt', (4.423145254569532e-05, 0.00012606768767171376)),
 ('&lt', (8.846290509139064e-05, 3.6019339334775355e-05)),
 ('&lt;3', (4.423145254569532e-05, 0.00010805801800432607)),
 ("'a", (4.423145254569532e-05, 0.00010805768729681447)),
 ("'armageddon", (4.423145254569532e-05, 7.203867866955071e-05)),
 ("'avoid", (0.00017692185679842309, 1.8009338959876067e-05)),
 ("'california", (8.846290509139064e-05, 1.8009669667387678e-05)),
 ("'caus", (2.2115232099802886e-05, 7.203735583950427e-05)),
 ("'conclus", (0.000574996034594875, 1.8009338959876067e-05)),
 ("'death", (4.423145254569532e-05, 5.4029009002163036e-05)),
 ("'deton", (2.2115232099802886e-05, 9.004669479938034e-05)),
 ("'food", (0.00030962016781986725, 1.8009669667387678e-05)),
 ("'i", (6.634717881854299e-05, 0.00032417372330546666)),
 ("'i'm", (6.63471788

In [22]:
len(model.probs)

3910

In [23]:
model.pos_prior

0.4298475843678524

In [24]:
# check if the distribution is correct
sum(f[0] for f in model.probs.values()), sum(f[1] for f in model.probs.values())

(1.000000000000011, 0.9999999999999568)

In [25]:
min(f[0] for f in model.probs.values()), max(f[0] for f in model.probs.values())

(2.2115232099802886e-05, 0.04792477883326087)

In [26]:
min(f[1] for f in model.probs.values()), max(f[1] for f in model.probs.values())

(1.8009338959876067e-05, 0.032381386061963044)

In [27]:
model.predict("Weather is severe, it's getting stormy! The mayor has ordered the evacuation.", use_prior=True)

1

In [28]:
model.predict("The sky is just burning!")

0

In [38]:
model.predict("Just happened a terrible car crash!")

1

In [30]:
# let's see accuracy on the training set
correct = 0
for tweet, label in zip(train_df.text, train_df.target):
    if label == model.predict(tweet, use_prior=True):
        correct+=1

print("Training accuracy", correct / len(train_df))

Training accuracy 0.8163667411007487


In [16]:
print(model.pos_prior)
print(sum(train_df.target) / len(train_df.target))

0.4296597924602653
0.4296597924602653
