In [1]:
import math
import pandas as pd
import tweet_utils as twu
from collections import defaultdict

In [2]:
# Read the training and test splits from the local data directory.
train_df, test_df = map(pd.read_csv, ('data/train.csv', 'data/test.csv'))

In [3]:
# model class
class Model:
    """
    Naive Bayes style binary text classifier built on word presence.

    Every text is reduced to the set of its distinct words, so a word is
    counted at most once per text. Words that are not part of the fitted
    vocabulary are ignored when predicting.
    """

    def __init__(self):
        self.prep = None      # optional text preprocessing callable
        self.probs = {}       # word -> (P(word | positive), P(word | negative)), smoothed
        self.stemmer = None   # optional per-word stemming callable
        self.pos_prior = 0  # prior probability of having positive label

    @staticmethod
    def _tokenize(text, prep, stemmer):
        """Return the set of distinct (optionally preprocessed and stemmed) words of a text."""
        if prep:
            text = prep(text)

        words = set(text.split())

        if stemmer:
            # The stemmer works on single words, so it is applied word by word.
            words = {stemmer(word) for word in words}

        return words

    @staticmethod
    def _log(p):
        """Natural logarithm that maps a zero probability to -inf instead of raising."""
        return math.log(p) if p > 0 else -math.inf

    def fit(self, x, y, prep=None, stemmer=None, frequency_threshold=3):
        """
        Fit the model.

        Any state left over from a previous call is discarded, so the method
        can be called repeatedly on the same instance.

        :param x: iterable of contents (any iterable, including generators);
        :param y: iterable of 0 and 1 labels;
        :param prep: text preprocessing function that takes a string and
                returns its modification (removes numbers, converts to lower
                case, filters some words etc.);
        :param frequency_threshold: number of entries in which a word must
                appear to be included into the vocabulary;
        :param stemmer: stemmer function applied to every single word.
                No stemming is used if set to None.
        :raises ValueError: if no (text, label) pair is provided.
        :return: None
        """

        # word -> [number of positive texts, number of negative texts] containing it
        counts = {}
        n_texts = 0
        n_positive = 0

        for text, label in zip(x, y):
            # Counted here instead of len(x) so that generators are supported.
            n_texts += 1
            if label:
                n_positive += 1

            column = 0 if label else 1
            for word in self._tokenize(text, prep, stemmer):
                counts.setdefault(word, [0, 0])[column] += 1

        if n_texts == 0:
            raise ValueError("cannot fit the model on an empty training set")

        # filter rare words out
        vocab = {w: c for w, c in counts.items() if c[0] + c[1] >= frequency_threshold}

        # calculate the total number of times the words from vocab were used
        # in positive and negative texts
        pos_total = sum(c[0] for c in vocab.values())
        neg_total = sum(c[1] for c in vocab.values())

        # calculate the probability of a word appearing in a positive/negative text;
        # apply add-one smoothing so that no word gets a zero probability;
        vocab_size = len(vocab)

        # State is assigned only after everything succeeded, replacing (not
        # extending) whatever an earlier fit stored.
        self.prep = prep
        self.stemmer = stemmer
        self.pos_prior = n_positive / n_texts
        self.probs = {w: ((c[0] + 1) / (pos_total + vocab_size),
                          (c[1] + 1) / (neg_total + vocab_size)) for w, c in vocab.items()}

    def predict(self, x, use_prior=True):
        """
        Predict the label of a single text.

        :param x: text to classify;
        :param use_prior: whether the class prior learned in ``fit`` is added
                to the log-likelihoods;
        :return: 1 if the positive class has the strictly larger
                log-likelihood, 0 otherwise (ties go to 0).
        """
        # Tokenize exactly the way fit does, so that stemmed vocabulary
        # entries can actually be matched.
        words = self._tokenize(x, self.prep, self.stemmer)

        if use_prior:
            # A prior of exactly 0 or 1 (single-class training data, or a model
            # that was never fitted) yields -inf for the impossible class
            # instead of a math domain error.
            pos_log_likelihood = self._log(self.pos_prior)
            neg_log_likelihood = self._log(1 - self.pos_prior)
        else:
            pos_log_likelihood = 0
            neg_log_likelihood = 0

        for word in words:
            if word in self.probs:
                pos_prob, neg_prob = self.probs[word]
                pos_log_likelihood += math.log(pos_prob)
                neg_log_likelihood += math.log(neg_prob)

        return 1 if pos_log_likelihood > neg_log_likelihood else 0


In [4]:
# Create an untrained classifier (empty vocabulary, prior of 0).
model = Model()

In [5]:
# Train on the tweet text and target columns, preprocessing each tweet with
# twu.process_tweet; no stemmer is passed and frequency_threshold keeps its default of 3.
model.fit(train_df.text, train_df.target, prep=twu.process_tweet)

In [6]:
# Peek at the start of the alphabetically sorted vocabulary. Each value is the
# smoothed (P(word | positive), P(word | negative)) pair. Only the first entries
# are shown so the cell output stays readable instead of listing every word.
sorted(model.probs.items())[:20]

[('!', (0.0001787629603146228, 0.0003856253557852985)),
 ('&amp', (0.0021451555237754737, 0.0032135446315441542)),
 ('&gt', (8.93814801573114e-05, 0.00011017867308151385)),
 ('&gt;&gt', (4.46907400786557e-05, 0.00012854178526176618)),
 ('&lt', (8.93814801573114e-05, 3.6726224360504615e-05)),
 ('&lt;3', (4.46907400786557e-05, 0.00011017867308151385)),
 ("'a", (4.46907400786557e-05, 9.181556090126154e-05)),
 ("'armageddon", (4.46907400786557e-05, 7.345244872100923e-05)),
 ("'avoiding", (0.0001787629603146228, 1.8363112180252308e-05)),
 ("'california", (8.93814801573114e-05, 1.8363112180252308e-05)),
 ("'cause", (2.234537003932785e-05, 7.345244872100923e-05)),
 ("'conclusively", (0.0005809796210225241, 1.8363112180252308e-05)),
 ("'death", (4.46907400786557e-05, 5.5089336540756926e-05)),
 ("'detonate", (2.234537003932785e-05, 9.181556090126154e-05)),
 ("'food", (0.0003128351805505899, 1.8363112180252308e-05)),
 ("'i", (6.703611011798355e-05, 0.00031217290706428923)),
 ("'i'm", (6.70361101

In [7]:
# Number of words kept in the vocabulary after rare-word filtering.
len(model.probs)

4439

In [8]:
# Prior probability of the positive label, as computed by fit.
model.pos_prior

0.4296597924602653

In [9]:
# Sanity check: each class-conditional word distribution should sum to ~1.
tuple(sum(pair[cls] for pair in model.probs.values()) for cls in (0, 1))

(0.9999999999999682, 0.9999999999999859)

In [10]:
# Smallest and largest smoothed P(word | positive) over the vocabulary.
tuple(agg(pos for pos, _neg in model.probs.values()) for agg in (min, max))

(2.234537003932785e-05, 0.04842241687522345)

In [11]:
# Smallest and largest smoothed P(word | negative) over the vocabulary.
tuple(agg(neg for _pos, neg in model.probs.values()) for agg in (min, max))

(1.8363112180252308e-05, 0.03301687570009365)

In [12]:
# Classify a hand-written example; use_prior=True is also predict's default.
model.predict("Weather is severe, it's getting stormy! The mayor has ordered the evacuation.", use_prior=True)

1

In [13]:
# Another hand-written example, classified with the default settings (prior included).
model.predict("The sky is just burning!")

0

In [14]:
# Two near-identical tweets that differ only in one hashtag (#sleepy vs #sleep).
tuple(
    model.predict(tweet)
    for tweet in ("@morty5577, wake up! #sleepy #doll", "@morty5577, wake up! #sleep #doll")
)

(0, 0)

In [15]:
# Training-set accuracy: share of tweets whose predicted label equals the true label.
correct = sum(
    1
    for tweet, label in zip(train_df.text, train_df.target)
    if label == model.predict(tweet, use_prior=True)
)

print("Training accuracy", correct / len(train_df))

Training accuracy 0.8455273873637199


In [16]:
# The fitted prior, followed by the empirical share of positive labels in the training set.
print(model.pos_prior, sum(train_df.target) / len(train_df.target), sep="\n")

0.4296597924602653
0.4296597924602653
