In [1]:
import pandas as pd
import numpy as np
import tweet_utils as twu
from collections import defaultdict

In [2]:
# Load the train and test splits from the CSV files under data/.
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

In [3]:
# model class
class Model:
    """
    Per-word class-conditional probability table for binary-labelled texts.

    After `fit`, `probs` maps every vocabulary word to a pair
    `(p_pos, p_neg)`: the add-one-smoothed probability of the word among
    positive texts and among negative texts respectively. Each text
    contributes at most one count per distinct word.
    """

    def __init__(self):
        # preprocessing function given to the last `fit` call (None if unused)
        self.prep = None
        # word -> (p_pos, p_neg); empty until `fit` has been called
        self.probs = {}
        # stemmer function given to the last `fit` call (None if unused)
        self.stemmer = None

    def fit(self, x, y, prep=None, stemmer=None, frequency_threshold=3):
        """
        Fit the model.

        Every call rebuilds `probs` from scratch, so fitting the same
        instance again never mixes in statistics from a previous fit.

        :param x: iterable of contents (strings);
        :param y: iterable of 0 and 1 labels (any truthy label counts as
                positive); `x` and `y` are paired with `zip`, so extra items
                in the longer one are ignored;
        :param prep: text preprocessing function that takes a string and
                returns its modification (removes numbers, converts to lower
                case, filters some words etc.). No preprocessing if None;
        :param stemmer: stemmer function applied to every distinct word of
                a text. No stemming is used if set to None;
        :param frequency_threshold: number of entries in which a word must
                appear to be included into the vocabulary.
        :return: None
        """

        self.prep = prep
        self.stemmer = stemmer

        # Count into a local table instead of `self.probs`: after an earlier
        # fit that attribute holds probabilities, and adding fresh counts on
        # top of them would corrupt the result.
        # Layout: word -> [texts with label 1, texts with label 0].
        counts = defaultdict(lambda: [0, 0])

        for text, label in zip(x, y):
            processed = prep(text) if prep else text

            # a set, so that a word is counted once per text
            words = set(processed.split())
            if stemmer:
                words = {stemmer(word) for word in words}

            column = 0 if label else 1
            for word in words:
                counts[word][column] += 1

        # filter rare words out
        vocab = {word: (pos, neg) for word, (pos, neg) in counts.items()
                 if pos + neg >= frequency_threshold}

        # total number of times the vocabulary words were used in positive
        # and in negative texts
        pos_total = sum(pos for pos, _ in vocab.values())
        neg_total = sum(neg for _, neg in vocab.values())

        # probability of a word appearing in a positive/negative text with
        # add-one smoothing; adding the vocabulary size to the denominator
        # keeps each of the two distributions summing to 1
        vocab_size = len(vocab)
        self.probs = {word: ((pos + 1) / (pos_total + vocab_size),
                             (neg + 1) / (neg_total + vocab_size))
                      for word, (pos, neg) in vocab.items()}




In [4]:
# Create an unfitted model; its probability table stays empty until fit() runs.
model = Model()

In [5]:
# Fit on the training tweets: `text` is the content column, `target` the 0/1 label.
# twu.process_tweet is the preprocessing hook; no stemmer is passed and the
# default frequency threshold is used.
model.fit(train_df.text, train_df.target, prep=twu.process_tweet)

In [6]:
# Preview the first vocabulary entries in sorted order instead of dumping the
# whole table (several thousand words) into the cell output.
sorted(model.probs.items())[:20]

[('!', (0.00018784193101505084, 0.00039876193912234394)),
 ('&amp', (0.00225410317218061, 0.0033230161593528662)),
 ('&gt', (9.392096550752542e-05, 0.00011393198260638399)),
 ('&gt;&gt', (4.696048275376271e-05, 0.00013292064637411465)),
 ('&lt', (9.392096550752542e-05, 3.797732753546133e-05)),
 ('&lt;3', (4.696048275376271e-05, 0.00011393198260638399)),
 ("'a", (4.696048275376271e-05, 9.494331883865333e-05)),
 ("'armageddon", (4.696048275376271e-05, 7.595465507092266e-05)),
 ("'avoiding", (0.00018784193101505084, 1.8988663767730665e-05)),
 ("'california", (9.392096550752542e-05, 1.8988663767730665e-05)),
 ("'cause", (2.3480241376881355e-05, 7.595465507092266e-05)),
 ("'conclusively", (0.0006104862757989152, 1.8988663767730665e-05)),
 ("'death", (4.696048275376271e-05, 5.696599130319199e-05)),
 ("'detonate", (2.3480241376881355e-05, 9.494331883865333e-05)),
 ("'food", (0.00032872337927633896, 1.8988663767730665e-05)),
 ("'i", (7.044072413064406e-05, 0.0003228072840514213)),
 ("'i'm", (7

In [7]:
# Vocabulary size, i.e. the number of words that passed the frequency threshold.
len(model.probs)

4438

In [8]:
# sanity check: the positive and the negative probabilities should each sum to ~1
tuple(sum(f[i] for f in model.probs.values()) for i in (0, 1))

(1.000000000000032, 0.9999999999999784)

In [9]:
# smallest and largest positive-class word probability
tuple(agg(f[0] for f in model.probs.values()) for agg in (min, max))

(2.3480241376881355e-05, 0.02409072765268027)

In [10]:
# smallest and largest negative-class word probability
tuple(agg(f[1] for f in model.probs.values()) for agg in (min, max))

(1.8988663767730665e-05, 0.026318287982074702)