In [None]:
%pip install pandas

In [2]:
import os

DIR_DATA = os.path.join("..", "data")
PATH_DATA_TRAIN = os.path.join(DIR_DATA, "train.csv")

In [3]:
import pandas as pd

In [50]:
df_train = pd.read_csv(PATH_DATA_TRAIN)

NUM_SETS = 5

len_chunk = len(df_train) / NUM_SETS

train_sets = [None] * NUM_SETS
val_sets = [None] * NUM_SETS

df_shuffled = df_train.sample(frac=1)

for i in range(5):
    start = round(i * len_chunk)
    end = round((i + 1) * len_chunk)
    print(f'start: {start}, end: {end}, total_len: {len(df_train)}')
    val_sets[i] = df_train.iloc[start:end,:]
    train_sets[i] = df_train[~df_train.index.isin(val_sets[i].index)]


start: 0, end: 1523, total_len: 7613
start: 1523, end: 3045, total_len: 7613
start: 3045, end: 4568, total_len: 7613
start: 4568, end: 6090, total_len: 7613
start: 6090, end: 7613, total_len: 7613


In [85]:
MARKOV_ORDER = 5

In [84]:
from collections import defaultdict
import math
import sys

SMOOTHING_WEIGHT = 1
NUM_CHARS = 95

class MarkovModel:
    order: int
    ngram_freqs: list[dict]
    totals: list[int]

    def __init__(self, order: int, sequences: list[str] = []):
        assert(order >= 0)

        self.order = order
        self.ngram_freqs = [defaultdict(int)] * (order + 1)
        self.totals = [0] * (order + 1)

        if (len(sequences) > 0):
            i = 0
            for seq in sequences:
                i += 1
                print(f'Processing sequence {i:>{math.floor(math.log(len(sequences), 10)) + 1}}/{len(sequences)} for {order} orders', end='\r')
                for n in range(0, order + 1):
                    for pos in range(len(seq) - n):
                        ngram = seq[pos:pos+n+1]
                        self.add_ngram_occurrence(ngram)
            print()

    def __str__(self):
        return f'{self.order}-Order Markov NGram Object - Totals: {self.totals}'
    
    def __check_len(self, l):
        if l > self.order + 1 or l == 0:
            raise ValueError(f'Invalid NGram - Length of ngram must be between 1 and {self.order + 1} for an order {self.order} NGrams object (is {l})')

    def add_ngram_occurrence(self, ngram : str):
        self.__check_len(len(ngram))
        self.ngram_freqs[len(ngram) - 1][ngram] += 1
        self.totals[len(ngram) - 1] += 1

    def get_total_occurrences(self, ngram : str):
        self.__check_len(len(ngram))
        return self.ngram_freqs[len(ngram) - 1][ngram]
    
    def get_log_cond_prob(self, ngram : str):
        self.__check_len(len(ngram))

        top = self.get_total_occurrences(ngram) + SMOOTHING_WEIGHT
        bot = self.totals[0] if (len(ngram) == 1) else self.get_total_occurrences(ngram[:-1])
        bot += SMOOTHING_WEIGHT * ((len(ngram) - 1) ** NUM_CHARS)
        
        return math.log(top / bot) if top > 0 else sys.minint
    
    def get_log_sum_prob(self, seq : str):
        if (len(seq) == 0):
            raise ValueError(f'Invalid Sequence - Cannot get probability for an empty string')

        start = 0
        end = 1
        sum = 0

        while (end <= len(seq)):
            sum += self.get_log_cond_prob(seq[start:end])
            end += 1

            # ngrams can be at most order + 1 characters long:
            if (end - start > self.order + 1):
                start += 1

        return sum


In [86]:
texts_pos = (df_train.loc[df_train['target'] == 1])['text'].to_list()
texts_neg = (df_train.loc[df_train['target'] == 0])['text'].to_list()

texts_lower_pos = map(lambda string: string.lower(), texts_pos)
texts_lower_neg = map(lambda string: string.lower(), texts_neg)

model_pos = MarkovModel(MARKOV_ORDER, texts_pos)
model_neg = MarkovModel(MARKOV_ORDER, texts_neg)


Processing sequence 3271/3271 for 5 orders
Processing sequence 4342/4342 for 5 orders


In [66]:
print(model_neg.get_total_occurrences('hello'))
print(model_neg.get_total_occurrences('today'))
print(model_neg.get_total_occurrences('https'))
print(model_neg.get_total_occurrences('http'))
print(model_pos.get_log_cond_prob('hello'))
print(model_pos.get_log_cond_prob('today'))
print(model_pos.get_log_cond_prob('https'))
print(model_pos.get_log_cond_prob('http'))


4
44
277
2204
-131.69796430638962
-127.86932290990052
-126.78530942065356
-96.53655014711781


In [99]:
def predict(sequence: str):
    pos = model_pos.get_log_sum_prob(sequence)
    neg = model_neg.get_log_sum_prob(sequence)

    print(f'The sentence is likely ', end='')
    if (pos < neg):
        print('not ', end='')
    print(f'about a real disaster. (\'{sequence}\')')
    
predict("They should all die! All of them! Everything annihilated!")
predict("@sakuma_en If you pretend to feel a certain way the feeling can become genuine all by accident. -Hei (Darker than Black) #manga #anime")
predict("Shot 12 times. Found dead in cuffs after being involved in a car accident. Officers told ambulance not to treat him. https://t.co/MEUDJwaaNg")
predict("@NinaHoag - 'if you shred my Psych work our friendship would be annihilated")
predict("Why should a helicopter ambulance ride to transfer to a hospital 21 miles away cost $29800?")
predict("If you build an army of 100 lions and their leader is a dog in any fight the lions will die like a dog.")
predict("One Direction Is my pick for http://t.co/q2eBlOKeVE Fan Army #Directioners http://t.co/eNCmhz6y34 x1424")


The sentence is likely not about a real disaster. ('They should all die! All of them! Everything annihilated!')
The sentence is likely not about a real disaster. ('@sakuma_en If you pretend to feel a certain way the feeling can become genuine all by accident. -Hei (Darker than Black) #manga #anime')
The sentence is likely about a real disaster. ('Shot 12 times. Found dead in cuffs after being involved in a car accident. Officers told ambulance not to treat him. https://t.co/MEUDJwaaNg')
The sentence is likely not about a real disaster. ('@NinaHoag - 'if you shred my Psych work our friendship would be annihilated')
The sentence is likely about a real disaster. ('Why should a helicopter ambulance ride to transfer to a hospital 21 miles away cost $29800?')
The sentence is likely not about a real disaster. ('If you build an army of 100 lions and their leader is a dog in any fight the lions will die like a dog.')
The sentence is likely not about a real disaster. ('One Direction Is my pick f

[] only ascii chars?
[] remove links?
[] keep or remove hashtags