In [None]:
%pip install pandas

In [2]:
import os

DIR_DATA = os.path.join("..", "data")
PATH_DATA_TRAIN = os.path.join(DIR_DATA, "train.csv")

In [3]:
import pandas as pd

In [50]:
df_train = pd.read_csv(PATH_DATA_TRAIN)

NUM_SETS = 5

len_chunk = len(df_train) / NUM_SETS

train_sets = [None] * NUM_SETS
val_sets = [None] * NUM_SETS

df_shuffled = df_train.sample(frac=1)

for i in range(5):
    start = round(i * len_chunk)
    end = round((i + 1) * len_chunk)
    print(f'start: {start}, end: {end}, total_len: {len(df_train)}')
    val_sets[i] = df_train.iloc[start:end,:]
    train_sets[i] = df_train[~df_train.index.isin(val_sets[i].index)]


start: 0, end: 1523, total_len: 7613
start: 1523, end: 3045, total_len: 7613
start: 3045, end: 4568, total_len: 7613
start: 4568, end: 6090, total_len: 7613
start: 6090, end: 7613, total_len: 7613


In [76]:
MARKOV_ORDER = 5

In [77]:
from collections import defaultdict
import math
import sys

SMOOTHING_WEIGHT = 1
NUM_CHARS = 95

class NGrams:
    order: int
    frequencies: list[dict]
    totals: list[int]

    def __init__(self, order: int, sequences: list[str] = []):
        assert(order >= 0)

        self.order = order
        self.frequencies = [defaultdict(int)] * (order + 1)
        self.totals = [0] * (order + 1)

        if (len(sequences) > 0):
            i = 0
            for seq in sequences:
                i += 1
                print(f'Processing sequence {i:>{math.floor(math.log(len(sequences), 10)) + 1}}/{len(sequences)} for {order} orders', end='\r')
                for n in range(0, order + 1):
                    for pos in range(len(seq) - n):
                        ngram = seq[pos:pos+n+1]
                        self.add(ngram)
            print()

    def __str__(self):
        return f'{self.order}-Order Markov NGram Object - Totals: {self.totals}'
    
    def __check_len(self, l):
        if l > self.order + 1 or l == 0:
            raise ValueError(f'Invalid NGram - Length of ngram must be between 1 and {self.order + 1} for an order {self.order} NGrams object (is {l})')

    def add(self, ngram : str):
        self.__check_len(len(ngram))
        self.frequencies[len(ngram) - 1][ngram] += 1
        self.totals[len(ngram) - 1] += 1

    def get_total(self, ngram : str):
        self.__check_len(len(ngram))
        return self.frequencies[len(ngram) - 1][ngram]
    
    def get_log_cond_prob(self, ngram : str):
        self.__check_len(len(ngram))

        top = self.get_total(ngram) + SMOOTHING_WEIGHT
        bot = self.totals[0] if (len(ngram) == 1) else self.get_total(ngram[:-1])
        bot += SMOOTHING_WEIGHT * ((len(ngram) - 1) ** NUM_CHARS)
        
        return math.log(top / bot) if top > 0 else sys.minint
    
    def get_log_sum_prob(self, seq : str):
        if (len(seq) == 0):
            raise ValueError(f'Invalid Sequence - Cannot get probability for an empty string')

        start = 0
        end = 1
        sum = 0

        while (end <= len(seq)):
            sum += self.get_log_cond_prob(seq[start:end])
            end += 1

            # ngrams can be at most order + 1 characters long:
            if (end - start > self.order + 1):
                start += 1

        return sum


In [78]:
texts_pos = (df_train.loc[df_train['target'] == 1])['text'].to_list()
texts_neg = (df_train.loc[df_train['target'] == 0])['text'].to_list()

texts_lower_pos = map(lambda string: string.lower(), texts_pos)
texts_lower_neg = map(lambda string: string.lower(), texts_neg)

ngrams_pos = NGrams(MARKOV_ORDER, texts_pos)
ngrams_neg = NGrams(MARKOV_ORDER, texts_neg)


Processing sequence 3271/3271 for 5 orders
Processing sequence 4342/4342 for 5 orders


In [66]:
print(ngrams_neg.get_total('hello'))
print(ngrams_neg.get_total('today'))
print(ngrams_neg.get_total('https'))
print(ngrams_neg.get_total('http'))
print(ngrams_pos.get_log_cond_prob('hello'))
print(ngrams_pos.get_log_cond_prob('today'))
print(ngrams_pos.get_log_cond_prob('https'))
print(ngrams_pos.get_log_cond_prob('http'))


4
44
277
2204
-131.69796430638962
-127.86932290990052
-126.78530942065356
-96.53655014711781


In [82]:
print(ngrams_pos.get_log_sum_prob("this is a longer sentence #socool"))
print(ngrams_neg.get_log_sum_prob("this is a longer sentence #socool"))

-4545.739539109495
-4537.399878943147


TODO: only ascii chars?