# nGram classificator

In [1]:
import argparse, math, random, collections, sys

## Model definition

In [6]:
class NGramCharLM:
    def __init__(self, ngram_size=3, n_thresholds=20):
        self.ngram_size = max(1, ngram_size)
        self.n_thresholds = max(1, n_thresholds)
        self.counts = collections.defaultdict(collections.Counter)
        self.context_totals = collections.Counter()
        self.vocab = set()
        self.thresholds = {i:(None,None) for i in range(3,n_thresholds)} # prob. there is no need for storing all thresholds

    def _normalize(self, s):
        return s.replace("\r"," ").replace("\n"," ").lower()

    def train(self, corpus, samples=200):
        self.train_positive(corpus)
        self.train_negative(samples)

    def train_positive(self, corpus):
        s = self._normalize(corpus)
        self.vocab.update(set(s))
        padded = (" " * (self.ngram_size- 1)) + s
        for i in range(len(padded) - self.ngram_size+ 1):
            ctx = padded[i:i + self.ngram_size- 1]
            ch = padded[i + self.ngram_size- 1]
            self.counts[ctx][ch] += 1
            self.context_totals[ctx] += 1
    
    def train_negative(self, samples=200):
        def generate_random_string(length, chars):
            return "".join(random.choice(list(chars)) for _ in range(length))
        chars = self.vocab or set("abcdefghijklmnopqrstuvwxyz ")
        for length in self.thresholds.keys():
            log_probs = [self.score(generate_random_string(length, chars)) for _ in range(samples)]
            mean = sum(log_probs) / len(log_probs)
            variance = sum((x - mean) ** 2 for x in log_probs) / len(log_probs)
            stddev = math.sqrt(variance)
            self.thresholds[length] = (mean, stddev)

    def char_prob(self, context, ch):
        # Laplace (add-one) smoothing
        ctx = context[-(self.ngram_size - 1):] if self.ngram_size > 1 else ""
        V = len(self.vocab) or 1
        count = self.counts.get(ctx, {}).get(ch, 0)
        total = self.context_totals.get(ctx, 0)
        return (count + 1) / (total + V)

    def score(self, text):
        s = self._normalize(text)
        padded = (" " * (self.ngram_size- 1)) + s
        logp = 0.0
        n_chars = 0
        for i in range(len(padded) - self.ngram_size+ 1):
            ctx = padded[i:i + self.ngram_size- 1]
            ch = padded[i + self.ngram_size- 1]
            p = self.char_prob(ctx, ch)
            logp += math.log(p)
            n_chars += 1
        return logp / max(1, n_chars)  # average log-prob per char

    def predict(self, text, alpha=2.0):
        score = self.score(text)
        length = len(text)
        if length < 3:
            raise ValueError("Text too short to evaluate")
        mean, stddev = self.thresholds.get(length, (None, None))
        if mean is None or stddev is None:
            raise ValueError("No threshold available for this text length")
        threshold = mean + alpha * stddev
        return score > threshold

## Training

In [None]:
ngram_size = 0
train_dataset_path = "dataset/wikipedia_clean_corpus.txt"
    
model = NGramCharLM(ngram_size, n_thresholds=21)
model.train(train_dataset_path)
print(model.thresholds)

{3: (-2.9497694227157973, 0.04877667244338073), 4: (-2.9492391291861733, 0.038477788758329336), 5: (-2.94378612197902, 0.03580025998825171), 6: (-2.9476390791795954, 0.02331480188620268), 7: (-2.946264987911756, 0.021775978898016322), 8: (-2.9444321021031605, 0.023197934975401457), 9: (-2.9447894517136213, 0.01878430640176478), 10: (-2.946359039174334, 0.015391115503331733), 11: (-2.9447257294323155, 0.016439377146958183), 12: (-2.943899516219111, 0.016214840636483607), 13: (-2.9449284808860936, 0.012597469542271542), 14: (-2.945351983539099, 0.011813869767967746), 15: (-2.9457190191717033, 0.009325920754481149), 16: (-2.945639016671375, 0.008743050707326078), 17: (-2.9446245234561252, 0.011902161105045798), 18: (-2.945683971916986, 0.008207278157979822), 19: (-2.945618445982746, 0.007003103046113378), 20: (-2.9450780822273015, 0.007621592614305759)}


## Evaluation

In [16]:
def evaluate(alpha=0.0):
    with open("dataset/eval_dataset.txt", "r", encoding="utf-8") as f:
        lines = f.readlines()
        TP, TN, FP, FN = 0, 0, 0, 0
        for line in lines:
            l = line.strip().split("  ")
            if len(l) < 2 or len(l[0]) < 3:
                continue  # skip too short texts
            text, label = l[0], l[1]
            prediction = model.predict(text, alpha)
            if prediction and label == "1":
                TP += 1
            elif prediction and label == "0":
                FP += 1
            elif not prediction and label == "0":
                TN += 1
            elif not prediction and label == "1":
                FN += 1

        print(f"TP: {TP}, TN: {TN}, FP: {FP}, FN: {FN}")
        accuracy = (TP + TN) / (TP + TN + FP + FN)
        precision = TP / (TP + FP) if (TP + FP) > 0 else 0
        recall = TP / (TP + FN) if (TP + FN) > 0 else 0
        print(f"Accuracy: {accuracy}")
        print(f"Precision: {precision}")
        print(f"Recall: {recall}")
        print(f"F1 Score: {(2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0}")

evaluate()

TP: 3, TN: 96, FP: 3, FN: 71
Accuracy: 0.5722543352601156
Precision: 0.5
Recall: 0.04054054054054054
F1 Score: 0.075


## Inference

In [5]:
text = 'asdgnlabkegrSV'
res = model.predict(text, alpha=2)

print("Result: {}".format("LIKELY REAL" if res else "LIKELY RANDOM"))


Result: LIKELY RANDOM
