In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import re

# Load the data
train = pd.read_csv("train.csv")
val = pd.read_csv("val.csv")
test = pd.read_csv("test.csv")


In [3]:
from hazm import Normalizer, word_tokenize
# Create a normalizer object
normalizer = Normalizer()

def remove_u200c(text):
    return text.replace('\u200c', '')

def preprocess_text(text):
    # Normalize the text
    text = normalizer.normalize(text)
    text = normalizer.remove_specials_chars(text)

    # Tokenize the text
    words = word_tokenize(text)
    words = [remove_u200c(word) for word in words]
    return words

In [4]:
train['content'] = train['content'].apply(preprocess_text)
val['content'] = val['content'].apply(preprocess_text)
test['content'] = test['content'].apply(preprocess_text)

In [25]:
print(f"Shape of the train data: {train.shape}")
print(f"Shape of the validation data: {val.shape}")
print(f"Shape of the test data: {test.shape}")

Shape of the train data: (13314, 2)
Shape of the validation data: (1480, 2)
Shape of the test data: (1644, 2)


In [59]:
class NgramLanguageModel:
    def __init__(self, n):
        self.number_of_minus_n_grams = None
        self.n = n
        self.counts = {}
        self.counts_minus_one_grams = {}
        self.vocab = set()


    def update_counts(self, tokens):
        n = self.n
        for i in range(len(tokens) - n + 1):
            ngram = tuple(tokens[i:i + n])
            self.counts[ngram] = self.counts.get(ngram, 0) + 1
            for token in ngram:
                self.vocab.add(token)

        n = self.n - 1
        for i in range(len(tokens) - n + 1):
            ngram = tuple(tokens[i:i + n])
            self.counts_minus_one_grams[ngram] = self.counts_minus_one_grams.get(ngram, 0) + 1
        self.number_of_minus_n_grams = sum(self.counts_minus_one_grams.values())

    def probability(self, token, context):
        context = tuple(context)
        ngram = context + (token,)
        if context in self.counts_minus_one_grams:
            context_count = self.counts_minus_one_grams[context]
            if ngram in self.counts:
                return self.counts[ngram] / context_count
        return 0

    def perplexity(self, test_data):
        ppl = 1
        total_ngrams = 0
        perplexity = []
        for i in range(len(test_data) - self.n + 1):
            context = tuple(test_data[i:i + self.n - 1])
            token = test_data[i + self.n - 1]
            prob = self.probability(token, context)
            ppl *= (1/prob)
            total_ngrams += 1
            if i % 10 == 0:
                perplexity.append(ppl ** (1/len(test_data)))
                ppl = 1
        perplex = 1
        for ppl in perplexity:
            perplex *= ppl
        return perplex

class BackoffSmoothing(NgramLanguageModel):
    def __init__(self, n, p_bg=0.001):
        super().__init__(n)
        self.p_bg = p_bg


    def probability(self, token, context):
        p = super().probability(token, context)
        if p == 0:
            p = self.p_bg
        return p


class AbsoluteDiscounting(NgramLanguageModel):
    def __init__(self, n, discount=0.5, p_bg=0.000001):
        super().__init__(n)
        self.discount = discount
        self.p_bg = p_bg
        self.applied_discounting = 1

    def probability(self, token, context):
        context = tuple(context)
        ngram = context + (token,)
        if context in self.counts_minus_one_grams:
            context_count = self.counts_minus_one_grams[context]
            ngram_count = self.counts.get(ngram, 0)
            if context_count > 0:
                self.applied_discounting += 1
            interpolation = self.discount * context_count / self.number_of_minus_n_grams
            return max(ngram_count - self.discount, 0) / context_count + interpolation * self.p_bg
        return  self.p_bg

In [6]:
# make all validation token in one list for tuning hyperparameters
validation_tokens = []
for i, content in enumerate(val['content']):
    for token in content:
        validation_tokens.append(token)

In [43]:
# unigram
# Train the model with BackoffSmoothing
backoff_model = BackoffSmoothing(1, 0.1)  # Change the number to the desired n-gram
for tokens in train['content']:
    backoff_model.update_counts(tokens)
background_prob = [0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009, 0.01, 0.1]
perplexity_dict = {}
for p_bg in background_prob:
    backoff_model.p_bg = p_bg
    perplexity_dict[p_bg] = backoff_model.perplexity(validation_tokens)

#choose the best hyperparameter
best_p_bg = min(perplexity_dict, key=perplexity_dict.get)
backoff_model.p_bg = best_p_bg
# Evaluate the model
for i, content in enumerate(test['content']):
    test_tokens = [token for token in content]  # Flatten the list of tokens
    print(f"Perplexity of the {i} BackoffSmoothing model on the test data: {backoff_model.perplexity(test_tokens)}")

Perplexity of the 0 BackoffSmoothing model on the test data: 1349.9228160317841
Perplexity of the 1 BackoffSmoothing model on the test data: 1406.2445790927306
Perplexity of the 2 BackoffSmoothing model on the test data: 1158.788508054969
Perplexity of the 3 BackoffSmoothing model on the test data: 1047.5144414777378
Perplexity of the 4 BackoffSmoothing model on the test data: 1780.7903256552995
Perplexity of the 5 BackoffSmoothing model on the test data: 1000.3506076531199
Perplexity of the 6 BackoffSmoothing model on the test data: 974.2244899903602
Perplexity of the 7 BackoffSmoothing model on the test data: 1110.191542958756
Perplexity of the 8 BackoffSmoothing model on the test data: 1693.452380527151
Perplexity of the 9 BackoffSmoothing model on the test data: 2212.6531823295454
Perplexity of the 10 BackoffSmoothing model on the test data: 1913.2897950894715
Perplexity of the 11 BackoffSmoothing model on the test data: 1308.0309511110197
Perplexity of the 12 BackoffSmoothing mode

In [42]:
# bigrams
# Train the model with BackoffSmoothing
backoff_model = BackoffSmoothing(2, 0.1)  # Change the number to the desired n-gram
for tokens in train['content']:
    backoff_model.update_counts(tokens)
background_prob = [0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009, 0.01, 0.1]
perplexity_dict = {}
for p_bg in background_prob:
    backoff_model.p_bg = p_bg
    perplexity_dict[p_bg] = backoff_model.perplexity(validation_tokens)

#choose the best hyperparameter
best_p_bg = min(perplexity_dict, key=perplexity_dict.get)
backoff_model.p_bg = best_p_bg
# Evaluate the model
for i, content in enumerate(test['content']):
    test_tokens = [token for token in content]  # Flatten the list of tokens
    print(f"Perplexity of the {i} BackoffSmoothing model on the test data: {backoff_model.perplexity(test_tokens)}")

Perplexity of the 0 BackoffSmoothing model on the test data: 70.5227469349564
Perplexity of the 1 BackoffSmoothing model on the test data: 49.65286204618913
Perplexity of the 2 BackoffSmoothing model on the test data: 48.858444156159024
Perplexity of the 3 BackoffSmoothing model on the test data: 52.06183351233344
Perplexity of the 4 BackoffSmoothing model on the test data: 38.01476634405145
Perplexity of the 5 BackoffSmoothing model on the test data: 34.2079527029678
Perplexity of the 6 BackoffSmoothing model on the test data: 60.21352797187265
Perplexity of the 7 BackoffSmoothing model on the test data: 72.54333430932572
Perplexity of the 8 BackoffSmoothing model on the test data: 67.63028622248548
Perplexity of the 9 BackoffSmoothing model on the test data: 60.015126881363486
Perplexity of the 10 BackoffSmoothing model on the test data: 55.36632085250772
Perplexity of the 11 BackoffSmoothing model on the test data: 49.95851239946242
Perplexity of the 12 BackoffSmoothing model on the

In [41]:
# trigrams
# Train the model with BackoffSmoothing
backoff_model = BackoffSmoothing(3, 0.1)  # Change the number to the desired n-gram
for tokens in train['content']:
    backoff_model.update_counts(tokens)
background_prob = [0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009, 0.01, 0.1]
perplexity_dict = {}
for p_bg in background_prob:
    backoff_model.p_bg = p_bg
    perplexity_dict[p_bg] = backoff_model.perplexity(validation_tokens)

#choose the best hyperparameter
best_p_bg = min(perplexity_dict, key=perplexity_dict.get)
backoff_model.p_bg = best_p_bg
# Evaluate the model
for i, content in enumerate(test['content']):
    test_tokens = [token for token in content]  # Flatten the list of tokens
    print(f"Perplexity of the {i} BackoffSmoothing model on the test data: {backoff_model.perplexity(test_tokens)}")

Perplexity of the 0 BackoffSmoothing model on the test data: 9.871543943266678
Perplexity of the 1 BackoffSmoothing model on the test data: 9.043438578631829
Perplexity of the 2 BackoffSmoothing model on the test data: 9.857035474128047
Perplexity of the 3 BackoffSmoothing model on the test data: 9.475998086120496
Perplexity of the 4 BackoffSmoothing model on the test data: 10.188019105119176
Perplexity of the 5 BackoffSmoothing model on the test data: 8.730203205165513
Perplexity of the 6 BackoffSmoothing model on the test data: 11.158955541757365
Perplexity of the 7 BackoffSmoothing model on the test data: 11.243346546633038
Perplexity of the 8 BackoffSmoothing model on the test data: 9.190444257023659
Perplexity of the 9 BackoffSmoothing model on the test data: 11.27518939406537
Perplexity of the 10 BackoffSmoothing model on the test data: 11.418602944201087
Perplexity of the 11 BackoffSmoothing model on the test data: 6.963989960356826
Perplexity of the 12 BackoffSmoothing model on

In [44]:
# unigram
# Train the model with AbsoluteDiscounting
absolute_discounting_model = AbsoluteDiscounting(1)  # Change the number to the desired n-gram
for tokens in train['content']:
    absolute_discounting_model.update_counts(tokens)
background_prob = [0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009, 0.01, 0.1]
discounts = [0.001, 0.01, 0.1, 0.3, 0.5, 0.7]
perplexity_dict = {}
for p_bg in background_prob:
    for discount in discounts:
        absolute_discounting_model.p_bg = p_bg
        absolute_discounting_model.discount = discount
        perplexity_dict[(p_bg, discount)] = absolute_discounting_model.perplexity(validation_tokens)

#choose the best hyperparameter
best_p_bg, best_discount = min(perplexity_dict, key=perplexity_dict.get)
absolute_discounting_model.p_bg = best_p_bg
absolute_discounting_model.discount = best_discount
# Evaluate the model
for i, content in enumerate(test['content']):
    test_tokens = [token for token in content]  # Flatten the list of tokens
    print(f"Perplexity of the {i} Absolute Discounting model on the test data: {absolute_discounting_model.perplexity(test_tokens)}")
    absolute_discounting_model.applied_discounting = 1

Perplexity of the 0 Absolute Discounting model on the test data: 1.8690577558729635
Perplexity of the 1 Absolute Discounting model on the test data: 1317.9769454091102
Perplexity of the 2 Absolute Discounting model on the test data: 1245.5994298288497
Perplexity of the 3 Absolute Discounting model on the test data: 1060.2788774374537
Perplexity of the 4 Absolute Discounting model on the test data: 3731.1422346932422
Perplexity of the 5 Absolute Discounting model on the test data: 1542.959106538041
Perplexity of the 6 Absolute Discounting model on the test data: 1092.1176935925334
Perplexity of the 7 Absolute Discounting model on the test data: 1097.4354179485988
Perplexity of the 8 Absolute Discounting model on the test data: 1597.1732566461742
Perplexity of the 9 Absolute Discounting model on the test data: 2305.0866338389856
Perplexity of the 10 Absolute Discounting model on the test data: 2009.29346138084
Perplexity of the 11 Absolute Discounting model on the test data: 1434.0334021

In [45]:
# bigram
# Train the model with AbsoluteDiscounting
absolute_discounting_model = AbsoluteDiscounting(2)  # Change the number to the desired n-gram
for tokens in train['content']:
    absolute_discounting_model.update_counts(tokens)
background_prob = [0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009, 0.01, 0.1]
discounts = [0.001, 0.01, 0.1, 0.3, 0.5, 0.7]
perplexity_dict = {}
for p_bg in background_prob:
    for discount in discounts:
        absolute_discounting_model.p_bg = p_bg
        absolute_discounting_model.discount = discount
        perplexity_dict[(p_bg, discount)] = absolute_discounting_model.perplexity(validation_tokens)

#choose the best hyperparameter
best_p_bg, best_discount = min(perplexity_dict, key=perplexity_dict.get)
absolute_discounting_model.p_bg = best_p_bg
absolute_discounting_model.discount = best_discount
# Evaluate the model
for i, content in enumerate(test['content']):
    test_tokens = [token for token in content]  # Flatten the list of tokens
    print(f"Perplexity of the {i} Absolute Discounting model on the test data: {absolute_discounting_model.perplexity(test_tokens)}")
    absolute_discounting_model.applied_discounting = 1

Perplexity of the 0 Absolute Discounting model on the test data: 0.0014364364311748412
Perplexity of the 1 Absolute Discounting model on the test data: 38.561190056005664
Perplexity of the 2 Absolute Discounting model on the test data: 42.66312864278921
Perplexity of the 3 Absolute Discounting model on the test data: 39.39237406430953
Perplexity of the 4 Absolute Discounting model on the test data: 27.201621161645402
Perplexity of the 5 Absolute Discounting model on the test data: 54.668309231402944
Perplexity of the 6 Absolute Discounting model on the test data: 57.265351042155494
Perplexity of the 7 Absolute Discounting model on the test data: 55.79441045823652
Perplexity of the 8 Absolute Discounting model on the test data: 39.050815518158856
Perplexity of the 9 Absolute Discounting model on the test data: 25.93849383926754
Perplexity of the 10 Absolute Discounting model on the test data: 36.40826584435013
Perplexity of the 11 Absolute Discounting model on the test data: 58.44843443

In [50]:
# trigram
# Train the model with AbsoluteDiscounting
absolute_discounting_model = AbsoluteDiscounting(3)  # Change the number to the desired n-gram
for tokens in train['content']:
    absolute_discounting_model.update_counts(tokens)
background_prob = [0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009, 0.01, 0.1]
discounts = [0.001, 0.01, 0.1]
perplexity_dict = {}
for p_bg in background_prob:
    for discount in discounts:
        absolute_discounting_model.p_bg = p_bg
        absolute_discounting_model.discount = discount
        perplexity_dict[(p_bg, discount)] = absolute_discounting_model.perplexity(validation_tokens)

#choose the best hyperparameter
best_p_bg, best_discount = min(perplexity_dict, key=perplexity_dict.get)
absolute_discounting_model.p_bg = best_p_bg
absolute_discounting_model.discount = best_discount
# Evaluate the model
for i, content in enumerate(test['content']):
    test_tokens = [token for token in content]  # Flatten the list of tokens
    print(f"Perplexity of the {i} Absolute Discounting model on the test data: {absolute_discounting_model.perplexity(test_tokens)}")
    absolute_discounting_model.applied_discounting = 1

Perplexity of the 0 Absolute Discounting model on the test data: 0.003746913207649293
Perplexity of the 1 Absolute Discounting model on the test data: 10.788610858964786
Perplexity of the 2 Absolute Discounting model on the test data: 10.841792401927307
Perplexity of the 3 Absolute Discounting model on the test data: 8.564449376118656
Perplexity of the 4 Absolute Discounting model on the test data: 8.821056346181145
Perplexity of the 5 Absolute Discounting model on the test data: 14.625216958013151
Perplexity of the 6 Absolute Discounting model on the test data: 14.682739458059043
Perplexity of the 7 Absolute Discounting model on the test data: 10.207123256255471
Perplexity of the 8 Absolute Discounting model on the test data: 6.59845570866539
Perplexity of the 9 Absolute Discounting model on the test data: 7.398993420366893
Perplexity of the 10 Absolute Discounting model on the test data: 10.727126312475507
Perplexity of the 11 Absolute Discounting model on the test data: 11.902162561

In [8]:
labels = list(range(0, 8))
test_class_tokens = {}
for i in labels:
    test_label = test[test['label'] == i]
    test_tokens = []
    for j, content in enumerate(test_label['content']):
        for token in content:
            test_tokens.append(token)
    test_class_tokens[i] = test_tokens

In [47]:
best_pg = backoff_model.p_bg
backoff_model = BackoffSmoothing(1, best_pg)  # Change the number to the desired n-gram
for tokens in train['content']:
    backoff_model.update_counts(tokens)
for i in labels:
    test_tokens = [token for token in test_class_tokens[i]]  # Flatten the list of tokens
    print(f"Perplexity of the class {i} BackoffSmoothing model on the test data with unigram: {backoff_model.perplexity(test_tokens)}")

backoff_model = BackoffSmoothing(2, best_pg)  # Change the number to the desired n-gram
for tokens in train['content']:
    backoff_model.update_counts(tokens)
for i in labels:
    test_tokens = [token for token in test_class_tokens[i]]  # Flatten the list of tokens
    print(f"Perplexity of the class {i} BackoffSmoothing model on the test data with bigram: {backoff_model.perplexity(test_tokens)}")

backoff_model = BackoffSmoothing(3, best_pg)  # Change the number to the desired n-gram
for tokens in train['content']:
    backoff_model.update_counts(tokens)
for i in labels:
    test_tokens = [token for token in test_class_tokens[i]]  # Flatten the list of tokens
    print(f"Perplexity of the class {i} BackoffSmoothing model on the test data with trigram: {backoff_model.perplexity(test_tokens)}")

Perplexity of the class 0 BackoffSmoothing model on the test data with unigram: 1542.5819205583166
Perplexity of the class 1 BackoffSmoothing model on the test data with unigram: 1532.2697543357367
Perplexity of the class 2 BackoffSmoothing model on the test data with unigram: 1469.7123696075662
Perplexity of the class 3 BackoffSmoothing model on the test data with unigram: 1407.3628970836887
Perplexity of the class 4 BackoffSmoothing model on the test data with unigram: 1793.8588562807918
Perplexity of the class 5 BackoffSmoothing model on the test data with unigram: 1661.5376001806842
Perplexity of the class 6 BackoffSmoothing model on the test data with unigram: 1580.6194972850167
Perplexity of the class 7 BackoffSmoothing model on the test data with unigram: 1497.4705622934903
Perplexity of the class 0 BackoffSmoothing model on the test data with bigram: 56.68193475239538
Perplexity of the class 1 BackoffSmoothing model on the test data with bigram: 53.874553318633176
Perplexity of

In [60]:
best_pg = absolute_discounting_model.p_bg
best_discount = 0.75
absolute_discounting_model = AbsoluteDiscounting(1,best_discount, best_discount)  # Change the number to the desired n-gram
for tokens in train['content']:
    absolute_discounting_model.update_counts(tokens)
for i in labels:
    test_tokens = [token for token in test_class_tokens[i]]  # Flatten the list of tokens
    print(f"Perplexity of the class {i} absolute discounting model on the test data with unigram: {absolute_discounting_model.perplexity(test_tokens)}")

absolute_discounting_model = AbsoluteDiscounting(2,best_discount, best_discount)  # Change the number to the desired n-gram
for tokens in train['content']:
    absolute_discounting_model.update_counts(tokens)
for i in labels:
    test_tokens = [token for token in test_class_tokens[i]]  # Flatten the list of tokens
    print(f"Perplexity of the class {i} absolute discounting model on the test data with bigram: {absolute_discounting_model.perplexity(test_tokens)}")

absolute_discounting_model = AbsoluteDiscounting(3,best_discount, best_discount)  # Change the number to the desired n-gram
for tokens in train['content']:
    absolute_discounting_model.update_counts(tokens)
for i in labels:
    test_tokens = [token for token in test_class_tokens[i]]  # Flatten the list of tokens
    print(f"Perplexity of the class {i} absolute discounting model on the test data with trigram: {absolute_discounting_model.perplexity(test_tokens)}")

Perplexity of the class 0 absolute discounting model on the test data with unigram: 1.755121906086166
Perplexity of the class 1 absolute discounting model on the test data with unigram: 1.7564037981632385
Perplexity of the class 2 absolute discounting model on the test data with unigram: 1.7553718548325226
Perplexity of the class 3 absolute discounting model on the test data with unigram: 1.7549754984189967
Perplexity of the class 4 absolute discounting model on the test data with unigram: 1.7562084619202152
Perplexity of the class 5 absolute discounting model on the test data with unigram: 1.7553690335834364
Perplexity of the class 6 absolute discounting model on the test data with unigram: 1.7564953461090733
Perplexity of the class 7 absolute discounting model on the test data with unigram: 1.7550817001051089
Perplexity of the class 0 absolute discounting model on the test data with bigram: 136.3252781740603
Perplexity of the class 1 absolute discounting model on the test data with b