In [1]:
import pandas as pd
import warnings
import numpy as np

warnings.filterwarnings('ignore')

In [2]:
# Load the data
train = pd.read_csv("train.csv")
val = pd.read_csv("val.csv")
test = pd.read_csv("test.csv")


In [3]:
from hazm import Normalizer, word_tokenize
# Create a normalizer object
normalizer = Normalizer()

def remove_u200c(text):
    return text.replace('\u200c', '')

def preprocess_text(text):
    # Normalize the text
    text = normalizer.normalize(text)
    text = normalizer.remove_specials_chars(text)

    # Tokenize the text
    words = word_tokenize(text)
    words = [remove_u200c(word) for word in words]
    return words

In [4]:
train['content'] = train['content'].apply(preprocess_text)
val['content'] = val['content'].apply(preprocess_text)
test['content'] = test['content'].apply(preprocess_text)

In [5]:
print(f"Shape of the train data: {train.shape}")
print(f"Shape of the validation data: {val.shape}")
print(f"Shape of the test data: {test.shape}")

Shape of the train data: (13314, 2)
Shape of the validation data: (1480, 2)
Shape of the test data: (1644, 2)


In [18]:
class NgramLanguageModel:
    def __init__(self, n):
        self.number_of_minus_n_grams = None
        self.n = n
        self.counts = {}
        self.counts_minus_one_grams = {}
        self.vocab = set()


    def update_counts(self, tokens):
        n = self.n
        for i in range(len(tokens) - n + 1):
            ngram = tuple(tokens[i:i + n])
            self.counts[ngram] = self.counts.get(ngram, 0) + 1
            for token in ngram:
                self.vocab.add(token)

        n = self.n - 1
        for i in range(len(tokens) - n + 1):
            ngram = tuple(tokens[i:i + n])
            self.counts_minus_one_grams[ngram] = self.counts_minus_one_grams.get(ngram, 0) + 1
        self.number_of_minus_n_grams = sum(self.counts_minus_one_grams.values())
        self.number_of_n_grams = sum(self.counts.values())

    def probability(self, token, context):
        context = tuple(context)
        ngram = context + (token,)
        if self.n == 1:
            if ngram in self.counts:
                return self.counts[ngram] / self.number_of_n_grams
        if context in self.counts_minus_one_grams:
            context_count = self.counts_minus_one_grams[context]
            if ngram in self.counts:
                return self.counts[ngram] / context_count
        return 0

    def perplexity(self, test_data):
        log_prob_sum = 0
        for i in range(len(test_data) - self.n + 1):
            context = tuple(test_data[i:i + self.n - 1])
            token = test_data[i + self.n - 1]
            prob = self.probability(token, context)
            log_prob_sum += np.log(prob)
        return np.exp(-log_prob_sum / len(test_data))

class BackoffSmoothing(NgramLanguageModel):
    def __init__(self, n, p_bg=0.001):
        super().__init__(n)
        self.p_bg = p_bg


    def probability(self, token, context):
        p = super().probability(token, context)
        if p == 0:
            p = self.p_bg
        return p


class AbsoluteDiscounting(NgramLanguageModel):
    def __init__(self, n, discount=0.5, p_bg=0.000001):
        super().__init__(n)
        self.discount = discount
        self.p_bg = p_bg
        self.applied_discounting = 1

    def probability(self, token, context):
        context = tuple(context)
        ngram = context + (token,)
        if self.n == 1:
            if ngram in self.counts:
                return self.counts[ngram] / self.number_of_n_grams
        if context in self.counts_minus_one_grams:
            context_count = self.counts_minus_one_grams[context]
            ngram_count = self.counts.get(ngram, 0)
            if context_count > 0:
                self.applied_discounting += 1
            interpolation = self.discount * context_count / self.number_of_minus_n_grams
            return max(ngram_count - self.discount, 0) / context_count + interpolation * self.p_bg
        return  self.p_bg

In [7]:
# make all validation token in one list for tuning hyperparameters
validation_tokens = []
for i, content in enumerate(val['content']):
    for token in content:
        validation_tokens.append(token)

In [9]:
# unigram
# Train the model with BackoffSmoothing
backoff_model = BackoffSmoothing(1, 0.1)  # Change the number to the desired n-gram
for tokens in train['content']:
    backoff_model.update_counts(tokens)
background_prob = [0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009, 0.01, 0.1]
perplexity_dict = {}
for p_bg in background_prob:
    backoff_model.p_bg = p_bg
    perplexity_dict[p_bg] = backoff_model.perplexity(validation_tokens)

#choose the best hyperparameter
best_p_bg = min(perplexity_dict, key=perplexity_dict.get)
backoff_model.p_bg = best_p_bg
# Evaluate the model
for i, content in enumerate(test['content']):
    test_tokens = [token for token in content]  # Flatten the list of tokens
    print(f"Perplexity of the {i} BackoffSmoothing model on the test data: {backoff_model.perplexity(test_tokens)}")

Perplexity of the 0 BackoffSmoothing model on the test data: 1361.474788040928
Perplexity of the 1 BackoffSmoothing model on the test data: 2024.8146238209442
Perplexity of the 2 BackoffSmoothing model on the test data: 1891.3562055503637
Perplexity of the 3 BackoffSmoothing model on the test data: 1563.4277211881613
Perplexity of the 4 BackoffSmoothing model on the test data: 2128.5230368981092
Perplexity of the 5 BackoffSmoothing model on the test data: 1826.7812172408296
Perplexity of the 6 BackoffSmoothing model on the test data: 1140.8369810893387
Perplexity of the 7 BackoffSmoothing model on the test data: 1183.3245657119264
Perplexity of the 8 BackoffSmoothing model on the test data: 1693.4523805271658
Perplexity of the 9 BackoffSmoothing model on the test data: 2383.3324362662056
Perplexity of the 10 BackoffSmoothing model on the test data: 2087.9888769771005
Perplexity of the 11 BackoffSmoothing model on the test data: 1808.0667075637584
Perplexity of the 12 BackoffSmoothing m

In [10]:
# bigrams
# Train the model with BackoffSmoothing
backoff_model = BackoffSmoothing(2, 0.1)
for tokens in train['content']:
    backoff_model.update_counts(tokens)
background_prob = [0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009, 0.01, 0.1]
perplexity_dict = {}
for p_bg in background_prob:
    backoff_model.p_bg = p_bg
    perplexity_dict[p_bg] = backoff_model.perplexity(validation_tokens)

#choose the best hyperparameter
best_p_bg = min(perplexity_dict, key=perplexity_dict.get)
backoff_model.p_bg = best_p_bg
# Evaluate the model
for i, content in enumerate(test['content']):
    test_tokens = [token for token in content]  # Flatten the list of tokens
    print(f"Perplexity of the {i} BackoffSmoothing model on the test data: {backoff_model.perplexity(test_tokens)}")

Perplexity of the 0 BackoffSmoothing model on the test data: 70.52274693495694
Perplexity of the 1 BackoffSmoothing model on the test data: 61.055592205520504
Perplexity of the 2 BackoffSmoothing model on the test data: 59.59813567479229
Perplexity of the 3 BackoffSmoothing model on the test data: 64.88075571991914
Perplexity of the 4 BackoffSmoothing model on the test data: 41.28288484089346
Perplexity of the 5 BackoffSmoothing model on the test data: 43.12513998746346
Perplexity of the 6 BackoffSmoothing model on the test data: 64.19933289817801
Perplexity of the 7 BackoffSmoothing model on the test data: 74.61355024708156
Perplexity of the 8 BackoffSmoothing model on the test data: 79.03624798683738
Perplexity of the 9 BackoffSmoothing model on the test data: 60.865325616596714
Perplexity of the 10 BackoffSmoothing model on the test data: 57.74036207957768
Perplexity of the 11 BackoffSmoothing model on the test data: 56.7466879229338
Perplexity of the 12 BackoffSmoothing model on th

In [11]:
# trigrams
# Train the model with BackoffSmoothing
backoff_model = BackoffSmoothing(3, 0.1)
for tokens in train['content']:
    backoff_model.update_counts(tokens)
background_prob = [0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009, 0.01, 0.1]
perplexity_dict = {}
for p_bg in background_prob:
    backoff_model.p_bg = p_bg
    perplexity_dict[p_bg] = backoff_model.perplexity(validation_tokens)

#choose the best hyperparameter
best_p_bg = min(perplexity_dict, key=perplexity_dict.get)
backoff_model.p_bg = best_p_bg
# Evaluate the model
for i, content in enumerate(test['content']):
    test_tokens = [token for token in content]  # Flatten the list of tokens
    print(f"Perplexity of the {i} BackoffSmoothing model on the test data: {backoff_model.perplexity(test_tokens)}")

Perplexity of the 0 BackoffSmoothing model on the test data: 10.880632421725235
Perplexity of the 1 BackoffSmoothing model on the test data: 10.006412596405259
Perplexity of the 2 BackoffSmoothing model on the test data: 11.21840991612558
Perplexity of the 3 BackoffSmoothing model on the test data: 10.476501005077056
Perplexity of the 4 BackoffSmoothing model on the test data: 10.601222209853903
Perplexity of the 5 BackoffSmoothing model on the test data: 10.200645017261365
Perplexity of the 6 BackoffSmoothing model on the test data: 11.502393347782105
Perplexity of the 7 BackoffSmoothing model on the test data: 11.304426592886166
Perplexity of the 8 BackoffSmoothing model on the test data: 9.942346051600575
Perplexity of the 9 BackoffSmoothing model on the test data: 11.411867085472283
Perplexity of the 10 BackoffSmoothing model on the test data: 11.632169855998821
Perplexity of the 11 BackoffSmoothing model on the test data: 7.273376412911166
Perplexity of the 12 BackoffSmoothing mod

In [19]:
# unigram
# Train the model with AbsoluteDiscounting
absolute_discounting_model = AbsoluteDiscounting(1)
for tokens in train['content']:
    absolute_discounting_model.update_counts(tokens)
background_prob = [0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009, 0.01, 0.1]
discounts = [0.001, 0.01, 0.1, 0.3, 0.5, 0.7]
perplexity_dict = {}
for p_bg in background_prob:
    for discount in discounts:
        absolute_discounting_model.p_bg = p_bg
        absolute_discounting_model.discount = discount
        perplexity_dict[(p_bg, discount)] = absolute_discounting_model.perplexity(validation_tokens)

#choose the best hyperparameter
best_p_bg, best_discount = min(perplexity_dict, key=perplexity_dict.get)
absolute_discounting_model.p_bg = best_p_bg
absolute_discounting_model.discount = best_discount
# Evaluate the model
for i, content in enumerate(test['content']):
    test_tokens = [token for token in content]  # Flatten the list of tokens
    print(f"Perplexity of the {i} Absolute Discounting model on the test data: {absolute_discounting_model.perplexity(test_tokens)}")
    absolute_discounting_model.applied_discounting = 1

Perplexity of the 0 Absolute Discounting model on the test data: 1359.2671923496907
Perplexity of the 1 Absolute Discounting model on the test data: 2019.8896803896578
Perplexity of the 2 Absolute Discounting model on the test data: 1894.681655035519
Perplexity of the 3 Absolute Discounting model on the test data: 1564.5747019000942
Perplexity of the 4 Absolute Discounting model on the test data: 2199.2778027887275
Perplexity of the 5 Absolute Discounting model on the test data: 1847.4877816438327
Perplexity of the 6 Absolute Discounting model on the test data: 1144.3833184895336
Perplexity of the 7 Absolute Discounting model on the test data: 1181.4442306690078
Perplexity of the 8 Absolute Discounting model on the test data: 1690.4982172693926
Perplexity of the 9 Absolute Discounting model on the test data: 2400.1103218888547
Perplexity of the 10 Absolute Discounting model on the test data: 2093.8585734870117
Perplexity of the 11 Absolute Discounting model on the test data: 1809.78984

In [13]:
# bigram
# Train the model with AbsoluteDiscounting
absolute_discounting_model = AbsoluteDiscounting(2)
for tokens in train['content']:
    absolute_discounting_model.update_counts(tokens)
background_prob = [0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009, 0.01, 0.1]
discounts = [0.001, 0.01, 0.1, 0.3, 0.5, 0.7]
perplexity_dict = {}
for p_bg in background_prob:
    for discount in discounts:
        absolute_discounting_model.p_bg = p_bg
        absolute_discounting_model.discount = discount
        perplexity_dict[(p_bg, discount)] = absolute_discounting_model.perplexity(validation_tokens)

#choose the best hyperparameter
best_p_bg, best_discount = min(perplexity_dict, key=perplexity_dict.get)
absolute_discounting_model.p_bg = best_p_bg
absolute_discounting_model.discount = best_discount
# Evaluate the model
for i, content in enumerate(test['content']):
    test_tokens = [token for token in content]  # Flatten the list of tokens
    print(f"Perplexity of the {i} Absolute Discounting model on the test data: {absolute_discounting_model.perplexity(test_tokens)}")
    absolute_discounting_model.applied_discounting = 1

Perplexity of the 0 Absolute Discounting model on the test data: 361.5870623553146
Perplexity of the 1 Absolute Discounting model on the test data: 178.28973847252095
Perplexity of the 2 Absolute Discounting model on the test data: 394.9783983441004
Perplexity of the 3 Absolute Discounting model on the test data: 150.69564663244472
Perplexity of the 4 Absolute Discounting model on the test data: 1213.4809363214974
Perplexity of the 5 Absolute Discounting model on the test data: 416.25436268517154
Perplexity of the 6 Absolute Discounting model on the test data: 373.5340530280333
Perplexity of the 7 Absolute Discounting model on the test data: 211.63045742508186
Perplexity of the 8 Absolute Discounting model on the test data: 147.79795302976225
Perplexity of the 9 Absolute Discounting model on the test data: 1020.7200122980078
Perplexity of the 10 Absolute Discounting model on the test data: 543.8299577784982
Perplexity of the 11 Absolute Discounting model on the test data: 252.231703876

In [22]:
# trigram
# Train the model with AbsoluteDiscounting
absolute_discounting_model = AbsoluteDiscounting(3)
for tokens in train['content']:
    absolute_discounting_model.update_counts(tokens)
background_prob = [0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009, 0.01, 0.1]
discounts = [0.001, 0.01, 0.1, 0.3, 0.5 , 0.7]
perplexity_dict = {}
for p_bg in background_prob:
    for discount in discounts:
        absolute_discounting_model.p_bg = p_bg
        absolute_discounting_model.discount = discount
        perplexity_dict[(p_bg, discount)] = absolute_discounting_model.perplexity(validation_tokens)

#choose the best hyperparameter
best_p_bg, best_discount = min(perplexity_dict, key=perplexity_dict.get)
absolute_discounting_model.p_bg = best_p_bg
absolute_discounting_model.discount = best_discount
# Evaluate the model
for i, content in enumerate(test['content']):
    test_tokens = [token for token in content]  # Flatten the list of tokens
    print(f"Perplexity of the {i} Absolute Discounting model on the test data: {absolute_discounting_model.perplexity(test_tokens)}")
    absolute_discounting_model.applied_discounting = 1

Perplexity of the 0 Absolute Discounting model on the test data: 2150.4315914078875
Perplexity of the 1 Absolute Discounting model on the test data: 2098.495944177024
Perplexity of the 2 Absolute Discounting model on the test data: 1486.912549239702
Perplexity of the 3 Absolute Discounting model on the test data: 1185.7213742271306
Perplexity of the 4 Absolute Discounting model on the test data: 764.9289356711624
Perplexity of the 5 Absolute Discounting model on the test data: 1126.1579314836176
Perplexity of the 6 Absolute Discounting model on the test data: 2442.8637723690345
Perplexity of the 7 Absolute Discounting model on the test data: 2192.6780002709115
Perplexity of the 8 Absolute Discounting model on the test data: 268.40202449398487
Perplexity of the 9 Absolute Discounting model on the test data: 2007.7476674454715
Perplexity of the 10 Absolute Discounting model on the test data: 2363.4207098698166
Perplexity of the 11 Absolute Discounting model on the test data: 687.57196340

In [15]:
labels = list(range(0, 8))
test_class_tokens = {}
for i in labels:
    test_label = test[test['label'] == i]
    test_tokens = []
    for j, content in enumerate(test_label['content']):
        for token in content:
            test_tokens.append(token)
    test_class_tokens[i] = test_tokens

In [16]:
best_pg = backoff_model.p_bg
backoff_model = BackoffSmoothing(1, best_pg)
for tokens in train['content']:
    backoff_model.update_counts(tokens)
for i in labels:
    test_tokens = [token for token in test_class_tokens[i]]  # Flatten the list of tokens
    print(f"Perplexity of the class {i} BackoffSmoothing model on the test data with unigram: {backoff_model.perplexity(test_tokens)}")

backoff_model = BackoffSmoothing(2, best_pg)
for tokens in train['content']:
    backoff_model.update_counts(tokens)
for i in labels:
    test_tokens = [token for token in test_class_tokens[i]]  # Flatten the list of tokens
    print(f"Perplexity of the class {i} BackoffSmoothing model on the test data with bigram: {backoff_model.perplexity(test_tokens)}")

backoff_model = BackoffSmoothing(3, best_pg)
for tokens in train['content']:
    backoff_model.update_counts(tokens)
for i in labels:
    test_tokens = [token for token in test_class_tokens[i]]  # Flatten the list of tokens
    print(f"Perplexity of the class {i} BackoffSmoothing model on the test data with trigram: {backoff_model.perplexity(test_tokens)}")

Perplexity of the class 0 BackoffSmoothing model on the test data with unigram: 1542.8926711581457
Perplexity of the class 1 BackoffSmoothing model on the test data with unigram: 1532.953566968718
Perplexity of the class 2 BackoffSmoothing model on the test data with unigram: 1472.0811466477771
Perplexity of the class 3 BackoffSmoothing model on the test data with unigram: 1407.9579632140478
Perplexity of the class 4 BackoffSmoothing model on the test data with unigram: 1794.698318055619
Perplexity of the class 5 BackoffSmoothing model on the test data with unigram: 1661.640768213993
Perplexity of the class 6 BackoffSmoothing model on the test data with unigram: 1585.387790135717
Perplexity of the class 7 BackoffSmoothing model on the test data with unigram: 1499.954491517723
Perplexity of the class 0 BackoffSmoothing model on the test data with bigram: 56.68213315685762
Perplexity of the class 1 BackoffSmoothing model on the test data with bigram: 53.88316560411561
Perplexity of the c

In [21]:
best_pg = absolute_discounting_model.p_bg
best_discount = 0.75
absolute_discounting_model = AbsoluteDiscounting(1,best_discount, best_discount)
for tokens in train['content']:
    absolute_discounting_model.update_counts(tokens)
for i in labels:
    test_tokens = [token for token in test_class_tokens[i]]  # Flatten the list of tokens
    print(f"Perplexity of the class {i} absolute discounting model on the test data with unigram: {absolute_discounting_model.perplexity(test_tokens)}")

absolute_discounting_model = AbsoluteDiscounting(2,best_discount, best_discount)
for tokens in train['content']:
    absolute_discounting_model.update_counts(tokens)
for i in labels:
    test_tokens = [token for token in test_class_tokens[i]]  # Flatten the list of tokens
    print(f"Perplexity of the class {i} absolute discounting model on the test data with bigram: {absolute_discounting_model.perplexity(test_tokens)}")

absolute_discounting_model = AbsoluteDiscounting(3,best_discount, best_discount)
for tokens in train['content']:
    absolute_discounting_model.update_counts(tokens)
for i in labels:
    test_tokens = [token for token in test_class_tokens[i]]  # Flatten the list of tokens
    print(f"Perplexity of the class {i} absolute discounting model on the test data with trigram: {absolute_discounting_model.perplexity(test_tokens)}")

Perplexity of the class 0 absolute discounting model on the test data with unigram: 1509.462758026619
Perplexity of the class 1 absolute discounting model on the test data with unigram: 1492.8817388604186
Perplexity of the class 2 absolute discounting model on the test data with unigram: 1432.0579462140952
Perplexity of the class 3 absolute discounting model on the test data with unigram: 1380.7551107375364
Perplexity of the class 4 absolute discounting model on the test data with unigram: 1730.5980756779888
Perplexity of the class 5 absolute discounting model on the test data with unigram: 1584.5291751243863
Perplexity of the class 6 absolute discounting model on the test data with unigram: 1543.3112467043977
Perplexity of the class 7 absolute discounting model on the test data with unigram: 1472.3280618481779
Perplexity of the class 0 absolute discounting model on the test data with bigram: 136.32575731513438
Perplexity of the class 1 absolute discounting model on the test data with 