# Spell Correction


Link to drive folder: https://drive.google.com/drive/folders/1O4GtIxJf7UhCnEct8dzKR9LfMDn2_zNH?usp=sharing

Add above folder to your drive (shortcut) and run this notebook inside `HW3` folder.

# Transformer

## Huggingface pre-trained Bert model

In [None]:
%%capture
!pip install Levenshtein
!pip install transformers
!pip install hazm

In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("HooshvareLab/bert-fa-base-uncased")
model = AutoModelForMaskedLM.from_pretrained("HooshvareLab/bert-fa-base-uncased")

Downloading:   0%|          | 0.00/440 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/624M [00:00<?, ?B/s]

Some weights of the model checkpoint at HooshvareLab/bert-fa-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
from torch.nn import functional as F
import torch
import Levenshtein
import re
from hazm import *


def find_possible_mistakes(inp, model, top_k=1000):
    # inp = inp.replace("\u200C", " ")
    tokens = word_tokenize(inp)
    # tokens = inp.split()
    # tokens = tokenizer.tokenize(inp)
    mistakes = []
    for i, token in enumerate(tokens):
        # print("-----------", token, "-----------")
        token = token.replace("\u200C", "")
        text = " ".join(tokens[:i]) + tokenizer.mask_token + " ".join(tokens[i+1:])
        input = tokenizer.encode_plus(text, return_tensors = "pt")
        mask_index = torch.where(input["input_ids"][0] == tokenizer.mask_token_id)
        logits = model(**input)
        logits = logits.logits
        softmax = F.softmax(logits, dim = -1)
        mask_word = softmax[0, mask_index, :]
        tops = torch.topk(mask_word, top_k, dim = 1)[1][0]
        
        least_dist = float("inf")
        corrected_word = token
        for w in tops:
            word = tokenizer.decode([w])
            dist = Levenshtein.distance(token, word)
            if dist < least_dist:
                corrected_word = word
                least_dist = dist

        if token != corrected_word:
            for reg in re.finditer(token, inp):
                s, e = reg.start(), reg.end()
            mistakes.append({"raw": token, "corrected": corrected_word, "span": [s, e]})

    return mistakes

In [None]:
input0 = "این دانشمند تیرانی باعث افتخار است."
input1 = "پس از سال‌ها تلاش رازی موفق به کسف الکل شد. این دانشمند تیرانی باعث افتخار در تاریخ کور است."
input2 = "بسیاری از مباحث علوم غیرطبیعی با استفاده از فیریک دنیای مادی ابل توجیح نیست و برای یادگیری باید به فلسفه‌های خاصی رجو کرد."
input3 = 'اما متأسفانه به قدری ساختار سارمان سینمایی و در سطح وسیع‌تر وزارت فرهنگ و ارشاد اصلامی عقب‌مانده و ناکارآمد است که عملا جلوی بهبود هر مشکلی را می‌گیرد!'
input4 = 'منطق جغرافیا و جئوپلیتیک همیشه ثابت است و قابل چشم‌پوسی نیست.‎'

### Pre-trained model results

In [None]:
import json

result = find_possible_mistakes(input1, model, top_k=1000)
print(input1)
print(json.dumps(result, indent=4, ensure_ascii=False, skipkeys=True,))

پس از سال‌ها تلاش رازی موفق به کسف الکل شد. این دانشمند تیرانی باعث افتخار در تاریخ کور است.
[
    {
        "raw": "رازی",
        "corrected": "باز",
        "span": [
            18,
            22
        ]
    },
    {
        "raw": "کسف",
        "corrected": "کشف",
        "span": [
            31,
            34
        ]
    },
    {
        "raw": "الکل",
        "corrected": "اول",
        "span": [
            35,
            39
        ]
    },
    {
        "raw": "تیرانی",
        "corrected": "ایرانی",
        "span": [
            56,
            62
        ]
    },
    {
        "raw": "کور",
        "corrected": "کشور",
        "span": [
            84,
            87
        ]
    }
]


In [None]:
import json

result = find_possible_mistakes(input2, model, top_k=1000)
print(input2)
print(json.dumps(result, indent=4, ensure_ascii=False, skipkeys=True,))

بسیاری از مباحث علوم غیرطبیعی با استفاده از فیریک دنیای مادی ابل توجیح نیست و برای یادگیری باید به فلسفه‌های خاصی رجو کرد.
[
    {
        "raw": "فیریک",
        "corrected": "فیزیک",
        "span": [
            44,
            49
        ]
    },
    {
        "raw": "ابل",
        "corrected": "قابل",
        "span": [
            61,
            64
        ]
    },
    {
        "raw": "توجیح",
        "corrected": "توجیه",
        "span": [
            65,
            70
        ]
    },
    {
        "raw": "رجو",
        "corrected": "رجوع",
        "span": [
            114,
            117
        ]
    }
]


In [None]:
result = find_possible_mistakes(input3, model, top_k=1000)
print(input3)
print(json.dumps(result, indent=4, ensure_ascii=False, skipkeys=True,))

اما متأسفانه به قدری ساختار سارمان سینمایی و در سطح وسیع‌تر وزارت فرهنگ و ارشاد اصلامی عقب‌مانده و ناکارآمد است که عملا جلوی بهبود هر مشکلی را می‌گیرد!
[
    {
        "raw": "متأسفانه",
        "corrected": "متاسفانه",
        "span": [
            4,
            12
        ]
    },
    {
        "raw": "سارمان",
        "corrected": "سازمان",
        "span": [
            28,
            34
        ]
    },
    {
        "raw": "اصلامی",
        "corrected": "اسلامی",
        "span": [
            80,
            86
        ]
    },
    {
        "raw": "عقبمانده",
        "corrected": "نمانده",
        "span": [
            80,
            86
        ]
    },
    {
        "raw": "ناکارآمد",
        "corrected": "ناکارامد",
        "span": [
            99,
            107
        ]
    }
]


In [None]:
result = find_possible_mistakes(input4, model, top_k=1000)
print(input4)
print(json.dumps(result, indent=4, ensure_ascii=False, skipkeys=True,))

منطق جغرافیا و جئوپلیتیک همیشه ثابت است و قابل چشم‌پوسی نیست.‎
[
    {
        "raw": "جغرافیا",
        "corrected": "جاوا",
        "span": [
            5,
            12
        ]
    },
    {
        "raw": "جئوپلیتیک",
        "corrected": "ژيوپلیتیک",
        "span": [
            15,
            24
        ]
    },
    {
        "raw": "چشمپوسی",
        "corrected": "چشمپوشی",
        "span": [
            15,
            24
        ]
    },
    {
        "raw": "‎",
        "corrected": ".",
        "span": [
            61,
            62
        ]
    }
]


## Fine-tune with more data

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
%cd /content/drive/MyDrive/NLP/HW3/NLP-HW3-Resources/

import zipfile

datasets = ["cultural.zip", "economics.zip", "politics.zip", "sports.zip"]
data = []


for dataset in datasets:
    with zipfile.ZipFile(dataset) as zipper:
        with zipper.open(dataset.split(".")[0]+'.txt') as fp:
            data += fp.read().decode('utf-8').split('\n')[:500]


%cd /content/drive/MyDrive/NLP/HW3/

/content/drive/.shortcut-targets-by-id/1aAgsAE_TSLzzFsdBLSIvjyv20Iu-wLlq/NLP-HW3-Resources
/content/drive/MyDrive/NLP/HW3


In [None]:
inputs = tokenizer(data, return_tensors='pt', max_length=512, truncation=True, padding='max_length')

# create a key for labels
inputs['labels'] = inputs.input_ids.detach().clone()

# create random array of floats in equal dimension to input_ids
rand = torch.rand(inputs.input_ids.shape)
# create mask array: where the random array is less than 0.15, we set true, also where it is not CLS or SEP
mask_arr = (rand < 0.15) * (inputs.input_ids != 101) * (inputs.input_ids != 102) * (inputs.input_ids != 0)

# create selection from mask_arr
# FYI: [MASK] == 103
selection = []

for i in range(inputs.input_ids.shape[0]):
    selection.append(torch.flatten(mask_arr[i].nonzero()).tolist())
    inputs.input_ids[i, selection[i]] = 103

In [None]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)

dataset = Dataset(inputs)

In [None]:
from transformers import TrainingArguments
from transformers import Trainer
import warnings
warnings.filterwarnings("ignore")

epochs = 5
batch_size = 8

args = TrainingArguments(
    output_dir='out',
    per_device_train_batch_size=batch_size,
    num_train_epochs=epochs
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset
)

trainer.train()

***** Running training *****
  Num examples = 2000
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1250


Step,Training Loss
500,0.1376
1000,0.0223


Saving model checkpoint to out/checkpoint-500
Configuration saved in out/checkpoint-500/config.json
Model weights saved in out/checkpoint-500/pytorch_model.bin
Saving model checkpoint to out/checkpoint-1000
Configuration saved in out/checkpoint-1000/config.json
Model weights saved in out/checkpoint-1000/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1250, training_loss=0.06618174839019775, metrics={'train_runtime': 1912.7121, 'train_samples_per_second': 5.228, 'train_steps_per_second': 0.654, 'total_flos': 2634182492160000.0, 'train_loss': 0.06618174839019775, 'epoch': 5.0})

### Results after fine-tuning

In [None]:
%%capture
model.to("cpu")

In [None]:
result = find_possible_mistakes(input1, model, top_k=2000)
print(input1)
print(json.dumps(result, indent=4, ensure_ascii=False, skipkeys=True,))

پس از سال‌ها تلاش رازی موفق به کسف الکل شد. این دانشمند تیرانی باعث افتخار در تاریخ کور است.
[
    {
        "raw": "تلاش",
        "corrected": "تلال",
        "span": [
            13,
            17
        ]
    },
    {
        "raw": "رازی",
        "corrected": "روزی",
        "span": [
            18,
            22
        ]
    },
    {
        "raw": "کسف",
        "corrected": "کشف",
        "span": [
            31,
            34
        ]
    },
    {
        "raw": "الکل",
        "corrected": "الکا",
        "span": [
            35,
            39
        ]
    },
    {
        "raw": "تیرانی",
        "corrected": "ایرانی",
        "span": [
            56,
            62
        ]
    },
    {
        "raw": "کور",
        "corrected": "کشور",
        "span": [
            84,
            87
        ]
    }
]


In [None]:
result = find_possible_mistakes(input2, model, top_k=2000)
print(input2)
print(json.dumps(result, indent=4, ensure_ascii=False, skipkeys=True,))

بسیاری از مباحث علوم غیرطبیعی با استفاده از فیریک دنیای مادی ابل توجیح نیست و برای یادگیری باید به فلسفه‌های خاصی رجو کرد.
[
    {
        "raw": "فیریک",
        "corrected": "فیزیک",
        "span": [
            44,
            49
        ]
    },
    {
        "raw": "مادی",
        "corrected": "مادری",
        "span": [
            56,
            60
        ]
    },
    {
        "raw": "ابل",
        "corrected": "قابل",
        "span": [
            61,
            64
        ]
    },
    {
        "raw": "توجیح",
        "corrected": "توکیو",
        "span": [
            65,
            70
        ]
    },
    {
        "raw": "فلسفههای",
        "corrected": "جلسههای",
        "span": [
            65,
            70
        ]
    },
    {
        "raw": "رجو",
        "corrected": "رجوع",
        "span": [
            114,
            117
        ]
    }
]


<div dir=rtl>
متاسفانه نتایج بعد از یادگیری دوباره بهتر نشده اند.
</div>

# N-grams

In [None]:
from hazm import *
from google.colab import drive
import numpy as np
import os
import tqdm
from itertools import product
import math
import nltk
import json

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
%cd /content/drive/MyDrive/NLP/HW3/NLP-HW3-Resources/

import zipfile

datasets = ["cultural.zip", "economics.zip", "politics.zip", "sports.zip"]
data = []


for dataset in datasets:
    with zipfile.ZipFile(dataset) as zipper:
        with zipper.open(dataset.split(".")[0]+'.txt') as fp:
            data += fp.read().decode('utf-8').split('\n')[:100000]


%cd /content/drive/MyDrive/NLP/HW3/

/content/drive/.shortcut-targets-by-id/1aAgsAE_TSLzzFsdBLSIvjyv20Iu-wLlq/NLP-HW3-Resources
/content/drive/MyDrive/NLP/HW3


In [None]:
sentences = sent_tokenize("\n".join(data))
len(sentences)

1292562

In [None]:
normalizer = Normalizer()

sentences = [normalizer.normalize(x) for x in tqdm.tqdm(sentences)]
sentences = [word_tokenize(sent) for sent in tqdm.tqdm(sentences)]
sentences = [' '.join(x) for x in tqdm.tqdm(sentences)]

100%|██████████| 1292562/1292562 [01:57<00:00, 10992.08it/s]
100%|██████████| 1292562/1292562 [00:59<00:00, 21743.79it/s]
100%|██████████| 1292562/1292562 [00:02<00:00, 504704.31it/s]


In [None]:
class LanguageModel(object):

    SOS = "<s>"
    EOS = "</s>"
    UNK = "<UNK>"
    
    def __init__(self, train_data, n, laplace=1):
        self.n = n
        self.vocab = dict()
        self.laplace = laplace
        self.tokens = self.preprocess(train_data, n)
        self.vocab  = nltk.FreqDist(self.tokens)
        self.model  = self._create_model()
        self.masks  = list(reversed(list(product((0,1), repeat=n))))



    def _smooth(self):
        vocab_size = len(self.vocab)

        n_grams = nltk.ngrams(self.tokens, self.n)
        n_vocab = nltk.FreqDist(n_grams)

        m_grams = nltk.ngrams(self.tokens, self.n-1)
        m_vocab = nltk.FreqDist(m_grams)

        def smoothed_count(n_gram, n_count):
            m_gram = n_gram[:-1]
            m_count = m_vocab[m_gram]
            return (n_count + self.laplace) / (m_count + self.laplace * vocab_size)

        return { n_gram: smoothed_count(n_gram, count) for n_gram, count in n_vocab.items() }



    def _create_model(self):
        if self.n == 1:
            num_tokens = len(self.tokens)
            return { (unigram,): count / num_tokens for unigram, count in self.vocab.items() }
        else:
            return self._smooth()



    def _convert_oov(self, ngram):
        mask = lambda ngram, bitmask: tuple((token if flag == 1 else "<UNK>" for token,flag in zip(ngram, bitmask)))

        ngram = (ngram,) if type(ngram) is str else ngram
        for possible_known in [mask(ngram, bitmask) for bitmask in self.masks]:
            if possible_known in self.model:
                return possible_known



    def perplexity(self, test_data):
        test_tokens = self.preprocess(test_data, self.n)
        test_ngrams = nltk.ngrams(test_tokens, self.n)
        N = len(test_tokens)

        known_ngrams  = [self._convert_oov(ngram) for ngram in test_ngrams]
        probabilities = [self.model[ngram] for ngram in known_ngrams]
        
        # for x,y in zip(known_ngrams, probabilities):
        #     print(x,y)
        
        return math.exp((-1/N) * sum(map(math.log, probabilities)))



    def _best_candidate(self, prev, k=10, without=[]):
        
        blacklist  = [LanguageModel.UNK] + without
        # if len(prev) < self.n:
        #     prev = [LanguageModel.SOS]*(self.n-1)

        candidates = list(((ngram[-1],prob) for ngram,prob in self.model.items() if ngram[:-1]==tuple(prev)))

        probs = [y for x,y in candidates]
        probs = probs/np.sum(probs)
        words = [x for x,y in candidates]

        candidates.sort(key = lambda x: -x[1])
        return candidates[:k]


    def preprocess(self, sentences, n):
        sentences = self.add_sentence_tokens(sentences, n)
        tokens = ' '.join(sentences).split()
        tokens = self.replace_singletons(tokens)
        return tokens


    def add_sentence_tokens(self, sentences, n):
        sos = ' '.join([LanguageModel.SOS] * (n-1)) if n > 1 else LanguageModel.SOS
        return ['{} {} {}'.format(sos, s, LanguageModel.EOS) for s in sentences]


    def replace_singletons(self, tokens):
        if len(self.vocab) == 0:
            self.vocab = nltk.FreqDist(tokens)
        return [token if self.vocab[token] > 1 else LanguageModel.UNK for token in tokens]

In [None]:
language_model = LanguageModel(sentences, 3, 1)

## Test using edit distance

In [None]:
from nltk import probability
from torch.nn import functional as F
import torch
import Levenshtein
import re
from hazm import *


def find_possible_mistakes_ngram(inp, top_k=20):
    mistakes = []
    tokens = word_tokenize(inp)
    tokens = 2 * ["<s>"] + tokens

    for i in range(len(tokens)-2):
        tops = language_model._best_candidate([tokens[i], tokens[i+1]], top_k)

        least_dist = float("inf")
        corrected_word = tokens[i+2]

        for word, prob in tops:
            dist = Levenshtein.distance(tokens[i+2], word)
            if dist < least_dist:
                corrected_word = word
                least_dist = dist

        if tokens[i+2] != corrected_word:
            for reg in re.finditer(tokens[i+2], inp):
                s, e = reg.start(), reg.end()
            mistakes.append({"raw": tokens[i+2], "corrected": corrected_word, "span": [s, e]})

    return mistakes

In [None]:
result = find_possible_mistakes_ngram(input1, top_k=10)
print(input1)
print(json.dumps(result, indent=4, ensure_ascii=False, skipkeys=True,))

پس از سال‌ها تلاش رازی موفق به کسف الکل شد. این دانشمند تیرانی باعث افتخار در تاریخ کور است.
[
    {
        "raw": "پس",
        "corrected": "در",
        "span": [
            0,
            2
        ]
    },
    {
        "raw": "سال‌ها",
        "corrected": "این",
        "span": [
            6,
            12
        ]
    },
    {
        "raw": "تلاش",
        "corrected": "پیش",
        "span": [
            13,
            17
        ]
    },
    {
        "raw": "رازی",
        "corrected": "برای",
        "span": [
            18,
            22
        ]
    },
    {
        "raw": "کسف",
        "corrected": "کسب",
        "span": [
            31,
            34
        ]
    },
    {
        "raw": "دانشمند",
        "corrected": "کارشناس",
        "span": [
            48,
            55
        ]
    },
    {
        "raw": "تیرانی",
        "corrected": "در",
        "span": [
            56,
            62
        ]
    },
    {
        "raw": "در",
        "corr

In [None]:
result = find_possible_mistakes_ngram(input2, top_k=100)
print(input2)
print(json.dumps(result, indent=4, ensure_ascii=False, skipkeys=True,))

بسیاری از مباحث علوم غیرطبیعی با استفاده از فیریک دنیای مادی ابل توجیح نیست و برای یادگیری باید به فلسفه‌های خاصی رجو کرد.
[
    {
        "raw": "مباحث",
        "corrected": "موارد",
        "span": [
            10,
            15
        ]
    },
    {
        "raw": "علوم",
        "corrected": "علمی",
        "span": [
            16,
            20
        ]
    },
    {
        "raw": "فیریک",
        "corrected": "یک",
        "span": [
            44,
            49
        ]
    },
    {
        "raw": "ابل",
        "corrected": "به",
        "span": [
            61,
            64
        ]
    },
    {
        "raw": "یادگیری",
        "corrected": "جلوگیری",
        "span": [
            83,
            90
        ]
    },
    {
        "raw": "باید",
        "corrected": "این",
        "span": [
            91,
            95
        ]
    },
    {
        "raw": "به",
        "corrected": "صبر",
        "span": [
            96,
            98
        ]
    },
    {
 

In [None]:
result = find_possible_mistakes_ngram(input3, top_k=100)
print(input3)
print(json.dumps(result, indent=4, ensure_ascii=False, skipkeys=True,))

اما متأسفانه به قدری ساختار سارمان سینمایی و در سطح وسیع‌تر وزارت فرهنگ و ارشاد اصلامی عقب‌مانده و ناکارآمد است که عملا جلوی بهبود هر مشکلی را می‌گیرد!
[
    {
        "raw": "سارمان",
        "corrected": "سازمان",
        "span": [
            28,
            34
        ]
    },
    {
        "raw": "اصلامی",
        "corrected": "اسلامی",
        "span": [
            80,
            86
        ]
    },
    {
        "raw": "عملا",
        "corrected": "ما",
        "span": [
            115,
            119
        ]
    }
]


In [None]:
result = find_possible_mistakes_ngram(input4, top_k=100)
print(input4)
print(json.dumps(result, indent=4, ensure_ascii=False, skipkeys=True,))

منطق جغرافیا و جئوپلیتیک همیشه ثابت است و قابل چشم‌پوسی نیست.‎
[
    {
        "raw": "منطق",
        "corrected": "من",
        "span": [
            0,
            4
        ]
    },
    {
        "raw": "جئوپلیتیک",
        "corrected": "ژئوپلیتیک",
        "span": [
            15,
            24
        ]
    },
    {
        "raw": "قابل",
        "corrected": "این",
        "span": [
            42,
            46
        ]
    },
    {
        "raw": "چشم‌پوسی",
        "corrected": "چشم‌پوشی",
        "span": [
            47,
            55
        ]
    },
    {
        "raw": "‎",
        "corrected": "»",
        "span": [
            61,
            62
        ]
    }
]


# CBOW

## CBOW from Scratch

### inatll requirements

In [None]:
! pip install hazm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from hazm import *
import tqdm
import torch
import torch.nn as nn

### Data Loading

In [None]:
from google.colab import drive
import os

In [None]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
%cd /content/drive/MyDrive/NLP_HW3/Unzipped

/content/drive/MyDrive/NLP_HW3/Unzipped


In [None]:
# ! ls

In [None]:
cultural = ''
economics = ''
politics = ''
sports = ''
# with open('cultural.txt') as f:
#   cultural = f.read()
# with open('economics.txt') as f:
#   economics = f.read()
with open('processd_data.txt') as f:
  politics = f.read()
# with open('sports.txt') as f:
#   sports = f.read()


In [None]:
stopwords = []
with open('stopwords.txt') as f:
  stopwords = f.read().split()

### Preprocessing

In [None]:
# cultural_sents = sent_tokenize(cultural)
# economics_sents = sent_tokenize(economics)
politics_sents = sent_tokenize(politics)[:1000]
# sports_sents = sent_tokenize(sports)

In [None]:
politics_sents = list(set(politics_sents))

In [None]:
len(politics_sents)

1000

In [None]:
# cultural_sents1 = cultural_sents[:200]
# cultural_sents2 = cultural_sents[200:400]

In [None]:
# politics_sents = list(set(sent_tokenize(politics)))

In [None]:
# pol_sents = []
# for sent in politics_sents:
#   if sent not in pol_sents:
#     pol_sents.append(sent)


In [None]:
normalizer = Normalizer()
all_words = []
# all_words2 = []

for i in range(len(politics_sents)):
  all_words += word_tokenize(normalizer.normalize(politics_sents[i]))

# for i in range(len(cultural_sents2)):
#   all_words2 += word_tokenize(normalizer.normalize(cultural_sents2[i]))  

# for i in range(len(economics_sents)):
#   all_words += word_tokenize(normalizer.normalize(economics_sents[i]))
  
# for i in range(len(politics_sents)):
#   all_words += word_tokenize(normalizer.normalize(politics_sents[i]))
  
# for i in range(len(sports_sents)):
#   all_words += word_tokenize(normalizer.normalize(sports_sents[i]))
  


In [None]:
len(all_words)

29627

In [None]:
words = [t for t in tqdm.tqdm(all_words) if t not in stopwords]
# words2 = [t for t in tqdm.tqdm(all_words2) if t not in stopwords]

100%|██████████| 29627/29627 [00:00<00:00, 1345583.59it/s]


### Training Model

In [None]:
CONTEXT_SIZE = 4  # 4 words to the left, 4 to the right
EMDEDDING_DIM = 100


data = []
for i in range(CONTEXT_SIZE, len(words) - CONTEXT_SIZE):
  context = []
  for j in range(-CONTEXT_SIZE, CONTEXT_SIZE+1, 1):
    if j == 0:
      continue
    context.append(words[i+j])
  target = words[i]
  data.append((context, target))

In [None]:
len(data)

21126

In [None]:
for i in range(CONTEXT_SIZE, len(words2) - CONTEXT_SIZE):
  context = []
  for j in range(-CONTEXT_SIZE, CONTEXT_SIZE+1, 1):
    if j == 0:
      continue
    context.append(words2[i+j])
  target = words2[i]
  data.append((context, target))

In [None]:
len(data)

10126

In [None]:
vocab = set(words)
vocab_size = len(vocab)

word_to_ix = {word:ix for ix, word in enumerate(vocab)}
ix_to_word = {ix:word for ix, word in enumerate(vocab)}


def make_context_vector(context, word_to_ix):
    idxs = [word_to_ix[w] for w in context]
    return torch.tensor(idxs, dtype=torch.long)



class CBOW(torch.nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(CBOW, self).__init__()

        #out: 1 x emdedding_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(embedding_dim, 128)
        self.activation_function1 = nn.ReLU()
        
        #out: 1 x vocab_size
        self.linear2 = nn.Linear(128, vocab_size)
        self.activation_function2 = nn.LogSoftmax(dim = -1)
        

    def forward(self, inputs):
        embeds = sum(self.embeddings(inputs)).view(1,-1)
        out = self.linear1(embeds)
        out = self.activation_function1(out)
        out = self.linear2(out)
        out = self.activation_function2(out)
        return out

    def get_word_emdedding(self, word):
        word = torch.tensor([word_to_ix[word]])
        return self.embeddings(word).view(1,-1)


model = CBOW(vocab_size, EMDEDDING_DIM)

loss_function = nn.NLLLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

#TRAINING
for epoch in range(20):
  total_loss = 0
  for context, target in data:
    context_vector = make_context_vector(context, word_to_ix)  
    log_probs = model(context_vector)
    total_loss += loss_function(log_probs, torch.tensor([word_to_ix[target]]))

  #optimize at the end of each epoch
  optimizer.zero_grad()
  total_loss.backward()
  optimizer.step()


In [None]:

#TESTING
context = ['محقق', 'کشف', 'تاریخ', 'افتخار', 'کشور', 'تلاش', 'باعث', 'نیست']
context_vector = make_context_vector(context, word_to_ix)
a = model(context_vector)

#Print result
print(f'Context: {context}\n')
print(f'Prediction: {ix_to_word[torch.argmax(a[0]).item()]}')

Context: ['محقق', 'کشف', 'تاریخ', 'افتخار', 'کشور', 'تلاش', 'باعث', 'نیست']

Prediction: توفانی


## Pre-trained CBOW

In [None]:
!git clone https://github.com/facebookresearch/fastText.git 

Cloning into 'fastText'...
remote: Enumerating objects: 3930, done.[K
remote: Counting objects: 100% (76/76), done.[K
remote: Compressing objects: 100% (29/29), done.[K
remote: Total 3930 (delta 29), reused 70 (delta 29), pack-reused 3854[K
Receiving objects: 100% (3930/3930), 8.33 MiB | 32.07 MiB/s, done.
Resolving deltas: 100% (2446/2446), done.


In [None]:
!cd fastText && pip install .

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Processing /content/fastText
[33m  DEPRECATION: A future pip version will change local packages to be built in-place without first copying to a temporary directory. We recommend you use --use-feature=in-tree-build to test your packages with this new behavior before it becomes the default.
   pip 21.3 will remove support for this functionality. You can find discussion regarding this at https://github.com/pypa/pip/issues/7555.[0m
Collecting pybind11>=2.2
  Using cached pybind11-2.9.2-py2.py3-none-any.whl (213 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp37-cp37m-linux_x86_64.whl size=3137162 sha256=1156b48203ba4a46f94179495d373a44a956809cbc4676556ae9fa0ed79f103b
  Stored in directory: /tmp/pip-ephem-wheel-cache-9smi8ed3/wheels/22/04/6e/b3aba25c1a5845898b5871a0df37c

In [None]:
SKIPGRAM_MODEL_FILE_ID = '1wPnMG9_GNUVdSgbznQziQc5nMWI3QKNz'
CBOW_MODEL_FILE_ID = '1cQP10CGV6kAwmRuESJ5RTsgHq5TveXwV'

In [None]:
!gdown --id $CBOW_MODEL_FILE_ID 

Downloading...
From: https://drive.google.com/uc?id=1cQP10CGV6kAwmRuESJ5RTsgHq5TveXwV
To: /content/farsi-dedup-cbow.bin
100% 4.37G/4.37G [00:36<00:00, 119MB/s]


In [None]:
!gdown --id $SKIPGRAM_MODEL_FILE_ID 

Downloading...
From: https://drive.google.com/uc?id=1wPnMG9_GNUVdSgbznQziQc5nMWI3QKNz
To: /content/farsi-dedup-skipgram.bin
100% 4.37G/4.37G [00:35<00:00, 124MB/s]


In [None]:
import fasttext.util
# loading the Model
# model_cbow = fasttext.load_model('farsi-dedup-cbow.bin')
model_skipgram = fasttext.load_model('farsi-dedup-skipgram.bin')

In [None]:
model_skipgram.get_analogies('دانشمند', 'کشور', 'افتخار')

[(0.670722246170044, 'دانشمندم.'),
 (0.6568266749382019, 'دانشمندم'),
 (0.6276395916938782, 'مخترع.'),
 (0.6273905634880066, 'دانشمندی'),
 (0.6188674569129944, 'دانشمندم،'),
 (0.6125983595848083, 'دانشمندو'),
 (0.6124367713928223, 'افتخارلی'),
 (0.5998932123184204, 'افتخارکن'),
 (0.5969136357307434, 'ستودنش'),
 (0.5944597721099854, 'پروفسورر')]

## Fasttext CBOW

In [None]:
! pip install fasttext

In [None]:
! pip install levenshtein

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import fasttext
import Levenshtein

In [None]:
model = fasttext.train_unsupervised('data/fil9', "cbow")

In [None]:
sents = sent_tokenize(politics)

sents1 = sents[:len(sents)//2]
sents2 = sents[len(sents)//2:]   

normalizer = Normalizer()
words = []
with open('processd_data.txt', 'w') as f:
  for i in range(len(sents1)):
    words = word_tokenize(normalizer.normalize(sents1[i]))
    words = [w for w in words if w not in [',', ':', '"', '؛', '-', '»', 'ـ', '»']]
    s = ' '.join(words)
    f.write(s + '\n')


# with open('data.txt', 'w') as f:
#   for sent in result:
#     f.write(sent + '\n')  

In [None]:
with open('processd_data.txt', 'w') as f:
  for i in range(len(sents2)):
    words = word_tokenize(normalizer.normalize(sents2[i]))
    words = [w for w in words if w not in [',', ':', '"', '؛', '-', '»', 'ـ', '»']]
    s = ' '.join(words)
    f.write(s + '\n')


In [None]:
model = fasttext.train_unsupervised('processd_data.txt', model='cbow')

In [None]:
model.get_nearest_neighbors('علوم',50)

In [None]:
def get_target_word(context, model, target): # with intersection
  intersection = model.get_nearest_neighbors(context[0], 100)
  for w in context[1:]:
    neighbors = model.get_nearest_neighbors(w, 100)
    intersection = [value for value in intersection if value in neighbors]
  edit_dist = []
  for w in intersection:
    edit_dist.append(Levenshtein.distance(w, target))
  if len(edit_dist) == 0:
    return target
  minimum = min(edit_dist)
  index = edit_dist.index(minimum)
  return intersection[index]

In [None]:
def get_target_word(context, model, target): # with union
  union = []
  for w in context:
    neighbors = model.get_nearest_neighbors(w, 50)
    union += neighbors
  edit_dist = []
  for w in union:
    if w is not str:
      continue
    # try:
    edit_dist.append(Levenshtein.distance(w, target))
    # except:
    #   print(w)
    #   print(target)
  if len(edit_dist) == 0:
    return target
  minimum = min(edit_dist)
  index = edit_dist.index(minimum)
  return union[index]

In [None]:
sent = 'بسیاری از مباحث علوم طبیعی با استفاده از فیریک دنیای مادی ابل توجیح نیست و برای یادگیری باید به فلسفه های خاصی رجو کرد.'
words = word_tokenize(sent)

context_size = 3 
new_sent = words
for i in range(context_size, len(new_sent) - context_size):
  context = []
  for j in range(-context_size, context_size+1, 1):
    if j == 0:
      continue
    context.append(new_sent[i+j])
  target = new_sent[i]
  new_sent[i] = get_target_word(context, model, target)

print(new_sent)

['بسیاری', 'از', 'مباحث', 'علوم', 'طبیعی', 'با', 'استفاده', 'از', 'فیریک', 'دنیای', 'مادی', 'ابل', 'توجیح', 'نیست', 'و', 'برای', 'یادگیری', 'باید', 'به', 'فلسفه', 'های', 'خاصی', 'رجو', 'کرد', '.']


# References

* https://towardsdatascience.com/masked-language-modelling-with-bert-7d49793e5d2c
* [Huggingface Bert Model](https://huggingface.co/HooshvareLab/bert-fa-base-uncased?text=%D8%A7%DB%8C%D9%86+%D8%AF%D8%A7%D9%86%D8%B4%D9%85%D9%86%D8%AF+%5BMASK%5D+%D8%A8%D8%A7%D8%B9%D8%AB+%D8%A7%D9%81%D8%AA%D8%AE%D8%A7%D8%B1+%D8%A7%D8%B3%D8%AA.)
* https://github.com/language-ml/2-nlp-language-modeling/blob/main/1-Ngram-LanguageModeling-Persian.ipynb
* ...