In [1]:
from gensim.models import Word2Vec, FastText
import parsivar
import json
import pickle
import itertools

In [2]:
normalizer = parsivar.Normalizer()
tokenizer = parsivar.Tokenizer()

# load data

In [2]:
f = open("test.txt", "r", encoding='utf-8')
number_of_chars = len(f.read())
print(f'{number_of_chars} characters')
f.close()
f = open("test.txt", "r", encoding='utf-8')
number_of_lines = len(f.readlines())
print(f'{number_of_lines} lines')
f.close()

237915404 characters
2106953 lines


In [None]:
def get_sentences():
    sentences = []
    with open("test.txt", "r", encoding='utf-8') as t:
        while True:
            line = t.readline()
            if line == "":
                break
            sentences.append(tokenizer.tokenize_words(line))
    return sentences

In [4]:
%%time
sentences = get_sentences()
# print(len(sentences))
# 4GB of RAM

Wall time: 20.7 s


# fastText model

In [2]:
FastText_model = FastText.load("FastTextModel")
# 3.5GB RAM

In [None]:
%%time
FastText_model = FastText(vector_size=300, window=5, min_count=1, sentences=sentences, epochs=5)

In [None]:
FastText_model.wv.most_similar(FastText_model.wv['شاه'])

In [None]:
FastText_model.save("FastTextModel")

# Word2Vec model

In [3]:
word2vec_model = Word2Vec.load("Word2VecModel")
# 1GB RAM

In [None]:
%%time
word2vec_model = Word2Vec(vector_size=300, window=5, min_count=1, sentences=sentences, epochs=5)

In [41]:
word2vec_model.wv.most_similar(word2vec_model.wv['مرد'], topn=10)

[('مرد', 1.0),
 ('زن', 0.6828464865684509),
 ('شوهر', 0.6612272262573242),
 ('مردی', 0.6603354811668396),
 ('دختر', 0.6036628484725952),
 ('دختری', 0.5984135270118713),
 ('میانسال', 0.5959803462028503),
 ('شوهری', 0.5932475328445435),
 ('پیرزن', 0.5812994837760925),
 ('پیرمرد', 0.5786240100860596)]

In [34]:
word2vec_model.wv.distance("شاه", "ملکه")

0.524875283241272

In [None]:
word2vec_model.save("Word2VecModel")

# Build Ngram Model

In [4]:
def words_to_tuple(words, model):
    return tuple([model.wv.key_to_index[words[i]] for i in range(len(words))])

# seems some of the words in the text file is not present in model key_to_index
def build_ngrams(n):
    ngram_counts = {}
    with open("test.txt", "r", encoding='utf-8') as t:
        while True:
            line = t.readline()
            if line == "":
                break
            words = tokenizer.tokenize_words(line)
            for i in range(max(len(words)-n+1, 0)):
                try:
#                     pre_words_indices = tuple([word2vec_model.wv.key_to_index[words[i+j]] for j in range(n)])
                    pre_words_indices = words_to_tuple(words[i:i+n], word2vec_model)
                    if pre_words_indices in ngram_counts:
                        ngram_counts[pre_words_indices] += 1
                    else:
                        ngram_counts[pre_words_indices] = 0
                except:
                    pass
    return ngram_counts

In [33]:
%%time
word_counts = build_ngrams(1)
word_counts["all"] = sum(word_counts.values())
# 100MB of RAM
biword_counts = build_ngrams(2)
biword_counts["all"] = sum(biword_counts.values())
# 700MB of RAM
threeword_counts = build_ngrams(3)
threeword_counts["all"] = sum(threeword_counts.values())
# 1.6GB of RAM
fourword_counts = build_ngrams(4)
fourword_counts["all"] = sum(fourword_counts.values())
# 3.5GB of RAM

Wall time: 5min 44s


In [17]:
with open('word counts.json', 'wb') as f:
    pickle.dump(word_counts, f)
with open('bi-grams.json', 'wb') as f:
    pickle.dump(biword_counts, f)
with open('three-grams.json', 'wb') as f:
    pickle.dump(threeword_counts, f)
with open('four-grams.json', 'wb') as f:
    pickle.dump(fourword_counts, f)

In [18]:
print(f'words:       {len(word_counts)}')
print(f'bi-grams:    {len(biword_counts)}')
print(f'three-grams: {len(threeword_counts)}')
print(f'four-grams:  {len(fourword_counts)}')

words:       347680
bi-grams:    6764724
three-grams: 20208193
four-grams:  29157652


In [5]:
with open('word counts.json', 'rb') as f:
    word_counts = pickle.load(f)
    f.close()
with open('bi-grams.json', 'rb') as f:
    biword_counts = pickle.load(f)
    f.close()
with open('three-grams.json', 'rb') as f:
    threeword_counts = pickle.load(f)
    f.close()
# with open('four-grams.json', 'rb') as f:
#     fourword_counts = pickle.load(f)
#     f.close()

# test

In [6]:
import editdistance

In [7]:
def cal_score(model, query_words, proposed_words, word_counts, biword_counts, threeword_counts, fourword_counts=None, weights=[1,1,1,1]):
    S0_scores = []
    S1_scores = []
    S2_scores = []
    S3_scores = []
    for i in range(len(proposed_words)):
        # OOV penalty?
        counts = word_counts.get((model.wv.key_to_index.get(proposed_words[i], None), ), None)
        if counts is None:
            S0_scores.append(1/word_counts["all"])
        else:
            S0_scores.append(counts/word_counts["all"])
        if i >= 1:
            key = (model.wv.key_to_index.get(proposed_words[i-1], None), model.wv.key_to_index.get(proposed_words[i], None))
            counts = biword_counts.get(key, None)
            if counts is None:
                S1_scores.append(1/biword_counts["all"])
            else:
                key = (model.wv.key_to_index.get(proposed_words[i-1], None), )
                S1_scores.append(counts/word_counts[key])
        
        if i >= 2:
            key = (model.wv.key_to_index.get(proposed_words[i-2], None), model.wv.key_to_index.get(proposed_words[i-1], None), model.wv.key_to_index.get(proposed_words[i], None))
            counts = threeword_counts.get(key, None)
            if counts is None:
                S2_scores.append(1/threeword_counts["all"])
            else:
                key = (model.wv.key_to_index.get(proposed_words[i-2], None), model.wv.key_to_index.get(proposed_words[i-1], None))
                S2_scores.append(counts/biword_counts[key])
        
#         if i >= 3:
#             key = (model.wv.key_to_index.get(proposed_words[i-3], None), model.wv.key_to_index.get(proposed_words[i-2], None),
#                    model.wv.key_to_index.get(proposed_words[i-1], None), model.wv.key_to_index.get(proposed_words[i], None))
#             counts = fourword_counts.get(key, None)
#             if counts is None:
#                 S3_scores.append(1/fourword_counts["all"])
#             else:
#                 key = (model.wv.key_to_index.get(proposed_words[i-3], None), model.wv.key_to_index.get(proposed_words[i-2], None),
#                        model.wv.key_to_index.get(proposed_words[i-1], None))
#                 S3_scores.append(counts/threeword_counts[key])

    return weights[0]*sum(S0_scores)+weights[1]*sum(S1_scores)+weights[2]*sum(S2_scores)

In [9]:
query_words = ["جلوم", "وزیر"]
proposed_words = ["غلوم", "وزیر"]
print(cal_score(word2vec_model, query_words, proposed_words, word_counts, biword_counts, threeword_counts))
print(cal_score(word2vec_model, query_words, query_words, word_counts, biword_counts, threeword_counts))

0.0006941245437847378
0.0006941011088625792


In [10]:
def cal_word_candidates(model, query_words):
    res = [[x for x in model.wv.key_to_index if editdistance.eval(x, query_words[i]) <= 1] for i in range(len(query_words))]
    return list(itertools.product(*res))

In [11]:
def correct_answere(model, query):
    query_words = tokenizer.tokenize_words(normalizer.normalize(query))
    candidates = cal_word_candidates(model, query_words)
    
    res = ""
    score = 0
    for candidate in candidates:
        temp_score = cal_score(word2vec_model, query_words, candidate, word_counts, biword_counts, threeword_counts)
        if temp_score > score:
            score = temp_score
            res = candidate 
    return res

In [28]:
%%time
correct_answere(word2vec_model, "غله دماوند")

Wall time: 1.57 s


('قله', 'دماوند')

In [None]:
def edit_distance(a, b):
    if len(a) == 0:
        return len(b)
    if len(b) == 0:
        return len(a)
    if a[0] == b[0]:
        return edit_distance(a[1:], b[1:])
    else:
        return 1+(min(edit_distance(a, b[1:]), edit_distance(a[1:], b)))