# Ngram model with cosine similarities

In [1]:
!pip install gdown

Collecting gdown
  Downloading gdown-4.7.1-py3-none-any.whl (15 kB)
Installing collected packages: gdown
Successfully installed gdown-4.7.1


In [8]:
!rm /kaggle/working/yoruba.csv

In [9]:
import gdown

# Google Drive source and the local file name the download is saved under.
url = 'https://drive.google.com/uc?id=1WgQ6v9fuEdMn8My6ELXfeVaSJ7asAaUJ'
output = 'yoruba.txt'

# quiet=False keeps the progress bar visible; the call's value (the saved
# file name) is what the cell displays.
gdown.download(url=url, output=output, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1WgQ6v9fuEdMn8My6ELXfeVaSJ7asAaUJ
To: /kaggle/working/yoruba.txt
100%|██████████| 70.2M/70.2M [00:00<00:00, 236MB/s]


'yoruba.txt'

In [54]:
from IPython.display import FileLink
# Show a download link for the embeddings file; the name matches what
# generate_ngram_model saves when ngram_n is 3.
FileLink('embedings-ngram-3.pt')


In [53]:
# Show a download link for the n-gram list that generate_ngram_model writes
# next to the embeddings file.
FileLink('tokenized.txt')

In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [4]:
!pip install sentence_transformers



In [5]:
import torch
from sentence_transformers import SentenceTransformer, util
# Module-level encoder shared by the whole notebook; create_embedder calls
# model.encode on it for both the stored n-grams and the query text.
model = SentenceTransformer('bert-base-multilingual-uncased')




In [6]:
def build_ngrams(text, n):
    """Return the word n-grams of ``text`` as space-joined strings.

    The text is split on whitespace. When it has more than ``n`` words, every
    window of ``n`` consecutive words is returned in order. Otherwise the
    whole text (re-joined with single spaces) is returned as the only entry,
    so the result always contains at least one string.
    """
    tokens = text.split()
    if len(tokens) > n:
        window_count = len(tokens) - n + 1
        return [' '.join(tokens[start:start + n]) for start in range(window_count)]
    # Too short for a sliding window: keep the text as a single entry.
    return [' '.join(tokens)]

In [7]:
def preprocess_text(text):
    """Normalise a piece of raw text before n-gram extraction.

    Steps, in order:
      1. lower-case the text;
      2. delete the byte-order mark and curly double quotes;
      3. delete ASCII punctuation (the apostrophe is deliberately kept);
      4. collapse every run of whitespace into a single space and trim.

    Newlines, tabs and carriage returns are treated as word separators.
    They used to be deleted outright, which fused the words on either side
    (``"a\\tb"`` became ``"ab"``).

    Returns the cleaned string, which is empty when nothing is left.
    """
    text = text.lower()
    for unwanted in ('\ufeff', '“', '”'):
        text = text.replace(unwanted, '')
    # Whitespace is intentionally NOT in the deletion set: split() below
    # turns any whitespace run into a single separating space.
    text = text.translate(str.maketrans('', '', '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~'))
    return ' '.join(text.split())

In [8]:
def read_dataset(*links):
    """Read one or more UTF-8 text files and return their cleaned lines.

    Each file is processed line by line: every line goes through
    ``preprocess_text`` and is kept only if something other than whitespace
    remains. Lines from all files are returned in a single list, in file
    order and then line order.
    """
    cleaned_lines = []
    for path in links:
        with open(path, 'r', encoding='utf-8') as handle:
            candidates = (preprocess_text(raw_line) for raw_line in handle)
            cleaned_lines.extend(line for line in candidates if line.strip() != '')
    return cleaned_lines

In [9]:
def gather_ngrams(ds, n):
    """Collect the n-grams of every text in ``ds`` into one flat list.

    ``build_ngrams`` is applied to each text in order and the results are
    concatenated; duplicates are kept.
    """
    return [gram for text in ds for gram in build_ngrams(text, n)]

In [10]:
def create_embedder(phrases):
    """Encode ``phrases`` with the module-level ``model`` and L2-normalise them.

    Returns a torch tensor in which every vector along the last dimension has
    unit L2 norm, so a dot product between two rows is their cosine
    similarity. Presumably one row per phrase — confirm against the encoder.
    """
    vectors = torch.tensor(model.encode(phrases))
    # keepdim=True keeps the norms broadcastable against the vectors.
    lengths = vectors.norm(p=2, dim=-1, keepdim=True)
    return vectors / lengths

In [51]:
def generate_ngram_model(ngrams, ngram_n):
    """Embed ``ngrams`` and persist both the embeddings and the n-gram list.

    Writes two files into the current working directory:
      * ``embedings-ngram-{ngram_n}.pt`` — the tensor returned by
        ``create_embedder(ngrams)``, saved with ``torch.save``;
      * ``tokenized.txt`` — ``str(ngrams)``, i.e. a Python list literal.

    Row ``i`` of the saved tensor corresponds to ``ngrams[i]``, so the two
    files must always be regenerated together.
    """
    embedings = create_embedder(ngrams)
    torch.save(embedings, f'embedings-ngram-{ngram_n}.pt')
    # The n-grams contain Yoruba diacritics; write UTF-8 explicitly rather
    # than relying on the platform's default encoding.
    with open('tokenized.txt', 'w', encoding='utf-8') as ngram_file:
        ngram_file.write(str(ngrams))
    print(f'created embedings-ngram-{ngram_n}.pt and tokenized.txt')
    
# generate_ngram_model(ngrams, 3)

In [12]:
def predict(text, ngram_n, ngrams):
    """Score ``text`` against every stored n-gram embedding.

    The query is embedded with ``create_embedder`` and multiplied against the
    tensor loaded from ``/kaggle/working/embedings-ngram-{ngram_n}.pt``. The
    query is unit-normalised, so the scores are cosine similarities provided
    the stored rows are unit-normalised too (true for files written by
    ``generate_ngram_model``).

    ``ngrams`` is accepted for call compatibility but is not used here.

    Returns a 1-D tensor with one score per stored embedding row.
    """
    query_vec = create_embedder([text])

    print('loading_embbedings')
    stored_vecs = torch.load(f'/kaggle/working/embedings-ngram-{ngram_n}.pt')
    print('loading_complete')

    # One query gives a single-column score matrix; return that column.
    scores = torch.matmul(stored_vecs, query_vec.t())
    return scores[:, 0]

In [13]:
def final_result(text, ngram_n):
    """Return the stored n-gram most similar to ``text``.

    The candidate n-grams are rebuilt from the Bible corpus file, de-duplicated,
    and scored with ``predict``.

    Returns a dict with keys ``"text"`` (the best n-gram), ``"index"`` (its
    position, a 0-d tensor) and ``"similarity"`` (its score, a 0-d tensor).
    """
    ds = read_dataset('/kaggle/input/ngram-yoruba/bibeli_Mimo.txt')
    # NOTE(review): set() ordering of strings is not stable across interpreter
    # sessions, so this list is only aligned with the saved embedding rows if
    # both were produced from the same list in the same session. Prefer the
    # n-gram list persisted in tokenized.txt (see final_ngram_result).
    ngrams = list(set(gather_ngrams(ds, ngram_n)))
    sims = predict(text, ngram_n, ngrams)
    # Tensor ops instead of the built-in max(), which walks the tensor one
    # element at a time in Python.
    index_max = sims.argmax()
    greatest = sims[index_max]
    return {"text": ngrams[int(index_max)], "index" : index_max,'similarity': greatest}

In [14]:
def final_n_result(text, ngram_n, no_of_results):
    """Return the ``no_of_results`` stored n-grams most similar to ``text``.

    The candidate n-grams are rebuilt from the Bible corpus file, de-duplicated,
    and scored with ``predict``. If more results are requested than there are
    candidates, all candidates are returned (the same capping that
    ``final_ngram_result`` applies).

    Returns a list of dicts with keys ``"text"``, ``"index"`` and
    ``"similarity"``. The list holds the top matches but is NOT sorted by
    similarity.

    Raises:
        ValueError: if ``no_of_results`` is below one.
    """
    if no_of_results < 1:
        # ValueError is a subclass of Exception, so existing handlers still work.
        raise ValueError("Sorry, no numbers below one")
    ds = read_dataset('/kaggle/input/ngram-yoruba/bibeli_Mimo.txt')
    # NOTE(review): set() ordering of strings is not stable across interpreter
    # sessions, so this list is only aligned with the saved embedding rows if
    # both were produced from the same list in the same session.
    ngrams = list(set(gather_ngrams(ds, ngram_n)))
    sims = predict(text, ngram_n, ngrams)
    # np.argpartition raises when asked for more elements than exist.
    no_of_results = min(no_of_results, len(ngrams), len(sims))
    if no_of_results == 0:
        return []
    ind = np.argpartition(sims, -no_of_results)[-no_of_results:]
    return [
        {"text": ngrams[int(i)], "index" : i,'similarity': sims[i]}
        for i in ind
    ]

In [69]:
def final_ngram_result(text,ngrams, ngram_n, no_of_results):
    """Return up to ``no_of_results`` n-grams from ``ngrams`` most similar to ``text``.

    ``ngrams`` must be the list whose order matches the rows of the saved
    embeddings file for ``ngram_n`` (e.g. the list loaded from tokenized.txt).
    The request is capped at ``len(ngrams)``.

    Returns a list of dicts with keys ``"text"``, ``"index"`` and
    ``"similarity"``. The list holds the top matches but is NOT sorted by
    similarity. An empty list is returned when ``ngrams`` is empty, or when
    any selected candidate has a similarity of 1 or more.

    Raises:
        ValueError: if ``no_of_results`` is below one.
    """
    if no_of_results < 1:
        # ValueError is a subclass of Exception, so existing handlers still work.
        raise ValueError("Sorry, no numbers below one")
    no_of_results = min(no_of_results, len(ngrams))
    print(no_of_results,len(ngrams) )
    if no_of_results == 0:
        # Nothing to rank; also avoids slicing with -0 below.
        return []
    sims = predict(text, ngram_n, ngrams)
    ind = np.argpartition(sims, -no_of_results)[-no_of_results:]
    resp = []
    for i in ind:
        similarity = sims[i]
        # Presumably a score of 1 means the query already exists verbatim,
        # so there is nothing to suggest — TODO confirm the intent.
        if similarity >= 1:
            return []
        resp.append({"text": ngrams[int(i)], "index" : i,'similarity': similarity})
    return resp

In [70]:
def read_tokenizer(link):
    """Load the n-gram list stored as a Python list literal in ``link``.

    This reads the format written by ``generate_ngram_model`` (``str(list)``).
    The content is parsed with ``ast.literal_eval``, which accepts only
    literals; the previous ``eval`` would have executed any code in the file.

    Raises:
        ValueError / SyntaxError: if the file does not contain a valid literal.
    """
    import ast  # local import so this cell does not depend on another cell

    # Explicit UTF-8: the n-grams contain Yoruba diacritics.
    with open(link, "r", encoding="utf-8") as f:
        return ast.literal_eval(f.read())

['fúngbà díẹ̀ níbi',
 'fún àjọ yìí',
 'àwọ̀lékè rẹ̀ ya',
 'ogún èèyàn péré',
 '4200 dọ́là ilẹ̀',
 'sọ nípa ikú',
 'ń ṣáájú nínú',
 'ṣètò ibi táwọn',
 'sì tọ́ ọ',
 'ará samáríà séèyàn']

In [72]:
# Query phrase, hard-coded for this run; the commented-out input() call is the
# interactive alternative.
text = 'fúngbà díẹ̀ níb'#input('enter yoruba text:')
# n-gram size; predict uses it to pick which embedings-ngram-{n}.pt file to load.
ngrams_value = 3

# generate_ngram_model(ngrams, ngrams_value)
# final_result(text, ngrams_value)
# Load the n-gram list from the file generate_ngram_model writes, so the list
# order matches the rows of the saved embeddings.
ngrams = read_tokenizer('/kaggle/working/tokenized.txt')
# Ask for up to 25 matches; final_ngram_result caps this at len(ngrams).
final_ngram_result(text,ngrams, ngrams_value, 25)

10 10


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

loading_embbedings
loading_complete


[{'text': 'àwọ̀lékè rẹ̀ ya', 'index': tensor(2), 'similarity': tensor(0.4664)},
 {'text': 'fún àjọ yìí', 'index': tensor(1), 'similarity': tensor(0.5944)},
 {'text': 'fúngbà díẹ̀ níbi',
  'index': tensor(0),
  'similarity': tensor(0.9480)},
 {'text': 'ogún èèyàn péré', 'index': tensor(3), 'similarity': tensor(0.5864)},
 {'text': '4200 dọ́là ilẹ̀', 'index': tensor(4), 'similarity': tensor(0.5185)},
 {'text': 'sọ nípa ikú', 'index': tensor(5), 'similarity': tensor(0.6276)},
 {'text': 'ń ṣáájú nínú', 'index': tensor(6), 'similarity': tensor(0.6945)},
 {'text': 'ṣètò ibi táwọn', 'index': tensor(7), 'similarity': tensor(0.4875)},
 {'text': 'sì tọ́ ọ', 'index': tensor(8), 'similarity': tensor(0.6512)},
 {'text': 'ará samáríà séèyàn',
  'index': tensor(9),
  'similarity': tensor(0.6629)}]

In [15]:
# ds = read_dataset('/kaggle/working/yoruba.txt')
# ngrams_value = 3
# ngrams = gather_ngrams(ds, ngrams_value)
# print(len(ngrams))
# ngrams = list(set(ngrams))
# print(len(ngrams))
# print(ngrams[:5])
# generate_ngram_model(ngrams, ngrams_value)

9216779
2778269
['fúngbà díẹ̀ níbi', 'fún àjọ yìí', 'àwọ̀lékè rẹ̀ ya', 'ogún èèyàn péré', '4200 dọ́là ilẹ̀']


In [60]:
# Embed the current `ngrams` list and write embedings-ngram-{ngrams_value}.pt
# plus tokenized.txt; both files must come from the same list.
generate_ngram_model(ngrams, ngrams_value)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

created embedings-ngram-3.pt and tokenized.txt


In [None]:
# Cleanup: deletes the saved n-gram list. The query cell reads this same path
# through read_tokenizer, so re-run generate_ngram_model before querying again.
# ! rm /kaggle/working/embedings-ngram-3.pt
! rm /kaggle/working/tokenized.txt