In [1]:
import warnings

warnings.filterwarnings('ignore')

from gensim.models import KeyedVectors
import numpy as np
import pandas as pd
import regex as re
import torch
from farsi_tools import stop_words
from hazm import word_tokenize, Lemmatizer
from pymongo import MongoClient
from sentence_transformers import util
from symspellpy import SymSpell, Verbosity
from tqdm import tqdm
from values.Values import *
import constance

In [3]:

word2vec_cbow_path = '../dataset/word2vec.model-cbow-size=200-window=5.bin'

lemmatizer = Lemmatizer()


# STOP WORD

In [4]:
with open(stopword_path, encoding="utf8") as f:
    stop = f.readlines()
# cleaning stopwords
stop_word = [word.replace('\n', '') for word in stop]
stop_word = [re.sub('[\\u200c]', ' ', word) for word in stop_word]
stop_word.extend(stop_words())


# mongo db

In [5]:
def context_mongo(collection_name):
    client = MongoClient(host=host, port=port)
    client_my = client['nlp']
    my_coll = client_my[collection_name]
    return my_coll

# glove

In [6]:
def load_glove_model(glove_file):
    print("loading glove model")
    model = KeyedVectors.load_word2vec_format(glove_file, binary=False)
    print(f"loaded glove model , {len(model)}")
    return model

In [7]:
model = load_glove_model(glove_path)

loading glove model
loaded glove model , 240548


# spell checker

In [8]:

sym_spell = SymSpell(max_dictionary_edit_distance=3, prefix_length=8)
sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)

True

# word 2 vector sy

In [9]:
word2vec_model = KeyedVectors.load_word2vec_format(word2vec_cbow_path, binary=True)

In [10]:
test = "تعیین مقادیر پیش فرض در ماژول جلسات"
test1 = "تعیین مقادیر پیش فرض در افزونه جلسات"
text = "ویدئو انیمیشن برای همکارات بفرست"

In [110]:
word2vec_model.most_similar_cosmul('IT')

[('ERP', 0.8689835071563721),
 ('اتوماسیون', 0.8578999042510986),
 ('راهبری', 0.8481571674346924),
 ('ICT', 0.8443616628646851),
 ('کسب\u200cوکار', 0.8366156220436096),
 ('زیرساختی', 0.8302525877952576),
 ('Management', 0.8295701146125793),
 ('SAP', 0.8287033438682556),
 ('خودکارسازی', 0.827669084072113),
 ('مدیریتی', 0.824644923210144)]

# Pre

In [15]:
def read_dataset_csv(csv_fine_path):
    df = pd.read_csv(csv_fine_path)  # read csv file
    data_list = df.to_dict(orient='records')  # reshape to dict
    return data_list

In [16]:
def encode_to_db(collection_name):
    collection_name = MongoClient(host, port)
    client = collection_name['nlp']
    my_collection = client['encoded_new_coll_collection']
    main_collection = client['new_coll']
    dict_list = [i['job'] for i in main_collection.find({}, {'job'})]
    for i in tqdm(dict_list):
        if my_collection.find_one({'text': i}):
            pass
        else:
            my_collection.insert_one({
                'text': i,
                'encoded': word_embedding_method(i)
            })


In [17]:
#encode_to_db('nlp')

In [18]:

def word_embedding_method(sentence):
    try:
        encoded_word_list = []
        for i in preprocess(sentence):
            if i in model:
                encoded_word_list.append(model[i])
            else:
                continue
        if encoded_word_list is None:
            return None
        else:
            return np.mean(encoded_word_list, axis=0).tolist()
    except KeyError as e:
        return None

# preprocess

In [19]:

def preprocess(raw_text):
    words = word_tokenize(raw_text)  # split a sentence to words and return an array
    words = [i for i in words if i not in stop_word]
    spell_checker_list = []
    for i in range(len(words)):  # iterate in word array for checking spell
        if not sym_spell.lookup(words[i], Verbosity.ALL, max_edit_distance=3):  # if word not exists ignore the word
            continue
        else:
            word_matched = sym_spell.lookup(words[i], Verbosity.ALL, max_edit_distance=3)
            for i in word_matched[:1]:  # take first similar word to our incorrect word
                spell_checker_list.append(i)
    #lemmatize = [lemmatizer.lemmatize(word) for word in words]

    split_lemm_words = []
    for i in words:

        if "#" not in i:
            split_lemm_words.append(i)
        else:
            split_lemm_words.extend(i.split("#"))
    clean_words = list(set([w for w in split_lemm_words if w not in stop_word]))  # remove some word like "و ,با, ..."

    return clean_words



# word synonyms 

In [20]:
def calculate_similarity_of_words(sentence1, sentence2):
    sentence_1 = preprocess(sentence1)

    sentence_word_similarity = {}
    for i in sentence_1:
        get_similarity = word2vec_model.most_similar_cosmul(i, topn=20)
        lemm = [lemmatizer.lemmatize(word[0]) for word in get_similarity]
        cleaning_words = [re.sub(r'[\u200c]', '', word) for word in lemm]
        sentence_word_similarity[i] = list(set(cleaning_words))
    gf_list = {}
    for sent in sentence2:
        sentence_2 = preprocess(sent)
        reformed_sentence = sentence_2.copy()
        for i, j in sentence_word_similarity.items():
            for x in sentence_2:
                if x in j:
                    reformed_sentence = [i if word == x else word for word in reformed_sentence]
        gf_list = {**gf_list, **{sent: ' '.join(reformed_sentence)}}
    return gf_list


# *Similarity*

In [23]:

def final_similarity(sentence):
    my_collection = context_mongo('new_version_of_cleaned_dataset')

    vector_1 = np.mean([word_embedding_method(sentence)], axis=0)
    result_pass = [calculate_similarity_of_words(sentence, [i['cleaned_text'] for i in my_collection.find()][:50])]

    op = {}
    for i in result_pass:
        for j in i.values():
            if len(j.split(' ')) > 1:
                op[j] = np.mean([word_embedding_method(j)], axis=0)
            else:
                if type(word_embedding_method(j)) == float:
                    continue
                else:
                    op[j] = word_embedding_method(j)
    result_dict = {}
    result = util.cos_sim(vector_1, np.array([j if i else None for i, j in op.items()], dtype=float))[0]
    top_results = torch.topk(result, k=5)
    for score, idx in zip(top_results[0], top_results[1]):
        result_dict[list(result_pass[0].keys())[int(idx)]] = np.round(score.numpy() * 100, 2)
    return result_dict

In [24]:
final_similarity("محتوای صفحه هدف و قیمت گذاری ثبت نام")

{'محتوای ایمیل مارکتینگ روز چهاردهم': 100.0,
 'برنامه تکمیلی واحد بازاریابی سال': 89.79,
 'راهنمای مدیریت پروژه': 89.45,
 'محتوای صفحات وب مدیریت پروژه': 86.31,
 'طراحی سایت همسو و صفحات داخلی آن': 85.22}



# Sent By Sent

In [25]:
def calculate_similarity_of_words(sentence1, sentence2):
    sentence_1 = preprocess(sentence1)
    sentence_word_similarity = {}
    for i in sentence_1:
        get_similarity = word2vec_model.most_similar_cosmul(i, topn=20)
        lemm = [lemmatizer.lemmatize(word[0]) for word in get_similarity]
        cleaning_words = [re.sub(r'[\u200c]', '', word) for word in lemm]
        sentence_word_similarity[i] = list(set(cleaning_words))

    sentence_2 = preprocess(sentence2)

    reformed_sentence = sentence_2.copy()
    for i, j in sentence_word_similarity.items():
        for x in sentence_2:
            if x in j:
                reformed_sentence = [i if word == x else word for word in reformed_sentence]
    gf_list = ' '.join(reformed_sentence)
    return gf_list


def single_sim(sentence1, sentence2):
    vector_1 = np.mean([word_embedding_method(sentence1)], axis=0)
    result_pass = calculate_similarity_of_words(sentence1, sentence2)

    op = {}
    if len(result_pass.split(' ')) > 1:
        op[result_pass] = np.mean([word_embedding_method(result_pass)], axis=0)
    else:
        if type(word_embedding_method(result_pass)) == float:
            pass
        else:
            op[result_pass] = word_embedding_method(result_pass)
    result_dict = {}
    result = util.cos_sim(vector_1, np.array([j if i else None for i, j in op.items()], dtype=float))[0]
    return result



In [26]:
text = "تولید پرسونا برای برنامه ریزی سخنرانی"
text1 = "تهیه پرسونا برای مدیریت جلسات"
single_sim(text, text1)

tensor([0.9442], dtype=torch.float64)

# for faster calculation

In [27]:
my_collection = context_mongo('new_version_of_cleaned_dataset')
sentences = [i['cleaned_text'] for i in my_collection.find()]

In [30]:
def find_upload(sentences):
    all_predicted_sentences = {}
    for i in tqdm(sentences):
        tokenized = preprocess(i)
        if len(tokenized) > 0:
            ref_with_sim = {}
            for word in tokenized:
                sim_words_array = []
                try:
                    get_similarity = word2vec_model.most_similar_cosmul(word, topn=10)
                    for similarity in get_similarity:
                        sim_words_array.append(similarity[0])
                except:
                    pass
                ref_with_sim = {**ref_with_sim, **{word: sim_words_array}}
            maked_sentence = []
            for tokenized_word in preprocess(i):
                for l, m in ref_with_sim.items():
                    if l == tokenized_word:
                        for ih in m:
                            maked_sentence.append(str(i).replace(l, ih))
            all_predicted_sentences = {**all_predicted_sentences, **{i: maked_sentence}}
        else:
            continue
    return all_predicted_sentences


In [45]:
all_syn = context_mongo('all_syn')

In [40]:
for i, j in find_upload(sentences[:10]).items():
    if all_syn.find_one({'name': i}):
        pass
    else:
        all_syn.insert_one({
            "name": i,
            "encoded": j
        })

100%|██████████| 10/10 [00:05<00:00,  1.70it/s]


In [43]:
syn_encoded = context_mongo('syn_encoded')

In [46]:

for i in tqdm(all_syn.find({}, {"_id": False})):
    if syn_encoded.find_one({'text': i['name']}):
        pass
    else:
        syn_encoded_list = []
        for m in i['encoded']:
            syn_encoded_list.append(np.mean([word_embedding_method(m)], axis=0).tolist())
        if syn_encoded_list:
            syn_encoded.insert_one({
                'text': i['name'],
                'generated_sent': i['encoded'],
                'mean_encoded': syn_encoded_list

            })


6513it [00:48, 134.33it/s]


KeyboardInterrupt: 

In [105]:
def result(ref):
    vector_1 = np.mean([word_embedding_method(ref)], axis=0)
    res = {}
    for i in tqdm(syn_encoded.find({}, {"_id": False})):
        result = util.cos_sim(vector_1.tolist(), [ c for c in i['mean_encoded'] if type(c) !=float ])
        res[i['text']]=np.mean(sorted(result[0].detach().numpy(),reverse=True))
        #res[i['text']] = result[0].detach().numpy()
    return list(sorted(res.items(), key=lambda item: item[1], reverse=True))[:50]

In [120]:

result('تنظیم')


6506it [00:05, 1146.86it/s]


[('اضافه کردن تنظیم onignal', 0.80495614),
 ('اضافه کردن تنظیم فعالغیرفعال برای نمایش متن های از پیش تعریف شده prefined contt',
  0.79656935),
 ('بررسی دلیل کار نکردن تنظیم veloping mo در پنل مدیریت', 0.7896117),
 ('اضافه کردن تنظیم برای زمان پیشفرض زمان سررسید مصوبه و پیگیری', 0.7810662),
 ('طراحی صفحه تنظیم تاریخ کار', 0.777129),
 ('تنظیم آتنتیکیت سیستم', 0.7734872),
 ('تنظیم فیلد های تبولیتور ها مطابق با طرح ها', 0.77012247),
 ('افزودن تنظیم برای عمومی یا خصوصی بودن الگوها', 0.7684008),
 ('ترجمه فریز اصلاح شود از ویرایش ثبت به ذخیره', 0.76694185),
 ('اضافه شدن تنظیم گروه کاربری برای ویرایش همه قرار ملاقات ها', 0.766263),
 ('افزودن تنظیم گروه کاربری برای دیدن همه پیشنهادات proposion', 0.76174885),
 ('اضافه کردن تنظیم گروه کاربری برای ویرایش تنظیمات اکانت', 0.7593921),
 ('در فیلتر چت یک تنظیم اضافه شود که همه را نمایش دهد اما آرشیوها را نمایش ندهد',
  0.75575864),
 ('اضافه کردن تنظیم به ماژول کاربران برای ایجاد فید هنگام تغییر تصویر',
  0.75448704),
 ('مدیریت پروژهپشتیبانیقسمت ویرایش 

In [107]:
from nltk import ngrams
from collections import defaultdict


class to_ngram:
    def __init__(self, data_set, n_gram: list):
        super(to_ngram, self).__init__()
        self.n_gram = n_gram
        self.data_set = data_set

    def __calculate_imbalanced_label(self):
        job_count = self.data_set.who.value_counts()
        job_count = job_count[job_count < 50]
        return job_count.index

    def __do_not_know(self):
        job_count = self.__calculate_imbalanced_label()
        output_series = pd.Series()
        for i in job_count:
            tasks = self.data_set[self.data_set['who'] == i]['job']
            list_of_task = []
            for task in tasks:
                for num in self.n_gram:
                    word = task.split()
                    word = ngrams(word, num)
                    result = [' '.join(ngram) for ngram in word]
                    if len(result) > 0:
                        list_of_task.append(result)
            output_series.loc[i] = list_of_task
        return output_series

    def __sep_dataset(self):
        output_series = self.__do_not_know()
        #full_list=defaultdict(list)
        final_dict = {}
        for who, jobs in output_series.items():
            for y in jobs:
                for z in y:
                    final_dict[z] = who
        return final_dict

    def __result(self):
        final_dict = self.__sep_dataset()
        dataframe = pd.DataFrame([final_dict]).transpose().reset_index()
        dataframe.rename(columns={'index': "job", int(0): "who"}, inplace=True)
        return dataframe

    def __call__(self):
        return self.__result()


In [95]:
dataset=context_mongo('all_syn')
li=[]
for i in dataset.find({},{'_id':False})[23:24]:
    li.append(i['name'])

In [96]:
class n_gram:
    def __init__(self,data_set,ngram:list):
        super(n_gram,self).__init__()
        self.data_set=data_set
        self.ngram=ngram
        
    def do_not_know(self):
        list_of_task = []
        for i in self.data_set:
            word=i.split()
            word=ngrams(word,2)
            
            result = [' '.join(ngram) for ngram in word]
            if len(result) > 0:
                list_of_task.append(result)
        return list_of_task

In [97]:
nnn=n_gram(li,[2,3])

In [103]:
for i in nnn.do_not_know():
    print(i)

['طراحی اینفوگرام', 'اینفوگرام معماری', 'معماری ایمیل', 'ایمیل مارکتینگ']


In [52]:
تغییر منوی سمت راست

بیزینس ناظر کار


SyntaxError: invalid syntax (1652023506.py, line 1)