In [1]:
import warnings
warnings.filterwarnings('ignore')

from gensim.models import KeyedVectors
import numpy as np
import pandas as pd
import regex as re
import torch
from farsi_tools import stop_words
from hazm import word_tokenize, Lemmatizer
from pymongo import MongoClient
from sentence_transformers import util
from symspellpy import SymSpell, Verbosity
from tqdm import tqdm
from const.constance import *


lemmatizer = Lemmatizer()


# STOP WORD

In [2]:
with open(stopword_path, encoding="utf8") as f:
    stop = f.readlines()
# cleaning stopwords
stop_word = [word.replace('\n', '') for word in stop]
stop_word = [re.sub('[\\u200c]', ' ', word) for word in stop_word]
stop_word.extend(stop_words())


# mongo db

In [3]:
def context_mongo(collection_name):
    client = MongoClient(host=host, port=port)
    client_my = client['nlp']
    my_coll = client_my[collection_name]
    return my_coll

p# ==ol

In [4]:
def load_glove_model(glove_file):
    print("loading glove model")
    model = KeyedVectors.load_word2vec_format(glove_file, binary=False)
    print(f"loaded glove model , {len(model)}")
    return model

In [5]:
model = load_glove_model(glove_path)

loading glove model
loaded glove model , 240548


# spell checker

In [6]:
sym_spell = SymSpell(max_dictionary_edit_distance=3, prefix_length=8)
sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)

True

# word 2 vector sy

In [7]:
word2vec_model = KeyedVectors.load_word2vec_format(word2vec_cbow_path, binary=True)

In [8]:
test = "تعیین مقادیر پیش فرض در ماژول جلسات"
test1 = "تعیین مقادیر پیش فرض در افزونه جلسات"
text = "ویدئو انیمیشن برای همکارات بفرست"

In [9]:
word2vec_model.most_similar_cosmul('IT',topn=300)


[('ERP', 0.8689835667610168),
 ('اتوماسیون', 0.8578999042510986),
 ('راهبری', 0.8481571078300476),
 ('ICT', 0.8443616628646851),
 ('کسب\u200cوکار', 0.8366156220436096),
 ('زیرساختی', 0.8302525877952576),
 ('Management', 0.8295701146125793),
 ('SAP', 0.8287033438682556),
 ('خودکارسازی', 0.827669084072113),
 ('مدیریتی', 0.8246448636054993),
 ('Service', 0.8239153623580933),
 ('Security', 0.8237847089767456),
 ('مشارکتی', 0.8226858973503113),
 ('سیسکو', 0.8220700621604919),
 ('توزیع\u200cشده', 0.8212320804595947),
 ('لجستیک', 0.8208287358283997),
 ('یکپارچه\u200cسازی', 0.8204160928726196),
 ('PLC', 0.8197720646858215),
 ('زیرساخت', 0.8197266459465027),
 ('توسعه\u200cای', 0.8187692761421204),
 ('زیربنایی', 0.8179969787597656),
 ('مشاوره\u200cای', 0.8171239495277405),
 ('نرم\u200cافزاری', 0.816767156124115),
 ('مدیریت', 0.8163818717002869),
 ('IETF', 0.8143932819366455),
 ('ATM', 0.8135349154472351),
 ('Systems', 0.8115798234939575),
 ('CRM', 0.8107935786247253),
 ('سخت\u200cافزارها', 0.808

# Pre

In [10]:
def read_dataset_csv(csv_fine_path):
    df = pd.read_csv(csv_fine_path)  # read csv file
    data_list = df.to_dict(orient='records')  # reshape to dict
    return data_list

In [36]:
def encode_to_db(collection_name):
    collection_name = MongoClient(host, port)
    client = collection_name['nlp']
    my_collection = client['encoded_new_coll_collection']
    main_collection = client['new_coll']
    dict_list = [i['job'] for i in main_collection.find({}, {'job'})]
    for i in tqdm(dict_list):
        if my_collection.find_one({'text': i}):
            pass
        else:
            my_collection.insert_one({
                'text': i,
                'encoded': word_embedding_method(i)
            })


In [61]:

def word_embedding_method(sentence):
    try:
        encoded_word_list = []
        for i in preprocess(sentence):
            if i in word2vec_model :
                encoded_word_list.append(word2vec_model[i])
            else:
                continue

        if encoded_word_list is None:
            return None
        else:
            return np.mean(encoded_word_list, axis=0).tolist()
    except KeyError as e:
        return None

In [62]:
len(word_embedding_method('محتوای شبکه اجتماعی لینکدین'))

200

# preprocess

In [29]:

def preprocess(raw_text):
    words = word_tokenize(raw_text)  # split a sentence to words and return an array
    words = [i for i in words if i not in stop_word]
    spell_checker_list = []
    for i in range(len(words)):  # iterate in word array for checking spell
        if not sym_spell.lookup(words[i], Verbosity.ALL, max_edit_distance=3):  # if word not exists ignore the word
            continue
        else:
            word_matched = sym_spell.lookup(words[i], Verbosity.ALL, max_edit_distance=3)
            for i in word_matched[:1]:  # take first similar word to our incorrect word
                spell_checker_list.append(i)
    #lemmatize = [lemmatizer.lemmatize(wo
    # rd) for word in words]

    split_lemm_words = []
    for i in words:

        if "#" not in i:
            split_lemm_words.append(i)
        else:
            split_lemm_words.extend(i.split("#"))
    clean_words = list(set([w for w in split_lemm_words if w not in stop_word]))  # remove some word like "و ,با, ..."

    return clean_words



# word synonyms 

In [30]:
def calculate_similarity_of_words(sentence1, sentence2):
    sentence_1 = preprocess(sentence1)

    sentence_word_similarity = {}
    for i in sentence_1:
        get_similarity = word2vec_model.most_similar_cosmul(i, topn=20)
        lemm = [lemmatizer.lemmatize(word[0]) for word in get_similarity]
        cleaning_words = [re.sub(r'[\u200c]', '', word) for word in lemm]
        sentence_word_similarity[i] = list(set(cleaning_words))
    gf_list = {}
    for sent in sentence2:
        sentence_2 = preprocess(sent)
        reformed_sentence = sentence_2.copy()
        for i, j in sentence_word_similarity.items():
            for x in sentence_2:
                if x in j:
                    reformed_sentence = [i if word == x else word for word in reformed_sentence]
        gf_list = {**gf_list, **{sent: ' '.join(reformed_sentence)}}
    return gf_list


# *Similarity*

In [31]:

def final_similarity(sentence):
    my_collection = context_mongo('new_version_of_cleaned_dataset')

    vector_1 = np.mean([word_embedding_method(sentence)], axis=0)
    result_pass = [calculate_similarity_of_words(sentence, [i['cleaned_text'] for i in my_collection.find()][:50])]

    op = {}
    for i in result_pass:
        for j in i.values():
            if len(j.split(' ')) > 1:
                op[j] = np.mean([word_embedding_method(j)], axis=0)
            else:
                if type(word_embedding_method(j)) == float:
                    continue
                else:
                    op[j] = word_embedding_method(j)
    result_dict = {}
    result = util.cos_sim(vector_1, np.array([j if i else None for i, j in op.items()], dtype=float))[0]
    top_results = torch.topk(result, k=5)
    for score, idx in zip(top_results[0], top_results[1]):
        result_dict[list(result_pass[0].keys())[int(idx)]] = np.round(score.numpy() * 100, 2)
    return result_dict

In [32]:
final_similarity("محتوای شبکه اجتماعی لینکدین")

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (8,) + inhomogeneous part.



# Sent By Sent

In [33]:
def calculate_similarity_of_words(sentence1, sentence2):
    sentence_1 = preprocess(sentence1)
    sentence_word_similarity = {}
    for i in sentence_1:
        get_similarity = model.most_similar_cosmul(i, topn=20)
        lemm = [lemmatizer.lemmatize(word[0]) for word in get_similarity]
        cleaning_words = [re.sub(r'[\u200c]', '', word) for word in lemm]
        sentence_word_similarity[i] = list(set(cleaning_words))

    sentence_2 = preprocess(sentence2)

    reformed_sentence = sentence_2.copy()
    for i, j in sentence_word_similarity.items():
        for x in sentence_2:
            if x in j:
                reformed_sentence = [i if word == x else word for word in reformed_sentence]
    gf_list = ' '.join(reformed_sentence)
    return gf_list


def single_sim(sentence1, sentence2):
    vector_1 = np.mean([word_embedding_method(sentence1)], axis=0)
    result_pass = calculate_similarity_of_words(sentence1, sentence2)

    op = {}
    if len(result_pass.split(' ')) > 1:
        op[result_pass] = np.mean([word_embedding_method(result_pass)], axis=0)
    else:
        if type(word_embedding_method(result_pass)) == float:
            pass
        else:
            op[result_pass] = word_embedding_method(result_pass)
    result_dict = {}
    result = util.cos_sim(vector_1, np.array([j if i else None for i, j in op.items()], dtype=float))[0]
    return result



In [34]:
text = "محتوای شبکه اجتماعی لینکدین"
text1 = "محتوای شبکه اجتماعی فبسبوک"
single_sim(text, text1)

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (8,) + inhomogeneous part.

# for faster calculation

In [19]:
my_collection = context_mongo('new_version_of_cleaned_dataset')
sentences = [i['cleaned_text'] for i in my_collection.find()]

In [20]:
def find_upload(sentences):
    all_predicted_sentences = {}
    for i in tqdm(sentences):
        tokenized = preprocess(i)
        if len(tokenized) > 0:
            ref_with_sim = {}
            for word in tokenized:
                sim_words_array = []
                try:
                    get_similarity = word2vec_model.most_similar_cosmul(word, topn=7)
                    glove_s=model.most_similar_cosmul(word,topn=5)
                    for similarity in get_similarity:
                        sim_words_array.append(similarity[0])
                    for glo in glove_s:
                        sim_words_array.append(glo[0])
                except:
                    pass
                ref_with_sim = {**ref_with_sim, **{word: sim_words_array}}
            maked_sentence = []
            for tokenized_word in preprocess(i):
                for l, m in ref_with_sim.items():
                    if l == tokenized_word:
                        for ih in m:
                            maked_sentence.append(str(i).replace(l, ih))
            all_predicted_sentences = {**all_predicted_sentences, **{i: maked_sentence}}
        else:
            continue
    return all_predicted_sentences


In [21]:
all_syn = context_mongo('gfwerrjdgfgfdgyoh')

In [22]:
for i, j in find_upload(sentences[:50]).items():
    if all_syn.find_one({'name': i}):
        pass
    else:
        all_syn.insert_one({
            "name": i,
            "encoded": j
        })

100%|██████████| 50/50 [00:33<00:00,  1.50it/s]


In [23]:
syn_encoded = context_mongo('uiouiouiytiha')

In [24]:

for i in tqdm(all_syn.find({}, {"_id": False})):
    if syn_encoded.find_one({'text': i['name']}):
        pass
    else:
        syn_encoded_list = []
        for m in i['encoded']:
            syn_encoded_list.append(np.mean([word_embedding_method(m)], axis=0).tolist())
        if syn_encoded_list:
            syn_encoded.insert_one({
                'text': i['name'],
                'generated_sent': i['encoded'],
                'mean_encoded': syn_encoded_list

            })


50it [07:53,  9.47s/it]


In [49]:
def result(ref):
    vector_1 = np.mean([word_embedding_method(ref)], axis=0)
    res = {}
    for i in tqdm(syn_encoded.find({}, {"_id": False})):
        result = util.cos_sim(vector_1.tolist(), [ c for c in i['mean_encoded'] if type(c) !=float ])
        res[i['text']]=np.mean(sorted(result[0].detach().numpy(),reverse=True)[:20])
        #res[i['text']] = result[0].detach().numpy()
    return list(sorted(res.items(), key=lambda item: item[1], reverse=True))[:20]

In [50]:

result('امکانات تعیین دسترسی ها و وضعیت پروژه')


50it [00:00, 464.91it/s]


[('امکانات تعیین دسترسی ها و وضعیت پروژه', 0.9690629),
 ('امکانات پردازش پروژه و تخصیص پیشرفت کار', 0.7905096),
 ('امکانات فایل های پروژه', 0.7751187),
 ('امکانات برچسب کارها و پروژه ها', 0.75007397),
 ('امکانات بودجه و هزینه', 0.6937394),
 ('امکانات لیست کارها', 0.6663345),
 ('محتوای صفحات وب مدیریت پروژه', 0.65375483),
 ('تست و رفع ایرادات مدیریت کارها', 0.6363095),
 ('تامین تجهیزات اداری بیسیم دوربین مداربسته اداره کل حفاظت محیط زیست استان تهران',
  0.607102),
 ('تکمیل لیست تجهیزات اداری', 0.5997367),
 ('امکانات گفتمان ها', 0.5925042),
 ('طراحی سایت همسو و صفحات داخلی آن', 0.5881273),
 ('امکانات تایم شیت', 0.5766343),
 ('اصلاح و تست پروفایل دانش', 0.5747388),
 ('راهنمای مدیریت پروژه', 0.56618136),
 ('محتوای صفحه هدف و قیمت گذاری ثبت نام', 0.555218),
 ('تهیه مدل رشد', 0.5368761),
 ('برنامه ریزی برای ایمیل مارکتینگ', 0.52469915),
 ('دیجیتال مارکتینگ و توسعه بازار', 0.522797),
 ('محتوای شبکه اجتماعی لینکدین', 0.5157046)]

In [130]:
\from nltk import ngrams
from collections import defaultdict


class to_ngram:
    def __init__(self, data_set, n_gram: list):
        super(to_ngram, self).__init__()
        self.n_gram = n_gram
        self.data_set = data_set

    def __calculate_imbalanced_label(self):
        job_count = self.data_set.who.value_counts()
        job_count = job_count[job_count < 50]
        return job_count.index

    def __do_not_know(self):
        job_count = self.__calculate_imbalanced_label()
        output_series = pd.Series()
        for i in job_count:
            tasks = self.data_set[self.data_set['who'] == i]['job']
            list_of_task = []
            for task in tasks:
                for num in self.n_gram:
                    word = task.split()
                    word = ngrams(word, num)
                    result = [' '.join(ngram) for ngram in word]
                    if len(result) > 0:
                        list_of_task.append(result)
            output_series.loc[i] = list_of_task
        return output_series

    def __sep_dataset(self):
        output_series = self.__do_not_know()
        #full_list=defaultdict(list)
        final_dict = {}
        for who, jobs in output_series.items():
            for y in jobs:
                for z in y:
                    final_dict[z] = who
        return final_dict

    def __result(self):
        final_dict = self.__sep_dataset()
        dataframe = pd.DataFrame([final_dict]).transpose().reset_index()
        dataframe.rename(columns={'index': "job", int(0): "who"}, inplace=True)
        return dataframe

    def __call__(self):
        return self.__result()


SyntaxError: unexpected character after line continuation character (3758142603.py, line 1)

In [95]:
dataset=context_mongo('all_syn')
li=[]
for i in dataset.find({},{'_id':False})[23:24]:
    li.append(i['name'])

In [96]:
class n_gram:
    def __init__(self,data_set,ngram:list):
        super(n_gram,self).__init__()
        self.data_set=data_set
        self.ngram=ngram
        
    def do_not_know(self):
        list_of_task = []
        for i in self.data_set:
            word=i.split()
            word=ngrams(word,2)
            
            result = [' '.join(ngram) for ngram in word]
            if len(result) > 0:
                list_of_task.append(result)
        return list_of_task

In [97]:
n_gram(li,[2,3])