In [34]:
import glob
from hazm import *
import codecs
import tqdm
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import yaml
import matplotlib.pyplot as plt
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers, processors
from gensim.models import FastText
from gensim.models import Word2Vec

# Module-level text-processing helpers, constructed once and shared by the cells below.
# NOTE(review): Normalizer / Stemmer / Lemmatizer presumably come from `from hazm import *` — confirm.
normalizer = Normalizer()
# `stemmer` is not referenced by any of the cells visible in this notebook export.
stemmer = Stemmer()
lemmatizer = Lemmatizer()
import pandas as pd


In [35]:

def get_cleaned_text(text):
    """Return `text` normalized, tokenized and lemmatized, re-joined with spaces.

    Uses the module-level `normalizer` and `lemmatizer` instances together
    with `word_tokenize`.

    Args:
        text: The raw input string.

    Returns:
        A single string made of the lemmatized tokens separated by one space.
    """
    normalized = normalizer.normalize(text)
    lemmas = [lemmatizer.lemmatize(piece) for piece in word_tokenize(normalized)]
    return ' '.join(lemmas)

def get_query_similarity(query_vector, tfidf, tfidf_array, wv_tfidf):
    """Score every row of `tfidf_array` against `query_vector`.

    The score of a row is the sum of its element-wise product with the
    query vector (i.e. a row-wise dot product).

    Args:
        query_vector: Vector broadcast against the rows of `tfidf_array`.
        tfidf: Unused; kept so existing call sites keep working.
        tfidf_array: 2-D array with one row per document.
        wv_tfidf: Unused; kept so existing call sites keep working.

    Returns:
        A 1-D array holding one score per row of `tfidf_array`.
    """
    return np.sum(tfidf_array * query_vector, axis=1)

In [36]:
# Number of non-empty lines merged into one "poem" document.
LINES_PER_POEM = 8


def load_poems(paths, lines_per_poem=LINES_PER_POEM):
    """Read text files and merge their non-empty lines into fixed-size chunks.

    Each file is processed independently: blank lines are skipped, every
    remaining line is stripped, and each run of `lines_per_poem` consecutive
    lines is joined with single spaces into one document. Lines left over at
    the end of a file (fewer than `lines_per_poem`) are discarded and never
    carried over into the next file.

    Args:
        paths: Iterable of UTF-8 text file paths.
        lines_per_poem: How many non-empty lines form one document.

    Returns:
        A list of document strings, in the order of `paths`.

    Raises:
        ValueError: If `lines_per_poem` is smaller than 1.
    """
    if lines_per_poem < 1:
        raise ValueError('lines_per_poem must be at least 1')

    documents = []
    for path in paths:
        with open(path, encoding='utf-8', mode='r') as fp:
            pending = []  # reset per file so chunks never span two files
            for raw_line in fp:
                stripped = raw_line.strip()
                if not stripped:
                    continue
                pending.append(stripped)
                if len(pending) == lines_per_poem:
                    documents.append(' '.join(pending))
                    pending = []
    return documents


# glob.glob returns matches in arbitrary order; sorting keeps the poem indices
# (and therefore any ranking shown later) reproducible across runs and machines.
poets = sorted(glob.glob('normalized/*.txt'))
poems = load_poems(poets)


In [37]:
# One row per poem chunk; column 0 is replaced by the chunk's token list.
df = pd.DataFrame(poems)
df[0] = df[0].apply(word_tokenize)

In [41]:
# Fit a TF-IDF model over the poem chunks, tokenizing with `word_tokenize`.
# max_df=0.8 and max_features=10000 are passed straight to TfidfVectorizer.
vectorizer_tfidf = TfidfVectorizer(norm='l2', tokenizer=word_tokenize, max_df=0.8, max_features=10000)

# Downcast to float32 while the matrix is still sparse and only then densify.
# The resulting array is the same as densifying first and casting afterwards,
# but no intermediate dense array at the wider dtype is ever allocated.
X = vectorizer_tfidf.fit_transform(poems).astype(np.float32).toarray()

In [42]:
# Example query, projected into the fitted TF-IDF space as a dense 2-D array.
query = 'عشق خیلی بد است'
query_sparse = vectorizer_tfidf.transform([query])
query_vector = query_sparse.toarray()

In [43]:
# Element-wise product of every document row with the query vector.
dot_product = X * query_vector

# Grand total over all entries. ndarray.sum() is vectorized, whereas the
# builtin sum(sum(...)) iterates over every row in Python; the two may differ
# only in floating-point rounding.
print(dot_product.sum())


2163.9020371074616


In [44]:
# Per-document score: sum of that document's row in `dot_product`.
similarity = dot_product.sum(axis=1)

# Document indices ordered from highest score to lowest.
ascending_order = np.argsort(similarity)
relevance_order = ascending_order[::-1]


In [45]:
# Display the ranked document indices (reversed argsort of `similarity`, so highest score first).
relevance_order

array([99106, 52569, 99057, ..., 59829, 59830, 51302], dtype=int64)

In [46]:
# Show the text of the highest-scoring poem chunk.
best_match_index = relevance_order[0]
print(poems[best_match_index])


عشق است که زنده دل از آنیم عشق است که جان جاودان است عاشق چو غلام و عشق سلطان عشق است که شاه عاشقان است عشق است که عقل بنده اوست عشق است که سید زمان است جانست که در بدن روانست عالم بدن است و عشق جانست


In [47]:
# Sanity check: the per-document scores add up to the same grand total that
# was printed for `dot_product` (up to floating-point rounding).
similarity.sum()




2163.9020371074603

In [48]:
import pickle

# Persist the fitted vectorizer, the dense TF-IDF matrix and the poem texts
# so that queries can be answered later without refitting.
with open('vectorizer_tfidf', 'wb') as f:
    pickle.dump(vectorizer_tfidf, f)

with open('X_toarray', 'wb') as f:
    pickle.dump(X, f)

with open('poems', 'wb') as f:
    pickle.dump(poems, f)

In [None]:
import pickle
import numpy as np

# Stand-alone retrieval: reload the pickled artefacts and rank poems for a query.
# NOTE: pickle.load can execute arbitrary code — only load files this notebook wrote.
# NOTE(review): unpickling the vectorizer presumably needs the tokenizer's package
# to be importable, since the vectorizer was built with `word_tokenize` — verify.
with open('poems', 'rb') as f:
    poems = pickle.load(f)

with open('vectorizer_tfidf', 'rb') as f:
    vectorizer_tfidf = pickle.load(f)

with open('X_toarray', 'rb') as f:
    X_toarray = pickle.load(f)

query = 'عشق خیلی بد است'
query_vector = vectorizer_tfidf.transform([query]).toarray()

# Row-wise dot product of every stored document vector with the query vector.
dot_product = X_toarray * query_vector
similarity = dot_product.sum(axis=1)

# Highest-scoring document first.
relevance_order = np.argsort(similarity)[::-1]
print(poems[relevance_order[0]])
