In [1]:
!pip install sentence_transformers hazm

Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0 (from sentence_transformers)
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m61.2 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from sentence_transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m84.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.4.0 (from sentence_transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 k

In [1]:
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import torch

In [2]:
# Load the processed Jobinja CSV into a DataFrame.
# NOTE(review): the path assumes Google Drive is already mounted at
# /content/drive; no mount step is visible in this notebook - confirm before
# running on a fresh kernel.
df = pd.read_csv('/content/drive/MyDrive/Jobinja - Processed.csv')
# Preview the first rows as the cell output.
df.head()

## Build Corpus

In [39]:
import re
from hazm import *
# Shared normalizer instance (Normalizer comes from the `from hazm import *`
# above); build_corpus calls normalizer.normalize on every text.
normalizer = Normalizer()

def build_corpus(df, column='Job Position', max_chars=500, normalize=None):
    """Return one cleaned, truncated text per row of ``df[column]``.

    Each value is normalized, stripped of HTML-like tags, newlines, tabs,
    zero-width non-joiners and punctuation, has runs of spaces collapsed, and
    is cut to ``max_chars`` characters. The output list has exactly one entry
    per row, in row order, so list positions line up with row positions.

    Args:
        df: DataFrame holding the text column.
        column: Name of the column to read (defaults to ``'Job Position'``).
        max_chars: Maximum number of characters kept per text.
        normalize: Callable ``str -> str`` applied before cleaning. When
            ``None``, the notebook-level ``normalizer.normalize`` is used.

    Returns:
        list[str]: Cleaned texts. Missing values become ``""`` rather than
        the literal string ``"nan"``.
    """
    if normalize is None:
        # Fall back to the notebook-level normalizer instance.
        normalize = normalizer.normalize

    corpus = []
    # Iterate over the values themselves: a label lookup such as
    # df[column][i] raises KeyError when the index is not 0..n-1
    # (e.g. after filtering rows).
    for value in df[column].tolist():
        text = "" if pd.isna(value) else str(value)
        text = normalize(text)
        # Tags must go before punctuation stripping; otherwise '<' and '>'
        # are already gone and this pattern can never match.
        text = re.sub(r'<[^>]+>', " ", text)
        text = re.sub(r'[\n\t\u200c]', " ", text)
        text = re.sub(r'[^\w\s]+', " ", text)
        text = re.sub(' +', ' ', text)
        corpus.append(text[:max_chars])
    return corpus
# Cleaned text for every row of df; later cells encode this list and index
# into it by position.
corpus = build_corpus(df)

## Get Embeddings

In [41]:
# Name of the pretrained model passed to SentenceTransformer.
embedding_model = 'intfloat/multilingual-e5-large'
embedder = SentenceTransformer(embedding_model)
# Encode the whole corpus once; the result is returned as a tensor
# (convert_to_tensor=True) and reused for every query below.
# NOTE(review): the e5 model family is documented as expecting "query: " /
# "passage: " prefixes on its inputs; none are added here - confirm against
# the model card whether that is intended.
corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)

## Search in Corpus

In [44]:
# Keep every corpus entry whose cosine similarity is within this fraction of
# the best match for the query (0.04 -> score >= 96% of the top score).
threshold_percentage = 0.04
# Per-query outputs; position k belongs to queries[k].
all_scores = []
all_indices = []

queries = ["به دنبال شغل هایی مثل متخصص هوش مصنوعی، پردازش تصویر، پردازش  متن، پردازش صوت، برنامه نویسی پایتون، تحلیل داده و دانشمند داده می گردم"]

for query in queries:
    # Encode the user query
    query_embedding = embedder.encode(query, convert_to_tensor=True)

    # Score the query against the whole corpus in one batched call instead of
    # one Python-level call per embedding; row 0 holds the scores for the
    # single query.
    scores = util.cos_sim(query_embedding, corpus_embeddings)[0].tolist()

    if not scores:
        # Nothing to rank: record an empty result so outputs stay aligned
        # with `queries`.
        all_scores.append([])
        all_indices.append([])
        continue

    # Rank corpus positions by score, best first. sorted() is stable, so
    # equal scores keep their original corpus order.
    sorted_indices = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)

    # Threshold is relative to the best score for this query.
    highest_score = scores[sorted_indices[0]]
    threshold = (1 - threshold_percentage) * highest_score

    query_indices = [i for i in sorted_indices if scores[i] >= threshold]
    query_scores = [scores[i] for i in query_indices]

    all_scores.append(query_scores)
    all_indices.append(query_indices)

# Turn the selected hits of each query into printable records:
# the first 50 characters of the matched corpus text plus its score.
results = []
for query, scores, indices in zip(queries, all_scores, all_indices):
    query_results = [
        {'Job Position': corpus[idx][:50], 'Score': score}
        for score, idx in zip(scores, indices)
    ]
    # Echo every record, best match first.
    for result in query_results:
        print(result)

    results.append(query_results)

{'Job Position': 'استخدام متخصص هوش مصنوعی پایتون ', 'Score': 0.8960745334625244}
{'Job Position': 'استخدام متخصص هوش مصنوعی', 'Score': 0.8868563175201416}
{'Job Position': 'استخدام متخصص هوش مصنوعی', 'Score': 0.8868563175201416}
{'Job Position': 'استخدام متخصص هوش مصنوعی', 'Score': 0.8868562579154968}
{'Job Position': 'استخدام متخصص هوش مصنوعی', 'Score': 0.8868562579154968}
{'Job Position': 'استخدام کارشناس مدیریت پروژه هوش مصنوعی و علم داده', 'Score': 0.8856395483016968}
{'Job Position': 'استخدام متخصص فناوری هوش مصنوعی', 'Score': 0.8828755617141724}
{'Job Position': 'استخدام کارشناس هوش مصنوعی AI Developer ', 'Score': 0.8828009366989136}
{'Job Position': 'استخدام کارشناس هوش مصنوعی', 'Score': 0.8802425861358643}
{'Job Position': 'استخدام کارشناس هوش مصنوعی', 'Score': 0.8802425861358643}
{'Job Position': 'استخدام کارشناس هوش مصنوعی', 'Score': 0.8802425861358643}
{'Job Position': 'استخدام کارشناس هوش مصنوعی', 'Score': 0.8802425861358643}
{'Job Position': 'استخدام کارشناس هوش مصنوعی', 