In [None]:
import random

import numpy as np
import pandas as pd

import spacy
import nmslib
import os
import json
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer
from datetime import datetime

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer


from similarity import Finder

In [None]:
# Load the job postings and build one text per posting: "<title>. <description>".
jobs_df = pd.read_csv("data/jobs.csv")

# Vectorised string concatenation instead of a row-wise `apply`. Missing
# titles/descriptions are treated as empty text so a single NaN cell no longer
# aborts the whole load with a TypeError (str + float).
title_text = jobs_df["title"].fillna("")
description_text = jobs_df["description"].fillna("")
jobs_df["title_description"] = title_text + ". " + description_text

descriptions = jobs_df["title_description"].tolist()
len(descriptions)

In [None]:
# random.shuffle(descriptions)
# split_point = int(len(descriptions) * 0.10)
# set_01 = descriptions[:-split_point]
# set_02 = descriptions[-split_point:]

In [None]:
# Use the full corpus. This binds a second name to the same list object as
# `descriptions` (no copy is made), so in-place changes to one affect the other.
documents = descriptions

In [None]:
# Every run writes its artefacts into its own timestamped sub-directory of
# "indexed_docs", so earlier runs are never overwritten.
out_dir = "indexed_docs"

# Microsecond-resolution timestamp used as the run's directory name.
now = datetime.now().strftime("%Y-%m-%d_%H-%M-%S-%f")
print("----- Starting at " + now + " -----")
out_dir = os.path.join(out_dir, now)

# exist_ok=True replaces the separate exists() check and is safe on re-runs.
os.makedirs(out_dir, exist_ok=True)

In [None]:
# Lower-cased version of every document, in the same order as `documents`.
lower_doc = [text.lower() for text in documents]

In [None]:
# Sentence-embedding model, loaded by name through sentence-transformers.
# NOTE(review): the identifier suggests a Portuguese legal-domain BERT —
# confirm it is the intended model for job postings.
EMBEDDING_MODEL_NAME = "stjiris/bert-large-portuguese-cased-legal-mlm-sts-v1.0"
nlp = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [None]:
# Keyword extractor built on the sentence-transformer held in `nlp`, which is
# passed in as KeyBERT's `model` argument.
kw_model = KeyBERT(model=nlp)

In [None]:
# Lower-cased documents; keyword extraction runs on these.
lower_doc = [d.lower() for d in documents]

# Keyword-extraction settings. The dict is unpacked into
# `kw_model.extract_keywords` below and is also dumped to params.json by the
# saving cell, so every value must stay JSON-serialisable.
params = {
    "stop_words": stopwords.words("portuguese"),
    # Bigrams only.
    "keyphrase_ngram_range": (2, 2),
    "top_n": 10,
    # 0.5% of the corpus, floored at 1: int() truncates to 0 for corpora with
    # fewer than 200 documents, and recent scikit-learn versions reject an
    # integer min_df below 1.
    "min_df": max(1, int(len(lower_doc) * 0.005)),
    "use_maxsum": False,
    # Only consulted when use_maxsum is True. Kept as an integer (KeyBERT's
    # documented default) rather than the string "None", which would break as
    # soon as use_maxsum is switched on.
    "nr_candidates": 20,
    "use_mmr": True,
    "diversity": 0.2,
}

# One list of (keyphrase, score) pairs per document, in document order.
kw = kw_model.extract_keywords(lower_doc, **params)

In [None]:
# Build one vector per document: the mean of the embeddings of its keyphrases.
#
# Documents for which no keyphrase was extracted have nothing to average and
# are skipped, so `embs` can be shorter than `kw`: entry i of `embs` belongs to
# the i-th document whose keyword list is non-empty. Previously these documents
# were dropped implicitly (NaN mean -> IndexError -> bare `except: pass`),
# which would also have hidden any unrelated failure.
embs = []
for doc_keywords in kw:
    if not doc_keywords:
        continue

    phrases = [phrase for phrase, _score in doc_keywords]
    # Encode all keyphrases of the document in one batch; the result has one
    # row per phrase, so averaging over axis 0 yields a single vector.
    phrase_embs = np.asarray(nlp.encode(phrases))
    embs.append(phrase_embs.mean(axis=0))

In [None]:
# Build an HNSW index over the per-document vectors using angular distance.
index = nmslib.init(method="hnsw", space="angulardist")
# No explicit ids are passed, so vectors are identified by their position in
# `embs` (presumably 0..len(embs)-1 — confirm against the nmslib bindings).
index.addDataPointBatch(embs)
index.createIndex({"post": 2}, print_progress=True)

print(datetime.now().strftime("%H-%M-%S-%f") + "----- Saving Index and Data -----")

# `embs` only contains documents that produced at least one keyphrase (the
# embedding cell skips the others). Apply the same filter here so that row i of
# the saved frame describes the same document as vector i of the index;
# otherwise every lookup after the first skipped document is shifted.
data = [{"doc": doc, "keys": k} for doc, k in zip(lower_doc, kw) if k]
assert len(data) == len(embs), (
    "document rows and index vectors are out of sync: "
    + str(len(data))
    + " rows vs "
    + str(len(embs))
    + " vectors"
)

df = pd.DataFrame(data)

index_path = os.path.join(out_dir, "nms_index.index")
docs_path = os.path.join(out_dir, "docs.pkl")
params_path = os.path.join(out_dir, "params.json")

# save_data=True stores the vectors alongside the graph structure.
index.saveIndex(index_path, save_data=True)
print(
    datetime.now().strftime("%H-%M-%S-%f")
    + "----- Index Saved At: "
    + index_path
)
df.to_pickle(docs_path)
print(
    datetime.now().strftime("%H-%M-%S-%f")
    + "----- Docs Saved At: "
    + docs_path
)
# Persist the extraction settings next to the artefacts they produced.
with open(params_path, "w") as f:
    json.dump(params, f)