In [None]:
import os
import random
import numpy as np
import pandas as pd

from similarity import Finder
from nltk.corpus import stopwords

from wordcloud import WordCloud
import matplotlib.pyplot as plt

import plotly.graph_objects as go
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
import nltk

# Fetch the NLTK "stopwords" corpus; a later cell reads the Portuguese list
# from it via stopwords.words("portuguese").
nltk.download("stopwords")

In [None]:
# Name of the jobs_df text column that later cells use as the document text
# (for building the description lists and as the join key against finder output).
column = "title_description"

In [None]:
# Load the job postings and build the normalised text columns used downstream.
# Missing titles/descriptions become empty strings: a NaN here is a float, which
# would break the string clean-up below and every later `.lower()` on these texts.
jobs_df = (
    pd.read_csv("data/jobs_description.csv")
    .drop_duplicates()
    .fillna({"title": "", "description": ""})
)

# Remove line breaks from the description.
# NOTE(review): the breaks are deleted, not replaced by a space, so words that
# were separated only by a newline end up glued together. Left unchanged because
# the text presumably has to match the documents already indexed for `Finder`.
jobs_df["description"] = (
    jobs_df["description"]
    .str.replace("\n", "", regex=False)
    .str.replace("\r", "", regex=False)
)

# "<title>. <description>", lower-cased; `description` is lower-cased as well.
jobs_df["title_description"] = (
    jobs_df["title"] + ". " + jobs_df["description"]
).str.lower()
jobs_df["description"] = jobs_df["description"].str.lower()
print(jobs_df.shape)

In [None]:
# Load the interaction log from CSV, print its (rows, columns) shape and preview
# the first rows. Later cells rely on its ITEM_ID and GA_ID columns.
interactions_df = pd.read_csv("data/jobs_interaction_3m.csv")
print(interactions_df.shape)
interactions_df.head()

In [None]:
# Attach each job's title and description to the interaction rows by matching
# the interaction's ITEM_ID against the job's id. `join` defaults to a left
# join, so interactions whose job is absent from jobs_df are kept with NaN text.
# NOTE: this cell overwrites `interactions_df` and consumes its ITEM_ID column,
# so it can only be run once per load of the interactions CSV.
interactions_df = (
    interactions_df.set_index("ITEM_ID")
    .join(jobs_df[["id", "title", "description"]].set_index("id"))
    # The history cell reads the job id from a column called "index". That name
    # previously appeared only when pandas dropped the index name while joining
    # two differently named indexes; naming the axis explicitly makes the
    # resulting column independent of pandas' join internals.
    .rename_axis("index")
    .reset_index()
)
interactions_df.head()

In [None]:
# One row per user (GA_ID) holding the list of job ids found in the "index"
# column of their interaction rows.
history_df = (
    interactions_df.groupby("GA_ID")["index"]
    .apply(list)
    .reset_index()
    .rename(columns={"GA_ID": "user_id", "index": "history"})
)
# Number of interactions per user, then order users from shortest to longest
# history. The row labels are kept as they were before sorting.
history_df["len_history"] = history_df["history"].map(len)
history_df = history_df.sort_values("len_history")
history_df

In [None]:
# Every job text from the configured column as a plain list, plus a
# lower-cased copy of that list.
descriptions = jobs_df[column].tolist()
lower_doc = [text.lower() for text in descriptions]

In [None]:
# Do not truncate long text cells when DataFrames are displayed.
pd.options.display.max_colwidth = None

In [None]:
# Sentence-embedding model, loaded by its model identifier.
nlp = SentenceTransformer("stjiris/bert-large-portuguese-cased-legal-mlm-sts-v1.0")
# Similarity search object from the project-local `similarity` module, handed
# the embedding model above.
# NOTE(review): the first argument looks like the directory of a previously
# built document index and `diversity` presumably tunes how varied the returned
# results are — confirm both against similarity.Finder.
finder = Finder(
    "indexed_docs/2023-11-20_15-39-20-726961", diversity=0.5, embedding_model=nlp
)

In [None]:
# Texts of the jobs in the history of the user stored under row label 1 of
# history_df. NOTE: `.loc[1]` selects by index label, not by position in the
# sorted frame.
# `.loc[...]` + `.copy()` yields an independent frame, so the column assignments
# below do not write into a slice of jobs_df (SettingWithCopyWarning).
description_df = jobs_df.loc[
    jobs_df["id"].isin(history_df.loc[1].history), ["title", "description"]
].copy()

# "<title>. <description>", lower-cased.
description_df["title_description"] = (
    description_df["title"] + ". " + description_df["description"]
).str.lower()
# Lower-case the configured text column too; this is a no-op when `column` is
# "title_description" but keeps the cell correct for other column choices.
description_df[column] = description_df[column].str.lower()

description = description_df[column].tolist()
description

In [None]:
# Query the finder with the second text of the selected user's history.
# NOTE(review): index 1 requires at least two entries in `description`.
# The exact contents of `out` and `keys` are defined by similarity.Finder;
# later cells read "doc" and "distances" columns from `out`.
out, keys = finder.get_similar(description[1])

In [None]:
# Re-derive the lower-cased text columns of jobs_df: "<title>. <description>"
# and the description itself.
jobs_df["title_description"] = (
    jobs_df["title"] + ". " + jobs_df["description"]
).str.lower()
jobs_df["description"] = jobs_df["description"].str.lower()

# Rows of the finder's document table whose text equals the (lower-cased) query
# document, enriched with the job columns through a join on the text column.
query_doc = description[1].lower()
matching_docs = finder.docs[finder.docs["doc"] == query_doc]
job_text = jobs_df[["title", "description", "title_description"]].set_index(column)
description_df = matching_docs.set_index("doc").join(job_text).reset_index()

In [None]:
# Retrieved documents joined with the job columns on the document text, showing
# the five rows with the largest "distances" value.
# NOTE(review): if "distances" is a true distance (smaller = closer), descending
# order lists the least similar documents first — confirm what Finder returns.
(
    out.set_index("doc")
    .join(
        jobs_df[["title", "description", "title_description"]].set_index(column)
    )
    .reset_index()
    .sort_values(by="distances", ascending=False)
    .head(5)
)

In [None]:
# Display description_df (in run order: the finder.docs rows matching the query
# document, joined with the job columns).
description_df

In [None]:
# Blank out Portuguese stopwords in the query text and draw a word cloud from
# what remains.
sw = stopwords.words("portuguese")
stopword_set = set(sw)  # hash lookup per token; `sw` itself stays a list
tokens = description[1].split(" ")
# Stopwords are replaced by empty strings (not dropped), so their surrounding
# spaces are preserved in the joined text.
desc = " ".join("" if token in stopword_set else token for token in tokens)
wc = WordCloud(background_color="white").generate(desc)
plt.imshow(wc)

In [None]:
# Split the (word, score) pairs stored in the "keys" column into two parallel
# tuples for plotting.
# NOTE(review): `[1]` takes the second row of description_df, so the frame must
# contain at least two rows — confirm this is the intended row.
words, scores = zip(*description_df["keys"].tolist()[1])

# Horizontal bar chart: one bar per word, bar length given by its score.
bar_trace = go.Bar(x=scores, y=words, orientation="h")
fig = go.Figure(data=[bar_trace])

# Titles, theme and sizing; the y axis is reversed so the first word is drawn
# at the top of the chart.
fig.update_layout(
    title="Visualização do modelo de tópico",
    xaxis_title="Pontuação",
    yaxis_title="Palavras",
    yaxis_autorange="reversed",
    template="plotly_white",
    width=800,
    font={"size": 20},
)

fig.show()