In [28]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from scipy.spatial import distance

from gensim.models import Word2Vec
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\OscarJaramillo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Dataset

In [73]:
data = pd.read_csv('data_rs.csv', index_col=0)

We want to deduplicate the ads by a cosine similarity function, but we're going to encode first the strings

## Bag of words

Mantaining the order of the words in the string to try to capture semantic differences

In [3]:
vocab = {}
word_encoding = 1
def one_hot_encoding(text):
  global word_encoding

  words = text.lower().split(" ")
  encoding = []

  for word in words:
    if word in vocab:
      code = vocab[word]
      encoding.append(code)
    else:
      vocab[word] = word_encoding
      encoding.append(word_encoding)
      word_encoding += 1

  return encoding

text = "this is a test to see if this test will work is is test a a"
encoding = one_hot_encoding(text)
print(encoding)
print(vocab)

[1, 2, 3, 4, 5, 6, 7, 1, 4, 8, 9, 2, 2, 4, 3, 3]
{'this': 1, 'is': 2, 'a': 3, 'test': 4, 'to': 5, 'see': 6, 'if': 7, 'will': 8, 'work': 9}


## TF-IDF

Requires text preprocesssing (e.g., lowercasing, removing punctuation, stop words, stemming)

In [4]:
# Initialize the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the job descriptions to TF-IDF vectors
tfidf_matrix = tfidf_vectorizer.fit_transform(data['avisocuerpo'])

# Calculate cosine similarity between job descriptions
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Define a similarity threshold (you can experiment with different values)
similarity_threshold = 0.85

# Identify duplicates or similar job ads
duplicates = {}
for i in range(len(data)):
    duplicates[i] = [j for j, score in enumerate(cosine_sim[i]) if score > similarity_threshold and i != j]

# Print the duplicates
for key, value in duplicates.items():
    if value:
        print(f"Job Ad {key} is similar to: {value}")


ValueError: np.nan is an invalid document, expected byte or unicode string.

## Word-embeddings

With Word2Vec model in gensim for spanish

Requires text preprocesssing (e.g., lowercasing, removing punctuation, stop words, stemming)

In [94]:
data['avisocuerpo'] = data['avisocuerpo'].apply(lambda x: str(x).lower() if not pd.isna(x) else '')
data['avisocuerpo'] = data['avisocuerpo'].apply(lambda x: x.replace('.', '').replace(',', '').replace(';', '').replace('-', '').replace('>', '').replace('<', '').replace('\r', '').
                                                replace('\n', '').replace('\n2', '').replace('\n1', '').replace('\n3', ''))

Training the model

In [143]:
# Tokenize avisocuerpo
data['tokens'] = data['avisocuerpo'].apply(lambda x: word_tokenize(x.lower()))

# Train Word2Vec model on the tokenized job descriptions
model = Word2Vec(data['tokens'], vector_size=100, window=2, min_count=1, sg=0)

We can think in increasing vector size to capture more relationships between words but it'll require more computer power \
Another possibility is to increase window parameter that will give more context between read and predicted word, this should give more context to the model and it should capture better the word embeddings \



In order to compare ads, after training the Word2Vec model, we need to compute vectors for entire job descriptions. we'll do this by averaging the vectors of all words in a job description.

In [144]:
def get_ads_vector(tokens, model):

    # Is important to ensure the model has seen these words during training
    valid_tokens = [token for token in tokens if token in model.wv]
    if valid_tokens:        
        return np.mean(model.wv[valid_tokens], axis=0)
    else:
        return np.zeros(model.vector_size)

# Calculate vectors for all job descriptions
data['ads_vectors'] = data['tokens'].apply(lambda x: get_ads_vector(x, model))

Define cosine similarity because we can't use cosine_similarity from sklearn.metrics.pairwise because it compares a mean ad from the column with other ad but in our case we want to compare row by row

In [145]:
def calculate_cosine_similarity(vec1, vec2):
    return 1 - distance.cosine(vec1, vec2)

Let's compare

In [146]:
data['compare'] = None
for x in data.index:
    data['compare'][x] = calculate_cosine_similarity(data['ads_vectors'][111754], data['ads_vectors'][x])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['compare'][x] = calculate_cosine_similarity(data['ads_vectors'][111754], data['ads_vectors'][x])
  dist = 1.0 - uv / np.sqrt(uu * vv)


In [147]:
threshold = 0.99999

In [148]:
data.loc[data['compare'] >= threshold]

Unnamed: 0,pyindex,avisoid,empresaid,avisofechapublicacion,avisovacante,mostrarsueldo,avisoexperiencia,expiracion,dias,avisorepublicacion,...,avre_b4,bucket1,bucket2,bucket3,bucket4,wfh,duplicated,tokens,ads_vectors,compare
111754,112413,346860.0,7313,2008-10-23,3.0,0.0,4.0,2008-11-22 00:00:00,30,0,...,False,0,0,0,0,0,False,"[buscamos, a, los, (, as, ), mejores, ingenier...","[-0.140401, 0.20520142, 0.06415321, 0.03093196...",1.0
93342,93822,326390.0,584268,2008-09-03,1.0,,,2008-10-13 00:00:00,40,0,...,False,0,0,0,0,0,False,"[nuestra, empresa, está, en, la, búsqueda, de,...","[-0.12033648, 0.17702962, 0.0541395, 0.0258496...",0.99999
49021,49320,270171.0,576806,2008-05-13,2.0,,0.0,2008-06-27 00:00:00,45,0,...,False,0,0,0,0,0,False,[],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0
126275,127283,363468.0,36916,2008-12-01,1.0,,,2008-12-15 00:00:00,14,0,...,False,0,0,0,0,0,True,[],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0
5729,5945,209044.0,11758,2008-01-17,1.0,0.0,2.0,2008-04-16 00:00:00,90,0,...,False,0,0,0,0,0,False,"[importante, empresa, requiere, contratar, jef...","[-0.12791647, 0.18601729, 0.057467714, 0.02645...",0.999991
126268,127276,363461.0,5895,2008-12-01,3.0,0.0,,2008-12-16 00:00:00,15,0,...,False,0,0,0,0,0,False,"[la, gerencia, de, inteligencia, de, negocios,...","[-0.14058644, 0.20574825, 0.06316621, 0.028923...",0.99999
44800,45091,252376.0,37787,2008-05-22,1.0,1.0,5.0,2008-08-20 00:00:00,90,0,...,False,0,0,0,0,0,False,"[requerimos, contratar, jefe, de, provedores, ...","[-0.12272077, 0.17980039, 0.054967012, 0.02582...",0.999992
74749,75145,306656.0,6618,2008-08-12,1.0,0.0,5.0,2008-09-11 00:00:00,30,0,...,False,0,0,0,0,0,False,"[avisoimportante, empresa, de, servicios, oper...","[-0.1374337, 0.20129167, 0.061905827, 0.029267...",0.999992
84495,84959,317405.0,7161,2008-09-10,1.0,0.0,,2008-12-09 00:00:00,90,0,...,False,0,0,0,0,0,False,"[buscamos, profesional, responsable, del, cont...","[-0.133996, 0.19647096, 0.06099388, 0.02898169...",0.999993
49819,50120,273698.0,25662,2008-05-30,1.0,0.0,2.0,2008-08-28 00:00:00,90,0,...,False,0,1,0,0,1,False,"[a, cargo, de, :, •, aplicar, en, sistema, los...","[-0.12959811, 0.18908538, 0.058618434, 0.02798...",0.99999


In [149]:
np.max(data['compare'])

1

In [150]:
np.min(data['compare'])

0.7456258535385132

In [152]:
data['avisocuerpo'][2074]

'empresa distribuidora de materiales de construccion busca encargado de producto  la persona deberá administrar líneas de productos incluyendo atributos de precio físicos etc de manera de contar con el mix adecuado que cumpla las expectativas de los clientes y maximice la rentabilidad de la compañía    dentro de sus principales funciones se pueden mencionar:  identificar definir y crear productos en sistema  identificar categorías y subcategorías de productos  proponer cambios en el mix de productos de acuerdo a la demanda de mercado  definir y analizar la rentabilidad y rotación de cada categoría  mantener precios de compra y venta de productos  establecer relación con fábricas proveedoras a través del depto de adquisiciones'

In [92]:
data['avisocuerpo'][84495]

'buscamos profesional responsable del control de gestión de importante grupo de empresas de comunicaciones corporativas que entre otras cosas sea capaz de:\r\n\r\n1) liderar organizar y administrar proyectos de control de gestión y mejoramiento de procesos transversales entre las unidades de negocio y empresas relacionadas\r\n\r\n2) generar mecanismos de control de gestión a través de reportes coordinación y monitoreo de fuentes externas de información con el objetivo de mantener informados a los distintos estamentos del grupo sobre el cumplimiento de metas y estado de proyectos\r\n\r\n3) apoyar la organización del trabajo de personal del área administrativa y contable generando reportes financieros y de procesos para la gerencia\r\n \r\nse ofrece gran estabilidad laboral y un grato ambiente de trabajo\r\n\r\nsólo se revisarán currículums que incluyan pretensiones de renta líquida'