In [65]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from scipy.spatial import distance

from gensim.models import Word2Vec
import nltk
from nltk.tokenize import word_tokenize
# Fetch the NLTK 'punkt' tokenizer data (presumably what word_tokenize relies on — confirm).
# The recorded output shows this is a no-op when the package is already up to date.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\OscarJaramillo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Dataset

In [66]:
# Load the ads dataset; the first CSV column is used as the DataFrame index.
ADS_CSV = 'data_rs.csv'
data = pd.read_csv(ADS_CSV, index_col=0)

We want to deduplicate the ads using cosine similarity, but first we need to encode the strings as numeric vectors.

## Bag of words

Maintaining the order of the words in the string to try to capture semantic differences

In [5]:
# Shared vocabulary: maps each lower-cased word to the integer code it was first assigned.
vocab = {}
# Next unused integer code; codes start at 1 and grow as new words are seen.
word_encoding = 1
def one_hot_encoding(text):
  """Encode `text` as a list of integer word codes, preserving word order.

  Despite the name, this is an integer (index) encoding rather than a one-hot
  matrix: each distinct lower-cased word is mapped to the code stored in the
  module-level `vocab`, and unseen words receive the next free code from the
  module-level `word_encoding` counter.

  Args:
    text: The string to encode.

  Returns:
    A list with one integer code per word of `text`, in order of appearance.
    Empty or whitespace-only text yields an empty list.

  Side effects:
    Adds unseen words to `vocab` and advances `word_encoding`.
  """
  global word_encoding

  # split() without a separator splits on any run of whitespace, so repeated
  # spaces, tabs or newlines never produce empty tokens (split(" ") would
  # register "" as a word and keep tabs/newlines glued to their neighbours).
  words = text.lower().split()
  encoding = []

  for word in words:
    if word not in vocab:
      # First time this word is seen: give it the next free code.
      vocab[word] = word_encoding
      word_encoding += 1
    encoding.append(vocab[word])

  return encoding

# Quick sanity check: a repeated word must map to the same code every time it appears.
text = "this is a test to see if this test will work is is test a a"
encoding = one_hot_encoding(text)
print(encoding, vocab, sep="\n")

[1, 2, 3, 4, 5, 6, 7, 1, 4, 8, 9, 2, 2, 4, 3, 3]
{'this': 1, 'is': 2, 'a': 3, 'test': 4, 'to': 5, 'see': 6, 'if': 7, 'will': 8, 'work': 9}


## TF-IDF

Requires text preprocessing (e.g., lowercasing, removing punctuation, stop words, stemming)

In [None]:
# Initialize the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the job descriptions to TF-IDF vectors.
# The vectorizer rejects non-string documents, and the preprocessing cell further down
# guards against missing values in this column, so NaN descriptions are treated as
# empty strings here (an empty ad becomes an all-zero TF-IDF row).
ad_texts = data['avisocuerpo'].fillna('').astype(str)
tfidf_matrix = tfidf_vectorizer.fit_transform(ad_texts)

# Calculate cosine similarity between job descriptions (one row and one column per ad).
# NOTE: this relies on sklearn's pairwise `cosine_similarity`; a later cell redefines
# that name with a two-vector version, so this cell must run before that one.
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Define a similarity threshold (you can experiment with different values)
similarity_threshold = 0.85

# Identify duplicates or similar job ads.
# Keys and values are row POSITIONS (0..n-1), not the DataFrame index labels.
duplicates = {}
for i, row in enumerate(cosine_sim):
    # Positions scoring above the threshold, excluding the ad itself.
    above = np.flatnonzero(row > similarity_threshold)
    duplicates[i] = [int(j) for j in above if j != i]

# Print the duplicates (each similar pair shows up twice, once per direction)
for key, value in duplicates.items():
    if value:
        print(f"Job Ad {key} is similar to: {value}")


## Word-embeddings

Using the Word2Vec model from gensim, for Spanish text

Requires text preprocessing (e.g., lowercasing, removing punctuation, stop words, stemming)

In [67]:
# Deletes the characters . , ; - > < from a string in a single pass.
_PUNCT_TABLE = str.maketrans('', '', '.,;-><')


def _normalize_ad_text(raw):
    """Lower-case one ad body and strip the characters in _PUNCT_TABLE; missing values become ''."""
    if pd.isna(raw):
        return ''
    return str(raw).lower().translate(_PUNCT_TABLE)


data['avisocuerpo'] = data['avisocuerpo'].apply(_normalize_ad_text)

Training the model

In [68]:
# Tokenize avisocuerpo: one list of lower-cased tokens per ad
data['tokens'] = data['avisocuerpo'].apply(lambda x: word_tokenize(x.lower()))

# Train Word2Vec model on the tokenized job descriptions.
# A fixed seed together with a single worker thread keeps the learned vectors (and every
# similarity computed from them) reproducible between runs; multi-threaded training
# introduces ordering jitter. Per the gensim docs, reproducibility across interpreter
# launches additionally requires setting PYTHONHASHSEED.
model = Word2Vec(
    data['tokens'],
    vector_size=100,
    window=5,
    min_count=1,
    sg=0,
    seed=42,
    workers=1,
)

To compare ads after training the Word2Vec model, we need a single vector for each entire job description. We'll compute it by averaging the vectors of all the words in the job description.

In [69]:
def get_ads_vector(tokens, model):
    """Average the word vectors of `tokens` into a single vector for the ad.

    Args:
        tokens: Iterable of word tokens belonging to one ad.
        model: Trained model exposing `wv` (supports `in` and list indexing)
            and `vector_size`.

    Returns:
        The element-wise mean of the vectors of the tokens found in `model.wv`,
        or a zero vector of length `model.vector_size` when none is found.
    """
    embeddings = model.wv

    # Only tokens present in the model's vocabulary can be looked up.
    known = [tok for tok in tokens if tok in embeddings]
    if not known:
        return np.zeros(model.vector_size)
    return np.mean(embeddings[known], axis=0)

# Build one averaged embedding per ad from its token list
data['ads_vectors'] = data['tokens'].apply(get_ads_vector, args=(model,))


We define our own cosine similarity for a single pair of vectors. `cosine_similarity` from `sklearn.metrics.pairwise` works on 2-D arrays and returns a whole pairwise matrix, whereas here we want to compare two individual ads (rows) at a time.

In [70]:
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity between two 1-D vectors.

    NOTE: this definition rebinds the name `cosine_similarity` imported from
    `sklearn.metrics.pairwise` at the top of the notebook, so cells that need
    the sklearn pairwise version must run before this one.

    Args:
        vec1: First vector (1-D array-like).
        vec2: Second vector (1-D array-like).

    Returns:
        1 minus the SciPy cosine distance between the vectors. When either
        vector is all zeros (e.g. the zero vector produced for an ad with no
        known tokens) the cosine is undefined, and 0.0 is returned instead of
        NaN.
    """
    # A zero-norm vector turns the cosine distance into 0/0; report "no similarity".
    if not np.any(vec1) or not np.any(vec2):
        return 0.0
    return 1 - distance.cosine(vec1, vec2)

Let's compare

In [73]:
# Inspect the per-ad embedding column
data.loc[:, 'ads_vectors']

111754    [-0.043988142, 0.517315, 0.05676966, 0.0168512...
105701    [-0.035540834, 0.4207944, 0.04231299, 0.007412...
101579    [-0.005116907, 0.063990615, -0.0028450487, -0....
11752     [-0.031266484, 0.39305508, 0.043877464, 0.0151...
97086     [-0.040590055, 0.4503397, 0.04930606, 0.011538...
                                ...                        
64711     [-0.041747145, 0.48783726, 0.05219389, 0.01324...
55614     [-0.04394839, 0.5092569, 0.05609447, 0.0121953...
78965     [-0.041103408, 0.4732848, 0.052064642, 0.01328...
62501     [-0.040860943, 0.49127045, 0.053775627, 0.0133...
108719    [-0.038785815, 0.43028104, 0.046446897, 0.0115...
Name: ads_vectors, Length: 400, dtype: object

In [75]:
# Similarity between two specific ads, looked up by their index labels
ad_a = data.loc[11752, 'ads_vectors']
ad_b = data.loc[62501, 'ads_vectors']
cosine_similarity(ad_a, ad_b)

0.9999738335609436