In [51]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from scipy.spatial import distance

from gensim.models import Word2Vec
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\oscar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Dataset

In [52]:
data = pd.read_csv('data_rs.csv', index_col=0)

We want to deduplicate the ads by a cosine similarity function, but we're going to encode first the strings

## Bag of words

Mantaining the order of the words in the string to try to capture semantic differences, the problem is that it can assign the same bag for different semantic differences, because it just count frecuency in a given order but dont have context or memory

In [53]:
vocab = {}
word_encoding = 1
def one_hot_encoding(text):
  global word_encoding

  words = text.lower().split(" ")
  encoding = []

  for word in words:
    if word in vocab:
      code = vocab[word]
      encoding.append(code)
    else:
      vocab[word] = word_encoding
      encoding.append(word_encoding)
      word_encoding += 1

  return encoding

text = "this is a test to see if this test will work is is test a a"
encoding = one_hot_encoding(text)
print(encoding)
print(vocab)

[1, 2, 3, 4, 5, 6, 7, 1, 4, 8, 9, 2, 2, 4, 3, 3]
{'this': 1, 'is': 2, 'a': 3, 'test': 4, 'to': 5, 'see': 6, 'if': 7, 'will': 8, 'work': 9}


## TF-IDF

Requires text preprocesssing (e.g., lowercasing, removing punctuation, stop words, stemming)

In [54]:
data['avisocuerpo'] = data['avisocuerpo'].apply(lambda x: str(x).lower() if not pd.isna(x) else '')
data['avisocuerpo'] = data['avisocuerpo'].apply(lambda x: x.replace('.', '').replace(',', '').replace(';', '').replace('-', '').replace('>', '').replace('<', '').replace('\r', '').
                                                replace('\n', '').replace('\n2', '').replace('\n1', '').replace('\n3', ''))
data['avisocuerpo'] = data['avisocuerpo'].fillna('vacio')

In [55]:
data['pseudoindex'] = range(400)

In [56]:
# Initialize the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the job descriptions to TF-IDF vectors
tfidf_matrix = tfidf_vectorizer.fit_transform(data['avisocuerpo'].fillna('Vacio'))

# Calculate cosine similarity between job descriptions
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Define a similarity threshold (you can experiment with different values)
similarity_threshold = 0.85

# Identify duplicates or similar job ads
duplicates = {}
for i in range(len(data)):
    duplicates[i] = [j for j, score in enumerate(cosine_sim[i]) if score > similarity_threshold and i != j]

# Print the duplicates
for key, value in duplicates.items():
    if value:
        print(f"Job Ad {key} is similar to: {value}")


Job Ad 16 is similar to: [76]
Job Ad 62 is similar to: [113]
Job Ad 76 is similar to: [16]
Job Ad 106 is similar to: [145]
Job Ad 113 is similar to: [62]
Job Ad 145 is similar to: [106]
Job Ad 200 is similar to: [243, 264, 350]
Job Ad 205 is similar to: [345, 360]
Job Ad 207 is similar to: [336]
Job Ad 211 is similar to: [304, 320, 384]
Job Ad 217 is similar to: [291]
Job Ad 220 is similar to: [227]
Job Ad 222 is similar to: [295, 308]
Job Ad 227 is similar to: [220]
Job Ad 233 is similar to: [282, 335]
Job Ad 236 is similar to: [285, 397]
Job Ad 243 is similar to: [200, 264, 350]
Job Ad 255 is similar to: [364]
Job Ad 260 is similar to: [302, 366]
Job Ad 263 is similar to: [396]
Job Ad 264 is similar to: [200, 243, 350]
Job Ad 275 is similar to: [323, 396]
Job Ad 282 is similar to: [233, 335]
Job Ad 285 is similar to: [236, 397]
Job Ad 289 is similar to: [346, 376]
Job Ad 291 is similar to: [217]
Job Ad 295 is similar to: [222, 308]
Job Ad 302 is similar to: [260, 366]
Job Ad 304 is s

In [57]:
#duplicates

In [58]:
data.loc[data['pseudoindex'] ==16]['avisocuerpo'][25351]

'programa consistente en un curso de inglés más empleo en hoteles de inglaterra está abierto para personas de entre 18 a 38 años con niveles de inglés desde medio hasta avanzado    no se requieren estudios ni experiencia previa de ningún tipo    el curso de inglés tiene una duración de 1 a 4 meses y el empleo de 6 meses a 1 año'

In [59]:
data.loc[data['pseudoindex'] ==76]['avisocuerpo'][36381]

'programa consistente en un curso de inglés más empleo en hoteles de inglaterra está abierto para personas de entre 21 a 38 años con niveles de inglés desde medio hasta avanzado    no se requieren estudios ni experiencia previa de ningún tipo    el curso de inglés tiene una duración de 1 a 4 meses y el empleo de 6 meses a 1 año'

## Word-embeddings

With Word2Vec model in gensim for spanish

Requires text preprocesssing (e.g., lowercasing, removing punctuation, stop words, stemming)

In [60]:
data['avisocuerpo'] = data['avisocuerpo'].apply(lambda x: str(x).lower() if not pd.isna(x) else '')
data['avisocuerpo'] = data['avisocuerpo'].apply(lambda x: x.replace('.', '').replace(',', '').replace(';', '').replace('-', '').replace('>', '').replace('<', '').replace('\r', '').
                                                replace('\n', '').replace('\n2', '').replace('\n1', '').replace('\n3', ''))
data['avisocuerpo'] = data['avisocuerpo'].fillna('vacio')

Training the model \
This is a central part of word embeddings, I used the same data to train that we'll gonna use, but it can be refined, I dont know by now if using a pre-trained model will be usefull given that it depends in the specific goal

In [61]:
# Tokenize avisocuerpo
data['tokens'] = data['avisocuerpo'].apply(lambda x: word_tokenize(x.lower()))

# Train Word2Vec model on the tokenized job descriptions
model1 = Word2Vec(data['tokens'], vector_size=100, window=2, min_count=1, sg=0)
model2 = Word2Vec(data['tokens'], vector_size=100, window=5, min_count=1, sg=0)


We can think in increasing vector size to capture more relationships between words but it'll require more computer power \
Another possibility is to increase window parameter that will give more context between read and predicted word, this should give more context to the model and it should capture better the word embeddings \

In order to compare ads, after training the Word2Vec model, we need to compute vectors for entire job descriptions. we'll do this by averaging the vectors of all words in a job description, contrary to TF-IDF this have to be done 'manually'

In [62]:
def get_similarity(text1, text2, model):
    tokens1 = word_tokenize(text1.lower())
    tokens2 = word_tokenize(text2.lower())
    
    # Filter out tokens that are not in the model's vocabulary
    valid_tokens1 = [token for token in tokens1 if token in model.wv]
    valid_tokens2 = [token for token in tokens2 if token in model.wv]
    
    if valid_tokens1 and valid_tokens2:
        similarity = model.wv.n_similarity(valid_tokens1, valid_tokens2)
        return similarity
    else:
        # Return a low similarity score if one of the texts has no valid tokens
        return 0.0

## Let's compare

Is neccesary to define a Threshold

In [63]:
threshold = 0.9999

In [64]:
# Example: Compare two job descriptions by their similarity
text1 = data['avisocuerpo'][25351]
text2 = data['avisocuerpo'][36381]

similarity_score = get_similarity(text1, text2, model1)
if similarity_score >= threshold:
    print(f"Job ads are similar with similarity Score: {similarity_score}\n{text1}\n{text2}")
elif (similarity_score < threshold) & (similarity_score >= 0.50):
    print(f"Job ads are somehow similar with similarity Score: {similarity_score}\n{text1}\n{text2}")
elif (similarity_score < 0.5) & (similarity_score >= 0):
    print(f"Job ads are different with similarity Score: {similarity_score}\n{text1}\n{text2}")
else:
    print(f"Job ads are opposites with similarity Score: {similarity_score}\n{text1}\n{text2}")

Job ads are similar with similarity Score: 0.9999997019767761
programa consistente en un curso de inglés más empleo en hoteles de inglaterra está abierto para personas de entre 18 a 38 años con niveles de inglés desde medio hasta avanzado    no se requieren estudios ni experiencia previa de ningún tipo    el curso de inglés tiene una duración de 1 a 4 meses y el empleo de 6 meses a 1 año
programa consistente en un curso de inglés más empleo en hoteles de inglaterra está abierto para personas de entre 21 a 38 años con niveles de inglés desde medio hasta avanzado    no se requieren estudios ni experiencia previa de ningún tipo    el curso de inglés tiene una duración de 1 a 4 meses y el empleo de 6 meses a 1 año


In [65]:
data['compare'] = None
for x in data.index:
    data['compare'][x] = get_similarity(data['avisocuerpo'][111754], data['avisocuerpo'][x], model1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['compare'][x] = get_similarity(data['avisocuerpo'][111754], data['avisocuerpo'][x], model1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['compare'][x] = get_similarity(data['avisocuerpo'][111754], data['avisocuerpo'][x], model1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['compare'][x] = get_similarity(data['avisocuerpo'][111754], data['avisocuerpo'][x], model1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in

In [66]:
# Example2: Compare two job descriptions by their similarity but the job descriptions are long, verbose and have a lot of information, not sure how is leading with it
text1 = data['avisocuerpo'][111754]
text2 = data['avisocuerpo'][55614]

similarity_score = get_similarity(text1, text2, model1)
if similarity_score >= threshold:
    print(f"Job ads are similar with similarity Score: {similarity_score}\n{text1}\n{text2}")
elif (similarity_score < threshold) & (similarity_score >= 0.50):
    print(f"Job ads are somehow similar with similarity Score: {similarity_score}\n{text1}\n{text2}")
elif (similarity_score < 0.5) & (similarity_score >= 0):
    print(f"Job ads are different with similarity Score: {similarity_score}\n{text1}\n{text2}")
else:
    print(f"Job ads are opposites with similarity Score: {similarity_score}\n{text1}\n{text2}")

Job ads are similar with similarity Score: 0.9999666810035706
buscamos a los (as) mejores ingenieros civil industrial mención ti – computación experiencia y/o interés en el liderazgo de equipos de trabajo y gestión de proyectos usuario nivel avanzado en herramientas tecnológicas de apoyo para la gestión de proyectos ideal conocimientos y experiencia en proyectos de implantación de sistemas erp y modelamiento de procesos habilidades comunicacionales capacidad de reporte y agregación de valor capacidad de analizar procesos capacidad de resolución de problemas y manejo de situaciones de ambigüedad y estrés capacidad de negociación capacidad de gestionar procesoscapacidad de gestionar equipos de trabajo
importante empresa de telecomunicaciones requiere contratar a los mejores ejecutivos de servicio al cliente sus principales funciones son: atención y resolución de solicitudes complejas de clientes cara a cara en lo relativo a los productos y servicios prestados por la empresa procurar que 

## Bibliograby

Natural language processing with recurring neural networks (RNN's)-TechwithTim- freecodecamp.org
1. Chollet François. Deep Learning with Python. Manning Publications Co., 2018.
2. “Text Classification with an RNN &nbsp;: &nbsp; TensorFlow Core.” TensorFlow, www.tensorflow.org/tutorials/text/text_classification_rnn.
3. “Text Generation with an RNN &nbsp;: &nbsp; TensorFlow Core.” TensorFlow, www.tensorflow.org/tutorials/text/text_generation.

https://www.tensorflow.org/text/tutorials/word2vec

https://www.tensorflow.org/text/tutorials/word_embeddings

https://colab.research.google.com/drive/1ysEKrw_LE2jMndo1snrZUh5w87LQsCxk#forceEdit=true&sandboxMode=true&scrollTo=Cw-1eDE54yQo

https://github.com/tensorflow/

https://github.com/Tech-With-Tim
