In [213]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

Choose the time partition

In [251]:
tp = 4

Load data

In [252]:
data  = pd.read_csv(f'datos\counted\d{tp}_counted.csv')

In [253]:
data = data.drop('Unnamed: 0', axis=1)

In [254]:
newData = data.loc[data['wfh'] != 0]

In [255]:
newData = newData[newData.duplicated(subset=['empresaid', 'avisolugartrabajo'], keep='first')]

In [256]:
len(newData)

61072

Pseudoindex for TF-IDF mapping

In [257]:
newData['pseudoindex'] = range(len(newData))

## TF-IDF
Requires text preprocesssing (e.g., lowercasing, removing punctuation, stop words, stemming)\
The algortihm will be used just over 'avisocuerpo' because 'avisocargo' and 'avisorequisitos' are very similar and the algorithm identifies everything giving nothing interessting for us

In [258]:
newData['avisocuerpo_t'] = newData['avisocuerpo'].apply(lambda x: str(x).lower() if not pd.isna(x) else '')
newData['avisocuerpo_t'] = newData['avisocuerpo_t'].apply(lambda x: x.replace('.', ' ').replace(',', ' ').replace(';', ' ').replace('-', ' ').replace('>', ' ').replace('<', ' ').replace('\r', ' ').
                                                replace('\n', ' ').replace('\n2', ' ').replace('\n1', ' ').replace('\n3', ' ').replace('\\', ' ').replace('/', ' ').replace('html', ' '))
newData['avisocuerpo_t'] = newData['avisocuerpo_t'].fillna('vacio')

Model

Due to the size of the data is neccesary to do it by chunks

In [259]:
# Initialize the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Define the chunk size and calculate the number of chunks
chunk_size = 1000  # You can adjust this as needed
num_chunks = len(newData) // chunk_size + 1

# Initialize the 'duplicates' dictionary to store results
duplicates = {}

# Loop through the data in chunks
for chunk_num in range(num_chunks):
    start_idx = chunk_num * chunk_size
    end_idx = start_idx + chunk_size
    chunk_data = newData.iloc[start_idx:end_idx].copy()  # Get a chunk of data
    
    # Fit and transform the job descriptions to TF-IDF vectors for the current chunk
    tfidf_matrix = tfidf_vectorizer.fit_transform(chunk_data['avisocuerpo_t'].fillna('Vacio'))

    # Calculate cosine similarity between job descriptions in the current chunk
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

    # Define a similarity threshold, 0.5 to try to capture more similarities between the ads
    similarity_threshold = 0.89

    # Identify duplicates or similar job ads within the current chunk
    for i in range(len(chunk_data)):
        duplicates[start_idx + i] = [j for j, score in enumerate(cosine_sim[i]) if score > similarity_threshold and i != j]

# Now, the 'duplicates' dictionary contains the results for the entire dataset.

Without Chunks

In [241]:
# Initialize the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the job descriptions to TF-IDF vectors
tfidf_matrix = tfidf_vectorizer.fit_transform(newData['avisocuerpo'].fillna('Vacio'))

# Calculate cosine similarity between job descriptions
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Define a similarity threshold (you can experiment with different values)
similarity_threshold = 0.89

# Identify duplicates or similar job ads
duplicates = {}
for i in range(len(newData)):
    duplicates[i] = [j for j, score in enumerate(cosine_sim[i]) if score > similarity_threshold and i != j]

In [260]:
newData = newData.reset_index(drop=True)

In [261]:
newData['similars'] = duplicates

In [262]:
# Mapping the WFH similars by 'avisoid' in the bigger data frame
data['similars'] = data['avisoid'].map(newData.set_index('avisoid')['similars']).fillna('None')
data['pseudoindex'] = data['avisoid'].map(newData.set_index('avisoid')['pseudoindex']).fillna('None')

In [263]:
data.loc[data['pseudoindex'] != 'None'][['pseudoindex', 'similars']]

Unnamed: 0,pseudoindex,similars
75,0.0,[]
87,1.0,[2]
88,2.0,[1]
89,3.0,[]
115,4.0,"[233, 234]"
...,...,...
730299,61067.0,[]
730306,61068.0,[]
730307,61069.0,[70]
730309,61070.0,[69]


In [266]:
data.to_csv(f'datos\\counted\d{tp}_counted.csv')

In [267]:
newData.to_csv(f'datos\\similars\d{tp}_similars.csv')