In [1]:
import pandas as pd
import numpy as np

# Classifier Analisys

This notebook analyses information found in 1871 job offers for healthcare workers in Chile. This dataset includes job offers found from April 29th and May 2nd 2024.

We will look into basic statistics of the data, tray to clean it somehow and the apply embeddings and NLP to classify better the information and extract more data from this unstructured data. 

## Data Loading

In [2]:
df = pd.read_csv('data/raw_jobs.csv')

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1871 entries, 0 to 1870
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 1871 non-null   int64  
 1   title              1871 non-null   object 
 2   source_url         1871 non-null   object 
 3   posting_url        1871 non-null   object 
 4   geolocalization    1871 non-null   object 
 5   company            1871 non-null   object 
 6   salary             996 non-null    object 
 7   experience         224 non-null    object 
 8   work_schedule      1640 non-null   object 
 9   shift_type         0 non-null      float64
 10  employment_type    1201 non-null   object 
 11  slots_avaliable    0 non-null      float64
 12  urgency_required   1871 non-null   bool   
 13  seniority_level    0 non-null      float64
 14  driving_level      0 non-null      float64
 15  job_site           1641 non-null   object 
 16  description        1669 

## Data Cleaning

Will reduce dimensionality to focus on relevant information. As of now, the following collumns are relevant:

In [9]:
rel_cols = ['title',
            'posting_url',
            'geolocalization',
            'company','salary',
            'experience',
            'work_schedule',
            'shift_type',
            'employment_type',
            'slots_avaliable',
            'driving_level',
            'job_site',
            'description',
            'requisites',
            'pills',
            'inclusive_posting',
            'published_date'
           ]

In [20]:
df_clean = df[rel_cols]

Now will perform basic string cleaning.

In [82]:
def basic_cleaning(sentence):
    sentence = sentence.lower()
    sentence = ''.join(char for char in sentence if not char.isdigit())

    for punctuation in string.punctuation:
        sentence = sentence.replace(punctuation, '')

    sentence = sentence.strip()

    return sentence

In [83]:
import spacy

In [84]:
nlp = spacy.load('es_core_news_sm')

In [87]:
text =df_clean.iloc[1].description

In [88]:
doc = nlp(text)

In [89]:
print("Verbs:", [token.lemma_ for token in doc if token.pos_ == "VERB"])

Verbs: ['pertenecer', 'encontrar', 'sumar él', 'constituir', 'buscar', 'ampliar él', 'encontrar', 'sumar', 'realizar', 'corresponder', 'preparar', 'controlar', 'colaborar', 'dar', 'soporte', '•', 'registrar', '•', 'emitir', 'contar', 'insumos', 'contabilizar', 'realizar', 'realizar', 'mejorar', 'postular', 'postular', 'ingresar', 'contestar', 'ver', 'ajustar', 'buscar', 'contactarer', 'seguir']


In [90]:
for entity in doc.ents:
    print(entity.text, entity.label_)

Sercomed S.A. PER
Contact Center LOC
Nuestros LOC
Isapres LOC
Cajas de Compensación LOC
Compañías de Seguros LOC
Compañías de Asistencia LOC
Mutualidades LOC
Clínicas LOC
Hospitales LOC
Centros Médicos LOC
Descripción Funciones del Cargo MISC
Sercomed PER
Nos encontramos en la búsqueda de sumar MISC
Analista Contable ORG
• Colaborar LOC
Analizar LOC
• Registrar PER
ERP MISC
Emitir LOC
Gestionar PER
• Validar MISC
• Realizar MISC
Sueldo PER
Horario PER
Modalidad: Hibrida MISC
Nuestro Candidato Ideal LOC
Título PER
No Excluyente LOC
Manejo LOC
Excel MISC
Manejo LOC
Softland LOC
Deseable)  Sueldo Líquido Mensual LOC
El proceso MISC
Aira LOC
Postular LOC
Revisar PER
Ingresar LOC
Aira LOC
Aira LOC


## Basic Analysis

In [23]:
df_clean.job_site.fillna('Trabajando').nunique()

3

In [30]:
df_clean[df_clean.job_site == 'Trabajando'].count()/230

title                1.000000
posting_url          1.000000
geolocalization      1.000000
company              1.000000
salary               0.000000
experience           0.973913
work_schedule        0.847826
shift_type           0.000000
employment_type      0.000000
slots_avaliable      0.000000
driving_level        0.000000
job_site             1.000000
description          0.973913
requisites           0.000000
pills                0.973913
inclusive_posting    1.000000
published_date       1.000000
dtype: float64

In [33]:
df_clean[df_clean.job_site == 'Chiletrabajos'].count()/645

title                1.000000
posting_url          1.000000
geolocalization      1.000000
company              1.000000
salary               0.303876
experience           0.000000
work_schedule        1.000000
shift_type           0.000000
employment_type      0.621705
slots_avaliable      0.000000
driving_level        0.000000
job_site             1.000000
description          1.000000
requisites           0.000000
pills                1.000000
inclusive_posting    1.000000
published_date       1.000000
dtype: float64

In [35]:
df_clean[df_clean.job_site == 'Computrabajo'].count()/996

title                1.000000
posting_url          1.000000
geolocalization      1.000000
company              1.000000
salary               0.803213
experience           0.000000
work_schedule        0.803213
shift_type           0.000000
employment_type      0.803213
slots_avaliable      0.000000
driving_level        0.000000
job_site             1.000000
description          0.803213
requisites           0.803213
pills                0.803213
inclusive_posting    1.000000
published_date       1.000000
dtype: float64

In [42]:
df_clean.description.str.split().str.len().mean()

187.63990413421212

In [54]:
from collections import Counter
import spacy

In [64]:
text = '\n'.join(df_clean.iloc[:600].description.fillna('').str.lower().values)

In [78]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [79]:
stop_words = set(stopwords.words('spanish')) 
word_tokens = word_tokenize(text)


In [80]:
tokens_cleaned = [w for w in word_tokens if not w in stop_words]

In [81]:
tokens_cleaned

['requisitos',
 ':',
 '-experiencia',
 'mínima',
 '1',
 'año',
 'tratamientos',
 'depilación',
 'laser',
 ',',
 'manejo',
 'uso',
 'maquina',
 'laser',
 '.',
 '-realizar',
 'tratamientos',
 'drenaje',
 'linfático',
 ',',
 'post',
 'quirúrgico',
 ',',
 'hollywood',
 'peel',
 ',',
 'ondas',
 'rusas',
 ',',
 'maderoterapia',
 ',',
 'masajes',
 'reductivos',
 ',',
 '.',
 '-sueldo',
 'acorde',
 'mercado',
 ',',
 'jornada',
 'laboral',
 '44',
 'horas',
 'semanales',
 '.',
 'sercomed',
 's.a.',
 'contact',
 'center',
 'especializado',
 'salud',
 ',',
 'atendido',
 'administrativos',
 'expertos',
 'salud',
 ',',
 'médicos',
 ',',
 'enfermeras',
 ',',
 'personal',
 'técnico',
 'salud',
 'expertos',
 'prevención',
 'riesgos',
 '.',
 'clientes',
 'pertenecen',
 'siguientes',
 'rubros',
 ':',
 'isapres',
 ',',
 'cajas',
 'compensación',
 ',',
 'compañías',
 'seguros',
 ',',
 'compañías',
 'asistencia',
 ',',
 'mutualidades',
 ',',
 'clínicas',
 ',',
 'hospitales',
 'centros',
 'médicos',
 '.',
 'd