In [1]:
import re
import pandas as pd
import unittest
from collections import Counter
import json
import os

In [2]:
# Load the raw job postings export (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='../data/raw_jobs.csv')

In [3]:
def simple_regex_search(pattern:str,in_string:str):
    """Search ``in_string`` for ``pattern``, ignoring case.

    Returns the first ``re.Match`` found anywhere in the text, or ``None``
    when the pattern does not occur.
    """
    case_insensitive = re.compile(pattern, flags=re.IGNORECASE)
    return case_insensitive.search(in_string)

# Inline check: an alternation of "técnico" variants is found in a sample sentence.
assert simple_regex_search('t[eé]cnic|t[eé]c.?|tec','se busca técnico en enfermería') is not None

In [4]:
def multiple_patterns_match(patterns:list, in_string:str):
    """Search ``in_string`` for any of several regex ``patterns``.

    The patterns are combined into a single alternation (``p1|p2|...``) and
    handed to ``simple_regex_search``.

    Args:
        patterns: list of regular-expression strings.
        in_string: text to search.

    Returns:
        Whatever ``simple_regex_search`` returns for the combined pattern,
        or ``None`` when ``patterns`` is empty.
    """
    # An empty list would join to "", and the empty regex matches every
    # string; "no patterns" must mean "no match", not "always match".
    if not patterns:
        return None
    return simple_regex_search("|".join(patterns),in_string)

# Inline check: at least one of the "técnico" variants is found in a sample sentence.
assert multiple_patterns_match(
    patterns=['t[eé]cnic','t[eé]c.?','tec'],
    in_string='se busca técnico en enfermería',
) is not None

In [5]:
def find_all_matches(target_classes:dict, source_data:list):
    """Label each text in ``source_data`` with every class whose patterns match.

    ``target_classes`` maps a class name to either:

    * a list of regex patterns - the class matches when any pattern is found; or
    * a dict with optional keys:
        - ``'Level1'``: patterns (or a nested spec) evaluated directly;
        - ``'Level2'``: ``{'Patterns': [...], 'Patterns_lvl2': ...}`` where
          ``Patterns_lvl2`` is only evaluated when one of ``Patterns`` matched.

    Args:
        target_classes: class name -> pattern specification (see above).
        source_data: list of strings to classify.

    Returns:
        A flat list of class names, one entry per (text, successful rule).
        A class can therefore appear more than once for the same text, e.g.
        when both its ``Level1`` and ``Level2`` rules match.

    Raises:
        KeyError: if a ``'Level2'`` entry lacks ``'Patterns'`` or
            ``'Patterns_lvl2'``.
    """
    normalized_classes = []
    for raw_data_item in source_data:
        for target_class, patterns in target_classes.items():
            if isinstance(patterns, dict):
                level1_spec = patterns.get('Level1')
                if level1_spec:
                    # extend() instead of append(*...): a nested spec may yield
                    # several labels, which append(*...) would reject with TypeError.
                    normalized_classes.extend(
                        find_all_matches({target_class: level1_spec}, [raw_data_item])
                    )
                level2_spec = patterns.get('Level2')
                if level2_spec:
                    gate_patterns = level2_spec['Patterns']
                    gated_spec = level2_spec['Patterns_lvl2']
                    # The second-level spec only counts when the gate matched.
                    if multiple_patterns_match(gate_patterns, raw_data_item):
                        normalized_classes.extend(
                            find_all_matches({target_class: gated_spec}, [raw_data_item])
                        )
            elif isinstance(patterns, list):
                if multiple_patterns_match(patterns, raw_data_item):
                    normalized_classes.append(target_class)
    return normalized_classes


In [7]:
# Specialty name -> regex specification consumed by find_all_matches.
# All patterns are raw strings so regex escapes such as \b reach the regex
# engine intact instead of being interpreted by Python (where '\b' is backspace).
specialties_dict = {
    # r'\beu\b' matches the standalone abbreviation "eu" only as a whole word.
    'Enfermería':[r'enfermer.?',r'\beu\b'],
    'Kinesiología':[r'kinesi.log.?',r'kinesiolog.?',r'fisioterapia',r'mesoterapia',r'erg.nomo',r'ergonom[ií]a'],
    'TENS':{
        # Level1: the literal acronym at the end of a word.
        'Level1': [r'tens?\b'],
        # Level2: a "técnico" variant must be present before the nursing
        # pattern is allowed to count towards TENS.
        'Level2': {
            'Patterns': [r't[eé]cnic',r't[eé]c.?',r'tec'],
            'Patterns_lvl2': [r'enfermer.?']
        }
    }
}
# Try the matcher on a single sample posting and tally how often each specialty was hit.
result = find_all_matches(
    target_classes=specialties_dict,
    source_data=['Se busca kinesiologo o técnico en enfermería para terapia intensiva'],
)
dict(Counter(result))

{'Enfermería': 1, 'Kinesiología': 1, 'TENS': 1}

In [16]:
def load_config_file(file_name):
    """Load a JSON config from ``../normalizer/config_files/<file_name>.json``.

    The path is resolved relative to the current working directory.

    Args:
        file_name: base name of the config file, without the ``.json`` suffix.

    Returns:
        The parsed JSON content.

    Raises:
        FileNotFoundError: if the file does not exist.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    config_path = os.path.join('..','normalizer','config_files', f"{file_name}.json")
    # Read as UTF-8 explicitly: the configs hold accented Spanish text, and the
    # platform default encoding (e.g. cp1252 on Windows) would garble it.
    with open(config_path, encoding='utf-8') as f:
        return json.load(f)

In [48]:
def find_specialty(title:str = None, requirements:str = None, pills:str = None):
    """Count specialty matches across the given job-posting fields.

    The pattern specification is loaded from the ``specialties`` config file.
    Fields that are ``None`` or empty are skipped; the remaining ones are
    scanned in the order title, requirements, pills.

    Returns:
        dict mapping each detected specialty to the number of times it was
        matched; specialties that never matched are absent.
    """
    specialty_patterns = load_config_file('specialties')
    texts_to_scan = [text for text in (title, requirements, pills) if text]
    occurrences = Counter(find_all_matches(specialty_patterns, texts_to_scan))
    return dict(occurrences)

In [56]:
# Sample posting that mentions both a nurse and a nursing technician.
sample_title = 'Enfermera / Técnico en Enfermería - Especialista Equipos Médicos/Pabellón'
sample_requirements = 'Requisitos: - Título de Enfermera  / Técnico en Enfermería - Experiencia en pabellón - Experiencia en equipos médicos - Disponibilidad para trabajar en turnos rotativos'
specialties_detected = find_specialty(title=sample_title, requirements=sample_requirements)

In [57]:
# Work on a copy so the detection result itself is left untouched.
final = dict(specialties_detected)
# When TENS was counted exactly as often as Enfermería, drop TENS.
# Only detected specialties are present in the counts, so either key may be
# missing; guard the lookups instead of raising KeyError.
if 'TENS' in final and final['TENS'] == final.get('Enfermería', 0):
    final['TENS'] = 0

In [58]:
# Specialties whose count in `final` is still positive.
[specialty for specialty in final if final[specialty] > 0]

['Enfermería']

In [59]:
# Display the adjusted per-specialty counts.
final

{'Enfermería': 2, 'TENS': 0}