# Treating database

### Importing database

In [88]:
import json
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

from ast import literal_eval
from unidecode import unidecode

In [182]:
db_file = open('resume_base.json', 'rb')
db = json.loads(db_file.read())

In [90]:
original_raw_data = pd.DataFrame(db)
raw_data = original_raw_data.copy()
raw_data = raw_data.set_index('id')
raw_data.head()

Unnamed: 0_level_0,age,complementary_data,education,language,location,marital_status,professional_goals_contract_type,professional_goals_salary,professional_goals_working_hours,sex,title,work_experience
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,49,"{""travel"": true, ""driving"": ""B"", ""vehicle"": ""C...","[{""dateEnd"": ""2003-11-01"", ""dateInit"": ""2003-0...","[{""level"": ""Intermediário"", ""language"": ""Inglê...","Centro, Curitiba - PR",,,"{""max"": 0.0, ""min"": 0.0}",Período Integral,Não Especificado,Candidato de Contabilidade,"[{""field"": ""Contábil, Finanças, Economia"", ""le..."
2,38,"{""travel"": true, ""driving"": ""A"", ""vehicle"": ""C...","[{""dateEnd"": ""2005-12-01"", ""dateInit"": ""2001-0...","[{""level"": ""Avançado"", ""language"": ""Inglês""}, ...",Santa Helena - PR,,,"{""max"": 0.0, ""min"": 0.0}",Período Integral,Não Especificado,Candidato de Zootecnia,"[{""field"": ""Agricultura, Pecuária, Veterinária..."
3,54,"{""travel"": true, ""driving"": ""B"", ""vehicle"": nu...","[{""dateEnd"": ""1999-10-01"", ""dateInit"": ""1999-0...","[{""level"": ""Intermediário"", ""language"": ""Inglê...","Centro, Curitiba - PR",,,"{""max"": 0.0, ""min"": 0.0}",Período Integral,Não Especificado,Candidato de Hotelaria,"[{""field"": ""Hotelaria, Turismo"", ""level"": null..."
4,41,"{""travel"": false, ""driving"": ""B"", ""vehicle"": ""...","[{""dateEnd"": ""2001-09-01"", ""dateInit"": ""1997-0...","[{""level"": ""Intermediário"", ""language"": ""Inglê...","Centro, Curitiba - PR",,,"{""max"": 0.0, ""min"": 0.0}",Período Integral,Não Especificado,Candidato de Marketing,"[{""field"": ""Marketing"", ""level"": null, ""title""..."
5,31,"{""travel"": false, ""driving"": null, ""vehicle"": ...","[{""dateEnd"": ""2004-05-01"", ""dateInit"": ""2004-0...","[{""level"": ""Intermediário"", ""language"": ""Portu...","Jardim Mediterrâneo, Colombo - PR",,,"{""max"": 0.0, ""min"": 0.0}",Período Integral,Não Especificado,Candidato de Instrumentação Industrial,"[{""field"": ""Industrial, Produção, Fábrica"", ""l..."


## Treating Data

In [91]:
treated_dataset = pd.DataFrame()
no_treatment_needed_attributes = ['age']
treated_dataset[no_treatment_needed_attributes] = raw_data[no_treatment_needed_attributes]

### Helper Functions

In [92]:
def swap_none_for_constant(data, column, fill_value, missing_values=None):
    imputer = SimpleImputer(missing_values=missing_values, strategy='constant', fill_value=fill_value)
    imputer.fit(data[column].values.reshape(-1,1))
    data[column] = imputer.transform(data[column].values.reshape(-1,1))
    return data

In [93]:
def one_edit_apart(s1, s2):
    if abs(len(s1) - len(s2)) > 1:
        return False
    if len(s1) > len(s2):
        bigger = s1
        smaller = s2
    else:
        bigger = s2
        smaller = s1

    equal_length_strings = len(s1) == len(s2)
    saw_difference = False
    i = 0
    j = 0
    while i < len(smaller): 
        if (bigger[j] != smaller[i]):
            if (saw_difference):
                return False
            saw_difference = True
            if (not equal_length_strings):
                i = i - 1
        i = i + 1
        j = j + 1
    return True

In [94]:
def string_list_dicts_to_dict(s, classes, dirty_portugues, dirty_ingles):
    d = {}
    for c in classes:
        d[c] = False;
    if s != None:
        list_of_dicts = json.loads(s)
        for language_info in list_of_dicts:
            language = language_info['language']
            if language == dirty_portugues:
                language = 'Português'
            elif language == dirty_ingles:
                language = 'Inglês'
            d[language] = True   #trocar para Label Encoder de nivel
    return d

In [95]:
def get_language_feature(target_dataset, target_column, classes, new_column_prefix, dirty_portugues, dirty_ingles):
    aux_dataset = pd.DataFrame()
    aux_dataset['dict'] = target_dataset[target_column].apply(string_list_dicts_to_dict, args=[classes, dirty_portugues, dirty_ingles])
    for key in classes:
        aux_dataset[new_column_prefix + key] = aux_dataset['dict'].map(lambda v: v[key])
    return aux_dataset.drop('dict', axis=1)

In [96]:
def string_list_to_dict(s, classes):
    input = s.replace(' ', '')
    d = {}
    for c in classes: 
        d[c] = False
    for driving_class in input.split(','):
        d[driving_class] = True
    return d

In [97]:
def split_string_list_in_features(target_dataset, target_column, classes, new_column_prefix):
    aux_dataset = pd.DataFrame()
    aux_dataset['dict'] = target_dataset[target_column].apply(string_list_to_dict, args=[classes])
    for key in classes:
        aux_dataset[new_column_prefix + key] = aux_dataset['dict'].map(lambda v: v[key])
    return aux_dataset.drop('dict', axis=1)

### Professional Goals Salary Treat

For a first approach as it is a litterally numerical attribute we will just divide into a max and min attribute

In [98]:
raw_data['professional_goals_salary'].unique()

array(['{"max": 0.0, "min": 0.0}', None, '{"max": 3000.0, "min": 1200.0}',
       ..., '{"max": 2450.0, "min": 2300.0}',
       '{"max": 14000.0, "min": 7000.0}',
       '{"max": 4800.0, "min": 2800.0}'], dtype=object)

Tem valores de None, entao a primeira abordagem trivial é só substituir por {"max": 0.0, "min":0.0}, pode ser interessante usar a média dos valores futuramente (inclusive substituindo as atuais ocorrencias de {"max": 0.0, "min":0.0})

In [99]:
display(raw_data['professional_goals_salary'].isnull().values.any())
swap_none_for_constant(raw_data, 'professional_goals_salary', {"max": 0.0, "min":0.0})
raw_data['professional_goals_salary'].isnull().values.any()

True

False

In [100]:
treated_dataset['professional_goals_salary_min'] = raw_data['professional_goals_salary'].astype(str).map(lambda v: literal_eval(v)['min'])
treated_dataset['professional_goals_salary_max'] = raw_data['professional_goals_salary'].astype(str).map(lambda v: literal_eval(v)['max'])

In [101]:
treated_dataset[['professional_goals_salary_min', 'professional_goals_salary_max']].sample(5)

Unnamed: 0_level_0,professional_goals_salary_min,professional_goals_salary_max
id,Unnamed: 1_level_1,Unnamed: 2_level_1
10661,1000.0,2000.0
45230,900.0,1500.0
57963,0.0,0.0
20529,450.0,1000.0
54867,1100.0,6000.0


### Professional Goals Contract Type Treat

In [102]:
raw_data['professional_goals_contract_type'].astype(str).apply(unidecode).unique()

array(['None', 'Estagio', 'Efetivo - CLT', 'Outros', 'Temporario',
       'Autonomo', 'Trainee', 'Cooperado', 'Prestador de Servicos (PJ)',
       'Estgio', 'Efetivo  CLT'], dtype=object)

Necessario unificar termos com distancia de edição até 1 e incluir não informado nas situações com None

In [103]:
raw_data = swap_none_for_constant(raw_data, 'professional_goals_contract_type', "Nao Informado")

In [104]:
raw_data['professional_goals_contract_type'].astype(str).apply(unidecode).unique()

array(['Nao Informado', 'Estagio', 'Efetivo - CLT', 'Outros',
       'Temporario', 'Autonomo', 'Trainee', 'Cooperado',
       'Prestador de Servicos (PJ)', 'Estgio', 'Efetivo  CLT'],
      dtype=object)

In [105]:
core_contracts_types = ['Nao Informado', 'Estagio', 'Efetivo - CLT', 'Outros',
       'Temporario', 'Autonomo', 'Trainee', 'Cooperado', 'Prestador de Servicos (PJ)']

treated_dataset['professional_goals_contract_type'] = raw_data['professional_goals_contract_type'].astype(str).apply(unidecode)
clean_contract_type = np.array(treated_dataset['professional_goals_contract_type'])

for contract_type in core_contracts_types:
    for i in range(0,len(clean_contract_type)):
        if one_edit_apart(clean_contract_type[i], contract_type):
            clean_contract_type[i] = contract_type

treated_dataset['professional_goals_contract_type'] = clean_contract_type

In [106]:
treated_dataset['professional_goals_contract_type'].unique()

array(['Nao Informado', 'Estagio', 'Efetivo - CLT', 'Outros',
       'Temporario', 'Autonomo', 'Trainee', 'Cooperado',
       'Prestador de Servicos (PJ)'], dtype=object)

### Professional Goals Working Hours

In [107]:
raw_data['professional_goals_working_hours'].astype(str).apply(unidecode).unique()

array(['Periodo Integral', 'Parcial manhas', 'None', 'Parcial tardes',
       'Parcial noites', 'Noturno', 'Perodo Integral', 'Parcial manhs'],
      dtype=object)

In [108]:
raw_data = swap_none_for_constant(raw_data, 'professional_goals_working_hours', "Nao Informado")

In [109]:
core_working_hours = ['Periodo Integral', 'Parcial manhas', 'None', 'Parcial tardes',
                       'Parcial noites', 'Noturno']

In [110]:
treated_dataset['professional_goals_working_hours'] = raw_data['professional_goals_working_hours'].astype(str).apply(unidecode)
clean_working_hours = np.array(treated_dataset['professional_goals_working_hours'])

for working_hours in core_working_hours:
    for i in range(0,len(clean_working_hours)):
        if one_edit_apart(clean_working_hours[i], working_hours):
            clean_working_hours[i] = working_hours

treated_dataset['professional_goals_working_hours'] = clean_working_hours

In [111]:
treated_dataset['professional_goals_working_hours'].unique()

array(['Periodo Integral', 'Parcial manhas', 'Nao Informado',
       'Parcial tardes', 'Parcial noites', 'Noturno'], dtype=object)

### Complementary Data Treat

In [112]:
treated_dataset.columns

Index(['age', 'professional_goals_salary_min', 'professional_goals_salary_max',
       'professional_goals_contract_type', 'professional_goals_working_hours'],
      dtype='object')

In [113]:
complementary_data_keys = list(json.loads(raw_data['complementary_data'].values[0]).keys())

for key in complementary_data_keys:
    treated_dataset['complementary_data_' + key] = raw_data['complementary_data'].map(lambda v: json.loads(v)[key])

Vehicle e Driving precisam de um tratamento extra para remover Nones e transformar strings que representam multiplos atributos em lista e futuramente em uma feature cada

In [114]:
treated_dataset['complementary_data_driving'].unique()

array(['B', 'A', None, 'A, B', 'E', 'C', 'D', 'B, A', 'A, B, C, D, E',
       'A, B, C, D', 'B, D', 'A, C', 'A, D', 'A, B, C', 'A, E', 'B, B',
       'B, C', 'A, A', 'C, A, B', 'B, A, C, D', 'B, C, D', 'B, D, A',
       'D, A', 'D, B', 'A, B, D', 'A, C, B', 'C, E', 'A, B, D, C',
       'B, A, D', 'D, B, C', 'D, E', 'A, D, B, C, E', 'D, A, B, C',
       'C, A', 'B, D, A, C', 'A, D, B, C', 'E, A', 'D, B, C, A',
       'A, B, D, E, C', 'A, B, A, B, A, B', 'A, C, D', 'E, A, B, C, D',
       'A, B, C, E'], dtype=object)

In [115]:
treated_dataset = swap_none_for_constant(treated_dataset, 'complementary_data_driving', "")

In [116]:
driving_classes = ['A', 'B', 'C', 'D', 'E']
new_features = split_string_list_in_features(treated_dataset, 'complementary_data_driving', driving_classes, 'driving_')
treated_dataset = pd.concat([treated_dataset,new_features], axis=1)
treated_dataset[['driving_' + k for k in driving_classes]].head()

Unnamed: 0_level_0,driving_A,driving_B,driving_C,driving_D,driving_E
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,False,True,False,False,False
2,True,False,False,False,False
3,False,True,False,False,False
4,False,True,False,False,False
5,False,False,False,False,False


In [117]:
treated_dataset[['driving_' + d_class for d_class in driving_classes]].head()

Unnamed: 0_level_0,driving_A,driving_B,driving_C,driving_D,driving_E
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,False,True,False,False,False
2,True,False,False,False,False
3,False,True,False,False,False
4,False,True,False,False,False
5,False,False,False,False,False


In [118]:
treated_dataset['complementary_data_vehicle'].unique()

array(['Carro particular', None, 'Moto', 'Caminhão', 'Outro',
       'Moto, Carro particular', 'Carro particular, Moto',
       'Carro particular, Outro', 'Outro, Carro particular',
       'Carro particular, Caminhão', 'Moto, Outro',
       'Carro particular, Moto, Outro',
       'Caminhão, Carro particular, Moto',
       'Moto, Carro particular, Outro', 'Caminhão, Outro',
       'Caminhão, Carro particular', 'Carro particular, Outro, Caminhão',
       'Moto, Carro particular, Caminhão',
       'Carro particular, Carro particular',
       'Moto, Carro particular, Caminhão, Outro',
       'Carro particular, Caminhão, Outro', 'Moto, Caminhão',
       'Carro particular, Moto, Caminhão',
       'Carro particular, Caminhão, Moto, Outro',
       'Moto, Caminhão, Carro particular', 'Caminhão, Moto',
       'Outro, Carro particular, Moto', 'Moto, Outro, Carro particular',
       'Carro particular, Caminhão, Moto', 'Moto, Caminhão, Outro',
       'Caminhão, Carro particular, Moto, Outro'], dtyp

In [119]:
vehicle_classes = ['Carro particular', 'Moto', 'Caminhão', 'Outro']
treated_dataset = swap_none_for_constant(treated_dataset, 'complementary_data_vehicle', "")
new_features = split_string_list_in_features(treated_dataset, 'complementary_data_vehicle', vehicle_classes, 'vehicle_')
treated_dataset = pd.concat([treated_dataset,new_features], axis=1)
treated_dataset[['vehicle_' + k for k in vehicle_classes]].head()

Unnamed: 0_level_0,vehicle_Carro particular,vehicle_Moto,vehicle_Caminhão,vehicle_Outro
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False
5,False,False,False,False


In [120]:
treated_dataset = treated_dataset.drop(['complementary_data_driving', 'complementary_data_vehicle'], axis=1)
treated_dataset.columns

Index(['age', 'professional_goals_salary_min', 'professional_goals_salary_max',
       'professional_goals_contract_type', 'professional_goals_working_hours',
       'complementary_data_travel', 'complementary_data_residence',
       'driving_A', 'driving_B', 'driving_C', 'driving_D', 'driving_E',
       'vehicle_Carro particular', 'vehicle_Moto', 'vehicle_Caminhão',
       'vehicle_Outro'],
      dtype='object')

In [121]:
treated_dataset['complementary_data_travel'].unique()

array([ True, False])

### Title Treat

In [287]:
def is_clean(word):
    return word != '/' and len(word) > 2 

In [288]:
from gensim.models import KeyedVectors

model = KeyedVectors.load_word2vec_format('cbow_s50.txt')
print('fitted')

fitted


In [289]:
splitted_list = [ title.split(' ')[1:] for title in raw_data['title'].unique() ]
final_list = [ [ i.lower() for i in title if is_clean(i)] if ( '/' in title or len(title) > 1 ) else [title[0].lower()] for title in splitted_list ]

In [290]:
def clean_word(word):
    bad_characters = [ '(', ')', ',', '.', '"', "'", '-', '\r', '\n', '*', ';', 'ð', '\t']
    for character in bad_characters:
        if character in word:
            word = word.replace(character, '')
    if word == 'aeropespacial':
        word = 'aeroespacial'
    elif word == 'serralheiria':
        word = 'serralheira'
    elif word == 'mandrilagem':
        word = 'mecânico'
    elif word == 'colorimetrista':
        word = 'colorista'
    elif word == 'fresaria':
        word = 'fresa'
    elif word == 'traumatorpedia':
        word = 'trauma'
    elif word == 'neorologia':
        word = 'neurologia'
    elif word == 'censoriamento':
        word = 'sensoriamento'
    return word

def clean_title(word_list):
    return [clean_word(word) for word in word_list]

title_names = list(raw_data['title'].unique())
clean_titles = [clean_title(title) for title in final_list]

In [291]:
from sklearn.cluster import KMeans

In [295]:
def get_phrase_vector(phrase):
    acumulator = np.zeros(50)
    for word in phrase:
        acumulator = acumulator + model.word_vec(word)
    phrase_vector = np.divide(acumulator, len(phrase)) if len(phrase) != 0 else acumulator
    return phrase_vector if np.linalg.norm(phrase_vector) == 0 else np.divide(phrase_vector, np.linalg.norm(phrase_vector))

title_vectors = [ get_phrase_vector(phrase) for phrase in clean_titles]

In [296]:
kmeans = KMeans(n_clusters=300 ,random_state=42).fit(title_vectors)
kmeans.labels_

array([238, 210, 134, 262, 244,  39,  31, 267, 299, 298, 292, 236, 170,
       124, 223, 186, 207, 181,  88,  28,  39,  33,   3, 119,  60, 172,
       150, 109, 212, 178,  12,  11, 205, 219, 156, 271, 247, 281,  23,
       174, 241, 141, 154,  85, 218, 203, 147, 150, 214, 193, 126, 163,
         2, 242, 187,  61, 224,  77, 162,  96,  68, 285, 295,  34, 132,
        24, 266, 148,   3,  70,  30,  17,   6,  19, 275, 125, 138, 253,
       102,  13, 171, 127,  41, 121,  75,   6,  97,  95,  76, 129, 184,
       206, 291, 251,  25,  14, 194, 278, 290,   4, 180,  20,  49,  53,
        48, 123,  73, 259, 110, 102,  35, 216, 167, 175,  36, 166,  58,
        78, 228, 264, 221, 293,   8, 248, 106,  74,  91,  69,  67, 245,
       279, 270, 176, 202,  44, 159, 276,   9, 283, 213, 256, 254,  82,
        98, 209, 249, 237, 168,   1, 289,  15, 173, 152, 179, 297, 240,
       153, 232, 211, 257, 120, 208, 101, 165, 217, 117,  94,  16, 100,
        38,  83, 255,  57,  47,  93, 137, 202,  65, 142,  22,   

In [297]:
cluster = kmeans.predict(get_phrase_vector(['manutenção']).reshape(-1,50))
index = np.where(kmeans.labels_ == cluster)
for i in index[0]:
    print(title_names[i])

Candidato de Manutenção de Maquinaria


In [298]:
#title_names
raw_data['title'].value_counts()

Candidato sem experiência incluida em CV              18408
Candidato de Administração Geral                       9884
Candidato de Atendimento                               5205
Candidato de Produção                                  3150
Candidato de Lojas / Shopping                          3140
Candidato de Venda Interna                             2810
Candidato de Recepção                                  2725
Candidato de Telemarketing / Call Center Receptivo     1892
Candidato de Venda Externa                             1563
Candidato de Telemarketing / Call Center Ativo         1516
Candidato de Suporte Técnico                           1373
Candidato de Atendente / Recepção / Garçom             1324
Candidato de Estoque, Armazenagem, Depósito            1300
Candidato de Programador / Desenvolvedor               1154
Candidato de Logística                                 1149
Candidato de Advocacia Geral                           1045
Candidato de TI                         

In [299]:
word_to_find = ['medicina']

def get_top_5(word_to_find, title_vectors):
    vector = get_phrase_vector(word_to_find)
    distances = [np.absolute(np.linalg.norm(vector - candidate_vector)) 
                          for candidate_vector in title_vectors]
    return distances

distances = get_top_5(word_to_find, title_vectors)
distances_arg_sorted = np.argsort(distances)
for i in range(0,5):
    display(clean_titles[distances_arg_sorted[i]])
    print(distances[distances_arg_sorted[i]])

['medicina']

0.0


['medicina', 'trabalho']

0.460573055675814


['psicologia']

0.5026424398526381


['odontologia']

0.529267132401292


['neurologia']

0.5322574935804296


In [132]:
distances[0]

1.4843956826179894

### Work Description Treatment

In [249]:
work_experience_lists = raw_data['work_experience'].astype(str).tolist()

In [250]:
def get_descriptions_together(work_experience_list):
    full_description = ''
    for work_experience in work_experience_list:
        full_description = full_description + ' ' + work_experience['description'] if full_description else work_experience['description']
    return full_description   

In [251]:
work_experiences = [work_experience if work_experience and work_experience != 'None' else '[]' for work_experience in work_experience_lists]
work_experiences = [json.loads(work_experience) for work_experience in work_experiences]
work_experiences_description = [get_descriptions_together(work_experience) for work_experience in work_experiences]

In [252]:
work_experiences_description[6]

'*organizar arquivos de documentação técnicas\r\n*analisar ajustamentos e desenvolvmentos de cada educando\r\n*prestar atendimetos a pais de educandos\r\n* fechamentos do caixa \r\n*Telefonista'

In [253]:
from nltk.corpus import stopwords
print(stopwords.words('portuguese'))

['de', 'a', 'o', 'que', 'e', 'do', 'da', 'em', 'um', 'para', 'com', 'não', 'uma', 'os', 'no', 'se', 'na', 'por', 'mais', 'as', 'dos', 'como', 'mas', 'ao', 'ele', 'das', 'à', 'seu', 'sua', 'ou', 'quando', 'muito', 'nos', 'já', 'eu', 'também', 'só', 'pelo', 'pela', 'até', 'isso', 'ela', 'entre', 'depois', 'sem', 'mesmo', 'aos', 'seus', 'quem', 'nas', 'me', 'esse', 'eles', 'você', 'essa', 'num', 'nem', 'suas', 'meu', 'às', 'minha', 'numa', 'pelos', 'elas', 'qual', 'nós', 'lhe', 'deles', 'essas', 'esses', 'pelas', 'este', 'dele', 'tu', 'te', 'vocês', 'vos', 'lhes', 'meus', 'minhas', 'teu', 'tua', 'teus', 'tuas', 'nosso', 'nossa', 'nossos', 'nossas', 'dela', 'delas', 'esta', 'estes', 'estas', 'aquele', 'aquela', 'aqueles', 'aquelas', 'isto', 'aquilo', 'estou', 'está', 'estamos', 'estão', 'estive', 'esteve', 'estivemos', 'estiveram', 'estava', 'estávamos', 'estavam', 'estivera', 'estivéramos', 'esteja', 'estejamos', 'estejam', 'estivesse', 'estivéssemos', 'estivessem', 'estiver', 'estivermos

In [254]:
def clean_bad_characters(word):
    bad_characters = [ '(', ')', ',', '.', '"', "'", '-', '\r', '\n', '*', ';', 'ð', '\t', '–']
    for character in bad_characters:
        if character in word:
            word = word.replace(character, ' ')
    return word

def sanitize_word(word):
    word = clean_bad_characters(word)
    return word.lower()

def treat_these_words(words):
    actual_words = []
    for word in words:
        actual_words = actual_words + word.split('/')
    words = [sanitize_word(word) for word in actual_words if not(sanitize_word(word) in stopwords.words('portuguese'))]
    actual_words = []
    for word in words:
        actual_words = actual_words + word.split(' ')
    return [sanitize_word(word) for word in actual_words if not(sanitize_word(word) in stopwords.words('portuguese'))]

work_experiences_description_words = [treat_these_words(description.split(' ')) for description in work_experiences_description]

In [255]:
work_experiences_description_words

[[''],
 ['trabalhei',
  'lider',
  'maternidade',
  '',
  'aproximadamente',
  '1500',
  'matrizes',
  '',
  'tempo',
  'maravilhoso',
  'onde',
  'aprendi',
  'muitas',
  'coisas',
  '',
  '',
  'relacionamento',
  'pessoal',
  '',
  'aplicabilidade',
  'muita',
  'coisa',
  'aprendi',
  'faculdade',
  '',
  'alem',
  'ingles',
  '',
  'cultura',
  '',
  'etc',
  ''],
 ['chefe', 'juridico'],
 ['consultora',
  'autônoma',
  'projetos',
  'consultoria',
  'branding',
  '',
  'análise',
  'mercado',
  'nacional',
  'internacional',
  '',
  'reposicionamento',
  'reconstrução',
  'marca',
  '',
  'formação',
  'planos',
  'marketing',
  'branding',
  '',
  'treinamento',
  'equipe',
  'interna',
  '',
  'treinamentos',
  'conscientização',
  'importância',
  'branding',
  'colaboradores',
  '',
  'treinamento',
  'técnicas',
  'habilidades',
  'vendas',
  'treinamentos',
  'voltados',
  'valorização',
  'marca',
  '',
  'compras',
  '',
  'reposições',
  'estoque',
  '',
  '',
  'rotina',

In [256]:
from collections import Counter

def flat_list(l):
    return [item for sublist in l for item in sublist]

all_text = flat_list(work_experiences_description_words)
counts = Counter(all_text)
counts.most_common(100)

[('', 830869),
 ('atendimento', 51765),
 ('controle', 31439),
 ('clientes', 30918),
 ('cliente', 21200),
 ('elaboração', 19662),
 ('vendas', 19351),
 ('organização', 18202),
 ('desenvolvimento', 17363),
 ('atividades', 17327),
 ('sistema', 16806),
 ('empresa', 15236),
 ('produtos', 15207),
 ('documentos', 15139),
 ('manutenção', 14653),
 ('responsável', 14613),
 ('suporte', 14459),
 ('acompanhamento', 14175),
 ('caixa', 13312),
 ('projetos', 12202),
 ('processos', 12016),
 ('auxiliar', 11435),
 ('análise', 11124),
 ('•', 11104),
 ('serviços', 11040),
 ('relatórios', 10895),
 ('notas', 10675),
 ('área', 10662),
 ('estoque', 10559),
 ('emissão', 10096),
 ('materiais', 9786),
 ('contas', 9770),
 ('produção', 9611),
 ('planilhas', 9428),
 ('gestão', 9148),
 ('fiscais', 8942),
 ('dados', 8835),
 ('trabalho', 8803),
 ('setor', 8546),
 ('venda', 8466),
 ('fechamento', 8266),
 ('sistemas', 8118),
 ('equipe', 8033),
 ('criação', 7953),
 ('geral', 7756),
 ('telefone', 7694),
 ('novos', 7517),
 (

In [264]:
def create_bigram_list(word_list):
    new_list = []
    for i in range(0, len(word_list) - 1):
        new_list = new_list + [word_list[i] + ' ' + word_list[i + 1]]
    return new_list

clean_description_word_list = [ [a for a in word_list if a != ''] if word_list != [''] else word_list for word_list in work_experiences_description_words]
bigram_list = [ create_bigram_list(word_list) for word_list in clean_description_word_list]
bigram_list

[[],
 ['trabalhei lider',
  'lider maternidade',
  'maternidade aproximadamente',
  'aproximadamente 1500',
  '1500 matrizes',
  'matrizes tempo',
  'tempo maravilhoso',
  'maravilhoso onde',
  'onde aprendi',
  'aprendi muitas',
  'muitas coisas',
  'coisas relacionamento',
  'relacionamento pessoal',
  'pessoal aplicabilidade',
  'aplicabilidade muita',
  'muita coisa',
  'coisa aprendi',
  'aprendi faculdade',
  'faculdade alem',
  'alem ingles',
  'ingles cultura',
  'cultura etc'],
 ['chefe juridico'],
 ['consultora autônoma',
  'autônoma projetos',
  'projetos consultoria',
  'consultoria branding',
  'branding análise',
  'análise mercado',
  'mercado nacional',
  'nacional internacional',
  'internacional reposicionamento',
  'reposicionamento reconstrução',
  'reconstrução marca',
  'marca formação',
  'formação planos',
  'planos marketing',
  'marketing branding',
  'branding treinamento',
  'treinamento equipe',
  'equipe interna',
  'interna treinamentos',
  'treinamentos 

In [265]:
from collections import Counter

def flat_list(l):
    return [item for sublist in l for item in sublist]

all_text = flat_list(bigram_list)
counts = Counter(all_text)
counts.most_common(100)

[('atendimento cliente', 12415),
 ('notas fiscais', 7808),
 ('atendimento público', 5573),
 ('atendimento clientes', 5345),
 ('atendimento telefônico', 5035),
 ('contas pagar', 4130),
 ('controle estoque', 3259),
 ('fechamento caixa', 2757),
 ('atendimento publico', 2529),
 ('emissão notas', 2526),
 ('banco dados', 2472),
 ('entrada saída', 2465),
 ('suporte técnico', 2453),
 ('pagar receber', 2184),
 ('elaboração relatórios', 2117),
 ('folha pagamento', 2027),
 ('rotinas administrativas', 1936),
 ('principais atividades', 1869),
 ('abertura fechamento', 1846),
 ('via telefone', 1819),
 ('novos clientes', 1789),
 ('nota fiscal', 1738),
 ('outras atividades', 1693),
 ('recrutamento seleção', 1596),
 ('elaboração planilhas', 1549),
 ('fluxo caixa', 1525),
 ('atendimento telefone', 1471),
 ('controle qualidade', 1391),
 ('organização estoque', 1389),
 ('lançamento notas', 1388),
 ('organização documentos', 1321),
 ('organização arquivos', 1292),
 ('redes sociais', 1257),
 ('demais ativida

### Language Treat

In [139]:
languages_dicts = raw_data['language'].unique()
languages_dicts = [language_dict for language_dict in languages_dicts if language_dict]

In [140]:
all_languages = np.array([])
for languages in languages_dicts:
    language_dict = json.loads(languages)
    for language in language_dict:
        all_languages = np.append(all_languages,language['language'])

In [141]:
lazy_unique = np.unique([(language) for language in all_languages])
dirty_ingles = lazy_unique[22]
dirty_portugues = lazy_unique[33]
lazy_unique = np.delete(lazy_unique,[0, 22, 33])
classes = lazy_unique

In [142]:
language_features = get_language_feature(raw_data, 'language', classes, 'language_', dirty_portugues, dirty_ingles)
treated_dataset = pd.concat([treated_dataset,language_features], axis=1)

In [143]:
language_features[['language_Inglês', 'language_Português']].head()

Unnamed: 0_level_0,language_Inglês,language_Português
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,True,True
2,True,True
3,True,True
4,True,True
5,False,True


In [144]:
raw_data['language'].head().values

array(['[{"level": "Intermediário", "language": "Inglês"}, {"level": "Nativo", "language": "Português"}, {"level": "Básico", "language": "Espanhol"}]',
       '[{"level": "Avançado", "language": "Inglês"}, {"level": "Nativo", "language": "Português"}]',
       '[{"level": "Intermediário", "language": "Inglês"}, {"level": "Nativo", "language": "Português"}]',
       '[{"level": "Intermediário", "language": "Inglês"}, {"level": "Nativo", "language": "Português"}]',
       '[{"level": "Intermediário", "language": "Português"}]'],
      dtype=object)

### Location Treat

In [145]:
experimentation = raw_data.copy()
experimentation['count'] = 1
experimentation.groupby('location').sum().sort_values('count', ascending=False)

Unnamed: 0_level_0,age,count
location,Unnamed: 1_level_1,Unnamed: 2_level_1
"Centro, Curitiba - PR",334116,13703
"Cidade Industrial, Curitiba - PR",227783,9729
"Sítio Cercado, Curitiba - PR",151700,6510
"Cajuru, Curitiba - PR",117833,4943
"Boqueirão, Curitiba - PR",99511,4121
"Uberaba, Curitiba - PR",85292,3609
"Xaxim, Curitiba - PR",76205,3174
"Pinheirinho, Curitiba - PR",70399,2956
"Tatuquara, Curitiba - PR",61337,2718
"Novo Mundo, Curitiba - PR",61855,2530


In [146]:
test_db = [ {'age':resume['age'], 'id': resume['id'], 'professional_goals_working_hours': resume['professional_goals_working_hours'], 'sex':resume['sex'], 'title': resume['title']} for resume in db]

In [147]:
test_db[0]

{'age': 49,
 'id': 1,
 'professional_goals_working_hours': 'Período Integral',
 'sex': 'Não Especificado',
 'title': 'Candidato de Contabilidade'}

### Output Treated Dataset

In [266]:
with open('treated_data.json', 'w') as outfile:
    json.dump(test_db, outfile)

In [267]:
treated_dataset['title'] = raw_data['title']
output_dataset = treated_dataset.reset_index()
output_dataset.to_csv('treated_dataset.csv')

In [268]:
output_dataset.columns

Index(['id', 'age', 'professional_goals_salary_min',
       'professional_goals_salary_max', 'professional_goals_contract_type',
       'professional_goals_working_hours', 'complementary_data_travel',
       'complementary_data_residence', 'driving_A', 'driving_B', 'driving_C',
       'driving_D', 'driving_E', 'vehicle_Carro particular', 'vehicle_Moto',
       'vehicle_Caminhão', 'vehicle_Outro', 'language_Afrikaans',
       'language_Alemão', 'language_Bengalí', 'language_Cantonês',
       'language_Catalão', 'language_Chinês', 'language_Coreano',
       'language_Croato', 'language_Dinamarquês', 'language_Eslovaco',
       'language_Espanhol', 'language_Farsi', 'language_Finlandês',
       'language_Francês', 'language_Galego', 'language_Grego',
       'language_Hebraico', 'language_Holandês', 'language_Húngaro',
       'language_Indonésio', 'language_Inglês', 'language_Islandês',
       'language_Italiano', 'language_Japonês', 'language_Latin',
       'language_Latviano', 'language_

Index(['Unnamed: 0', 'id', 'age', 'professional_goals_salary_min',
       'professional_goals_salary_max', 'professional_goals_contract_type',
       'professional_goals_working_hours', 'complementary_data_travel',
       'complementary_data_residence', 'driving_A', 'driving_B', 'driving_C',
       'driving_D', 'driving_E', 'vehicle_Carro particular', 'vehicle_Moto',
       'vehicle_Caminhão', 'vehicle_Outro', 'language_Afrikaans',
       'language_Alemão', 'language_Bengalí', 'language_Cantonês',
       'language_Catalão', 'language_Chinês', 'language_Coreano',
       'language_Croato', 'language_Dinamarquês', 'language_Eslovaco',
       'language_Espanhol', 'language_Farsi', 'language_Finlandês',
       'language_Francês', 'language_Galego', 'language_Grego',
       'language_Hebraico', 'language_Holandês', 'language_Húngaro',
       'language_Indonésio', 'language_Inglês', 'language_Islandês',
       'language_Italiano', 'language_Japonês', 'language_Latin',
       'language_Latvian