# Treating database

### Importing database

In [57]:
import json
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

from ast import literal_eval
from unidecode import unidecode

In [58]:
db_file = open('resume_base.json', 'rb')
db = json.loads(db_file.read())
db[0:2]

[{'id': 1,
  'title': 'Candidato de Contabilidade',
  'age': 49,
  'location': 'Centro, Curitiba - PR',
  'professional_goals_salary': '{"max": 0.0, "min": 0.0}',
  'professional_goals_contract_type': None,
  'professional_goals_working_hours': 'Período Integral',
  'education': '[{"dateEnd": "2003-11-01", "dateInit": "2003-02-01", "courseName": "Curso extra-curricular / Profissionalizante  em Contabilidade para executivos", "educationInstitutionName": "UNICENP em PR"}, {"dateEnd": "1989-12-01", "dateInit": "1985-02-01", "courseName": "Ensino Superior em Ciências contábeis", "educationInstitutionName": "FESP-Fundação de estudos Sociais do Parana em PR"}, {"dateEnd": "1984-11-01", "dateInit": "1982-02-01", "courseName": "Curso Técnico em Contabilidade", "educationInstitutionName": "CEPC-Centro de Estudos Profission.de Curitiba em PR"}]',
  'work_experience': '[{"field": "Contábil, Finanças, Economia", "level": null, "title": "CONTADOR", "dateEnd": "2006-12-01", "dateInit": "2005-03-01",

In [59]:
original_raw_data = pd.DataFrame(db)
raw_data = original_raw_data.copy()
raw_data = raw_data.set_index('id')
raw_data.head()

Unnamed: 0_level_0,age,complementary_data,education,language,location,marital_status,professional_goals_contract_type,professional_goals_salary,professional_goals_working_hours,sex,title,work_experience
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,49,"{""travel"": true, ""driving"": ""B"", ""vehicle"": ""C...","[{""dateEnd"": ""2003-11-01"", ""dateInit"": ""2003-0...","[{""level"": ""Intermediário"", ""language"": ""Inglê...","Centro, Curitiba - PR",,,"{""max"": 0.0, ""min"": 0.0}",Período Integral,Não Especificado,Candidato de Contabilidade,"[{""field"": ""Contábil, Finanças, Economia"", ""le..."
2,38,"{""travel"": true, ""driving"": ""A"", ""vehicle"": ""C...","[{""dateEnd"": ""2005-12-01"", ""dateInit"": ""2001-0...","[{""level"": ""Avançado"", ""language"": ""Inglês""}, ...",Santa Helena - PR,,,"{""max"": 0.0, ""min"": 0.0}",Período Integral,Não Especificado,Candidato de Zootecnia,"[{""field"": ""Agricultura, Pecuária, Veterinária..."
3,54,"{""travel"": true, ""driving"": ""B"", ""vehicle"": nu...","[{""dateEnd"": ""1999-10-01"", ""dateInit"": ""1999-0...","[{""level"": ""Intermediário"", ""language"": ""Inglê...","Centro, Curitiba - PR",,,"{""max"": 0.0, ""min"": 0.0}",Período Integral,Não Especificado,Candidato de Hotelaria,"[{""field"": ""Hotelaria, Turismo"", ""level"": null..."
4,41,"{""travel"": false, ""driving"": ""B"", ""vehicle"": ""...","[{""dateEnd"": ""2001-09-01"", ""dateInit"": ""1997-0...","[{""level"": ""Intermediário"", ""language"": ""Inglê...","Centro, Curitiba - PR",,,"{""max"": 0.0, ""min"": 0.0}",Período Integral,Não Especificado,Candidato de Marketing,"[{""field"": ""Marketing"", ""level"": null, ""title""..."
5,31,"{""travel"": false, ""driving"": null, ""vehicle"": ...","[{""dateEnd"": ""2004-05-01"", ""dateInit"": ""2004-0...","[{""level"": ""Intermediário"", ""language"": ""Portu...","Jardim Mediterrâneo, Colombo - PR",,,"{""max"": 0.0, ""min"": 0.0}",Período Integral,Não Especificado,Candidato de Instrumentação Industrial,"[{""field"": ""Industrial, Produção, Fábrica"", ""l..."


## Treating Data

In [60]:
treated_dataset = pd.DataFrame()
no_treatment_needed_attributes = ['age']
treated_dataset[no_treatment_needed_attributes] = raw_data[no_treatment_needed_attributes]

### Helper Functions

In [61]:
def swap_none_for_constant(data, column, fill_value, missing_values=None):
    imputer = SimpleImputer(missing_values=missing_values, strategy='constant', fill_value=fill_value)
    imputer.fit(data[column].values.reshape(-1,1))
    data[column] = imputer.transform(data[column].values.reshape(-1,1))
    return data

In [62]:
def one_edit_apart(s1, s2):
    if abs(len(s1) - len(s2)) > 1:
        return False
    if len(s1) > len(s2):
        bigger = s1
        smaller = s2
    else:
        bigger = s2
        smaller = s1

    equal_length_strings = len(s1) == len(s2)
    saw_difference = False
    i = 0
    j = 0
    while i < len(smaller): 
        if (bigger[j] != smaller[i]):
            if (saw_difference):
                return False
            saw_difference = True
            if (not equal_length_strings):
                i = i - 1
        i = i + 1
        j = j + 1
    return True

In [63]:
def string_list_dicts_to_dict(s, classes, dirty_portugues, dirty_ingles):
    d = {}
    for c in classes:
        d[c] = False;
    if s != None:
        list_of_dicts = json.loads(s)
        for language_info in list_of_dicts:
            language = language_info['language']
            if language == dirty_portugues:
                language = 'Português'
            elif language == dirty_ingles:
                language = 'Inglês'
            d[language] = True   #trocar para Label Encoder de nivel
    return d

In [64]:
def get_language_feature(target_dataset, target_column, classes, new_column_prefix, dirty_portugues, dirty_ingles):
    aux_dataset = pd.DataFrame()
    aux_dataset['dict'] = target_dataset[target_column].apply(string_list_dicts_to_dict, args=[classes, dirty_portugues, dirty_ingles])
    for key in classes:
        aux_dataset[new_column_prefix + key] = aux_dataset['dict'].map(lambda v: v[key])
    return aux_dataset.drop('dict', axis=1)

In [65]:
def string_list_to_dict(s, classes):
    input = s.replace(' ', '')
    d = {}
    for c in classes: 
        d[c] = False
    for driving_class in input.split(','):
        d[driving_class] = True
    return d

In [66]:
def split_string_list_in_features(target_dataset, target_column, classes, new_column_prefix):
    aux_dataset = pd.DataFrame()
    aux_dataset['dict'] = target_dataset[target_column].apply(string_list_to_dict, args=[classes])
    for key in classes:
        aux_dataset[new_column_prefix + key] = aux_dataset['dict'].map(lambda v: v[key])
    return aux_dataset.drop('dict', axis=1)

### Professional Goals Salary Treat

For a first approach as it is a litterally numerical attribute we will just divide into a max and min attribute

In [67]:
raw_data['professional_goals_salary'].unique()

array(['{"max": 0.0, "min": 0.0}', None, '{"max": 3000.0, "min": 1200.0}',
       ..., '{"max": 2450.0, "min": 2300.0}',
       '{"max": 14000.0, "min": 7000.0}',
       '{"max": 4800.0, "min": 2800.0}'], dtype=object)

Tem valores de None, entao a primeira abordagem trivial é só substituir por {"max": 0.0, "min":0.0}, pode ser interessante usar a média dos valores futuramente (inclusive substituindo as atuais ocorrencias de {"max": 0.0, "min":0.0})

In [68]:
display(raw_data['professional_goals_salary'].isnull().values.any())
swap_none_for_constant(raw_data, 'professional_goals_salary', {"max": 0.0, "min":0.0})
raw_data['professional_goals_salary'].isnull().values.any()

True

False

In [69]:
treated_dataset['professional_goals_salary_min'] = raw_data['professional_goals_salary'].astype(str).map(lambda v: literal_eval(v)['min'])
treated_dataset['professional_goals_salary_max'] = raw_data['professional_goals_salary'].astype(str).map(lambda v: literal_eval(v)['max'])

In [70]:
treated_dataset[['professional_goals_salary_min', 'professional_goals_salary_max']].sample(5)

Unnamed: 0_level_0,professional_goals_salary_min,professional_goals_salary_max
id,Unnamed: 1_level_1,Unnamed: 2_level_1
49101,900.0,1200.0
64613,900.0,1000.0
80561,3000.0,3500.0
94966,1000.0,1300.0
44315,5000.0,7000.0


### Professional Goals Contract Type Treat

In [71]:
raw_data['professional_goals_contract_type'].astype(str).apply(unidecode).unique()

array(['None', 'Estagio', 'Efetivo - CLT', 'Outros', 'Temporario',
       'Autonomo', 'Trainee', 'Cooperado', 'Prestador de Servicos (PJ)',
       'Estgio', 'Efetivo  CLT'], dtype=object)

Necessario unificar termos com distancia de edição até 1 e incluir não informado nas situações com None

In [72]:
raw_data = swap_none_for_constant(raw_data, 'professional_goals_contract_type', "Nao Informado")

In [73]:
raw_data['professional_goals_contract_type'].astype(str).apply(unidecode).unique()

array(['Nao Informado', 'Estagio', 'Efetivo - CLT', 'Outros',
       'Temporario', 'Autonomo', 'Trainee', 'Cooperado',
       'Prestador de Servicos (PJ)', 'Estgio', 'Efetivo  CLT'],
      dtype=object)

In [74]:
core_contracts_types = ['Nao Informado', 'Estagio', 'Efetivo - CLT', 'Outros',
       'Temporario', 'Autonomo', 'Trainee', 'Cooperado', 'Prestador de Servicos (PJ)']

treated_dataset['professional_goals_contract_type'] = raw_data['professional_goals_contract_type'].astype(str).apply(unidecode)
clean_contract_type = np.array(treated_dataset['professional_goals_contract_type'])

for contract_type in core_contracts_types:
    for i in range(0,len(clean_contract_type)):
        if one_edit_apart(clean_contract_type[i], contract_type):
            clean_contract_type[i] = contract_type

treated_dataset['professional_goals_contract_type'] = clean_contract_type

In [75]:
treated_dataset['professional_goals_contract_type'].unique()

array(['Nao Informado', 'Estagio', 'Efetivo - CLT', 'Outros',
       'Temporario', 'Autonomo', 'Trainee', 'Cooperado',
       'Prestador de Servicos (PJ)'], dtype=object)

### Professional Goals Working Hours

In [76]:
raw_data['professional_goals_working_hours'].astype(str).apply(unidecode).unique()

array(['Periodo Integral', 'Parcial manhas', 'None', 'Parcial tardes',
       'Parcial noites', 'Noturno', 'Perodo Integral', 'Parcial manhs'],
      dtype=object)

In [77]:
raw_data = swap_none_for_constant(raw_data, 'professional_goals_working_hours', "Nao Informado")

In [78]:
core_working_hours = ['Periodo Integral', 'Parcial manhas', 'None', 'Parcial tardes',
                       'Parcial noites', 'Noturno']

In [79]:
treated_dataset['professional_goals_working_hours'] = raw_data['professional_goals_working_hours'].astype(str).apply(unidecode)
clean_working_hours = np.array(treated_dataset['professional_goals_working_hours'])

for working_hours in core_working_hours:
    for i in range(0,len(clean_working_hours)):
        if one_edit_apart(clean_working_hours[i], working_hours):
            clean_working_hours[i] = working_hours

treated_dataset['professional_goals_working_hours'] = clean_working_hours

In [80]:
treated_dataset['professional_goals_working_hours'].unique()

array(['Periodo Integral', 'Parcial manhas', 'Nao Informado',
       'Parcial tardes', 'Parcial noites', 'Noturno'], dtype=object)

### Complementary Data Treat

In [81]:
treated_dataset.columns

Index(['age', 'professional_goals_salary_min', 'professional_goals_salary_max',
       'professional_goals_contract_type', 'professional_goals_working_hours'],
      dtype='object')

In [82]:
complementary_data_keys = list(json.loads(raw_data['complementary_data'].values[0]).keys())

for key in complementary_data_keys:
    treated_dataset['complementary_data_' + key] = raw_data['complementary_data'].map(lambda v: json.loads(v)[key])

Vehicle e Driving precisam de um tratamento extra para remover Nones e transformar strings que representam multiplos atributos em lista e futuramente em uma feature cada

In [83]:
treated_dataset['complementary_data_driving'].unique()

array(['B', 'A', None, 'A, B', 'E', 'C', 'D', 'B, A', 'A, B, C, D, E',
       'A, B, C, D', 'B, D', 'A, C', 'A, D', 'A, B, C', 'A, E', 'B, B',
       'B, C', 'A, A', 'C, A, B', 'B, A, C, D', 'B, C, D', 'B, D, A',
       'D, A', 'D, B', 'A, B, D', 'A, C, B', 'C, E', 'A, B, D, C',
       'B, A, D', 'D, B, C', 'D, E', 'A, D, B, C, E', 'D, A, B, C',
       'C, A', 'B, D, A, C', 'A, D, B, C', 'E, A', 'D, B, C, A',
       'A, B, D, E, C', 'A, B, A, B, A, B', 'A, C, D', 'E, A, B, C, D',
       'A, B, C, E'], dtype=object)

In [84]:
treated_dataset = swap_none_for_constant(treated_dataset, 'complementary_data_driving', "")

In [85]:
driving_classes = ['A', 'B', 'C', 'D', 'E']
new_features = split_string_list_in_features(treated_dataset, 'complementary_data_driving', driving_classes, 'driving_')
treated_dataset = pd.concat([treated_dataset,new_features], axis=1)
treated_dataset[['driving_' + k for k in driving_classes]].head()

Unnamed: 0_level_0,driving_A,driving_B,driving_C,driving_D,driving_E
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,False,True,False,False,False
2,True,False,False,False,False
3,False,True,False,False,False
4,False,True,False,False,False
5,False,False,False,False,False


In [86]:
treated_dataset[['driving_' + d_class for d_class in driving_classes]].head()

Unnamed: 0_level_0,driving_A,driving_B,driving_C,driving_D,driving_E
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,False,True,False,False,False
2,True,False,False,False,False
3,False,True,False,False,False
4,False,True,False,False,False
5,False,False,False,False,False


In [87]:
treated_dataset['complementary_data_vehicle'].unique()

array(['Carro particular', None, 'Moto', 'Caminhão', 'Outro',
       'Moto, Carro particular', 'Carro particular, Moto',
       'Carro particular, Outro', 'Outro, Carro particular',
       'Carro particular, Caminhão', 'Moto, Outro',
       'Carro particular, Moto, Outro',
       'Caminhão, Carro particular, Moto',
       'Moto, Carro particular, Outro', 'Caminhão, Outro',
       'Caminhão, Carro particular', 'Carro particular, Outro, Caminhão',
       'Moto, Carro particular, Caminhão',
       'Carro particular, Carro particular',
       'Moto, Carro particular, Caminhão, Outro',
       'Carro particular, Caminhão, Outro', 'Moto, Caminhão',
       'Carro particular, Moto, Caminhão',
       'Carro particular, Caminhão, Moto, Outro',
       'Moto, Caminhão, Carro particular', 'Caminhão, Moto',
       'Outro, Carro particular, Moto', 'Moto, Outro, Carro particular',
       'Carro particular, Caminhão, Moto', 'Moto, Caminhão, Outro',
       'Caminhão, Carro particular, Moto, Outro'], dtyp

In [88]:
vehicle_classes = ['Carro particular', 'Moto', 'Caminhão', 'Outro']
treated_dataset = swap_none_for_constant(treated_dataset, 'complementary_data_vehicle', "")
new_features = split_string_list_in_features(treated_dataset, 'complementary_data_vehicle', vehicle_classes, 'vehicle_')
treated_dataset = pd.concat([treated_dataset,new_features], axis=1)
treated_dataset[['vehicle_' + k for k in vehicle_classes]].head()

Unnamed: 0_level_0,vehicle_Carro particular,vehicle_Moto,vehicle_Caminhão,vehicle_Outro
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False
5,False,False,False,False


In [89]:
treated_dataset = treated_dataset.drop(['complementary_data_driving', 'complementary_data_vehicle'], axis=1)
treated_dataset.columns

Index(['age', 'professional_goals_salary_min', 'professional_goals_salary_max',
       'professional_goals_contract_type', 'professional_goals_working_hours',
       'complementary_data_travel', 'complementary_data_residence',
       'driving_A', 'driving_B', 'driving_C', 'driving_D', 'driving_E',
       'vehicle_Carro particular', 'vehicle_Moto', 'vehicle_Caminhão',
       'vehicle_Outro'],
      dtype='object')

In [90]:
treated_dataset['complementary_data_travel'].unique()

array([ True, False])

### Title Treat

In [91]:
def is_clean(word):
    return word != '/' and len(word) > 2 

In [92]:
from gensim.models import KeyedVectors

model = KeyedVectors.load_word2vec_format('cbow_s50.txt')
print('fitted')

fitted


In [93]:
splitted_list = [ title.split(' ')[1:] for title in raw_data['title'].unique() ]
final_list = [ [ i.lower() for i in title if is_clean(i)] if ( '/' in title or len(title) > 1 ) else [title[0].lower()] for title in splitted_list ]

In [94]:
def clean_word(word):
    bad_characters = [ '(', ')', ',']
    for character in bad_characters:
        if character in word:
            word = word.replace(character, '')
    if word == 'aeropespacial':
        word = 'aeroespacial'
    elif word == 'serralheiria':
        word = 'serralheira'
    elif word == 'mandrilagem':
        word = 'mecânico'
    elif word == 'colorimetrista':
        word = 'colorista'
    elif word == 'fresaria':
        word = 'fresa'
    elif word == 'traumatorpedia':
        word = 'trauma'
    elif word == 'neorologia':
        word = 'neurologia'
    elif word == 'censoriamento':
        word = 'sensoriamento'
    return word

def clean_title(word_list):
    return [clean_word(word) for word in word_list]

title_names = list(raw_data['title'].unique())
clean_titles = [clean_title(title) for title in final_list]

In [95]:
from sklearn.cluster import KMeans

In [96]:
def get_phrase_vector(phrase):
    acumulator = np.zeros(50)
    for word in phrase:
        acumulator = acumulator + model.word_vec(word)
    return np.divide(acumulator, len(phrase)) if len(phrase) != 0 else acumulator

title_vectors = [ get_phrase_vector(phrase) for phrase in clean_titles]

In [97]:
kmeans = KMeans(n_clusters=300 ,random_state=42).fit(title_vectors)
kmeans.labels_

array([216,  96, 178, 244, 193,  58, 293, 191, 292, 287, 221,   1, 133,
       101,  40, 215, 247, 137, 129, 248,  58, 153, 275,  65,  69,  80,
        74,  99, 176, 277, 187, 237, 241, 225, 159, 260, 119, 200,  21,
       145, 266,  92,  76,  72, 242, 149,  68,  74, 108, 110,  82, 249,
       224, 156, 135, 245, 208,  94,  77,  61,  62, 252, 171, 144, 286,
       261, 250, 202, 219, 171, 105, 278, 106,   5, 232, 189,  43, 201,
       117,  19,  66,  10,  75,   7,  36, 106,  73, 228, 154, 214, 234,
       246, 248, 175, 291, 298, 235,  48, 238,  18, 111, 120,  64,  55,
        24, 198,  78,  89, 295, 117,  51, 258, 196,  86,  28, 254,  71,
        95, 121, 140,  34, 271,  35, 114, 116,  59,  39, 190,   4, 182,
         4, 103, 212, 296,  38,  54, 276, 267, 269, 239,   0, 126,   8,
       259, 209, 281, 125, 102,  16,   6,  15, 100, 285, 185,  84, 268,
        70, 147, 184, 233, 143,  50, 257,  27, 166, 160,  83, 290, 132,
        20,  13,  53,  24,  60, 243, 227,  11, 204, 203,  81,  2

In [98]:
cluster = kmeans.predict(get_phrase_vector(['manutenção']).reshape(-1,50))
index = np.where(kmeans.labels_ == cluster)
for i in index[0]:
    print(title_names[i])

Candidato de Manutenção de Maquinaria


In [99]:
display(clean_titles[0])
model.word_vec(clean_titles[0][0])

['contabilidade']

array([ 0.107176,  0.307806, -0.257309,  0.218156,  0.053398,  0.39165 ,
        0.102445, -0.130696,  0.124261,  0.313528, -0.01023 ,  0.038352,
        0.115566,  0.248606, -0.118287,  0.410848, -0.085927,  0.136183,
       -0.569552,  0.265026, -0.502153,  0.112177, -0.192168,  0.307524,
       -0.305401, -0.288788,  0.041757, -0.010879, -0.266819,  0.171607,
        0.335651, -0.153568, -0.025703, -0.48954 ,  0.215752,  0.110026,
       -0.356865, -0.080168,  0.332134, -0.018578,  0.099627,  0.33376 ,
        0.152203, -0.018246, -0.515894, -0.290571, -0.068779,  0.092988,
       -0.335309,  0.088523], dtype=float32)

In [100]:
embeding1 = model.word_vec('casamento')
embeding2 = model.word_vec('anel')
display(embeding1)
display(embeding2)
display(embeding1 + embeding2)

array([-0.171098, -0.211437, -0.0406  , -0.25863 ,  0.099055, -0.083413,
        0.001604, -0.157318,  0.294888,  0.275117, -0.332538,  0.102064,
       -0.261761,  0.061627,  0.023133,  0.306068, -0.198975, -0.086375,
        0.063773,  0.129837, -0.4425  ,  1.058497, -0.030719,  0.542897,
        0.303192, -0.066534,  0.284432, -0.002667,  0.061529,  0.137891,
        0.054927, -0.20466 ,  0.289955,  0.068958, -0.447869,  0.416748,
        0.46535 , -0.348347, -0.413268,  0.445893, -0.193363,  0.190303,
       -0.139398, -0.018587,  0.209661, -0.178953,  0.357544,  0.139055,
        0.121369, -0.131245], dtype=float32)

array([ 0.124733,  0.104939,  0.373799, -0.020229,  0.287824, -0.298393,
       -0.010846, -0.183719,  0.446023,  0.386821, -0.201856,  0.205284,
       -0.403549, -0.115055,  0.217655,  0.178497, -0.19751 , -0.152529,
        0.026868, -0.124246, -0.171037,  0.505091, -0.201456,  0.42194 ,
        0.222594, -0.144888, -0.01132 , -0.167962,  0.175373,  0.104832,
        0.485471,  0.199978,  0.489813,  0.235639,  0.080489,  0.467274,
        0.227337, -0.181196, -0.279548,  0.477372, -0.032982,  0.121284,
        0.130649, -0.216834,  0.749901,  0.237159,  0.09029 , -0.041777,
       -0.192291,  0.324513], dtype=float32)

array([-0.04636499, -0.106498  ,  0.333199  , -0.27885902,  0.386879  ,
       -0.38180602, -0.009242  , -0.34103698,  0.740911  ,  0.661938  ,
       -0.534394  ,  0.307348  , -0.66531   , -0.053428  ,  0.240788  ,
        0.48456502, -0.396485  , -0.238904  ,  0.090641  ,  0.00559101,
       -0.613537  ,  1.5635879 , -0.232175  ,  0.96483696,  0.525786  ,
       -0.211422  ,  0.273112  , -0.170629  ,  0.236902  ,  0.24272299,
        0.540398  , -0.004682  ,  0.779768  ,  0.30459702, -0.36738   ,
        0.884022  ,  0.69268703, -0.52954304, -0.692816  ,  0.923265  ,
       -0.226345  ,  0.311587  , -0.00874899, -0.235421  ,  0.959562  ,
        0.05820599,  0.447834  ,  0.097278  , -0.07092201,  0.19326799],
      dtype=float32)

### Language Treat

In [101]:
languages_dicts = raw_data['language'].unique()
languages_dicts = [language_dict for language_dict in languages_dicts if language_dict]

In [102]:
all_languages = np.array([])
for languages in languages_dicts:
    language_dict = json.loads(languages)
    for language in language_dict:
        all_languages = np.append(all_languages,language['language'])

In [103]:
lazy_unique = np.unique([(language) for language in all_languages])
dirty_ingles = lazy_unique[22]
dirty_portugues = lazy_unique[33]
lazy_unique = np.delete(lazy_unique,[0, 22, 33])
classes = lazy_unique

In [104]:
language_features = get_language_feature(raw_data, 'language', classes, 'language_', dirty_portugues, dirty_ingles)
treated_dataset = pd.concat([treated_dataset,language_features], axis=1)

In [105]:
language_features[['language_Inglês', 'language_Português']].head()

Unnamed: 0_level_0,language_Inglês,language_Português
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,True,True
2,True,True
3,True,True
4,True,True
5,False,True


In [106]:
raw_data['language'].head().values

array(['[{"level": "Intermediário", "language": "Inglês"}, {"level": "Nativo", "language": "Português"}, {"level": "Básico", "language": "Espanhol"}]',
       '[{"level": "Avançado", "language": "Inglês"}, {"level": "Nativo", "language": "Português"}]',
       '[{"level": "Intermediário", "language": "Inglês"}, {"level": "Nativo", "language": "Português"}]',
       '[{"level": "Intermediário", "language": "Inglês"}, {"level": "Nativo", "language": "Português"}]',
       '[{"level": "Intermediário", "language": "Português"}]'],
      dtype=object)

### Location Treat

In [107]:
experimentation = raw_data.copy()
experimentation['count'] = 1
experimentation.groupby('location').sum().sort_values('count', ascending=False)

Unnamed: 0_level_0,age,count
location,Unnamed: 1_level_1,Unnamed: 2_level_1
"Centro, Curitiba - PR",334116,13703
"Cidade Industrial, Curitiba - PR",227783,9729
"Sítio Cercado, Curitiba - PR",151700,6510
"Cajuru, Curitiba - PR",117833,4943
"Boqueirão, Curitiba - PR",99511,4121
"Uberaba, Curitiba - PR",85292,3609
"Xaxim, Curitiba - PR",76205,3174
"Pinheirinho, Curitiba - PR",70399,2956
"Tatuquara, Curitiba - PR",61337,2718
"Novo Mundo, Curitiba - PR",61855,2530


In [108]:
test_db = [ {'age':resume['age'], 'id': resume['id'], 'professional_goals_working_hours': resume['professional_goals_working_hours'], 'sex':resume['sex'], 'title': resume['title']} for resume in db]

In [109]:
test_db[0]

{'age': 49,
 'id': 1,
 'professional_goals_working_hours': 'Período Integral',
 'sex': 'Não Especificado',
 'title': 'Candidato de Contabilidade'}

### Output Treated Dataset

In [110]:
with open('treated_data.json', 'w') as outfile:
    json.dump(test_db, outfile)

In [111]:
output_dataset = treated_dataset.reset_index()
output_dataset.to_csv('treated_dataset.csv')

In [112]:
output_dataset.columns

Index(['id', 'age', 'professional_goals_salary_min',
       'professional_goals_salary_max', 'professional_goals_contract_type',
       'professional_goals_working_hours', 'complementary_data_travel',
       'complementary_data_residence', 'driving_A', 'driving_B', 'driving_C',
       'driving_D', 'driving_E', 'vehicle_Carro particular', 'vehicle_Moto',
       'vehicle_Caminhão', 'vehicle_Outro', 'language_Afrikaans',
       'language_Alemão', 'language_Bengalí', 'language_Cantonês',
       'language_Catalão', 'language_Chinês', 'language_Coreano',
       'language_Croato', 'language_Dinamarquês', 'language_Eslovaco',
       'language_Espanhol', 'language_Farsi', 'language_Finlandês',
       'language_Francês', 'language_Galego', 'language_Grego',
       'language_Hebraico', 'language_Holandês', 'language_Húngaro',
       'language_Indonésio', 'language_Inglês', 'language_Islandês',
       'language_Italiano', 'language_Japonês', 'language_Latin',
       'language_Latviano', 'language_