In [1]:
import pandas as pd
import langdetect as ld
import pycountry

# Pin langdetect's seed so repeated runs of the language filter give the same result.
ld.DetectorFactory.seed = 0
import esco_utils as eu
from difflib import SequenceMatcher
import tqdm

def detect_duplicate(vacc, window=500, prefix_len=50, threshold=.9):
    """Flag near-duplicate vacancies by comparing the start of their descriptions.

    Every row is compared with the rows that follow it inside a sliding window.
    A later row is flagged when the first ``prefix_len`` characters of the two
    descriptions have a ``difflib.SequenceMatcher`` ratio above ``threshold``.
    The earlier row of a matching pair is never flagged by that match, so one
    representative of each group of duplicates survives.

    Args:
        vacc: DataFrame with a ``description`` column holding strings.
        window: a row at position ``i`` is compared with positions
            ``i+1 .. i+window-1``.
        prefix_len: number of leading characters of each description compared.
        threshold: similarity ratio above which the later row is a duplicate.

    Returns:
        A list of booleans, positionally aligned with the rows of ``vacc``;
        ``True`` marks a row considered a duplicate of an earlier one.
    """
    # Slice each description once instead of once per comparison.
    prefixes = [text[:prefix_len] for text in vacc.description]
    total = len(prefixes)
    duplicate = [False]*total
    for i in tqdm.trange(total):
        for j in range(i+1, min(i+window, total)):
            if duplicate[j]:
                # Already flagged by an earlier row: skip it, but keep scanning
                # the rest of the window (stopping here would hide later
                # duplicates of row i).
                continue
            ratio = SequenceMatcher(a=prefixes[i], b=prefixes[j]).ratio()
            if ratio > threshold:
                duplicate[j] = True
    return duplicate

## Create Token

In [2]:
# Obtain an API token and build the client object that the cells below use.
# NOTE(review): the recorded output of this cell contains the raw token --
# clear the output before sharing or committing the notebook.
API_TOKEN = eu.get_token()
client = eu.Client(API_TOKEN)

# Override the base URL the client sends its API requests to.
client.api.base_api_url = 'http://flask_sdk:6221'

{'data': {'token': '12a1bad0-4cb1-464c-8c8e-344957672bef'}, 'message': 'successful', 'meta': {}}


## Connect to DB & get vacancy samples

In [3]:
# Two handles are returned: `vac_conn` is later passed to eu.sample_vacancy and
# `skill_conn` to eu.sql_all_tags.
vac_conn, skill_conn = eu.load_skillLab_DB()

## Train models

In [26]:
# Train models for the listed language codes; the recorded output logs one
# tfidf_knn configuration per language.
eu.train_models(['en','de','nl','es','pt','fr','ar'])

{"id": "c84f7388-79ca-11ec-a745-0242ac190008", "model_name": "tfidf_knn", "lang": "en", "ngram_min": 1, "ngram_max": 4, "n_neighbors": 50, "title_imp": 10, "alt_title_imp": 10, "case_insensitive": true}
{"id": "cbda63aa-79ca-11ec-a66c-0242ac190008", "model_name": "tfidf_knn", "lang": "de", "ngram_min": 1, "ngram_max": 4, "n_neighbors": 50, "title_imp": 10, "alt_title_imp": 10, "case_insensitive": true}
{"id": "cddf50d4-79ca-11ec-b61a-0242ac190008", "model_name": "tfidf_knn", "lang": "nl", "ngram_min": 1, "ngram_max": 4, "n_neighbors": 50, "title_imp": 10, "alt_title_imp": 10, "case_insensitive": true}
{"id": "d03de3fe-79ca-11ec-bf34-0242ac190008", "model_name": "tfidf_knn", "lang": "es", "ngram_min": 1, "ngram_max": 4, "n_neighbors": 50, "title_imp": 10, "alt_title_imp": 10, "case_insensitive": true}
{"id": "d31b931e-79ca-11ec-9ebf-0242ac190008", "model_name": "tfidf_knn", "lang": "pt", "ngram_min": 1, "ngram_max": 4, "n_neighbors": 50, "title_imp": 10, "alt_title_imp": 10, "case_insen

## Get all projects 

In [32]:
# Fetch every project and index the ids by project name. Later cells look
# projects up through `name2id`, so this cell must run before them.
all_projs = client.get_all_projects()['data']['projects']
# If two projects share a name, only the id of the last one listed is kept.
name2id = {prj['project_name']:prj['project_id'] for prj in all_projs}
name2id

{'GB': '61dd90dbce271774889fbc02',
 'DE': '61dd9651527776b760a845eb',
 'FR': '61dd96af527776b760a84719',
 'NL': '61dd9701527776b760a84847',
 'EL': '61dd97b7527776b760a84975',
 'PT': '61dd97da527776b760a84977',
 'ES': '61dda542527776b760a84981',
 'Mexico': '61ddbde4527776b760a84aaf',
 'United Kingdom': '61ddbe23527776b760a84bdd',
 'France': '61ddbeb7527776b760a84d0b',
 'Brazil': '61ddc0fb527776b760a84e39',
 'Netherlands': '61ddc2de527776b760a84f67',
 'Germany': '61ddc34d527776b760a85095',
 'Argentina': '61ddc51b527776b760a851c3',
 'Saudi Arabia': '61ddc5d6527776b760a852e9',
 'In Saudi Arabia': '61ddc9a2527776b760a85410',
 'test Saudi Arabia': '61ddc9ea527776b760a85536',
 'test 2 Saudi Arabia': '61ddcbed527776b760a8565c',
 'test 3 Saudi Arabia': '61ddd1f5527776b760a85782',
 'test 4 Saudi Arabia': '61ddd739527776b760a858a8',
 'GBR': '61dde40a527776b760a85f9f',
 'DEU': '61dde42a527776b760a860cd',
 'NLD': '61dde4cb527776b760a861fb',
 'FRA': '61dde4dc527776b760a86329',
 'MEX': '61dde50b52777

## Create project (attach latest model, icon, classes and samples)

In [25]:

# ISO 3166 lookup tables built from pycountry: alpha-2 code -> name / alpha-3 code.
alpha2name = {c.alpha_2: c.name for c in pycountry.countries}
alpha2alpha3 = {c.alpha_2: c.alpha_3 for c in pycountry.countries}

# --- parameters to tune per run ---
labels_per_task = 5
num_samples = 250
lang = 'en'

# Country whose vacancies and flag icon are used for a language; any language
# not listed falls back to its own upper-cased code.
lang2country = {'en': 'GB', 'es': 'MX', 'pt': 'BR', 'ar': 'SA'}
country = lang2country.get(lang, lang.upper())
project_name = 'test ' + alpha2alpha3[country] + '2'


models = eu.get_models()
models = pd.DataFrame(models['data'])
lang_models = models[models.lang==lang]
if lang_models.empty:
    # Fail with a readable message instead of a bare IndexError from .iloc[-1].
    raise ValueError(f"no model found for lang {lang!r}; train one first")
# Take the last model listed for this language -- presumably the most recently
# created one; TODO confirm the ordering returned by eu.get_models().
model_id = lang_models.id.iloc[-1]
print(f"model_id for {lang} = {model_id}")

icon_path = f"./flags/{country}.png"
response = client.upload_file(icon_path)
icon_id = response['data']['file_id']
print(response)
print(f"icon {icon_path} uploaded with id {icon_id}")

# Create the project, or update it when a project with this name already exists.
# `name2id` comes from the "Get all projects" cell, which must have been run.
project = eu.Project(project_name,
                  labels_per_task,
                  metadata={'lang':lang},
                  model_id=model_id,
                  icon_id=icon_id)
if project_name in name2id:
    project_id = name2id[project_name]
    client.edit_project(project,project_id=project_id)
else:
    response = client.create_project(project)
    project_id = response['data']['project_id']
print(f"project id = {project_id}")

# add icon to project
# NOTE(review): the project above already carries icon_id, so this extra edit
# may be redundant -- confirm before removing.
client.edit_project(eu.Project(icon_id=icon_id),project_id=project_id)

classes = eu.sql_all_tags(lang,skill_conn)
eu.add_classes(client, classes, project_id)

if lang=='ar':
    vacc = pd.read_csv('bayt.csv').rename(columns={'JobDescription': 'description','meta_Title': 'title'})
else:
    # sample more to make sure enough remains after de-duplication & language detection
    vacc = eu.sample_vacancy(country,vac_conn,num=num_samples*5)


def _safe_detect_lang(text):
    """Return the detected language code of `text`, or '' when it cannot be detected."""
    # Missing values (NaN/None) and very short snippets are not worth detecting.
    if not isinstance(text, str) or len(text) <= 10:
        return ''
    try:
        return ld.detect(text)
    except ld.LangDetectException:
        # langdetect raises when the text has no usable features
        # (e.g. only digits or punctuation).
        return ''


# detect language
vacc = vacc.assign(langdetect=vacc.description.apply(_safe_detect_lang))
# .copy() so the column added below is written to an independent frame rather
# than to a slice of the unfiltered one (avoids SettingWithCopyWarning).
vacc = vacc.loc[vacc.langdetect==lang].copy()
# detect duplicates
vacc['duplicate'] = pd.Series(detect_duplicate(vacc), index=vacc.index, dtype=bool)
vacc = vacc.loc[~vacc.duplicate]

# Filtering can leave fewer rows than requested; DataFrame.sample raises when
# n exceeds the population, so cap it and say so.
if vacc.empty:
    raise ValueError(f"no vacancies left for lang {lang!r} after language detection and de-duplication")
n_upload = min(num_samples, len(vacc))
if n_upload < num_samples:
    print(f"warning: only {n_upload} of the requested {num_samples} vacancies remain after filtering")
eu.add_samples(client, vacc.sample(n=n_upload), project_id)

## only for testing
print("add labelers", client.add_labelers(project_id, ["test@test.com",]))

model_id for en = 7884dad2-7312-11ec-beb7-0242ac120005
{'data': {'created_at': 'Thu, 20 Jan 2022 08:10:08 GMT', 'file_id': '61e918e0c2a562ff6ba0411f', 'file_name': 'Germany.png', 'mime_type': 'image/png', 'updated_at': 'Thu, 20 Jan 2022 08:10:08 GMT'}, 'message': 'successful', 'meta': {}}
icon ./flags/GB.png uploaded with id 61e918e0c2a562ff6ba0411f
project id = 61e918e0c2a562ff6ba04120


  return occupation_local.to_dict(orient="row")
  return occupation_local.to_dict(orient="row")
100%|██████████| 1250/1250 [00:40<00:00, 30.62it/s] 


{'task-type': 'esco-text-tagging', 'callback': None, 'items': [{'type': 'data', 'name': 'esco-occupations-data', 'data': {'title': 'Director of Product Development, Automotive (REMOTE)', 'description': 'About Assurant : One of the biggest businesses youve probably never heard of has some very big opportunities for people with big ambition. We are Assurant. A creative Fortune 500 company with 15,000 colleagues around the world, who protect what matters most to over 300 million customers. And whether its automotive support, mobile phone and gadget protection or house and business rental insurance, we are always ready to support our customers and clients. At Assurant, diversity helps us inspire creativity in the global marketplace, and we are believers in the strategic value of inclusion and how it improves performance, creates growth opportunities, better aligns us to our clients, and enhances employee engagement. The opportunity: Our continued year on year growth means opportunity for e

# Add beta labelers 

In [23]:
# Beta labelers to attach, keyed by project name. Every key must already be
# present in `name2id` (built by the "Get all projects" cell); otherwise the
# lookup in the loop raises KeyError.
proj2labelers = {
    'SAU2': ['a.elhayek@skilllab.io'],
    'DEU2': ['s.schmid+data@skilllab.io', 'c.bretgeld@skilllab.io', 'r.martens@skilllab.io'],
    'GBR2': ['k.binhumam@skilllab.io', 'g.portik@skilllab.io', 'a.radic@skilllab.io'],
    'BRA2': ['v.soares@skilllab.io'],
    'MEX2': ['a.chiba@skilllab.io'],
    'NLD2': ['l.weller@skilllab.io'],
}

# One add_labelers call per project, passing that project's whole e-mail list.
for pname, labelers in proj2labelers.items():
    pid = name2id[pname]
    client.add_labelers(pid,labelers)


# labelers of humansintheloop & discoverdignify

In [40]:


# External labelers, keyed by project name. This rebinds `proj2labelers`, so the
# mapping defined by the beta-labelers cell is no longer available afterwards.
proj2labelers = {
    'SAU2': ['simeon@humansintheloop.org'],
    'GBR2': ['simeon@humansintheloop.org'],
    'BRA2': ['maria@discoverdignify.com','dana@discoverdignify.com','anabarbara@discoverdignify.com'],
    'MEX2': ['maria@discoverdignify.com','dana@discoverdignify.com','anabarbara@discoverdignify.com'],
}
# One add_labelers call per project; a name missing from `name2id` raises KeyError.
for pname, labelers in proj2labelers.items():
    pid = name2id[pname]
    client.add_labelers(pid,labelers)

## Cancel & edit projects

In [51]:
# Cancel every task of the listed projects that is still pending or in progress.
# NOTE: this rebinds the notebook-level `project_name` / `project_id`.
for project_name in ['SAU2']:
    project_id = name2id[project_name]
    states = ['in-progress', 'pending']
    for state in states:
        tasks = client.get_all_tasks(project_id=project_id,status=state)
        # NOTE(review): assumes a single call returns all matching tasks --
        # confirm get_all_tasks is not paginated.
        for task in tasks['data']['tasks']:
            # Plain loop: the calls are made for their side effect only, so
            # there is no need to collect the responses in a throwaway list.
            client.cancel_task(project_id=project_id,task_id=task['_id'])

In [53]:
# NOTE(review): hidden-state dependency. The sampling at the bottom of this
# cell uses whatever `vacc` is currently in the kernel. The commented-out block
# records how `vacc` is expected to have been prepared (Arabic vacancies from
# bayt.csv, language-filtered and de-duplicated); re-enable it, or re-run the
# project-creation cell with lang='ar', before running this cell on a fresh kernel.
# country = 'SA'
# lang = 'ar'

# if lang=='ar':
#     vacc = pd.read_csv('bayt.csv').rename(columns={'JobDescription': 'description','meta_Title': 'title'})
# else:
#     # sample more to make sure enough remains after de-duplication & language detection
#     vacc = eu.sample_vacancy(country,vac_conn,num=num_samples*5) 
# # detect language 
# vacc['langdetect'] = vacc.description.apply(lambda x: ld.detect(x) if len(x)>10 else '')
# vacc = vacc.loc[vacc.langdetect==lang]
# # detect duplicates 
# vacc['duplicate'] = detect_duplicate(vacc)
# vacc = vacc.loc[~vacc.duplicate]

# Upload 238 randomly drawn vacancies to the existing 'SAU2' project.
# DataFrame.sample raises ValueError if `vacc` holds fewer than 238 rows.
project_id = name2id['SAU2']
eu.add_samples(client, vacc.sample(n=238), project_id)

{'task-type': 'esco-text-tagging', 'callback': None, 'items': [{'type': 'data', 'name': 'esco-occupations-data', 'data': {'title': 'اخصائي موارد بشرية', 'description': '·يدرس ويطورالتنظيم الداخلي للشركة ويدرس\xa0تأثيره على مرونة وتنفيذ\xa0العمليات.\\n·يعد ويطور\xa0الهيكل التنظيمي\xa0للشركة\xa0موضحاً الوحدات والعلاقات فيما بينها وتوزيع الادوار وتحديد\xa0المسميات الادارية لفريق العمل.\\n·يحلل ويوصف الوظائف\xa0انطلاقاً من أهداف وغايات الشركة ويحدد المؤهلات المناسبة\xa0لشغل الوظائف كالمؤهل العلمي المناسب ومدة ونوع الخبرة المطلوبة\\n·يستخدم\xa0الاساليب\xa0الاحصائية\xa0المتبعة للتنبؤ بالموارد البشرية\xa0كالاساليب الكمية والاساليب المعتمدة على تقدير جهد العمل\xa0ومقابلة الطلب بالعرض ومعالجة\xa0الفائض بالعجز\xa0وتحديد الاحتياجات النوعية من العمالة\\n·ينشئ ويدير\xa0علاقات جيدة مع مصادر الإمداد بالمتقدمين\\n·يخطط وينفذ ويعد خطة\xa0الاعلان عن الوظائف الشاغرة\\n·يخطط وينفذ– منفرداً أو بمشاركة مشرفي الشعب -\xa0مقابلات\xa0الأولية\xa0اختيار\xa0للمتقدمين لشغل وظائف معينة\\n·يخطط وينفذ بمشاركة مشرفي ال

## Cancel labeler

In [22]:
# Remove the test account from every project, then add it back only to projects
# whose name is exactly four characters long and ends in '2' (e.g. 'GBR2').
for pname,pid in name2id.items():
    client.cancel_labeler(project_id=pid,email='test@test.com')
    if len(pname)==4 and pname[3]=='2':
        client.add_labelers(pid,['test@test.com'])

### pick the most recently created model for projects in beta #2

In [8]:

# Point every beta #2 project at the last model listed for its language.
# NOTE: this rebinds the notebook-level `models`, `lang` and `model_id`.

# The model list does not depend on the project, so fetch it once up front
# instead of once per matching project.
models = eu.get_models()
models = pd.DataFrame(models['data'])

for prj_name,prj_id in name2id.items():
    # Beta #2 projects have a four-character name ending in '2' (e.g. 'GBR2').
    # Check the name first so no API call is spent on other projects.
    if not (len(prj_name)==4 and prj_name[3]=='2'):
        continue
    prj_metadata = client.get_project(prj_id)['data']['metadata']
    if 'lang' not in prj_metadata:
        continue
    lang = prj_metadata['lang']
    # chose the last created model
    # (assumes eu.get_models() lists models in creation order -- TODO confirm)
    model_id = models[models.lang==lang].id.iloc[-1]
    print(f"model_id for {lang} = {model_id}")
    client.edit_project(eu.Project(model_id=model_id),project_id=prj_id)


8


In [38]:
# NOTE(review): scratch cell -- nothing else in the visible notebook uses this
# value. The import sits outside the top import cell and the generator is
# unseeded, so the displayed number changes on every run.
import random

random.random()

0.5147021251552995