In [2]:
import pandas as pd
from torch.utils.data import Dataset,DataLoader
from sentence_transformers import SentenceTransformer , LoggingHandler, losses
from scipy.spatial.distance import cosine, euclidean, canberra, braycurtis, chebyshev, cityblock
from sklearn.metrics import accuracy_score
import torch.multiprocessing
import numpy as np
from sklearn.metrics import top_k_accuracy_score
from datetime import datetime
import logging
from sentence_transformers.datasets import ParallelSentencesDataset
import pickle
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
logger = logging.getLogger(__name__)


lang = 'ar'

In [3]:
# read csv files 

In [4]:
base_data_path = './model_data/'
all_labels = pd.read_csv(base_data_path+'all_labels.csv')
all_labels = all_labels[all_labels['lang']==lang]
all_occupations = pd.read_csv(base_data_path+'esco_occupations.csv')
all_skills = pd.read_csv(base_data_path+'all_skills.csv')
graphdf = pd.read_csv(base_data_path+'graphs.csv')

lang_all_occupation = all_occupations.loc[all_occupations['lang']==lang]
lang_occupation_occupations = lang_all_occupation.loc[all_occupations['conceptType']=='Occupation']
occ_skill_df = graphdf[graphdf['nodesTypes']=='nonhierarchical-occupation-skill']
occ_skill_df = occ_skill_df[occ_skill_df['relationType']=='essential']
occ_skill_df = occ_skill_df.groupby('occupationUri')['skillUri'].apply(list).reset_index(name='new')


  all_occupations = pd.read_csv(base_data_path+'esco_occupations.csv')
  graphdf = pd.read_csv(base_data_path+'graphs.csv')


In [None]:
#  create skill_texts dictionary that is 

In [5]:
def skill_text(item):
    preferredLabel = ''
    altLabels = ''
    description = ''
    if str(item['preferredLabel']) !='nan':
        preferredLabel = str(item['preferredLabel'])+' '
    if str(item['altLabels']) != 'nan':
        altLabels = str(item['altLabels']).replace('\n',', ')
    if str(item['description']) != 'nan':
        description = str(item['description'])

    return preferredLabel+' '+description+' '+altLabels

In [7]:
all_skills = all_skills[all_skills['lang']==lang]
skill_texts = {}
for idx, row in lang_occupation_occupations.iterrows():
    skill_texts[row['conceptUri']]=''
for idx, occ_skill in occ_skill_df.iterrows():
    skill_uris = occ_skill['new']
    occupationUri = occ_skill['occupationUri']
    skills = all_skills[all_skills['conceptUri'].isin(skill_uris)]
    skill_texts[occupationUri] = ', '.join([skill_text(item) for idx,item in skills.iterrows()])


In [8]:
class EscoOccupationsDataset(Dataset):
    def __init__(self,conceptUriMapRev,  occupations_df, lang):
        self.conceptUriMapRev = conceptUriMapRev
        self.occupations = occupations_df.loc[occupations_df['lang']==lang]

    def __len__(self):
        return len(self.occupations)

    def __getitem__(self, idx):
        row = self.occupations.iloc[idx]
        title = str(row['preferredLabel']).lower()
        alternatives = lang_occupation_occupations.iloc[0]['altLabels'].replace('\n',' , ').lower()
        if len(alternatives)>0:
            title  = title+' , '+alternatives
        description = str(row['description']).replace('\n',' ').lower()
        _id = self.conceptUriMapRev[row['conceptUri']]
        return _id, f"title: {title} \n description: {description}"

In [9]:
test_df = all_labels
len(test_df)

2515

In [10]:
test_df.head(1)

Unnamed: 0.1,Unnamed: 0,project_name,description,title,occupation_title,occupation_id,task_id,iscoGroup,conceptUri,isco_preferredLabel,altLabels,isco_description,lang
8427,8427,SAU,مهندس اتصالات وشبكات\nإدارة ومراقبة العمل\nكتا...,مهندس اتصالات,مهندس اتصالات / مهندسة اتصالات,45,61dde5ad527776b760a866b4,2153.0,http://data.europa.eu/esco/occupation/02eb0ae6...,مهندس اتصالات / مهندسة اتصالات,,,ar


In [None]:
# consider just most frequent  

In [11]:
from collections import Counter
 
def most_frequent(List):
    occurence_count = Counter(List)
    return occurence_count.most_common(1)[0][0]
test_df = test_df[~test_df['conceptUri'].isna()]
stack_descripts = []
unsimilar_tests = []
for _,row in test_df.iterrows():
    if row['description'] not in stack_descripts:
        conceptUri = most_frequent(test_df.loc[test_df['description']==row['description']]['conceptUri'])
        stack_descripts.append(row['description'])
        unsimilar_tests.append({'conceptUri':conceptUri, 'description':row['description'],'preferredLabel':row['title']})
test_df = pd.DataFrame(unsimilar_tests)
len(test_df)

1242

In [None]:
# load pretrain embeder

In [13]:
model_name = 'paraphrase-multilingual-mpnet-base-v2'
embedder = SentenceTransformer(model_name)

2022-05-01 18:32:37 - Load pretrained SentenceTransformer: paraphrase-multilingual-mpnet-base-v2
2022-05-01 18:32:48 - Use pytorch device: cuda


In [15]:
# embed esco and label data

In [16]:
test_df['text'] = [row['preferredLabel']+'\n'+row['description'] for idx, row in test_df.iterrows()]
lang_occupation_occupations['text']= [str(row['preferredLabel'])+"\n"+str(row['description'])+', '+ str(row['altLabels']).replace('\n',', ')  for idx, row in lang_occupation_occupations.iterrows()]
lang_occupation_occupations['class'] = list(range(0,len(lang_occupation_occupations)))
test_embeds = embedder.encode(test_df['text'].tolist())
lang_occupation_occupations_embeds = embedder.encode(lang_occupation_occupations['text'].tolist())

Batches:   0%|          | 0/39 [00:00<?, ?it/s]

Batches:   0%|          | 0/94 [00:00<?, ?it/s]

In [18]:
# create test data

In [17]:
test_data = []
for ids, item in test_df.iterrows():
    test_data.append({'title': item['preferredLabel'],'description':item['description']})

In [18]:
labels = []
for ids, item in test_df.iterrows():
    conceptUri = item['conceptUri']
    class_id = lang_occupation_occupations[lang_occupation_occupations['conceptUri']==conceptUri].iloc[0]['class']
    labels.append(class_id)
y_true = np.array(labels)

In [19]:
# embeder cosine model without training on embeddings 

In [21]:
# ###### def deep_model(batch_text):
# device='cuda:0'
# texts = [item['title'].lower()+' '+item['description'].lower() for item in batch_data]
# embeds = embedder.encode(texts)
# torch_embeds = torch.from_numpy(embeds).to(device)
# torch_classes = torch.from_numpy(np.array(lang_occupation_occupations_embeds)).to(device).T
# y_score = torch.mm(torch_embeds,torch_classes)
# y_score = y_score.cpu().data.numpy()
# top_k_accuracy_score(y_true, y_score, k=5, normalize=True, labels=list(range(len(lang_occupation_occupations))))

In [22]:
# suppose english esco occupation and arabic esco occupation have same meaning and can be translate to each other so embedding of same occupation in two langguage have embedding 

In [23]:
esco_occupations = pd.read_csv(base_data_path+'esco_occupations.csv')
en_esco_occupations = esco_occupations.loc[esco_occupations['lang']=='en'].copy()
en_esco_occupations = en_esco_occupations.rename(columns={'description':'en_description'})
ar_esco_occupations = esco_occupations.loc[esco_occupations['lang']=='ar'].copy()
ar_esco_occupations = ar_esco_occupations.rename(columns={'description':'ar_description'})
en_ar_esco_occupations = pd.merge(en_esco_occupations, ar_esco_occupations, on='conceptUri')
en_ar_esco_occupations.head(1)

  esco_occupations = pd.read_csv(base_data_path+'esco_occupations.csv')


Unnamed: 0,Unnamed: 0_x,conceptType_x,conceptUri,iscoGroup_x,preferredLabel_x,altLabels_x,hiddenLabels_x,status_x,modifiedDate_x,regulatedProfessionNote_x,...,regulatedProfessionNote_y,scopeNote_y,definition_y,inScheme_y,ar_description,code_y,lang_y,esco_version_y,occupation_id_y,external_id_y
0,24064,Occupation,http://data.europa.eu/esco/occupation/00030d09...,2654,technical director,technical and operations director\nhead of tec...,,released,2016-07-05T13:58:41Z,http://data.europa.eu/esco/regulated-professio...,...,http://data.europa.eu/esco/regulated-professio...,,,http://data.europa.eu/esco/concept-scheme/occu...,يدرك المديرون الفنيون الرؤى الفنية للمبتكرين ض...,2654.1.7,ar,v1.1.0,2.0,http://data.europa.eu/esco/occupation/00030d09...


In [25]:
# same discution of equal embedding in esco skills

In [26]:
esco_skills = pd.read_csv(base_data_path+'all_skills.csv')
en_esco_skills = esco_skills.loc[esco_skills['lang']=='en'].copy()
en_esco_skills = en_esco_skills.rename(columns={'description':'en_description'})
ar_esco_skills = esco_skills.loc[esco_skills['lang']=='ar'].copy()
ar_esco_skills = ar_esco_skills.rename(columns={'description':'ar_description'})
en_ar_esco_skills = pd.merge(en_esco_skills, ar_esco_skills, on='conceptUri')
en_ar_esco_skills.head(1)


Unnamed: 0,Unnamed: 0_x,conceptType_x,conceptUri,skillType_x,reuseLevel_x,altLabels_x,en_description,lang_x,preferredLabel_x,Unnamed: 0_y,conceptType_y,skillType_y,reuseLevel_y,altLabels_y,ar_description,lang_y,preferredLabel_y
0,0,KnowledgeSkillCompetence,http://data.europa.eu/esco/skill/0005c151-5b5a...,skill/competence,sector-specific,manage staff of music\ncoordinate duties of mu...,Assign and manage staff tasks in areas such as...,en,,101738,KnowledgeSkillCompetence,skill/competence,sector-specific,,توزيع مهام الفرق وإدارتها في مجالات مثل تسجيل ...,ar,


In [1]:
def get_sentece(title, description):
    return f"{title} {description}"

sentences = []
for idx, row in en_ar_esco_occupations.iterrows():
    sentences.append([get_sentece(row['preferredLabel_x'], row['en_description']),get_sentece(row['preferredLabel_y'], row['ar_description'])])
for idx, row in en_ar_esco_skills.iterrows():
    sentences.append([get_sentece(row['preferredLabel_x'], row['en_description']),get_sentece(row['preferredLabel_y'], row['ar_description'])])
for idx, row in en_ar_esco_occupations.iterrows():
    sentences.append([get_sentece(row['preferredLabel_x'], row['en_description']),get_sentece(row['preferredLabel_x'], row['en_description'])])
for idx, row in en_ar_esco_skills.iterrows():
    sentences.append([get_sentece(row['preferredLabel_x'], row['en_description']),get_sentece(row['preferredLabel_x'], row['en_description'])])

NameError: name 'en_ar_esco_occupations' is not defined

In [None]:
# train model 

In [2]:
for num_epochs in [10]:
    teacher_model_name = 'paraphrase-multilingual-mpnet-base-v2'   #Our monolingual teacher model, we want to convert to multiple languages
    student_model_name = 'paraphrase-multilingual-mpnet-base-v2'       #Multilingual base model we use to imitate the teacher model


    max_seq_length = 256                #Student model max. lengths for inputs (number of word pieces)
    train_batch_size = 64               #Batch size for training
    inference_batch_size = 64           #Batch size at inference
    max_sentences_per_language = 500000 #Maximum number of  parallel sentences for training
    train_max_sentence_length = 256     #Maximum length (characters) for parallel training sentences

    num_warmup_steps = 1000             #Warumup steps

    num_evaluation_steps = 1000          #Evaluate performance after every xxxx steps
    dev_sentences = 1000                 #Number of parallel sentences to be used for development


    # Define the language codes you would like to extend the model to
    source_languages = set(['en'])                      # Our teacher model accepts English (en) sentences
    target_languages = set(['ar'])    # We want to extend the model to these new languages. For language codes, see the header of the train file


    output_path = "output/make-multilingual-"+"-".join(sorted(list(source_languages))+sorted(list(target_languages)))+"-"+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")


    logger.info("Load teacher model")
    teacher_model = SentenceTransformer(teacher_model_name)


    logger.info("Create student model from scratch")

    student_model = SentenceTransformer(student_model_name)


    ###### Read Parallel Sentences Dataset ######
    train_data = ParallelSentencesDataset(student_model=student_model, teacher_model=teacher_model, batch_size=inference_batch_size, use_embedding_cache=True)
    train_data.add_dataset(parallel_sentences=sentences, max_sentences=max_sentences_per_language, max_sentence_length=train_max_sentence_length)

    train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size)
    train_loss = losses.MSELoss(model=student_model)




    # Train the model
    student_model.fit(train_objectives=[(train_dataloader, train_loss)],
              # evaluator=evaluation.SequentialEvaluator(evaluators, main_score_function=lambda scores: np.mean(scores)),
              epochs=num_epochs,
              warmup_steps=num_warmup_steps,
              # evaluation_steps=num_evaluation_steps,
              output_path=output_path,
              save_best_model=True,
              optimizer_params= {'lr': 2e-5, 'eps': 1e-6, 'correct_bias': False}
              )


NameError: name 'datetime' is not defined

In [3]:
# calculate topk accuracy of cosine classifier

texts = [item['title'].lower()+' '+item['description'].lower() for item in batch_data]
embeds = student_model.encode(texts)
lang_occupation_occupations_embeds = student_model.encode(lang_occupation_occupations['text'].tolist())
torch_embeds = torch.from_numpy(embeds).to(device)
torch_classes = torch.from_numpy(np.array(lang_occupation_occupations_embeds)).to(device).T
y_score = torch.mm(torch_embeds,torch_classes)
y_score = y_score.cpu().data.numpy()
topk_acc = top_k_accuracy_score(y_true, y_score, k=5, normalize=True, labels=list(range(len(lang_occupation_occupations))))
logger.info(f"topk accuracy = {topk_acc}")


NameError: name 'batch_data' is not defined

In [4]:
# load model and see the results

In [5]:
model = SentenceTransformer('output/make-multilingual-en-ar-2022-04-30_16-30-49')
texts = [item['title'].lower()+' '+item['description'].lower() for item in batch_data]
embeds = model.encode(texts)
lang_occupation_occupations_embeds = model.encode(lang_occupation_occupations['text'].tolist())
torch_embeds = torch.from_numpy(embeds).to(device)
torch_classes = torch.from_numpy(np.array(lang_occupation_occupations_embeds)).to(device).T
norm_torch_classes = torch_classes/torch.linalg.norm(torch_classes,axis=0)

y_score = torch.mm(torch_embeds,torch_classes)
y_score = y_score.cpu().data.numpy()
topk_acc = top_k_accuracy_score(y_true, y_score, k=5, normalize=True, labels=list(range(len(lang_occupation_occupations))))
logger.info(f"topk accuracy = {topk_acc}")

NameError: name 'SentenceTransformer' is not defined

In [32]:
with open('output/make-multilingual-en-ar-2022-04-30_16-30-49/classes.pickle', 'wb') as f:
    pickle.dump(lang_occupation_occupations['conceptUri'].tolist(),f,protocol=pickle.HIGHEST_PROTOCOL)

In [33]:
with open('output/make-multilingual-en-ar-2022-04-30_16-30-49/embeds.pickle', 'wb') as f:
    pickle.dump(lang_occupation_occupations_embeds,f,protocol=pickle.HIGHEST_PROTOCOL)
