In [2]:
import pandas as pd
from torch.utils.data import Dataset,DataLoader
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine, euclidean, canberra, braycurtis, chebyshev, cityblock
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import torch.multiprocessing
import networkx as nx
import numpy as np
from sklearn.metrics import top_k_accuracy_score
from collections import Counter
import pickle

# read csv files

In [5]:
# Language to evaluate and the folder that holds the exported CSV files.
lang = 'en'
base_data_path = './model_data/'

# read labels: keep only the annotation rows written in `lang`
test_df = pd.read_csv(base_data_path+'all_labels.csv')
test_df = test_df[test_df['lang']==lang]

# read esco occupation data
all_occupations = pd.read_csv(base_data_path+'all_occupations.csv')
# Filter with `lang` rather than a hard-coded 'en' so the labels and the
# occupations are always taken from the same language.
lang_all_occupation = all_occupations.loc[all_occupations['lang']==lang]
# Build the mask from the frame being filtered; the previous version took it
# from the unfiltered frame and relied on implicit index alignment.
lang_occupation_occupations = lang_all_occupation.loc[lang_all_occupation['conceptType']=='Occupation']

In [6]:
# Display the first row of the language-filtered label frame.
test_df.head(1)

Unnamed: 0.1,Unnamed: 0,project_name,description,title,occupation_title,occupation_id,task_id,iscoGroup,conceptUri,isco_preferredLabel,altLabels,isco_description,lang
0,0,GBR,Are you ready to find a new direction where yo...,Entry Level Sales Assistant,sales assistant,1864,61dde40c527776b760a85fa0,5223.0,http://data.europa.eu/esco/occupation/9ba74e8a...,sales assistant,senior sales assistant\nsupermarket sales assi...,Sales assistants represent the direct contact ...,en


# use the most frequent label of each task as its ground-truth label.


In [7]:
def most_frequent(List):
    """Return the element that appears most often in ``List``.

    Counting is delegated to ``collections.Counter``; when several elements
    share the highest count, whichever one ``Counter.most_common`` ranks
    first is returned. An empty input raises ``IndexError``.
    """
    ranked = Counter(List).most_common(1)
    top_item, _ = ranked[0]
    return top_item


# Discard annotations that have no ESCO concept URI attached.
test_df = test_df[test_df['conceptUri'].notna()]

# One row per (description, title) pair; 'new' holds every URI annotated for
# that pair and 'conceptUri' keeps the most frequent one.
test_df_goupby_tasks = (
    test_df
    .groupby(['description','title'])['conceptUri']
    .apply(list)
    .reset_index(name='new')
)
test_df_goupby_tasks['conceptUri'] = test_df_goupby_tasks['new'].apply(most_frequent)

In [8]:
# Display the first grouped task with its list of labels and the chosen label.
test_df_goupby_tasks.head(1)

Unnamed: 0,description,title,new,conceptUri
0,4 x Warehouse Hygiene cleaner wanted Location:...,Warehouse Hygiene cleaner,[http://data.europa.eu/esco/occupation/bea705f...,http://data.europa.eu/esco/occupation/bea705fe...


In [6]:
# Display the first row of the filtered ESCO occupation frame.
lang_occupation_occupations.head(1)

Unnamed: 0.1,Unnamed: 0,conceptType,conceptUri,preferredLabel,altLabels,description,lang
619,619,Occupation,http://data.europa.eu/esco/occupation/00030d09...,technical director,technical and operations director\nhead of tec...,Technical directors realise the artistic visio...,en


# make train and test dataset

In [9]:

# Test side: one text per task, "<title>\n<description>", with its label URI.
tests_texts = (test_df_goupby_tasks['title']+'\n'+test_df_goupby_tasks['description']).tolist()
tests_occs = test_df_goupby_tasks['conceptUri']

# Train side: "<preferredLabel>\n<description>, <altLabels joined by ', '>".
# A missing (NaN) field would turn the whole concatenation into NaN, and the
# str() conversion below would then produce the literal text 'nan' for that
# occupation. Replace missing fields with empty strings so the remaining
# fields are kept.
occ_labels = lang_occupation_occupations['preferredLabel'].fillna('')
occ_descriptions = lang_occupation_occupations['description'].fillna('')
occ_alt_labels = lang_occupation_occupations['altLabels'].fillna('').str.replace('\n', ', ', regex=False)
# Only add the ', ' separator when there are alternative labels to append.
occ_alt_suffix = occ_alt_labels.map(lambda labels: ', ' + labels if labels else '')
train_texts = (occ_labels + "\n" + occ_descriptions + occ_alt_suffix).tolist()
train_occs = lang_occupation_occupations['conceptUri']

tests_texts = [str(tt) for tt in tests_texts]
train_texts = [str(tt) for tt in train_texts]

# conceptUri -> integer class index, in the row order of the occupation frame
# (the same order in which train_texts was built).
class_map = {conceptUri:idx for idx,conceptUri in enumerate(lang_occupation_occupations['conceptUri'])}
# NOTE: a test label whose URI is not among the occupations raises KeyError here.
tests_class = [class_map[occ] for occ in tests_occs]
train_class = [class_map[occ] for occ in train_occs]

In [None]:
# inverse lookup: class index -> conceptUri
rev_class_map = {idx: uri for uri, idx in class_map.items()}

# load pretrained model "all-mpnet-base-v2"

In [11]:
  
# Name of the pretrained checkpoint handed to SentenceTransformer.
model_name = 'all-mpnet-base-v2'
# Encoder instance; later cells call embedder.encode on the test and train texts.
embedder = SentenceTransformer(model_name)

# calculate the center of each class (we only use ESCO data, so each class has a single sample and its center equals the embedding of that sample)
# calculate top-k accuracy with a kNN (k=1) lookup against those centers; the cell below reports top-5 accuracy

In [12]:
# Keep using the second GPU when it exists, but fall back to the first GPU or
# the CPU so the cell also runs on machines without a 'cuda:1' device.
if torch.cuda.is_available():
    device = 'cuda:1' if torch.cuda.device_count() > 1 else 'cuda:0'
else:
    device = 'cpu'

# Embed the test texts and the occupation texts with the same encoder.
tests_embeds = embedder.encode(tests_texts)
train_embeds = embedder.encode(train_texts)

test_embeds = torch.from_numpy(tests_embeds).to(device)
train_embeds = torch.from_numpy(train_embeds).to(device)

# y_score[i, j] is the dot product between test text i and occupation text j.
y_score = torch.mm(test_embeds,train_embeds.T)
y_score = y_score.cpu().numpy()

# Fraction of test texts whose true class is among the 5 highest-scoring occupations.
top_k_accuracy_score(tests_class, y_score, k=5, normalize=True, labels=list(range(len(lang_occupation_occupations))))

0.6644067796610169

# save embeds and class index

In [15]:
# Move the embeddings to host memory before pickling. A tensor pickled while
# it lives on a CUDA device can only be unpickled on a machine that has that
# device; a CPU tensor loads anywhere and the consumer can move it with .to().
with open('embeds.pickle', 'wb') as handle:
    pickle.dump(train_embeds.detach().cpu(), handle, protocol=pickle.HIGHEST_PROTOCOL)


In [None]:
# class_map is keyed by conceptUri (values are the integer class indices), so
# looking it up with an integer raised KeyError. Order the URIs by their index
# instead, so that train_classes[i] is the conceptUri of class i.
train_classes = sorted(class_map, key=class_map.get)
with open('classes.pickle', 'wb') as handle:
    pickle.dump(train_classes, handle, protocol=pickle.HIGHEST_PROTOCOL)
