In [5]:
import pandas as pd
from torch.utils.data import Dataset,DataLoader
from sentence_transformers import SentenceTransformer , LoggingHandler, losses
from scipy.spatial.distance import cosine, euclidean, canberra, braycurtis, chebyshev, cityblock
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import torch.multiprocessing
import networkx as nx
import numpy as np
from sklearn.metrics import top_k_accuracy_score
from datetime import datetime
import logging
from sentence_transformers.datasets import ParallelSentencesDataset
from torch.optim import Adam 
from datasets import load_metric

logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
logger = logging.getLogger(__name__)


langs = ['en', 'de', 'nl', 'pt', 'es', 'ar']

In [6]:
# read csv files

In [7]:
base_data_path = './model_data/'
all_labels = pd.read_csv(base_data_path+'all_labels.csv')
all_labels = all_labels[all_labels['lang'].isin(langs)]
all_occupations = pd.read_csv(base_data_path+'esco_occupations.csv')
all_skills = pd.read_csv(base_data_path+'all_skills.csv')
graphdf = pd.read_csv(base_data_path+'graphs.csv')
lang_all_occupation = all_occupations.loc[all_occupations['lang'].isin(langs)]
lang_occupation_occupations = lang_all_occupation.loc[all_occupations['conceptType']=='Occupation']
occ_skill_df = graphdf[graphdf['nodesTypes']=='nonhierarchical-occupation-skill']
occ_skill_df = occ_skill_df[occ_skill_df['relationType']=='essential']
occ_skill_df = occ_skill_df.groupby('occupationUri')['skillUri'].apply(list).reset_index(name='new')
test_df = all_labels

  all_occupations = pd.read_csv(base_data_path+'esco_occupations.csv')
  graphdf = pd.read_csv(base_data_path+'graphs.csv')


In [8]:
# create skill_texts dictionary that is 

In [9]:
def skill_text(item):
    preferredLabel = ''
    altLabels = ''
    description = ''
    if str(item['preferredLabel']) !='nan':
        preferredLabel = str(item['preferredLabel'])+' '
    if str(item['altLabels']) != 'nan':
        altLabels = str(item['altLabels']).replace('\n',', ')
    if str(item['description']) != 'nan':
        description = str(item['description'])+' '
        # +' '+description+' '+altLabels
    return description+' '+altLabels

In [10]:
# # all_skills = all_skills[all_skills['lang']==lang]
# # skill_texts = {}
# for idx, row in lang_occupation_occupations.iterrows():
#     skill_texts[row['conceptUri']]=''
# for idx, occ_skill in occ_skill_df.iterrows():
#     skill_uris = occ_skill['new']
#     occupationUri = occ_skill['occupationUri']
#     skills = all_skills[all_skills['conceptUri'].isin(skill_uris)]
#     skill_texts[occupationUri] = ''.join([skill_text(item) for idx,item in skills.iterrows()])


In [11]:
class EscoOccupationsDataset(Dataset):
    def __init__(self,conceptUriMapRev,  occupations_df, lang):
        self.conceptUriMapRev = conceptUriMapRev
        self.occupations = occupations_df.loc[occupations_df['lang']==lang]

    def __len__(self):
        return len(self.occupations)

    def __getitem__(self, idx):
        row = self.occupations.iloc[idx]
        title = str(row['preferredLabel']).lower()
        alternatives = lang_occupation_occupations.iloc[0]['altLabels'].replace('\n',' , ').lower()
        if len(alternatives)>0:
            title  = title+' , '+alternatives
        description = str(row['description']).replace('\n',' ').lower()
        _id = self.conceptUriMapRev[row['conceptUri']]
        return _id, f"title: {title} \n description: {description}"

In [12]:
from collections import Counter
 
def most_frequent(List):
    occurence_count = Counter(List)
    return occurence_count.most_common(1)[0][0]
test_df = test_df[~test_df['conceptUri'].isna()]
stack_descripts = []
unsimilar_tests = []
for _,row in test_df.iterrows():
    if row['description'] not in stack_descripts:
        conceptUri = most_frequent(test_df.loc[test_df['description']==row['description']]['conceptUri'])
        stack_descripts.append(row['description'])
        unsimilar_tests.append({'conceptUri':conceptUri, 'description':row['description'],'preferredLabel':row['title'],'lang':row['lang']})
test_df = pd.DataFrame(unsimilar_tests)
len(test_df)

5083

In [13]:
test_df = test_df[test_df['conceptUri'].isin(lang_occupation_occupations['conceptUri'].unique())]

In [18]:
test_df['text'] = [row['preferredLabel']+'\n'+row['description'] for idx, row in test_df.iterrows()]
lang_occupation_occupations['text']= [str(row['preferredLabel'])+"\n"+str(row['description'])+', '+ str(row['altLabels']).replace('\n',', ')  for idx, row in lang_occupation_occupations.iterrows()]

In [19]:
lang_occupation_occupations['class'] = list(range(0,len(lang_occupation_occupations)))

In [22]:
batch_data = []
for ids, item in test_df.iterrows():
    batch_data.append({'title': item['preferredLabel'],'description':item['description'],'lang':item['lang']})

In [23]:
labels = []
for ids, item in test_df.iterrows():
    conceptUri = item['conceptUri']
    class_id = lang_occupation_occupations[lang_occupation_occupations['conceptUri']==conceptUri].iloc[0]['class']
    labels.append(class_id)
y_true = np.array(labels)

In [24]:
def get_sentece(title, description):
    return f"{title} {description}"


In [44]:
# suppose that esco occupation and skill in different languages have same meaning so the embedding of them are equals.
# so by this assumption we create dataset and train model and make pretrain embeder more powerful

In [25]:
sentences = []
esco_occupations = pd.read_csv(base_data_path+'all_occupations.csv')
en_esco_occupations = esco_occupations.loc[esco_occupations['lang']=='en'].copy()
en_esco_occupations = en_esco_occupations.rename(columns={'description':'en_description'})
esco_skills = pd.read_csv(base_data_path+'all_skills.csv')
en_esco_skills = esco_skills.loc[esco_skills['lang']=='en'].copy()
en_esco_skills = en_esco_skills.rename(columns={'description':'en_description'})

for lang in langs:
    other_lang_esco_occupations = esco_occupations.loc[esco_occupations['lang']==lang].copy()
    other_lang_esco_occupations = other_lang_esco_occupations.rename(columns={'description':'other_description'})
    other_lang_esco_occupations = pd.merge(en_esco_occupations, other_lang_esco_occupations, on='conceptUri')
    other_lang_esco_skills = esco_skills.loc[esco_skills['lang']==lang].copy()
    other_lang_esco_skills = other_lang_esco_skills.rename(columns={'description':'other_description'})
    other_lang_esco_skills = pd.merge(en_esco_skills, other_lang_esco_skills, on='conceptUri')
    for idx, row in other_lang_esco_occupations.iterrows():
        sentences.append([get_sentece(row['preferredLabel_x'], row['en_description']),get_sentece(row['preferredLabel_y'], row['other_description'])])
    for idx, row in other_lang_esco_skills.iterrows():
        sentences.append([get_sentece(row['preferredLabel_x'], row['en_description']),get_sentece(row['preferredLabel_y'], row['other_description'])])

In [26]:
len(sentences)

107951

In [27]:
# train model
device = 'cuda:1'
for num_epochs in [40]:
    teacher_model_name = 'paraphrase-multilingual-mpnet-base-v2'   #Our monolingual teacher model, we want to convert to multiple languages
    student_model_name = 'paraphrase-multilingual-mpnet-base-v2'       #Multilingual base model we use to imitate the teacher model


    max_seq_length = 256                #Student model max. lengths for inputs (number of word pieces)
    train_batch_size = 128               #Batch size for training
    inference_batch_size = 128           #Batch size at inference
    max_sentences_per_language = 500000 #Maximum number of  parallel sentences for training
    train_max_sentence_length = 256     #Maximum length (characters) for parallel training sentences

    num_warmup_steps = 1000             #Warumup steps

    num_evaluation_steps = 1000          #Evaluate performance after every xxxx steps
    dev_sentences = 1000                 #Number of parallel sentences to be used for development


    # Define the language codes you would like to extend the model to
    source_languages = set(['en'])                      # Our teacher model accepts English (en) sentences
    target_languages = set(['en','nl','de'])    # We want to extend the model to these new languages. For language codes, see the header of the train file


    output_path = "output/make-multilingual-"+"-".join(sorted(list(source_languages))+sorted(list(target_languages)))+"-"+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")


    logger.info("Load teacher model")
    teacher_model = SentenceTransformer(teacher_model_name,device=device)


    logger.info("Create student model from scratch")

    student_model = SentenceTransformer(student_model_name,device=device)


    ###### Read Parallel Sentences Dataset ######
    train_data = ParallelSentencesDataset(student_model=student_model, teacher_model=teacher_model, batch_size=inference_batch_size, use_embedding_cache=True)
    train_data.add_dataset(parallel_sentences=sentences, max_sentences=max_sentences_per_language, max_sentence_length=train_max_sentence_length)

    train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size)
    train_loss = losses.MSELoss(model=student_model)




    # Train the model
    student_model.fit(train_objectives=[(train_dataloader, train_loss)],
              epochs=num_epochs,
              warmup_steps=num_warmup_steps,
              output_path=output_path,
              save_best_model=True,
              optimizer_params= {'lr': 2e-5, 'eps': 1e-6, 'correct_bias': False}
              )



2022-04-30 22:23:09 - Load teacher model
2022-04-30 22:23:09 - Load pretrained SentenceTransformer: paraphrase-multilingual-mpnet-base-v2
2022-04-30 22:23:20 - Create student model from scratch
2022-04-30 22:23:20 - Load pretrained SentenceTransformer: paraphrase-multilingual-mpnet-base-v2




Epoch:   0%|          | 0/40 [00:00<?, ?it/s]

Iteration:   0%|          | 0/640 [00:00<?, ?it/s]

  labels = torch.tensor(labels).to(self._target_device)


Iteration:   0%|          | 0/640 [00:00<?, ?it/s]

Iteration:   0%|          | 0/640 [00:00<?, ?it/s]

Iteration:   0%|          | 0/640 [00:00<?, ?it/s]

Iteration:   0%|          | 0/640 [00:00<?, ?it/s]

Iteration:   0%|          | 0/640 [00:00<?, ?it/s]

Iteration:   0%|          | 0/640 [00:00<?, ?it/s]

Iteration:   0%|          | 0/640 [00:00<?, ?it/s]

Iteration:   0%|          | 0/640 [00:00<?, ?it/s]

Iteration:   0%|          | 0/640 [00:00<?, ?it/s]

Iteration:   0%|          | 0/640 [00:00<?, ?it/s]

Iteration:   0%|          | 0/640 [00:00<?, ?it/s]

Iteration:   0%|          | 0/640 [00:00<?, ?it/s]

Iteration:   0%|          | 0/640 [00:00<?, ?it/s]

Iteration:   0%|          | 0/640 [00:00<?, ?it/s]

Iteration:   0%|          | 0/640 [00:00<?, ?it/s]

Iteration:   0%|          | 0/640 [00:00<?, ?it/s]

Iteration:   0%|          | 0/640 [00:00<?, ?it/s]

Iteration:   0%|          | 0/640 [00:00<?, ?it/s]

Iteration:   0%|          | 0/640 [00:00<?, ?it/s]

Iteration:   0%|          | 0/640 [00:00<?, ?it/s]

Iteration:   0%|          | 0/640 [00:00<?, ?it/s]

Iteration:   0%|          | 0/640 [00:00<?, ?it/s]

Iteration:   0%|          | 0/640 [00:00<?, ?it/s]

Iteration:   0%|          | 0/640 [00:00<?, ?it/s]

Iteration:   0%|          | 0/640 [00:00<?, ?it/s]

Iteration:   0%|          | 0/640 [00:00<?, ?it/s]

Iteration:   0%|          | 0/640 [00:00<?, ?it/s]

Iteration:   0%|          | 0/640 [00:00<?, ?it/s]

Iteration:   0%|          | 0/640 [00:00<?, ?it/s]

Iteration:   0%|          | 0/640 [00:00<?, ?it/s]

Iteration:   0%|          | 0/640 [00:00<?, ?it/s]

Iteration:   0%|          | 0/640 [00:00<?, ?it/s]

Iteration:   0%|          | 0/640 [00:00<?, ?it/s]

Iteration:   0%|          | 0/640 [00:00<?, ?it/s]

Iteration:   0%|          | 0/640 [00:00<?, ?it/s]

Iteration:   0%|          | 0/640 [00:00<?, ?it/s]

Iteration:   0%|          | 0/640 [00:00<?, ?it/s]

Iteration:   0%|          | 0/640 [00:00<?, ?it/s]

Iteration:   0%|          | 0/640 [00:00<?, ?it/s]

2022-05-01 00:09:27 - Save model to output/make-multilingual-en-de-en-nl-2022-04-30_22-23-09


In [None]:
# for good embeder one classifier layer is enough

In [48]:
from torch.utils.data import TensorDataset
import torch

class TinyModel(torch.nn.Module):

    def __init__(self, embed_dim, num_class):
        super(TinyModel, self).__init__()
        self.fc1 = torch.nn.Linear(embed_dim, num_class)
        # self.fc1 = nn.Linear(embed_dim, hidden_size)
        # self.fc2 = nn.Linear(hidden_size, num_class)
        self.init_weights()
        
    def init_weights(self):
        initrange = 0.5
        self.fc1.weight.data.uniform_(-initrange, initrange)
        self.fc1.bias.data.zero_()
        # self.fc2.weight.data.uniform_(-initrange, initrange)
        # self.fc2.bias.data.zero_()

    def forward(self, embedded):
        x = self.fc1(embedded)
        # x = F.relu(x)
        # x = self.fc2(x)
        return x

class EmbededDataset(torch.utils.data.Dataset):
    def __init__(self, embeds, labels):
        self.embeds = embeds
        self.labels = labels

    def __getitem__(self, idx):
        return torch.tensor(self.embeds[idx]), torch.tensor(self.labels[idx])

    def __len__(self):
        return len(self.labels)


In [49]:
def train(model, train_loader, test_loader, num_class ,device='cuda:0',learning_rate=4e-4,epoch_num=30,best_ouput='ouput'):
    model.to(device)
    model.train()
    optim = Adam(model.parameters(), lr=learning_rate)
    criterion = torch.nn.CrossEntropyLoss()
    best_model = None
    best_topk = 0
    for epoch in range(epoch_num):
        model.train()
        train_loss = 0
        for embeds, class_idx in train_loader:
            optim.zero_grad()    
            embeds = embeds.to(device)
            class_idx = class_idx.to(device)
            outputs = model(embeds)
            loss = criterion(outputs, class_idx)
            train_loss += outputs.shape[0] *loss.item()
            loss.backward()
            optim.step()

        all_outputs = []
        model.eval()
        topk = 0
        k = 5
        metric = load_metric("accuracy")
        with torch.no_grad():
            for embeds, class_idx in test_loader:
                embeds = embeds.to(device)
                class_idx = class_idx.to(device)
                outputs = model(embeds)

                # logits = outputs.logits
                predictions = torch.argmax(outputs, dim=-1)
                y_scores = outputs.cpu().detach().numpy()
                y_true = class_idx.cpu().detach().numpy()
                topk += top_k_accuracy_score(y_true,y_scores,k=k,labels=np.arange(num_class ))
                metric.add_batch(predictions=predictions, references=class_idx)
        topk_res = topk/len(test_loader)
        print(f'top {k} accuracy={topk_res}',metric.compute(), f'epoch ={epoch} , train loss = {train_loss/len(train_loader)}')
        if topk_res>best_topk:
            best_topk = topk_res
            st = model.state_dict()
    torch.save(st, best_ouput)


In [28]:
test_df.head(1)

Unnamed: 0.1,Unnamed: 0,project_name,description,title,occupation_title,occupation_id,task_id,iscoGroup,conceptUri,isco_preferredLabel,altLabels,isco_description,lang,text
0,0,GBR,Are you ready to find a new direction where yo...,Entry Level Sales Assistant,sales assistant,1864,61dde40c527776b760a85fa0,5223.0,http://data.europa.eu/esco/occupation/9ba74e8a...,sales assistant,senior sales assistant\nsupermarket sales assi...,Sales assistants represent the direct contact ...,en,Entry Level Sales Assistant\nAre you ready to ...


In [39]:
# because different language have same embeds so we can use label data in different laguage. 
# by defining target language labeled data in other laguage will be train data.
# we also use esco data as training data

In [30]:
esco_oc = esco_occupations.copy()
esco_oc['text'] = str(esco_oc['preferredLabel'] + "\n"+ esco_oc['description'])  

In [31]:
esco_oc['type']='esco'
all_labeled=test_df.copy()
all_labeled['type'] = 'annotated'
all_labeled = all_labeled.rename(columns={'title':'preferredLabel'})

In [32]:
all_labeled.head(1)

Unnamed: 0,conceptUri,description,preferredLabel,lang,text,type
0,http://data.europa.eu/esco/occupation/9ba74e8a...,Are you ready to find a new direction where yo...,Entry Level Sales Assistant,en,Entry Level Sales Assistant\nAre you ready to ...,annotated


In [33]:
concated_data_df = pd.concat([all_labeled[['conceptUri','text','lang','preferredLabel','type']], esco_oc[['conceptUri','text','lang','preferredLabel','type']]], axis=0)

In [34]:
len(concated_data_df)

32951

In [35]:
conceptUries = all_occupations[all_occupations['lang']=='en']['conceptUri'].tolist()
concated_data_df = concated_data_df[concated_data_df['conceptUri'].isin(conceptUries)]

In [36]:
concated_data_df.head(1)

Unnamed: 0,conceptUri,text,lang,preferredLabel,type
0,http://data.europa.eu/esco/occupation/9ba74e8a...,Entry Level Sales Assistant\nAre you ready to ...,en,Entry Level Sales Assistant,annotated


In [40]:
concated_data_df['class'],class_list = pd.factorize(concated_data_df['conceptUri'])


In [41]:
concated_data_df.head(1)

Unnamed: 0,conceptUri,text,lang,preferredLabel,type,class
0,http://data.europa.eu/esco/occupation/9ba74e8a...,Entry Level Sales Assistant\nAre you ready to ...,en,Entry Level Sales Assistant,annotated,0


In [42]:
import pickle
with open('output/multi/classes.pickle', 'wb') as f:
    pickle.dump(class_list,f,protocol=pickle.HIGHEST_PROTOCOL)


In [43]:
# train one layer classifer

In [46]:
def make_train_test(embeder, data_df,lang, batch_size = 64):
    test_df = data_df[(data_df['lang']==lang) & (data_df['type']=='annotated')]
    train_df = data_df[(data_df['lang']!=lang) | (data_df['type']=='esco')]
    
    train_embeds = embeder.encode(train_df['text'].tolist())
    test_embeds = embeder.encode(test_df['text'].tolist())
    
    train_dataset = EmbededDataset(train_embeds, train_df['class'].tolist())
    test_dataset = EmbededDataset(test_embeds, test_df['class'].tolist())
    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)
    return train_loader, test_loader

In [52]:
# from torch.utils.data import DataLoader
# num_class = len(concated_data_df['conceptUri'].unique())
# for lang in ['de','nl','pt']:
#     print(lang)
#     embeder= student_model
#     train_loader, test_loader = make_train_test(embeder, concated_data_df,lang, batch_size = 128)
#     one_layer_model = TinyModel(embed_dim=768,num_class=num_class)
#     train(one_layer_model, train_loader, test_loader, num_class ,device='cuda:1',learning_rate=1e-3,epoch_num=100, lang, )

In [78]:
len(concated_data_df)

34412

In [None]:
from torch.utils.data import DataLoader
num_class = len(concated_data_df['conceptUri'].unique())
embeder= SentenceTransformer('output/make-multilingual-en-de-en-nl-2022-04-30_22-23-09', device="cuda:0")
for lang in ['de','nl','pt']:

    print(lang)
    
    train_loader, test_loader = make_train_test(embeder, concated_data_df,lang, batch_size = 128)
    one_layer_model = TinyModel(embed_dim=768,num_class=num_class)
    train(one_layer_model, train_loader, test_loader, num_class ,device='cuda:1',learning_rate=1e-3,epoch_num=100,best_ouput=f'output/multi/{lang}.pt' )

2022-05-01 20:08:00 - Load pretrained SentenceTransformer: output/make-multilingual-en-de-en-nl-2022-04-30_22-23-09
de


Batches:   0%|          | 0/886 [00:00<?, ?it/s]

Batches:   0%|          | 0/9 [00:00<?, ?it/s]

top 5 accuracy=0.028645833333333332 {'accuracy': 0.0037313432835820895} epoch =0 , train loss = 1037.3313288731617
top 5 accuracy=0.06510416666666667 {'accuracy': 0.033582089552238806} epoch =1 , train loss = 991.1427148011354
top 5 accuracy=0.16493055555555555 {'accuracy': 0.06343283582089553} epoch =2 , train loss = 974.3292823611079
top 5 accuracy=0.2743055555555556 {'accuracy': 0.10074626865671642} epoch =3 , train loss = 962.8129541637661
top 5 accuracy=0.24479166666666666 {'accuracy': 0.10820895522388059} epoch =4 , train loss = 954.2368746748915
top 5 accuracy=0.30034722222222227 {'accuracy': 0.12686567164179105} epoch =5 , train loss = 947.0885830655828
top 5 accuracy=0.23524305555555555 {'accuracy': 0.12686567164179105} epoch =6 , train loss = 941.2534219810555
top 5 accuracy=0.3420138888888889 {'accuracy': 0.11940298507462686} epoch =7 , train loss = 936.2227050334484
top 5 accuracy=0.3072916666666667 {'accuracy': 0.12686567164179105} epoch =8 , train loss = 931.5806836609368

Batches:   0%|          | 0/886 [00:00<?, ?it/s]

Batches:   0%|          | 0/9 [00:00<?, ?it/s]

top 5 accuracy=0.036458333333333336 {'accuracy': 0.0035842293906810036} epoch =0 , train loss = 1037.5025115163476
top 5 accuracy=0.09669384057971014 {'accuracy': 0.025089605734767026} epoch =1 , train loss = 991.4817765059772
top 5 accuracy=0.15692934782608695 {'accuracy': 0.06093189964157706} epoch =2 , train loss = 974.797504901886
top 5 accuracy=0.17368659420289856 {'accuracy': 0.06451612903225806} epoch =3 , train loss = 963.2333760562244
top 5 accuracy=0.1867074275362319 {'accuracy': 0.07168458781362007} epoch =4 , train loss = 954.5487020123112
top 5 accuracy=0.20380434782608695 {'accuracy': 0.06451612903225806} epoch =5 , train loss = 947.3725772870554
top 5 accuracy=0.23018568840579712 {'accuracy': 0.07168458781362007} epoch =6 , train loss = 941.5819428718842
top 5 accuracy=0.21161684782608695 {'accuracy': 0.08243727598566308} epoch =7 , train loss = 936.3618846949156
top 5 accuracy=0.24581068840579712 {'accuracy': 0.0967741935483871} epoch =8 , train loss = 931.9071355510403

Batches:   0%|          | 0/849 [00:00<?, ?it/s]

Batches:   0%|          | 0/46 [00:00<?, ?it/s]

top 5 accuracy=0.04535590277777778 {'accuracy': 0.007617728531855956} epoch =0 , train loss = 1039.9057916094998
top 5 accuracy=0.07045717592592593 {'accuracy': 0.01592797783933518} epoch =1 , train loss = 999.8051183861746
top 5 accuracy=0.10894097222222222 {'accuracy': 0.03462603878116344} epoch =2 , train loss = 986.6255296787746
top 5 accuracy=0.14539930555555555 {'accuracy': 0.054016620498614956} epoch =3 , train loss = 977.7356394646873
top 5 accuracy=0.16319444444444445 {'accuracy': 0.06578947368421052} epoch =4 , train loss = 970.7923948574514
top 5 accuracy=0.1831597222222222 {'accuracy': 0.0685595567867036} epoch =5 , train loss = 965.164788196904
top 5 accuracy=0.19661458333333334 {'accuracy': 0.07271468144044321} epoch =6 , train loss = 960.2531921897136
top 5 accuracy=0.20500578703703706 {'accuracy': 0.07548476454293629} epoch =7 , train loss = 956.2823106291148
top 5 accuracy=0.22359664351851852 {'accuracy': 0.07894736842105263} epoch =8 , train loss = 952.5803956224325
t

In [None]:
# device='cuda:0'

# texts = [item['title'].lower()+' '+item['description'].lower() for item in batch_data]
# embeds = student_model.encode(texts)
# lang_occupation_occupations_embeds = student_model.encode(lang_occupation_occupations['text'].tolist())
# torch_embeds = torch.from_numpy(embeds).to(device)
# torch_classes = torch.from_numpy(np.array(lang_occupation_occupations_embeds)).to(device).T
# # print(torch_embeds.size())
# # print(torch_classes.size())
# y_score = torch.mm(torch_embeds,torch_classes)
# y_score = y_score.cpu().data.numpy()
# # print(y_score.shape)
# # print(y_true.shape)
# # print(torch.topk(dists, 5,dim=-1).indices) 
# topk_acc = top_k_accuracy_score(y_true, y_score, k=5, normalize=True, labels=list(range(len(lang_occupation_occupations))))
# logger.info(f"topk accuracy = {topk_acc}")