In [2]:
import json

EMBEDDINGS_PATH = './EMBEDDINGS/'

# One JSON document per line; records are kept in file order.
embeddings_meta = []

with open(f'{EMBEDDINGS_PATH}embeddings_meta.jsonl', 'r', encoding="utf-8") as meta_file:
    embeddings_meta.extend(json.loads(raw_line) for raw_line in meta_file)

In [3]:
import json
import os

# Keys of the results already on disk, as (dataset, lm_model, cls_model) triples.
# Stays empty when results.jsonl does not exist yet.
already_processed = set()

if os.path.exists('results.jsonl'):
    with open('results.jsonl', 'r', encoding="utf-8") as results_file:
        already_processed.update(
            (record['dataset'], record['lm_model'], record['cls_model'])
            for record in map(json.loads, results_file)
        )

In [4]:
# Inspect the (dataset, lm_model, cls_model) triples collected from results.jsonl.
already_processed

{('CSAbstruct', 'all-MiniLM-L6-v2', 'DecisionTreeClassifier'),
 ('CSAbstruct', 'all-MiniLM-L6-v2', 'KNeighborsClassifierSqrt'),
 ('CSAbstruct', 'all-MiniLM-L6-v2', 'MLPClassifier'),
 ('CSAbstruct', 'all-MiniLM-L6-v2', 'RandomForestClassifier'),
 ('CSAbstruct', 'all-MiniLM-L6-v2', 'SVM'),
 ('CSAbstruct', 'scibert_scivocab_uncased', 'DecisionTreeClassifier'),
 ('CSAbstruct', 'scibert_scivocab_uncased', 'KNeighborsClassifierSqrt'),
 ('CSAbstruct', 'scibert_scivocab_uncased', 'MLPClassifier'),
 ('CSAbstruct', 'scibert_scivocab_uncased', 'RandomForestClassifier'),
 ('CSAbstruct', 'scibert_scivocab_uncased', 'SVM'),
 ('CSAbstruct', 'st_minilm_abstruct', 'DecisionTreeClassifier'),
 ('CSAbstruct', 'st_minilm_abstruct', 'KNeighborsClassifierSqrt'),
 ('CSAbstruct', 'st_minilm_abstruct', 'MLPClassifier'),
 ('CSAbstruct', 'st_minilm_abstruct', 'RandomForestClassifier'),
 ('CSAbstruct', 'st_minilm_abstruct', 'SVM'),
 ('CSAbstruct', 'st_scibert_abstruct', 'DecisionTreeClassifier'),
 ('CSAbstruct', '

In [5]:
import numpy as np

# Single fixed seed: applied to NumPy's global RNG here and reused as the
# `random_state` of the classifiers built in the evaluation loop.
seed = 31415
np.random.seed(seed)

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import f1_score
import time

def cls_train_evaluate(clf,
                       embeddings_train,
                       embeddings_test,
                       y_train_true,
                       y_test_true):
    """Fit ``clf`` on the training split and score its predictions on the test split.

    Parameters
    ----------
    clf
        Estimator exposing ``fit(X, y)`` (returning the fitted estimator) and
        ``predict(X)``.
    embeddings_train, embeddings_test
        Feature inputs handed to ``fit`` and ``predict`` respectively.
    y_train_true, y_test_true
        Ground-truth labels for the two splits.

    Returns
    -------
    dict
        ``acc``, ``f1_macro``, ``f1_micro``, ``f1_weighted`` computed on the
        test split, plus ``time``: seconds spent inside ``fit`` only
        (prediction and scoring are not timed).
    """
    # perf_counter is monotonic and high-resolution, so the measured duration
    # cannot be skewed (or go negative) if the system clock is adjusted
    # while a model is training, unlike time.time().
    start = time.perf_counter()
    clf = clf.fit(embeddings_train, y_train_true)
    fit_seconds = time.perf_counter() - start

    y_pred = clf.predict(embeddings_test)

    # Key order is kept stable because it determines the field order of the
    # JSON lines written by the caller.
    return {
        'acc': accuracy_score(y_test_true, y_pred),
        'f1_macro': f1_score(y_test_true, y_pred, average='macro'),
        'f1_micro': f1_score(y_test_true, y_pred, average='micro'),
        'f1_weighted': f1_score(y_test_true, y_pred, average='weighted'),
        'time': fit_seconds,
    }

In [9]:
import pickle
from tqdm.auto import tqdm
import math

# Results produced during this run only (rows already in results.jsonl are skipped).
cls_results = []
pbar_embeddings_meta = tqdm(embeddings_meta)

# Classifier factories keyed by the name stored in results.jsonl, in the order
# they are evaluated. Building estimators lazily lets the loop work out what is
# still pending *before* paying for the pickle load. The argument is the
# neighbour count used by the KNN model; the other factories ignore it.
cls_model_factories = {
    'DecisionTreeClassifier': lambda n_neighbors: tree.DecisionTreeClassifier(random_state=seed),
    'KNeighborsClassifierSqrt': lambda n_neighbors: KNeighborsClassifier(n_neighbors=n_neighbors, weights='distance'),
    'RandomForestClassifier': lambda n_neighbors: RandomForestClassifier(random_state=seed),
    'MLPClassifier': lambda n_neighbors: MLPClassifier(random_state=seed),
    'SVM': lambda n_neighbors: SVC(random_state=seed),
}

for embedding_meta in pbar_embeddings_meta:
    dataset_desc = embedding_meta['dataset_desc']
    model_name = embedding_meta['model_name']
    pbar_embeddings_meta.set_description(dataset_desc + "-" + model_name)

    # Skip the (potentially large) embeddings file entirely when every
    # classifier for this dataset/model pair already has a stored result.
    pending_cls_names = [
        cls_name
        for cls_name in cls_model_factories
        if (dataset_desc, model_name, cls_name) not in already_processed
    ]
    if not pending_cls_names:
        continue

    # NOTE: pickle.load can execute arbitrary code embedded in the file; only
    # load embedding files produced by a trusted pipeline.
    with open(f"{EMBEDDINGS_PATH}{embedding_meta['file_name']}", 'rb') as embeddings_file:
        embeddings = pickle.load(embeddings_file)

    embeddings_train = embeddings['embeddings_train']
    embeddings_test = embeddings['embeddings_test']
    y_train_true = embeddings['y_train_true']
    y_test_true = embeddings['y_test_true']

    # KNN uses k = ceil(sqrt(number of training samples)).
    total_n_neighbors = math.ceil(math.sqrt(len(y_train_true)))

    cls_models_bar = tqdm(pending_cls_names, leave=False)

    for cls_name in cls_models_bar:
        cls_models_bar.set_description(cls_name)

        clf = cls_model_factories[cls_name](total_n_neighbors)

        cls_result = cls_train_evaluate(clf,
                                        embeddings_train,
                                        embeddings_test,
                                        y_train_true,
                                        y_test_true)

        record = {
            'dataset': dataset_desc,
            'lm_model': model_name,
            'cls_model': cls_name,
            **cls_result,
        }

        # Append immediately so an interrupted run keeps everything finished so far.
        with open('results.jsonl', 'a', encoding="utf-8") as results_file:
            results_file.write(json.dumps(record) + "\n")

        # Keep the in-memory skip set in sync with the file: re-running this
        # cell in the same kernel must not retrain and append duplicate rows.
        already_processed.add((dataset_desc, model_name, cls_name))

        cls_results.append(record)

  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

In [10]:
import pandas as pd

# Show only the results computed by the evaluation loop in this session;
# combinations skipped as already processed are not part of `cls_results`.
pd.DataFrame(cls_results)

In [11]:
import json

# Rebuild the result list from disk so it covers every run, not just this session.
cls_results = []

with open('results.jsonl', 'r', encoding="utf-8") as results_file:
    cls_results.extend(json.loads(raw_line) for raw_line in results_file)

In [12]:
import pandas as pd

# Keep the identifying columns plus micro-F1, expressed as a percentage
# rounded to two decimals; the sorted view is displayed, not stored.
df_result = (
    pd.DataFrame(cls_results)[['dataset', 'lm_model', 'cls_model', 'f1_micro']]
    .assign(f1_micro=lambda frame: (frame['f1_micro'] * 100).round(2))
)
df_result.sort_values(by=["dataset", "lm_model", "cls_model"])

Unnamed: 0,dataset,lm_model,cls_model,f1_micro
10,CSAbstruct,all-MiniLM-L6-v2,DecisionTreeClassifier,41.59
11,CSAbstruct,all-MiniLM-L6-v2,KNeighborsClassifierSqrt,57.89
13,CSAbstruct,all-MiniLM-L6-v2,MLPClassifier,57.89
12,CSAbstruct,all-MiniLM-L6-v2,RandomForestClassifier,55.37
14,CSAbstruct,all-MiniLM-L6-v2,SVM,61.23
15,CSAbstruct,scibert_scivocab_uncased,DecisionTreeClassifier,50.19
16,CSAbstruct,scibert_scivocab_uncased,KNeighborsClassifierSqrt,71.68
18,CSAbstruct,scibert_scivocab_uncased,MLPClassifier,68.57
17,CSAbstruct,scibert_scivocab_uncased,RandomForestClassifier,69.61
19,CSAbstruct,scibert_scivocab_uncased,SVM,76.2
