# Training Baseline Classifiers


In [1]:
!pip install setfit==0.6.0
!pip install openpyxl
!mkdir models

In [8]:
# Imports
import os
import pickle
from getpass import getpass
from importlib import import_module

import joblib
import pandas as pd
from datasets import Dataset
from sentence_transformers.losses import CosineSimilarityLoss
from setfit import SetFitModel, SetFitTrainer
from setfit import SetFitTrainer
from setfit import sample_dataset
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from tqdm.auto import tqdm

## Workaround for dashes in name
nlbse_statistics = import_module('nlbse_statistics')

tqdm.pandas()


<a id='data_collection'></a>
## Data collection
We first load the data. 
For each language we create a separate dataset for each category.

In [9]:
# Build one train/test Dataset pair for every (language, category) combination.
langs = ['java', 'python', 'pharo']
lan_cats = []  # '<language>_<category>' keys, in the order later cells iterate over them
datasets = {}  # key -> {'train_data': Dataset, 'test_data': Dataset}
for lan in langs: # for each language
    df = pd.read_csv(f'./{lan}/input/{lan}.csv')
    # Model input text: the comment sentence and its class joined by ' | '.
    df['combo'] = df[['comment_sentence', 'class']].agg(' | '.join, axis=1)
    df['label'] = df.instance_type
    # Sort the unique categories once: the iteration order of a set of strings
    # changes between interpreter runs (hash randomisation), which made the
    # order of lan_cats differ from one kernel restart to the next.
    for cat in sorted(set(df.category)): # for each category
        key = f'{lan}_{cat}'
        filtered = df[df.category == cat]
        # Rows with partition == 0 form the training split, partition == 1 the test split.
        train_data = Dataset.from_pandas(filtered[filtered.partition == 0])
        test_data = Dataset.from_pandas(filtered[filtered.partition == 1])
        datasets[key] = {'train_data': train_data, 'test_data': test_data}
        lan_cats.append(key)

<a id='load_model'></a>

## Load model

In [16]:
# Note: We tweaked the hyperparams because of the dataset size of Java
# num_iterations is very sensitive, be careful when tweaking
hyperparameters = {
    'learning_rate': 1.7094555110821448e-05,
    'num_epochs': 3,
    'batch_size': 8,
    'seed': 11,
    'num_iterations': 10,
    # 'max_iter' and 'solver' are the keys model_init reads for its head_params.
    'max_iter': 241,
    'solver': 'lbfgs',
}
    
def model_init(params):
    """Create a SetFitModel from the all-mpnet-base-v2 sentence transformer.

    Args:
        params: Optional mapping of hyperparameters. Only "max_iter" and
            "solver" are read; a falsy value (e.g. None) is treated as empty.

    Returns:
        Whatever SetFitModel.from_pretrained returns when called with the
        checkpoint name and a ``head_params`` dict holding "max_iter"
        (default 100) and "solver" (default "liblinear").
    """
    source = params if params else {}
    head_params = {
        "max_iter": source.get("max_iter", 100),
        "solver": source.get("solver", "liblinear"),
    }
    return SetFitModel.from_pretrained(
        "sentence-transformers/all-mpnet-base-v2",
        head_params=head_params,
    )

def load_trainer(train_data, test_data):
    """Create a fresh SetFitTrainer with the notebook's hyperparameters applied.

    Args:
        train_data: Dataset passed to the trainer as ``train_dataset``.
        test_data: Dataset passed to the trainer as ``eval_dataset``.

    Returns:
        A new SetFitTrainer on which ``apply_hyperparameters`` has been called
        with the module-level ``hyperparameters`` and ``final_model=True``.
    """
    trainer_kwargs = {
        "train_dataset": train_data,
        "eval_dataset": test_data,
        "loss_class": CosineSimilarityLoss,
        "model_init": model_init,
        # Map the notebook's "combo" column onto the "text" column name.
        "column_mapping": {"combo": "text", "label": "label"},
    }
    fresh_trainer = SetFitTrainer(**trainer_kwargs)
    fresh_trainer.apply_hyperparameters(hyperparameters, final_model=True)
    return fresh_trainer

<a id='train_model'></a>


## Train Models
Train and save a model for each of the categories

This will take around 3h per category, so around 2 days in total.

In [None]:
# Fit one trainer per language/category key and persist it under ./models
for lan_cat in lan_cats:
    print(f'training {lan_cat}')
    splits = datasets[lan_cat]
    trainer = load_trainer(splits['train_data'], splits['test_data'])
    trainer.train()

    # The whole trainer object is serialised; the push-to-hub cell reloads it from this path.
    model_path = f'./models/{lan_cat}_all-mpnet-base-v2.joblib'
    joblib.dump(trainer, model_path)

<a id='eval'></a>

## Evaluation
Next we evaluate each of our trained models on the test set.

In [None]:
# Score each classifier on its test split and write the scores to an Excel sheet
scores = []
for lan_cat in lan_cats:
    # Load from the path the training cell saved to (there is no 'extended_' prefix there).
    trainer = joblib.load(f'./models/{lan_cat}_all-mpnet-base-v2.joblib')
    test_data = datasets[lan_cat]['test_data']
    y_hat = trainer.model(test_data['combo'])
    y = test_data['label']
    # confusion_matrix takes (y_true, y_pred); with that order ravel() yields
    # tn, fp, fn, tp. Passing the predictions first swaps fp and fn, and with
    # them precision and recall.
    _, fp, fn, tp = confusion_matrix(y, y_hat).ravel()
    wf1 = f1_score(y, y_hat, average='weighted')
    precision, recall, f1 = nlbse_statistics.get_precision_recall_f1(tp, fp, fn)
    scores.append({'lan_cat': lan_cat.lower(),'precision': precision,'recall': recall,'f1': f1,'wf1': wf1})

df = pd.DataFrame(scores)
df.sort_values('lan_cat').to_excel('scores.xlsx')
df

<a id='hub'></a>

## Push to hub

Finally we push all of our models to the Hugging Face Hub to make them publicly available.

In [None]:
# Push to hub
# Take the access token from the HF_TOKEN environment variable, or prompt for it,
# so the secret is never stored in the notebook source.
token = os.environ.get('HF_TOKEN') or getpass('Hugging Face access token: ')
repo = 'XXXXXXXXXXXXXXXXXXX'  # placeholder: prefix of every pushed repo id ('<repo>/<name>')
for lan_cat in lan_cats:
    trainer = joblib.load(f'./models/{lan_cat}_all-mpnet-base-v2.joblib')
    # e.g. 'java_Foo' becomes 'java-foo-classifier'
    name = lan_cat.lower().replace('_','-') + '-classifier'
    print(name)
    trainer.push_to_hub(f'{repo}/{name}', use_auth_token=token, private=False)