In [2]:
# !pip install transformers
# !pip install seqeval
# !pip install tensorboardx
# !pip install simpletransformers

In [1]:
import pandas as pd
# from simpletransformers.classification import ClassificationModel
from sklearn.metrics import f1_score, accuracy_score

def f1_multiclass(labels, preds):
    """Return the macro-averaged F1 score of ``preds`` against ``labels``.

    Thin wrapper around ``sklearn.metrics.f1_score`` that fixes
    ``average='macro'`` so it can be handed to simpletransformers as a
    two-argument metric callable.

    Args:
        labels: Ground-truth class labels.
        preds: Predicted class labels, aligned with ``labels``.

    Returns:
        The value returned by ``f1_score(labels, preds, average='macro')``.
    """
    macro_f1 = f1_score(labels, preds, average='macro')
    return macro_f1

In [2]:
# Load the train/validation/test splits and format them for simpletransformers.

def load_split(name, data_dir="data/KE"):
    """Read one dataset split and keep only the columns used for training.

    Args:
        name: File stem of the split; ``"train"`` reads
            ``<data_dir>/train.json``.
        data_dir: Directory that contains the split files.

    Returns:
        A DataFrame holding only the "problem" and "label" columns of the
        file, renamed to "text" and "labels" respectively.
    """
    split_df = pd.read_json(f"{data_dir}/{name}.json", orient="records")
    return split_df.loc[:, ["problem", "label"]].rename(
        columns={"problem": "text", "label": "labels"}
    )


# One call per split instead of three copy-pasted read/select/rename stanzas.
train_df = load_split("train")
val_df = load_split("validation")
test_df = load_split("test")

In [3]:
# Preview the reformatted training split (columns: "text", "labels").
train_df

Unnamed: 0,text,labels
0,enrich legacy photographs by predicting color ...,9
1,classification of compressed images,5
2,"learn patterns in the data, allowing for accur...",7
3,time series forecasting,3
4,extracting useful representation for heart bio...,5
...,...,...
32129,both the pregnant women and the fetus,12
32130,determine the gender and age using left-hand r...,5
32131,each question type (expected answer characteri...,7
32132,"dense feature vectors encoding geographic, tem...",2


In [4]:
# Calculate per-class loss weights that are inversely proportional to class
# frequency: weight[c] = (n_samples / num_classes) / count[c].
num_classes = 15

# Count each label once; the counts are reused for every class below.
label_counts = train_df["labels"].value_counts()

# Fail loudly on labels that do not fit the weight list. Without this check a
# negative label would silently overwrite another class's slot (list indexing
# wraps around) and a label >= num_classes would surface as a bare IndexError.
assert label_counts.index.isin(list(range(num_classes))).all(), (
    "labels must be integers in [0, num_classes)"
)

# Number of samples each class would have if the data were perfectly balanced.
balanced_count = len(train_df) / num_classes
weights_dict = {label: balanced_count / count for label, count in label_counts.items()}

# Dense list indexed by class id; classes absent from the training data keep
# weight 0.
weights = [0] * num_classes
for label, class_weight in weights_dict.items():
    weights[label] = class_weight
weights

[0.7222746684648236,
 1.9872603586889304,
 1.430084557187361,
 2.3697640117994103,
 1.0722055388722056,
 0.18073624117663603,
 4.548336871903751,
 0.4049653434152489,
 1.3289495450785775,
 1.8032547699214367,
 1.4056867891513563,
 6.531300813008131,
 6.591589743589744,
 2.793046501521078,
 6.452610441767069]

In [None]:
# define model parameters
# The import of ClassificationModel in the notebook's import cell is commented
# out, so it is imported here; without it this cell raises NameError when the
# notebook is run top to bottom on a fresh kernel.
from simpletransformers.classification import ClassificationModel

# Training/evaluation options forwarded to simpletransformers. The class
# weights are NOT part of this dict: they are passed through the
# constructor's `weight` argument below.
# NOTE(review): no random seed is configured here, so runs are presumably not
# reproducible — confirm whether that is intended.
args = {
    "train_batch_size": 32,
    "num_train_epochs": 17,
    "learning_rate": 1e-4,
    "save_model_every_epoch": False,
    "save_eval_checkpoints": False,
    "overwrite_output_dir": True,
    "reprocess_input_data": False,
    "evaluate_during_training": True,
    "eval_batch_size": 32,
}

# Create a ClassificationModel on top of the SciBERT (uncased) checkpoint,
# with one output per class and the class weights computed earlier.
model = ClassificationModel(
    "bert",
    "allenai/scibert_scivocab_uncased",
    weight=weights,
    num_labels=num_classes,
    args=args,
)

In [None]:
# Fine-tune on the training split, evaluating on the validation split.
# The keyword names ("f1", "acc") are passed to train_model alongside the
# metric callables.
train_metrics = {"f1": f1_multiclass, "acc": accuracy_score}
model.train_model(train_df, eval_df=val_df, **train_metrics)

In [None]:
# Evaluate the model on the held-out test split with the same extra metrics
# (macro F1 and accuracy) that were used during training.
eval_metrics = {"f1": f1_multiclass, "acc": accuracy_score}
eval_output = model.eval_model(test_df, **eval_metrics)

# eval_model returns three values; unpack them under the original names.
result, model_outputs, wrong_predictions = eval_output
print(result)

In [None]:
# Get predictions for every test example.
# predict() is given a plain Python list of the "text" column's values.
test_texts = test_df["text"].tolist()
prediction_output = model.predict(test_texts)
preds, raw = prediction_output