In [1]:
import pandas as pd
import pickle as pkl

from data_processing import pers_labels
from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import LabelEncoder

In [2]:
PWD = os.environ["WORKSPACE_PATH"]

data_type = "mbpt_0_top_lbl"

In [3]:
dataset = pd.read_json(f"{PWD}/data/model_datasets/{data_type}.jsonl", lines=True)

len(dataset)

1932

: 

In [5]:
dataset.head(5)

Unnamed: 0,text,label,char_id,movie_id
0,Please categorize bianca.\n\nbianca: C'esc ma ...,E,u0,m0
1,Please categorize kat.\n\nkat: That's not\nbia...,I,u5,m0
2,"Please categorize walter.\n\nbianca: Daddy, I ...",I,u11,m0
3,Please categorize patrick.\n\npatrick: Always ...,I,u9,m0
4,Please categorize cameron.\n\nmichael: What ma...,I,u2,m0


In [6]:
label_enc_path = f"{PWD}/data/label_encoders/{data_type}.pkl"

label_enc = LabelEncoder()

dataset["label"] = label_enc.fit_transform(dataset["label"])

with open(label_enc_path, "wb+") as fp:
    pkl.dump(label_enc, fp)

In [7]:
splitter1 = GroupShuffleSplit(test_size=.25, random_state=12)

non_test_idx, test_idx = next(splitter1.split(X=dataset[["text"]], y=dataset["label"], groups=dataset["movie_id"]))

test_df = dataset[["text", "label"]].iloc[test_idx]
non_test_df = dataset.iloc[non_test_idx]

splitter2 = GroupShuffleSplit(test_size=.2, random_state=12)

train_idx, valid_idx = next(splitter2.split(X=non_test_df, y=non_test_df["label"], groups=non_test_df["movie_id"]))

to_split_df = non_test_df[["text", "label"]]

train_df = to_split_df.iloc[train_idx]
valid_df = to_split_df.iloc[valid_idx]

In [8]:
train_df.shape

(1146, 2)

In [9]:
valid_df.shape

(279, 2)

In [10]:
test_df.shape

(507, 2)

## time to train!

In [8]:
import evaluate
import numpy as np
import torch

from datasets import Dataset
from transformers import EarlyStoppingCallback, DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer

In [9]:
device = "mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu"

config = DistilBertConfig(num_labels=2)
tkr = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", config=config).to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
train = Dataset.from_pandas(train_df, split="train").with_format("torch")
valid = Dataset.from_pandas(valid_df, split="valid").with_format("torch")

print(train.shape)
print(valid.shape)

(516, 3)
(132, 3)


  if _pandas_api.is_sparse(col):


In [11]:
tokenize = lambda data: tkr(data["text"], padding="max_length", truncation=True, return_tensors="pt")
train_tk = train.map(tokenize, batched=True)
valid_tk = valid.map(tokenize, batched=True)

Map:   0%|          | 0/516 [00:00<?, ? examples/s]

Map:   0%|          | 0/132 [00:00<?, ? examples/s]

In [12]:
metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels, average="macro")

In [13]:
from datetime import datetime
from transformers import TrainingArguments, Trainer

model_path = f"{PWD}/models/{data_type}/"

is_mbpt = pers_labels.MBPT.lower() in data_type

training_args = TrainingArguments(output_dir=model_path,
                                  evaluation_strategy="steps",
                                  logging_strategy="steps",
                                  num_train_epochs=3 if is_mbpt else 5,
                                  save_total_limit = 5,
                                  eval_steps=50,
                                  metric_for_best_model="f1",
                                  load_best_model_at_end=True)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tk,
    eval_dataset=valid_tk,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=.005)]
)

In [14]:
trainer.train()

trainer.save_model(model_path)

Step,Training Loss,Validation Loss,F1
50,No log,0.702151,0.352941
100,No log,0.819791,0.466127
150,No log,0.844352,0.574194
200,No log,1.126726,0.590062
250,No log,1.307918,0.625974
300,No log,1.402506,0.635023


## evaluation time!!!

In [11]:
with open(label_enc_path, "rb+") as fp:
    label_enc = pkl.load(fp)

In [15]:
import evaluate
import numpy as np
import torch

from datasets import Dataset
from transformers import EarlyStoppingCallback, DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer

device = "mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu"

config = DistilBertConfig(num_labels=2)
tkr = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", config=config).to(device)
# model = DistilBertForSequenceClassification.from_pretrained(model_path, config=config).to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
test = Dataset.from_pandas(test_df, split="test").with_format("torch")

test.shape

(507, 3)

In [17]:
tokenize = lambda data: tkr(data["text"], padding="max_length", truncation=True, return_tensors="pt")
test_tk = test.map(tokenize, batched=True)

Map:   0%|          | 0/507 [00:00<?, ? examples/s]

In [19]:
from transformers import Trainer

trainer = Trainer(
    model=model
)

In [20]:
output = trainer.predict(test_tk)
preds = output.predictions
labels = output.label_ids



## results!

overall metrics

In [21]:
args = {"predictions": np.argmax(preds, axis=1),  "references": test["label"], "average": "macro"}
f1 = evaluate.load("f1")
acc = evaluate.load("recall")
prec = evaluate.load("precision")

scores = {}
scores.update(f1.compute(**args))
scores.update(acc.compute(**args))
scores.update(prec.compute(**args))

print(scores)

{'f1': 0.41151471710328563, 'recall': 0.4837241616186848, 'precision': 0.4641826427540713}


per class metrics

In [22]:
args = {"predictions": np.argmax(preds, axis=1),  "references": test["label"], "average": None}
scores = {}
scores.update(f1.compute(**args))
scores.update(acc.compute(**args))
scores.update(prec.compute(**args))


class_scores = np.concatenate([val.reshape(-1, 1) for val in scores.values()], axis=1)

class_names = np.array(label_enc.inverse_transform(range(2))).reshape(-1, 1)
per_class_df = pd.DataFrame(np.concatenate([class_names, class_scores], axis=1), columns=["label", *scores.keys()])

per_class_df.style.hide(axis="index")

label,f1,recall,precision
E,0.64598,0.854478,0.519274
I,0.177049,0.112971,0.409091
