# Import necessary libraries

In [1]:
# !pip install datasets
# !pip install evaluate
# !pip install transformers
# !pip install accelerate -U
# !pip install transformers[torch]

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder, LabelBinarizer

import transformers
import torch
import accelerate
from datasets import Dataset, DatasetDict, load_metric
import evaluate
import tensorflow as tf

from transformers import DistilBertTokenizer,TFDistilBertForSequenceClassification, TFTrainer, TFTrainingArguments
from transformers import InputExample, InputFeatures
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, pipeline

In [3]:
from shutil import copyfile
from google.colab import drive
import os, sys
drive.mount('/content/drive')

copyfile('/content/drive/My Drive/Taxonomy_Classification_AI/labeled_incidents.csv', 'labeled_incidents.csv')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


'labeled_incidents.csv'

Open the labeled dataset of Incidents:

In [4]:
df = pd.read_csv("labeled_incidents.csv")
df = df[:50]
df = df.loc[:, ['labels', 'description']]

df = df.rename(columns={"labels": "label"})

df.head()

Unnamed: 0,label,description
0,SECURITY AND SAFETY,A self-driving public shuttle by Keolis North ...
1,SECURITY AND SAFETY,An Uber autonomous vehicle (AV) in autonomous ...
2,SOCIAL HARM,YouTube’s content filtering and recommendation...
3,SOCIAL HARM,Google Image returns results that under-repres...
4,SOCIAL HARM,Researchers from Boston University and Microso...


Check NA's:

In [5]:
missing_values = df.isnull().sum()
print("Missing values in each column: \n", missing_values)

Missing values in each column: 
 label          0
description    0
dtype: int64


We need to pass labels to integers for the training and fine-tuning part, so let's change the labels and save their corresponding numbers here:

In [6]:
df['label'] = df['label'].replace({'SECURITY AND SAFETY':0,
                                  'SOCIAL HARM':1,
                                  'OPERATIONAL INCIDENT':2,
                                  'PRIVACY VIOLATION':3})
ids2labels = {0:'SECURITY AND SAFETY',
              1: 'SOCIAL HARM',
              2: 'OPERATIONAL INCIDENT',
              3: 'PRIVACY VIOLATION'}

In [7]:
list_labels =df['label'].unique()
for i in list_labels:
  print(ids2labels[i]+" --> "+str(i) )

SECURITY AND SAFETY --> 0
SOCIAL HARM --> 1
OPERATIONAL INCIDENT --> 2
PRIVACY VIOLATION --> 3


Pass the df to datasets type, which is the type of dataset we want to work with.

In [8]:
df = Dataset.from_pandas(df)

In [9]:
true_labels = df["label"]

In [10]:
print(true_labels)

[0, 0, 1, 1, 1, 1, 2, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 2, 2, 1, 2, 2, 0, 0, 1, 2, 2, 0, 2, 2, 3, 2, 1, 1, 2, 1, 0, 2, 2, 1, 2, 2, 1, 2, 1, 2, 2, 2, 0]


In the column "clean-review" we have the descriptions lemmatazied and pre-processed, the *__corpus__*.

So, we only need to re-do those steps that come after the creation of the corpus, if necessary.

#### __CLASSIFICATION without fine-tuning__

First we will check how the zero-shot classifier with pre-trained facebook/bart-large-mnli would work without fine-tuning the pre-trained model with our data.

In [11]:
label_classifier = pipeline(
    model="facebook/bart-large-mnli",
    task="zero-shot-classification",
    return_all_scores=True,
    #device = 0  #IMPORTANT TO USE GPU AND SPEED UP THE PROCESS
)

In [12]:
# def predict(examples):
#     return {"predictions": label_classifier(examples['description'], list_labels)}

# # add .select(range(10)) before map if you just want to test this quickly with 10 examples
# to_label1 = df.map(predict, batched=True, batch_size=4)

In [13]:
# print(to_label1["predictions"][:3])

To wacth how accurate it is, we take only the label the zer-shot model has ranked higher, and we compare it to the label we put to that description, which is the correct one.

In [14]:
# predicted_1 = []
# for i in to_label1["predictions"]:
#   predicted_1.append(i["labels"][0])

Check the accuracy:

In [15]:
# acc_1 = sum(1 for x,y in zip(true_labels,predicted_1) if x == y) / float(len(true_labels))

# print("The accuracy of the zero-shot classifier is ", acc_1)

#### __CLASSIFICATION with fine-tuning__

Now let's try fine tuning the model with our data, which should work better as we are training with our instances.

In [16]:
# Import tokenizer

tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-mnli")

In [17]:
#Tokenize the data
def tokenize_function(examples):
    return tokenizer(examples["description"], padding="max_length", truncation=True)

tokenized_df = df.map(tokenize_function, batched=True)

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Let's separate the data so we have some evaluation instances to see how the model works:

In [18]:
train_df, eval_df = tokenized_df.train_test_split(test_size=0.5, seed=42).values()


In [34]:
print(train_df)

Dataset({
    features: ['label', 'description', 'input_ids', 'attention_mask'],
    num_rows: 25
})


Open the model to train:

In [19]:
model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-large-mnli",num_labels = 4, ignore_mismatched_sizes = True)

Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([4]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([4, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


And configure the trainer:

In [31]:
training_args = TrainingArguments(
    "./our_model",
    evaluation_strategy="epoch",
    logging_steps=30,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    eval_accumulation_steps=1,
  )

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred.predictions, eval_pred.label_ids
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_df,
    eval_dataset=eval_df,
    compute_metrics=compute_metrics,
)

In [32]:
trainer.train()

Epoch,Training Loss,Validation Loss


  logits_np = np.array(logits)


ValueError: ignored