In [1]:
# Importing libraries

import csv
import random

In [2]:
import pandas as pd

In [3]:
# Reading the dataset

import numpy as np

df_train = pd.read_csv("/kaggle/input/disflq-train/disflqa-annotated-train.csv")
df_val = pd.read_csv("/kaggle/input/disfqa/disflqa-annotated-dev.csv")
df_test = pd.read_csv("/kaggle/input/disfqa/disflqa-annotated-test.csv")

In [4]:
# Determining labels and assigning them to integers for mapping

labels = [i.split() for i in df_train['annotation'].values.tolist()]

# Check how many labels are there in the dataset
unique_labels = set()

for lb in labels:
  [unique_labels.add(i) for i in lb if i not in unique_labels]
 
print(unique_labels)

{'BR', 'IR', 'O'}


In [5]:
labels_to_ids = {k: v for v, k in enumerate(sorted(unique_labels))}
ids_to_labels = {v: k for v, k in enumerate(sorted(unique_labels))}
print(ids_to_labels)
print(labels_to_ids)

{0: 'BR', 1: 'IR', 2: 'O'}
{'BR': 0, 'IR': 1, 'O': 2}


In [6]:
from transformers import BertTokenizerFast

In [7]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

Downloading tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [8]:
import torch

def align_label(texts, labels):
    tokenized_inputs = tokenizer(texts, padding='max_length', max_length=100, truncation=True)

    word_ids = tokenized_inputs.word_ids()

    previous_word_idx = None
    label_ids = []

    for word_idx in word_ids:

        if word_idx is None:
            label_ids.append(-100)

        elif word_idx != previous_word_idx:
            try:
                label_ids.append(labels_to_ids[labels[word_idx]])
            except:
                label_ids.append(-100)
        else:
            try:
                label_ids.append(labels_to_ids[labels[word_idx]] if label_all_tokens else -100)
            except:
                label_ids.append(-100)
        previous_word_idx = word_idx

    return label_ids

class DataSequence(torch.utils.data.Dataset):

    def __init__(self, df):

        lb = [i.split() for i in df['annotation'].values.tolist()]
        txt = df['disfluent'].values.tolist()
        self.texts = [tokenizer(str(i),
                               padding='max_length', max_length = 100, truncation=True, return_tensors="pt") for i in txt]
        self.labels = [align_label(i,j) for i,j in zip(txt, lb)]

    def __len__(self):

        return len(self.labels)

    def get_batch_data(self, idx):

        return self.texts[idx]

    def get_batch_labels(self, idx):

        return torch.LongTensor(self.labels[idx])

    def __getitem__(self, idx):

        batch_data = self.get_batch_data(idx)
        batch_labels = self.get_batch_labels(idx)

        return batch_data, batch_labels

In [9]:
from transformers import BertForTokenClassification

class BertModel(torch.nn.Module):

    def __init__(self):

        super(BertModel, self).__init__()

        self.bert = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=len(unique_labels))

    def forward(self, input_id, mask, label):

        output = self.bert(input_ids=input_id, attention_mask=mask, labels=label, return_dict=False)

        return output

In [10]:
from torch.utils.data import DataLoader
from torch.optim import SGD
from tqdm import tqdm

# Defining train loop
def train_loop(model, df_train, df_val):
    # Load train and val datasets
    train_dataset = DataSequence(df_train)
    val_dataset = DataSequence(df_val)

    train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False)
    val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
    # use gpu or cpu according to machine
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    optimizer = SGD(model.parameters(), lr=LEARNING_RATE)

    for epoch_num in range(EPOCHS):
        # print('test')
        total_acc_train = 0
        total_loss_train = 0

        model.train()
        model = model.to(device)
        # print('test4')
        #forward pass and backpropagation through the model
        for train_data, train_label in tqdm(train_dataloader):
            # print('test2')
            train_label = train_label.to(device)
            mask = train_data['attention_mask'].squeeze(1).to(device)
            input_id = train_data['input_ids'].squeeze(1).to(device)

            optimizer.zero_grad()
            loss, logits = model(input_id, mask, train_label)
            for i in range(logits.shape[0]):
              # print('test3')                
              logits_clean = logits[i][train_label[i] != -100]
              label_clean = train_label[i][train_label[i] != -100]

              predictions = logits_clean.argmax(dim=1)
              acc = (predictions == label_clean).float().mean()
              total_acc_train += acc
              total_loss_train += loss.item()

            loss.backward()
            optimizer.step()

        model.eval()

        total_acc_val = 0
        total_loss_val = 0
        # calculate validation scores
        for val_data, val_label in val_dataloader:

            val_label = val_label.to(device)
            mask = val_data['attention_mask'].squeeze(1).to(device)
            input_id = val_data['input_ids'].squeeze(1).to(device)

            loss, logits = model(input_id, mask, val_label)

            for i in range(logits.shape[0]):

              logits_clean = logits[i][val_label[i] != -100]
              label_clean = val_label[i][val_label[i] != -100]

              predictions = logits_clean.argmax(dim=1)
              acc = (predictions == label_clean).float().mean()
              total_acc_val += acc
              total_loss_val += loss.item()
              # print('test1')

        val_accuracy = total_acc_val / len(df_val)
        val_loss = total_loss_val / len(df_val)
        torch.save(model.state_dict(), 'checkpoint_disq_data.pth')
        print(
            f'Epochs: {epoch_num + 1} | Loss: {total_loss_train / len(df_train): .3f} | Accuracy: {total_acc_train / len(df_train): .3f} | Val_Loss: {total_loss_val / len(df_val): .3f} | Accuracy: {total_acc_val / len(df_val): .3f}')

LEARNING_RATE = 5e-3
EPOCHS = 5
BATCH_SIZE = 2

model = BertModel()

train_loop(model, df_train, df_val)

Downloading model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 3590/3590 [01:58<00:00, 30.31it/s]


Epochs: 1 | Loss:  0.433 | Accuracy:  0.843 | Val_Loss:  0.326 | Accuracy:  0.886


100%|██████████| 3590/3590 [01:55<00:00, 30.96it/s]


Epochs: 2 | Loss:  0.310 | Accuracy:  0.896 | Val_Loss:  0.288 | Accuracy:  0.907


100%|██████████| 3590/3590 [01:55<00:00, 31.04it/s]


Epochs: 3 | Loss:  0.257 | Accuracy:  0.915 | Val_Loss:  0.276 | Accuracy:  0.915


100%|██████████| 3590/3590 [01:55<00:00, 31.09it/s]


Epochs: 4 | Loss:  0.217 | Accuracy:  0.929 | Val_Loss:  0.279 | Accuracy:  0.917


100%|██████████| 3590/3590 [01:55<00:00, 31.11it/s]


Epochs: 5 | Loss:  0.180 | Accuracy:  0.941 | Val_Loss:  0.297 | Accuracy:  0.915


In [11]:
!pip install seqeval

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- \ | done
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l- \ | done
[?25h  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=391f9e63a4b249d843d0366351becdf854ad553e1ee398d5bd2a9b9740cb857f
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [12]:
# Function to evaluate accuracy of the model

def evaluate2(model, df_test):

    test_dataset = DataSequence(df_test)

    test_dataloader = DataLoader(test_dataset, batch_size=1)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()

    total_acc_test = 0.0

    for test_data, test_label in test_dataloader:

            test_label = test_label.to(device)
            mask = test_data['attention_mask'].squeeze(1).to(device)

            input_id = test_data['input_ids'].squeeze(1).to(device)

            loss, logits = model(input_id, mask, test_label)

            for i in range(logits.shape[0]):

              logits_clean = logits[i][test_label[i] != -100]
              label_clean = test_label[i][test_label[i] != -100]

              predictions = logits_clean.argmax(dim=1)
              acc = (predictions == label_clean).float().mean()
              total_acc_test += acc

    val_accuracy = total_acc_test / len(df_test)
    print(f'Test Accuracy: {total_acc_test / len(df_test): .3f}')


evaluate2(model, df_test)

Test Accuracy:  0.890


In [13]:
#to calculate precision and recall
# {0: 'B-br', 1: 'B-mo', 2: 'I-br', 3: 'I-mo', 4: 'O'}
# {'B-br': 0, 'B-mo': 1, 'I-br': 2, 'I-mo': 3, 'O': 4}

from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
def evaluate(model, df_test):
    
    test_dataset = DataSequence(df_test)
    test_dataloader = DataLoader(test_dataset, batch_size=1)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()

    true_labels = []
    predicted_labels = []
    y_true=[]
    y_pred=[]
    for test_data, test_label in test_dataloader:

        # print(test_data)
        test_label = test_label.to(device)
        mask = test_data['attention_mask'].squeeze(1).to(device)
        input_id = test_data['input_ids'].squeeze(1).to(device)

        loss, logits = model(input_id, mask, test_label)

        for i in range(logits.shape[0]):
            logits_clean = logits[i][test_label[i] != -100]
            label_clean = test_label[i][test_label[i] != -100]
            predictions = logits_clean.argmax(dim=1)
            true_labels.extend(label_clean.cpu().numpy())
            predicted_labels.extend(predictions.cpu().numpy())
            # print(logits[i])
            # print(label_clean.cpu().numpy())
            y_true.append(label_clean.cpu().numpy().tolist())
            y_pred.append(predictions.cpu().numpy().tolist())
    
    print(y_true)
    for i in range(len(true_labels)):
        true_labels[i] = ids_to_labels[true_labels[i]]

    for i in range(len(predicted_labels)):
        predicted_labels[i] = ids_to_labels[predicted_labels[i]]
    
    y_true_id=[]
    y_pred_id=[]
    for i in range(len(y_true)):
        y_true_id.append([])
        for j in range(len(y_true[i])):
            y_true_id[i].append(ids_to_labels[y_true[i][j]])
    
    for i in range(len(y_pred)):
        y_pred_id.append([])
        for j in range(len(y_pred[i])):
            y_pred_id[i].append(ids_to_labels[y_pred[i][j]])
    
    print(y_true_id)
    
    from seqeval.metrics import accuracy_score
    from seqeval.metrics import classification_report
    from seqeval.metrics import f1_score
    print()
    print("F1 score: ", f1_score(y_true_id, y_pred_id))
    print()
    print(classification_report(y_true_id, y_pred_id))


evaluate(model, df_test)




[[2, 2, 2, 2, 0, 1, 1, 1, 2, 0, 1], [0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2], [2, 2, 0, 1, 1, 1, 2, 2, 2, 2, 2], [0, 1, 1, 2, 2, 2, 2, 2], [0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 0, 1, 1, 1, 1, 1, 1, 2], [2, 2, 0, 1, 1, 1, 1, 1, 2], [2, 2, 2, 0, 1, 1, 1, 2, 2, 2, 2, 2], [2, 2, 0, 1, 1, 1, 1, 1, 1, 1], [2, 2, 2, 2, 2, 2, 0, 1, 1, 1, 1, 1, 2, 2, 2], [2, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2], [2, 0, 1, 1, 1, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2], [2, 2, 2, 2, 2, 0, 1, 1, 1, 1, 1, 2, 2], [2, 0, 1, 1, 1, 2, 2, 2, 2], [2, 2, 2, 0, 1, 1, 1, 1, 2], [0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 0, 1], [0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 1, 1, 1, 2], [2, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2], [0, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 0], [0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 0, 1, 1, 1, 1, 1, 1, 1, 1], [2, 0, 1, 1,



F1 score:  0.6380425832255028

              precision    recall  f1-score   support

           R       0.64      0.64      0.64      4255

   micro avg       0.64      0.64      0.64      4255
   macro avg       0.64      0.64      0.64      4255
weighted avg       0.64      0.64      0.64      4255



In [14]:
def align_word_ids(texts):
  
    tokenized_inputs = tokenizer(texts, padding='max_length', max_length=512, truncation=True)

    word_ids = tokenized_inputs.word_ids()

    previous_word_idx = None
    label_ids = []

    for word_idx in word_ids:

        if word_idx is None:
            label_ids.append(-100)

        elif word_idx != previous_word_idx:
            try:
                label_ids.append(1)
            except:
                label_ids.append(-100)
        else:
            try:
                label_ids.append(1 if label_all_tokens else -100)
            except:
                label_ids.append(-100)
        previous_word_idx = word_idx

    return label_ids

# Calculate output for a single sentence
def evaluate_one_text(model, sentence):


    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()

    text = tokenizer(sentence, padding='max_length', max_length = 512, truncation=True, return_tensors="pt")

    mask = text['attention_mask'].to(device)
    input_id = text['input_ids'].to(device)
    label_ids = torch.Tensor(align_word_ids(sentence)).unsqueeze(0).to(device)

    logits = model(input_id, mask, None)
    logits_clean = logits[0][label_ids != -100]

    predictions = logits_clean.argmax(dim=1).tolist()
    prediction_label = [ids_to_labels[i] for i in predictions]
    print(sentence)
    print(prediction_label)
            
evaluate_one_text(model, 'Bosch 347 L Frost Free Double Door 3 Star Refrigerator model KDN43VL40I offers efficient cooling and ample storage space for your food and beverages.')

Bosch 347 L Frost Free Double Door 3 Star Refrigerator model KDN43VL40I offers efficient cooling and ample storage space for your food and beverages.
['BR', 'O', 'IR', 'IR', 'IR', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'IR', 'IR', 'IR', 'O', 'O']


In [15]:
evaluate_one_text(model, 'In the Midea MDRS619FGG28IND Frost-Free Side-by-Side Refrigerator, "frost-free" means the refrigerator does not have to be defrosted manually, as it has an auto-defrost feature.')

In the Midea MDRS619FGG28IND Frost-Free Side-by-Side Refrigerator, "frost-free" means the refrigerator does not have to be defrosted manually, as it has an auto-defrost feature.
['O', 'O', 'O', 'O', 'IR', 'IR', 'O', 'IR', 'IR', 'IR', 'IR', 'IR', 'O', 'O', 'O', 'IR', 'IR', 'O', 'IR', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'IR', 'O', 'IR', 'O', 'O', 'O', 'IR', 'IR', 'IR', 'O']


In [16]:
evaluate_one_text(model, 'In the Midea MDRS619FGG28IND Frost Free Side by Side Refrigerator, the brand is listed as Midea and the model number is MDRS619FGG28IND.')

In the Midea MDRS619FGG28IND Frost Free Side by Side Refrigerator, the brand is listed as Midea and the model number is MDRS619FGG28IND.
['O', 'O', 'O', 'BR', 'IR', 'O', 'IR', 'IR', 'O', 'O', 'IR', 'IR', 'IR', 'IR', 'IR', 'IR', 'IR', 'IR', 'IR', 'O', 'O', 'O', 'O', 'O']


In [17]:
evaluate_one_text(model, 'Whirlpool 240 L Frost Free Multi-Door Refrigerator (FP 263D PROTTON ROY, German Steel)')

Whirlpool 240 L Frost Free Multi-Door Refrigerator (FP 263D PROTTON ROY, German Steel)
['BR', 'O', 'O', 'IR', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'IR', 'IR', 'IR', 'IR', 'IR', 'IR']
