In [26]:
import torch
from torchtext.legacy.data import Field, BucketIterator
from torchtext.legacy.datasets import SequenceTaggingDataset
from transformers import BertTokenizer
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertConfig, BertForTokenClassification


In [2]:
class AspectExtractionCorpus:

	def __init__(self, model, input_directory, device, model_name):
		
		self.input_directory = input_directory
		self.device = device
		self.model = model

		# List all the fields.
		if self.model in ["muril", "mbert"]:
			self.bert_tokenizer = BertTokenizer.from_pretrained(model_name)
			self.PAD_INDEX = self.bert_tokenizer.pad_token
			self.CLS_INDEX = self.bert_tokenizer.cls_token
			self.UNK_INDEX = self.bert_tokenizer.unk_token

			self.word_field = Field(batch_first = True,
									sequential = False,
									pad_token = self.PAD_INDEX,
									init_token = self.CLS_INDEX,
									unk_token = self.UNK_INDEX)
		
		else:
			self.word_field = Field(batch_first = True)

		self.tag_field = Field(unk_token = None, batch_first = True)
		self.FIELDS = (("word", self.word_field), ("tag", self.tag_field))

		# Create train and validation dataset using built-in parser from torchtext.
		self.train_ds, self.val_ds = SequenceTaggingDataset.splits(path = input_directory, 
																	train = 'train.txt', 
																	validation = 'val.txt', 
																	fields = self.FIELDS)

		# Convert fields to vocabulary list.
		self.word_field.build_vocab(self.train_ds.word)   # ADD VECTORS HERE.
		self.tag_field.build_vocab(self.train_ds.tag)

		# Prepare padding index to be ignored during model training/evaluation.
		self.word_pad_idx = self.word_field.vocab.stoi[self.word_field.pad_token]
		self.tag_pad_idx = self.tag_field.vocab.stoi[self.tag_field.pad_token]

		# Vocabulary and Tagset size.
		self.vocab_size = len(self.word_field.vocab.itos)    # Includeds <pad> and <unk> as well.
		self.tagset_size = len(self.tag_field.vocab.itos)    # Includes <pad> as well.

	def print_statistics(self):
		"""
		Prints the data statistics.
		"""
		print('\nLocation of dataset : ', self.input_directory)
		print('Length of training dataset : ', len(self.train_ds))
		print('Length of validation dataset : ', len(self.val_ds))
		print('Length of text vocab (unique words in dataset) : ', self.vocab_size)
		print('Length of label vocab (unique tags in labels) : ', self.tagset_size)
		print()

	def load_data(self, batch_size: int, shuffle: bool = False):
		'''
		Generates the data iterators for train and validation data.

		Parameters
		----------
		batch_size : int
			batch_size.
		shuffle : Bool, optional
			Whether to shuffle the data before training/testing. The default is True.

		Returns
		-------
		train_iter : training Dataloader instance.
		val_iter : validation Dataloader instance.
		'''
		train_dl, val_dl = BucketIterator.splits(datasets = (self.train_ds, self.val_ds),
															batch_sizes = (batch_size, batch_size),
															shuffle = shuffle,
															sort_key = lambda x: len(x.word),
															sort_within_batch = True,
															repeat = False,
															device = self.device)

		return train_dl, val_dl                                                      


In [5]:
root = r'D:\ML_projects\IPV-Project\data\aspect_extraction\kfold\2'

In [8]:
import pandas as pd
import numpy as np
import os

In [22]:
from pathlib import Path
import re

def flatten(mylist: list):
    return [item for sublist in mylist for item in sublist]

def read_CoNLL(file_path):
    file_path = Path(file_path)

    raw_text = file_path.read_text(encoding='utf8').strip()
    raw_docs = re.split(r'\n\t?\n', raw_text)
    token_docs = []
    tag_docs = []
    for doc in raw_docs:
        tokens = []
        tags = []
        for line in doc.split('\n'):
            token, tag = line.split('\t')
            tokens.append(token)
            tags.append(tag)
        token_docs.append(tokens)
        tag_docs.append(tags)

    return token_docs, tag_docs


In [23]:
texts, tags = read_CoNLL(os.path.join(root, 'train.txt'))
tags_list = set(flatten(tags))
tags_list

{'B-Others',
 'B-character_assasination',
 'B-ethnic_violence',
 'B-general_threat',
 'B-physical_threat',
 'B-profanity',
 'B-rape_threat',
 'B-religion_violence',
 'B-sexism',
 'I-Others',
 'I-character_assasination',
 'I-ethnic_violence',
 'I-general_threat',
 'I-physical_threat',
 'I-profanity',
 'I-rape_threat',
 'I-religion_violence',
 'I-sexism',
 'O'}

In [25]:
labels_to_ids = {k: v for v, k in enumerate(tags_list)}
ids_to_labels = {v: k for v, k in enumerate(tags_list)}

labels_to_ids

{'B-character_assasination': 0,
 'I-Others': 1,
 'B-rape_threat': 2,
 'B-profanity': 3,
 'B-Others': 4,
 'I-rape_threat': 5,
 'I-profanity': 6,
 'B-general_threat': 7,
 'B-religion_violence': 8,
 'B-physical_threat': 9,
 'I-character_assasination': 10,
 'I-religion_violence': 11,
 'I-general_threat': 12,
 'I-sexism': 13,
 'I-physical_threat': 14,
 'B-sexism': 15,
 'O': 16,
 'I-ethnic_violence': 17,
 'B-ethnic_violence': 18}

In [90]:
class AspectDataset(Dataset):
  def __init__(self, input_directory, tokenizer, max_len, labels_to_ids = None, is_train = True):
        self.input_directory = input_directory
        self.tokenizer = tokenizer
        self.max_len = max_len

        self.sentences, self.labels = read_CoNLL(self.input_directory)
        self.tags_list = set(flatten(self.labels))

        if is_train:
              self.labels_to_ids = {k: v for v, k in enumerate(self.tags_list)}
              self.ids_to_labels = {v: k for v, k in enumerate(self.tags_list)}
        else:
              self.labels_to_ids = labels_to_ids
              self.ids_to_labels = {v : k for k, v in self.labels_to_ids.items()}


  def __getitem__(self, index):

        # step 1: get the sentence and word labels 
        sentence = self.sentences[index]  
        word_labels = self.labels[index] 

        # step 2: use tokenizer to encode sentence (includes padding/truncation up to max length)
        # BertTokenizerFast provides a handy "return_offsets_mapping" functionality for individual tokens
        encoding = self.tokenizer(sentence,
                                  is_split_into_words=True, 
                                  return_offsets_mapping=True, 
                                  padding='max_length', 
                                  truncation=True, 
                                  max_length=self.max_len)
        
        # step 3: create token labels only for first word pieces of each tokenized word
        labels = [self.labels_to_ids[label] for label in word_labels] 
        
        # code based on https://huggingface.co/transformers/custom_datasets.html#tok-ner
        # create an empty array of -100 of length max_length
        encoded_labels = np.ones(len(encoding["offset_mapping"]), dtype=int) * -100
        
        # set only labels whose first offset position is 0 and the second is not 0
        i = 0
        for idx, mapping in enumerate(encoding["offset_mapping"]):
          if mapping[0] == 0 and mapping[1] != 0:
            # overwrite label
            encoded_labels[idx] = labels[i]
            i += 1

        # step 4: turn everything into PyTorch tensors
        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        item['labels'] = torch.as_tensor(encoded_labels)

        # Get lengths.
        item['seq_length'] = sum(encoding['attention_mask'])
        
        return item

  def __len__(self):
        return len(self.sentences)

In [37]:
tokenizer = BertTokenizerFast.from_pretrained('google/muril-base-cased')

In [70]:
MAX_LEN = 128
root

'D:\\ML_projects\\IPV-Project\\data\\aspect_extraction\\kfold\\2'

In [91]:
train_ds = AspectDataset(os.path.join(root, "train.txt"), tokenizer, MAX_LEN)
labels_to_ids = train_ds.labels_to_ids
val_ds = AspectDataset(os.path.join(root, "val.txt"), tokenizer, MAX_LEN, labels_to_ids, is_train=False)

In [108]:
train_dl = DataLoader(train_ds, batch_size=8, shuffle=False)

In [97]:
train_dl, val_dl = BucketIterator.splits((train_ds, val_ds), batch_sizes=(8,8), sort = False, sort_within_batch = True, sort_key = lambda x: x['seq_length'])


In [109]:
next(iter(train_dl))

{'input_ids': tensor([[   104,   1386,   8494,  ...,      0,      0,      0],
         [   104,  20643,   6370,  ...,      0,      0,      0],
         [   104,   7741,  73689,  ...,      0,      0,      0],
         ...,
         [   104, 196970, 168705,  ...,      0,      0,      0],
         [   104,  16316, 163572,  ...,      0,      0,      0],
         [   104,    446,   4277,  ...,      0,      0,      0]]),
 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'offset_mapping': tensor([[[ 0,  0],
          [ 1,  3],
          [ 3,  5],
          ...,
