In [28]:
!pip install transformers
!pip install ipywidgets
!pip install IProgress
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [29]:
import transformers
from transformers import BertTokenizer, BertConfig
from transformers import BertModel, BertForSequenceClassification
from transformers import BatchEncoding

import torch
import torch.nn as nn

import datasets

## tokenizer

In [30]:
# Shared tokenizer for every cell below, loaded from the "bert-base-uncased"
# checkpoint. The outputs further down show that it lower-cases text and
# splits rare words into "##"-prefixed word pieces.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

## data pre-processing

In [31]:
batch_size = 2  # matches the two sample paragraphs built below
# Placeholder for now: intended to become a notebook-wide setting later.
# NOTE(review): no visible cell reads batch_size yet.

#### batch 1

In [32]:
# First sample paragraph of the toy batch.
# The leading and trailing newlines, and the line breaks inside the literal,
# are part of the string that gets tokenized.
paragraph_1 = """
Some people may argue that children will be more material,
neglect their study for earning money or be exploited by the
employers. However, if children get good care and
instructions from their parents, they can take advantages of the
work to learn valuable things and avoid going in a wrong way.
"""

In [33]:
# Second sample paragraph of the toy batch.
# Both text lines end with a trailing space before the newline; that
# whitespace is part of the string (see the repr printed further below).
paragraph_2 = """
Take Thailand for example, in the Vietnam War, many American soldiers came to Thailand for a break and involved in sexual and drug activities, 
these huge demands caused many local businesses opened and expanded, even illegally involved in under-age prostitutes to maximize their profits. 
"""

In [34]:
# Raw texts of the batch, in sample order.
batch_texts = [paragraph_1, paragraph_2]

In [35]:
# Spans for paragraph 1, written as (start, end) pairs with BOTH ends inclusive.
# NOTE(review): the indices look like word-piece positions without [CLS]/[SEP]
# (paragraph 1 encodes to 56 pieces, and the last span ends at 55) - confirm.
am_1_span, ac_1_span = (0, 4), (5, 23)
am_2_span, ac_2_span = (24, 25), (26, 55)

# Order matters: am_1, ac_1, am_2, ac_2.
span_1 = [am_1_span, ac_1_span, am_2_span, ac_2_span]

In [36]:
# Spans for paragraph 2, same convention as span_1: inclusive (start, end)
# pairs in the order am_1, ac_1, am_2, ac_2.
# The am_*/ac_* names are rebound here and now hold paragraph 2's values.
am_1_span, ac_1_span = (0, 6), (7, 23)
am_2_span, ac_2_span = (24, 26), (27, 48)

span_2 = [am_1_span, ac_1_span, am_2_span, ac_2_span]

In [37]:
# Per-sample span lists, aligned index-for-index with batch_texts.
batch_spans = [span_1, span_2]

In [38]:
batch_spans

[[(0, 4), (5, 23), (24, 25), (26, 55)], [(0, 6), (7, 23), (24, 26), (27, 48)]]

In [39]:
batch_texts

['\nSome people may argue that children will be more material,\nneglect their study for earning money or be exploited by the\nemployers. However, if children get good care and\ninstructions from their parents, they can take advantages of the\nwork to learn valuable things and avoid going in a wrong way.\n',
 '\nTake Thailand for example, in the Vietnam War, many American soldiers came to Thailand for a break and involved in sexual and drug activities, \nthese huge demands caused many local businesses opened and expanded, even illegally involved in under-age prostitutes to maximize their profits. \n']

In [40]:
# Integer labels per sample, aligned index-for-index with batch_texts.
# NOTE(review): two labels per sample against four spans - presumably only
# some of the spans are labelled; confirm the intended pairing.
labels_1, labels_2 = [1, 2], [1, 0]
batch_labels = [labels_1, labels_2]

## dataset

In [41]:
# Column-oriented view of the toy batch: one key per column, one list entry
# per sample.
dataset_d = dict(texts=batch_texts, spans=batch_spans, labels=batch_labels)

In [42]:
# Wrap the column dict in a datasets.Dataset (features: texts, spans, labels).
# The span tuples come back as lists when read from the dataset, as the
# dataset['spans'] output below shows.
dataset = datasets.Dataset.from_dict(dataset_d)

In [43]:
dataset

Dataset({
    features: ['texts', 'spans', 'labels'],
    num_rows: 2
})

In [44]:
dataset['texts']

['\nSome people may argue that children will be more material,\nneglect their study for earning money or be exploited by the\nemployers. However, if children get good care and\ninstructions from their parents, they can take advantages of the\nwork to learn valuable things and avoid going in a wrong way.\n',
 '\nTake Thailand for example, in the Vietnam War, many American soldiers came to Thailand for a break and involved in sexual and drug activities, \nthese huge demands caused many local businesses opened and expanded, even illegally involved in under-age prostitutes to maximize their profits. \n']

In [45]:
dataset['spans']

[[[0, 4], [5, 23], [24, 25], [26, 55]], [[0, 6], [7, 23], [24, 26], [27, 48]]]

In [46]:
dataset['labels']

[[1, 2], [1, 0]]

In [47]:
def tokenize(dataset):
    """Tokenize a batch of texts and attach padded span and label tensors.

    Relies on the notebook-level ``tokenizer``.

    Parameters
    ----------
    dataset
        Anything indexable by column name that provides ``'texts'``,
        ``'spans'`` and ``'labels'`` (e.g. the ``datasets.Dataset`` built
        above, or a plain dict of lists). ``'spans'`` holds, per text, a list
        of ``[start, end]`` pairs; ``'labels'`` must already be rectangular
        because it is converted to a tensor without padding.

    Returns
    -------
    The tokenizer's output for ``dataset['texts']`` (padded, PyTorch tensors)
    with two extra entries:

    * ``'spans'``  - tensor built from the span lists, each right-padded with
      ``[0, 0]`` up to the longest span list in the batch;
    * ``'labels'`` - tensor built from ``dataset['labels']``.

    Raises
    ------
    ValueError
        If ``dataset['spans']`` is empty (there is no longest span list).
    """
    tokenized_text = tokenizer(dataset['texts'], padding=True, return_tensors="pt")

    # Read the column once and reuse it, instead of looking it up again for
    # every row inside the loop.
    spans_per_text = dataset['spans']
    max_length = max(len(spans) for spans in spans_per_text)

    # Right-pad every span list with [0, 0] so the batch is rectangular.
    # NOTE(review): a padding entry is indistinguishable from a real (0, 0)
    # span; downstream code presumably needs a mask or a sentinel - confirm.
    span_ll = [
        list(spans) + [[0, 0]] * (max_length - len(spans))
        for spans in spans_per_text
    ]

    tokenized_text['spans'] = torch.tensor(span_ll)
    tokenized_text['labels'] = torch.tensor(dataset['labels'])
    return tokenized_text

In [48]:
# Encode the whole toy dataset in one call; the result carries the tokenizer
# fields plus the 'spans' and 'labels' tensors added by tokenize().
tokenized_input = tokenize(dataset)

In [49]:
tokenized_input

{'input_ids': tensor([[  101,  2070,  2111,  2089,  7475,  2008,  2336,  2097,  2022,  2062,
          3430,  1010, 19046,  2037,  2817,  2005,  7414,  2769,  2030,  2022,
         18516,  2011,  1996, 12433,  1012,  2174,  1010,  2065,  2336,  2131,
          2204,  2729,  1998,  8128,  2013,  2037,  3008,  1010,  2027,  2064,
          2202, 12637,  1997,  1996,  2147,  2000,  4553,  7070,  2477,  1998,
          4468,  2183,  1999,  1037,  3308,  2126,  1012,   102],
        [  101,  2202,  6504,  2005,  2742,  1010,  1999,  1996,  5148,  2162,
          1010,  2116,  2137,  3548,  2234,  2000,  6504,  2005,  1037,  3338,
          1998,  2920,  1999,  4424,  1998,  4319,  3450,  1010,  2122,  4121,
          7670,  3303,  2116,  2334,  5661,  2441,  1998,  4423,  1010,  2130,
         17800,  2920,  1999,  2104,  1011,  2287, 20833,  2000, 25845,  2037,
         11372,  1012,   102,     0,     0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [50]:
len(tokenized_input['input_ids'][0]), len(tokenized_input['input_ids'][1])

(58, 58)

In [51]:
tokenized_input['input_ids'][0]

tensor([  101,  2070,  2111,  2089,  7475,  2008,  2336,  2097,  2022,  2062,
         3430,  1010, 19046,  2037,  2817,  2005,  7414,  2769,  2030,  2022,
        18516,  2011,  1996, 12433,  1012,  2174,  1010,  2065,  2336,  2131,
         2204,  2729,  1998,  8128,  2013,  2037,  3008,  1010,  2027,  2064,
         2202, 12637,  1997,  1996,  2147,  2000,  4553,  7070,  2477,  1998,
         4468,  2183,  1999,  1037,  3308,  2126,  1012,   102])

In [52]:
# Word-piece strings for paragraph 2. Unlike the encoded input_ids above,
# this list carries no [CLS]/[SEP] markers (see the output below).
text_tokenized = tokenizer.tokenize(paragraph_2)

In [53]:
text_tokenized

['take',
 'thailand',
 'for',
 'example',
 ',',
 'in',
 'the',
 'vietnam',
 'war',
 ',',
 'many',
 'american',
 'soldiers',
 'came',
 'to',
 'thailand',
 'for',
 'a',
 'break',
 'and',
 'involved',
 'in',
 'sexual',
 'and',
 'drug',
 'activities',
 ',',
 'these',
 'huge',
 'demands',
 'caused',
 'many',
 'local',
 'businesses',
 'opened',
 'and',
 'expanded',
 ',',
 'even',
 'illegally',
 'involved',
 'in',
 'under',
 '-',
 'age',
 'prostitutes',
 'to',
 'maximize',
 'their',
 'profits',
 '.']

In [72]:
text = 'To conclude, I strongly believe that the tourism has created undeniable and threatening pressure on both the sociocultural, legal and natural environments.'

In [73]:
# Word pieces of the single test sentence.
# NOTE: this rebinds text_tokenized, which previously held paragraph 2's pieces.
text_tokenized = tokenizer.tokenize(text)

In [74]:
text_tokenized

['to',
 'conclude',
 ',',
 'i',
 'strongly',
 'believe',
 'that',
 'the',
 'tourism',
 'has',
 'created',
 'und',
 '##enia',
 '##ble',
 'and',
 'threatening',
 'pressure',
 'on',
 'both',
 'the',
 'socio',
 '##cultural',
 ',',
 'legal',
 'and',
 'natural',
 'environments',
 '.']

In [75]:
span = (3, 24)

In [76]:
span

(3, 24)

In [82]:
span[0], span[1]

(3, 24)

In [83]:
len(text_tokenized)

28

In [84]:
len(text)

155

In [88]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [89]:
from nltk.tokenize import word_tokenize

In [91]:
# Word-level tokenization of the same sentence with NLTK, for comparison with
# the BERT word pieces (25 NLTK tokens vs. 28 word pieces in the outputs).
nltk_text = word_tokenize(text)

In [92]:
len(nltk_text)

25

In [1]:
def get_span(paragraph, marker_component):
    """Find where ``marker_component`` occurs inside ``paragraph``, in word pieces.

    Both strings are split with the notebook-level ``tokenizer.tokenize``
    (plain word-piece strings, no special tokens). The paragraph's pieces are
    then scanned for the first contiguous occurrence of the component's pieces.

    Parameters
    ----------
    paragraph : str
        Text to search in.
    marker_component : str
        Text whose word-piece sequence should be located.

    Returns
    -------
    tuple[int, int]
        ``(start, end)`` word-piece indices of the first match, both inclusive
        (the same convention as the hand-written spans in this notebook).

    Raises
    ------
    ValueError
        If ``marker_component`` yields no word pieces, or if its word-piece
        sequence does not occur in ``paragraph``.
    """
    # tokenizer.tokenize returns a plain list of strings that can be sliced and
    # compared. Calling the tokenizer object returns an encoding mapping, and
    # the notebook's tokenize() helper expects a dataset, so neither fits here.
    paragraph_tokens = tokenizer.tokenize(paragraph)
    component_tokens = tokenizer.tokenize(marker_component)

    width = len(component_tokens)
    if width == 0:
        # An empty needle would "match" at every position with end < start.
        raise ValueError("marker_component produced no tokens")

    # Only start positions that leave room for the whole component are tried.
    for start in range(len(paragraph_tokens) - width + 1):
        if paragraph_tokens[start:start + width] == component_tokens:
            return (start, start + width - 1)

    raise ValueError(
        "marker_component was not found in paragraph: %r" % (marker_component,)
    )