In [1]:
!pip install huggingface_hub
!apt install git-lfs
! pip install datasets transformers seqeval
!pip install transformers[deepspeed]
!pip install mpi4py
!pip install jsonlines

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting huggingface_hub
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 8.9 MB/s 
Installing collected packages: huggingface-hub
Successfully installed huggingface-hub-0.10.1
Reading package lists... Done
Building dependency tree       
Reading state information... Done
git-lfs is already the newest version (2.3.4-1).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 5 not upgraded.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.6.1-py3-none-any.whl (441 kB)
[K     |████████████████████████████████| 441 kB 7.1 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl 

In [2]:
from huggingface_hub import notebook_login
# Ask for a Hugging Face access token interactively. The recorded output of this
# cell shows the token being saved to /root/.huggingface/token.
notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token


In [3]:
from nltk.tokenize import wordpunct_tokenize
import string
def remove_punctuation(text):
    """
    Tokenize ``text`` and return its lower-cased tokens without punctuation.

    :param text: raw string to clean
    :return: list of lower-cased tokens, in their original order
    """
    kept = []
    for piece in wordpunct_tokenize(text):
        # ``in`` on a str is a substring test: a token is dropped only when it
        # occurs as a contiguous piece of string.punctuation, so a
        # multi-character token such as "..." is kept.
        if piece in string.punctuation:
            continue
        kept.append(piece.lower())
    return kept

In [4]:
## Legend: 0 = Other, 1 = I-PERIOD, 2 = I-COMMA
# Integer class id -> tag name; ids follow the position in the list below.
id2label = dict(enumerate(['O', 'I-PERIOD', 'I-COMMA']))
def text2t5labels(sentence):
    """
    Convert a punctuated sentence into the tagged target string.

    The sentence is lower-cased and tokenized with ``wordpunct_tokenize``.
    Words are copied to the output. A word followed by '.', '?', '!' or ';'
    is followed by "[I-PERIOD]"; a word followed by ',' is followed by
    "[I-COMMA]". When two or more words precede the mark since the previous
    tag, "[Other]" is inserted before the last of them to close the run of
    unpunctuated words. Any other punctuation token is ignored.

    A labelled mark that has no preceding word still yields its tag; no
    exception is raised.

    :param sentence: text to convert
    :return: one string with words and tags separated by single spaces
    """
    period_marks = ('.', '?', '!', ';')
    pieces = []
    # Words appended since the last tag (or since the start of the sentence).
    run_length = 0
    for token in wordpunct_tokenize(sentence.lower()):
        if token not in string.punctuation:
            pieces.append(token)
            run_length += 1
            continue

        if token in period_marks:
            tag = '[I-PERIOD]'
        elif token == ',':
            tag = '[I-COMMA]'
        else:
            # Punctuation without a label of its own leaves the output as is.
            continue

        # Only emit "[Other]" when there really are unpunctuated words before
        # the tagged word; counting from the last tag (not from the start of
        # the sentence) avoids an "[Other]" that closes an empty span, as in
        # "a, b, c.".
        if run_length > 1:
            pieces.insert(len(pieces) - 1, '[Other]')
        pieces.append(tag)
        run_length = 0
    return ' '.join(pieces)

In [5]:
##Legend 0 = Other, 1 = I-PERIOD, 2 = I-COMMA
# Integer class id -> tag name; 0, 1 and 2 are the values text2labels emits.
# NOTE(review): this repeats, entry for entry, the id2label mapping already
# defined in the previous cell.
id2label ={
    0:'O',
    1:'I-PERIOD',
    2:'I-COMMA'
}
def text2labels(sentence):
    """
    Build one integer label per non-punctuation token of ``sentence``.

    Every such token starts with label 0. A following '.', '?', '!' or ';'
    changes the label of the token before it to 1, a following ',' changes it
    to 2. Other punctuation tokens do not affect the labels.

    :param sentence: text to convert
    :return: list of int labels (0, 1 or 2)
    :raises ValueError: if a labelled punctuation mark comes before any word
    """
    mark2id = {'.': 1, '?': 1, '!': 1, ';': 1, ',': 2}
    tags = []
    for token in wordpunct_tokenize(sentence.lower()):
        if token not in string.punctuation:
            tags.append(0)
            continue

        class_id = mark2id.get(token)
        if class_id is None:
            # Punctuation without a class of its own: nothing to relabel.
            continue

        try:
            # Relabel the most recent word; an empty list means there is none.
            tags[-1] = class_id
        except IndexError:
            raise ValueError(f"Sentence can't start with punctuation {token}")
    return tags

In [6]:
def preprocess_function(examples):
    """
    Add word tokens and integer labels derived from the 'paraphrase' entry.

    :param examples: mapping whose 'paraphrase' entry is an iterable of sentences
    :return: the same mapping, with 'tokens' (output of remove_punctuation per
             sentence) and 'labels' (output of text2labels per sentence) added
    """
    sentences = examples['paraphrase']
    label_rows = [text2labels(sentence) for sentence in sentences]
    token_rows = [remove_punctuation(sentence) for sentence in sentences]

    examples["tokens"] = token_rows
    examples["labels"] = label_rows
    return examples

In [7]:
from tqdm.notebook import tqdm

def preprocess_dataset(example):
    """
    Add the model input string and the tagged target string to one row.

    :param example: mapping with a 'text' entry holding the punctuated sentence
    :return: the same mapping with two entries added: 'text_input' (the prefix
             'Recognize Entities: ' followed by the space-joined output of
             remove_punctuation) and 'labels' (output of text2t5labels)
    """
    text = example['text']
    example['text_input'] = 'Recognize Entities: ' + ' '.join(remove_punctuation(text))
    example['labels'] = text2t5labels(text)
    return example

In [9]:
from datasets import load_dataset 

# Load all splits of 'tiagoblima/nilc-school-books' from the Hub.
# use_auth_token=True authenticates the request with the locally stored login token.
raw_dataset = load_dataset('tiagoblima/nilc-school-books', use_auth_token=True)
raw_dataset

Downloading readme:   0%|          | 0.00/1.67k [00:00<?, ?B/s]



Downloading and preparing dataset None/None to /root/.cache/huggingface/datasets/tiagoblima___parquet/tiagoblima--nilc-school-books-10ccdebb85916fe0/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/732k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/303k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.61M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/tiagoblima___parquet/tiagoblima--nilc-school-books-10ccdebb85916fe0/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    test: Dataset({
        features: ['text_id', 'text', 'level'],
        num_rows: 8321
    })
    validation: Dataset({
        features: ['text_id', 'text', 'level'],
        num_rows: 3329
    })
    train: Dataset({
        features: ['text_id', 'text', 'level'],
        num_rows: 29952
    })
})

In [10]:
# Distinct values of the 'level' column in the training split.
set(raw_dataset['train']['level'])

{'Ensino_Fundamental_I',
 'Ensino_Fundamental_II',
 'Ensino_Medio',
 'Ensino_Superior'}

In [11]:
# Rows are kept only when their 'level' is one of these values.
KEPT_LEVELS = ['Ensino_Fundamental_I', 'Ensino_Fundamental_II']

# Filter before mapping: the filter reads only 'level', which preprocess_dataset
# does not modify, so the result is the same while the preprocessing runs on
# the kept rows only instead of on every row of every split.
new_dataset = (
    raw_dataset
    .filter(lambda example: example['level'] in KEPT_LEVELS)
    .map(preprocess_dataset, load_from_cache_file=False)
)
new_dataset

  0%|          | 0/8321 [00:00<?, ?ex/s]

  0%|          | 0/3329 [00:00<?, ?ex/s]

  0%|          | 0/29952 [00:00<?, ?ex/s]

  0%|          | 0/9 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/30 [00:00<?, ?ba/s]

DatasetDict({
    test: Dataset({
        features: ['text_id', 'text', 'level', 'text_input', 'labels'],
        num_rows: 2604
    })
    validation: Dataset({
        features: ['text_id', 'text', 'level', 'text_input', 'labels'],
        num_rows: 1041
    })
    train: Dataset({
        features: ['text_id', 'text', 'level', 'text_input', 'labels'],
        num_rows: 9371
    })
})

In [12]:
# Spot-check the first training row: model input string next to its target string.
new_dataset['train'][0]['text_input'],new_dataset['train'][0]['labels']

('Recognize Entities: por isso ela demorou para aperfeiçoar a invenção que depois foi comprada por uma empresa de chicago nos estados unidos',
 'por isso ela demorou para aperfeiçoar a [Other] invenção [I-COMMA] que depois foi comprada por uma empresa de [Other] chicago [I-COMMA] nos estados [Other] unidos [I-PERIOD]')

In [13]:
import jsonlines, os

# Relative path of a local dataset directory.
# NOTE(review): not referenced by any of the cells shown in this part of the notebook.
DATASET_PATH = './dataset/'

In [14]:
# Display the splits, columns and row counts of the processed dataset.
new_dataset

DatasetDict({
    test: Dataset({
        features: ['text_id', 'text', 'level', 'text_input', 'labels'],
        num_rows: 2604
    })
    validation: Dataset({
        features: ['text_id', 'text', 'level', 'text_input', 'labels'],
        num_rows: 1041
    })
    train: Dataset({
        features: ['text_id', 'text', 'level', 'text_input', 'labels'],
        num_rows: 9371
    })
})

In [15]:
import os

# SECURITY: an access token used to be hard-coded in this cell. Treat that token
# as leaked and revoke it in the Hugging Face account settings.
# The token is now read from the HF_TOKEN environment variable. When the
# variable is unset, HF_TOKEN is None and push_to_hub falls back to the token
# stored locally by notebook_login().
HF_TOKEN = os.environ.get('HF_TOKEN')
new_dataset.push_to_hub('tiagoblima/punctuation-nilc-t5', private=True, token=HF_TOKEN)
new_dataset



Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]



Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]



Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/654 [00:00<?, ?B/s]

DatasetDict({
    test: Dataset({
        features: ['text_id', 'text', 'level', 'text_input', 'labels'],
        num_rows: 2604
    })
    validation: Dataset({
        features: ['text_id', 'text', 'level', 'text_input', 'labels'],
        num_rows: 1041
    })
    train: Dataset({
        features: ['text_id', 'text', 'level', 'text_input', 'labels'],
        num_rows: 9371
    })
})