## Data Preprocessing and Analyses

In [None]:
# Environment setup: install the Hugging Face tooling, git-lfs and the NLP
# libraries that the cells below import.
!pip install huggingface_hub
!apt install git-lfs
!pip install datasets transformers seqeval
!pip install transformers[deepspeed]
!pip install mpi4py
!pip install jsonlines
# NOTE(review): `datasets` is already part of the combined install above, so
# this line is redundant.
!pip install datasets
!pip install spacy --upgrade
# Disabled: download of the large Portuguese spaCy model.
#!python -m spacy download pt_core_news_lg

In [None]:
from nltk.tokenize import wordpunct_tokenize
import string
def remove_punctuation(text):
    """
    Tokenize ``text`` and return its lower-cased non-punctuation tokens.

    :param text: raw string, split with ``wordpunct_tokenize``
    :return: list of lower-cased tokens; a token is dropped when it occurs
             as a substring of ``string.punctuation``
    """
    kept = []
    for token in wordpunct_tokenize(text):
        # Substring test against the punctuation alphabet: single marks are
        # dropped, while a run such as "..." is not a substring and is kept.
        # text2labels applies the same test, which keeps tokens and labels
        # aligned.
        if token in string.punctuation:
            continue
        kept.append(token.lower())
    return kept

In [None]:
## Legend: 0 = Other, 1 = I-PERIOD, 2 = I-COMMA
# Maps each integer class id (its position in the list) to its tag string.
id2label = dict(enumerate(['O', 'I-PERIOD', 'I-COMMA']))
def text2labels(sentence):
    """
    Derive one punctuation label per word token of ``sentence``.

    The sentence is lower-cased and split with ``wordpunct_tokenize``. Every
    token that is not found in ``string.punctuation`` gets label 0. A
    following '.', '?', '!' or ';' token rewrites the previous label to 1,
    and a following ',' rewrites it to 2. Any other punctuation token leaves
    the labels untouched.

    :param sentence: text to convert
    :return: list of integer labels (0, 1 or 2), one per non-punctuation token
    :raises ValueError: if one of the marks above appears before any word
    """
    period_marks = ('.', '?', '!', ';')
    labels = []
    for token in wordpunct_tokenize(sentence.lower()):
        if token not in string.punctuation:
            labels.append(0)
            continue

        if token in period_marks:
            new_label = 1
        elif token == ',':
            new_label = 2
        else:
            # Punctuation that carries no label (e.g. ':').
            continue

        # There must be a preceding word whose label can be rewritten.
        if not labels:
            raise ValueError(f"Sentence can't start with punctuation {token}")
        labels[-1] = new_label
    return labels

In [None]:
def preprocess_function(examples):
    """
    Add ``tokens`` and ``labels`` entries derived from ``examples['paraphrase']``.

    :param examples: mapping whose 'paraphrase' entry is an iterable of
                     strings; it is modified in place
    :return: the same ``examples`` object, now carrying 'tokens' (output of
             ``remove_punctuation``) and 'labels' (output of ``text2labels``)
    """
    label_rows = [text2labels(paraphrase) for paraphrase in examples['paraphrase']]
    token_rows = [remove_punctuation(paraphrase) for paraphrase in examples["paraphrase"]]

    examples["tokens"] = token_rows
    examples["labels"] = label_rows
    return examples

In [None]:

def build_dataset(annotations, tag, eos_label=1):
  """
  Cut annotated texts into sentence-level records.

  For every annotation the text is tokenized with ``remove_punctuation`` and
  walked in parallel with ``ann['labels']``. Tokens and their tag strings
  (looked up in ``id2label``) accumulate until a label equal to ``eos_label``
  is met; one record is then emitted and accumulation restarts. Tokens
  collected after the last ``eos_label`` of a text are not emitted.

  :param annotations: iterable of mappings with 'text', 'labels' and
                      'text_id' entries
  :param tag: value copied into the 'tag' field of every record
  :param eos_label: integer label that closes a sentence (default 1)
  :return: list of dicts with keys 'tag', 'text_id', 'tokens' and 'labels'
  :raises IndexError: if an annotation has more labels than tokens
  :raises KeyError: if a label is missing from ``id2label``
  """
  dataset = []

  for ann in annotations:
    tokens = remove_punctuation(ann['text'])
    sentence = []
    labels = []
    for i, ann_label in enumerate(ann['labels']):
      sentence.append(tokens[i])
      labels.append(id2label[ann_label])
      if ann_label == eos_label:
        dataset.append({
            'tag': tag,
            'text_id': ann['text_id'],
            'tokens': sentence,
            'labels': labels
        })
        # Rebind to fresh lists: the record above keeps the old ones.
        sentence = []
        labels = []

  return dataset

In [None]:
# Fetch the readability corpus; the corpus-building cell below reads its
# level folders from /content/corpus_readability_nlp_portuguese/.
!git clone https://github.com/gazzola/corpus_readability_nlp_portuguese.git

In [None]:
import nltk 
nltk.download('punkt')
from nltk.tokenize import sent_tokenize


In [None]:
# Quick check of the sentence splitter on a Portuguese sample.
# NOTE(review): no `language` argument is passed, so sent_tokenize uses its
# default model; confirm whether language='portuguese' was intended (the
# abbreviation "Sr." is the interesting case here).
sent_tokenize('Olá, Mundo. Aqui você tem chance! Sr. Carlos Larceda!')

In [None]:
def join_punctuation_marks(text):
    """
    Attach punctuation marks to the word that precedes them.

    A single whitespace character sitting between a word character and a run
    of the marks ``. , ? ! ; :`` is removed, e.g. 'Olá , mundo !' becomes
    'Olá, mundo!'.

    :param text: text whose punctuation may be detached from the words
    :return: text with those marks glued to the preceding word
    """
    # Imported locally because the notebook-level `import re` lives in a later
    # cell; on a fresh kernel this function would otherwise raise NameError.
    import re

    return re.sub(r'(\w)\s([.,?!;:]+)', r'\1\2', text)

In [None]:
# Demo call: exercises join_punctuation_marks on a string with a space before ',' and '!'.
join_punctuation_marks('Olá , mundo !')

In [None]:
# Sample input that starts and ends with ';', passed to remove_initial_punct at the end of this cell.
text = '; fantasminhas existem e ttêm medo de gente;'
import nltk
nltk.download('punkt')
from nltk.tokenize import wordpunct_tokenize, word_tokenize
import string 
import re


def remove_initial_punct(text_):
  """
  Strip the punctuation tokens found at the very start of ``text_``.

  The text is split with ``wordpunct_tokenize``, leading tokens made only of
  ``string.punctuation`` characters are skipped, and the remaining tokens are
  joined back with single spaces. Punctuation elsewhere in the text therefore
  ends up space-separated.

  :param text_: text that may begin with punctuation
  :return: space-joined tokens without the leading punctuation; '' when the
           text is empty or contains punctuation only
  """
  tokens = wordpunct_tokenize(text_)

  # A token is punctuation when every character is. The previous substring
  # test (`token in string.punctuation`) missed leading runs such as "..."
  # or "?!".
  start = 0
  while start < len(tokens) and all(ch in string.punctuation for ch in tokens[start]):
    start += 1

  return ' '.join(tokens[start:])

# Demo call on the sample `text` defined at the top of this cell.
remove_initial_punct(text)

In [None]:
def remove_extra_punctuation(text):
    """
    Collapse every run of the marks ``. , ? ! ; :`` into a single mark.

    The mark that survives is the last one of the run, so '!!!' becomes '!'
    and '?!' becomes '!'.

    :param text: text that may contain repeated punctuation
    :return: text with each run reduced to one character
    """
    repeated_marks = re.compile(r'([.,?!;:])+')
    # A repeated group keeps its final repetition, which is what \1 inserts.
    return repeated_marks.sub(r'\1', text)

remove_extra_punctuation('Olá, mundo!!! Mas eles não viram...')

In [None]:
def replace_punctuation(text):
    """
    Replace each ';', ':', '!' and '?' in ``text`` with a period.

    :param text: text to normalize
    :return: new string in which those four marks are '.'; commas and
             existing periods are left as they are
    """
    return re.sub(r'[;:!?]', '.', text)

replace_punctuation('Olá mundo!')

In [None]:
# Characters stripped by remove_bad_symbols: everything in string.punctuation
# except the marks ; : ! ? . , which the pipeline keeps.
bad_chars = ''.join([punct for punct in string.punctuation if punct not in ';:!?.,'])
# Prefix '+' and '*' with a backslash. The backslashes are spelled '\\' because
# '\+' and '\*' are invalid escape sequences that newer Python versions warn
# about; the resulting string is unchanged. The backslash is already part of
# string.punctuation, so the set of characters in bad_chars does not change.
bad_chars = bad_chars.replace('+', '\\+').replace('*', '\\*')
bad_chars

In [None]:
def remove_bad_symbols(text):
  """
  Drop every character of ``text`` that appears in the module-level ``bad_chars``.

  :param text: text to filter
  :return: ``text`` without the characters listed in ``bad_chars``
  """
  kept = []
  for ch in text:
    if ch in bad_chars:
      continue
    kept.append(ch)
  return ''.join(kept)

In [None]:
# Demo call: the input contains '=' and '/' (both in bad_chars) next to '.' (which is kept).
remove_bad_symbols('= file./// media / curso / cursoBrOffice / plan / controle.')

In [None]:
def preprocess_pipeline(text):
  """
  Run the full text-cleaning chain on ``text``.

  The steps are applied in this order: ``remove_initial_punct``,
  ``join_punctuation_marks``, ``remove_extra_punctuation``,
  ``replace_punctuation`` and ``remove_bad_symbols``.

  :param text: raw text
  :return: cleaned text
  """
  steps = (
      remove_initial_punct,
      join_punctuation_marks,
      remove_extra_punctuation,
      replace_punctuation,
      remove_bad_symbols,
  )
  for step in steps:
    text = step(text)
  return text

In [None]:
# Demo call of the whole cleaning chain on a sample with repeated '!', '?' and '.' marks.
preprocess_pipeline('Olá, mundo!!! Como estão vocês??? Já... Brincaram hoje de IA.')

In [None]:
# Delete the `dataset` directory left by a previous run.
# NOTE(review): the next cell starts with the same command, so this cell looks redundant.
!rm -r dataset

In [None]:
!rm -r dataset
import os, jsonlines, re
from tqdm.notebook import tqdm
DATASET_PATH = './dataset/'



os.makedirs(DATASET_PATH, exist_ok=True)
dataset_paths = ['/content/corpus_readability_nlp_portuguese/1_Ensino_Fundamental_I',
                 '/content/corpus_readability_nlp_portuguese/2_Ensino_Fundamental_II',
                 '/content/corpus_readability_nlp_portuguese/3_Ensino_Medio',
                 '/content/corpus_readability_nlp_portuguese/4_Ensino_Superior']
dataset_list = []
lines = []
for root_dir in dataset_paths:
  tag = os.path.split(root_dir)[-1]
  
  for filename in tqdm(os.listdir(root_dir)):
    with open(os.path.join(root_dir, filename), encoding='utf-8-sig') as f:
      text = f.read()
      new_text = preprocess_pipeline(text)
      for sentences in sent_tokenize(new_text):
          for sent in sent_tokenize(preprocess_pipeline(sentences).encode().decode('utf-8-sig')):
            new_sent = ' '.join(sent.split())
            real_tokens = [token for token in wordpunct_tokenize(new_sent)
                           if token not in string.digits+string.punctuation]
                           
            if len(real_tokens) > 1 and new_sent not in lines:
              lines.append(new_sent)
              line = {
                  'text_id': int(filename.replace('_', '').replace('.txt', '')),
                  'text': new_sent,
                  'level': re.sub(r'\d_', '', tag)
              }
              dataset_list.append(line)
              with jsonlines.open(os.path.join(DATASET_PATH, f'corpus_readability.jsonl'), mode='a') as writer:
                writer.write(line)
    

In [None]:
# Report the on-disk size of the generated `dataset` directory.
!du -hs dataset

In [None]:
from datasets import load_dataset, Dataset

# Load the JSON data found under ./dataset/; the cells below read the result
# through dataset['train'].
dataset = load_dataset('json', data_dir='./dataset/')

## General Statistics

In [None]:
from collections import Counter
import pandas as pd

# Share of each school level among all rows of the loaded dataset.
level_counts = Counter(dataset['train']['level'])
total_rows = dataset.num_rows['train']

dists = [(level.replace('_', ' '), round(count / total_rows, 3))
         for level, count in level_counts.items()]
print(dists)
# With no path argument, to_csv returns the CSV text, which the cell displays.
pd.DataFrame.from_dict(dict(dists), orient='index').to_csv()

### General Splits

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

# Fixed seed shared by both splits, so the train/validation/test partition is
# the same after every kernel restart.
SPLIT_SEED = 42

# Stage 1: hold out 20% of the row indices as the test set, stratified by level.
X_train, X_test = train_test_split(np.arange(0, len(dataset['train']['level'])),
                                   stratify=dataset['train']['level'], test_size=0.2,
                                   random_state=SPLIT_SEED)

# Stage 2: take 10% of the remaining indices as the validation set, again
# stratified by the level of the selected rows.
X_train, X_dev = train_test_split(X_train,
                                   stratify=dataset['train'].select(X_train)['level'], test_size=0.1,
                                   random_state=SPLIT_SEED)
X_test

In [None]:
# Peek at the first three sentences of the loaded dataset.
dataset['train']['text'][:3]

In [None]:
from datasets.dataset_dict import DatasetDict
# Materialize the three index arrays as named splits of one DatasetDict.
split_indices = {'train': X_train, 'validation': X_dev, 'test': X_test}
new_dataset = DatasetDict({
    split_name: dataset['train'].select(row_indices)
    for split_name, row_indices in split_indices.items()
})

In [None]:
# Display the split summary of the DatasetDict built above.
new_dataset

In [None]:
# Delete the intermediate JSONL file written by the corpus-building cell.
# NOTE(review): this is an absolute /content path, while the file was written
# under the relative './dataset/'; the two only match when the working
# directory is /content — confirm.
!rm /content/dataset/corpus_readability.jsonl

## Specific Statistics

In [None]:
import pandas as pd
from collections import Counter
from itertools import chain

In [None]:
# Keep only the rows that belong to the two Ensino Fundamental levels.
filtered_ds = new_dataset.filter(lambda ex: ex['level'] in ['Ensino_Fundamental_II', 'Ensino_Fundamental_I'])

### Write the statistics to a TSV file, one row per split
statistics = "split\tnum_texts\tnum_sentences\tSentences Fundamental I\tSentences Fundamental II\n"

# The file is tab-separated despite its .csv extension.
with open('statistics.csv', 'w') as f:
  f.write(statistics)
  for split in ['train', 'test', 'validation']:
    split_ds = filtered_ds[split]
    # Number of distinct source texts and number of sentences (rows).
    n_texts = len(set(split_ds['text_id']))
    n_sents = split_ds.num_rows

    # Sentences per level; a level that is absent counts as 0.
    info_dict = Counter(split_ds['level'])
    f1 = info_dict['Ensino_Fundamental_I']
    f2 = info_dict['Ensino_Fundamental_II']

    f.write(f'{split}\t{n_texts}\t{n_sents}\t{f1}\t{f2}\n')
    

In [None]:
# Share of each level within the training split of the filtered dataset.
train_level_counts = Counter(filtered_ds['train']['level'])
train_rows = filtered_ds.num_rows['train']

dists = [(level.replace('_', ' '), round(count / train_rows, 3))
         for level, count in train_level_counts.items()]
# With no path argument, to_csv returns the CSV text, which the cell displays.
pd.DataFrame.from_dict(dict(dists), orient='index').to_csv()

In [None]:
import pandas as pd


# statistics.csv was written with tab separators, hence sep='\t'.
df = pd.read_csv('statistics.csv', sep='\t')
df

In [None]:
# No path is given, so to_csv returns the comma-separated text and the cell displays it.
df.to_csv()

In [None]:
from huggingface_hub import notebook_login
# Interactive Hugging Face Hub login; no credentials are stored in the notebook itself.
notebook_login()

In [None]:
from datasets import load_dataset 


# Load 'tiagoblima/nilc-school-books' by name.
# NOTE: this rebinds `dataset`, replacing the locally built one loaded earlier.
dataset = load_dataset('tiagoblima/nilc-school-books')
dataset