# Preprocess raw data

You should preprocess the raw data before uploading it to the doccano tool:
* Remove icons
* Separate punctuation

We provide some functions to preprocess the raw data in the cell below.


In [2]:
import re
import unicodedata


def strip_emoji(text):
    """Return *text* with emoji and pictographic symbols removed.

    Characters in three code-point ranges are deleted: U+2600-U+27BF,
    U+1F300-U+1F64F and U+1F680-U+1F6FF. All other characters are kept
    unchanged.
    """
    emoji_pattern = u'([\U00002600-\U000027BF])|([\U0001f300-\U0001f64F])|([\U0001f680-\U0001f6FF])'

    # Every matched character is replaced by the empty string, i.e. dropped.
    return re.sub(emoji_pattern, '', text)


def preprocess_text(text: str):
    """Normalize raw text before it is uploaded for annotation.

    Three steps are applied in order:

    1. Unicode NFC normalization.
    2. Removal of characters in the ranges U+2600-U+27BF, U+1F300-U+1F64F
       and U+1F680-U+1F6FF (emoji and pictographs).
    3. Insertion of a single space on each side of ``!``, ``?``, ``(``,
       ``)`` and both quote characters wherever a space is not already
       there. This includes the start and the end of the string.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string.
    """
    emoji_pattern = u'([\U00002600-\U000027BF])|([\U0001f300-\U0001f64F])|([\U0001f680-\U0001f6FF])'
    # Zero-width match: either the gap in front of a punctuation mark that
    # has no space before it, or the gap behind one with no space after it.
    punctuation_gap_pattern = """(?<! )(?=[!?()'"])|(?<=[!?()'"])(?! )"""

    normalized = unicodedata.normalize('NFC', text)
    without_emoji = re.sub(emoji_pattern, '', normalized)

    return re.sub(punctuation_gap_pattern, ' ', without_emoji)

# Split train/val/test data

In [None]:
from utils.dataset import NERDataSet

# Custom entity tags for this specific task
entities_list = ['PERSONTYPE', 'LOCATION', 'PHONENUMBER', 'EMAIL',
                 'PRODUCT', 'URL', 'ORGANIZATION', 'DATETIME',
                 'QUANTITY', 'ADDRESS', 'PERSON', 'SKILL',
                 'EVENT', 'MISCELLANEOUS', 'IP']

# Replace 'absolute-path' with the absolute path of doccano's exported JSONL file
dataset = NERDataSet(jsonl_file='absolute-path', entity_names=entities_list)

In [None]:
# Table of loaded samples exposed by NERDataSet (presumably a pandas
# DataFrame -- confirm in utils.dataset)
data_df = dataset.dataset_df

In [None]:
# Preview the first rows to sanity-check the loaded data
data_df.head()

In [7]:
from sklearn.model_selection import train_test_split

In [None]:
# Keep 80% of the rows for training and hold out the rest. The split is
# stratified on the 'source' column and seeded so that it is reproducible.
train_df, rest_df = train_test_split(data_df, train_size=0.8, stratify=data_df['source'], shuffle=True, random_state=42)

In [None]:
# Split the held-out rows in half: one half for validation, the other for
# test, again stratified on the 'source' column with the same seed.
val_df, test_df = train_test_split(rest_df, train_size=0.5, stratify=rest_df['source'], shuffle=True, random_state=42)

In [None]:
# Preview the first rows of the training split
train_df.head()

# Write to file

In [None]:
from utils.repair_conll import convert

def write_to_file(df, file_path):
    """Write each row's item list to *file_path*, separated by blank lines.

    For every row, the strings at position 4 are written one per line,
    followed by an empty line that separates the row from the next one.

    Args:
        df: Object exposing ``.values`` as a sequence of rows (e.g. a pandas
            DataFrame). The element at position 4 of each row must be an
            iterable of strings; presumably these are the tagged token
            lines of one sentence -- confirm against NERDataSet.
        file_path: Destination path. An existing file is overwritten.
    """
    # Explicit UTF-8 so that non-ASCII text is written the same way on every
    # platform instead of depending on the locale's default encoding.
    with open(file_path, 'w', encoding='utf-8') as file:
        for row in df.values:
            file.write('\n'.join(row[4]))
            file.write('\n\n')

Create a new folder for your dataset inside the [dataset](./dataset) folder. Choose a folder name that reflects your data version.

In [None]:
# The train, val and test files must be named train_data.txt, val_data.txt
# and test_data.txt respectively.
write_to_file(train_df, './dataset/<your-dataset-version>/train_data.txt')
# Repair punctuation-related errors; the output path equals the input path,
# so the file written above is overwritten in place.
convert(file_need_repair_path='./dataset/<your-dataset-version>/train_data.txt',
        output_file_path='./dataset/<your-dataset-version>/train_data.txt')

In [43]:
# Write the validation split, then run convert on it; input and output paths
# are identical, so the file is overwritten in place.
write_to_file(val_df, './dataset/<your-dataset-version>/val_data.txt')

convert(file_need_repair_path='./dataset/<your-dataset-version>/val_data.txt',
        output_file_path='./dataset/<your-dataset-version>/val_data.txt')

In [44]:
# Write the test split, then run convert on it; input and output paths are
# identical, so the file is overwritten in place.
write_to_file(test_df, './dataset/<your-dataset-version>/test_data.txt')

convert(file_need_repair_path='./dataset/<your-dataset-version>/test_data.txt',
        output_file_path='./dataset/<your-dataset-version>/test_data.txt')