# Text Augmentation

This file applies basic augmentations to text strings stored in a CSV file.

First, we import the required modules and the augmentation functions from `./augmentation_utils.py`.

In [None]:
import pandas as pd
import random

from augmentation_utils import remove_punctuation, insert_misspellings, insert_typo

Next, we load in our CSV file and state the column name of the strings we want to augment. 

In [None]:
# Name of the CSV column that holds the strings to augment.
text_column = "string"

# Read the source data into a DataFrame.
DATA = pd.read_csv("./test_data.csv")

Next, we iterate through each string and assign it a randomly generated augmentation key.

An augmentation key is a 3-character string in which each character is either 0 or 1, for example `010`. At least one character is always 1, so every string receives at least one augmentation.

This decides which augmentations are going to happen:

- If the first character is a 1, punctuation will be removed.
- If the second character is a 1, words will be replaced with common misspellings found in `./missp.csv`.
- If the third character is a 1, random characters will be replaced with different characters, i.e. typos.

In [None]:
# Pull out the column of strings to augment.
text_list = DATA[text_column]

# Augmentation steps in key order: character k of the key switches step k
# on ('1') or off ('0').
augmentation_steps = (
    remove_punctuation,
    lambda text: insert_misspellings(text, prob = 0.5),
    insert_typo,
)

augmented_text_list = []
augmentation_keys = []

for string in text_list:

    # Draw three random bits, redrawing the whole key until at least one bit
    # is set so that every string receives at least one augmentation.
    while True:
        augmentation_key = "".join(str(random.randint(0, 1)) for _ in range(3))
        if "1" in augmentation_key:
            break

    # Apply each enabled step in order, feeding the result into the next one.
    for flag, augment in zip(augmentation_key, augmentation_steps):
        if flag == "1":
            string = augment(string)

    augmentation_keys.append(augmentation_key)
    augmented_text_list.append(string)

Let's look at an augmented string.

In [None]:
# Show the first augmented string, then the key that produced it, one per line.
print(augmented_text_list[0], augmentation_keys[0], sep="\n")

Finally, we can save the augmented data as a new csv.

In [None]:
# Pair each augmented string with the key that produced it. The text column
# keeps its original name; the key goes in a new "augmentation_key" column.
# Fix: the DataFrame constructor must be called with parentheses around the
# dict; the original `pd.DataFrame{...}` was a syntax error.
augmentation_dataframe = pd.DataFrame(
    {
        text_column: augmented_text_list,
        "augmentation_key": augmentation_keys,
    }
)

# Write the result to the current working directory.
# Fix: the extension was misspelled ".scsv" in the original filename.
augmentation_dataframe.to_csv("augmented_test_data.csv")