In [1]:
import csv
import pandas as pd
import random
import numpy as np

In [2]:
# Load the training set and discard every row that contains a missing value.
df = pd.read_csv(r'train.csv').dropna()

In [7]:
def get_insertion_string(text, duplicate_index, num_duplications):
    """Repeat one alphanumeric character of ``text`` to simulate an insertion typo.

    Only characters for which ``isalpha()`` or ``isdigit()`` is true are
    counted; ``duplicate_index`` is the zero-based position within that
    filtered sequence. ``num_duplications`` extra copies of the selected
    character are inserted immediately in front of it.

    If ``duplicate_index`` does not address any alphanumeric character
    (the text has too few of them), the text is returned unchanged.
    """
    # Defaults describe the "no match" case: append an empty string at the end.
    insert_at = len(text)
    repeated = ''
    seen = 0
    for position, char in enumerate(text):
        if not (char.isalpha() or char.isdigit()):
            continue
        if seen == duplicate_index:
            insert_at, repeated = position, char
            break
        seen += 1

    return text[:insert_at] + num_duplications * repeated + text[insert_at:]

In [20]:
def create_adversarial_examples(data, adv_size=0.5, seed=None):
    """Return a copy of ``data`` in which a fraction of the rows carry a typo.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide a ``'text'`` column (strings) and a ``'sentiment'`` column.
    adv_size : float, default 0.5
        Fraction of rows, between 0 and 1 inclusive, that are selected for
        perturbation. Selected rows are spread evenly over the frame by row
        position; with the default, every second row (positions 1, 3, 5, ...)
        is selected.
    seed : int or None, default None
        When given, a private ``random.Random(seed)`` drives all random
        choices so the output is reproducible. When ``None`` the shared
        ``random`` module state is used.

    Returns
    -------
    pandas.DataFrame
        Columns ``'text'`` and ``'sentiment'`` with a fresh 0..n-1 index.
        Unselected rows are copied verbatim. Selected rows shorter than four
        characters are dropped, because they are too short to perturb.

    Raises
    ------
    ValueError
        If ``adv_size`` is outside ``[0, 1]``.
    """
    if not 0 <= adv_size <= 1:
        raise ValueError("adv_size must be between 0 and 1 (inclusive)")

    # Shortest text (in characters) that may receive a typo.
    min_chars = 4
    rng = random if seed is None else random.Random(seed)

    adversarial_examples = []
    labels = []
    # Select by row position, not index label: labels may have gaps
    # (e.g. after dropna) or may not be integers at all.
    for position, (_, row) in enumerate(data.iterrows()):
        text = row['text']
        sentiment = row['sentiment']

        # A row is selected whenever the running quota position * adv_size
        # crosses an integer, which yields an even spread of
        # ~adv_size * len(data) selected rows.
        is_adversarial = int((position + 1) * adv_size) > int(position * adv_size)
        if not is_adversarial:
            adversarial_examples.append(text)
            labels.append(sentiment)
            continue

        # edge case: don't allow typo creation if we barely have any characters
        text_len = len(text)
        if text_len < min_chars:
            continue

        # there are three types of letter-level typos: insertion, deletion, swaps
        # each adversarial example has a uniform chance of being each type
        typo_type = rng.randint(1, 3)
        char_total = 0
        if typo_type == 1:
            # count total number of alphanumeric characters
            char_total = sum(c.isalpha() or c.isdigit() for c in text)
            if char_total == 0:
                # Nothing to duplicate: fall back to deletion or swap.
                typo_type = rng.randint(2, 3)

        if typo_type == 1:
            # insertion: repeat one alphanumeric character 1-3 extra times
            num_duplications = rng.randint(1, 3)
            duplicate_index = rng.randint(0, char_total - 1)
            typo_text = get_insertion_string(text, duplicate_index, num_duplications)
        elif typo_type == 2:
            # deletion: remove 1-3 consecutive characters ending just before deletion_end
            num_deletions = rng.randint(1, 3)
            deletion_end = rng.randint(num_deletions, text_len)
            typo_text = text[:deletion_end - num_deletions] + text[deletion_end:]
        else:
            # swap: exchange the two adjacent characters just before swap_end
            swap_end = rng.randint(2, text_len)
            typo_text = ''.join((
                text[:swap_end - 2],
                text[swap_end - 1],
                text[swap_end - 2],
                text[swap_end:],
            ))

        adversarial_examples.append(typo_text)
        labels.append(sentiment)

    return pd.DataFrame({"text": adversarial_examples, "sentiment": labels})

In [21]:
# Swap the cleaned frame for its typo-augmented counterpart (default adv_size).
df = create_adversarial_examples(data=df)

In [22]:
# Preview the first ten rows of the augmented frame.
df.head(n=10)

Unnamed: 0,text,sentiment
0,"I`d have responded, if I were going",neutral
1,Sooo SA I will miss you here in San Diego!!!,negative
2,my boss is bullying me...,negative
3,what interview! leave me alonnne,negative
4,"Sons of ****, why couldn`t they put them on t...",negative
5,htttp://www.dothebouncy.com/smf - some shamele...,neutral
6,2am feedings for the baby are fun when he is a...,positive
7,Both of you,neutral
8,Jrney!? Wow... u just became cooler. hehe......,positive
9,"as much as i love to be hopeful, i reckon the...",neutral


In [24]:
# Persist the augmented frame; the row index is written as the first column.
df.to_csv(path_or_buf='adversarial_typos_train.csv')