In [1]:
# import SNLI dataset from h
from datasets import load_dataset

dataset = load_dataset("snli")
# Access the splits
train_data = dataset['train']
validation_data = dataset['validation']
test_data = dataset['test']

In [2]:
train_data[0]

{'premise': 'A person on a horse jumps over a broken down airplane.',
 'hypothesis': 'A person is training his horse for a competition.',
 'label': 1}

In [2]:
import re
import string
# Define the punctuation set we care about
PUNCT = {'.', '!', '?'}
common_contractions = {
    "do not": "don't",
    "is not": "isn't",
    "are not": "aren't",
    "it is": "it's",
    "that is": "that's",
    "we are": "we're",
    "you are": "you're",
    "I am": "I'm",
    "I will": "I'll",
    "I would": "I'd",
    "they are": "they're",
    "will not": "won't",
    "can not": "can't",
    "there is": "there's"
}

def encased_with_apostrophes(text):
    # Check if the text is encased with standard quotes (artificat in SNLI)
    return text.startswith('"') and text.endswith('"')

def starts_with_uppercase_word(text):
    # Strip leading whitespace and check if the first character is uppercase
    text = text.lstrip()
    if not text:
        return False
    return text[0].isupper()

def ends_with_punctuation(text):
    # Check if the last non-whitespace character is punctuation
    text = text.rstrip()
    return len(text) > 0 and text[-1] in PUNCT

def contains_punctuation(text):
    # Check if there's any punctuation in the text
    # return any(ch in string.punctuation for ch in text)
    return any(ch in PUNCT for ch in text)

def whitespace_encoding(text):
    # Identify all distinct whitespace code points used in the text.
    # This will differentiate between e.g. U+0020 (normal space) and U+00A0 (no-break space).
    whitespaces = set()
    for ch in text:
        if ch.isspace():
            whitespaces.add(ord(ch))  # store the code point
    return whitespaces

def apostrophe_encoding(text):
    # Extract all apostrophe-like characters: common are `'` and `’`
    # Return a set of apostrophe chars used
    # If you want to be more comprehensive, include other variants.
    # Here we include backtick and right single quotation mark as well.
    possible_apostrophes = {"'", "’", "`"}
    apostrophes = {ch for ch in text if ch in possible_apostrophes}
    return apostrophes

def extract_number_patterns(text):
    # Find all numbers and their surrounding formatting.
    # We'll capture substrings around each digit sequence that may include punctuation and spacing.
    number_patterns = []
    for match in re.finditer(r"\d+", text):
        start, end = match.span()
        # Extend outwards to include punctuation/whitespace directly adjacent to the digits
        left = start
        while left > 0 and (text[left-1] in string.punctuation or text[left-1].isspace()):
            left -= 1
        right = end
        while right < len(text) and (text[right] in string.punctuation or text[right].isspace()):
            right += 1
        substring = text[left:right].strip()
        number_patterns.append(substring)
    return number_patterns

def compare_number_formats(patterns1, patterns2):
    # Check if both lists have the same number of numeric patterns
    if len(patterns1) != len(patterns2):
        return False
    # Compare each pair of patterns
    for p1, p2 in zip(patterns1, patterns2):
        # Compare digits sequence
        digits1 = re.sub(r"\D", "", p1)
        digits2 = re.sub(r"\D", "", p2)
        if digits1 != digits2:
            return False
        # Compare non-digit formatting
        non_digits1 = re.sub(r"\d", "", p1)
        non_digits2 = re.sub(r"\d", "", p2)
        if non_digits1 != non_digits2:
            return False
    return True

def contains_newline(text):
    return "\n" in text

def contains_contractions(text):
    # Check if text contains any of the known contracted forms
    pattern = r'\b(?:' + '|'.join(map(re.escape, common_contractions.values())) + r')\b'
    return bool(re.search(pattern, text, flags=re.IGNORECASE))

def can_form_contractions(text):
    # Check if text contains any expansions that could be turned into known contractions
    # If we find at least one expansion pattern in the text, return True
    for expansion in common_contractions.keys():
        # Create a regex pattern for the expansion
        exp_words = expansion.split()
        pattern = r'\b' + r'\s+'.join(exp_words) + r'\b'
        if re.search(pattern, text, flags=re.IGNORECASE):
            return True
    return False


In [9]:
def compare_texts(text1, text2):
    conditions = []
    conditions.append(encased_with_apostrophes(text1) == encased_with_apostrophes(text2))
    conditions.append(starts_with_uppercase_word(text1) == starts_with_uppercase_word(text2))
    conditions.append(ends_with_punctuation(text1) == ends_with_punctuation(text2))
    conditions.append(contains_punctuation(text1) == contains_punctuation(text2))
    conditions.append(whitespace_encoding(text1) == whitespace_encoding(text2))
    conditions.append(apostrophe_encoding(text1) == apostrophe_encoding(text2))
    patterns1 = extract_number_patterns(text1)
    patterns2 = extract_number_patterns(text2)
    conditions.append(compare_number_formats(patterns1, patterns2))
    conditions.append(contains_contractions(text1) == contains_contractions(text2))
    similarity = sum(conditions) / len(conditions)
    return similarity

def make_texts_similar(text1, text2):
    # Adjust Quotes
    if encased_with_apostrophes(text1) != encased_with_apostrophes(text2):
        if encased_with_apostrophes(text1) and not encased_with_apostrophes(text2):
            text2 = '"' + text2 + '"'
        elif not encased_with_apostrophes(text1) and encased_with_apostrophes(text2):
            text2 = text2[1:-1]
    
    # Adjust capitalization at the start
    if starts_with_uppercase_word(text1) != starts_with_uppercase_word(text2):
        if starts_with_uppercase_word(text1) and not starts_with_uppercase_word(text2):
            stripped = text2.lstrip()
            if stripped:
                start_idx = len(text2) - len(stripped)
                text2 = text2[:start_idx] + stripped[0].upper() + stripped[1:]
        elif not starts_with_uppercase_word(text1) and starts_with_uppercase_word(text2):
            stripped = text2.lstrip()
            if stripped:
                start_idx = len(text2) - len(stripped)
                text2 = text2[:start_idx] + stripped[0].lower() + stripped[1:]

    # Adjust punctuation at the end
    if ends_with_punctuation(text1) != ends_with_punctuation(text2):
        if ends_with_punctuation(text1) and not ends_with_punctuation(text2):
            t1_end_punct = text1.rstrip()[-1]
            text2 = text2.rstrip() + t1_end_punct
        elif not ends_with_punctuation(text1) and ends_with_punctuation(text2):
            text2 = text2.rstrip()
            while text2 and text2[-1] in PUNCT:
                text2 = text2[:-1]

    # Now text1 and text2 should be similar in capitalization and end punctuation.
    # Apostrophe and whitespace encoding is the same initially.
    # Randomly decide if we want to change them for BOTH texts simultaneously.
    
    # Random chance to change whitespace encoding for both
    # For example, replace all regular spaces with non-breaking spaces in both texts
    if random.random() < 0.5:
        # Check if we have spaces
        if " " in text1 or " " in text2:
            # Replace all spaces with non-breaking spaces
            text1 = text1.replace(" ", "\u00A0")
            text2 = text2.replace(" ", "\u00A0")

    # Random chance to change the dialect for both texts
    if random.random() < 0.5:
        # Randomly select a dialect from DIALECTS
        attempts = 5  # limit attempts to avoid infinite loops
        changed = False
        while attempts > 0 and not changed:
            try:
                dialect = random.choice(DIALECTS)
                text1 = dialect.transform(text1)
                text2 = dialect.transform(text2)
                changed = True
            except:  # if the dialect transformation fails
                print(f"Failed to transform {text1} or {text2}. Retrying {attempts} more times ...")
            attempts -= 1

    # Random chance to toggle apostrophe encoding for both
    # If we have apostrophes, switch them from `'` to `’` or vice versa
    apos1 = apostrophe_encoding(text1)
    apos2 = apostrophe_encoding(text2)
    # Since they are initially the same, we can just pick a toggle.
    if random.random() < 0.5 and (apos1 and apos2):
        # If we have at least one type of apostrophe in the texts
        # If we find `'` in texts, replace it with `’`, else if `’` then replace with `'`
        if "'" in text1 or "'" in text2:
            # Replace `'` with `’`
            text1 = text1.replace("'", "’")
            text2 = text2.replace("'", "’")
        elif "’" in text1 or "’" in text2:
            # Replace `’` with `'`
            text1 = text1.replace("’", "'")
            text2 = text2.replace("’", "'")

    return text1, text2
    

In [4]:
from multivalue import Dialects
DIALECTS = [Dialects.ColloquialSingaporeDialect(), Dialects.AfricanAmericanVernacular(), Dialects.ChicanoDialect(), Dialects.IndianDialect(), Dialects.AppalachianDialect(), 
            Dialects.NorthEnglandDialect(), Dialects.MalaysianDialect(), Dialects.AustralianDialect(), Dialects.HongKongDialect(), Dialects.NewZealandDialect(),
            Dialects.NigerianDialect(), Dialects.PakistaniDialect(), Dialects.PhilippineDialect(), Dialects.SoutheastAmericanEnclaveDialect()]

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


[nltk_data] Downloading package cmudict to /Users/anna/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/anna/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [12]:
import random

def flip_quotes(t):
    if encased_with_apostrophes(t):
        return t[1:-1], True
    else:
        return '"' + t + '"', True
    
def flip_capitalization(t):
    stripped = t.lstrip()
    if not stripped:
        return t, False
    start_idx = len(t) - len(stripped)
    first_char = stripped[0]
    if first_char.isalpha():
        flipped = first_char.lower() if first_char.isupper() else first_char.upper()
        new_t = t[:start_idx] + flipped + stripped[1:]
        changed = (new_t != t)
        return new_t, changed
    else:
        return t, False

def toggle_end_punctuation(t):
    if ends_with_punctuation(t):
        original = t
        t = t.rstrip()
        while t and t[-1] in PUNCT:
            t = t[:-1]
        changed = (t != original)
        return t, changed
    else:
        return t + ".", True

# def toggle_punctuation_presence(t):
#     if contains_punctuation(t):
#         original = t
#         t = "".join(ch for ch in t if ch not in PUNCT).rstrip()
#         changed = (t != original)
#         return t, changed
#     else:
#         return t, False

def toggle_whitespace_encoding(t):
    # Assume it only includes " " whitespaces. Change those to non-breaking spaces (\u00A0)
    original = t
    if " " in t:
        # Replace all spaces with non-breaking spaces
        t = t.replace(" ", "\u00A0")
        changed = (t != original)
        return t, changed
    else:
        # No spaces to change
        return t, False

def toggle_apostrophe_encoding(t):
    original = t
    apos = apostrophe_encoding(t)
    if apos:
        if "'" in apos and "’" in apos:
            t = t.replace("'", "\uFFFF")
            t = t.replace("’", "'")
            t = t.replace("\uFFFF", "’")
        elif "'" in apos:
            t = t.replace("'", "’")
        elif "’" in apos:
            t = t.replace("’", "'")
        changed = (t != original)
        return t, changed
    else:
        return t, False

def toggle_number_format(t):
    patterns = extract_number_patterns(t)
    changed = False
    if patterns:
        for p in patterns:
            if ',' in p:
                new_p = re.sub(r",", "", p)
                if new_p != p:
                    idx = t.find(p)
                    if idx != -1:
                        t = t[:idx] + new_p + t[idx+len(p):]
                        changed = True
                        break
    return t, changed

def dialect_transform(text2):
    # randomly select a dialect from DIALECTS
    changed = False
    attempts = 5  # limit attempts to avoid infinite loops
    # transform
    while not changed and attempts > 0:
        dialect = random.choice(DIALECTS)
        transformed_text = text2
        try:
            transformed_text = dialect.transform(text2)
        except:  # if the dialect transformation fails
            print(f"Failed to transform with {text2}. Retrying {attempts} more times...")
        if transformed_text != text2:
            return transformed_text, True
        attempts -= 1
    return text2, False
        

def maybe_add_contraction(text1, text2):
    # Only add a contraction if:
    # - text1 can form contractions
    # - text1 has no contractions
    # - text2 has no contractions
    original = text2
    if not can_form_contractions(text1):
        return text2, False
    if contains_contractions(text1) or contains_contractions(text2):
        return text2, False

    expansions = list(common_contractions.keys())
    random.shuffle(expansions)

    for expansion in expansions:
        exp_words = expansion.split()
        pattern = r'\b' + r'\s+'.join(exp_words) + r'\b'
        match = re.search(pattern, text2, flags=re.IGNORECASE)
        if match:
            contraction = common_contractions[expansion]
            matched_text = match.group(0)
            if matched_text[0].isupper():
                contraction = contraction[0].upper() + contraction[1:]
            text2 = text2[:match.start()] + contraction + text2[match.end():]
            return text2, (text2 != original)

    return text2, False



def make_texts_distinct(text1, text2):
    """
        Assumes to be called on SNLI text pairs
    :param text1: 
    :param text2: 
    :return: 
    """
    transformations = [
        flip_quotes,
        flip_capitalization,
        toggle_end_punctuation,
        toggle_whitespace_encoding,
        toggle_apostrophe_encoding,
        toggle_number_format,
        lambda t: maybe_add_contraction(text1, t),
    ]
    
    # flip coin to to dialect_transform as this is a transformation that needs to run before all other transformations
    text_modified = text2
    if random.random() < 0.5:
        text_modified, changed = dialect_transform(text2)

    attempts = 20  # limit attempts to avoid infinite loops
    while attempts > 0:
        # Attempt two further random transformation
        three_trans = random.sample(transformations, 3)
        for transform in three_trans:
            new_text, changed = transform(text_modified)
            if changed:
                text_modified = new_text
        attempts -= 1
        if text_modified != text2:
            return text_modified

    # If we exit the loop, we failed to reduce similarity
    return text_modified


    

In [27]:
text_a = "I talked with them yesterday."
text_b = "Hello,\u00a0world!"
print(dialect_transform(text_a))


('I have talked with them yesterday.', True)


In [22]:
text_a = "Hello, world!"
text_b = "Hello,\u00a0world!"
score = compare_texts(text_a, text_b)
print("Similarity score:", score)

Similarity score: 0.875


In [7]:
text_a = "Hello, world!\nThe price is 1,000 dollars. It’s great."
text_b = "hello world. The price is 1000 dollars It's great"
score = compare_texts(text_a, text_b)
print("Similarity score:", score)

Similarity score: 0.25


In [8]:
text_a = "The two farmers are working on a piece of John Deere equipment."
text_b = "Men are working on John Deere equipment"
score = compare_texts(text_a, text_b)
print("Similarity score:", score)
text_b_synth = make_texts_similar(text_a, text_b)[1]
print("Synthesized text:", text_b_synth)
score = compare_texts(text_a, text_b_synth)
print("Similarity score:", score)

Similarity score: 0.75
Synthesized text: Men are working on John Deere equipment.
Similarity score: 0.875


In [9]:
text_a = "There is a party"
text_b = "There is a party"
score = compare_texts(text_a, text_b)
print("Similarity score:", score)
text_b_synth = make_texts_distinct(text_a, text_b)
print("Synthesized text:", text_b_synth)
score = compare_texts(text_a, text_b_synth)
print("Similarity score:", score)

Similarity score: 1.0
Synthesized text: there is a party
Similarity score: 0.875


## Data Augmentation for SNLI

In [None]:
import pandas as pd
import os
from tqdm import tqdm

dataset = load_dataset("snli")
os.makedirs("snli_modified", exist_ok=True)

for split in tqdm(dataset.keys(), desc="Processing splits"):
    print(f"Processing {split}")
    data = dataset[split]

    rows = []
    for example in tqdm(data, desc=f"Processing examples in {split}"):
        premise = example["premise"]
        hypothesis = example["hypothesis"]
        label = example["label"]
        
        # make sure that text is not empty
        if not premise or not hypothesis:
            continue

        # Skip if label is not in {0, 1, 2}
        if label not in {0, 1, 2}:
            continue

        # Flip a coin for similar/distinct
        want_similar = random.choice([True, False])

        # Check current similarity
        initial_sim = compare_texts(premise, hypothesis)
        # currently_similar = (initial_sim == 1.0)

        if want_similar:
            # Make them similar
            premise, hypothesis = make_texts_similar(premise, hypothesis)
        else:
            # Make them distinct
            hypothesis = make_texts_distinct(premise, hypothesis)

        # Re-check similarity after transformations
        final_sim = compare_texts(premise, hypothesis)
        style = 1 if final_sim == 1.0 else 0 # 1 for similar, 0 for distinct

        rows.append({
            "premise": premise,
            "hypothesis": hypothesis,
            "premise_original": example["premise"],
            "hypothesis_original": example["hypothesis"],
            "nli": label, # 0 entailment, 1 neutral, 2 contradiction
            "style": style # 0 distinct, 1 similar
        })

    df = pd.DataFrame(rows, columns=["premise", "hypothesis", "premise_original", "hypothesis_original", "nli", "style"])
    output_file = f"snli_modified/{split}_modified.tsv"
    df.to_csv(output_file, index=False, encoding='utf-8', sep="\t")

Processing splits:   0%|          | 0/3 [00:00<?, ?it/s]

Processing test



Processing examples in test:   0%|          | 0/10000 [00:00<?, ?it/s][A
Processing examples in test:   0%|          | 4/10000 [00:02<1:23:36,  1.99it/s][A
Processing examples in test:   0%|          | 9/10000 [00:02<50:51,  3.27it/s]  [A
Processing examples in test:   0%|          | 11/10000 [00:04<1:00:48,  2.74it/s][A
Processing examples in test:   0%|          | 12/10000 [00:05<1:41:52,  1.63it/s][A
Processing examples in test:   0%|          | 15/10000 [00:07<1:24:38,  1.97it/s][A
Processing examples in test:   0%|          | 16/10000 [00:08<1:58:24,  1.41it/s][A
Processing examples in test:   0%|          | 17/10000 [00:10<2:32:22,  1.09it/s][A
Processing examples in test:   0%|          | 18/10000 [00:12<3:01:24,  1.09s/it][A
Processing examples in test:   0%|          | 19/10000 [00:14<3:28:27,  1.25s/it][A
Processing examples in test:   0%|          | 20/10000 [00:14<3:07:43,  1.13s/it][A
Processing examples in test:   0%|          | 22/10000 [00:15<2:18:43,  1.20i

Failed to transform with A snow field with a snowboarder on it. Retrying 5 more times...
Failed to transform with A snow field with a snowboarder on it. Retrying 4 more times...
Failed to transform with A snow field with a snowboarder on it. Retrying 3 more times...
Failed to transform with A snow field with a snowboarder on it. Retrying 2 more times...



Processing examples in test:   1%|          | 67/10000 [00:55<1:52:24,  1.47it/s][A

Failed to transform with A snow field with a snowboarder on it. Retrying 1 more times...



Processing examples in test:   1%|          | 68/10000 [00:57<2:14:28,  1.23it/s][A
Processing examples in test:   1%|          | 69/10000 [00:59<2:34:24,  1.07it/s][A
Processing examples in test:   1%|          | 76/10000 [01:01<1:31:54,  1.80it/s][A
Processing examples in test:   1%|          | 78/10000 [01:02<1:35:30,  1.73it/s][A
Processing examples in test:   1%|          | 79/10000 [01:04<1:51:09,  1.49it/s][A
Processing examples in test:   1%|          | 80/10000 [01:06<2:17:13,  1.20it/s][A
Processing examples in test:   1%|          | 81/10000 [01:07<2:31:11,  1.09it/s][A
Processing examples in test:   1%|          | 83/10000 [01:09<2:26:35,  1.13it/s][A
Processing examples in test:   1%|          | 90/10000 [01:10<1:20:37,  2.05it/s][A
Processing examples in test:   1%|          | 91/10000 [01:11<1:26:58,  1.90it/s][A
Processing examples in test:   1%|          | 94/10000 [01:12<1:23:31,  1.98it/s][A
Processing examples in test:   1%|          | 95/10000 [01:13<1:

Failed to transform Two teenage girls conversing next to lockers. or Girls talking about their problems next to lockers.. Retrying 5 more times ...
Failed to transform Two teenage girls conversing next to lockers. or Girls talking about their problems next to lockers.. Retrying 4 more times ...
Failed to transform Two teenage girls a-conversing next to lockers. or Girls talking about their problems next to lockers.. Retrying 3 more times ...
Failed to transform Two teenage girl a-a conversing next to lockers. or Girls talking about their problems next to lockers.. Retrying 2 more times ...



Processing examples in test:   1%|          | 123/10000 [01:40<4:24:20,  1.61s/it][A

Failed to transform Two teenage girl a-a conversing next to lockers. or Girls talking about their problems next to lockers.. Retrying 1 more times ...



Processing examples in test:   1%|          | 124/10000 [01:42<4:34:03,  1.66s/it][A
Processing examples in test:   1%|▏         | 128/10000 [01:44<3:00:22,  1.10s/it][A
Processing examples in test:   1%|▏         | 134/10000 [01:45<1:39:49,  1.65it/s][A
Processing examples in test:   1%|▏         | 140/10000 [01:47<1:20:32,  2.04it/s][A
Processing examples in test:   1%|▏         | 141/10000 [01:48<1:40:08,  1.64it/s][A
Processing examples in test:   1%|▏         | 144/10000 [01:50<1:29:22,  1.84it/s][A
Processing examples in test:   1%|▏         | 148/10000 [01:51<1:19:52,  2.06it/s][A
Processing examples in test:   2%|▏         | 154/10000 [01:53<1:08:56,  2.38it/s][A
Processing examples in test:   2%|▏         | 155/10000 [01:54<1:23:45,  1.96it/s][A
Processing examples in test:   2%|▏         | 159/10000 [01:55<1:06:32,  2.47it/s][A
Processing examples in test:   2%|▏         | 160/10000 [01:57<1:21:10,  2.02it/s][A
Processing examples in test:   2%|▏         | 161/100

In [1]:
from textflint.transformation.universal.char_typos import CharTypos

original_text = "This is a sample sentence."
transformer = CharTypos()

# The transform() method expects a list of strings and returns a list of variants.
perturbed_variants = transformer.transform([original_text])

print("Original Text:")
print(original_text)
print("\nPerturbed Variants:")
for i, variant in enumerate(perturbed_variants, start=1):
    print(f"{i}. {variant}")

[34;1mTextFlint[0m: Downloading http://textflint.oss-cn-beijing.aliyuncs.com/download/NLTK_DATA/wordnet/wordnet.zip.
100%|██████████| 10.8M/10.8M [00:01<00:00, 5.97MB/s]
[34;1mTextFlint[0m: Unzipping file /Users/anna/.cache/textflint/tmp3a_4wsdp to /Users/anna/.cache/textflint/NLTK_DATA/wordnet.
[34;1mTextFlint[0m: Successfully saved NLTK_DATA/wordnet/wordnet.zip to cache.


ModuleNotFoundError: No module named 'textflint.transformation'