# Classification Data

In [None]:
import flair
from flair.data import Sentence
from flair.models import SequenceTagger

# Load the pretrained Flair NER model "flair/ner-english-large" once; the
# Flair anonymization function reads it through the module-level name `tagger`.
tagger = SequenceTagger.load("flair/ner-english-large")

In [None]:
from tqdm import tqdm
import csv

In [None]:
def replace_entities_placeholder_flair(text):
    """Replace ORG/PER/LOC entities found by the Flair tagger with placeholders.

    Runs the module-level ``tagger`` on ``text`` and substitutes every NER
    span labelled ``ORG``, ``PER`` or ``LOC`` with the literal ``"ORG"``,
    ``"PERSON"`` or ``"LOCATION"`` respectively. Spans carrying any other
    label are left untouched.

    Args:
        text: Raw input string to anonymize.

    Returns:
        The text with all matching spans replaced, or ``text`` itself when
        no span has one of the handled labels.
    """
    # Flair label -> placeholder written into the output text.
    placeholders = {"ORG": "ORG", "PER": "PERSON", "LOC": "LOCATION"}

    sentence = Sentence(text)
    tagger.predict(sentence)

    pieces = []
    cursor = 0  # end offset of the last replaced span in `text`
    # NOTE(review): assumes get_spans yields spans in text order and without
    # overlap, since the output is stitched left to right -- confirm.
    for entity in sentence.get_spans('ner'):
        placeholder = placeholders.get(entity.get_label().value)
        if placeholder is None:
            continue
        # A span whose text already equals its placeholder is rewritten to
        # itself (a no-op) rather than aborting the whole run via an assert.
        pieces.append(text[cursor:entity.start_position])
        pieces.append(placeholder)
        cursor = entity.end_position

    if not pieces:
        return text

    # Keep whatever follows the last replaced span.
    pieces.append(text[cursor:])
    return ''.join(pieces)

In [None]:
# Import the Hugging Face `datasets` loader here so this cell runs on a fresh
# kernel; no other cell in this notebook brings `load_dataset` into scope.
from datasets import load_dataset

# Load the IMDB dataset; the cells below use its "train", "unsupervised" and
# "test" splits.
cls_data = load_dataset("imdb")

In [None]:
# Display the loaded dataset object for inspection.
cls_data

In [None]:
# Pull the three IMDB splits used by the export cells out of the loaded dataset.
train_data, unsup_data, test_data = (
    cls_data["train"],
    cls_data["unsupervised"],
    cls_data["test"],
)

In [None]:
# Show the raw text of one training example (index 5) before any cleaning.
train_data[5]['text']

In [None]:
# Spot-check the Flair anonymizer on one training example (index 10), applying
# the same "<br />" clean-up that the export cells use.
replace_entities_placeholder_flair(train_data[10]['text'].replace("<br /><br />", " ").replace("<br />", ""))

In [None]:
# Anonymize the IMDB training split with Flair, keeping the (text, label)
# pairs in memory and writing them to CSV as they are produced.
train_pairs_placeholder = []
# newline="" lets the csv module control line endings, as its documentation
# requires; explicit UTF-8 keeps the output independent of the platform's
# default encoding. open() does not create the target directory.
with open(
    "anonymized_flair/classification_placeholder/imdb_train.csv",
    "w",
    newline="",
    encoding="utf-8",
) as f:
    writer = csv.writer(f)
    writer.writerow(["text", "label"])
    for p in tqdm(train_data):
        # Drop HTML line breaks first: a double break becomes a single space,
        # any remaining single break is removed.
        cleaned = p['text'].replace("<br /><br />", " ").replace("<br />", "")
        src = replace_entities_placeholder_flair(cleaned)
        train_pairs_placeholder.append((src, p['label']))
        writer.writerow((src, p['label']))

In [None]:
# Anonymize the IMDB test split with Flair, keeping the (text, label) pairs
# in memory and writing them to CSV as they are produced.
test_pairs_placeholder = []
# newline="" lets the csv module control line endings, as its documentation
# requires; explicit UTF-8 keeps the output independent of the platform's
# default encoding. open() does not create the target directory.
with open(
    "anonymized_flair/classification_placeholder/imdb_test.csv",
    "w",
    newline="",
    encoding="utf-8",
) as f:
    writer = csv.writer(f)
    writer.writerow(["text", "label"])
    for p in tqdm(test_data):
        # Drop HTML line breaks first: a double break becomes a single space,
        # any remaining single break is removed.
        cleaned = p['text'].replace("<br /><br />", " ").replace("<br />", "")
        src = replace_entities_placeholder_flair(cleaned)
        test_pairs_placeholder.append((src, p['label']))
        writer.writerow((src, p['label']))

In [None]:
# Anonymize the IMDB unsupervised split with Flair, keeping the (text, label)
# pairs in memory and writing them to CSV as they are produced.
unsup_pairs_placeholder = []
# newline="" lets the csv module control line endings, as its documentation
# requires; explicit UTF-8 keeps the output independent of the platform's
# default encoding. open() does not create the target directory.
with open(
    "anonymized_flair/classification_placeholder/imdb_unsup.csv",
    "w",
    newline="",
    encoding="utf-8",
) as f:
    writer = csv.writer(f)
    writer.writerow(["text", "label"])
    for p in tqdm(unsup_data):
        # Drop HTML line breaks first: a double break becomes a single space,
        # any remaining single break is removed.
        cleaned = p['text'].replace("<br /><br />", " ").replace("<br />", "")
        src = replace_entities_placeholder_flair(cleaned)
        unsup_pairs_placeholder.append((src, p['label']))
        writer.writerow((src, p['label']))

# spaCy

In [None]:
import spacy
# Load the spaCy pipeline "en_core_web_sm" once; the spaCy anonymization
# function reads it through the module-level name `nlp`.
nlp = spacy.load('en_core_web_sm')

In [None]:
def replace_entities_placeholder_spacy(text):
    """Replace ORG/PERSON/GPE entity tokens found by spaCy with placeholders.

    Runs the module-level ``nlp`` pipeline on ``text`` and substitutes every
    token whose entity type is ``ORG``, ``PERSON`` or ``GPE`` with the literal
    ``"ORG"``, ``"PERSON"`` or ``"LOCATION"`` respectively. Tokens with any
    other entity type, or none, are left untouched.

    Replacement is done per token, so an entity made of several tokens
    produces one placeholder per token.
    NOTE(review): the Flair variant replaces whole spans instead -- confirm
    the per-token behaviour here is intended.

    Args:
        text: Raw input string to anonymize.

    Returns:
        The text with all matching tokens replaced, or ``text`` itself when
        no token has one of the handled entity types.
    """
    # spaCy entity type -> placeholder written into the output text.
    placeholders = {"ORG": "ORG", "PERSON": "PERSON", "GPE": "LOCATION"}

    parsed = nlp(text)

    pieces = []
    cursor = 0  # end offset of the last replaced token in `text`
    for word in parsed:
        placeholder = placeholders.get(word.ent_type_)
        if placeholder is None:
            continue
        # A token whose text already equals its placeholder is rewritten to
        # itself (a no-op) rather than aborting the whole run via an assert.
        pieces.append(text[cursor:word.idx])
        pieces.append(placeholder)
        cursor = word.idx + len(word.text)

    if not pieces:
        return text

    # Keep whatever follows the last replaced token.
    pieces.append(text[cursor:])
    return ''.join(pieces)

# IMDB

In [None]:
# Anonymize the IMDB training split with spaCy, keeping the (text, label)
# pairs in memory and writing them to CSV as they are produced.
train_pairs_placeholder2 = []
# newline="" lets the csv module control line endings, as its documentation
# requires; explicit UTF-8 keeps the output independent of the platform's
# default encoding. open() does not create the target directory.
with open(
    "anonymized_spacy/classification_placeholder/imdb_train.csv",
    "w",
    newline="",
    encoding="utf-8",
) as f:
    writer = csv.writer(f)
    writer.writerow(["text", "label"])
    for p in tqdm(train_data):
        # Drop HTML line breaks first: a double break becomes a single space,
        # any remaining single break is removed.
        cleaned = p['text'].replace("<br /><br />", " ").replace("<br />", "")
        src = replace_entities_placeholder_spacy(cleaned)
        train_pairs_placeholder2.append((src, p['label']))
        writer.writerow((src, p['label']))

In [None]:
# Anonymize the IMDB test split with spaCy, keeping the (text, label) pairs
# in memory and writing them to CSV as they are produced.
test_pairs_placeholder2 = []
# newline="" lets the csv module control line endings, as its documentation
# requires; explicit UTF-8 keeps the output independent of the platform's
# default encoding. open() does not create the target directory.
with open(
    "anonymized_spacy/classification_placeholder/imdb_test.csv",
    "w",
    newline="",
    encoding="utf-8",
) as f:
    writer = csv.writer(f)
    writer.writerow(["text", "label"])
    for p in tqdm(test_data):
        # Drop HTML line breaks first: a double break becomes a single space,
        # any remaining single break is removed.
        cleaned = p['text'].replace("<br /><br />", " ").replace("<br />", "")
        src = replace_entities_placeholder_spacy(cleaned)
        test_pairs_placeholder2.append((src, p['label']))
        writer.writerow((src, p['label']))

In [None]:
# Anonymize the IMDB unsupervised split with spaCy, keeping the (text, label)
# pairs in memory and writing them to CSV as they are produced.
unsup_pairs_placeholder2 = []
# newline="" lets the csv module control line endings, as its documentation
# requires; explicit UTF-8 keeps the output independent of the platform's
# default encoding. open() does not create the target directory.
with open(
    "anonymized_spacy/classification_placeholder/imdb_unsup.csv",
    "w",
    newline="",
    encoding="utf-8",
) as f:
    writer = csv.writer(f)
    writer.writerow(["text", "label"])
    for p in tqdm(unsup_data):
        # Drop HTML line breaks first: a double break becomes a single space,
        # any remaining single break is removed.
        cleaned = p['text'].replace("<br /><br />", " ").replace("<br />", "")
        src = replace_entities_placeholder_spacy(cleaned)
        unsup_pairs_placeholder2.append((src, p['label']))
        writer.writerow((src, p['label']))