# Classification Data

In [9]:
import flair #

from flair.data import Sentence
from flair.models import SequenceTagger
# Load the pretrained English NER tagger by its model name.
tagger = SequenceTagger.load(
    "flair/ner-english-large"
)
# Alternative: load a local checkpoint instead of the named model.
# model_name_or_path = r"D:\My Computer\py project\last\model\flair\pytorch_model.bin"
# tagger = SequenceTagger.load(model_name_or_path)

2024-03-04 22:45:47,274 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>


In [10]:
import torch
# Report CUDA availability and the installed PyTorch version, one value per line.
print(torch.cuda.is_available(), torch.__version__, sep="\n")

True
2.2.1+cu121


In [11]:
from tqdm import tqdm
import csv

In [12]:
def replace_entities_placeholder_flair(text):
    """Replace ORG / PER / LOC entities found by the Flair tagger with placeholders.

    Runs the module-level ``tagger`` on ``text`` and substitutes every span
    labelled ``ORG``, ``PER`` or ``LOC`` with ``"ORG"``, ``"PERSON"`` or
    ``"LOCATION"`` respectively. Spans carrying any other label are left
    untouched.

    Args:
        text: The string to anonymize.

    Returns:
        The text with the selected entities replaced by their placeholders,
        or ``text`` unchanged when it is empty/blank or nothing was replaced.
    """
    # Tagger label -> placeholder written into the output text.
    placeholders = {"ORG": "ORG", "PER": "PERSON", "LOC": "LOCATION"}

    # Nothing to tag in an empty or whitespace-only string; skip the model call.
    if not text or not text.strip():
        return text

    sentence = Sentence(text)
    tagger.predict(sentence)

    # Collect (start offset, end offset, placeholder) for each entity of interest.
    # There is deliberately no assertion that the entity text differs from its
    # placeholder: an entity literally spelled "ORG" is valid input, and
    # replacing it with "ORG" is a harmless no-op rather than a reason to abort.
    replacements = []
    for entity in sentence.get_spans('ner'):
        repl = placeholders.get(entity.get_label().value)
        if repl is not None:
            replacements.append((entity.start_position, entity.end_position, repl))

    if not replacements:
        return text

    # Stitch the untouched stretches of text together with the placeholders.
    pieces = []
    cursor = 0  # offset in `text` just past the last replaced entity
    for start, end, repl in replacements:
        pieces.append(text[cursor:start])
        pieces.append(repl)
        cursor = end
    pieces.append(text[cursor:])
    return ''.join(pieces)

In [13]:
from datasets import load_dataset
# Load the IMDB dataset by name; the result holds the train/test/unsupervised splits.
cls_data = load_dataset(
    "imdb",
)

In [14]:
# Display the loaded dataset: its splits, feature names and row counts.
cls_data

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [15]:
# Bind each IMDB split to its own name (same lookup order: train, unsupervised, test).
train_data, unsup_data, test_data = (
    cls_data[split] for split in ('train', 'unsupervised', 'test')
)


In [16]:
# Show the raw text of training example 5; the review still contains "<br />" markup.
train_data[5]['text']

"I would put this at the top of my list of films in the category of unwatchable trash! There are films that are bad, but the worst kind are the ones that are unwatchable but you are suppose to like them because they are supposed to be good for you! The sex sequences, so shocking in its day, couldn't even arouse a rabbit. The so called controversial politics is strictly high school sophomore amateur night Marxism. The film is self-consciously arty in the worst sense of the term. The photography is in a harsh grainy black and white. Some scenes are out of focus or taken from the wrong angle. Even the sound is bad! And some people call this art?<br /><br />"

In [17]:
# Strip the HTML line breaks from one review, then pass the cleaned text to the
# Flair-based replacement function and display the anonymized result.
replace_entities_placeholder_flair(
    train_data[10]['text'].replace("<br /><br />", " ").replace("<br />", "")
)

'It was great to see some of my favorite stars of 30 years ago including PERSON, PERSON and PERSON. They looked quite wonderful. But that was it. They were not given any characters or good lines to work with. I neither understood or cared what the characters were doing. Some of the smaller female roles were fine, PERSON and PERSON were quite competent and confident in their small sidekick parts. They showed some talent and it is sad they didn\'t go on to star in more and better films. Sadly, I didn\'t think PERSON got a chance to act in this her only important film role. The film appears to have some fans, and I was very open-minded when I started watching it. I am a big PERSON fan and I enjoyed his last movie, "Cat\'s Meow" and all his early ones from "Targets" to "Nickleodeon". So, it really surprised me that I was barely able to keep awake watching this one. It is ironic that this movie is about a detective agency where the detectives and clients get romantically involved with each 

In [19]:
# Anonymize the IMDB training split with the Flair tagger and write it to CSV.
train_pairs_placeholder = []
# newline='' is required for files handed to csv.writer: without it, platforms
# that translate "\n" on write turn the writer's "\r\n" row terminator into
# "\r\r\n", which shows up as a blank line after every row.
with open("data/flair/imdb_train.csv", "w", encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["text", "label"])  # header row
    for p in tqdm(train_data):
        # Pre-process before tagging: a double "<br />" becomes a space,
        # any remaining single "<br />" is dropped.
        src = replace_entities_placeholder_flair(p['text'].replace("<br /><br />", " ").replace("<br />", ""))
        # Keep the pair in memory and also write it to the CSV file.
        train_pairs_placeholder.append((src, p['label']))
        writer.writerow((src, p['label']))

100%|██████████| 25000/25000 [9:34:09<00:00,  1.38s/it]   


In [None]:
# Anonymize the IMDB test split with the Flair tagger and write it to CSV.
test_pairs_placeholder = []
# newline='' is required for files handed to csv.writer: without it, platforms
# that translate "\n" on write turn the writer's "\r\n" row terminator into
# "\r\r\n", which shows up as a blank line after every row.
with open("data/flair/imdb_test.csv", "w", encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["text", "label"])  # header row
    for p in tqdm(test_data):
        # Pre-process before tagging: a double "<br />" becomes a space,
        # any remaining single "<br />" is dropped.
        src = replace_entities_placeholder_flair(p['text'].replace("<br /><br />", " ").replace("<br />", ""))
        # Keep the pair in memory and also write it to the CSV file.
        test_pairs_placeholder.append((src, p['label']))
        writer.writerow((src, p['label']))

In [None]:
# Anonymize the IMDB unsupervised (unlabelled) split with the Flair tagger and write it to CSV.
unsup_pairs_placeholder = []
# newline='' is required for files handed to csv.writer: without it, platforms
# that translate "\n" on write turn the writer's "\r\n" row terminator into
# "\r\r\n", which shows up as a blank line after every row.
with open("data/flair/imdb_unsup.csv", "w", encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["text", "label"])  # header row
    for p in tqdm(unsup_data):
        # Pre-process before tagging: a double "<br />" becomes a space,
        # any remaining single "<br />" is dropped.
        src = replace_entities_placeholder_flair(p['text'].replace("<br /><br />", " ").replace("<br />", ""))
        # Keep the pair in memory and also write it to the CSV file.
        unsup_pairs_placeholder.append((src, p['label']))
        writer.writerow((src, p['label']))

# spaCy

In [None]:
import spacy
# Load the `en_core_web_sm` pipeline; it is used below for entity recognition.
nlp = spacy.load(
    'en_core_web_sm'
)

In [None]:
def replace_entities_placeholder_spacy(text):
    """Replace ORG / PERSON / GPE entities found by spaCy with placeholders.

    Runs the module-level ``nlp`` pipeline on ``text`` and substitutes every
    entity labelled ``ORG``, ``PERSON`` or ``GPE`` with ``"ORG"``,
    ``"PERSON"`` or ``"LOCATION"`` respectively. Entities with any other
    label are left untouched.

    Each entity is replaced as a whole, so a multi-token name such as
    "John Smith" yields a single ``PERSON`` placeholder, matching the
    behaviour of the Flair-based variant in this notebook.

    Args:
        text: The string to anonymize.

    Returns:
        The text with the selected entities replaced by their placeholders,
        or ``text`` unchanged when it is empty/blank or nothing was replaced.
    """
    # spaCy entity label -> placeholder written into the output text.
    placeholders = {"ORG": "ORG", "PERSON": "PERSON", "GPE": "LOCATION"}

    # Nothing to tag in an empty or whitespace-only string; skip the pipeline.
    if not text or not text.strip():
        return text

    parsed = nlp(text)

    # Walk the entity spans, copying the untouched text between them and
    # emitting one placeholder per selected entity. There is deliberately no
    # assertion that the entity text differs from its placeholder: an entity
    # literally spelled "ORG" is valid input and replacing it is a no-op.
    pieces = []
    cursor = 0  # character offset in `text` just past the last replaced entity
    for ent in parsed.ents:
        repl = placeholders.get(ent.label_)
        if repl is None:
            continue
        pieces.append(text[cursor:ent.start_char])
        pieces.append(repl)
        cursor = ent.end_char

    if not pieces:
        return text

    pieces.append(text[cursor:])
    return ''.join(pieces)

# IMDB

In [None]:
# Anonymize the IMDB training split with the spaCy pipeline and write it to CSV.
train_pairs_placeholder2 = []
# newline='' is required for files handed to csv.writer: without it, platforms
# that translate "\n" on write turn the writer's "\r\n" row terminator into
# "\r\r\n", which shows up as a blank line after every row.
with open("data/spacy/imdb_train.csv", "w", encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["text", "label"])  # header row
    for p in tqdm(train_data):
        # Pre-process before tagging: a double "<br />" becomes a space,
        # any remaining single "<br />" is dropped.
        src = replace_entities_placeholder_spacy(p['text'].replace("<br /><br />", " ").replace("<br />", ""))
        # Keep the pair in memory and also write it to the CSV file.
        train_pairs_placeholder2.append((src, p['label']))
        writer.writerow((src, p['label']))

In [None]:
# Anonymize the IMDB test split with the spaCy pipeline and write it to CSV.
test_pairs_placeholder2 = []
# newline='' is required for files handed to csv.writer: without it, platforms
# that translate "\n" on write turn the writer's "\r\n" row terminator into
# "\r\r\n", which shows up as a blank line after every row.
with open("data/spacy/imdb_test.csv", "w", encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["text", "label"])  # header row
    for p in tqdm(test_data):
        # Pre-process before tagging: a double "<br />" becomes a space,
        # any remaining single "<br />" is dropped.
        src = replace_entities_placeholder_spacy(p['text'].replace("<br /><br />", " ").replace("<br />", ""))
        # Keep the pair in memory and also write it to the CSV file.
        test_pairs_placeholder2.append((src, p['label']))
        writer.writerow((src, p['label']))

In [None]:
# Anonymize the IMDB unsupervised (unlabelled) split with the spaCy pipeline and write it to CSV.
unsup_pairs_placeholder2 = []
# newline='' is required for files handed to csv.writer: without it, platforms
# that translate "\n" on write turn the writer's "\r\n" row terminator into
# "\r\r\n", which shows up as a blank line after every row.
with open("data/spacy/imdb_unsup.csv", "w", encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["text", "label"])  # header row
    for p in tqdm(unsup_data):
        # Pre-process before tagging: a double "<br />" becomes a space,
        # any remaining single "<br />" is dropped.
        src = replace_entities_placeholder_spacy(p['text'].replace("<br /><br />", " ").replace("<br />", ""))
        # Keep the pair in memory and also write it to the CSV file.
        unsup_pairs_placeholder2.append((src, p['label']))
        writer.writerow((src, p['label']))