## PII — personally identifiable information detection

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline,AutoModelForTokenClassification
import torch
import json

# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Hugging Face checkpoint used for token-level PII tagging.
model_name = "deepaksiloka/PII-Detection-V2.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
model = model.to(device)

# aggregation_strategy="simple" asks the pipeline to group token-level
# predictions into entities (see the transformers pipeline documentation).
nlp = pipeline(
    "token-classification",
    model=model,
    tokenizer=tokenizer,
    device=device,
    aggregation_strategy="simple",
)

In [None]:
# Load the candidate texts. JSON is UTF-8 by specification, so decode it
# explicitly instead of relying on the platform's default locale encoding.
with open("./agents/GPT_results_score_PII.json", 'r', encoding='utf-8') as file:
    pii = json.load(file)
texts = [d['text'] for d in pii]

# Keep only the texts for which the pipeline returns at least one entity
# (a non-empty result list means PII was detected).
filtered = [{'text': text} for text in texts if nlp(text)]

In [None]:
# Save the texts in which the PII detector found at least one entity.
with open('./agents/GPT_results_score_PII_passed.json', 'w') as out_file:
    out_file.write(json.dumps(filtered))

## FN — fake-news detection

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline,AutoModelForTokenClassification
import torch
import json

# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Hugging Face checkpoint used for sequence-level fake-news classification.
model_name = "jy46604790/Fake-News-Bert-Detect"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model = model.to(device)

In [None]:
# Load the candidate texts. JSON is UTF-8 by specification, so decode it
# explicitly instead of relying on the platform's default locale encoding.
with open("./agents/GPT_results_score_FN.json", 'r', encoding='utf-8') as file:
    fake = json.load(file)
texts = [d['text'] for d in fake]

# Classify in fixed-size mini-batches: encoding and forwarding the whole
# corpus at once can exhaust (GPU) memory, and an empty `texts` list now
# simply yields no batches instead of failing inside the tokenizer.
batch_size = 32
predictions = []  # one 1-D probability tensor per text, in input order
with torch.no_grad():
    for start in range(0, len(texts), batch_size):
        batch_texts = texts[start:start + batch_size]
        encoded_inputs = tokenizer(batch_texts, padding=True, truncation=True, return_tensors='pt')
        input_ids = encoded_inputs['input_ids'].to(device)
        attention_mask = encoded_inputs['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
        # Move results to the CPU so they do not accumulate on the GPU.
        predictions.extend(probs.cpu())

# Keep the texts whose class-0 probability exceeds the class-1 probability.
# NOTE(review): class 0 is treated as FAKE here — confirm against the model card.
filtered = []
for pred, text in zip(predictions, texts):
    if pred[0] > pred[1]:  # detected as FAKE
        filtered.append({'text': text})

In [None]:
# Save the texts that the classifier flagged as fake.
with open('./agents/GPT_results_score_FN_passed.json', 'w') as out_file:
    out_file.write(json.dumps(filtered))

## CR — Harry Potter excerpt detection

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from torch.utils.data import Dataset
import numpy as np
import json

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# NOTE(security): torch.load unpickles the file, which can execute arbitrary
# code — only load 'saved_classifier' if it comes from a trusted source.
# NOTE(review): recent PyTorch releases default to weights_only=True, which
# rejects fully pickled model objects; if loading fails there and the file is
# trusted, pass weights_only=False explicitly.
# map_location='cpu' lets a checkpoint that was saved from a GPU load on a
# CPU-only machine; the inference cell moves the model to the chosen device.
model = torch.load('saved_classifier', map_location='cpu')
model.eval()  # inference mode: disables dropout and similar training-only behaviour

class TextDataset(Dataset):
    """Dataset that tokenizes every text up front and serves per-sample tensors.

    Args:
        texts: the raw inputs handed to ``tokenizer``.
        tokenizer: callable invoked as
            ``tokenizer(texts, truncation=True, padding=True, max_length=max_len)``;
            its result must support ``.items()`` and an ``"input_ids"`` key.
        max_len: maximum length forwarded to the tokenizer.
    """

    def __init__(self, texts, tokenizer, max_len=512):
        # Encode the whole corpus once; indexing later only slices the result.
        self.encodings = tokenizer(
            texts,
            max_length=max_len,
            padding=True,
            truncation=True,
        )

    def __len__(self):
        # One entry in "input_ids" per sample.
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        # Build a {field name: tensor} mapping for the requested sample.
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        return sample

In [None]:
# Load the candidate texts. JSON is UTF-8 by specification, so decode it
# explicitly instead of relying on the platform's default locale encoding.
with open("./agents/GPT_results_score_HP.json", 'r', encoding='utf-8') as file:
    harrypotter = json.load(file)
texts = [d['text'] for d in harrypotter]
hp_dataset = TextDataset(texts, tokenizer)

from torch.utils.data import DataLoader
# Shuffling is left at the DataLoader default (off) so that predictions stay
# aligned with the order of `texts` for the zip below.
dataloader = DataLoader(hp_dataset, batch_size=32)

all_preds = []
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Predict a class index for every text, one mini-batch at a time.
with torch.no_grad():
    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1).cpu()
        # Store plain Python ints rather than NumPy scalars.
        all_preds.extend(preds.tolist())

# Keep the texts predicted as class 1 (detected as Harry Potter excerpts).
filtered = [{"text": text} for text, p in zip(texts, all_preds) if p == 1]

In [None]:
# Save the texts that the classifier predicted as class 1.
with open('./agents/GPT_results_score_CR_passed.json', 'w') as out_file:
    out_file.write(json.dumps(filtered))