In [None]:
import torch
# Use the first CUDA GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# Shell escape: recursively delete /content/cardiffnlp.
# NOTE(review): this looks like a Colab path, while later cells read Windows paths — confirm it is still needed.
!rm -r /content/cardiffnlp

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from tqdm.notebook import trange, tqdm

from sklearn.metrics import precision_score, \
    recall_score, confusion_matrix, classification_report, \
    accuracy_score, f1_score

from sklearn.metrics import precision_recall_fscore_support

def simple_oneshot_evaluator(func, X_test, y_test):
  """Classify every row of ``X_test`` with ``func`` and report the metrics.

  Args:
    func: Callable that takes one text string and returns a predicted label.
    X_test: pandas DataFrame with a ``text`` column; rows are scored one at a time.
    y_test: Ground-truth labels aligned with the rows of ``X_test``.

  Returns:
    The ``classification_report`` string for the predictions.

  Note:
    ``precision_score`` / ``recall_score`` / ``f1_score`` are called without an
    ``average`` argument, so scikit-learn's default averaging applies.
  """
  # itertuples() yields a length-less iterator, so hand tqdm the row count;
  # without it the bar cannot show a percentage or an ETA.
  prediction = [func(item.text)
                for item in tqdm(X_test.itertuples(), total=len(X_test))]

  # Build the report once and reuse it for the printout and the return value.
  report = classification_report(y_test,prediction)
  print(confusion_matrix(y_test,prediction),accuracy_score(y_test,prediction),precision_score(y_test,prediction),recall_score(y_test,prediction),f1_score(y_test,prediction),report)
  return report

In [None]:
from typing import List

import torch
from torch.utils.data import Dataset
from tqdm import tqdm
from transformers import PreTrainedTokenizer

import nltk 
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
import numpy as np

import spacy


class EncodedDataset(Dataset):
  """Dataset that tokenizes one sentence per item at access time.

  Each item is ``(input_ids, attention_mask, label)``: the first two are tensors
  built from the tokenizer output, the label is returned exactly as stored.
  """

  def __init__(self, input_sents: List[str],
                input_labels: List[int],
                tokenizer: PreTrainedTokenizer,
                max_sequence_length: int = None,
                max_targets: int = 5):
    """Keep references to the sentences, labels, tokenizer and length settings."""
    self.tokenizer = tokenizer
    self.input_sents = input_sents
    self.input_labels = input_labels
    self.max_sequence_length = max_sequence_length
    # Stored only; no method of this class reads it.
    self.max_targets = max_targets

  def __len__(self):
    """Number of sentences in the dataset."""
    return len(self.input_sents)

  def __getitem__(self, index):
    """Tokenize sentence ``index`` and return it with its mask and label."""
    sentence, target = self.input_sents[index], self.input_labels[index]

    # Pad/truncate every sentence to the configured max_sequence_length.
    encoding = self.tokenizer(
        sentence,
        padding='max_length',
        max_length=self.max_sequence_length,
        truncation=True,
    )

    input_ids = torch.tensor(encoding['input_ids'])
    mask_ids = torch.tensor(encoding['attention_mask'])
    return input_ids, mask_ids, target

In [None]:
from torch.utils.data import DataLoader, RandomSampler

def evaluate(model, test_data, tokenizer, test_labels, max_sequence_length, learning_rate, test_batch_size, device):
    """Run ``model`` over a labelled test set and collect its predictions.

    NOTE(review): a later cell runs ``import evaluate``, which rebinds this
    name to the module; call this function before that cell or rename one of them.

    Args:
        model: Sequence-classification model whose output exposes ``.logits``.
        test_data: Sentences to classify.
        tokenizer: Tokenizer handed to ``EncodedDataset``.
        test_labels: Labels aligned with ``test_data``.
        max_sequence_length: Padding/truncation length handed to ``EncodedDataset``.
        learning_rate: Unused; kept so existing callers keep working.
        test_batch_size: Batch size of the evaluation ``DataLoader``.
        device: torch device the model and the input batches are moved to.

    Returns:
        ``(predictions, y_true)``: two lists of NumPy scalars in dataset order.
    """
    # Honour the caller's device instead of forcing CUDA, so CPU-only hosts
    # work and the model always sits on the same device as its inputs.
    model = model.to(device)
    test = EncodedDataset(input_sents=test_data,
                    input_labels=test_labels,
                    tokenizer=tokenizer,
                    max_sequence_length=max_sequence_length)

    test_dataloader = DataLoader(test, batch_size=test_batch_size)

    predictions = []
    y_true = []
    model.eval()
    with torch.no_grad():
        for test_input, test_mask, test_label in test_dataloader:
            output = model(input_ids=test_input.to(device),
                           attention_mask=test_mask.to(device))

            # One argmax per batch; the labels never need to leave the CPU.
            predictions.extend(output.logits.argmax(dim=1).cpu().numpy())
            y_true.extend(test_label.cpu().numpy())
    return predictions, y_true

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
from transformers import BertTokenizer, BertForSequenceClassification, AutoConfig
import torch.nn.functional as F
from transformers import pipeline
import torch

import csv
import urllib.request

# Build the tokenizer and a two-label classifier from the published checkpoint;
# the classifier's weights are then replaced by the locally fine-tuned ones.
hatebert_tokenizer = AutoTokenizer.from_pretrained("Hate-speech-CNERG/bert-base-uncased-hatexplain-rationale-two")
hatebert_model = AutoModelForSequenceClassification.from_pretrained("Hate-speech-CNERG/bert-base-uncased-hatexplain-rationale-two", num_labels=2)

# map_location remaps the stored tensors onto the active device, so a checkpoint
# saved from a GPU session still loads on a CPU-only machine.
# NOTE(review): torch.load unpickles the file — only load checkpoints you trust.
checkpoint = torch.load("C:/Users/psheth5/OneDrive - Arizona State University/HateSpeech Datasets/Baselines/HateBERT/Models/ConvAbuseEMNLPfull/pytorch_model.bin", map_location=device)
hatebert_model.load_state_dict(checkpoint)
hatebert_model.to(device)

def get_hatebert_prediction(sentence):
  """Return the index of the highest-scoring class for ``sentence``.

  Reads the module-level ``hatebert_tokenizer`` and ``hatebert_model``.

  Args:
    sentence: Text to classify; truncated to 512 tokens.

  Returns:
    The arg-max class index (NumPy integer) over the softmax probabilities.
  """
  # Send the inputs to wherever the model currently lives, so the call keeps
  # working if another cell rebinds hatebert_model to a model on a different device.
  encoded_input = hatebert_tokenizer(sentence, max_length=512, truncation=True, return_tensors='pt').to(hatebert_model.device)

  # Inference only: no_grad avoids building an autograd graph for every sentence.
  with torch.no_grad():
    output = hatebert_model(**encoded_input)

  probs = F.softmax(output.logits, dim=1)
  probs = probs.detach().cpu().numpy()[0]

  return np.argmax(probs)

In [None]:
from transformers import AutoTokenizer

import numpy as np
import evaluate
from transformers import AutoModelForSequenceClassification, BertTokenizer, BertForSequenceClassification, AutoConfig
from transformers import TrainingArguments, Trainer

# Per-device training batch size; Hate_Train passes it to TrainingArguments.
batch_size = 8

# NOTE(review): these two assignments rebind the module-level hatebert_tokenizer /
# hatebert_model that the earlier checkpoint-loading cell also sets, and the new
# model is not moved to `device` — run order therefore changes what later cells see.
hatebert_tokenizer = AutoTokenizer.from_pretrained("Hate-speech-CNERG/bert-base-uncased-hatexplain")
hatebert_model = AutoModelForSequenceClassification.from_pretrained("Hate-speech-CNERG/bert-base-uncased-hatexplain", num_labels=3)

def tokenize_function(examples):
    """Tokenize the ``text`` field of ``examples`` with the module-level tokenizer.

    Padding is set to ``"max_length"`` and truncation is enabled.
    """
    texts = examples["text"]
    return hatebert_tokenizer(texts, truncation=True, padding="max_length")

def Hate_Train(lgbt_dataset,filename):
    """Fine-tune the HateXplain BERT classifier on one dataset and save it.

    Args:
        lgbt_dataset: Dataset dictionary with a ``train`` split whose rows have a
            ``text`` column. (assumes a label column the Trainer can consume — TODO confirm)
        filename: Name of the output sub-folder under ``Baselines/HateXplain/Models/``.

    Side effects:
        Prints the tokenized dataset, trains the model and writes it to the
        output directory.
    """
    model_name = "Hate-speech-CNERG/bert-base-uncased-hatexplain"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

    def _tokenize(examples):
        # Use the tokenizer loaded above rather than a module-level global, so
        # the result does not depend on which cell last rebound that global.
        return tokenizer(examples["text"], padding="max_length", truncation=True)

    tokenized_datasets = lgbt_dataset.map(_tokenize, batched=True)
    print(tokenized_datasets)

    training_args = TrainingArguments(output_dir="Baselines/HateXplain/Models/"+str(filename),
                                    overwrite_output_dir=True,
                                    learning_rate=2e-5,
                                    weight_decay=0.01,
                                    per_device_train_batch_size=batch_size)

    metric = evaluate.load("accuracy")

    def compute_metrics(eval_pred):
        # Accuracy of the arg-max class against the reference labels.
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        return metric.compute(predictions=predictions, references=labels)

    # NOTE(review): no eval_dataset is passed, so compute_metrics presumably only
    # runs if evaluation is triggered separately — confirm whether that is intended.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        compute_metrics=compute_metrics,
    )

    trainer.train()
    # Writes the fine-tuned model into training_args.output_dir.
    trainer.save_model()

In [None]:
from datasets import load_dataset
import os

# Training CSVs; position i pairs with test_files[i] and dataset_names[i].
files = [
    'C:/Users/psheth5/OneDrive - Arizona State University/HateSpeech Datasets/Dataset-01_24/HateEval/HateEval_train_HB.csv',
    'C:/Users/psheth5/OneDrive - Arizona State University/HateSpeech Datasets/Dataset-01_24/HateEval/migrants/migrants_train_HB.csv',
    'C:/Users/psheth5/OneDrive - Arizona State University/HateSpeech Datasets/Dataset-01_24/HateEval/lgbt/lgbt_train_HB.csv',
]

test_files = [
    'C:/Users/psheth5/OneDrive - Arizona State University/HateSpeech Datasets/Dataset-01_24/HateEval/HateEval_test_HB.csv',
    'C:/Users/psheth5/OneDrive - Arizona State University/HateSpeech Datasets/Dataset-01_24/HateEval/migrants/migrants-test_HB.csv',
    'C:/Users/psheth5/OneDrive - Arizona State University/HateSpeech Datasets/Dataset-01_24/HateEval/lgbt/lgbt-test_HB.csv',
]

dataset_names = ["HateEval","HE_migrants","HE_lgbt"]
print(files)
filenames = set()
# Walk the parallel lists in lockstep and fine-tune one model per dataset;
# f remains the integer position, as the dataset name lookup needs it.
for f, (train, test) in enumerate(zip(files, test_files)):
    filenames.add(dataset_names[f])
    lgbt_data_files = {"train": train, "test": test}
    lgbt_dataset = load_dataset("csv", data_files=lgbt_data_files, sep=",")
    Hate_Train(lgbt_dataset, dataset_names[f])

In [None]:
import pandas as pd

import os

# Keep only the text/label columns of every test CSV and save the result next
# to the source file as "<name>_HB.csv".
# NOTE(review): this reads whatever test_files currently holds; if those paths
# already end in "_HB.csv" the output becomes "..._HB_HB.csv" — confirm the intended inputs.
for f in test_files:
    data = pd.read_csv(f)
    data = data[['text','label']]
    # splitext strips only the final extension, so a dot inside a folder name
    # cannot truncate the output path.
    data.to_csv(os.path.splitext(f)[0]+"_HB.csv",index=False)

In [None]:
import os
from transformers import BertTokenizer, BertForSequenceClassification, AutoConfig
import torch.nn.functional as F
from transformers import pipeline
import torch

import csv
import urllib.request
# Directory that is listed below; each entry is loaded as "<entry>/pytorch_model.bin".
PATH = "C:/Users/psheth5/OneDrive - Arizona State University/HateSpeech Datasets/Baselines/HateBERT/Models"

import os
import pandas as pd
import numpy as np

# Test CSVs scored against every checkpoint found under PATH.
# This rebinds test_files, which an earlier cell also assigns.
test_files = ['C:/Users/psheth5/OneDrive - Arizona State University/HateSpeech Datasets/Dataset-01_24/ConvAbuseEMNLPfull_Test_Modifiers.csv',
'C:/Users/psheth5/OneDrive - Arizona State University/HateSpeech Datasets/Dataset-01_24/FoxNews_Test_Modifiers.csv',
'C:/Users/psheth5/OneDrive - Arizona State University/HateSpeech Datasets/Dataset-01_24/GabHateCorpusannotations_Test_Modifiers.csv',
'C:/Users/psheth5/OneDrive - Arizona State University/HateSpeech Datasets/Dataset-01_24/ICWSM18SALMINEN_Test_Modifiers.csv',
'C:/Users/psheth5/OneDrive - Arizona State University/HateSpeech Datasets/Dataset-01_24/implicithatev1stg1posts_Test_Modifiers.csv',
'C:/Users/psheth5/OneDrive - Arizona State University/HateSpeech Datasets/Dataset-01_24/Reddit_Test_Modifiers.csv',
'C:/Users/psheth5/OneDrive - Arizona State University/HateSpeech Datasets/Dataset-01_24/WikiDetox_Test_Modifiers.csv',
'C:/Users/psheth5/OneDrive - Arizona State University/HateSpeech Datasets/Dataset-01_24/Synthetic_test.csv',
'C:/Users/psheth5/OneDrive - Arizona State University/HateSpeech Datasets/Dataset-01_24/Twi-Red-You_test.csv']

def get_hatebert_prediction(sentence):
  """Return the index of the highest-scoring class for ``sentence``.

  Reads the module-level ``hatebert_tokenizer`` and ``hatebert_model``, which
  the evaluation loop in this cell rebinds for every checkpoint.

  Args:
    sentence: Text to classify; truncated to 512 tokens.

  Returns:
    The arg-max class index (NumPy integer) over the softmax probabilities.
  """
  # Send the inputs to whichever device the current model lives on, so the
  # function works for CPU and GPU models alike.
  encoded_input = hatebert_tokenizer(sentence, max_length=512, truncation=True, return_tensors='pt').to(hatebert_model.device)

  # Inference only: no_grad avoids building an autograd graph for every sentence.
  with torch.no_grad():
    output = hatebert_model(**encoded_input)

  probs = F.softmax(output.logits, dim=1)
  probs = probs.detach().cpu().numpy()[0]

  return np.argmax(probs)




# Every entry under PATH is treated as a folder holding one fine-tuned checkpoint.
files = os.listdir(PATH)

# The tokenizer is identical for every checkpoint, so load it once.
hatebert_tokenizer = BertTokenizer.from_pretrained("Hate-speech-CNERG/bert-base-uncased-hatexplain")

for model_dir in files:
    # Rebuild the architecture, then overwrite its weights with the fine-tuned
    # ones. get_hatebert_prediction reads hatebert_model as a global.
    hatebert_model = BertForSequenceClassification.from_pretrained("Hate-speech-CNERG/bert-base-uncased-hatexplain")
    # The model stays on the CPU in this cell, so load the weights there too.
    checkpoint = torch.load(PATH+"/"+model_dir+"/pytorch_model.bin", map_location="cpu")
    hatebert_model.load_state_dict(checkpoint)

    # One report section per test set: dataset name, a separator line, the report.
    sections = []
    for test_file in test_files:
        data = pd.read_csv(test_file)
        sections.append(test_file.split("/")[-1].split("_")[0]+"\n")
        sections.append("*"*100+"\n")
        sections.append(simple_oneshot_evaluator(get_hatebert_prediction, data, data["label"])+ "\n")
    s = "".join(sections)

    # Name the results file after the checkpoint folder. The inner loop uses its
    # own variable, so the folder name is still a string here.
    with open("C:/Users/psheth5/OneDrive - Arizona State University/HateSpeech Datasets/Results/HateBert_{}.txt".format(model_dir.split("/")[-1].split("_")[0]),"w") as f1:
        f1.write(s)
 
