In [None]:
from google.colab import drive
from IPython.display import clear_output

# Attach Google Drive under /content/drive, then wipe the output the mount call printed.
drive.mount('/content/drive')
clear_output()

In [None]:
import os
import json
import csv
import pandas
import time
import random
from collections import Counter
from statistics import mean, median, stdev
import numpy as np
import matplotlib.pyplot as plt


# Length cap reused for both encoder and decoder: 156 is the longest HS-CN text.
max_len = 156
batch_size = 20

# Alternative input file, currently disabled:
# df = pandas.read_csv('drive/MyDrive/master/new_multi_conan.csv')
# Test-set CSV read from the mounted Drive folder.
val_df = pandas.read_csv('drive/MyDrive/master/testset_multi_conan.csv', sep=',')

### Setup

In [None]:
# Encoder and decoder share one length limit (max_len is assigned in the data-loading cell).
decoder_max_len = max_len
encoder_max_len = max_len

# Environment setup: torch and datasets are pinned, while transformers is installed
# from the repository's default branch, so its version can differ between runs.
# NOTE(review): confirm the unpinned transformers build still works with torch==1.6.0,
# or go back to a pinned release such as the one commented out below.
# !pip install transformers==4.0.0
!pip install torch==1.6.0

!pip install -U git+https://github.com/huggingface/transformers.git
!pip install datasets==1.0.2
import torch
from torch.utils.data import Dataset, DataLoader

import gc
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import AutoTokenizer, AutoModelForCausalLM

# Raise the root logger threshold to CRITICAL so lower-severity records are not emitted.
import logging
logging.getLogger().setLevel(logging.CRITICAL)

# Suppress all Python warnings.
import warnings
warnings.filterwarnings('ignore')

# Run on the GPU when one is available, otherwise stay on the CPU.
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

# Choose the base checkpoint name from the environment flag.
# NOTE(review): model_name is not read by any other visible cell (the model is
# loaded from a checkpoint path instead) -- confirm whether it is still needed.
is_cloud = True
model_name = 'gpt2'
if is_cloud:
    model_name = 'gpt2-large'

# Wipe the installation logs from the cell output.
clear_output()

In [None]:
class HSCNDataset2(Dataset):
    """Dataset of generation prompts built from the ``HATE_SPEECH`` column of a frame.

    Every item is the string ``"<start> {hate speech} <end>"``. The two markers are
    ``<hatespeech>`` / ``<counternarrative>`` when ``add_special_tokens`` is true and
    ``<|hatespeech|>`` / ``<|counternarrative|>`` otherwise.

    Args:
        df: pandas DataFrame that contains a ``HATE_SPEECH`` column.
        add_special_tokens: selects which pair of marker strings wraps each text.

    Raises:
        KeyError: if ``df`` has no ``HATE_SPEECH`` column.
    """

    def __init__(self, df, add_special_tokens=False):
        # super() delegates to the base class (Dataset) initialiser.
        super().__init__()

        if add_special_tokens:
            self.start_of_hs_token = "<hatespeech>"
            self.end_of_hs_token = "<counternarrative>"
        else:
            self.start_of_hs_token = "<|hatespeech|>"
            self.end_of_hs_token = "<|counternarrative|>"

        # Iterate the column directly: iterrows() builds a Series per row and may
        # upcast values when the frame mixes dtypes; only this one column is needed.
        self.hscn_list = [
            f"{self.start_of_hs_token} {hate_speech} {self.end_of_hs_token}"
            for hate_speech in df['HATE_SPEECH']
        ]

    def __len__(self):
        """Return the number of prompts."""
        return len(self.hscn_list)

    def __getitem__(self, item):
        """Return the prompt string stored at position ``item``."""
        return self.hscn_list[item]

In [None]:
gc.collect()  # drop unreachable Python objects first so cached CUDA memory can be released
torch.cuda.empty_cache()


def _generation_kwargs(decoding, p, k, nb, rp):
    """Return the `model.generate` keyword arguments for one decoding strategy name."""
    strategies = {
        'top-p': dict(do_sample=True, top_p=p, num_return_sequences=nb),
        'top-k': dict(do_sample=True, top_k=k, num_return_sequences=nb),
        # NOTE(review): do_sample=True is combined with num_beams=nb here, so this is
        # sampled rather than deterministic beam search, and with the default nb=1 it
        # is effectively plain sampling with a repetition penalty -- confirm intent.
        'beam-search': dict(num_beams=nb,
                            early_stopping=True,
                            num_return_sequences=nb,
                            repetition_penalty=rp,
                            do_sample=True),
        'k-p': dict(do_sample=True, top_k=k, top_p=p, num_return_sequences=nb),
    }
    if decoding not in strategies:
        raise ValueError(
            f"Unknown decoding strategy {decoding!r}; expected one of {sorted(strategies)}"
        )
    return strategies[decoding]


def generate_cn(model, hscndata, output_data, decoding, p=0.92, k=40, nb=1, rp=2.0, specialtokens=True):
    """Generate continuations for every prompt in `hscndata`, one prompt at a time.

    Relies on the notebook-level `tokenizer`, `device` and `decoder_max_len`.

    Args:
        model: object exposing `generate(input_ids=..., max_length=..., **kwargs)`.
        hscndata: indexable collection of prompt strings (supports `len` and `[i]`).
        output_data: frame whose head and length are printed for reference; it is
            not modified.
        decoding: one of 'top-p', 'top-k', 'beam-search', 'k-p'.
        p: nucleus probability mass used by 'top-p' and 'k-p'.
        k: number of candidate tokens used by 'top-k' and 'k-p'.
        nb: number of sequences returned per prompt (also the beam count for
            'beam-search').
        rp: repetition penalty, used by 'beam-search' only.
        specialtokens: unused; kept so existing calls stay valid.

    Returns:
        List of decoded strings (special tokens kept), `nb` per prompt, in prompt order.

    Raises:
        ValueError: if `decoding` is not a known strategy (previously this silently
            returned an empty list).
    """
    # Validate before doing any work so a typo in `decoding` fails fast.
    generate_kwargs = _generation_kwargs(decoding, p, k, nb, rp)

    gc.collect()  # drop unreachable Python objects first so cached CUDA memory can be released
    torch.cuda.empty_cache()

    print(output_data.head())
    print(len(output_data))
    print(len(hscndata))

    new_cns = []
    for i in range(len(hscndata)):
        encoded_hs_ids = tokenizer(hscndata[i],
                                   truncation=True,
                                   padding=True,
                                   return_tensors="pt").input_ids.to(device)
        encoded_new_cn = model.generate(input_ids=encoded_hs_ids,
                                        max_length=decoder_max_len,
                                        **generate_kwargs)
        new_cns.extend(tokenizer.batch_decode(encoded_new_cn, skip_special_tokens=False))

        if decoding == 'beam-search':
            # Progress trace: strategy name and number of sequences collected so far.
            print(decoding)
            print(len(new_cns))
        else:
            clear_output()

    clear_output()
    print(new_cns)
    return new_cns

### Load model and prepare data
(`add_special_tokens = True`)

In [None]:
# Load the tokenizer and the language model from the checkpoint directory,
# then move the model onto the selected device.
model_path = 'drive/MyDrive/master/checkpoints/'
tokenizer = GPT2Tokenizer.from_pretrained(model_path, return_tensors='pt')
model1 = GPT2LMHeadModel.from_pretrained(model_path).to(device)

In [None]:
# Padding needs a pad token: when the loaded tokenizer does not define one,
# reuse its end-of-sequence token for padding.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    # config.pad_token_id = config.eos_token_id
# Print the resulting special-token mapping as a sanity check.
print(tokenizer.special_tokens_map)

In [None]:
# Prepare data: keep only the hate-speech column
# (not selected: 'COUNTER_NARRATIVE', 'TARGET', 'VERSION') and wrap it as prompts
# that use the <hatespeech> / <counternarrative> markers.
v_df = val_df.loc[:, ['HATE_SPEECH']]
val_dataset1 = HSCNDataset2(df=v_df, add_special_tokens=True)

### Generate

In [None]:
from functools import reduce
from google.colab import files

In [None]:
#%%time
you_want_to_save = True
v_df1 = v_df.copy()

seed = 42
torch.manual_seed(seed)
decodings = ['beam-search']
list_dfs = []

CN_MARKER = '<counternarrative>'
EOS_MARKER = '<|endoftext|>'


def _extract_cn(generated):
    """Return the counter-narrative part of one decoded sequence.

    The counter-narrative is the text after the first `<counternarrative>` marker,
    cut at the first `<|endoftext|>` token and stripped of surrounding whitespace.
    Returns '' when the marker is missing.

    `str.strip(' <|endoftext|>')` must not be used for this: its argument is a set
    of characters, so it also removes leading/trailing letters such as e, n, d, o,
    f, t, x from the generated text itself.
    """
    parts = generated.split(CN_MARKER)
    if len(parts) < 2:
        return ''
    return parts[1].partition(EOS_MARKER)[0].strip()


hate_speech_texts = v_df1['HATE_SPEECH'].tolist()

# Collect plain records and build the frame once; appending with
# `.loc[len(df)]` inside the loop is quadratic in the number of rows.
records = []
for decoding in decodings:
    fresh_cn = generate_cn(model=model1, hscndata=val_dataset1, output_data=v_df1, decoding=decoding)
    # generate_cn returns the sequences in prompt order, possibly several per prompt,
    # so map each output back to its prompt by position.
    n_per_hs = max(1, len(fresh_cn) // max(1, len(hate_speech_texts)))
    for i, generated in enumerate(fresh_cn):
        records.append({
            "HS": hate_speech_texts[i // n_per_hs],
            "CN": _extract_cn(generated),
            "Decoding": decoding,
        })

hscnpairs = pandas.DataFrame(records, columns=["HS", "CN", "Decoding"])
hscnpairs.head()

In [None]:
# Persist the generated HS/CN pairs to the Drive folder.
hscnpairs.to_csv(path_or_buf="drive/MyDrive/master/output_testset_multi_conan.csv")

**END OF NOTEBOOK**