In [None]:
# Print the NVIDIA driver/GPU status for this session.
!nvidia-smi

In [None]:
# Version-pinned dependencies used by this notebook.
!pip install -q thai2transformers==0.1.2 datasets==2.10.0 huggingface_hub==0.14.0 openprompt==1.0.1
# Remove whichever transformers version is installed at this point, then install the pinned 4.28.0
# release (presumably because one of the packages above pulls in a different version — confirm).
!pip uninstall -q  transformers -y
!pip install transformers==4.28.0

In [None]:
import time
# Wall-clock start of the run; the last cell prints the seconds elapsed since this moment.
start_time = time.time()

In [None]:
import torch.nn as nn
import os
import re
import transformers
import json
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from pathlib import Path
import pandas as pd
from itertools import groupby
from operator import itemgetter
from torch.optim import AdamW
from sklearn import metrics
from sklearn.metrics import classification_report
import numpy as np
import torch
import random
import pandas as pd
from sklearn.metrics import accuracy_score


# Pick the compute device: CUDA when torch can see a GPU, CPU otherwise.
from torch import cuda
device = 'cpu' if not cuda.is_available() else 'cuda'
# Sets TOKENIZERS_PARALLELISM to "false" (presumably to avoid the tokenizers
# fork warning when DataLoader workers are used — confirm).
os.environ["TOKENIZERS_PARALLELISM"] = "false"

##### parameters
RANDOM_SEED = 21          # seed used for few-shot sampling
SAMPLE_SIZE = 4           # examples drawn per class
CLASS_NAME = ['absent','dengue','health','mosquito','sick']
LEARNING_RATE = 1e-5
TOKEN_NUMBER = 32         # label words kept per class
MODEL_SIZE = 30000        # number of token ids scanned / scored
MAX_EPOCH = 51            # training runs for range(1, MAX_EPOCH)
MAX_LENGTH = 512          # tokenizer max_length / padding length
FILENAME = 'PLAML_philippines'

# Empty result tables; get_result() adds one column per (sample size, seed) run.
predict_df, f1_df, pred_df, threshold_df, act_df = (pd.DataFrame() for _ in range(5))
# Validation tables start with an 'indx' column of len(CLASS_NAME)*32 rows so that
# shorter per-run Series can be aligned onto them by index.
val_pred_df, val_act_df = (
    pd.DataFrame({'indx': range(len(CLASS_NAME)*32)}) for _ in range(2)
)

def set_seed(seed: int = 42):
    """Seed every RNG this notebook relies on and pin the cuDNN switches.

    Seeds NumPy, Python's ``random`` and torch (CPU plus all CUDA devices),
    sets the cuDNN deterministic/benchmark/enabled flags, and exports
    ``PYTHONHASHSEED``.

    Args:
        seed: Value handed to every seeding call (default 42).
    """
    seeders = (
        np.random.seed,
        random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed)
    # cuDNN switches: deterministic kernels, no autotuning, and cuDNN itself disabled.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.enabled = False
    # Fixed hash seed, exported as a string for the environment.
    os.environ["PYTHONHASHSEED"] = str(seed)
set_seed()


In [None]:
TEMPLATE = 'Ito ay tungkol sa '

def text_template(text,label_name):
    """Build a label-conditioned prompt ending in a mask slot.

    The result is the first 500 characters of ``text``, followed directly by
    TEMPLATE, ``label_name`` and the literal ' at <mask>.'.
    """
    pieces = (text[:500], TEMPLATE, label_name, ' at <mask>.')
    return ''.join(pieces)

In [None]:
# Download the raw dengue_filipino archive from the HuggingFace hub and unpack it.
# NOTE(review): unzip reads the archive from /kaggle/working/, so this presumably assumes the
# notebook's working directory is /kaggle/working — confirm before running elsewhere.
!wget https://huggingface.co/datasets/jcblaise/dengue_filipino/resolve/main/dengue_raw.zip
!unzip '/kaggle/working/dengue_raw.zip'

In [None]:
# Load data
from datasets import load_dataset
raw_dataset = ''

def load_dataset_func(raw_dataset, SAMPLE_SIZE, RANDOM_SEED, data_dir='/kaggle/working/dengue'):
    """Load the dengue CSV splits and build the few-shot prompt datasets.

    Relies on the module-level names CLASS_NAME, TEMPLATE, text_template and
    CLASS_NAME_emb_sim, which must all be defined before this is called.

    Args:
        raw_dataset: Unused; kept for call compatibility.
        SAMPLE_SIZE: Rows drawn (with replacement) per class for train/valid.
        RANDOM_SEED: ``random_state`` for the per-class sampling.
        data_dir: Directory holding train.csv, test.csv and valid.csv.
            Defaults to the original Kaggle location.

    Returns:
        A 5-tuple ``(train_df, test_df, valid_df, new_train_df, train_df_train)``:
        * train_df: sampled training rows; 'texts' holds the raw text.
        * test_df: full test split; 'texts' holds '<text[:500]><TEMPLATE><mask>.'.
        * valid_df: sampled validation rows with the same prompt form.
        * new_train_df: one row per (sampled row, label present on it), whose
          prompt spells out the row's label names before the mask slot.
        * train_df_train: copy of train_df with plain mask prompts in 'texts'.
    """

    def get_filter_type(lst):
        # Positions whose value equals 1; [-1] marks a row with no such position.
        hits = np.where(np.array(lst) == 1)[0].tolist()
        return hits if len(hits) > 0 else [-1]

    def read_split(split_name):
        frame = pd.read_csv(os.path.join(data_dir, split_name + '.csv'))
        # Every column after the first is collected into a per-row list.
        frame['label_list'] = frame.iloc[:,1:].apply(list, axis=1)
        frame['filter_type'] = [get_filter_type(x) for x in frame['label_list']]
        return frame

    def sample_per_class(frame):
        # For each class index, draw SAMPLE_SIZE rows whose filter_type contains it.
        per_class = [
            frame[[class_id in labels for labels in frame['filter_type']]]
            .sample(SAMPLE_SIZE, random_state=RANDOM_SEED, replace=True)
            for class_id in range(len(CLASS_NAME))
        ]
        return pd.concat(per_class).reset_index(drop=True)

    def plain_prompt(text):
        # str() keeps a non-string cell (e.g. NaN) from raising on the slice,
        # consistent with the str(text) used for the augmented prompts below.
        return str(text)[:500] + TEMPLATE + '<mask>.'

    train_df = read_split('train')
    test_df = read_split('test')
    valid_df = read_split('valid')

    train_df = sample_per_class(train_df)
    valid_df = sample_per_class(valid_df)
    test_df = test_df.reset_index(drop=True)

    train_df['texts'] = train_df['text']
    test_df['texts'] = test_df['text']
    valid_df['texts'] = valid_df['text']

    # Augmented training prompts: for every class, visit every sampled row that
    # carries that class and name all of the row's labels in the prompt, with
    # the current class last.
    new_train_df_text = []
    new_train_df_label_list = []
    for label_id in range(len(CLASS_NAME)):
        for i in train_df.index:
            present = train_df['filter_type'][i]
            if label_id not in present:
                continue
            # The remaining labels are ordered by ascending embedding similarity
            # to the current class (ties keep their original order).
            others = sorted(
                (x for x in present if x != label_id),
                key=lambda x: CLASS_NAME_emb_sim[label_id][x],
            )
            label_words = ' '.join(CLASS_NAME[x] for x in others) + " " + CLASS_NAME[label_id]
            text = " ".join(str(train_df['texts'][i]).split())
            new_train_df_text.append(text_template(text, label_words))
            new_train_df_label_list.append(train_df['label_list'][i])

    new_train_df = pd.DataFrame({
        'texts': new_train_df_text,
        'label_list': new_train_df_label_list
    })

    train_df_train = train_df.copy()
    train_df_train['texts'] = [plain_prompt(x) for x in train_df_train['texts']]
    train_df_train = train_df_train.reset_index(drop=True)

    test_df['texts'] = [plain_prompt(x) for x in test_df['texts']]
    valid_df['texts'] = [plain_prompt(x) for x in valid_df['texts']]

    return train_df, test_df, valid_df, new_train_df, train_df_train

In [None]:
# find probability of mask word from pretrain model
from transformers import CamembertConfig, AutoModelForMaskedLM, AutoTokenizer, RobertaConfig, CamembertModel , AutoModel, AutoConfig

model_path = "jcblaise/roberta-tagalog-base"
model_config = AutoConfig.from_pretrained(model_path)
# Two separate masked-LM instances are loaded from the same checkpoint:
# plm_topk is the one moved to `device` below.
plm_topk = AutoModelForMaskedLM.from_pretrained(model_path, config = model_config)
plm = AutoModelForMaskedLM.from_pretrained(model_path, config = model_config)
tokenizer = AutoTokenizer.from_pretrained(model_path, revision='main', model_max_length=MAX_LENGTH)

# Ids in range(MODEL_SIZE) that may not be used as label words: the decoded
# token contains '<', '.' or '[', or is a single character once spaces are removed.
token_id_remove = []
for token_id in range(MODEL_SIZE):
    decoded = tokenizer.decode(token_id)
    has_banned_char = any(ch in decoded for ch in ('<', '.', '['))
    if has_banned_char or len(decoded.replace(' ','')) == 1:
        token_id_remove.append(token_id)
plm_topk.to(device)

In [None]:
# Encoder used only to embed the class names (put in eval mode).
model = AutoModel.from_pretrained(model_path, output_hidden_states=True).eval()
# These are pure inference passes, so run them without autograd bookkeeping;
# the computed values are unchanged.
with torch.no_grad():
    # One vector per class: mean of the last hidden states of ' <name>', with the
    # first and last token positions dropped.
    CLASS_NAME_emb = [
        torch.mean(model(tokenizer.encode(' ' + name, return_tensors='pt')).last_hidden_state[0][1:-1], 0)
        for name in CLASS_NAME
    ]
    # CLASS_NAME_emb_sim[i][j] = cosine similarity between class i and class j embeddings.
    CLASS_NAME_emb_sim = [torch.cosine_similarity(x, torch.stack(CLASS_NAME_emb)).tolist() for x in CLASS_NAME_emb]

In [None]:
import itertools
def _get_predicted_token(plm_topk, SAMPLE_SIZE, train_df, label_name, label_id):
  """Average the mask-position logits over the training rows carrying one label.

  Each selected text is whitespace-normalised, wrapped with
  ``text_template(text, label_name)`` and scored by ``plm_topk``.
  ``SAMPLE_SIZE`` is accepted but not used.

  Returns:
      A tensor of length MODEL_SIZE: the mean logit row over the selected texts.
  """
  has_label = [label_id in labels for labels in train_df['filter_type']]
  class_texts = train_df[has_label]['texts']
  new_size = len(class_texts)
  print(str(label_id) + " " + label_name + " " + str(new_size))

  predicted_token = torch.zeros(new_size, MODEL_SIZE)
  for row, raw_text in enumerate(class_texts):
    prompt = text_template(" ".join(str(raw_text).split()), label_name)
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    with torch.no_grad():
      logits = plm_topk(**inputs).logits
    # Position(s) of the mask token in the single encoded sequence.
    mask_token_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
    predicted_token[row] = logits[0, mask_token_index]

  return torch.mean(predicted_token, 0)

def get_predicted_token_list(plm_topk, SAMPLE_SIZE, train_df, CLASS_NAME=CLASS_NAME, TOKEN_NUMBER=TOKEN_NUMBER):
  """Pick the label words (token ids) and their weights for every class.

  For each class the mean mask-position logits are ranked, ids listed in
  ``token_id_remove`` are dropped, and the top ``TOKEN_NUMBER`` ids are kept.
  A token's "rank" is the number of classes whose top list contains it. Each
  class's tokens are ordered by ascending rank (ties by token id) and weighted
  with ``rank / sum(class scores) * log(n_classes / rank)``, so a token shared
  by every class gets weight 0.

  Fix relative to the previous version: the weights were computed from the
  ranks in descending-logit order, while the returned ids had been re-sorted
  by rank, so weights were attached to the wrong tokens. Ids, scores and ranks
  are now sorted together, so position j of the weights always refers to
  position j of the ids.

  NOTE(review): the weight numerator is the token's rank, not its score —
  kept as in the original formula; confirm that this is intended.

  Returns:
      (ids, weights): ``ids`` is a list (one entry per class) of TOKEN_NUMBER
      token ids; ``weights`` is a float ndarray of the same shape.
  """
  predicted_token_list_combine = torch.stack([
      _get_predicted_token(plm_topk, SAMPLE_SIZE, train_df, name, i)
      for i, name in enumerate(CLASS_NAME)
  ])
  print(predicted_token_list_combine)

  # Set membership keeps the filter linear in the number of scored tokens.
  removed_ids = set(token_id_remove)
  token_space_id, token_space_score = [], []
  for class_logits in predicted_token_list_combine.tolist():
    ranked = sorted(enumerate(class_logits), key=lambda pair: pair[1], reverse=True)
    kept = [pair for pair in ranked if pair[0] not in removed_ids][:TOKEN_NUMBER]
    token_space_id.append([token_id for token_id, _ in kept])
    token_space_score.append([score for _, score in kept])
  token_space_id = np.array(token_space_id)

  # https://stackoverflow.com/questions/49961043/count-of-each-element-in-2d-numpy-array
  # Number of classes whose top list contains each token id.
  class_count_by_id = dict(zip(*np.unique(token_space_id.ravel(), return_counts=True)))
  token_rank = np.vectorize(class_count_by_id.get)(token_space_id)
  print('---Token position---')
  print(token_rank)

  # Sort (rank, id, score) triples together so the three stay aligned.
  ids_sorted, scores_sorted, ranks_sorted = [], [], []
  for rank_row, id_row, score_row in zip(token_rank.tolist(), token_space_id.tolist(), token_space_score):
    triples = sorted(zip(rank_row, id_row, score_row))
    ranks_sorted.append([rank for rank, _, _ in triples])
    ids_sorted.append([token_id for _, token_id, _ in triples])
    scores_sorted.append([score for _, _, score in triples])

  print('final token')
  for i, (id_row, score_row) in enumerate(zip(ids_sorted, scores_sorted)):
    print('class: ' + str(i))
    print(len(id_row))
    print(list(zip(id_row, score_row)))
    print([tokenizer.decode(token_id) for token_id in id_row])

  # with weight
  ranks_sorted = np.array(ranks_sorted)
  score_totals = np.array([sum(score_row) for score_row in scores_sorted])
  predicted_token_list_score = (ranks_sorted / score_totals[:, None]) * np.log(len(CLASS_NAME) / ranks_sorted)
  return ids_sorted, predicted_token_list_score

In [None]:
class PrepData(Dataset):
    """Dataset that tokenizes prompt texts and locates their mask token.

    Reads the ``texts`` and ``label_list`` columns of the given dataframe;
    rows are looked up with ``series[index]``.
    """

    def __init__(self,  dataframe, tokenizer):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.texts
        self.targets = dataframe.label_list
        self.max_len = MAX_LENGTH

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        """Return the tensors for one example.

        Keys: 'ids' and 'mask' (long tensors padded/truncated to ``max_len``),
        'targets' (float tensor of the label list) and 'mask_token_index'
        (position of the first mask token in 'ids').
        """
        # Collapse all whitespace runs to single spaces before encoding.
        normalised = " ".join(str(self.text[index]).split())
        encoded = self.tokenizer.encode_plus(
            normalised,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=True,
            truncation=True,
            padding='max_length'
        )

        input_ids = torch.tensor(encoded['input_ids'], dtype=torch.long)
        attention = torch.tensor(encoded['attention_mask'], dtype=torch.long)
        labels = torch.tensor(np.array(self.targets[index])*1.0, dtype=torch.float)
        # Indexing [0] raises IndexError when no mask token is present in the ids.
        mask_positions = torch.where(input_ids == self.tokenizer.mask_token_id)[0]

        return {
            'ids': input_ids,
            'mask': attention,
            'targets': labels,
            'mask_token_index': mask_positions[0]
        }

In [None]:
class PromptModel(torch.nn.Module):
    """Masked-LM prompt classifier with a fixed, weighted set of label words.

    For each class, the score of an example is the weighted sum of the
    masked-LM logits, taken at the example's mask position, over that class's
    label-word token ids.

    Args:
        plm: Masked language model; it is moved to the module-level ``device``.
        predicted_token_filter_list_ids: Per-class lists of label-word token ids.
        predicted_token_filter_list_score: Per-class weights, same shape as the
            ids; stored as a non-trainable parameter.
    """

    def __init__(self, plm ,predicted_token_filter_list_ids,predicted_token_filter_list_score):
        super(PromptModel, self).__init__()
        self.plm = plm
        self.device = device
        self.plm.to(self.device)
        self.predicted_token_filter_list_ids = predicted_token_filter_list_ids
        self.label_words_weights = nn.Parameter(torch.tensor(predicted_token_filter_list_score, dtype=torch.float), requires_grad=False)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        mask_token_index=None
    ):
        """Return class scores of shape (batch, n_classes).

        ``mask_token_index`` holds one mask position per batch row.
        """
        logits = self.plm(input_ids=input_ids,attention_mask=attention_mask)['logits']

        # Gather each row's logits at its own mask position in one step. This is
        # hoisted out of the class loop (it does not depend on the class) and
        # avoids building a (batch, batch, vocab) tensor just to take its diagonal.
        batch_rows = torch.arange(logits.shape[0], device=logits.device)
        mask_positions = torch.as_tensor(mask_token_index, dtype=torch.long, device=logits.device)
        masked_logits = logits[batch_rows, mask_positions].transpose(0, 1)  # (vocab, batch)

        label_prediction = []
        for i, each_label in enumerate(self.predicted_token_filter_list_ids):
            # (1, n_words) @ (n_words, batch) -> (1, batch), then drop the leading axis.
            weighted = torch.matmul(self.label_words_weights[i].unsqueeze(0), masked_logits[each_label,:])
            label_prediction.append(torch.sum(weighted, dim=0))

        return torch.stack(label_prediction, dim=-1)

    def generate(self,**kwargs):
        """Forward all keyword arguments to the wrapped model's ``generate``."""
        return self.plm.generate(**kwargs)

def seed_worker(worker_id):
    """DataLoader ``worker_init_fn``: seed NumPy and ``random`` from torch's seed.

    The seed is ``torch.initial_seed()`` reduced modulo 2**32; ``worker_id``
    itself is not used.
    """
    derived_seed = torch.initial_seed() % (1 << 32)
    for seed_fn in (np.random.seed, random.seed):
        seed_fn(derived_seed)

In [None]:
def _train_one_pass(model, loader, loss_function, optimizer):
    """Run one optimisation pass (one step per batch) over ``loader``."""
    model.train()
    for data in loader:
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)
        mask_token_index = data['mask_token_index'].to(device, dtype = torch.long)

        outputs = model(ids, mask, mask_token_index)
        loss = loss_function(outputs, targets.float())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


def _predict_probabilities(model, loader):
    """Return (sigmoid predictions, targets) as flat per-example lists."""
    pred_rows, act_rows = [], []
    model.eval()
    with torch.no_grad():
        for data in loader:
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            mask_token_index = data['mask_token_index'].to(device, dtype = torch.long)

            outputs = model(ids, mask, mask_token_index)
            pred_rows.extend(torch.sigmoid(outputs).tolist())
            act_rows.extend(data['targets'].tolist())
    return pred_rows, act_rows


def _select_thresholds(val_pred_list, val_act_list):
    """Choose one decision threshold per class from validation predictions.

    For every class, thresholds 0.00 .. 0.99 are scanned in ascending order and
    every threshold whose accuracy is >= the best accuracy seen so far is
    collected; the class threshold is the mean of the collected values.

    NOTE(review): the collected list is not cleared when a strictly better
    accuracy appears, so earlier, worse thresholds still enter the mean.
    Behaviour is kept as in the original — confirm that this is intended.
    """
    thresholds = []
    for indx in range(len(CLASS_NAME)):
        class_true = [1 if each_act[indx] == 1 else 0 for each_act in val_act_list]
        class_prob = np.array([each_pred[indx] for each_pred in val_pred_list])
        max_acc = 0
        each_t_list = []
        for t in np.arange(0,1,.01):
            acc = accuracy_score(class_true, np.where(class_prob>t, 1, 0))
            if acc >= max_acc:
                max_acc = acc
                each_t_list.append(t)
        thresholds.append(np.mean(each_t_list))
    return thresholds


def get_result(SAMPLE_SIZE, TRAIN_BATCH_SIZE):
    """Train and evaluate the prompt model for one few-shot sample size.

    For each seed: samples the data, selects label words, trains for
    ``range(1, MAX_EPOCH)`` epochs (each epoch passes over the label-augmented
    prompts and then the plain mask prompts), predicts on validation and test,
    picks per-class thresholds on validation and reports test metrics.

    Results are written as new columns, keyed '<SAMPLE_SIZE>_<seed>', into the
    module-level tables val_pred_df, val_act_df, pred_df, act_df, predict_df,
    f1_df and threshold_df.

    Fixes relative to the previous version:
      * ``set_seed`` now receives the loop's RANDOM_SEED (it was always called
        with its default, so every seed iteration shared the same torch seed).
      * The seeded ``torch.Generator`` is actually handed to the DataLoaders
        (it was created but never used).
    """
    print('Sample Size ' + str(SAMPLE_SIZE))
    ##### Make the experiment run multiple seeds here
    # for RANDOM_SEED in [21,42,99]:
    for RANDOM_SEED in [21]:
        set_seed(RANDOM_SEED)
        plm = AutoModelForMaskedLM.from_pretrained(model_path, config= model_config)
        print('Random Seed ' + str(RANDOM_SEED))
        run_key = str(SAMPLE_SIZE)+'_'+str(RANDOM_SEED)

        train_df,test_df,valid_df,new_train_df,train_df_train = load_dataset_func(raw_dataset, SAMPLE_SIZE, RANDOM_SEED)
        predicted_token_filter_list_ids, predicted_token_filter_list_score = get_predicted_token_list(plm_topk, SAMPLE_SIZE, train_df, CLASS_NAME, TOKEN_NUMBER)

        training_set = PrepData(new_train_df, tokenizer)
        training_train_set = PrepData(train_df_train, tokenizer)
        validating_set = PrepData(valid_df, tokenizer)
        testing_set = PrepData(test_df, tokenizer)

        # Seeded generator: drives shuffling and the base seed of the workers.
        g = torch.Generator()
        g.manual_seed(RANDOM_SEED)

        train_params = {'batch_size': TRAIN_BATCH_SIZE, 'shuffle': True, 'num_workers': 2, 'worker_init_fn':seed_worker, 'generator': g}
        val_params = {'batch_size': 4, 'shuffle': False, 'num_workers': 2, 'worker_init_fn':seed_worker, 'generator': g}
        test_params = {'batch_size': 4, 'shuffle': False, 'num_workers': 2, 'worker_init_fn':seed_worker, 'generator': g}

        training_loader = DataLoader(training_set, **train_params)
        training_train_loader = DataLoader(training_train_set, **train_params)
        validating_loader = DataLoader(validating_set, **val_params)
        testing_loader = DataLoader(testing_set, **test_params)

        model = PromptModel(plm, predicted_token_filter_list_ids, predicted_token_filter_list_score)
        model.to(device)

        loss_function = torch.nn.BCEWithLogitsLoss()
        optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

        for _ in tqdm(range(1,MAX_EPOCH)):
            # Label-augmented prompts first, then the plain mask prompts.
            _train_one_pass(model, training_loader, loss_function, optimizer)
            _train_one_pass(model, training_train_loader, loss_function, optimizer)

        val_pred_list, val_act_list = _predict_probabilities(model, validating_loader)
        # Wrapped in Series so the shorter lists align onto the pre-sized tables.
        val_pred_df[run_key] = pd.Series(val_pred_list)
        val_act_df[run_key] = pd.Series(val_act_list)

        pred_list, act_list = _predict_probabilities(model, testing_loader)
        pred_df[run_key] = pred_list
        act_df[run_key] = act_list

        ###############
        best_t_s_mean = _select_thresholds(val_pred_list, val_act_list)

        print('s_mean')
        print('threshold: ' + str(best_t_s_mean))
        # Each class column is compared against that class's own threshold.
        pred_threshold = ((np.array(pred_list)>best_t_s_mean)*1.0).tolist()
        print(classification_report(act_list, pred_threshold, digits=4))
        predict_df['s_mean'+run_key] = pred_threshold
        f1_df['s_mean'+run_key] =  [metrics.f1_score(act_list, pred_threshold, average="macro")]
        threshold_df['s_mean'+run_key] = best_t_s_mean

        del plm, model

In [None]:
# Run the experiment with 2 sampled examples per class and a training batch size of 2.
SAMPLE_SIZE = 2 ######## each class
TRAIN_BATCH_SIZE = 2
get_result(SAMPLE_SIZE, TRAIN_BATCH_SIZE)

In [None]:
# Run the experiment with 4 sampled examples per class and a training batch size of 4.
SAMPLE_SIZE = 4 ######## each class
TRAIN_BATCH_SIZE = 4
get_result(SAMPLE_SIZE, TRAIN_BATCH_SIZE)

In [None]:
# Run the experiment with 8 sampled examples per class and a training batch size of 4.
SAMPLE_SIZE = 8 ######## each class
TRAIN_BATCH_SIZE = 4
get_result(SAMPLE_SIZE, TRAIN_BATCH_SIZE)

In [None]:
# Run the experiment with 16 sampled examples per class and a training batch size of 4.
SAMPLE_SIZE = 16 ######## each class
TRAIN_BATCH_SIZE = 4
get_result(SAMPLE_SIZE, TRAIN_BATCH_SIZE)

In [None]:
# Write each result table to '<FILENAME><suffix>' as an Excel workbook, without the row index.
for suffix, frame in (
    ('_predict.xlsx', predict_df),
    ('_f1.xlsx', f1_df),
    ('_pred.xlsx', pred_df),
    ('_threshold.xlsx', threshold_df),
    ('_val_pred.xlsx', val_pred_df),
    ('_val_act.xlsx', val_act_df),
    ('_act.xlsx', act_df),
):
    frame.to_excel(FILENAME+suffix,index=False)

In [None]:
# Recursively archive /kaggle/working/ into <FILENAME>.zip, excluding paths that match "*/.*".
!zip -r {FILENAME}.zip '/kaggle/working/' -x "*/.*"

In [None]:
# Report the wall-clock seconds elapsed since start_time was recorded.
print("--- %s seconds ---" % (time.time() - start_time))