In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 5.0 MB/s 
Collecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 44.5 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 52.8 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 4.1 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 60.7 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing

In [None]:
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel, BertTokenizer, BertModel
from google.colab import drive
import tensorflow as tf
import os
import sys
import xml.etree.ElementTree as ET
import glob
from scipy import io

In [None]:
# Select the compute device for deep learning.
# CUDA_LAUNCH_BLOCKING is only honoured as an environment variable; assigning a Python
# variable of that name has no effect. It makes CUDA errors surface at the failing call
# (useful while debugging) at the cost of speed — remove for production runs.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Probe the GPU through PyTorch, the framework that actually runs the models below,
# instead of initialising TensorFlow on the device just to detect it.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print('GPU:', torch.cuda.get_device_name(0))
else:
    raise SystemError('GPU device not found')

GPU: Tesla P100-PCIE-16GB


In [None]:
# Point data_path at the project folder on Google Drive.
# The first value is the fallback that stays in effect when Drive cannot be mounted.
data_path =  "/Colab Notebooks/"

try:
    drive.mount('/content/drive')
    data_path = "/content/drive/MyDrive/Colab Notebooks/UCCA-CNN/"
except Exception:
    # Catch Exception rather than everything so that a kernel interrupt (KeyboardInterrupt)
    # or SystemExit raised while mounting is not silently swallowed.
    print("You are not working in Colab at the moment :(")

Mounted at /content/drive


In [None]:
# Reproducibility settings and the shared placeholder embedding.

seed = 42
np.random.seed(seed)

# One fixed 768-dimensional vector drawn uniformly from [0, 1); it is passed to
# create_feature_matrix as the embedding for graph nodes that have no terminal word.
node_embedding = np.random.uniform(0.0, 1.0, size=768)

In [None]:
# Use last four layers by default
# (negative indices into the tuple of hidden states; consumed by get_hidden_states,
# which stacks and sums the selected layers).
layers = [-4, -3, -2, -1]
# Multilingual cased BERT. output_hidden_states=True makes the forward pass expose
# `hidden_states`, which the embedding helpers below read.
# NOTE(review): the model is never moved to `device` in the visible cells, so word
# embeddings appear to be computed on CPU — confirm this is intended.
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
model = AutoModel.from_pretrained("bert-base-multilingual-cased", output_hidden_states=True)

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/972k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.87M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/681M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
def get_word_idx(sent: str, word: str):
    """Return the position of the first occurrence of `word` in `sent`.

    The sentence is split on single spaces; `ValueError` is raised (by `list.index`)
    when the word is not one of the resulting pieces.
    """
    pieces = sent.split(" ")
    return pieces.index(word)
 
 
def get_hidden_states(encoded, token_ids_word, model, layers):
    """Embed one word as the mean of its sub-token vectors.

    Runs `model(**encoded)` without gradient tracking, sums the hidden states of the
    requested `layers`, keeps the rows selected by `token_ids_word` and averages them.

    Args:
        encoded: keyword inputs for the model (unpacked with `**`).
        token_ids_word: index (as produced by `np.where`) of the tokens of the word.
        model: callable whose result exposes `hidden_states`.
        layers: indices of the hidden-state layers to sum.

    Returns:
        The mean over the selected token rows.
    """
    with torch.no_grad():
        model_out = model(**encoded)

    every_layer = model_out.hidden_states
    selected_layers = [every_layer[layer] for layer in layers]
    # squeeze() drops all size-1 dimensions — presumably the batch axis of a single
    # sentence; TODO confirm callers never pass batches.
    summed = torch.stack(selected_layers).sum(0).squeeze()

    return summed[token_ids_word].mean(dim=0)
 
 
def get_word_vector(sent, idx, tokenizer, model, layers):
    """Return the contextual embedding of the `idx`-th space-separated word of `sent`.

    The sentence is handed to the tokenizer already split into words
    (`is_split_into_words=True`), so `encoded.word_ids()` counts exactly those words.
    Tokenizing the raw string instead lets the tokenizer's own pre-tokenization decide
    what a "word" is, which shifts every later index as soon as a piece contains
    punctuation and makes `idx` point at the wrong word.

    Args:
        sent: sentence whose words are separated by single spaces.
        idx: 0-based position of the word in `sent.split(" ")`.
        tokenizer: fast tokenizer providing `encode_plus` and `word_ids()`.
        model, layers: forwarded to `get_hidden_states`.

    Returns:
        Whatever `get_hidden_states` returns for the sub-tokens of the word.
    """
    words = sent.split(" ")
    encoded = tokenizer.encode_plus(words, is_split_into_words=True, return_tensors="pt")
    # word_ids() holds None for special tokens; the comparison simply never matches them.
    token_ids_word = np.where(np.array(encoded.word_ids()) == idx)

    return get_hidden_states(encoded, token_ids_word, model, layers)

In [None]:
def XML_processing(file):
  """Parse one annotation XML file into lookup tables and the sentence text.

  Layer "0" nodes contribute the words (their `attributes` `text`); every other layer
  contributes the graph edges.

  Children are collected in two separate lists per parent. Sharing one list between
  both maps (as the previous implementation did) made a parent with both kinds of
  edges expose terminal ids in `parent2children` and non-terminal ids in
  `parent2childrenterminal`.

  Args:
    file: path or file object accepted by `xml.etree.ElementTree.parse`.

  Returns:
    node2tag: word text for layer-0 nodes, edge type for nodes reached through a
      non-"Terminal" edge.
    parent2children: parent id -> ids of children whose id starts with "1".
    parent2childrenterminal: parent id -> ids of all other children.
    maxnoode: largest numeric suffix among the "1..." child ids (0 if there are none).
    sent: the layer-0 words joined by single spaces.
  """
  node2tag = {} # dict[node] = 'word/tag'
  parent2children = {} # dict[parent] = [non-terminal children]
  parent2childrenterminal = {} # dict[parent] = [terminal children]
  maxnoode = 0
  words = []
  root = ET.parse(file).getroot()
  for layer in root.iter('layer'):
    if layer.attrib['layerID'] == '0':
      for node in layer.iter('node'):
        for attribute in node.iter('attributes'):
          node2tag[node.attrib['ID']] = attribute.attrib['text']
          words.append(attribute.attrib['text'])
    else:
      for node in layer.iter('node'):
        parent_id = node.attrib['ID']
        for edge in node.iter('edge'):
          child_id = edge.attrib['toID']
          if child_id.startswith("1"):
            # ids look like "<layer>.<number>"; [2:] is the numeric part
            maxnoode = max(maxnoode, int(child_id[2:]))
            parent2children.setdefault(parent_id, []).append(child_id)
          else:
            parent2childrenterminal.setdefault(parent_id, []).append(child_id)
          if edge.attrib['type'] != 'Terminal':
            node2tag[child_id] = edge.attrib['type']
  return node2tag, parent2children, parent2childrenterminal, maxnoode, ' '.join(words)

In [None]:
def create_feature_matrix(sent, parent2childrenterminal, maxnoode, node_embedding):
  """Build the (maxnoode, embedding_dim) node-feature matrix of one sentence graph.

  Row `i` belongs to node "1.<i+1>". A node with a terminal child gets the contextual
  embedding of that child's word; every other node gets `node_embedding`.

  The word is addressed by the position encoded in the terminal id ("0.<k>" -> word
  k-1). Looking the word up by its text instead returned the first occurrence, so a
  repeated word always received the embedding of its first mention.

  Uses the notebook-level `tokenizer`, `model` and `layers`.

  Args:
    sent: sentence with words separated by single spaces.
    parent2childrenterminal: parent id -> list of child ids.
    maxnoode: number of rows to produce.
    node_embedding: 1-D vector used for nodes without a terminal child.

  Raises:
    IndexError: if a terminal id points outside the sentence.
  """
  n_words = len(sent.split(" "))
  feature_matrix = np.zeros((maxnoode, len(node_embedding)))
  for index in range(maxnoode):
    key = "1." + str(index + 1)
    # Only ids starting with "0" are terminals; filter defensively in case the list
    # also carries non-terminal children.
    terminal_ids = [child for child in parent2childrenterminal.get(key, [])
                    if child.startswith("0")]
    if terminal_ids:
      word_position = int(terminal_ids[0][2:]) - 1
      if not 0 <= word_position < n_words:
        raise IndexError("terminal %s of node %s is outside the sentence" % (terminal_ids[0], key))
      word_embedding = get_word_vector(sent, word_position, tokenizer, model, layers)
    else:
      word_embedding = node_embedding
    feature_matrix[index, :] = word_embedding
  return feature_matrix

def adj_list_to_matrix(adj_list, n):
    """Turn a parent -> children mapping into a dense n x n adjacency matrix.

    Ids have the form "<layer>.<number>"; entry [parent-1, child-1] is set to 1 for
    every child whose id does not start with "0". The matrix is directed
    (parent row, child column) and has no self-loops added.
    """
    adj_matrix = np.zeros((n, n))
    for parent_id, child_ids in adj_list.items():
        for child_id in child_ids:
            if int(child_id[0]) == 0:
                continue  # layer-0 children are words, not graph nodes
            row = int(parent_id[2:]) - 1
            col = int(child_id[2:]) - 1
            adj_matrix[row, col] = 1
    return adj_matrix

In [None]:
# Smoke test on a single Turkish sentence: parse its XML into the node/edge tables.
file = data_path + "dataset/xml/turkish/1.xml"
node2tag, parent2children, parent2childrenterminal, maxnode, sent = XML_processing(file)

In [None]:
# Build both graph inputs for the example sentence parsed above.
adj_matrix = adj_list_to_matrix(parent2children, maxnode)

feature_matrix = create_feature_matrix(sent, parent2childrenterminal, maxnode, node_embedding)
# Both matrices have maxnode rows.
feature_matrix.shape, adj_matrix.shape

((18, 768), (18, 18))

In [None]:
# Turkish gold labels, keyed by the 0-based row position in the CSV.
tr_irony = pd.read_csv(data_path+"dataset/raw/turkishirony.csv")
tr_dict = {int(row): int(label) for row, label in enumerate(tr_irony["label"])}

In [None]:
adj_all = []
feature_all = []
count = []  # irony label of each parsed sentence, aligned with adj_all / feature_all

# glob yields files in arbitrary, filesystem-dependent order. Iterate by numeric
# sentence id so the order of the collected samples (and therefore every later
# seeded split) is the same on each run.
turkish_files = sorted(glob.glob(data_path + "dataset/xml/turkish600/*.xml"),
                       key=lambda path: int(os.path.basename(path).split(".")[0]))
for file in turkish_files:
  node2tag, parent2children, parent2childrenterminal, maxnode, sent = XML_processing(file)
  # File "<k>.xml" holds sentence k (1-based); tr_dict is keyed by 0-based row.
  sent_id = int(os.path.basename(file).split(".")[0])
  count.append(tr_dict[sent_id-1])
  adj_matrix = adj_list_to_matrix(parent2children, maxnode)
  adj_all.append(adj_matrix)
  feature_matrix = create_feature_matrix(sent, parent2childrenterminal, maxnode, node_embedding)
  feature_all.append(feature_matrix)

In [None]:
# Number of Turkish sentences that were parsed into feature matrices.
len(feature_all)

220

In [None]:
def spanishEncode(label):
  """Map a label to the string "1" when it equals 1, otherwise to "0"."""
  return "1" if label == 1 else "0"

def frenchEncode(label):
  """Map the tag "figurative" to 1 and every other value to 0."""
  return 1 if label == "figurative" else 0

In [None]:
def read_spanish_dataset(filename):
    """Read the Spanish CSV and return (texts, labels).

    Texts come from the "MESSAGE" column and labels from the "IS_IRONIC" column,
    both as plain Python lists in file order.
    """
    frame = pd.read_csv(filename)
    labels = list(frame["IS_IRONIC"])
    texts = list(frame["MESSAGE"])
    return texts, labels

def read_french_dataset(filename):
    """Return the French labels, one per line of the tab-separated file.

    The last field of every line is encoded with `frenchEncode`. The file is read
    inside a `with` block so the handle is closed even if a line fails.
    NOTE(review): unlike the English and Italian readers no header line is skipped —
    confirm the French file has none.
    """
    with open(filename, "r", encoding="utf8") as french_data_file:
        return [frenchEncode(line.strip("\n").split("\t")[-1])
                for line in french_data_file]
      
def read_english_dataset(filename):
    """Return the English labels from a tab-separated file with a header line.

    The label is the integer in the second field of every data line. The file is
    read inside a `with` block so the handle is always closed; the text field is no
    longer touched because it was never returned.

    Raises:
        IndexError / ValueError: if a data line has no second field or it is not an int.
    """
    with open(filename, "r", encoding="utf8") as english_data_file:
        next(english_data_file, None)  # skip the header line
        return [int(line.strip("\n").split("\t")[1]) for line in english_data_file]

def read_italian_dataset(filename):
    """Return the Italian labels from a tab-separated file with a header line.

    The label is the integer in the third field of every data line. The file is
    read inside a `with` block so the handle is always closed.

    Raises:
        IndexError / ValueError: if a data line has no third field or it is not an int.
    """
    with open(filename, "r", encoding="utf8") as italian_data_file:
        next(italian_data_file, None)  # skip the header line
        return [int(line.strip("\n").split("\t")[2]) for line in italian_data_file]

In [None]:
# Load the gold labels of the four non-Turkish corpora (file order is preserved).
# Only the Spanish reader also returns the sentence texts.
english_labels = read_english_dataset(data_path + "dataset/raw/english.txt")
italian_label = read_italian_dataset(data_path + "dataset/raw/italian.csv")
french_label = read_french_dataset(data_path + "dataset/raw/french.csv")
spanish_sentences, spanish_labels = read_spanish_dataset(data_path + "dataset/raw/spanish.csv")

In [None]:
# Sanity check: list the distinct label values found in each corpus.
set(english_labels), set(italian_label), set(french_label),set(spanish_labels)

({0, 1}, {0, 1}, {0, 1}, {0, 1})

In [None]:
english_all = []
english_adj = []
# glob yields files in arbitrary order, yet english_all is later paired with
# english_labels purely by position. Iterate by numeric sentence id so position k holds
# the k-th smallest id and reruns are reproducible.
# NOTE(review): the pairing is only correct if the ids are 1..N without gaps and file
# "<k>.xml" corresponds to label row k-1 (the convention used for Turkish) — TODO confirm.
english_files = sorted(glob.glob(data_path + "dataset/xml/english/*.xml"),
                       key=lambda path: int(os.path.basename(path).split(".")[0]))
for file in english_files:
  node2tag, parent2children, parent2childrenterminal, maxnode, sent = XML_processing(file)
  sent_id = int(os.path.basename(file).split(".")[0])
  adj_matrix = adj_list_to_matrix(parent2children, maxnode)
  english_adj.append(adj_matrix)
  feature_matrix = create_feature_matrix(sent, parent2childrenterminal, maxnode, node_embedding)
  english_all.append(feature_matrix)

In [None]:
spanish_all = []
spanish_adj = []
# glob yields files in arbitrary order; iterate by numeric sentence id so the collected
# samples are in a reproducible order.
# NOTE(review): if these features are ever paired with spanish_labels by position, that
# only lines up when the ids are 1..N without gaps — TODO confirm.
spanish_files = sorted(glob.glob(data_path + "dataset/xml/spanish/*.xml"),
                       key=lambda path: int(os.path.basename(path).split(".")[0]))
for file in spanish_files:
  node2tag, parent2children, parent2childrenterminal, maxnode, sent = XML_processing(file)
  sent_id = int(os.path.basename(file).split(".")[0])
  adj_matrix = adj_list_to_matrix(parent2children, maxnode)
  spanish_adj.append(adj_matrix)
  feature_matrix = create_feature_matrix(sent, parent2childrenterminal, maxnode, node_embedding)
  spanish_all.append(feature_matrix)

KeyboardInterrupt: ignored

In [None]:
french_all = []
french_adj = []
# glob yields files in arbitrary order; iterate by numeric sentence id so the collected
# samples are in a reproducible order.
# NOTE(review): if these features are ever paired with french_label by position, that
# only lines up when the ids are 1..N without gaps — TODO confirm.
french_files = sorted(glob.glob(data_path + "dataset/xml/french/*.xml"),
                      key=lambda path: int(os.path.basename(path).split(".")[0]))
for file in french_files:
  node2tag, parent2children, parent2childrenterminal, maxnode, sent = XML_processing(file)
  sent_id = int(os.path.basename(file).split(".")[0])
  adj_matrix = adj_list_to_matrix(parent2children, maxnode)
  french_adj.append(adj_matrix)
  feature_matrix = create_feature_matrix(sent, parent2childrenterminal, maxnode, node_embedding)
  french_all.append(feature_matrix)

KeyboardInterrupt: ignored

In [None]:
italian_all = []
italian_adj = []
# glob yields files in arbitrary order; iterate by numeric sentence id so the collected
# samples are in a reproducible order.
# NOTE(review): if these features are ever paired with italian_label by position, that
# only lines up when the ids are 1..N without gaps — TODO confirm.
italian_files = sorted(glob.glob(data_path + "dataset/xml/italian/*.xml"),
                       key=lambda path: int(os.path.basename(path).split(".")[0]))
for file in italian_files:
  node2tag, parent2children, parent2childrenterminal, maxnode, sent = XML_processing(file)
  sent_id = int(os.path.basename(file).split(".")[0])
  adj_matrix = adj_list_to_matrix(parent2children, maxnode)
  italian_adj.append(adj_matrix)
  feature_matrix = create_feature_matrix(sent, parent2childrenterminal, maxnode, node_embedding)
  italian_all.append(feature_matrix)

In [None]:
# NOTE(review): `train_all` is not assigned in any cell visible in this notebook, so on
# a fresh kernel this line would raise NameError; the recorded output appears to come
# from earlier kernel state. TODO: define it here or remove this cell.
len(train_all)

3834

In [None]:
# Persist the Turkish graphs and labels to a MATLAB file.
# NOTE(review): the key is spelled 'adjencency'; any reader of the file must use the
# same spelling, so do not "fix" it here without updating the consumers.
io.savemat(data_path+'turkish_irony.mat', {'feature':feature_all,'adjencency':adj_all, "label":count})

In [None]:
# Reload the saved Turkish data.
# NOTE(review): 'turkish.mat' is not written by any visible cell — it must already exist
# on Drive. This also rebinds `tr_irony`, which earlier held the label DataFrame.
tr_irony = io.loadmat(data_path + 'turkish.mat')
tr_irony600 = io.loadmat(data_path + 'turkish_irony.mat')

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.autograd as autograd
from torch.utils.data import Dataset, DataLoader
import torch.nn.utils as utils
import torch.optim.lr_scheduler as lr_scheduler
from transformers import BertTokenizer, BertModel
from transformers import AutoModel, AutoTokenizer
from sklearn.utils import shuffle
import tensorflow as tf
import os
import sys
import math
import random
import xml.etree.ElementTree as ET
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, matthews_corrcoef, confusion_matrix, classification_report, f1_score, recall_score, precision_score, accuracy_score
from scipy.stats import pearsonr
from sklearn.model_selection import KFold
from nltk.tokenize import WordPunctTokenizer
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Model hyperparameters

RANDOM_SEED = 42            # seeds train_test_split and KFold
batch_size = 64
n_out = 2                   # number of output classes
epoch_size = 20             # number of training epochs
learning_rate = 0.0001
init_weight_decay = 0.2     # only referenced in commented-out optimizer arguments
init_clip_max_norm = 0.1    # not referenced by the visible training code (train() clips at 0.5)
filter_sizes=[3,4,5,6]      # Conv1d kernel widths
num_filters=200             # output channels per kernel width
dropout = 0.1

In [None]:
# Model dataset

class IronyDataset(Dataset):
  """Pairs one node-feature matrix with one integer label per sample.

  The matrices may have different numbers of rows, so they are kept as a list of
  arrays. Converting the whole collection with a single `np.array(...)` is a ragged
  conversion, which recent NumPy versions reject with a ValueError.

  Args:
    split: name of the split; accepted for call-site readability, not used.
    feature: sequence of 2-D feature matrices.
    label: sequence of labels, aligned with `feature` by position.
  """
  def __init__(self, split, feature, label):
    self.feature_array = [np.asarray(sample) for sample in feature]
    self.label_array = label

  def __len__(self):
    """Number of samples (taken from the features)."""
    return len(self.feature_array)

  def __getitem__(self, idx):
    """Return `(feature_matrix, int_label)` for sample `idx`."""
    selected_label = int(self.label_array[idx])
    selected_feature = self.feature_array[idx]

    return selected_feature, selected_label


In [None]:
def collate_fn(data):
  """Batch variable-length feature matrices by zero-padding them at the front.

  Samples are ordered longest first; every matrix is right-aligned inside a zero
  block whose length is that of the longest sample. The feature width is read from
  the data rather than fixed at 768, and the caller's list is left in its original
  order.

  Args:
    data: list of `(feature_matrix, label)` pairs.

  Returns:
    targets: float array of shape (batch, longest, feature_width).
    labels: tuple of labels in the same (longest-first) order as `targets`.
  """
  ordered = sorted(data, key=lambda sample: sample[0].shape[0], reverse=True)
  arrays, labels = zip(*ordered)
  longest = arrays[0].shape[0]
  feature_width = arrays[0].shape[1]
  targets = np.zeros([len(arrays), longest, feature_width])
  for i, array in enumerate(arrays):
    # leading rows stay zero, the sample occupies the trailing rows
    targets[i, longest - array.shape[0]:, :] = array
  return targets, labels


In [None]:
# Reload the saved Turkish .mat files (same two loads as in an earlier cell) and wrap
# the in-memory features/labels in a DataFrame.
# NOTE(review): the DataFrame is built from feature_all / count, not from the loaded
# .mat contents; 'turkish.mat' is not written by any visible cell.
tr_irony = io.loadmat(data_path + 'turkish.mat')
tr_irony600 = io.loadmat(data_path + 'turkish_irony.mat')
dataset = pd.DataFrame({'feature' : feature_all, 'label' : count})
#dataset600 = pd.DataFrame({'feature' : tr_irony600["feature"], 'label' : tr_irony600["label"]})


In [None]:
# Shape check: one feature entry and one label per sentence.
dataset["feature"].shape, np.array(dataset["label"]).shape

((220,), (220,))

In [None]:

# Monolingual setup: hold out 10% of the Turkish sentences for validation.
train_feature, valid_feature, train_label, valid_label = train_test_split(feature_all, count, test_size=0.1, random_state=RANDOM_SEED)
#len(train_feature), len(valid_feature)

# NOTE(review): the next cell rebinds dl_train / dl_val for the cross-lingual setup, so
# on a top-to-bottom run these loaders are replaced before training starts.
dl_train = DataLoader(IronyDataset("train", train_feature, train_label), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
dl_val= DataLoader(IronyDataset("val", valid_feature, valid_label), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

In [None]:
# cross_lingual: train on the English graphs, validate on the Turkish ones.
# (A stray `0` expression in front of this comment was removed.)
# NOTE(review): english_all and english_labels are matched by position only; verify that
# both are in the same sentence order and have the same length before trusting results.

dl_train = DataLoader(IronyDataset("train", english_all, english_labels), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
dl_val= DataLoader(IronyDataset("val", feature_all, count), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

In [None]:
class UCCA_CNN(nn.Module):
  """1-D convolutional classifier over a sequence of node embeddings.

  One Conv1d per kernel width runs over the sequence axis, each feature map is
  max-pooled over its whole length, the pooled vectors are concatenated and passed
  through two linear layers with dropout in front of each.

  Args:
    filter_sizes: kernel widths, one convolution per entry.
    num_filters: output channels of every convolution.
    embeding_dim: size of each input embedding (convolution input channels).
    n_out: number of output logits.
    dropout: dropout probability.
  """
  def __init__(self, filter_sizes=[3,4,5],num_filters=50, embeding_dim=768, n_out=2, dropout=0.5):
    super(UCCA_CNN, self).__init__()
    self.embedding_dim = embeding_dim
    # A convolution cannot run on a sequence shorter than its kernel; remember the
    # widest kernel so forward() can pad short batches.
    self.min_seq_len = max(filter_sizes, default=0)
    self.convs =nn.ModuleList([nn.Conv1d(in_channels=self.embedding_dim,
                                             out_channels=num_filters,
                                             kernel_size=filter_size, stride=1) for filter_size in filter_sizes])
    self.fc1 = nn.Linear(len(filter_sizes)*num_filters, 200)
    self.fc2 = nn.Linear(200, n_out)
    self.dropout = nn.Dropout(dropout)

  def forward(self, input_ids):
    """Return logits of shape (batch, n_out) for input (batch, seq_len, embedding_dim)."""
    # Conv1d expects (batch, channels, length)
    x = input_ids.transpose(1,2)

    # Zero-pad at the front (the side collate_fn pads on) when the batch is shorter
    # than the widest kernel; without this the convolution raises.
    missing = self.min_seq_len - x.shape[2]
    if missing > 0:
      x = F.pad(x, (missing, 0))

    x_conv_list = [conv(x) for conv in self.convs]

    # global max-pool: one value per filter
    x_pool_list = [F.max_pool1d(x_conv, kernel_size=x_conv.shape[2])
            for x_conv in x_conv_list]

    x_fc = torch.cat([x_pool.squeeze(dim=2) for x_pool in x_pool_list],
                         dim=1)

    logits = self.fc1(self.dropout(x_fc))
    logits = self.fc2(self.dropout(logits))
    return logits


In [None]:
# Instantiate the classifier with the notebook hyperparameters and move it to the GPU.
# The output size is written as the literal 2 here; the loss code reshapes with n_out.
cnn_model = UCCA_CNN(filter_sizes=filter_sizes,
                      num_filters=num_filters,
                      embeding_dim=768,
                      n_out=2,
                      dropout=dropout)
cnn_model.to(device)

UCCA_CNN(
  (convs): ModuleList(
    (0): Conv1d(768, 200, kernel_size=(3,), stride=(1,))
    (1): Conv1d(768, 200, kernel_size=(4,), stride=(1,))
    (2): Conv1d(768, 200, kernel_size=(5,), stride=(1,))
    (3): Conv1d(768, 200, kernel_size=(6,), stride=(1,))
  )
  (fc1): Linear(in_features=800, out_features=200, bias=True)
  (fc2): Linear(in_features=200, out_features=2, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
)

In [None]:
# Adam without weight decay (the weight_decay argument is commented out) and
# cross-entropy over the n_out logits; `criterion` is read as a global by train/evaluate.
optimizer = torch.optim.Adam(cnn_model.parameters(), lr=learning_rate) #, weight_decay=init_weight_decay)
criterion = nn.CrossEntropyLoss()

In [None]:
def save_checkpoint(state, location):
    """Write `state` to `<location>/best.pth.tar` with `torch.save`.

    The directory is created first if it does not exist, so the first checkpoint of
    a fresh project folder no longer fails. An existing file is overwritten.
    """
    os.makedirs(location, exist_ok=True)
    filepath = os.path.join(location, 'best.pth.tar')
    torch.save(state, filepath)

In [None]:
def train(train_dl, model, optimizer):
  """Run one training epoch and return the mean loss per batch.

  Uses the notebook-level `device`, `criterion` and `n_out`. Gradients are clipped
  to a total norm of 0.5 before each optimizer step. The input batch is no longer
  marked as requiring gradients: only the model parameters are optimized, so
  tracking gradients for the inputs was wasted work.

  Args:
    train_dl: iterable of `(features, labels)` batches as produced by `collate_fn`.
    model: network to train (switched to train mode here).
    optimizer: optimizer over the model parameters.
  """
  model.train()
  total_loss = 0.
  for tokens, label in train_dl:
    tokens = torch.FloatTensor(tokens).to(device)
    label = torch.LongTensor(label).to(device)
    optimizer.zero_grad()
    output = model(tokens)
    loss = criterion(output.view(-1, n_out), label)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
    optimizer.step()
    total_loss += loss.item()

  return total_loss/float(len(train_dl))

In [None]:
def evaluate(model, dl):
  """Score `model` on every batch of `dl` without gradient tracking.

  Uses the notebook-level `device`, `criterion` and `n_out`.

  Returns:
    A 4-tuple: macro-averaged F1, the loss summed over batches (not averaged),
    the gold labels, and the predicted labels (argmax over the logits).
  """
  model.eval()
  running_loss = 0
  gold = []
  predicted_all = []
  with torch.no_grad():
    for tokens, label in dl:
      tokens = torch.FloatTensor(tokens).to(device)
      label = torch.LongTensor(label).to(device)
      output = model(tokens)
      running_loss += criterion(output.view(-1, n_out), label).item()
      predicted_all.extend(torch.argmax(output, dim=1).data.cpu().numpy())
      gold.extend(label.data.cpu().numpy())
  macro_f1 = f1_score(gold, predicted_all, average='macro')
  return macro_f1, running_loss, gold, predicted_all

In [None]:
def train_and_evaluate(model, optimizer, train_dl, val_dl, test_dl=None, fold=0):
  """Train for `epoch_size` epochs and keep the best validation epoch.

  After every epoch the model is scored on `val_dl`; whenever the score (macro F1
  from `evaluate`, named "acc" in the log) improves, a checkpoint is written under
  `data_path + 'result/'` and that epoch's labels/predictions are remembered.
  The learning rate is multiplied by 0.95 after each epoch.
  `test_dl` and `fold` are accepted but not used.

  Returns:
    (best validation score, gold labels of that epoch, predictions of that epoch)
  """
  best_val_acc = -999.9
  best_label, best_prediction = [], []
  checkpoint_dir = data_path + 'result/'
  scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)
  for epoch in range(1, epoch_size+1):
    total_loss = train(train_dl, model, optimizer)
    val_acc, val_loss, label_list, prediction_list = evaluate(model, val_dl)
    print("Epoch = ", epoch, " train loss = ", total_loss, " val_acc = ", val_acc)
    if val_acc > best_val_acc:
      snapshot = {'epoch': epoch,
                  'state_dict': model.state_dict(),
                  'optim_dict': optimizer.state_dict()}
      save_checkpoint(snapshot, location=checkpoint_dir)
      best_val_acc, best_label, best_prediction = val_acc, label_list, prediction_list
    scheduler.step()
  print("Best Val acc = ", best_val_acc)
  return best_val_acc, best_label, best_prediction


In [None]:
# Train on dl_train and validate on dl_val; keeps the score, labels and predictions of
# the best validation epoch.
best_val_acc, best_label, best_prediction = train_and_evaluate(cnn_model, optimizer, dl_train, dl_val)

Epoch =  1  train loss =  0.8328107208013534  val_acc =  0.4768253968253968
Epoch =  2  train loss =  0.6120959823330243  val_acc =  0.40670153436110884
Epoch =  3  train loss =  0.4654809206724167  val_acc =  0.4139431201236832
Epoch =  4  train loss =  0.31227724875013035  val_acc =  0.43829432599486073
Epoch =  5  train loss =  0.2081796944141388  val_acc =  0.4499938888209869
Epoch =  6  train loss =  0.13287362692256768  val_acc =  0.4360400444938821
Epoch =  7  train loss =  0.1001694181933999  val_acc =  0.42790693726498774
Epoch =  8  train loss =  0.06680742061386506  val_acc =  0.4419381787802841
Epoch =  9  train loss =  0.05313681435460846  val_acc =  0.4108993039326706
Epoch =  10  train loss =  0.040753544013326364  val_acc =  0.43557296963054354
Epoch =  11  train loss =  0.037973654363304375  val_acc =  0.46424774998228335
Epoch =  12  train loss =  0.029272424957404532  val_acc =  0.44948446513975066
Epoch =  13  train loss =  0.02417868219781667  val_acc =  0.44499845

In [None]:
# F1, recall, precision and accuracy of the best validation epoch. These calls use the
# scorers' default averaging, whereas evaluate() reports macro-averaged F1.
f1_score(best_label, best_prediction), recall_score(best_label, best_prediction), precision_score(best_label, best_prediction),accuracy_score(best_label, best_prediction)

(0.4114285714285714, 0.36, 0.48, 0.485)

In [None]:
def get_eval_report(labels, preds):
  """Return the Matthews correlation and the confusion-matrix counts as a dict.

  The confusion matrix is requested for the fixed label set [0, 1] (the label values
  used throughout this notebook), so it is always 2x2. Without that, a fold in which
  only one class occurs yields a 1x1 matrix and the four-way unpacking fails.

  Args:
    labels: gold labels, each 0 or 1.
    preds: predicted labels, each 0 or 1.

  Returns:
    dict with keys "mcc", "tp", "tn", "fp", "fn".
  """
  mcc = matthews_corrcoef(labels, preds)
  tn, fp, fn, tp = confusion_matrix(labels, preds, labels=[0, 1]).ravel()
  return {
              "mcc": mcc,
              "tp": tp,
              "tn": tn,
              "fp": fp,
              "fn": fn
          }

In [None]:
def train_and_evaluate_fold():
  """Run 10-fold cross-validation on the Turkish data and pool the predictions.

  For every fold a fresh UCCA_CNN and Adam optimizer are created, trained with
  `train_and_evaluate`, and the labels/predictions of that fold's best epoch are
  appended to the pooled lists. The mean of the per-fold best scores is printed.

  Reads the notebook-level `feature_all`, `count`, the hyperparameters and `device`.
  The fold index is now passed by keyword; positionally it landed in the `test_dl`
  parameter of `train_and_evaluate`.

  Returns:
    (label_all, prediction_all): gold labels and predictions pooled over all folds.
  """
  k_folds = 10
  best_accuracy = []
  label_all = []
  prediction_all = []

  # Set fixed random number seed
  torch.manual_seed(42)

  # Define the K-fold Cross Validator
  kfold = KFold(n_splits=k_folds, random_state=RANDOM_SEED, shuffle=True)
  # Start print
  print('--------------------------------')

  # K-fold Cross Validation model evaluation
  dataset = pd.DataFrame({'feature' : feature_all, 'label' : count})
  for fold, (train_idx, val_idx) in enumerate(kfold.split(dataset)):
    # fresh model and optimizer so no weights leak between folds
    cnn_model = UCCA_CNN(filter_sizes=filter_sizes,
                      num_filters=num_filters,
                      embeding_dim=768,
                      n_out=n_out,
                      dropout=dropout).to(device)
    optimizer = torch.optim.Adam(cnn_model.parameters(), lr=learning_rate) #, weight_decay=init_weight_decay)

    train_df = dataset.iloc[train_idx]
    valid_df = dataset.iloc[val_idx]
    print(fold)
    dl_train = DataLoader(IronyDataset("train", train_df["feature"].to_numpy(), list(train_df["label"])), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    dl_val= DataLoader(IronyDataset("val", valid_df["feature"].to_numpy(), list(valid_df["label"])), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    best_a, label_list, prediction_list = train_and_evaluate(cnn_model, optimizer, dl_train, dl_val, fold=fold)
    best_accuracy.append(best_a)
    label_all.extend(label_list)
    prediction_all.extend(prediction_list)
  print(np.mean(best_accuracy))
  return label_all, prediction_all



In [None]:
# Run the 10-fold cross-validation; returns labels and predictions pooled over folds.
label_all, prediction_all = train_and_evaluate_fold()

--------------------------------
0
Best Val acc =  0.8166666666666667
1
Best Val acc =  1.0
2
Best Val acc =  0.9083333333333333
3
Best Val acc =  0.9536842105263159
4
Best Val acc =  0.8562091503267975
5
Best Val acc =  0.8482758620689654
6
Best Val acc =  0.725
7
Best Val acc =  0.9536842105263159
8
Best Val acc =  0.9090909090909091
9
Best Val acc =  0.905982905982906
0.887692724852221


In [None]:
# F1, recall, precision and accuracy over the predictions pooled from every fold's
# best epoch (scorers' default averaging).
f1_score(label_all, prediction_all), recall_score(label_all, prediction_all), precision_score(label_all, prediction_all),accuracy_score(label_all, prediction_all)

(0.8956521739130435,
 0.9363636363636364,
 0.8583333333333333,
 0.8909090909090909)