In [1]:
!pip install transformers
!pip install torch-scatter
!pip install torch-sparse
!pip install torch-geometric

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 14.1 MB/s 
Collecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 68.8 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 75.2 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 6.9 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 74.3 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    F

In [2]:
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel, BertTokenizer, BertModel
from google.colab import drive
import tensorflow as tf
import os
import sys
import xml.etree.ElementTree as ET
import glob
from scipy import io

In [3]:
#define device for deep learning
CUDA_LAUNCH_BLOCKING=1

device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
    device = torch.device("cuda")
    print('GPU:', torch.cuda.get_device_name(0))
else:
    raise SystemError('GPU device not found')

GPU: Tesla T4


In [4]:
# set environment as googledrive to folder "resource"
data_path =  "/Colab Notebooks/"

try:
    drive.mount('/content/drive')
    data_path = "/content/drive/MyDrive/Colab Notebooks/UCCA-CNN/"

except:
    print("You are not working in Colab at the moment :(")

Mounted at /content/drive


In [5]:
# parameters

seed = 42

np.random.seed(seed)
node_embedding = np.random.uniform(low=0, high=1, size=(768,))

In [6]:
# Use last four layers by default
layers = [-4, -3, -2, -1]
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
model = AutoModel.from_pretrained("bert-base-multilingual-cased", output_hidden_states=True)

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/972k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.87M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/681M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
def get_word_idx(sent: str, word: str):
     return sent.split(" ").index(word)
 
 
def get_hidden_states(encoded, token_ids_word, model, layers):
     """Push input IDs through model. Stack and sum `layers` (last four by default).
        Select only those subword token outputs that belong to our word of interest
        and average them."""
     with torch.no_grad():
         output = model(**encoded)
 
     # Get all hidden states
     states = output.hidden_states
     # Stack and sum all requested layers
     output = torch.stack([states[i] for i in layers]).sum(0).squeeze()
     # Only select the tokens that constitute the requested word
     word_tokens_output = output[token_ids_word]
 
     return word_tokens_output.mean(dim=0)
 
 
def get_word_vector(sent, idx, tokenizer, model, layers):
     """Get a word vector by first tokenizing the input sentence, getting all token idxs
        that make up the word of interest, and then `get_hidden_states`."""
     encoded = tokenizer.encode_plus(sent, return_tensors="pt")
     # get all token idxs that belong to the word of interest
     token_ids_word = np.where(np.array(encoded.word_ids()) == idx)
 
     return get_hidden_states(encoded, token_ids_word, model, layers)

In [8]:
def XML_processing(file):
  node2tag = {} # dict[node] = 'word/tag'
  parent2children = {} # dict[parent] = [children]
  parent2childrenterminal = {}
  maxnoode = 0
  sent = '' # sentence in a string
  tree = ET.parse(file)
  root = tree.getroot()
  for layer in root.iter('layer'):
    if layer.attrib['layerID'] == '0':
      for node in layer.iter('node'):
        for attribute in node.iter('attributes'):
          node2tag[node.attrib['ID']] = attribute.attrib['text']
          sent += attribute.attrib['text'] + ' '
    else:
      for node in layer.iter('node'):
        e = []
        for edge in node.iter('edge'):
          if edge.attrib['toID'].startswith("1"):
              if int(edge.attrib['toID'][2::]) > maxnoode:
                  maxnoode = int(edge.attrib['toID'][2::])
              e.append(edge.attrib['toID']) 
              parent2children[node.attrib['ID']] = e
          else:
              e.append(edge.attrib['toID']) 
              parent2childrenterminal[node.attrib['ID']] = e              
          if edge.attrib['type'] != 'Terminal':
            node2tag[edge.attrib['toID']] = edge.attrib['type']
  return node2tag, parent2children, parent2childrenterminal, maxnoode, sent[0:-1]

In [9]:
def create_feature_matrix(sent, parent2childrenterminal, maxnoode, node_embedding):

  #feature_matrix = []
  feature_matrix = np.zeros((maxnoode,768))
  words = sent.split()
  #word_embedding = node_embedding
  for index in range(maxnoode):
    key = "1."+str(index+1)
    if key in parent2childrenterminal.keys():
      value = int(parent2childrenterminal[key][0][2::])-1
      idx = get_word_idx(sent, words[value])

      word_embedding = get_word_vector(sent, idx, tokenizer, model, layers)
    else:
      word_embedding = node_embedding
    #feature_matrix.append(word_embedding)
    feature_matrix[index,:] = word_embedding
  return feature_matrix

def adj_list_to_matrix(adj_list, n):
    adj_matrix = np.zeros((n,n))
    np.fill_diagonal(adj_matrix,0)
    for i in adj_list:
        for j in adj_list[i]:
            adj_matrix[int(i[2::])-1,int(j[2::])-1] = 1
    return adj_matrix

In [None]:
file = data_path + "dataset/xml/turkish/1.xml"
node2tag, parent2children, parent2childrenterminal, maxnode, sent = XML_processing(file)

In [None]:
adj_matrix = adj_list_to_matrix(parent2children, maxnode)

feature_matrix = create_feature_matrix(sent, parent2childrenterminal, maxnode, node_embedding)
feature_matrix.shape, adj_matrix.shape

((18, 768), (18, 18))

In [411]:
tr_irony = pd.read_csv(data_path+"dataset/raw/turkish.csv")
tr_dict = {}
for item, label in enumerate(tr_irony["label"]):
  tr_dict[int(item)] = int(label)

In [412]:
adj_all = []
feature_all = []
count = []
for file in glob.glob(data_path + "dataset/xml/turkish/*.xml"):
  node2tag, parent2children, parent2childrenterminal, maxnode, sent = XML_processing(file)
  file_name = file.split("/",-1)
  sent_id = int(file_name[-1].split(".")[0])
  count.append(tr_dict[int(sent_id)-1])
  adj_matrix = adj_list_to_matrix(parent2children, maxnode)
  adj_all.append(adj_matrix)
  feature_matrix = create_feature_matrix(sent, parent2childrenterminal, maxnode, node_embedding)
  feature_all.append(feature_matrix)

In [None]:
len(count)

600

In [None]:
io.savemat(data_path+'turkish_irony.mat', {'feature':feature_all,'adjencency':adj_all, "label":count})

In [None]:
tr_irony = io.loadmat(data_path + 'turkish.mat')
tr_irony600 = io.loadmat(data_path + 'turkish_irony.mat')

In [141]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.autograd as autograd
from torch.utils.data import Dataset, DataLoader
import torch.nn.utils as utils
import torch.optim.lr_scheduler as lr_scheduler
from torch.nn import Parameter
from transformers import BertTokenizer, BertModel
from transformers import AutoModel, AutoTokenizer
from sklearn.utils import shuffle
import tensorflow as tf
import os
import sys
import math
import random
import xml.etree.ElementTree as ET
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, matthews_corrcoef, confusion_matrix, classification_report, f1_score, recall_score, precision_score, accuracy_score
from scipy.stats import pearsonr
from sklearn.model_selection import KFold
from nltk.tokenize import WordPunctTokenizer
from torch_geometric.nn import GCNConv, GATConv, GATv2Conv
import warnings
warnings.filterwarnings("ignore")

In [413]:
# Model hyperparameters

RANDOM_SEED = 42
batch_size = 2
n_out = 2
epoch_size = 50
learning_rate = 0.005
init_weight_decay = 0.2
init_clip_max_norm = 0.1
nhid=200
dropout = 0.1

In [13]:
# Model dataset

class IronyDataset(Dataset):
  def __init__(self, split, feature, adj, label):
    self.feature_array = np.array(feature)
    self.adj_array = np.array(adj)
    self.label_array = label

    #print("len ", len(self.feature_array), " ", len(self.label_array))

  def __len__(self):
    return len(self.feature_array)

  def __getitem__(self, idx):
    selected_label = int(self.label_array[idx])
    selected_feature = self.feature_array[idx]
    selected_adj = self.adj_array[idx]

    return selected_feature, selected_adj, selected_label


In [326]:
def collate_fn(data):
  data.sort(key=lambda x: (x[0].shape[0]), reverse=True)
  arrays, adjs, labels = zip(*data)
  lengths = [(array.shape[0]) for array in arrays]
  longest = max(lengths)
  targets = np.zeros([len(arrays), max(lengths), 768])
  targets_adj = np.zeros([len(arrays), max(lengths), max(lengths)])
  for i, cap in enumerate(arrays):
    end = lengths[i]
    array = arrays[i]
    adj = adjs[i]
    new_adj = np.pad(adj, [((longest - end),0),(0,(longest - end))], mode='constant')
    new_array = np.pad(array, [((longest - end),0),(0,0)], mode='constant')
    targets[i,:,:] = new_array
    targets_adj[i,:,:] = new_adj
  return targets, targets_adj, labels


In [None]:
tr_irony = io.loadmat(data_path + 'turkish.mat')
tr_irony600 = io.loadmat(data_path + 'turkish_irony.mat')
dataset = pd.DataFrame({'feature' : feature_all, 'label' : count})
#dataset600 = pd.DataFrame({'feature' : tr_irony600["feature"], 'label' : tr_irony600["label"]})


In [None]:
dataset["feature"].shape, np.array(dataset["label"]).shape

((220,), (220,))

In [None]:

train_feature, valid_feature, train_label, valid_label = train_test_split(feature_all, count, test_size=0.1, random_state=RANDOM_SEED)
#len(train_feature), len(valid_feature)

dl_train = DataLoader(IronyDataset("train", train_feature, train_label), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
dl_val= DataLoader(IronyDataset("val", valid_feature, valid_label), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

len  198   198
len  22   22


In [345]:
class GraphConvolution(nn.Module):
    """
    Class to implement Graph Convlution NN
    """

    def __init__(self, in_features, out_features, bias=True):
        r""" Define the constructor. Here we need to specify the number 
        of input and output features 
        
        Parameters
        ----------
        in_features: int, number of nodes for a graph 
        out_features: int, number of targets 
        bias: bool, whether we want to have the bias term or not 
        """
        # define the super constructor
        super(GraphConvolution, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        # parameterise weights
        self.weight = Parameter(torch.FloatTensor(in_features, out_features))
        if bias:
            self.bias = Parameter(torch.FloatTensor(out_features))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self):
        r""" Reset of parameters. Parameterized weights with auniform distribution"""
        
        stdv = 1. / math.sqrt(self.weight.size(1))
        self.weight.data.uniform_(-stdv, stdv)
        if self.bias is not None:
            self.bias.data.uniform_(-stdv, stdv)

    def forward(self, input, adj):
        r""" Define the forward step. Here we perform a matrix multiplication 
        between input graph and weigths. 
        The output is computed as a sparse matrix multiplication between the adjacency 
        matrix and the support result.
        
        Parameters
        ----------
        input: matrix/tensor input graph matrix 
        adj: self connected adjacency matrix from graph
        """
        support = torch.matmul(input, self.weight)
        output = torch.matmul(adj, support)
        if self.bias is not None:
            return output + self.bias
        else:
            return output


class UCCA_GCN(nn.Module):
    r""" Define the main GCN model"""
    def __init__(self, nfeat, nhid, nclass, dropout):
        r""" The model take the input feature dimension, the hidden dimension size
        the number of classes and possible dropout probability
        
        Parameter
        ---------
        nfeat: int, number of input features
        nhid: int, number of features for the hidden layer 
        nclass: int, number of target classes
        dropout: float, dropout percentage
        """
        super(UCCA_GCN, self).__init__()

        self.gc1 = GraphConvolution(nfeat, nhid)
        self.gc2 = GraphConvolution(nhid, nclass)
        self.dropout = dropout

    def forward(self, x, adj):
        r""" Define the forward step
        
        Parameters
        -----------
        x: array/tensor input features 
        adj: array/tensor self term adjacency matrix 
        """
        x = F.relu(self.gc1(x, adj))
        x = F.dropout(x, self.dropout, training=self.training)
        x = self.gc2(x, adj)
        x= torch.mean(x, 1)
        return x
        #return F.log_softmax(x, dim=1)

In [289]:
class UCCA_GAT(torch.nn.Module):
  def __init__(self, in_channels, hidden1, hidden2, heads,out_channels):
    super(UCCA_GAT, self).__init__()
    self.conv1 = GATv2Conv(in_channels, hidden1, heads= heads, concat=True, add_self_loops= True)
    self.conv2 = GATv2Conv(hidden1*heads, hidden2, heads= heads, concat=True, add_self_loops= True)
    self.conv3 = GATv2Conv(hidden2*heads, out_channels, heads= 1, concat=False, add_self_loops= False)

  def forward(self, feature, adj, mask):
    #feature = feature.transpose(1,2)
    feature,adj= torch.squeeze(feature), torch.squeeze(adj)
    #feature = torch.matmul(feature, mask)
    print(feature.shape, adj.shape)
    output = self.conv1(feature, adj)
    output = self.conv2(output, adj)
    output = self.conv3(output, adj)
    return output

In [110]:
gat_model = UCCA_GCN(nfeat=768, 
                     nhid=nhid, 
                     nclass=n_out, 
                    dropout=dropout)
gat_model.to(device)

UCCA_GCN(
  (gc1): GraphConvolution (768 -> 100)
  (gc2): GraphConvolution (100 -> 2)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc1): Linear(in_features=768, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=2, bias=True)
)

In [21]:
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) #, weight_decay=init_weight_decay)
criterion = nn.CrossEntropyLoss()

In [22]:
def save_checkpoint(state, location):
	filepath = os.path.join(location, 'best.pth.tar')
	torch.save(state, filepath)

In [331]:
def train(train_dl, model, optimizer):
  model.train()
  total_loss = 0.
  for batch in train_dl:
    feature, adj, label = batch
    feature, adj, label = torch.FloatTensor(feature), torch.FloatTensor(adj), torch.LongTensor(label)
    feature, adj, label = feature.to(device), adj.to(device), label.to(device)
    optimizer.zero_grad()
    output = model(feature, adj)
    loss = criterion(output.view(-1, n_out), label)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
    optimizer.step()
    total_loss += loss.item()

  return total_loss/float(len(train_dl))

In [397]:
def evaluate(model, dl):
  total_loss = 0
  prediction_list = []
  label_list = []
  model.eval()
  with torch.no_grad():
    for batch in dl:		
      feature, adj, label = batch
      feature, adj, label = torch.FloatTensor(feature), torch.FloatTensor(adj), torch.LongTensor(label)
      feature, adj, label = feature.to(device), adj.to(device), label.to(device)
      output = model(feature, adj)
      loss = criterion(output.view(-1, n_out), label)
      total_loss += loss.item()
      predicted = torch.argmax(output, dim=1)
      prediction_list.extend(predicted.data.cpu().numpy())
      label_list.extend(label.data.cpu().numpy())
  return f1_score(label_list, prediction_list, average='macro'), total_loss, label_list, prediction_list

In [404]:
def train_and_evaluate(model, optimizer, train_dl, val_dl, test_dl=None, fold=0):
  best_val_acc = -999.9
  r_test_acc = -999.0
  scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)
  label_best = []
  prediction_best = []
  for epoch in range(1, epoch_size+1):
    total_loss = train(train_dl, model, optimizer)
    val_acc, val_loss, label_list, prediction_list = evaluate(model, val_dl)
    #test_acc, test_loss = evaluate(model, test_dl)
    #print("Epoch = ", epoch, " train loss = ", total_loss, " val_acc = ", val_acc) #, " test_acc = ", test_acc)
    if val_acc > best_val_acc:
      save_checkpoint({'epoch': epoch , 'state_dict': model.state_dict(), 'optim_dict': optimizer.state_dict()}, location=data_path + 'result/')
      best_val_acc = val_acc
      #r_test_acc = test_acc
      label_best = label_list
      prediction_best = prediction_list
    scheduler.step()
  print("Best Val acc = ", best_val_acc) #, " Test Acc = ", r_test_acc)
  return best_val_acc,label_best, prediction_best


In [354]:
train_and_evaluate(gcn_model, optimizer, dl_train, dl_val)

NameError: ignored

In [394]:
def get_eval_report(labels, preds):
  mcc = matthews_corrcoef(labels, preds)
  tn, fp, fn, tp = confusion_matrix(labels, preds).ravel()
  return {
              "mcc": mcc,
              "tp": tp,
              "tn": tn,
              "fp": fp,
              "fn": fn
          }

In [401]:
def train_and_evaluate_fold():
  label_all = []
  prediction_all = []
  best_accuracy = []


  #scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)
  k_folds = 10

  results = {}

  # Set fixed random number seed
  torch.manual_seed(42)

  # Define the K-fold Cross Validator
  kfold = KFold(n_splits=k_folds, random_state=RANDOM_SEED, shuffle=True)
  # Start print
  print('--------------------------------')

  # K-fold Cross Validation model evaluation
  dataset = pd.DataFrame({'feature' : feature_all, 'adj' : adj_all,'label' : count})
  #dataset = shuffle(dataset)
  #print(dataset)
  for fold, (train_idx, val_idx) in enumerate(kfold.split(dataset)):
    gcn_model = UCCA_GCN(nfeat=768, 
                         nhid=nhid,
                         nclass=n_out,
                         dropout=dropout).to(device)
    optimizer = torch.optim.Adam(gcn_model.parameters(), lr=learning_rate) #, weight_decay=init_weight_decay)

    train_df = dataset.iloc[train_idx]

    valid_df = dataset.iloc[val_idx]
    print(fold)
    dl_train = DataLoader(IronyDataset("train", train_df["feature"].to_numpy(), train_df["adj"].to_numpy(), list(train_df["label"])), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    dl_val= DataLoader(IronyDataset("val", valid_df["feature"].to_numpy(), valid_df["adj"].to_numpy(), list(valid_df["label"])), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    best_a,label_list, prediction_list = train_and_evaluate(gcn_model, optimizer, dl_train, dl_val, fold)
    best_accuracy.append(best_a)
    label_all.extend(label_list)
    prediction_all.extend(prediction_list)
  
  print(np.mean(best_accuracy))
  return label_all, prediction_all

In [414]:
label_all, prediction_all = train_and_evaluate_fold()

--------------------------------
0
Best Val acc =  0.9083333333333333
1
Best Val acc =  1.0
2
Best Val acc =  0.9536842105263159
3
Best Val acc =  0.8633540372670807
4
Best Val acc =  0.8610526315789473
5
Best Val acc =  0.9017857142857144
6
Best Val acc =  0.7272727272727272
7
Best Val acc =  0.9536842105263159
8
Best Val acc =  0.9090909090909091
9
Best Val acc =  0.905982905982906
0.898424067986425


In [415]:
f1_score(label_all, prediction_all), recall_score(label_all, prediction_all), precision_score(label_all, prediction_all),accuracy_score(label_all, prediction_all)

(0.9017857142857142, 0.9181818181818182, 0.8859649122807017, 0.9)