In [1]:
import os
from os.path import dirname

root_path = dirname(dirname(os.getcwd()))
print(root_path)

import pandas as pd
import numpy as np
import time, datetime
import pickle as pkl
import copy

import torch
import torch.nn as nn
import torch.optim as optim
from torch_geometric.nn import GatedGraphConv, global_mean_pool
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.utils.data import Subset
from sklearn.model_selection import train_test_split


import warnings

warnings.filterwarnings("ignore")

data_dir = root_path + "/DuongNA/1_Data/"
project_dir = root_path + "/DuongNA/"

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

/home/sebdis/ProcessMining/Next_Activity_GNN




cuda:0


In [2]:
import random

torch.manual_seed(0)
torch.cuda.manual_seed(0)
random.seed(0)
np.random.seed(0)

In [4]:
data_name = "Helpdesk"
# data_name = "env_permit"
# data_name = "BPI_Challenge_2012_A"
# data_name = "BPI_Challenge_2012_O"
# data_name = "BPI_Challenge_2012_W_Complete"
# data_name = "BPI_Challenge_2013_closed_problems"
# data_name = "BPI_Challenge_2013_incidents"

In [5]:
tab_all = pd.read_csv(data_dir + data_name + "_processed_all.csv")
tab_train = pd.read_csv(data_dir + data_name + "_processed_train.csv")
tab_valid = pd.read_csv(data_dir + data_name + "_processed_valid.csv")
tab_test = pd.read_csv(data_dir + data_name + "_processed_test.csv")
tab_all.head()

Unnamed: 0,Case_ID,Activity,timestamp
0,Case 1,Assign seriousness,2012/10/09 14:50:17
1,Case 1,Take in charge ticket,2012/10/09 14:51:01
2,Case 1,Take in charge ticket,2012/10/12 15:02:56
3,Case 1,Resolve ticket,2012/10/25 11:54:26
4,Case 1,Closed,2012/11/09 12:54:39


In [6]:
list_activities = list(tab_all["Activity"].unique())

In [7]:
with open(data_dir + "GGNN_" + data_name + "_NextActivity_train.pkl", "rb") as f:
    X_train, Y_train = pkl.load(f)
with open(data_dir + "GGNN_" + data_name + "_NextActivity_valid.pkl", "rb") as f:
    X_valid, Y_valid = pkl.load(f)
with open(data_dir + "GGNN_" + data_name + "_NextActivity_test.pkl", "rb") as f:
    X_test, Y_test = pkl.load(f)

In [8]:
class EventLogData(Dataset):
    def __init__(self, input_x, output):
        self.X = input_x[0]
        self.A = input_x[1]
        self.V = input_x[2]
        self.y = output
        self.y = self.y.to(torch.float32)
        # self.y = self.y.reshape((len(self.y),1))

    # get the number of rows in the dataset
    def __len__(self):
        return len(self.X)

    # get a row at a particular index in the dataset
    def __getitem__(self, idx):
        return [[self.X[idx], self.A[idx], self.V[idx]], self.y[idx]]

    # get the indices for the train and test rows
    def get_splits(self, n_valid=0.2):
        train_idx, valid_idx = train_test_split(
            list(range(len(self.X))), test_size=n_valid, shuffle=True
        )
        train = Subset(self, train_idx)
        valid = Subset(self, valid_idx)
        return train, valid


def my_collate(batch):
    data = [item[0] for item in batch]
    Y = [item[1] for item in batch]
    return [data, Y]

In [9]:
valid_loader = DataLoader(
    EventLogData(X_valid, Y_valid),
    batch_size=len(X_valid[0]),
    shuffle=False,
    collate_fn=my_collate,
)
test_loader = DataLoader(
    EventLogData(X_test, Y_test), batch_size=1, shuffle=False, collate_fn=my_collate
)

In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [11]:
# A Class to keep track of the metrics of the classification process
class ClassificationMetrics:

    # Constructor takes the number of classes, in our case 20
    def __init__(self, num_classes=20):
        self.num_classes = num_classes
        # Initialize a confusion matrix
        self.C = torch.zeros(num_classes, num_classes)

    # Update the confusion matrix with the new scores
    def add(self, yp, yt):
        # yp: 1D tensor with predictions
        # yt: 1D tensor with ground-truth targets
        yp = yp.to("cpu")
        yt = yt.to("cpu")
        with torch.no_grad():  # We require no computation graph
            self.C += (
                (yt * self.C.shape[1] + yp)
                .bincount(minlength=self.C.numel())
                .view(self.C.shape)
                .float()
            )

    def clear(self):
        # We set the confusion matrix to zero
        self.C.zero_()

    # Computes the global accuracy
    def acc(self):
        return self.C.diag().sum().item() / self.C.sum()

    # Computes the class-averaged accuracy
    def mAcc(self):
        return (self.C.diag() / self.C.sum(-1)).mean().item()

    # Computers the class-averaged Intersection over Union
    def mIoU(self):
        return (
            (self.C.diag() / (self.C.sum(0) + self.C.sum(1) - self.C.diag()))
            .mean()
            .item()
        )

    # Returns the confusion matrix
    def confusion_matrix(self):
        return self.C

In [12]:
from torch_geometric.nn.models import GAT

In [13]:
dtype = torch.double


# Creating the model class
class GAT_model(nn.Module):
    def __init__(self, ggnn_dim, num_layers, droppout_prob, list_activities):
        super(GAT_model, self).__init__()
        self.ggnn_dim = ggnn_dim
        self.num_layers = num_layers
        self.droppout_prob = droppout_prob

        # self.ggnn = GatedGraphConv(self.ggnn_dim, num_layers=self.num_layers)
        self.ggnn = GAT(
            in_channels=-1,
            out_channels=self.ggnn_dim,
            num_layers=self.num_layers,
            hidden_channels=10,
        )
        self.fc = nn.Sequential(
            nn.Dropout(p=self.droppout_prob),
            nn.Linear(self.ggnn_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(p=self.droppout_prob),
            nn.Linear(64, len(list_activities)),
        )

    # Progresses data across layers
    def forward(self, x):
        x = [self.ggnn(X, A.to(torch.long), V) for i, (X, A, V) in enumerate(x)]
        x = torch.stack([global_mean_pool(single_x, batch=None) for single_x in x])
        x = x.squeeze(1)
        out = self.fc(x)
        return out

In [14]:
batch_size = 256
ggnn_dim = 256
num_layers = 4
lr_value = 0.01
droppout_prob = 0.3
hidden_channels = 4

train_loader = DataLoader(
    EventLogData(X_train, Y_train),
    batch_size=batch_size,
    shuffle=True,
    collate_fn=my_collate,
)

In [15]:
model_name = "_model_GAT"

In [16]:
from tqdm import tqdm_notebook as tqdm

save_folder = project_dir + "/5_Output_files/Next_Activity/" + data_name + model_name

if not os.path.exists(save_folder):
    os.mkdir(save_folder)

In [17]:
loaders = {"train": train_loader, "validation": valid_loader, "test": test_loader}

In [18]:
from copy import deepcopy

num_epochs = 2
best_accuracy = 0
early_stop_patience = 10

best_model = None

num_runs = 1
running_time = []

metric_tracker = ClassificationMetrics(num_classes=len(list_activities))

for run in range(num_runs):

    start = datetime.datetime.now()
    print("Run: {}".format(run + 1))

    model = GAT_model(ggnn_dim, num_layers, droppout_prob, list_activities)

    criterion = nn.CrossEntropyLoss()

    optimizer = torch.optim.Adam(model.parameters(), lr=lr_value)

    model = model.to(device)

    not_improved_count = 0

    for epoch in range(num_epochs):
        print(
            "\n-- EPOCH {}/{} -------------------------\n".format(epoch + 1, num_epochs)
        )
        torch.cuda.empty_cache()

        for state in ["train", "validation"]:
            if state == "train":
                model.train()
                metric_tracker.clear()

            else:

                print("\tTRAIN | acc: {:.4f}".format(metric_tracker.acc()))

                metric_tracker.clear()
                model.eval()

            running_loss = 0.0
            running_corrects = 0

            for i, (x, y) in enumerate(loaders[state]):
                x = [[sub_item.to(device=device) for sub_item in item] for item in x]

                y = torch.tensor([torch.max(yi, 0)[1] for yi in y])

                y = y.to(device)

                outputs = model(x)

                outputs = outputs.to(device)

                loss = criterion(outputs, y)

                if state == "train":
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                _, preds = torch.max(outputs, 1)
                preds = preds.to(device)

                metric_tracker.add(preds, y)

        print("\tEVAL  | acc: {:.4f} \n".format(metric_tracker.acc()))

        if epoch == 0:  # HERE WE KEEP BEST AUC VALUE
            best_accuracy = metric_tracker.acc()
            best_model = deepcopy(model)
            torch.save(
                model.state_dict(),
                "{}/best_model_run_{}.pt".format(save_folder, run + 1),
            )
        else:
            if metric_tracker.acc() > best_accuracy:
                print("SAVING MODEL..............\n")
                torch.save(
                    model.state_dict(),
                    "{}/best_model_run_{}.pt".format(save_folder, run + 1),
                )
                best_model = deepcopy(model)
                best_accuracy = metric_tracker.acc()
                not_improved_count = 0
            else:
                not_improved_count += 1

        if not_improved_count == early_stop_patience:
            print(
                "Validation performance didn't improve for {} epochs. "
                "Training stops.".format(early_stop_patience)
            )
            break

    running_time.append((datetime.datetime.now() - start).total_seconds())

Run: 1

-- EPOCH 1/2 -------------------------

	TRAIN | acc: 0.3457 | mAcc: 0.0000 | mIoU: 0.0000
	EVAL  | acc: 0.4388 


-- EPOCH 2/2 -------------------------

	TRAIN | acc: 0.3673 | mAcc: 0.0000 | mIoU: 0.0000
	EVAL  | acc: 0.4395 

SAVING MODEL..............



## 5. Evaluation

In [19]:
def evaluate_model(model):
    err_dict = {}
    with torch.no_grad():
        model.eval()

        metric_tracker = ClassificationMetrics(num_classes=len(list_activities))
        for i, (inputs, targets) in enumerate(test_loader):
            metric_tracker.clear()
            prefix_len = inputs[0][0].size(0)
            inputs = [
                [sub_item.to(dtype=torch.float32, device=device) for sub_item in item]
                for item in inputs
            ]

            targets = torch.tensor([torch.max(yi, 0)[1] for yi in targets])
            outputs = model(inputs)

            metric_tracker.add(torch.max(outputs, 1)[1], targets)

            if prefix_len not in err_dict.keys():
                err_dict[prefix_len] = [metric_tracker.acc()]
            else:
                err_dict[prefix_len].append(metric_tracker.acc())
    return err_dict

In [20]:
err_total_dict = {}
trained_model = best_model
print(save_folder)
for run in range(num_runs):
    print("Run: {}".format(run + 1))
    # trained_model = GAT_model(ggnn_dim, num_layers, droppout_prob, list_activities)
    trained_model = trained_model.to(device)
    # trained_model.load_state_dict(torch.load('{}/best_model_run_{}.pt'.format(save_folder,run+1),
    #                                     map_location=torch.device(device)))
    err_dict = evaluate_model(trained_model)
    print(err_dict)
    for key in err_dict.keys():
        err = torch.mean(torch.tensor(err_dict[key]), axis=0)
        if key in err_total_dict.keys():
            err_total_dict[key].append(torch.tensor(err))
        else:
            err_total_dict[key] = [torch.tensor(err)]

/home/sebdis/ProcessMining/Next_Activity_GNN/DuongNA//5_Output_files/Next_Activity/Helpdesk_model_GAT
Run: 1
{2: [tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(1.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(1.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(1.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(1.), tensor(0.), tensor(1.)

In [21]:
print(err_total_dict)

{2: [tensor(0.0468)], 3: [tensor(0.5283)], 4: [tensor(0.7485)], 5: [tensor(0.6069)], 6: [tensor(0.7059)], 7: [tensor(0.8000)], 8: [tensor(0.7500)], 9: [tensor(0.)], 10: [tensor(0.)], 11: [tensor(0.)], 12: [tensor(0.)], 13: [tensor(1.)]}


In [22]:
num_samples_dict = {}
for i, (inputs, targets) in enumerate(test_loader):
    key = inputs[0][0].size(0)
    if key in num_samples_dict.keys():
        num_samples_dict[key] += 1
    else:
        num_samples_dict[key] = 1

In [23]:
print(num_samples_dict)

{2: 1518, 3: 1450, 4: 684, 5: 173, 6: 68, 7: 20, 8: 4, 9: 1, 10: 1, 11: 1, 12: 1, 13: 1}


In [24]:
list_prefix_len = []
list_num_samples = []
list_accuracy = []

for key, value in err_total_dict.items():
    list_prefix_len.append(key)
    list_num_samples.append(num_samples_dict[key])
    list_accuracy.append(value[0].item())

tab_result = pd.DataFrame(
    {
        "Prefix length": list_prefix_len,
        "Num samples": list_num_samples,
        "Accuracy(%)": list_accuracy,
    }
)
tab_result

Unnamed: 0,Prefix length,Num samples,Accuracy(%)
0,2,1518,0.046772
1,3,1450,0.528276
2,4,684,0.748538
3,5,173,0.606936
4,6,68,0.705882
5,7,20,0.8
6,8,4,0.75
7,9,1,0.0
8,10,1,0.0
9,11,1,0.0


In [25]:
tab = tab_result[tab_result["Num samples"] >= 20]
general_acc = round(tab["Accuracy(%)"] * tab["Num samples"])
# print(general_acc)
print(sum(general_acc) / sum(tab["Num samples"]))

0.3879376437515972


In [26]:
tab.to_csv(
    project_dir + "4_Outputs/Evaluation/" + data_name + "_GAT_eval.csv", index=False
)
torch.cuda.empty_cache()