# Wireshark: cross-project classification of comments

In [2]:
# Resolve the three base locations used by this notebook. An environment
# variable wins when it is set; otherwise a relative default is used.
import os

acora_home_path = os.environ.get('ACORA_HOME_PATH', "../../acora")
data_path = os.environ.get('ACORA_DATA_PATH', "./data")
berts_pretrain_path = os.environ.get('BERT_PRETRAIN_MODELS_PATH', "../bert")

# Display the resolved locations as the cell output.
acora_home_path, data_path, berts_pretrain_path

('C:\\Users\\user\\Research\\acora-pure',
 'E:\\GoogleDrive\\acora-data',
 'D:\\Research\\Datasets\\BERT')

In [3]:
# Seed handed to both the NumPy and the TensorFlow random generators before training.
random_seed = 102_329

## Imports

In [4]:
import logging
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # or any {'0', '1', '2'}
logging.getLogger("tensorflow").setLevel(logging.ERROR)
import json

import pandas as pd
import numpy as np

import json

from scipy import stats

from sklearn.model_selection import train_test_split

from collections import Counter

from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import matthews_corrcoef as mcc_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix, multilabel_confusion_matrix
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
import math


import warnings  
with warnings.catch_warnings():  
    warnings.filterwarnings("ignore",category=FutureWarning)

    import tensorflow as tf

    if tf.__version__.startswith("1."):
        os.environ['TF_KERAS'] = '0'
        from tensorflow import ConfigProto, Session, set_random_seed
        import keras
        from keras.backend.tensorflow_backend import set_session
        from keras.backend.tensorflow_backend import clear_session
        from keras.backend.tensorflow_backend import get_session
    else:
        os.environ['TF_KERAS'] = '1'
        from tensorflow.compat.v1 import ConfigProto, Session, set_random_seed
        import tensorflow.compat.v1.keras as keras
        tf.get_logger().setLevel('INFO')
         
    from tensorflow.python.client import device_lib


    from keras_bert import Tokenizer, load_trained_model_from_checkpoint
    from keras_bert.layers.extract import Extract

    from keras_radam import RAdam

from acora.vocab import BERTVocab
from acora.comments import default_subject_columns, \
    load_comments_files, CommentPurposeTransformer, CommentSubjectTransformer, \
    plot_purpose_confusion_matrix, plot_subjects_confusion_matrix, \
    report_comment_predictions_accuracy, default_purpose_labels, save_comment_predictions_accuracy

## Model

In [5]:
# Pre-trained BERT checkpoint to fine-tune (alternative: 'uncased_L-8_H-512_A-8').
bert_name = 'multi_cased_L-12_H-768_A-12'

# All three checkpoint artefacts live in <berts_pretrain_path>/<bert_name>/.
config_path, checkpoint_path, vocab_path = (
    os.path.join(berts_pretrain_path, bert_name, file_name)
    for file_name in ('bert_config.json', 'bert_model.ckpt', 'vocab.txt')
)

# Parse the checkpoint's JSON configuration; undecodable bytes are dropped
# (errors='ignore') rather than raising.
with open(config_path, "r", encoding='utf', errors='ignore') as json_file:
    bert_config = json.loads(json_file.read())

In [6]:
# Maximum token length handed to the tokenizer and to the BERT model loader.
seq_len = 128
# Set to True to hide the GPUs from the TensorFlow session.
not_use_gpu = False

In [7]:
# Enumerate the devices visible to TensorFlow once and reuse the list below.
local_devices = device_lib.list_local_devices()
gpus = [x.name for x in local_devices if x.device_type == 'GPU']
if not not_use_gpu and len(gpus) == 0:
    # `logger` was never defined in this notebook, so the original call raised
    # NameError exactly when the message was needed. Use the already imported
    # `logging` module instead.
    logging.getLogger(__name__).error("You don't have a GPU available on your system, it can affect the performance...")

# Print name, description and memory limit of every local device.
for gpu_entry in local_devices:
    if hasattr(gpu_entry, 'physical_device_desc'):
        print(f"{gpu_entry.name}: {gpu_entry.physical_device_desc}, {gpu_entry.memory_limit}")

/device:CPU:0: , 268435456
/device:XLA_CPU:0: device: XLA_CPU device, 17179869184
/device:GPU:0: device: 0, name: NVIDIA TITAN Xp COLLECTORS EDITION, pci bus id: 0000:05:00.0, compute capability: 6.1, 10983471872
/device:XLA_GPU:0: device: XLA_GPU device, 17179869184


In [8]:
model = None


def _build_session_config(not_use_gpu):
    """Return the ``ConfigProto`` for a fresh TensorFlow session.

    Logs an error when GPU usage is requested but no GPU device is listed.
    The GPU device count is 0 when ``not_use_gpu`` is true, otherwise the
    number of listed GPUs; the memory fraction is set to 1 and the visible
    device list to "0".
    """
    gpus = [x.name for x in device_lib.list_local_devices() if x.device_type == 'GPU']
    if not not_use_gpu and len(gpus) == 0:
        # `logger` was never defined in this notebook (NameError on machines
        # without a GPU); go through the already imported `logging` module.
        logging.getLogger(__name__).error("You don't have a GPU available on your system, it can affect the performance...")

    config = ConfigProto(device_count={'GPU': 0 if not_use_gpu else len(gpus)})
    config.gpu_options.per_process_gpu_memory_fraction = 1
    config.gpu_options.visible_device_list = "0"
    return config


def get_model(seq_len, subject_columns, layer_num, config_path, checkpoint_path, lr=2e-5, 
              not_use_gpu = not_use_gpu):
    """Build and compile a multi-output subject classifier on top of BERT.

    The previous global ``model`` is deleted, the current Keras session is
    cleared, closed and replaced by a new one, and the pre-trained checkpoint
    is loaded. The output of the ``NSP-Dense`` layer goes through dropout
    (0.1) and feeds one single-unit sigmoid layer per subject, named
    ``"<subject>_output"``.

    Args:
        seq_len: Sequence length passed to the checkpoint loader.
        subject_columns: Iterable of subject names; one output per entry.
        layer_num: Currently unused. Only the disabled alternative head, which
            would read ``Encoder-{layer_num}-FeedForward-Norm``, refers to it.
        config_path: Path to the BERT configuration file.
        checkpoint_path: Path to the BERT checkpoint.
        lr: Learning rate for the RAdam optimizer.
        not_use_gpu: When true, the session is configured with zero GPUs.

    Returns:
        The compiled Keras model; it is also stored in the global ``model``.
    """
    global model

    # Drop the reference to any previously built model before clearing the session.
    try:
        del model
    except NameError:
        print("Unable to delete the model")

    # Replace the current Keras session with a freshly configured one.
    # Each TensorFlow major version uses the backend functions the notebook
    # imported for it.
    if tf.__version__.startswith("1."):
        sess = get_session()
        clear_session()
        sess.close()
        keras.backend.set_session(Session(config=_build_session_config(not_use_gpu)))
    else:
        backend = tf.compat.v1.keras.backend
        sess = backend.get_session()
        backend.clear_session()
        sess.close()
        backend.set_session(Session(config=_build_session_config(not_use_gpu)))

    bert = load_trained_model_from_checkpoint(
        config_path,
        checkpoint_path,
        training=True,
        trainable=True,
        seq_len=seq_len
    )

    # Only the first two inputs of the loaded model are kept.
    inputs = bert.inputs[:2]
    # Disabled alternative head:
    #dense = bert.get_layer(f'Encoder-{layer_num}-FeedForward-Norm').output
    #dense = Extract(index=0, name="Extract")(dense)
    dense = bert.get_layer('NSP-Dense').output
    dense = keras.layers.Dropout(0.1)(dense)

    losses = dict()
    loss_weights = dict()
    outputs = []

    # One independent binary head per subject, all weighted equally.
    for subject_class in subject_columns:
        output_name = f"{subject_class}_output"
        outputs.append(keras.layers.Dense(units=1, activation='sigmoid', name=output_name)(dense))
        losses[output_name] = "binary_crossentropy"
        loss_weights[output_name] = 1.0

    model = keras.models.Model(inputs, outputs)

    model.compile(
        RAdam(learning_rate=lr,beta_1=0.9, beta_2=0.999,warmup_proportion=0.1),
        loss=losses, 
        loss_weights=loss_weights,
        metrics=['accuracy'],
    )

    return model


In [9]:
# Load the vocabulary file (vocab.txt) of the selected BERT checkpoint.
vocab = BERTVocab.load_from_file(vocab_path)

In [10]:
# keras-bert tokenizer built from the vocabulary's token dictionary.
tokenizer = Tokenizer(vocab.token_dict)

## Classify Wireshark

In [11]:
sep = "$"

# Names of the columns read from the comment spreadsheets.
line_column = "line_contents"
message_column = "message"
purpose_column = "purpose"
subject_columns = default_subject_columns

cols = [line_column, message_column, purpose_column] + subject_columns

# Cross-project setup: train on Mono and ONAP, evaluate on Wireshark.
training_data_paths = [
    os.path.join(data_path, "mono", "mono-all.xlsx"),
    os.path.join(data_path, "onap", "onap_comments_all.xlsx")
]

testing_data_paths = [
    os.path.join(data_path, "wireshark", "wireshark_comments_all.xlsx"),
]


def _duplicates_first(frame, key_column):
    """Reorder ``frame`` so rows sharing a ``key_column`` value come first.

    Rows are grouped by ``key_column``. Groups with more than one row are
    emitted first, followed by the single-row groups, each in groupby key
    order; the original index labels are kept. With pandas' default groupby
    settings, rows whose key is missing are not part of any group and are
    therefore dropped, as in the original code.

    Unlike two separate ``pd.concat`` calls over generators, this does not
    raise ``ValueError: No objects to concatenate`` when a frame has no
    duplicated (or no unique) keys, and it groups the frame only once.
    """
    groups = [group for _, group in frame.groupby(key_column)]
    ordered = [g for g in groups if len(g) > 1] + [g for g in groups if len(g) == 1]
    if not ordered:
        # Nothing to concatenate: return an empty frame with the same columns.
        return frame.iloc[0:0]
    return pd.concat(ordered)


reviews_train_df = _duplicates_first(load_comments_files(training_data_paths, cols, sep), message_column)
reviews_test_df = _duplicates_first(load_comments_files(testing_data_paths, cols, sep), message_column)


Loading data from E:\GoogleDrive\acora-data\mono\mono-all.xlsx
Loading data from E:\GoogleDrive\acora-data\onap\onap_comments_all.xlsx
Loaded 1,424 rows and 15 cols...
Loading data from E:\GoogleDrive\acora-data\wireshark\wireshark_comments_all.xlsx
Loaded 1,248 rows and 15 cols...


In [12]:
# Sanity check: (rows, columns) of the training and the test frame.
reviews_train_df.shape,  reviews_test_df.shape

((1424, 15), (1248, 15))

In [14]:
# Encode every training message, keeping the first element of each
# tokenizer.encode() result.
tokenized_train_messages = [
    tokenizer.encode(str(message), max_len=seq_len)[0]
    for message in reviews_train_df[message_column].tolist()
]
# Model inputs: the encoded messages plus an all-zero array of the same shape
# (presumably the segment ids; confirm against the keras-bert input order).
x_train = [
    np.array(tokenized_train_messages),
    np.zeros_like(tokenized_train_messages),
]

In [15]:
# Label encoder for the training comments.
subject_transformer_train = CommentSubjectTransformer(reviews_train_df, subject_columns)
# NOTE(review): y_train_subject is not referenced again in the visible part of this notebook.
y_train_subject = subject_transformer_train.encode_one_hot_all_subjects()

In [16]:
# Encode every test message, keeping the first element of each
# tokenizer.encode() result.
tokenized_test_messages = [
    tokenizer.encode(str(message), max_len=seq_len)[0]
    for message in reviews_test_df[message_column].tolist()
]
# Model inputs: the encoded messages plus an all-zero array of the same shape
# (presumably the segment ids; confirm against the keras-bert input order).
x_test = [
    np.array(tokenized_test_messages),
    np.zeros_like(tokenized_test_messages),
]


In [17]:
# Label encoder for the test comments.
subject_transformer_test = CommentSubjectTransformer(reviews_test_df, subject_columns)
# NOTE(review): y_test_subject is not referenced again in the visible part of this notebook.
y_test_subject = subject_transformer_test.encode_one_hot_all_subjects()

In [18]:
# Training targets keyed by Keras output name ("<subject>_output"); each value is
# the array returned by encode_binary_single_subject() for that subject.
y_all_train = {
    f"{subject_class}_output": subject_transformer_train.encode_binary_single_subject(subject_class).values
    for subject_class in subject_columns
}


In [19]:
# Test targets keyed by Keras output name ("<subject>_output"); each value is
# the array returned by encode_binary_single_subject() for that subject.
y_all_test = {
    f"{subject_class}_output": subject_transformer_test.encode_binary_single_subject(subject_class).values
    for subject_class in subject_columns
}


In [20]:
# Per-subject class weights as computed by the training label transformer.
subject_class_weights_train = subject_transformer_train.class_weights()
print(f"Calculated subject weights: {subject_class_weights_train}")

# Same weights, re-keyed by Keras output name ("<subject>_output").
# NOTE(review): class_weights_all_train is not passed to model.fit() in the
# visible training cell; confirm whether that is intentional.
class_weights_all_train = {
    f"{subject_class}_output": subject_class_weights_train[subject_class]
    for subject_class in subject_columns
}


Calculated subject weights: {'code_design': array([0.53817082, 7.04950495]), 'code_style': array([0.55065739, 5.4351145 ]), 'code_naming': array([ 0.51970803, 13.18518519]), 'code_logic': array([0.83372365, 1.24912281]), 'code_io': array([ 0.50676157, 37.47368421]), 'code_data': array([0.6953125, 1.78     ]), 'code_doc': array([0.52779837, 9.49333333]), 'code_api': array([0.55322455, 5.19708029]), 'compatibility': array([ 0.50857143, 29.66666667]), 'rule_def': array([ 0.51631617, 15.82222222]), 'config_commit_patch_review': array([ 0.51333814, 19.24324324]), 'config_building_installing': array([ 0.52237711, 11.67213115])}


In [21]:
# Seed NumPy and TensorFlow before the model is built.
np.random.seed(random_seed)
set_random_seed(random_seed)

# Fine-tuning hyper-parameters.
batch_size, epochs = 24, 15
lr = 2e-5
layers_num = 12

model = get_model(seq_len, subject_columns, layers_num, config_path, checkpoint_path, lr=lr)

# Fit all subject heads jointly on the training comments.
history = model.fit(
    x_train,
    y_all_train,
    batch_size=batch_size,
    epochs=epochs,
    shuffle=True,
    verbose=1,
)


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the f

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Epoch 1/15
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15




In [23]:
print("Predicting...")
y_pred_subject = model.predict(x_test)

# Stack the per-output predictions into (outputs, samples), then flip to one row per sample.
y_pred_subject = np.array(y_pred_subject).reshape(len(y_pred_subject), len(y_pred_subject[0])).T
# Threshold at 0.5: values strictly above it become 1, everything else 0.
subject_all_preds = (y_pred_subject > 0.5).astype(np.int64).tolist()
subject_preds_df = pd.DataFrame(subject_all_preds, columns=subject_columns)


Predicting...


In [24]:
# Ground-truth labels, one column per subject (the "_output" suffix is stripped
# so the columns line up with subject_preds_df).
y_true_df = pd.DataFrame(y_all_test)
y_true_df.columns = [str(col).replace("_output", "") for col in y_true_df.columns]

results = dict()

for subject in subject_columns:
    y_true = y_true_df[subject]
    y_pred = subject_preds_df[subject]

    # Key order matters: it becomes the row order of the exported JSON/Excel files.
    scores = {'acc': accuracy_score(y_true, y_pred)}
    for average in ("macro", "micro", "binary"):
        scores['rec_' + average] = recall_score(y_true, y_pred, average=average)
        scores['prec_' + average] = precision_score(y_true, y_pred, average=average)
        scores['fscore_' + average] = f1_score(y_true, y_pred, average=average)
    scores['mcc'] = mcc_score(y_true, y_pred)

    # NOTE(review): the AUC is computed from the thresholded 0/1 predictions in
    # subject_preds_df, not from the predicted probabilities; confirm this is intended.
    try:
        scores['auc'] = roc_auc_score(y_true, y_pred)
        print(f"Subject {subject} AUC = {scores['auc']:.2}")
    except ValueError:
        # roc_auc_score cannot be computed, e.g. when y_true holds a single class.
        # Only that case is treated as "no AUC"; other errors are no longer hidden.
        scores['auc'] = None

    print(f"Subject {subject} Accuracy = {scores['acc']:.2f}")
    for average in ("macro", "micro", "binary"):
        precision_value = scores['prec_' + average]
        recall_value = scores['rec_' + average]
        f1_value = scores['fscore_' + average]
        print(f"Subject {subject} Precision ({average}) = {precision_value:.2}")
        print(f"Subject {subject} Recall ({average}) = {recall_value:.2}")
        print(f"Subject {subject} F1-score ({average}) = {f1_value:.2}")
    print(f"Subject {subject} MCC = {scores['mcc']:.2}")

    results[subject] = scores

subject_cm_path = "./output/cross-wireshark-subject_cm.pdf"
print("Preparing confusion matrix for the comment subjects.")

figsize=(10,20)
cmap='Greens'

# One 2x2 confusion matrix per subject.
cf_matrix_all_subject = multilabel_confusion_matrix(y_true_df[subject_columns].values,
                                                    subject_preds_df[subject_columns].values, samplewise=False)

fig = plt.figure(figsize=figsize)
# Two matrices per grid row. A dedicated name is used so the `cols` column
# list defined when loading the data is not overwritten with an integer.
n_grid_rows = math.ceil(float(len(subject_columns)) / 2.0)
gs = gridspec.GridSpec(n_grid_rows, 2, height_ratios=[1]*n_grid_rows)
gs.update(hspace=0.4, wspace=0.5)

for i, cf in enumerate(cf_matrix_all_subject):
    ax = fig.add_subplot(gs[i // 2, i % 2])

    # Row-normalise (share of each true class). A true class with no samples
    # yields 0 instead of a division by zero (NaN cells, RuntimeWarning).
    row_totals = cf.sum(axis=1, keepdims=True)
    cmn = np.divide(cf.astype('float'), row_totals,
                    out=np.zeros(cf.shape, dtype=float), where=row_totals != 0)
    perc_labs = ["{0:.1%}".format(value) for value in cmn.flatten()]

    group_counts = ["{0:0.0f}\n".format(value) for value in cf.flatten()]

    # Each cell shows the absolute count above the row percentage.
    box_labels = [f"{v1}{v2}".strip() for v1, v2 in zip(group_counts,perc_labs)]
    box_labels = np.asarray(box_labels).reshape(cf.shape[0],cf.shape[1])

    sns.heatmap(cmn, 
                annot=box_labels, 
                fmt='', 
                annot_kws={"fontsize":12},
                xticklabels=("False", "True"), 
                yticklabels=("False", "True"),
                cmap=cmap,
                linecolor='lightgray', linewidths=0.5,
                square=True,
                cbar=False,
                vmin=0, vmax=1,
                ax=ax)
    ax.set_title(subject_columns[i])
    ax.set_ylabel('True label')
    ax.set_xlabel('Predicted label')

fig.tight_layout()
# Make sure the output directory exists before writing the figure.
os.makedirs(os.path.dirname(subject_cm_path), exist_ok=True)
fig.savefig(subject_cm_path)
print(f"Confusion matrix for the comment subject saved to {subject_cm_path}.")
plt.close(fig)

Subject code_design AUC = 0.56
Subject code_design Accuracy = 0.95
Subject code_design Precision (macro) = 0.6
Subject code_design Recall (macro) = 0.56
Subject code_design F1-score (macro) = 0.58
Subject code_design Precision (micro) = 0.95
Subject code_design Recall (micro) = 0.95
Subject code_design F1-score (micro) = 0.95
Subject code_design Precision (binary) = 0.24
Subject code_design Recall (binary) = 0.14
Subject code_design F1-score (binary) = 0.17
Subject code_design MCC = 0.16
Subject code_style AUC = 0.79
Subject code_style Accuracy = 0.96
Subject code_style Precision (macro) = 0.89
Subject code_style Recall (macro) = 0.79
Subject code_style F1-score (macro) = 0.83
Subject code_style Precision (micro) = 0.96
Subject code_style Recall (micro) = 0.96
Subject code_style F1-score (micro) = 0.96
Subject code_style Precision (binary) = 0.8
Subject code_style Recall (binary) = 0.59
Subject code_style F1-score (binary) = 0.68
Subject code_style MCC = 0.67
Subject code_naming AUC = 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


Subject config_commit_patch_review AUC = 0.61
Subject config_commit_patch_review Accuracy = 0.96
Subject config_commit_patch_review Precision (macro) = 0.98
Subject config_commit_patch_review Recall (macro) = 0.61
Subject config_commit_patch_review F1-score (macro) = 0.67
Subject config_commit_patch_review Precision (micro) = 0.96
Subject config_commit_patch_review Recall (micro) = 0.96
Subject config_commit_patch_review F1-score (micro) = 0.96
Subject config_commit_patch_review Precision (binary) = 1.0
Subject config_commit_patch_review Recall (binary) = 0.21
Subject config_commit_patch_review F1-score (binary) = 0.35
Subject config_commit_patch_review MCC = 0.45
Subject config_building_installing AUC = 0.61
Subject config_building_installing Accuracy = 0.95
Subject config_building_installing Precision (macro) = 0.67
Subject config_building_installing Recall (macro) = 0.61
Subject config_building_installing F1-score (macro) = 0.63
Subject config_building_installing Precision (micro) =



Confusion matrix for the comment subject saved to ./output/cross-wireshark-subject_cm.pdf.


In [25]:
# Persist the per-subject metrics as indented JSON.
with open('./output/res-cross-wireshark.json', 'w') as results_file:
    json.dump(results, results_file, indent=4)

In [26]:
# Metrics table: one column per subject, one row per metric.
results_df = pd.DataFrame.from_dict(results)
results_df.to_excel('./output/res-cross-wireshark.xlsx')

In [27]:
# The summary is computed for the test (Wireshark) comments.
reviews_df = reviews_test_df

# Column sums of the subject columns, as absolute values and as a share of all rows.
subject_totals = reviews_df[subject_columns].sum()
counts_df = subject_totals.rename("n")
perc_counts_df = (subject_totals / reviews_df.shape[0]).rename("perc_count")

# One Series per metric (a row of results_df), indexed by subject.
summary_dfs = [counts_df, perc_counts_df] + [
    results_df[results_df.index == metric].mean().rename(metric)
    for metric in results_df.index.unique().tolist()
]

# Subjects as rows; counts, shares and metrics as columns.
summary_df = pd.concat(summary_dfs, axis=1)
summary_df.to_excel('./output/summary-cross-wireshark.xlsx')