# Datasets 

In [1]:
# Resolve the project locations: an environment variable wins when it is set,
# otherwise a relative default is used.
import os

acora_home_path = os.environ.get('ACORA_HOME_PATH', "../../acora")
data_path = os.environ.get('ACORA_DATA_PATH', "./data")
berts_pretrain_path = os.environ.get('BERT_PRETRAIN_MODELS_PATH', "../bert")

# Echo the resolved paths as the cell output.
acora_home_path, data_path, berts_pretrain_path

('C:\\Users\\user\\Research\\acora-pure',
 'E:\\GoogleDrive\\acora-data',
 'E:\\Research\\Datasets\\BERT')

## Imports

In [2]:
import logging
import os
import json

import pandas as pd
import numpy as np


from scipy import stats

from collections import Counter

import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
import math


# Import TensorFlow / Keras / keras_bert with FutureWarning messages silenced.
# The filter is installed inside catch_warnings(), so it only applies while
# the imports in this block are executed.
import warnings  
with warnings.catch_warnings():  
    warnings.filterwarnings("ignore",category=FutureWarning)

    import tensorflow as tf

    # Choose the Keras implementation based on the installed TensorFlow major version.
    # TF_KERAS is set before keras_bert is imported below.
    # NOTE(review): presumably keras_bert reads TF_KERAS at import time to pick its backend - confirm.
    if tf.__version__.startswith("1."):
        # TensorFlow 1.x: use the standalone keras package.
        os.environ['TF_KERAS'] = '0'
        import keras
    else:
        # Any other TensorFlow version: use the Keras bundled with TensorFlow (compat.v1 API).
        os.environ['TF_KERAS'] = '1'
        import tensorflow.compat.v1.keras as keras
        tf.get_logger().setLevel('INFO')
         
    from keras_bert import Tokenizer, load_trained_model_from_checkpoint


from acora.vocab import BERTVocab
from acora.comments import default_subject_columns, \
    load_comments_files, CommentPurposeTransformer, CommentSubjectTransformer

## Load data

In [3]:
# Select the pre-trained BERT variant (the commented-out name is an alternative model).
#bert_name = 'uncased_L-8_H-512_A-8'
bert_name = 'multi_cased_L-12_H-768_A-12'

# Locations of the model's configuration, checkpoint and vocabulary files.
config_path, checkpoint_path, vocab_path = (
    os.path.join(berts_pretrain_path, bert_name, file_name)
    for file_name in ('bert_config.json', 'bert_model.ckpt', 'vocab.txt')
)

# Parse the JSON configuration; undecodable bytes are dropped (errors='ignore').
with open(config_path, "r", encoding='utf', errors='ignore') as json_file:
    bert_config = json.loads(json_file.read())

In [4]:
# Load the BERT vocabulary from the vocab.txt file of the selected pre-trained model.
vocab = BERTVocab.load_from_file(vocab_path)

In [5]:
# Build the keras_bert Tokenizer from the vocabulary's token dictionary.
tokenizer = Tokenizer(vocab.token_dict)

In [6]:
# Names of the dataset columns used throughout the notebook.
line_column = "line_contents"
message_column = "message"
purpose_column = "purpose"
subject_columns = default_subject_columns

# Columns requested from the data files: the three fixed ones followed by the subject columns.
cols = [line_column, message_column, purpose_column, *subject_columns]

# Separator passed to load_comments_files together with the column list.
sep = "$"

## Wireshark

In [7]:
# Load the Wireshark review comments from the spreadsheet under the data directory.
training_data_paths = [os.path.join(data_path, "wireshark", "wireshark_comments_all.xlsx")]
wireshark_df = load_comments_files(training_data_paths, cols, sep)

Loading data from E:\GoogleDrive\acora-data\wireshark\wireshark_comments_all.xlsx
Loaded 1,248 rows and 15 cols...


Check if there are any duplicated comments

In [9]:
# Report the dataset size, the number of distinct comment messages, and the number
# of rows whose (line, message) pair occurs more than once.
# Group sizes are used instead of pd.concat over the duplicated groups, because
# pd.concat raises ValueError when there are no duplicated groups to concatenate.
wireshark_pair_sizes = wireshark_df.groupby([line_column, message_column]).size()
print(f"Number of comments = {wireshark_df.shape[0]}")
print(f"Number of unique comments = {wireshark_df[message_column].unique().shape[0]}")
print(f"Number of duplicated lines and comments = {wireshark_pair_sizes[wireshark_pair_sizes > 1].sum()}")


Number of comments = 1248
Number of unique comments = 946
Number of duplicated lines and comments = 73


In [15]:
# duplicated lines and comments
for header, row in pd.concat(g for _, g in wireshark_df.groupby([line_column, message_column]) if len(g) > 1).iterrows():
    print(f"'{row[line_column]}' ---> '{row[message_column]}'")

'			DISSECTOR_ASSERT(pdata != NULL)_' ---> 'Done'
'			DISSECTOR_ASSERT(pdata != NULL)_' ---> 'Done'
'    ' ---> 'Done'
'    ' ---> 'Done'
'                           pipename, lastError, win32strerror(lastError))_' ---> 'Done'
'                           pipename, lastError, win32strerror(lastError))_' ---> 'Done'
'                   lastError, win32strerror(lastError))_' ---> 'And here.'
'                   lastError, win32strerror(lastError))_' ---> 'And here.'
'                   lastError, win32strerror(lastError))_' ---> 'Done'
'                   lastError, win32strerror(lastError))_' ---> 'Done'
'            enable_export = false_' ---> 'Done'
'            enable_export = false_' ---> 'Done'
'            enable_export = false_' ---> 'Is this code still in use? I think we disabled the entry in this cases.'
'            enable_export = false_' ---> 'Is this code still in use? I think we disabled the entry in this cases.'
'        {' ---> 'Done'
'        {' ---> 'Done'
'        {' 

In [16]:
# Check how well the chosen maximum sequence length covers the tokenized Wireshark comments.
seq_len = 128

# Length of the first element returned by tokenizer.encode() for every message
# (presumably the token-id sequence - confirm against keras_bert); values are
# converted with str() first, so non-string cells are tokenized by their string form.
comments_lengths = [
    len(tokenizer.encode(str(text))[0])
    for text in wireshark_df[message_column].tolist()
]

# All percentiles, including the maximum, are printed without decimals.
print("Message lengths distribution: 90% is {:.0f}, 95% is {:.0f}, 98% is {:.0f}, 99% is {:.0f}, and 100% is {:.0f}".format(
        *np.percentile(comments_lengths, [90, 95, 98, 99, 100])))
print(f"Your selected sequence length corresponds to {stats.percentileofscore(comments_lengths, seq_len):.2f} percentile in the training dataset.")

Message lengths distribution: 90% is 72, 95% is 107, 98% is 146, 99% is 186, and 100% is 333.0
Your selected sequence length corresponds to 97.12 percentile in the training dataset.


## ONAP

In [21]:
# Load the ONAP review comments from the spreadsheet under the data directory.
training_data_paths = [os.path.join(data_path, "onap", "onap_comments_all.xlsx")]
onap_df = load_comments_files(training_data_paths, cols, sep)


Loading data from E:\GoogleDrive\acora-data\onap\onap_comments_all.xlsx
Loaded 1,252 rows and 15 cols...


In [22]:
# Report the dataset size, the number of distinct comment messages, and the number
# of rows whose (line, message) pair occurs more than once.
# Group sizes are used instead of pd.concat over the duplicated groups, because
# pd.concat raises ValueError when there are no duplicated groups to concatenate.
onap_pair_sizes = onap_df.groupby([line_column, message_column]).size()
print(f"Number of comments = {onap_df.shape[0]}")
print(f"Number of unique comments = {onap_df[message_column].unique().shape[0]}")
print(f"Number of duplicated lines and comments = {onap_pair_sizes[onap_pair_sizes > 1].sum()}")


Number of comments = 1252
Number of unique comments = 959
Number of duplicated lines and comments = 71


In [23]:
# duplicated lines and comments
for header, row in pd.concat(g for _, g in onap_df.groupby([line_column, message_column]) if len(g) > 1).iterrows():
    print(f"'{row[line_column]}' ---> '{row[message_column]}'")

'	    dr = new DmaapPropertyReader(testinput);' ---> 'Done'
'	    dr = new DmaapPropertyReader(testinput);' ---> 'Done'
'	    dr = new DmaapPropertyReader(testinput);' ---> 'this is not used, please refactor this testcase'
'	    dr = new DmaapPropertyReader(testinput);' ---> 'this is not used, please refactor this testcase'
'	exit' ---> 'tab'
'	exit' ---> 'tab'
'	exit' ---> 'tab'
'	exit' ---> 'tab'
'    ' ---> 'same'
'    ' ---> 'same'
'    ' ---> 'trailing white space'
'    ' ---> 'trailing white space'
'            e.printStackTrace();_' ---> 'Done'
'            e.printStackTrace();_' ---> 'Done'
'            if (status.toString().equals("FAILURE"))' ---> 'use this instead: status == QueryStatus.FAILURE'
'            if (status.toString().equals("FAILURE"))' ---> 'use this instead: status == QueryStatus.FAILURE'
'            rbs.executeProviderOperation(mg.getParams(), mg.getSvcLogicContext());' ---> 'maybe add an additional assertEquals to verify that REBOOT_STATUS=SUCCESS?'
'      

In [24]:
# Check how well the chosen maximum sequence length covers the tokenized ONAP comments.
seq_len = 128

# Length of the first element returned by tokenizer.encode() for every message
# (presumably the token-id sequence - confirm against keras_bert); values are
# converted with str() first, so non-string cells are tokenized by their string form.
comments_lengths = [
    len(tokenizer.encode(str(text))[0])
    for text in onap_df[message_column].tolist()
]

# All percentiles, including the maximum, are printed without decimals.
print("Message lengths distribution: 90% is {:.0f}, 95% is {:.0f}, 98% is {:.0f}, 99% is {:.0f}, and 100% is {:.0f}".format(
        *np.percentile(comments_lengths, [90, 95, 98, 99, 100])))
print(f"Your selected sequence length corresponds to {stats.percentileofscore(comments_lengths, seq_len):.2f} percentile in the training dataset.")

Message lengths distribution: 90% is 52, 95% is 75, 98% is 107, 99% is 126, and 100% is 257.0
Your selected sequence length corresponds to 99.04 percentile in the training dataset.


## MONO

In [25]:
# Load the Mono review comments from the spreadsheet under the data directory.
training_data_paths = [os.path.join(data_path, "mono", "mono-all.xlsx")]
mono_df = load_comments_files(training_data_paths, cols, sep)


Loading data from E:\GoogleDrive\acora-data\mono\mono-all.xlsx
Loaded 172 rows and 15 cols...


In [26]:
# Report the dataset size, the number of distinct comment messages, and the number
# of rows whose (line, message) pair occurs more than once.
# Group sizes are used instead of pd.concat over the duplicated groups, because
# pd.concat raises ValueError when there are no duplicated groups to concatenate.
mono_pair_sizes = mono_df.groupby([line_column, message_column]).size()
print(f"Number of comments = {mono_df.shape[0]}")
print(f"Number of unique comments = {mono_df[message_column].unique().shape[0]}")
print(f"Number of duplicated lines and comments = {mono_pair_sizes[mono_pair_sizes > 1].sum()}")


Number of comments = 172
Number of unique comments = 111
Number of duplicated lines and comments = 4


In [27]:
# duplicated lines and comments
for header, row in pd.concat(g for _, g in mono_df.groupby([line_column, message_column]) if len(g) > 1).iterrows():
    print(f"'{row[line_column]}' ---> '{row[message_column]}'")

'public static void Main(string[] args)' ---> 'formatting: should use tabs not spaces, tabwidth=8'
'public static void Main(string[] args)' ---> 'formatting: should use tabs not spaces, tabwidth=8'
'}' ---> 'newline'
'}' ---> 'newline'


In [28]:
# Check how well the chosen maximum sequence length covers the tokenized Mono comments.
seq_len = 128

# Length of the first element returned by tokenizer.encode() for every message
# (presumably the token-id sequence - confirm against keras_bert); values are
# converted with str() first, so non-string cells are tokenized by their string form.
comments_lengths = [
    len(tokenizer.encode(str(text))[0])
    for text in mono_df[message_column].tolist()
]

# All percentiles, including the maximum, are printed without decimals.
print("Message lengths distribution: 90% is {:.0f}, 95% is {:.0f}, 98% is {:.0f}, 99% is {:.0f}, and 100% is {:.0f}".format(
        *np.percentile(comments_lengths, [90, 95, 98, 99, 100])))
print(f"Your selected sequence length corresponds to {stats.percentileofscore(comments_lengths, seq_len):.2f} percentile in the training dataset.")

Message lengths distribution: 90% is 61, 95% is 89, 98% is 155, 99% is 164, and 100% is 164.0
Your selected sequence length corresponds to 95.35 percentile in the training dataset.


## All

In [29]:
# Check how well the chosen maximum sequence length covers the tokenized comments
# of the three datasets (Wireshark, ONAP and Mono) taken together.
seq_len = 128

# Length of the first element returned by tokenizer.encode() for every message
# (presumably the token-id sequence - confirm against keras_bert); values are
# converted with str() first, so non-string cells are tokenized by their string form.
all_messages = pd.concat([wireshark_df, onap_df, mono_df])[message_column].tolist()
comments_lengths = [len(tokenizer.encode(str(text))[0]) for text in all_messages]

# All percentiles, including the maximum, are printed without decimals.
print("Message lengths distribution: 90% is {:.0f}, 95% is {:.0f}, 98% is {:.0f}, 99% is {:.0f}, and 100% is {:.0f}".format(
        *np.percentile(comments_lengths, [90, 95, 98, 99, 100])))
print(f"Your selected sequence length corresponds to {stats.percentileofscore(comments_lengths, seq_len):.2f} percentile in the training dataset.")

Message lengths distribution: 90% is 64, 95% is 94, 98% is 133, 99% is 164, and 100% is 333.0
Your selected sequence length corresponds to 97.90 percentile in the training dataset.
