In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from gensim.parsing.preprocessing import remove_stopwords
import contractions
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import numpy as np
import fasttext
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

import torch.nn.functional as F
import torch

import networkx as nx
from torch_geometric.utils.convert import from_networkx, to_networkx
from torch_geometric.data import DataLoader
from torch_geometric.nn import global_mean_pool

from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from torch_geometric.nn import GATConv
import pickle as pkl
from tqdm import tqdm
from torch_geometric.nn.models import GNNExplainer
from torch_geometric.data import Data
import random
from inspect import signature
from math import sqrt
import torch


sns.set_theme()

# Set fixed random number seed
#torch.manual_seed(42)

%matplotlib inline

In [2]:
def calculate_accuracy_precision_recall(true_labels, predicted_labels):
    return (accuracy_score(true_labels, predicted_labels),
           precision_score(true_labels, predicted_labels),
           recall_score(true_labels, predicted_labels))

def print_evaluation_results(results):
    print('Avg accuracy | Avg precision | Avg recall')
    avg_accuracy, avg_precision, avg_recall = np.mean(results, axis=0)
    std_accuracy, std_precision, std_recall = np.std(results, axis=0)
    print(f'{avg_accuracy:.4f}+-{std_accuracy:.4f}, {avg_precision:.4f}+-{std_precision:.4f}, {avg_recall:.4f}+-{std_recall:.4f}')

def get_random_number():
    return random.randint(0, 10000)

global_random_number = [get_random_number()]
global_random_numbers = [get_random_number() for _ in range(10)]

In [5]:
df = pd.read_csv('samples.csv')
# bug == 0 and feature == 1
df = df[(df['label'] == 0) | (df['label'] == 1)]
#df = df[:500]
len(df)

407799

In [6]:
contractions.add('__label__', 'REMOVED_TOKEN')
# fix contractions
df['title'] = df['title'].apply(contractions.fix)
df['body'] = df['body'].apply(contractions.fix)
# removal of stopwords
df['title'] = df['title'].apply(remove_stopwords)
df['body'] = df['body'].apply(remove_stopwords)

In [7]:
glove2word2vec('glove.6B/glove.6B.100d.txt', 'tmpfile_glove')
glove_embeddings_model = KeyedVectors.load_word2vec_format('tmpfile_glove')

def get_word_glove_embedding(word):
    if word not in glove_embeddings_model:
        return np.zeros(100, dtype='float32')
    return glove_embeddings_model.get_vector(word)


def get_sentence_glove_embedding(sentence):
    word_embeddings = [
        glove_embeddings_model.get_vector(word) if word in glove_embeddings_model else np.zeros(100, dtype='float32')
        for word in sentence.split()]
    if len(word_embeddings) == 0:
        return np.zeros(100, dtype='float32')
    return np.mean(word_embeddings, axis=0)

df['fasttext_input'] = '__label__' + df['label'].map(str) + ' ' + df['title'] + ' ' + df['body']
train_input, test_input = train_test_split(df.fasttext_input.values, test_size=0.33, random_state=42)
np.savetxt('train.txt', train_input, fmt='%s')
np.savetxt('test.txt', test_input, fmt='%s')
fasttext_model = fasttext.train_supervised('train.txt', dim=100, epoch=5)
fasttext_model.test('test.txt')
df.drop('fasttext_input', axis=1, inplace=True)
embeddings_lookup = {word: fasttext_model.get_word_vector(word) for word in fasttext_model.get_words()}

  glove2word2vec('glove.6B/glove.6B.100d.txt', 'tmpfile_glove')


In [6]:
def create_graph_of_words(text, window_size):
    text = text.split()
    embeddings_for_text = []
    G = nx.Graph()
    for i, word in enumerate(text):
        #embedding = fasttext_model.get_word_vector(word)
        embedding = embeddings_lookup.get(word, np.zeros(100, dtype='float32'))
        G.add_node(word, x=embedding)
        embeddings_for_text.append(embedding)
        for j in range(i + 1, i + window_size):
            if j < len(text):
                G.add_edge(word, text[j])
    return G


def create_graph_of_words_for_pytorch(text, window_size):
    G  = create_graph_of_words(text, window_size)
    return G, from_networkx(G)


def generate_pytorch_geometric_graphs(data, window_size):
    netx_graphs = []
    pyg_graphs = []

    for s in tqdm(data['body'].values):
        netx_graph, pyg_graph = create_graph_of_words_for_pytorch(s, window_size)
        pyg_graphs.append(pyg_graph)
        netx_graphs.append(netx_graph)
    print('finished...')

    for i, label in enumerate(data['label'].values):
        pyg_graphs[i].y = torch.tensor(label).float()
        netx_graphs[i].graph['y']=label

    # print(len(pyg_graphs),len(netx_graphs))
    return pyg_graphs, netx_graphs

train_df, test_df = train_test_split(df, test_size=0.33, random_state=42)
train_pyg_graphs, train_netx_graphs = generate_pytorch_geometric_graphs(train_df, window_size=7)
test_pyg_graphs, test_netx_graphs = generate_pytorch_geometric_graphs(test_df, window_size=7)
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

  data[key] = torch.tensor(value)
100%|██████████| 273225/273225 [12:22<00:00, 368.16it/s] 


finished...


100%|██████████| 134574/134574 [06:10<00:00, 362.98it/s]


finished...


In [7]:
print(train_df)
print(test_df)
print(len(train_pyg_graphs),len(train_netx_graphs), len(train_df))
print(len(test_pyg_graphs),len(test_netx_graphs), len(test_df))

                                                    title  \
0       include \ external assigned\ slots mission lis...   
1       tke, like mark task imcomplete, progress, comp...   
2       thread monitors checkboxes oscilloscope activi...   
3                        crash camera plugin usage ios 10   
4       identicon display contact list friends running...   
...                                                   ...   
273220                         allow directing csv stdout   
273221                                   generate pledges   
273222  systemimager: fix upstream dhcp option- flags ...   
273223                        add flags temperature units   
273224                              epanet mtp2 run error   

                                                     body  label  
0       blocked slots currently counted \ players\ val...      0  
1       problem \are \r exists, users change status as...      1  
2       actual behaviour \are \r thread checks checkbo...      0  

In [8]:
id_error_train = []
for i, data in enumerate(train_pyg_graphs):
    if data.x == None:
        print(f'Found Error in train graph {i}')
        id_error_train.append(i)

id_error_test = []
for i, data in enumerate(test_pyg_graphs):
    if data.x == None:
        print(f'Found Error in test graph {i}')
        id_error_test.append(i)

train_df = train_df.drop(id_error_train)
test_df = test_df.drop(id_error_test)
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)
print(train_df)
print(test_df)

for i in id_error_train:
    train_pyg_graphs.pop(i)
    train_netx_graphs.pop(i)

for i in id_error_test:
    test_pyg_graphs.pop(i)
    test_netx_graphs.pop(i)

print(len(train_pyg_graphs),len(train_netx_graphs), len(train_df))
print(len(test_pyg_graphs),len(test_netx_graphs), len(test_df))


Found Error in train graph 36901
                                                    title  \
0       include \ external assigned\ slots mission lis...   
1       tke, like mark task imcomplete, progress, comp...   
2       thread monitors checkboxes oscilloscope activi...   
3                        crash camera plugin usage ios 10   
4       identicon display contact list friends running...   
...                                                   ...   
273219                         allow directing csv stdout   
273220                                   generate pledges   
273221  systemimager: fix upstream dhcp option- flags ...   
273222                        add flags temperature units   
273223                              epanet mtp2 run error   

                                                     body  label  
0       blocked slots currently counted \ players\ val...      0  
1       problem \are \r exists, users change status as...      1  
2       actual behaviour \are \r 

In [3]:
for name in tqdm(['train_pyg_graphs', 'test_pyg_graphs', 'train_netx_graphs', 'test_netx_graphs', 'train_df', 'test_df']):
    with open(name+'.pkl', 'wb') as handle:
        pkl.dump(eval(name), handle)


  0%|          | 0/6 [00:00<?, ?it/s]


NameError: name 'train_pyg_graphs' is not defined

In [None]:
for name in tqdm(['train_pyg_graphs', 'test_pyg_graphs', 'train_netx_graphs', 'test_netx_graphs', 'train_df', 'test_df']):
    with open(name+'.pkl', 'rb') as handle:
        exec(name+' = pkl.load(handle)')

In [53]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
class GATClassifier(torch.nn.Module):
    def __init__(self, embeddings_lookup, window_size=7):
        super().__init__()
        #torch.manual_seed(12345)
        self.embeddings_lookup = embeddings_lookup
        self.window_size = window_size
        self.conv1 = GATConv(100, 10, heads=3)
        # self.conv1 = SGConv(100, 50, K=1)
        self.linear1 = torch.nn.Linear(10 * 3, 1)

        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x, edge_index, batch):

        x = F.elu(self.conv1(x, edge_index))
        x = global_mean_pool(x, batch)
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.linear1(x)
        x = self.sigmoid(x)

        return x

    def predict_proba(self, X):
        pyg_graphs = []
        if not isinstance(X, list):
            X = [X]
        for text in X:
            text = text.split()
            embeddings_for_text = []
            G = nx.Graph()
            for i, word in enumerate(text):
                #embedding = fasttext_model.get_word_vector(word)
                embedding = self.embeddings_lookup.get(word, np.zeros(100, dtype='float32'))
                G.add_node(word, x=embedding)
                embeddings_for_text.append(embedding)
                for j in range(i + 1, i + self.window_size):
                    if j < len(text):
                        G.add_edge(word, text[j])
            pyg_graphs.append(from_networkx(G))

        predicted_probs = []
        loader = DataLoader(pyg_graphs, batch_size=1, shuffle=False)
        for graph in pyg_graphs:
            graph = graph.to(device)
            x = graph.x.to(device)
            edge_index = graph.edge_index.to(device)
            batch = graph.batch
            out = self.forward(x, edge_index, batch)
            print(out)
            out = out.squeeze().cpu().detach().numpy()
            predicted_probs.append([1-out, out])
        print(predicted_probs)
        return np.array(predicted_probs)


In [54]:
def run_gat_classifier(train_pyg_graphs, test_pyg_graphs, train_batch_size=300, learning_rate=0.001, num_epoch=10):
    train_loader = DataLoader(train_pyg_graphs, batch_size=train_batch_size, shuffle=False)
    test_loader = DataLoader(test_pyg_graphs, batch_size=200, shuffle=False)

    gat_model = GATClassifier(embeddings_lookup=embeddings_lookup, window_size=7).to(device)
    print(gat_model)
    # Define the loss function and optimizer
    loss_function = F.binary_cross_entropy
    optimizer = torch.optim.Adam(gat_model.parameters(), lr=learning_rate)

    gat_model.train()
    for epoch in range(0, num_epoch):
        for i, data in enumerate(train_loader):  # Iterate in batches over the training dataset.
            x = data.x.to(device)
            edge_index = data.edge_index.to(device)
            data = data.to(device)
            try:
                out = gat_model(x, edge_index, data.batch)  # Perform a single forward pass.
            except Exception as e:
                print(data)
                print(data.x)
                print(data.y)
            out = out.squeeze()
            y = data.y.squeeze()
            loss = loss_function(out, y)  # Compute the loss.
            loss.backward()  # Derive gradients.
            optimizer.step()  # Update parameters based on gradients.
            optimizer.zero_grad()  # Clear gradients.
        print(f'Epoch: {epoch}, Epoch loss {loss.item()}')

    print('Training process has finished.')
    print('Final loss', loss.item())

    true_labels = []
    pred_labels = []
    with torch.no_grad():
        gat_model.eval()
        for i, data in enumerate(test_loader):
            data = data.to(device)
            out = gat_model(data.x, data.edge_index, data.batch)
            pred_labels.extend(torch.round(out.squeeze()).tolist())
            true_labels.extend(data.y.tolist())

    #print('true labels ----')
    #print(true_labels)
    #print('pred labels ----')
    #print(pred_labels)

    results = calculate_accuracy_precision_recall(true_labels, pred_labels)

    print(results)
    return {
        'model': gat_model,
        'results': results
    }

gat_models_results = []
gat_evaluation_results = []
gat_final_models = []
for num in global_random_number:
    torch.manual_seed(num)
    foo = run_gat_classifier(train_pyg_graphs, test_pyg_graphs)
    gat_evaluation_results.append(foo['results'])
    gat_final_models.append(foo['model'])
    gat_models_results.append(foo)
with open('gat_models_results.pkl', 'wb') as f:
    pkl.dump(gat_models_results, f)



GATClassifier(
  (conv1): GATConv(100, 10, heads=3)
  (linear1): Linear(in_features=30, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)
Epoch: 0, Epoch loss 0.3533962070941925
Epoch: 1, Epoch loss 0.3461766541004181
Epoch: 2, Epoch loss 0.35605940222740173
Epoch: 3, Epoch loss 0.33927664160728455
Epoch: 4, Epoch loss 0.33910325169563293
Epoch: 5, Epoch loss 0.3337400555610657
Epoch: 6, Epoch loss 0.35725685954093933
Epoch: 7, Epoch loss 0.3405722379684448
Epoch: 8, Epoch loss 0.34837648272514343
Epoch: 9, Epoch loss 0.34768742322921753
Training process has finished.
Final loss 0.34768742322921753
(0.7972936822863257, 0.8124896020810962, 0.7830821258855427)


In [None]:
with open('gat_models_results.pkl', 'rb') as f:
    gat_models_results = pkl.load(f)
gat_final_models = [gat['model'] for gat in gat_models_results]

In [14]:
class fastTextVectorizer:
    def __init__(self, embeddings_lookup, verbose=False, lowercase=True, minchars=3):
        # load in pre-trained word vectors
        print('Loading word vectors...')


        self.idx2word = list(embeddings_lookup.keys())
        self.word2vec = embeddings_lookup
        self.embedding = np.array(list(embeddings_lookup.values()))
        self.word2idx = {v:k for k,v in enumerate(self.idx2word)}
        self.V, self.D = self.embedding.shape
        self.verbose = verbose
        self.lowercase = lowercase
        self.minchars = minchars

    def fit(self, data, *args):
        pass

    def transform(self, data, *args):
        X = np.zeros((len(data), self.D))
        n = 0
        emptycount = 0
        for sentence in data:
            # Note: lower-casing the words
            if self.lowercase:
                tokens = sentence.lower().split()
            else:
                tokens = sentence.split()
            vecs = []
            for word in tokens:
                if len(word) >= self.minchars and word in self.word2vec:
                    vec = self.word2vec[word]
                    vecs.append(vec)
            if len(vecs) > 0:
                vecs = np.array(vecs)
                X[n] = vecs.mean(axis=0)
            else:
                emptycount += 1
            n += 1
        if self.verbose:
            print("Number of samples with no words found / total: %s / %s" % (emptycount, len(data)))
        return X

    def fit_transform(self, X, *args):
        self.fit(X, *args)
        return self.transform(X, *args)

In [15]:
from sklearn.pipeline import make_pipeline


def run_logistic_classifier(train_df, test_df):

    vectorizer = fastTextVectorizer(embeddings_lookup)
    train_list_corpus = train_df["body"].tolist()

    test_list_corpus = test_df['body'].tolist()
    train_list_labels = train_df["label"].tolist()
    test_list_labels = test_df["label"].tolist()

    logreg = LogisticRegression(n_jobs=1, C=1e5)

    pipeline = make_pipeline(vectorizer, logreg)
    pipeline.fit(train_list_corpus, train_list_labels)

    pred_labels = pipeline.predict(test_list_corpus)

    results = calculate_accuracy_precision_recall(test_list_labels, pred_labels)

    print(results)
    return {
        'model': pipeline,
        'results': results
    }

In [16]:
logistic_model = run_logistic_classifier(train_df, test_df)
with open('logistic_model.pkl', 'wb') as f:
    pkl.dump(logistic_model, f)

Loading word vectors...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


(0.7891420333794047, 0.7992738743564562, 0.7830092417130696)


In [None]:
with open('logistic_model.pkl', 'rb') as f:
    logistic_model = pkl.load(f)

In [121]:
def visualize_graph(G, seed=10, **kwargs):
    node_args = set(signature(nx.draw_networkx_nodes).parameters.keys())
    node_kwargs = {k: v for k, v in kwargs.items() if k in node_args}
    node_kwargs['node_size'] = kwargs.get('node_size') or 1300
    node_kwargs['cmap'] = kwargs.get('cmap') or 'cool'

    label_args = set(signature(nx.draw_networkx_labels).parameters.keys())
    label_kwargs = {k: v for k, v in kwargs.items() if k in label_args}
    label_kwargs['font_size'] = kwargs.get('font_size') or 10

    pos = nx.spring_layout(G, seed=seed)
    ax = plt.gca()
    for source, target, data in G.edges(data=True):
        ax.annotate(
            '', xy=pos[target], xycoords='data', xytext=pos[source],
            textcoords='data', arrowprops=dict(
                arrowstyle="->",
                alpha=max(data['att'], 0.1),
                color=data['edge_color'],
                shrinkA=sqrt(node_kwargs['node_size']) / 2.0,
                shrinkB=sqrt(node_kwargs['node_size']) / 2.0,
                connectionstyle="arc3,rad=0.1",
            ))

    nx.draw_networkx_nodes(G, pos, node_color='none',
                               **node_kwargs)
    nx.draw_networkx_labels(G, pos,  **label_kwargs)

    return ax

def return_to_networkx(edge_index, edge_mask, original_graph, threshold=None, seed=10, visualize=True, **kwargs):


    assert edge_mask.size(0) == edge_index.size(1)


    hard_edge_mask = torch.BoolTensor([True] * edge_index.size(1),
                                      device=edge_mask.device)

    y = torch.zeros(edge_index.max().item() + 1,
                            device=edge_index.device)

    edge_color = ['black'] * edge_index.size(1)


    if threshold is not None:
        edge_mask = (edge_mask >= threshold).to(torch.float)



    data = Data(edge_index=edge_index, att=edge_mask,
                edge_color=edge_color, y=y, num_nodes=y.size(0)).to('cpu')
    G = to_networkx(data, node_attrs=['y'],
                    edge_attrs=['att', 'edge_color'])

    mapping = dict(zip(range(original_graph.number_of_nodes()), original_graph.nodes()))
    G = nx.relabel_nodes(G, mapping)

    if visualize:
        ax = visualize_graph(G, seed)
        # plt.savefig(f'explainability\\{idx}\\original graph idx {idx}.png', dpi=500)
        plt.show()

    return G

In [93]:
bug_df = train_df[train_df['label']==0]
bug_pyg_graphs = [graph for graph in train_pyg_graphs if graph.y == 0]
bug_netx_graphs = [graph for graph in train_netx_graphs if graph.graph['y'] == 0]
print(len(bug_df), len(bug_pyg_graphs), len(bug_netx_graphs))


134509 134509 134509


In [8]:
from lime.lime_text import LimeTextExplainer

# idx = random.randint(0, len(bug_df)-1)
# idx = 107258
k = 7

lime_explainer = LimeTextExplainer(class_names=['bug', 'feature'], random_state=1)

for idx in range((len(bug_df))):

    gat_model = gat_final_models[random.randint(0,len(gat_final_models)-1)].to(device)
    body = bug_df['body'].tolist()[idx]
    label = bug_df['label'].tolist()[idx]

    if 20 <= len(body.split()) or len(body.split()) <= 10:
        continue
    instance = bug_pyg_graphs[idx]
    if instance.x is None:
        continue
    print(f'TRUE LABEL IS '+('BUG' if label==0 else 'FEATURE'))
    exp = lime_explainer.explain_instance(body, gat_model.predict_proba, num_features=20)

    topk_words = exp.as_list()#/label=test_df['label'].tolist()[idx])
    print(f'The original words with their importance: {topk_words}')
    if label==0:
        topk_words = [word for word, pred in topk_words if pred<0]
    else:
        topk_words = [word for word, pred in topk_words if pred>=0]

    if len(topk_words)>=k:
        topk_words = topk_words[:k]
    # print(f'The top-{k} words from LIME are: {topk_words}')



    explainer = GNNExplainer(gat_model.to('cpu'))
    x = instance.x.to('cpu')
    edge_index = instance.edge_index.to('cpu')
    node_feat_mask, edge_mask = explainer.explain_graph(x, edge_index)

    # exp.show_in_notebook(text=True)

    import os
    # os.makedirs(f'explainability\\{idx}')


    G = return_to_networkx(edge_index, edge_mask, bug_netx_graphs[idx], threshold=0.5)
    words = list(G.nodes())
    # print(words)

    for word in words:
        if word not in topk_words:
            G.remove_node(word)
    ax = visualize_graph(G, 10)
    meaningful_edges = [edge for edge in list(G.edges.data("att")) if edge[2]==1.0]
    if len(meaningful_edges)>=3:
        print(idx)

    # plt.savefig(f'explainability\\{idx}\\graph with lime idx {idx}.png', dpi=500)
    # plt.show()
    print(idx)
    # exp.save_to_file(f'explainability\\{idx}/lime explainer idx {idx}.html')

NameError: name 'bug_df' is not defined