In [None]:
# Mount Google Drive under /content/drive so the project folder used below is reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Global

In [None]:
# Change into the project folder; the relative paths used by later cells are resolved against it.
cd /content/drive/My Drive/IssuesManagement

/content/drive/.shortcut-targets-by-id/1Hgps4QuC_8w15htjlDJJFgm2WyvBtk5Y/IssuesManagement


In [None]:
import os
import pandas as pd
import numpy as np
import json
from tqdm import tqdm
import tensorflow as tf
import torch
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Project whose CSV files are read from data/<PROJECT_NAME>/.
PROJECT_NAME = "FLUME"

# Issue columns that are joined with spaces into the "text" column.
TEXT_FEATURES = ["title", "description", "summary"]
# Time-gap features given to the model, both measured in days:
# 'CC' = created(key_1) - created(key_2), 'CU' = created(key_1) - updated(key_2).
ADDING_TIME_FEATURES = ['CC', 'CU']

# Token sequences are padded/truncated to this length.
MAXLEN = 256
# Chooses the sequence layer in return_model: "CNN", "LSTM", anything else builds a GRU.
MODEL_NAME = "CNN"
# Only used to name the results directory in the cells visible here.
EMBEDDING_METHOD = "GloVe"
# Torch device string; not referenced by any other cell visible here.
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

In [None]:
# Tokenizer used by glove_tokenizer to split raw text into tokens.
from tokenizers import Tokenizer
from tokenizers.models import BPE
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))

# Trainer configuration for a BPE tokenizer; it is not used by any cell visible here.
from tokenizers.trainers import BpeTrainer
trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

from tokenizers.pre_tokenizers import Whitespace
tokenizer.pre_tokenizer = Whitespace()
# NOTE(review): this reassignment replaces the tokenizer configured above, so only
# the saved "custom-tokenizer.json" (including whatever pre-tokenizer it stores) is
# actually used downstream — confirm the lines above are still needed.
tokenizer = Tokenizer.from_file("custom-tokenizer.json")

In [None]:
# Load GloVe pretrained model
# Rows 0 and 1 of the embedding matrix are reserved (see the index+2 offset below).
# Row 0 is the padding index; this constant itself is not referenced in the visible cells.
__PADDED_INDEX__ = 0
# Index given to tokens that have no entry in WORD2INDEX.
__UNKNOWN_WORD__ = 1

def load_word_embeddings(fname):
    """Parse a GloVe-style text file into a ``{token: vector}`` dictionary.

    Every non-empty line is expected to hold a token followed by its
    space-separated float components.

    Args:
        fname: Path of the embedding text file.

    Returns:
        dict mapping each token to a 1-D ``np.float32`` array, in file order.
    """
    wordvecs = {}
    # Iterate over the file lazily instead of readlines(): embedding dumps can be
    # very large and the raw text does not need to live in memory next to the
    # parsed vectors. UTF-8 is explicit so parsing does not depend on the
    # platform default encoding.
    with open(fname, 'r', encoding='utf-8') as file:
        for line in file:
            tokens = line.rstrip().split(' ')
            if len(tokens) < 2:
                # Blank or vector-less line: nothing to store.
                continue
            wordvecs[tokens[0]] = np.array(tokens[1:], dtype=np.float32)

    return wordvecs

# Build the embedding matrix and the word->row mapping once, then reuse the cached files.
if not os.path.exists("GloVe_embedding_matrix.npy"):
  wordvecs = load_word_embeddings("glove.42B.300d.txt")
  vocab = wordvecs.keys()
  matrix = list(wordvecs.values())
  # Rows are shifted by 2 to leave room for the padding (0) and unknown-word (1) rows.
  WORD2INDEX = {word: index+2 for index, word in enumerate(vocab)}
  # Prepend two all-zero rows for those reserved indices.
  EMBEDDING_MATRIX = np.pad(matrix, [[2,0],[0,0]], mode='constant', constant_values =0.0)
  np.save("GloVe_embedding_matrix.npy", EMBEDDING_MATRIX)
  with open("GloVe_word_to_index.json", "w") as f:
    json.dump(WORD2INDEX, f)
else:
  # NOTE(review): only the .npy file is checked; the JSON mapping is assumed to exist alongside it.
  EMBEDDING_MATRIX = np.load("GloVe_embedding_matrix.npy")
  with open("GloVe_word_to_index.json", "r") as f:
    WORD2INDEX = json.load(f)
EMBEDDING_MATRIX = torch.tensor(EMBEDDING_MATRIX)
# Embedding dimensionality (number of columns of the matrix).
HIDDEN_SIZE = EMBEDDING_MATRIX.shape[1]

def glove_tokenizer(sentences):
  """Convert raw texts into lists of embedding-row indices.

  Each text is split with the module-level ``tokenizer``; every token is looked
  up in ``WORD2INDEX`` and tokens without an entry map to ``__UNKNOWN_WORD__``.

  Args:
    sentences: Iterable of strings.

  Returns:
    A list with one list of integer indices per input text (unpadded).
  """
  encoded = []
  for sentence in sentences:
    pieces = tokenizer.encode(sentence).tokens
    encoded.append([WORD2INDEX.get(piece, __UNKNOWN_WORD__) for piece in pieces])
  return encoded

# Load data

In [None]:
# Training pairs: any non-zero label is collapsed to 1 (linked); 0 stays 0 (not linked).
train_links = pd.read_csv(
    f"data/{PROJECT_NAME}/train_links.csv", keep_default_na=False)
train_links['label'] = train_links['label'].map(lambda x: int(x!=0))
# Issue attributes, indexed by issue key so pairs can be looked up with .loc.
issues = pd.read_csv(
    f"data/{PROJECT_NAME}/preprocessed_attributes.csv", index_col="key")
issues['created'] = pd.to_datetime(issues['created'], utc=True)
issues['updated'] = pd.to_datetime(issues['updated'], utc=True)
# Missing values become a single space so the string concatenation below never meets NaN.
# NOTE(review): this also applies to 'created'/'updated'; a missing timestamp would be
# replaced by " " as well — confirm the data has none.
issues = issues.fillna(" ")
# Build the "text" column by joining the TEXT_FEATURES columns with single spaces.
first_feature = TEXT_FEATURES[0]
issues["text"] = issues[first_feature]
if len(TEXT_FEATURES)>1:
  for feature in TEXT_FEATURES[1:]:
    issues["text"] = issues["text"] + " " + issues[feature]

# Filter model

In [None]:
# Test pairs, with the same label binarisation as the training pairs (non-zero -> 1).
test_links = pd.read_csv(
    f"data/{PROJECT_NAME}/test_links.csv", keep_default_na=False)
test_links['label'] = test_links['label'].map(lambda x: int(x!=0))

In [None]:
def filter_data(issues, links, max_gap_days=30):
  """Keep only the links whose two issues were created close together in time.

  Args:
    issues: DataFrame indexed by issue key with a datetime ``created`` column.
    links: DataFrame with ``key_1`` and ``key_2`` columns holding issue keys.
    max_gap_days: Largest allowed absolute creation gap in days (inclusive).

  Returns:
    A new DataFrame holding the retained rows of ``links`` plus a ``date_gap``
    column (absolute creation gap in days). ``links`` itself is not modified.
  """
  cre_1 = issues.loc[links["key_1"].values, 'created'].values
  cre_2 = issues.loc[links["key_2"].values, 'created'].values
  date_gap = np.abs(np.array(
      (cre_1 - cre_2) / np.timedelta64(1, 'D'), dtype=np.float64))
  # Work on a copy so the caller's frame does not silently gain a column, and
  # return an independent frame so later column assignments on the result do
  # not act on a slice of the input.
  with_gap = links.copy()
  with_gap['date_gap'] = date_gap
  return with_gap[with_gap['date_gap'] <= max_gap_days].copy()

In [None]:
# Restrict both link sets to pairs that pass the creation-date filter.
train_links = filter_data(issues, train_links)
test_links = filter_data(issues, test_links)

# Training

In [None]:
from pydantic import BaseModel

class ModelConfig(BaseModel):
  """Settings shared by the batch generator, the model builder and prediction."""
  # Number of rows in every yielded batch; half of it is the linked-pair chunk size.
  generate_batch_size: int
  # Unlinked pairs drawn per chunk = (generate_batch_size / 2) * mul.
  mul: int
  # Names of the time-gap features to use ('CC', 'CU'); empty list disables them.
  adding_time_features: list
  # Mean and standard deviation used to standardise the time-gap features.
  mean_time_features: float
  std_time_features: float
  # Length of the token-index inputs of the model.
  value_maxlen: int
  # Embedding dimensionality; not read by the functions visible in this notebook.
  hidden_size: int
  # Units of the sequence layer (Conv1D filters / LSTM / GRU) and of the dense layer.
  number_units: int
  # "CNN", "LSTM", or anything else for a GRU.
  model_name: str
  # Adam learning rate.
  learning_rate: float
  # Passed straight to model.fit.
  steps_per_epoch: int
  epochs: int

def generate_input(issues, train_links, config: ModelConfig):
    """Endlessly yield training batches of issue pairs for ``model.fit``.

    Every pass shuffles the linked pairs and walks through them in chunks of
    ``generate_batch_size // 2``. Each chunk is pooled with ``mul`` times as
    many randomly drawn unlinked pairs, and ``generate_batch_size`` rows are
    then sampled (without replacement) from that pool.

    Args:
        issues: DataFrame indexed by issue key with ``text``, ``created`` and
            ``updated`` columns.
        train_links: DataFrame with ``key_1``, ``key_2`` and ``label`` columns;
            a non-zero label marks a linked pair.
        config: Supplies the batch size, negative multiplier, sequence length
            (``value_maxlen``) and the time-feature settings.

    Yields:
        ``([text_1, text_2], y)`` or, when time features are enabled,
        ``([text_1, text_2, time_features], y)``. ``y`` is one-hot with
        column 1 marking linked pairs.

    Raises:
        ValueError: If there are too few linked or unlinked pairs to build one
            batch, or if time features are requested but none is recognised.
    """
    match_data = train_links[train_links["label"] != 0][['key_1', "key_2"]].values
    none_data = train_links[train_links["label"] == 0][['key_1', "key_2"]].values

    each_size = int(config.generate_batch_size / 2)
    none_size = each_size * config.mul

    # Fail loudly up front: with fewer linked pairs than one chunk the inner
    # loop would never run and the generator would spin forever without yielding.
    if each_size < 1 or len(match_data) < each_size:
        raise ValueError(
            f"Need at least {max(each_size, 1)} linked pairs per batch, got {len(match_data)}")
    if len(none_data) < none_size:
        raise ValueError(
            f"Need at least {none_size} unlinked pairs per batch, got {len(none_data)}")
    if each_size + none_size < config.generate_batch_size:
        raise ValueError(
            "generate_batch_size is larger than the pool of linked + unlinked pairs per batch")

    requested = config.adding_time_features
    use_time = len(requested) > 0
    if use_time and len(requested) != 2 and "CC" not in requested and "CU" not in requested:
        raise ValueError(f"Unrecognised time features: {requested}")

    while True:
        # New random order of the linked pairs for this pass.
        shuffle_index = np.random.choice(len(match_data), len(match_data), replace=False)
        match_data = match_data[shuffle_index]

        for step in range(len(match_data) // each_size):
            # One chunk of linked pairs plus freshly sampled unlinked pairs.
            match_links = match_data[step * each_size:(step + 1) * each_size]
            none_index = np.random.choice(len(none_data), none_size, replace=False)
            none_links = none_data[none_index]
            index_pairs = np.concatenate([match_links, none_links])

            # Tokenise and pad both sides to the length the model was built with.
            text_1 = glove_tokenizer(issues["text"].loc[index_pairs[:, 0]].values)
            text_1 = tf.keras.utils.pad_sequences(
                sequences=text_1, padding='post', truncating="post", maxlen=config.value_maxlen)
            text_2 = glove_tokenizer(issues["text"].loc[index_pairs[:, 1]].values)
            text_2 = tf.keras.utils.pad_sequences(
                sequences=text_2, padding='post', truncating="post", maxlen=config.value_maxlen)

            # One-hot labels: [0, 1] for linked pairs, [1, 0] for unlinked pairs.
            match_y = np.vstack(
                [np.zeros(len(match_links)), np.ones(len(match_links))]).T
            none_y = np.vstack(
                [np.ones(len(none_links)), np.zeros(len(none_links))]).T
            y = np.concatenate([match_y, none_y])

            # Down-sample the pool to the requested batch size.
            index = np.random.choice(
                len(index_pairs), config.generate_batch_size, replace=False)

            if not use_time:
                yield [text_1[index], text_2[index]], y[index]
                continue

            # Time gaps in (fractional) days between the two issues of each pair.
            cre_1 = issues["created"].loc[index_pairs[:, 0]].values
            cre_2 = issues["created"].loc[index_pairs[:, 1]].values
            update = issues["updated"].loc[index_pairs[:, 1]].values
            cre_cre = np.array(
                (cre_1 - cre_2) / np.timedelta64(1, 'D'), dtype=np.float64).reshape(-1, 1)
            cre_up = np.array(
                (cre_1 - update) / np.timedelta64(1, 'D'), dtype=np.float64).reshape(-1, 1)
            if len(requested) == 2:
                time_features = np.concatenate([cre_cre, cre_up], axis=1)
            elif "CC" in requested:
                time_features = cre_cre
            else:
                time_features = cre_up

            # Standardise with the statistics stored in the config.
            time_features = (
                time_features - config.mean_time_features) / config.std_time_features
            yield [text_1[index], text_2[index], time_features[index]], y[index]

def return_model(config: ModelConfig):
  """Build and compile the pair classifier.

  Both token inputs go through the same frozen embedding (initialised from
  ``EMBEDDING_MATRIX``) and the same sequence layer. Their flattened outputs
  are concatenated, optionally together with the time-gap features, and fed
  to a ReLU dense layer followed by a 2-way softmax.

  Args:
    config: Supplies ``value_maxlen``, ``number_units``, ``model_name``
      ("CNN", "LSTM", anything else builds a GRU), ``learning_rate`` and
      ``adding_time_features``.

  Returns:
    The compiled ``tf.keras.Model``. It takes ``[input_a, input_b]`` or, when
    time features are configured, ``[input_a, input_b, input_c]``.
  """
  # `shape` must be a tuple; `(n)` without the trailing comma is just an int.
  inputs_A = tf.keras.Input(shape=(config.value_maxlen,), name="input_a")
  inputs_B = tf.keras.Input(shape=(config.value_maxlen,), name="input_b")

  embedding_layer = tf.keras.layers.Embedding(
      input_dim=EMBEDDING_MATRIX.shape[0],
      output_dim=EMBEDDING_MATRIX.shape[1],
      embeddings_initializer=tf.keras.initializers.Constant(value=EMBEDDING_MATRIX),
      mask_zero=True)
  # The pretrained vectors stay fixed during training.
  embedding_layer.trainable = False
  emb_A = embedding_layer(inputs_A)
  emb_B = embedding_layer(inputs_B)

  # Sequence layer shared by both inputs.
  if config.model_name == "CNN":
    core_layer = tf.keras.layers.Conv1D(config.number_units, 3, activation='relu')
  elif config.model_name == "LSTM":
    core_layer = tf.keras.layers.LSTM(config.number_units)
  else:
    core_layer = tf.keras.layers.GRU(config.number_units, name="gru")
  core_A = core_layer(emb_A)
  core_B = core_layer(emb_B)

  flatten_layer = tf.keras.layers.Flatten(name="flatten")
  dense_1_layer = tf.keras.layers.Dense(config.number_units, activation="relu", name="dense_1")
  output_layer = tf.keras.layers.Dense(2, activation="softmax", name="dense_output")

  model_inputs = [inputs_A, inputs_B]
  features = [flatten_layer(core_A), flatten_layer(core_B)]

  # The time-feature width follows the config (not a notebook global) so the
  # model always matches the batches built from the same config: two columns
  # when exactly two features are listed, otherwise one.
  time_feature_count = len(config.adding_time_features)
  if time_feature_count > 0:
    time_width = 2 if time_feature_count == 2 else 1
    inputs_C = tf.keras.Input(shape=(time_width,), name="input_c")
    model_inputs.append(inputs_C)
    features.append(inputs_C)

  # A Keras layer (rather than a raw tf op) joins the symbolic tensors.
  X = tf.keras.layers.Concatenate(axis=1, name="concat")(features)
  outputs = output_layer(dense_1_layer(X))
  model = tf.keras.Model(inputs=model_inputs, outputs=outputs)

  # NOTE(review): mean squared error on a softmax output is kept as in the
  # original; consider a cross-entropy loss for this classification task.
  model.compile(tf.keras.optimizers.Adam(learning_rate=config.learning_rate), loss="mse", metrics=["categorical_accuracy"])
  model.summary()

  return model

def train_model(issues, train_links, config):
  """Build the model described by ``config`` and fit it on generated batches.

  Args:
    issues: Issue table passed through to ``generate_input``.
    train_links: Labelled training pairs passed through to ``generate_input``.
    config: Model and training settings (``steps_per_epoch``, ``epochs``, ...).

  Returns:
    The fitted Keras model.
  """
  network = return_model(config)
  batches = generate_input(issues, train_links, config)
  network.fit(
      batches,
      steps_per_epoch=config.steps_per_epoch,
      epochs=config.epochs,
      shuffle=False,
      verbose=1)
  return network




In [None]:
def get_normalize_parameter(issues, train_links, adding_time_features=None):
    """Compute the mean and standard deviation of the time-gap features.

    The gaps are measured in fractional days (float64), i.e. exactly as the
    features are computed when batches are generated and predictions are made,
    so the statistics match the values they are later used to standardise.

    Args:
        issues: DataFrame indexed by issue key with ``created`` and ``updated``
            datetime columns.
        train_links: DataFrame with ``key_1`` and ``key_2`` columns.
        adding_time_features: Feature names ('CC', 'CU') to include. Defaults
            to the notebook-level ``ADDING_TIME_FEATURES``.

    Returns:
        ``(mean, std)`` over all selected feature values; ``(0.0, 0.0)`` when
        no known feature is selected.
    """
    if adding_time_features is None:
        adding_time_features = ADDING_TIME_FEATURES

    index_pairs = train_links[["key_1", "key_2"]].values
    cre_1 = issues["created"].loc[index_pairs[:, 0]].values
    cre_2 = issues["created"].loc[index_pairs[:, 1]].values
    update = issues["updated"].loc[index_pairs[:, 1]].values
    # float64 keeps the fractional part of the day gap (an int cast would truncate it).
    cre_cre = np.array(
        (cre_1 - cre_2) / np.timedelta64(1, 'D'), dtype=np.float64).reshape(-1, 1)
    cre_up = np.array(
        (cre_1 - update) / np.timedelta64(1, 'D'), dtype=np.float64).reshape(-1, 1)

    if len(adding_time_features) == 2:
        time_features = np.concatenate([cre_cre, cre_up], axis=1)
    elif "CC" in adding_time_features:
        time_features = cre_cre
    elif "CU" in adding_time_features:
        time_features = cre_up
    else:
        # No time feature in use: return neutral placeholder statistics.
        time_features = [0]
    mean = np.mean(time_features)
    std = np.std(time_features)
    return mean, std

In [None]:
# Standardisation statistics for the time-gap features, computed on the training pairs only.
MEAN_TIME_FEATURES, STD_TIME_FEATURES = get_normalize_parameter(
    issues, train_links)
print(MEAN_TIME_FEATURES, STD_TIME_FEATURES)

-233.00655826558267 542.4283763595195


In [None]:
# Training configuration. With generate_batch_size=128 and mul=3 every batch is sampled
# from 64 linked and 192 unlinked pairs; training runs 200 epochs of 5 batches each.
model_config = ModelConfig(
  model_name = MODEL_NAME,
  generate_batch_size = 128,
  mul = 3,
  adding_time_features = ADDING_TIME_FEATURES,
  mean_time_features = MEAN_TIME_FEATURES,
  std_time_features = STD_TIME_FEATURES,
  value_maxlen = MAXLEN,
  hidden_size = HIDDEN_SIZE,
  number_units = 256,
  learning_rate = 1e-3,
  steps_per_epoch = 5,
  epochs = 200)
model = train_model(issues, train_links, model_config)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_a (InputLayer)        [(None, 256)]                0         []                            
                                                                                                  
 input_b (InputLayer)        [(None, 256)]                0         []                            
                                                                                                  
 embedding (Embedding)       (None, 256, 300)             5752488   ['input_a[0][0]',             
                                                          00         'input_b[0][0]']             
                                                                                                  
 conv1d (Conv1D)             (None, 254, 256)             230656    ['embedding[0][0]',       

# Test

In [None]:
from torch.utils.data import Dataset,  DataLoader
class TestDataset(Dataset):
    """Dataset of issue-key pairs and their labels, served one pair at a time."""

    def __init__(self, index_pairs, labels):
        # index_pairs is indexed as [row, column]; column 0 / 1 hold the two keys.
        self.index_pairs = index_pairs
        self.labels = labels

    def __len__(self):
        """Number of labelled pairs."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``([key_1, key_2], label)`` for the pair at position ``idx``."""
        key_1 = self.index_pairs[idx, 0]
        key_2 = self.index_pairs[idx, 1]
        return [key_1, key_2], self.labels[idx]

In [None]:
def get_predict(issues, index_pairs, config, model):
  """Run the model on a batch of issue pairs.

  Args:
    issues: DataFrame indexed by issue key with ``text``, ``created`` and
      ``updated`` columns.
    index_pairs: Two-element sequence; element 0 holds the first keys of the
      pairs and element 1 the second keys.
    config: Supplies ``value_maxlen``, ``adding_time_features`` and the
      standardisation statistics.
    model: Callable model that accepts a list of inputs.

  Returns:
    Whatever ``model`` returns for the batch (one row of class scores per pair).
  """
  keys_1, keys_2 = index_pairs[0], index_pairs[1]

  # Pad to the length the model was built with (config), not a notebook global.
  text_1 = glove_tokenizer(issues["text"].loc[keys_1].values)
  text_1 = tf.keras.utils.pad_sequences(
      sequences=text_1, padding='post', truncating="post", maxlen=config.value_maxlen)
  text_2 = glove_tokenizer(issues["text"].loc[keys_2].values)
  text_2 = tf.keras.utils.pad_sequences(
      sequences=text_2, padding='post', truncating="post", maxlen=config.value_maxlen)

  requested = config.adding_time_features
  if len(requested) != 2 and "CC" not in requested and "CU" not in requested:
    # No usable time feature configured: text-only prediction.
    return model([text_1, text_2])

  # Time gaps in (fractional) days between the two issues of each pair.
  cre_1 = issues["created"].loc[keys_1].values
  cre_2 = issues["created"].loc[keys_2].values
  update = issues["updated"].loc[keys_2].values
  cre_cre = np.array(
      (cre_1 - cre_2) / np.timedelta64(1, 'D'), dtype=np.float64).reshape(-1, 1)
  cre_up = np.array(
      (cre_1 - update) / np.timedelta64(1, 'D'), dtype=np.float64).reshape(-1, 1)

  if len(requested) == 2:
    time_features = np.concatenate([cre_cre, cre_up], axis=1)
  elif "CC" in requested:
    time_features = cre_cre
  else:
    time_features = cre_up

  # Standardise with the statistics stored in the config.
  time_features = (time_features - config.mean_time_features) / config.std_time_features
  return model([text_1, text_2, time_features])

In [None]:
# One-hot targets in the layout used for training: [1, 0] = not linked, [0, 1] = linked.
labels = [[1, 0] if value == 0 else [0, 1] for value in test_links["label"].values]
test_data = TestDataset(test_links[["key_1", "key_2"]].values, labels)

In [None]:
# Serve the test pairs in order, 2048 at a time; the cell displays the number of batches.
test_dataloader = DataLoader(test_data, batch_size=2048)
len(test_dataloader)

8

In [None]:
# Create the results directory; exist_ok makes the cell safe to re-run without a separate existence check.
os.makedirs(f"results_{EMBEDDING_METHOD}_{PROJECT_NAME}", exist_ok=True)

In [None]:
# Score the test pairs batch by batch, then join the per-batch outputs once.
batch_probas = []
for index, i in enumerate(test_dataloader):
  index_pairs, label = i
  index_pairs[0] = np.array(index_pairs[0])
  index_pairs[1] = np.array(index_pairs[1])
  proba = get_predict(issues, index_pairs, model_config, model)
  # Store every batch as a NumPy array so pred_proba has the same type
  # regardless of how many batches there are.
  batch_probas.append(np.asarray(proba))
  print(f"{index}/{len(test_dataloader)}")
pred_proba = np.concatenate(batch_probas)

0/8
1/8
2/8
3/8
4/8
5/8
6/8
7/8


In [None]:
# Collapse one-hot targets and predicted scores to class ids (0 = not linked, 1 = linked).
y_s = [np.argmax(i) for i in labels]
pred_s = [np.argmax(i) for i in pred_proba]

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
# Pair-level evaluation. In the confusion matrix rows are true classes and columns predicted classes.
# NOTE(review): in the recorded output below the column for class 1 is all zeros, i.e. the
# model never predicts a link; the high accuracy only reflects the class imbalance.
print("Confusion maxtrix")
print(confusion_matrix(y_s, pred_s))
print(classification_report(y_s, pred_s, digits= 2))

Confusion maxtrix
[[15846     0]
 [  126     0]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00     15846
           1       0.00      0.00      0.00       126

    accuracy                           0.99     15972
   macro avg       0.50      0.50      0.50     15972
weighted avg       0.98      0.99      0.99     15972



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Recommend

In [None]:
# Test pairs that are actually linked; they provide the ground truth for the recommendation task.
match_test_links = test_links[test_links["label"]!=0]

In [None]:
# Keys of the issues to evaluate (only the index of the CSV is kept).
test_issues = pd.read_csv(f"data/{PROJECT_NAME}/test_issues.csv", index_col="key").index

# For every test issue collect the issues it is linked to (in either direction);
# issues without any linked pair in match_test_links are dropped from the evaluation.
y_test = []
filter_test_issues = []
for test_issue in tqdm(test_issues):
  filter_links = match_test_links[(match_test_links["key_1"]==test_issue)|(match_test_links["key_2"]==test_issue)]
  if len(filter_links)>0:
    match_issues = set(list(filter_links["key_1"].values) + list(filter_links["key_2"].values))
    # The query issue itself is not a valid recommendation target.
    match_issues.remove(test_issue)
    y_test.append(list(match_issues))
    filter_test_issues.append(test_issue)
# From here on test_issues is aligned index-by-index with y_test.
test_issues = filter_test_issues

100%|██████████| 276/276 [00:00<00:00, 1457.83it/s]


In [None]:
# Number of test issues that have at least one linked issue.
len(test_issues)

154

In [None]:
def get_accuracy(pred, gt):
    """Hit indicator: 1.0 when at least one item of ``pred`` occurs in ``gt``, else 0."""
    return 1.0 if any(candidate in gt for candidate in pred) else 0

def get_MRR(pred, gt):
    """Reciprocal rank of the first relevant item in ``pred``.

    Args:
        pred: Ranked list of recommended items (best first).
        gt: Collection of relevant items.

    Returns:
        ``1 / rank`` of the first item of ``pred`` found in ``gt`` (rank starts
        at 1), or 0 when no recommended item is relevant.
    """
    for rank, item in enumerate(pred, start=1):
        if item in gt:
            # Only the first hit counts; adding later hits would let the
            # value exceed 1 when several relevant items are recommended.
            return 1.0 / rank
    return 0

def get_precision_recall(pred, gt):
    """Precision and recall of a recommendation list.

    Args:
        pred: Recommended items.
        gt: Relevant items.

    Returns:
        ``(precision, recall)`` where precision is the number of relevant
        items found in ``pred`` divided by ``len(pred)`` and recall is the
        same count divided by ``len(gt)``. Either value is 0 when its
        denominator would be zero.
    """
    right = sum(1 for item in gt if item in pred)
    precision = right / len(pred) if len(pred) else 0
    # Guard the empty ground truth the same way as the empty prediction list.
    recall = right / len(gt) if len(gt) else 0
    return precision, recall

def get_f1_score(precision, recall):
    """Harmonic mean of ``precision`` and ``recall``; 0 when both sum to zero."""
    denominator = precision + recall
    if denominator == 0:
        return 0
    return 2 * (precision * recall) / denominator

def get_metrics(recommend, label):
    """Average the ranking metrics over all queries.

    Args:
        recommend: One recommendation list per query.
        label: One list of relevant items per query, aligned with ``recommend``.

    Returns:
        ``(acc, mrr, precision, recall, f1)``, each summed over the queries
        that have a non-empty label and divided by ``len(recommend)``. All
        values are 0 when ``recommend`` is empty.
    """
    total = len(recommend)
    if total == 0:
        return 0, 0, 0, 0, 0

    acc = 0
    mrr = 0
    precision = 0
    recall = 0
    f1 = 0
    for i in range(total):
        if len(label[i]) != 0:
            acc += get_accuracy(recommend[i], label[i])
            mrr += get_MRR(recommend[i], label[i])
            query_precision, query_recall = get_precision_recall(recommend[i], label[i])
            precision += query_precision
            recall += query_recall
            # Accumulate like the other metrics; plain assignment would keep
            # only the last query's F1.
            f1 += get_f1_score(query_precision, query_recall)

    return acc / total, mrr / total, precision / total, recall / total, f1 / total


In [None]:
def get_recommendation(test_issue, issues, test_links):
  """Rank the other issues by their predicted link probability to ``test_issue``.

  Args:
    test_issue: Key of the query issue; it must be present in ``issues.index``.
    issues: DataFrame whose index lists all issue keys.
    test_links: DataFrame with ``key_1``, ``key_2``, ``link_id``
      (``"<key_1>-<key_2>"``) and ``proba`` columns.

  Returns:
    Keys of the issues paired with ``test_issue`` in ``test_links`` (in either
    direction), ordered from highest to lowest probability. An issue that is
    paired in both directions is scored with the larger probability.
  """
  candidates = list(issues.index)
  candidates.remove(test_issue)
  repeated = [test_issue] * len(candidates)

  # Pairs in which the query issue is the first key.
  as_first = pd.DataFrame({"key_1": repeated, "key_2": candidates})
  as_first_ids = (as_first["key_1"] + "-" + as_first["key_2"]).values
  rows_as_first = test_links[test_links["link_id"].isin(as_first_ids)]
  scored = list(zip(rows_as_first["proba"].values, rows_as_first["key_2"].values))

  # Pairs in which the query issue is the second key.
  as_second = pd.DataFrame({"key_1": candidates, "key_2": repeated})
  as_second_ids = (as_second["key_1"] + "-" + as_second["key_2"]).values
  rows_as_second = test_links[test_links["link_id"].isin(as_second_ids)]
  scored += list(zip(rows_as_second["proba"].values, rows_as_second["key_1"].values))

  # Keep the best score per candidate issue.
  best = {}
  for proba, other in scored:
    best[other] = proba if other not in best else max(proba, best[other])

  # Sort by (probability, key) in descending order and return only the keys.
  ranked = sorted(((proba, other) for other, proba in best.items()), reverse=True)
  return [other for _, other in ranked]

In [None]:
# Column 1 of the model output is the score of the "linked" class.
test_links["proba"] = pred_proba[:,1]

In [None]:
# Pair identifier "<key_1>-<key_2>", used by get_recommendation to select rows.
test_links["link_id"] = test_links["key_1"] + "-" + test_links["key_2"]

In [None]:
# One ranked recommendation list per test issue, in the same order as test_issues / y_test.
recommend_results = []
for test_issue in tqdm(test_issues):
    recommend_results.append(get_recommendation(
        test_issue, issues, test_links))

100%|██████████| 154/154 [00:01<00:00, 143.78it/s]


In [None]:
# Evaluate the top-k recommendations for several cut-offs.
# Precision is computed by get_metrics as well but is not printed here.
for k in [1,2,3,5,10]:
  recommend_list = [i[:k] for i in recommend_results]
  acc, mrr, precision, recall, f1 = get_metrics(recommend_list, y_test)
  print(f"Top {k}:")
  print(f"Acc = {acc}")
  print(f"MRR = {mrr}")
  print(f"Recall = {recall}")
  print(f"F1 = {f1}")

Top 1:
Acc = 0.032467532467532464
MRR = 0.032467532467532464
Recall = 0.028138528138528136
F1 = 0.0
Top 2:
Acc = 0.07142857142857142
MRR = 0.05194805194805195
Recall = 0.06277056277056277
F1 = 0.004329004329004329
Top 3:
Acc = 0.07142857142857142
MRR = 0.054112554112554105
Recall = 0.06493506493506493
F1 = 0.003246753246753247
Top 5:
Acc = 0.12987012987012986
MRR = 0.06904761904761904
Recall = 0.12554112554112554
F1 = 0.002164502164502165
Top 10:
Acc = 0.3116883116883117
MRR = 0.09351680065965774
Recall = 0.3073593073593074
F1 = 0.001443001443001443
