# Identify Duplicate Bug Reports Using Siamese Cross-Encoder Network

## Load Data

In [1]:
import pandas as pd
import numpy as np
from pymongo import MongoClient
from typing import List, Dict, Tuple, Set
import itertools

In [2]:
def load_dataset(limit=0, verbose=False):
  client = MongoClient()
  db = client['eclipse']
  bug_collection = db['clear']
  pairs_collection = db['pairs']

  pairs: Tuple[Dict] = tuple(pairs_collection.find(limit=limit))
  if verbose:
    print('total pairs', len(pairs))
  bug_groups = [[pair['bug1'], pair['bug2']] for pair in pairs]
  candidate_bug_ids = [
    str(bug_id)
    for bug_group in bug_groups for bug_id in bug_group
  ]
  if verbose:
    print('total candidate_bug_ids', len(candidate_bug_ids))

  # Storing bug reports as dictionary so that they can be
  # retrieved by bug_id
  bug_reports: Dict[str, Dict] = {}
  for bug_report in bug_collection.find({'bug_id': {'$in': candidate_bug_ids}}):
    bug_reports[bug_report['bug_id']] = bug_report
  if verbose:
    print('total bug_reports', len(bug_reports))

  return bug_reports, pairs

In [3]:
def create_dataframe(bug_reports: Dict[str, Dict], pairs: Tuple[Dict]):
  data = [
    [
      bug_reports[str(pair['bug1'])]['short_desc'],
      bug_reports[str(pair['bug2'])]['short_desc'],
      bug_reports[str(pair['bug1'])]['description'],
      bug_reports[str(pair['bug2'])]['description'],
      False if pair['dec'] == -1 else True]
    for pair in pairs
  ]

  columns = ['title1', 'title2', 'description1', 'description2', 'is_similar']

  return pd.DataFrame(data=data, columns=columns)

In [4]:
dup_df = create_dataframe(*load_dataset(100000))
print('Data shape:', dup_df.shape)
dup_df.sample(n=10, random_state=13)

Data shape: (100000, 5)


Unnamed: 0,title1,title2,description1,description2,is_similar
72031,FVT JVEBEANS02 - Slider orientation,org.eclipse.osgi.* package names,When a slider's orientation is set to HORIZONT...,Build M8\n\nThe packages in the org.eclipse.os...,False
27978,Inner class indentation problem,Does not format nicely anonymous type (1FRLTO1),The code shown below is formatted as shown bel...,Formatter does not handle nicely the following...,True
55639,Preference page creation problems,Code Formatter Preferences Page Broken - NPE,Using the 3.0 release candidate\n\nTrying to b...,I200405290105\n\nSelecting the Code Formatter ...,True
51955,Spelling errors in Committer vote e-mails,missing plug-ins for feature org.eclipse.stp.s...,I just got an e-mail on the cdt-dev mailing li...,Feature org.eclipse.stp.sc.jaxws.feature_0.8.0...,False
52145,NPE in WAR validation when creating a new J2EE...,feature.jar's generated from update site need ...,"Using the WTP IBuild from 6/03, an intermitten...",Created attachment 95066\npatch to add update ...,False
3011,Typo on Type Filters pref page,Linux Agent Controller initial vmsize big and ...,"The button should be named ""Disable All"" inste...",While testing with 4.3 on Red Hat I noticed th...,False
83607,"[readme] In Variables View, values of referenc...",Debugging: Variable view messed up,"two references point to the same object, initi...","Hi,\n\nthe variables view in the debugging per...",True
68952,TransationUnit copy constructor broken,"""Subversive JDT ignore recommendations"" plug-i...",I broke the TranslationUnit copy constructor w...,Build Identifier: 20110916-0149\n\nI'm using S...,False
90269,NPE in org.eclipse.jdt.internal.core.ExternalF...,NPE when refreshing external folders,Build Identifier: Eclipse 3.6.0\n\nI am receiv...,Build Identifier: M20100909-0800\n\nI have a w...,True
69234,[perfs] Performance tests with no results shou...,[IBD] It shoud be allowed to display propertie...,Using 3.3 RC1 perf tests results page: \nhttp:...,[IBD] It shoud be allowed to display propertie...,False


In [5]:
print('Data Types:')
dup_df.dtypes

Data Types:


title1          object
title2          object
description1    object
description2    object
is_similar        bool
dtype: object

In [6]:
FEATURES = ['title1', 'title2', 'description1', 'description2']
print('Description of length of the feature columns')
dup_df[FEATURES].apply(lambda col: col.str.len().describe())

Description of length of the feature columns


Unnamed: 0,title1,title2,description1,description2
count,100000.0,100000.0,100000.0,100000.0
mean,55.02302,55.05257,1456.98527,1450.55969
std,21.750643,21.738267,4207.950738,4231.916511
min,1.0,1.0,0.0,0.0
25%,40.0,40.0,238.0,240.0
50%,53.0,53.0,450.0,450.0
75%,67.0,68.0,937.0,942.0
max,255.0,255.0,149346.0,373075.0


In [7]:
print('Outliers by length:')


def count_tail_outliers(col: pd.Series):
  lengths: pd.Series = col.str.len()
  iqr = lengths.quantile(0.75) - lengths.quantile(0.25)
  outlier_range = lengths.quantile(0.75) + 1.5 * iqr
  outlier_count = sum(lengths > outlier_range)
  return pd.Series({
    'iqr': iqr,
    'count': outlier_count,
    'frac': outlier_range / len(lengths),
  })


dup_df[FEATURES].apply(count_tail_outliers)

Outliers by length:


Unnamed: 0,title1,title2,description1,description2
iqr,27.0,28.0,699.0,702.0
count,1965.0,1545.0,13850.0,13914.0
frac,0.001075,0.0011,0.019855,0.01995


In [8]:
dup_df.groupby(by='is_similar').apply(
  lambda group: pd.Series({
    'count': group.size,
    'frac': len(group) / len(dup_df),
  }),
)

Unnamed: 0_level_0,count,frac
is_similar,Unnamed: 1_level_1,Unnamed: 2_level_1
False,296470.0,0.59294
True,203530.0,0.40706


## Train, Validation, Test Split

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
train_val_df, test_df = train_test_split(
  dup_df,
  test_size=10000,
  stratify=dup_df.is_similar,
  random_state=13,
)

In [11]:
train_df, val_df = train_test_split(
  train_val_df,
  test_size=10000,
  stratify=train_val_df.is_similar,
  random_state=13,
)

In [12]:
print(f'Train Val Test Size: {len(train_df):,} {len(val_df):,} {len(test_df):,}')

Train Val Test Size: 80,000 10,000 10,000


## Download & Prepare Embedding

In [13]:
import tensorflow as tf
import tensorflow.keras.utils as kutils
from keras.layers.preprocessing.text_vectorization import TextVectorization
from keras.initializers.initializers_v2 import Constant

In [14]:
def ensure_glove_embedding(verbose=False):
  import pathlib
  embedding_data_path = kutils.get_file(
    'glove.42B.300d.zip',
    'https://nlp.stanford.edu/data/glove.42B.300d.zip',
    untar=True,
    extract=True,
  )

  # If this operation fails, print the parent-dir
  # go there, and extract the file
  file_path = pathlib.Path(embedding_data_path).parent / 'glove.42B.300d.txt'

  if verbose:
    with open(file_path, encoding='utf-8') as glove_embedding_file:
      for i in range(5):
        line = glove_embedding_file.readline()
        word, *embedding = line.split()
        print(
          'Word:', word,
          '| Embedding length:', len(embedding),
          '| Average embedding:', sum(map(float, embedding)) / len(embedding),
        )

  return file_path


glove_file_path = ensure_glove_embedding(verbose=True)

Word: , | Embedding length: 300 | Average embedding: -0.02834135199999997
Word: the | Embedding length: 300 | Average embedding: -0.012646989333333348
Word: . | Embedding length: 300 | Average embedding: -0.05447891
Word: and | Embedding length: 300 | Average embedding: -0.054808682333333324
Word: to | Embedding length: 300 | Average embedding: -0.0682633267666667


### Create Embedding Index

In [15]:
def create_embedding_index(embedding_file_path: str, verbose=False):
  if verbose:
    from tqdm.notebook import tqdm

    # there are 1.9M words, and we will update progress
    # on every 1000 word read
    progress_bar = tqdm(total=1917494)

  embedding_index: Dict[str, np.ndarray] = {}
  with open(embedding_file_path, encoding='utf-8') as embedding_file:
    i = 0
    for line in embedding_file:
      i += 1
      word, coefficients = line.split(maxsplit=1)
      if i > 1917494:
        print('word:', word)
        break
      coefficients = np.fromstring(coefficients, 'float', sep=' ')
      embedding_index[word] = coefficients

      if verbose:
        if i % 1000 == 0:
          progress_bar.update(1000)

  if verbose:
    progress_bar.close()

  if verbose:
    print(f'Found {len(embedding_index)} words in the embedding.')
    print(f'Embedding dimension: {len(next(iter(embedding_index.values())))}')

  return embedding_index


embedding_index = create_embedding_index(glove_file_path, True)

  0%|          | 0/1917494 [00:00<?, ?it/s]

Found 1917494 words in the embedding.
Embedding dimension: 300


### Create Vocabulary Index

In [16]:
MAX_TOKENS = 20000
MAX_TITLE_LENGTH = 100
EMBEDDING_DIM = 300

In [17]:
def build_vocab(sentences: List[str], sequence_length: int):
  vectorizer = TextVectorization(
    max_tokens=MAX_TOKENS - 2,
    output_sequence_length=sequence_length,
  )
  vectorizer.adapt(sentences)
  vocab = vectorizer.get_vocabulary()
  word_index = dict(zip(vocab, range(len(vocab))))

  return vectorizer, word_index

In [18]:
title_vectorizer, title_word_index = build_vocab(
  [*dup_df.title1, *dup_df.title2],
  MAX_TITLE_LENGTH,
)

print(
  'Most frequent title words:',
  list(itertools.islice(title_word_index.keys(), 5)),
)

Most frequent title words: ['', '[UNK]', 'in', 'to', 'not']


### Create Embedding Matrix

In [19]:
def create_embedding_matrix(
  embedding_index: Dict[str, np.ndarray],
  word_index: Dict[str, int],
  verbose=False,
):
  hits = 0
  misses = 0

  # prepare embedding matrix
  embedding_matrix = np.zeros((MAX_TOKENS, EMBEDDING_DIM))
  for word, i in word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
      # Words not found in embedding index will be all-zeros.
      # This includes the representation for "padding" and "OOV"
      embedding_matrix[i] = embedding_vector
      hits += 1
    else:
      misses += 1

  if verbose:
    print('Embedding shape:', embedding_matrix.shape)
    print(f'Found {hits} words, missed {misses}.')

  return embedding_matrix


title_embedding_matrix = create_embedding_matrix(
  embedding_index, title_word_index, True,
)

Embedding shape: (20000, 300)
Found 11764 words, missed 8234.


## Prepare Training Data

In [20]:
def sent_vectorize(col: pd.Series):
  return title_vectorizer(
    np.array([[s] for s in col])
  ).numpy()

train_x1 = sent_vectorize(train_df.title1)
train_x2 = sent_vectorize(train_df.title2)
val_x1 = sent_vectorize(val_df.title1)
val_x2 = sent_vectorize(val_df.title2)
test_x1 = sent_vectorize(test_df.title1)
test_x2 = sent_vectorize(test_df.title2)

train_y = np.array(train_df.is_similar)
val_y = np.array(val_df.is_similar)
test_y = np.array(test_df.is_similar)

print('Train shapes:', train_x1.shape, train_x2.shape, train_y.shape)
print('Val shapes:', val_x1.shape, val_x2.shape, val_y.shape)
print('Test shapes:', test_x1.shape, test_x2.shape, test_y.shape)

Train shapes: (80000, 100) (80000, 100) (80000,)
Val shapes: (10000, 100) (10000, 100) (10000,)
Test shapes: (10000, 100) (10000, 100) (10000,)


## Create Models

In [21]:
from tensorflow.keras import layers
from tensorflow.keras import models
from keras.initializers.initializers_v2 import Constant

In [22]:
TitleEmbeddingLayer = layers.Embedding(
  input_dim=MAX_TOKENS,
  output_dim=EMBEDDING_DIM,
  embeddings_initializer=Constant(title_embedding_matrix),
  trainable=False,
  name='TitleEmbeddingLayer',
)

TitleLSTMLayer = layers.Bidirectional(layers.LSTM(
  units=100,
  dropout=0.2,
  recurrent_dropout=0.2,
  return_sequences=True,
), name='TitleBidirectionalLSTMLayer')

In [34]:
title1_input = layers.Input(shape=(None,), dtype='int64', name='title1_input')
title1_embedding_layer = TitleEmbeddingLayer(title1_input)
title1_lstm_layer = TitleLSTMLayer(title1_embedding_layer)

title2_input = layers.Input(shape=(None,), dtype='int64', name='title2_input')
title2_embedding_layer = TitleEmbeddingLayer(title2_input)
title2_lstm_layer = TitleLSTMLayer(title2_embedding_layer)

title_concat = layers.Add(
  name='title_concat'
)([title1_lstm_layer, title2_lstm_layer])
title_output = layers.Dense(
  1, activation='sigmoid', name='title_output',
)(title_concat)
title_model = models.Model(
  inputs=[title1_input, title2_input],
  outputs=title_output,
  name='title_model'
)

title_model.summary()

Model: "title_model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
title1_input (InputLayer)       [(None, None)]       0                                            
__________________________________________________________________________________________________
title2_input (InputLayer)       [(None, None)]       0                                            
__________________________________________________________________________________________________
TitleEmbeddingLayer (Embedding) multiple             6000000     title1_input[0][0]               
                                                                 title2_input[0][0]               
__________________________________________________________________________________________________
TitleBidirectionalLSTMLayer (Bi multiple             320800      TitleEmbeddingLayer[14]

### Train

In [None]:
title_model.compile(
  loss='binary_crossentropy',
  optimizer='adam',
  metrics=['acc'],
)
title_history = title_model.fit(
  [train_x1, train_x2],
  train_y,
  batch_size=512,
  epochs=5,
  verbose=1,
  validation_data=[
    [val_x1, val_x2],
    val_y,
  ],
)

Epoch 1/5
 13/157 [=>............................] - ETA: 45:49 - loss: 0.6505 - acc: 0.6079

In [None]:
train_x1[0].shape