In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from pymongo import MongoClient

In [2]:
def load_dataset(limit=0, verbose=False):
  """Load labelled bug-report pairs from the local MongoDB `eclipse` database.

  Reads the `pairs` collection, fetches both bug reports of every pair from
  the `clear` collection and builds one row per pair out of the reports'
  `short_desc` fields.

  Args:
    limit: forwarded to `find(limit=...)` on the `pairs` collection
      (PyMongo documents 0 as "no limit").
    verbose: when true, print how many pairs, candidate ids and bug reports
      were read.

  Returns:
    A DataFrame with columns `sentences1`, `sentences2` and `is_similar`;
    `is_similar` is 0 when the pair's `dec` field equals -1 and 1 otherwise.

  Raises:
    KeyError: if a pair references a bug id for which no report was fetched.
  """
  # The context manager closes the client once all cursors are consumed, so
  # repeated calls from the notebook do not accumulate open connections.
  with MongoClient() as client:
    db = client['eclipse']
    bug_collection = db['clear']
    pairs_collection = db['pairs']

    # Materialise the cursor: the pairs are iterated twice below.
    pairs = tuple(pairs_collection.find(limit=limit))
    if verbose:
      print('pairs', len(pairs))
    bug_groups = [[pair['bug1'], pair['bug2']] for pair in pairs]
    # Ids are converted with str() before querying and before the lookups
    # below, so both sides use the same key type.
    candidate_bug_ids = [
      str(bug_id)
      for bug_group in bug_groups
      for bug_id in bug_group
    ]
    if verbose:
      print('candidate_bug_ids', len(candidate_bug_ids))
    bug_reports = {}
    for bug_report in bug_collection.find({'bug_id': {'$in': candidate_bug_ids}}):
      bug_reports[bug_report['bug_id']] = bug_report
    if verbose:
      print('bug_reports', len(bug_reports))

  data = [
    [
      bug_reports[str(pair['bug1'])]['short_desc'],
      bug_reports[str(pair['bug2'])]['short_desc'],
      0 if pair['dec'] == -1 else 1]
    for pair in pairs
  ]

  return pd.DataFrame(data=data, columns=['sentences1', 'sentences2', 'is_similar'])


dataset = load_dataset()

In [3]:
print('dataset.shape:', dataset.shape)
dataset.head(10)

dataset.shape: (271098, 3)


Unnamed: 0,sentences1,sentences2,is_similar
0,[update] could not load tasklist hyperlink det...,[update] Sometimes but not selden i get the er...,1
1,WSE hangs in external browser after invoking f...,Loading model aborts on non-fatal error,0
2,[Regression]<Select value...> can not select a...,Select value in table filter condition panel d...,1
3,Group completion options issue,[Group Code Assist] No code completion for und...,1
4,Add org.apache.bcel,[api tooling] comments from Eugene,0
5,Support cube filter in chart,add API-3.0 and other common tags to project s...,0
6,Max Rydahl Andersen's blog feed contains comments,Move my feed from blog.xam.dk to in.relation.to,1
7,[Regression] Highlight can not be added and th...,Submitting task fails with invalid date / time...,0
8,Notification e-mails not sent for committer el...,[Regression] The error is of no default value ...,0
9,unable to reassign tasks if no permissions to ...,cmdbf services make eclipse-specific references,0


In [4]:
dataset.dtypes

sentences1    object
sentences2    object
is_similar     int64
dtype: object

In [5]:
np.sum(dataset["is_similar"]), len(dataset["is_similar"])

(110181, 271098)

In [6]:
import gensim.downloader as api

In [7]:
embedding_model = api.load("glove-wiki-gigaword-100")

In [8]:
embedding_model.most_similar("orange")

[('yellow', 0.7358633279800415),
 ('red', 0.7140780091285706),
 ('blue', 0.7118036150932312),
 ('green', 0.7111418843269348),
 ('pink', 0.677507221698761),
 ('purple', 0.6774231791496277),
 ('black', 0.6709616780281067),
 ('colored', 0.665260910987854),
 ('lemon', 0.6251963973045349),
 ('peach', 0.6168624758720398)]

In [9]:
len(embedding_model["president"])

100

In [10]:
def load_data(df):
  """Split the pair frame into two sentence arrays and a label array.

  Side effect: a `combined` column holding `sentences1 + sentences2` is
  written into `df` in place.

  Args:
    df: frame with `sentences1`, `sentences2` and `is_similar` columns.

  Returns:
    Tuple `(sentences1, sentences2, labels)`: the two sentence columns cast
    to `str` as arrays, and the `is_similar` values.
  """
  first, second = (
    df[column].astype(str).values
    for column in ("sentences1", "sentences2")
  )
  # combine to get the tokens
  df["combined"] = df["sentences1"] + df["sentences2"]
  return first, second, df["is_similar"].values

In [11]:
sentences1, sentences2, labels = load_data(dataset)

In [12]:
sentences1[0], sentences2[0], labels[0]

('[update] could not load tasklist hyperlink detector extension',
 '[update] Sometimes but not selden i get the error: InteractionContextManager.getScalingFactors',
 1)

In [13]:
combined = list(sentences1) + list(sentences2)

In [14]:
dataset["combined"][0]

'[update] could not load tasklist hyperlink detector extension[update] Sometimes but not selden i get the error: InteractionContextManager.getScalingFactors'

In [15]:
len(combined)

542196

In [16]:
def clean_ascii(text):
  """Return `text` without the characters whose code point is 128 or above."""
  return ''.join(filter(lambda ch: ord(ch) < 128, text))

In [17]:
cleaned = clean_ascii('Mad%sk')
cleaned

'Mad%sk'

In [18]:
max_words = 10000
embedding_dimension = 100

In [19]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [20]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [21]:
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')

In [22]:
tokenizer.fit_on_texts(combined)

In [23]:
sequences = tokenizer.texts_to_sequences(combined)

In [24]:
sequences_padded = pad_sequences(sequences, maxlen=300, padding='post')

In [25]:
sequences_padded[0, :10]

array([  36,  328,    4,  478, 2886,  934, 3769,  187,    0,    0])

In [26]:
#create embedding matrix
def get_embedding_matrix(word_index=None, vectors=None, num_words=None, dim=None):
  """Build the weight matrix used to initialise the embedding layer.

  Row `i` receives the pretrained vector of the word whose index in
  `word_index` is `i`. Rows that are never assigned (no word with that index,
  or the word is unknown to `vectors`) stay all-zero.

  Args:
    word_index: mapping word -> integer index; defaults to the notebook's
      `tokenizer.word_index`.
    vectors: object offering `has_index_for(word)` and `vectors[word]`;
      defaults to the notebook's `embedding_model`.
    num_words: number of rows of the matrix; defaults to `max_words`.
    dim: number of columns of the matrix; defaults to `embedding_dimension`.

  Returns:
    A `(num_words, dim)` numpy array.
  """
  if word_index is None:
    word_index = tokenizer.word_index
  if vectors is None:
    vectors = embedding_model
  if num_words is None:
    num_words = max_words
  if dim is None:
    dim = embedding_dimension

  embedding_matrix = np.zeros((num_words, dim))
  for word, i in word_index.items():
    # Skip (rather than break on) indices that have no row: the result then
    # does not depend on the iteration order of word_index, and vocabularies
    # smaller than num_words are handled without a failing sanity assert.
    if i >= num_words:
      continue
    if vectors.has_index_for(word):
      embedding_matrix[i] = vectors[word]
  return embedding_matrix


emb_matrix = get_embedding_matrix()
print('emb_matrix.shape:', emb_matrix.shape)

emb_matrix.shape: (10000, 100)


In [27]:
from tensorflow.keras.layers import Bidirectional, LSTM, Input, Lambda, Dense

In [28]:
from tensorflow.keras.models import Model

In [29]:
from sklearn.model_selection import train_test_split

In [30]:
import tensorflow.keras.backend as K

In [31]:
lstm_layer = Bidirectional(LSTM(10, dropout=0.2, recurrent_dropout=0.2))

In [32]:
emb = tf.keras.layers.Embedding(max_words, embedding_dimension, input_length=300, weights=[emb_matrix],
                                trainable=False)

In [33]:
input1 = Input(shape=(300,), name="input1")

In [34]:
e1 = emb(input1)
x1 = lstm_layer(e1)

In [35]:
input2 = Input(shape=(300,), name="input2")

In [36]:
e2 = emb(input2)
x2 = lstm_layer(e2)

In [37]:
def manhattan_distance(x):
  """Return the element-wise absolute difference between `x[0]` and `x[1]`."""
  return tf.keras.backend.abs(x[0] - x[1])

In [38]:
def euclidean_distance(x):
  """Euclidean distance between `x[0]` and `x[1]`, reduced over axis 1.

  The reduced axis is kept (`keepdims=True`), so the result has size 1 along
  axis 1.
  """
  left, right = x[0], x[1]
  squared_norm = K.sum(K.square(left - right), axis=1, keepdims=True)
  # Clamp to at least K.epsilon() so the square root never sees zero.
  clamped = K.maximum(squared_norm, K.epsilon())
  return K.sqrt(clamped)

In [39]:
# euclidean_distance sums over axis 1 with keepdims=True, so the layer emits
# one value per sample; declare (batch, 1) instead of echoing the shape of the
# first input. NOTE(review): the layer is still named "L1_distance" although
# it computes a Euclidean (L2) distance; the name is kept so anything that
# refers to the layer by name keeps working.
merged = Lambda(
  function=euclidean_distance,
  output_shape=lambda shapes: (shapes[0][0], 1),
  name="L1_distance",
)([x1, x2])

In [40]:
preds = Dense(1, activation="sigmoid")(merged)

In [41]:
model = Model(inputs=[input1, input2], outputs=preds)

In [42]:
def contrastive_loss(y_true, y_pred):
  """Contrastive loss with a margin of 1, computed in float64.

  Rows with label 1 contribute the squared prediction; rows with label 0
  contribute the squared positive part of `margin - prediction`. The mean of
  all contributions is returned.
  """
  margin = 1
  labels = tf.cast(y_true, tf.float64)
  predictions = tf.cast(y_pred, tf.float64)
  label_one_term = labels * K.square(predictions)
  label_zero_term = (1 - labels) * K.square(K.maximum(margin - predictions, 0))
  return K.mean(label_one_term + label_zero_term)

In [43]:
model.compile(loss="mse", optimizer="adam", metrics=["accuracy"])

In [44]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input1 (InputLayer)             [(None, 300)]        0                                            
__________________________________________________________________________________________________
input2 (InputLayer)             [(None, 300)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 300, 100)     1000000     input1[0][0]                     
                                                                 input2[0][0]                     
__________________________________________________________________________________________________
bidirectional (Bidirectional)   (None, 20)           8880        embedding[0][0]              

In [45]:
def create_data(test_size=20000, val_size=20000, random_state=42):
  """Split the module-level `dataset` into train, test and validation sets.

  The two sentence columns are selected by name, so the split does not depend
  on whether the helper column `combined` has been added to `dataset`. They
  are cast to `str`, matching what `load_data` does before the tokenizer is
  fitted.

  Args:
    test_size: `test_size` for the first split (held-out test set).
    val_size: `test_size` for the second split (validation set taken from
      the remaining rows).
    random_state: seed passed to both `train_test_split` calls.

  Returns:
    `(x_train, y_train, x_test, y_test, x_val, y_val)`; each `x_*` has two
    columns (`sentences1`, `sentences2`).
  """
  features = dataset[["sentences1", "sentences2"]].astype(str).values
  labels = dataset["is_similar"].values
  x_rest, x_test, y_rest, y_test = train_test_split(
    features, labels, test_size=test_size, random_state=random_state)
  x_train, x_val, y_train, y_val = train_test_split(
    x_rest, y_rest, test_size=val_size, random_state=random_state)
  return x_train, y_train, x_test, y_test, x_val, y_val

In [46]:
x_train, y_train, x_test, y_test, x_val, y_val = create_data()
x_train.shape, y_train.shape, x_test.shape, y_test.shape, x_val.shape, y_val.shape

((231098, 2), (231098,), (20000, 2), (20000,), (20000, 2), (20000,))

In [47]:
def convert_to_sequences(sentences):
  """Tokenize `sentences` with the module-level tokenizer and pad to 300.

  Padding is applied at the end of each sequence (`padding="post"`).
  """
  token_ids = tokenizer.texts_to_sequences(sentences)
  return pad_sequences(token_ids, maxlen=300, padding="post")

In [48]:
x_converted = convert_to_sequences(x_train[:, 0])

In [49]:
x_converted.shape

(231098, 300)

In [50]:
x_train[:, 0].shape

(231098,)

In [None]:
# Train the two-input model: each sentence column of x_train is tokenized and
# padded separately and fed to its own input. The validation split goes
# through the same conversion.
history = model.fit(
  [convert_to_sequences(x_train[:, 0]),
   convert_to_sequences(x_train[:, 1])],
  y_train,
  batch_size=128,
  epochs=10,
  verbose=1,
  validation_data=(
    [convert_to_sequences(x_val[:, 0]), convert_to_sequences(x_val[:, 1])],
    y_val,
  ),
)

Epoch 1/10
Epoch 2/10

In [None]:
model.evaluate([convert_to_sequences(x_test[:, 0]), convert_to_sequences(x_test[:, 1])], y_test)

In [None]:
import matplotlib.pyplot as plt

In [None]:
history.model

In [None]:
test_instance = x_test[3]

In [None]:
test_instance[0], test_instance[1], y_test[3]

In [None]:
# Label of the same example as test_instance (x_test[3]); index 0 would pair
# the instance with the label of a different row.
test_label = y_test[3]

In [None]:
model.predict([convert_to_sequences([test_instance[0]]), convert_to_sequences([test_instance[1]])])

In [None]:
predicted = model.predict([convert_to_sequences(x_test[:, 0]), convert_to_sequences(x_test[:, 1])])

In [None]:
predicted