In [1]:
import os
import json
import itertools
import pandas as pd
from pathlib import Path
from mlengine.core.model import train_mlm

def load_json_dataset(path):
    with open(path) as f:
        return json.load(f)

def flatten_entry(data_entry):
    result = []
    for data in data_entry:
        what = data['what']
        if what.strip() != "":
            result.append(what)
    return result

Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



In [2]:
eclipse_dataset = load_json_dataset("data/msr2013-bug_dataset/data/v02/eclipse/short_desc.json")
mozilla_dataset = load_json_dataset("data/msr2013-bug_dataset/data/v02/mozilla/short_desc.json")

c_mozilla_dataset = [*itertools.chain.from_iterable(map(flatten_entry, mozilla_dataset['short_desc'].values()))]
c_eclipse_dataset = [*itertools.chain.from_iterable(map(flatten_entry, eclipse_dataset['short_desc'].values()))]

In [3]:
dataset = list(set(c_mozilla_dataset + c_eclipse_dataset))

In [4]:
dataset[:25]

['This URL (java) opens in the Sidebar instead of the browser window',
 'ToolItem remains pressed',
 'New bookmarks lost when closing Firefox',
 '[PostgreSQL] ./collectstats.pl --regenerate fails (again)',
 'JavaScript code bug in pref-crlupdate.js',
 'Program errors and closes.',
 '[CommonNavigator] Common Navigator Array Store Exception',
 '[Keybindings] Ctrl Shift T not working on Linux GTK',
 'layout gets fouled up when serving from cache.',
 'bmp image served from router fails to reload despite Cache-control: no-cache directive',
 'let me link to about:logo',
 'Provide more control for drag source to influence DnD operation',
 'Job#join should allow passing a progress monitor',
 '[Oracle] collectstats.pl --regenerate uses PL/SQL reserved word in query',
 'Packages missing from open type dialog (1GL5001)',
 'netscape stock quote page lays out badly',
 '[Progress] remove deprecated IProgressConstants.COMMAND_PROPERTY',
 'print hangs',
 'Progress during plugin installation is modal',

In [5]:
file_path = f'{os.path.abspath("")}/train.txt'
with open(file_path, 'w') as f:
    for line in dataset[:10]:
        f.write(f"{line}\n")

In [6]:
# https://www.sbert.net/docs/sentence_transformer/pretrained_models.html
train_mlm("sentence-transformers/all-mpnet-base-v2", file_path)

Some weights of the model checkpoint at sentence-transformers/all-mpnet-base-v2 were not used when initializing MPNetForMaskedLM: ['pooler.dense.bias', 'pooler.dense.weight']
- This IS expected if you are initializing MPNetForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing MPNetForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of MPNetForMaskedLM were not initialized from the model checkpoint at sentence-transformers/all-mpnet-base-v2 and are newly initialized: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
You should probably TRAIN this model on a down-stream task to be 

Save checkpoints to: output/sentence-transformers_all-mpnet-base-v2-2024-06-09_19-15-36
Train sentences: 10
Save tokenizer to: output/sentence-transformers_all-mpnet-base-v2-2024-06-09_19-15-36


We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Step,Training Loss


Save model to: output/sentence-transformers_all-mpnet-base-v2-2024-06-09_19-15-36
Training done


'output/sentence-transformers_all-mpnet-base-v2-2024-06-09_19-15-36'

In [7]:
import torch
from tqdm import tqdm
from transformers import AutoModelForMaskedLM, AutoTokenizer

saved_model_path = f'{os.path.abspath("")}/output/sentence-transformers_all-mpnet-base-v2-2024-06-09_18-09-14/'

tokenizer = AutoTokenizer.from_pretrained(saved_model_path)
model = AutoModelForMaskedLM.from_pretrained(saved_model_path, output_hidden_states=True)

In [8]:
corpus = dataset[100:1000]
corpus_embeddings = []
for doc in tqdm(corpus):
    encoded_input = tokenizer(doc, padding=True, truncation=True, max_length=128, return_tensors='pt')
    with torch.no_grad():
        output = model(**encoded_input)
        embeddings = output.hidden_states[-1].mean(dim=1)  # Using mean pooling
    corpus_embeddings.append(embeddings)
corpus_embeddings = torch.cat(corpus_embeddings)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
100%|██████████████████████████████████████████| 900/900 [01:21<00:00, 11.05it/s]


In [9]:
corpus_embeddings.shape

torch.Size([900, 768])

In [10]:
import random
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

k = 5
query = "reboot"# dataset[random.randint(1000, len(dataset))]

# Tokenize and encode the query into an embedding
encoded_query = tokenizer(query, padding=True, truncation=True, max_length=128, return_tensors='pt')
with torch.no_grad():
    output = model(**encoded_query)
    query_embedding = output.hidden_states[-1].mean(dim=1)  # Using mean pooling

similarities = cosine_similarity(query_embedding, corpus_embeddings)
top_k_indices = np.argsort(similarities[0])[-k:][::-1]
top_k_documents = [corpus[i] for i in top_k_indices]

In [11]:
top_k_documents

['inconsistency: it says click refresh button or do Ctrl+Alt+R, these are different',
 'Firebird fails to start',
 "I can enter my user name on the pop-up but not my password, so I can't get onto this page",
 'page does not load correctly',
 'could not start eclipse']