In [9]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertModel
import pickle

# Load the pretrained BERT tokenizer and encoder used to embed issue text
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')


repo_name = "ckeditor5"

# Load the pickled centroid array for this repository. The file name is built
# from repo_name so it stays in sync with the clustered CSV loaded below, and
# the context manager closes the file handle as soon as loading finishes.
# NOTE(review): pickle.load can execute arbitrary code -- only load centroid
# files produced by a trusted pipeline.
with open(repo_name + '_centroid_arr.pkl', 'rb') as centroid_file:
    centroid_arr = pickle.load(centroid_file)
# Load the clustered issues dataframe for the same repository
df = pd.read_csv('data/clustered_data_' + repo_name + '.csv')

def assign_new_coming_issue_to_cluster(text:str):
    """Assign a new issue to the nearest existing cluster.

    The text is embedded with the module-level ``tokenizer`` and ``model``
    (mean of the last-layer embeddings over dim=1) and compared against every
    entry of the module-level ``centroid_arr`` by Euclidean distance.

    Args:
        text: Issue title and body concatenated into a single string. The
            tokenizer is called with ``truncation=True, max_length=512``.

    Returns:
        The index, as returned by ``np.argmin``, of the entry of
        ``centroid_arr`` closest to the embedding of ``text``.

    Raises:
        ValueError: If ``centroid_arr`` is empty.
    """
    # Fail fast with a clear message before paying for a BERT forward pass;
    # np.argmin would otherwise raise a less descriptive ValueError.
    if len(centroid_arr) == 0:
        raise ValueError("centroid_arr is empty; cannot assign a cluster")

    # Tokenize and encode the text as PyTorch tensors
    encoded = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)

    # Inference only: no gradients are needed for the forward pass or pooling
    with torch.no_grad():
        last_hidden_states = model(**encoded).last_hidden_state
        # Pooling strategy: mean of all token embeddings (dim=1)
        mean_embeddings = torch.mean(last_hidden_states, dim=1)

    # Convert the single pooled embedding to NumPy once, instead of once per
    # centroid inside the distance loop.
    embedding = mean_embeddings[0].detach().cpu().numpy()

    # Euclidean distance from the embedding to each centroid
    distance_arr = np.array(
        [np.linalg.norm(embedding - centroid) for centroid in centroid_arr]
    )

    # The closest centroid's position is the assigned cluster
    return np.argmin(distance_arr)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
# Sanity check: run the first three issues through the cluster assignment
for issue_idx in range(3):
    print(assign_new_coming_issue_to_cluster(df['Combined Text'][issue_idx]))

30
14
16


In [12]:
# Print the cluster originally recorded for the first three issues
for issue_idx in range(3):
    print(df['Cluster'][issue_idx])

30
14
16
