In [None]:
%pip install umap-learn
%pip install sentence-transformers

In [17]:
import pandas as pd
import sys
import umap
import pickle
import importlib
import os
import math
from sentence_transformers import SentenceTransformer

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.mixture import GaussianMixture

from sentence_transformers import SentenceTransformer
import umap
import matplotlib.pyplot as plt
import re
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from tqdm import tqdm 
tqdm.pandas()


DATE_STR = "20240908"
CLUSTER_N = 5
RS = 42
SNIPPET_LEN_LOWER_BOUND = 10
SNIPPET_LEN_UPPER_BOUND = 70
INTENT_LEN_LOWER_BOUND = 20
INTENT_LEN_UPPER_BOUND = 60

SNIPPET_TOKEN_N_LOWER_BOUND = 5
INTENT_TOKEN_N_LOWER_BOUND = 5

model_name="Salesforce/codet5-base-multi-sum"
tokenizer = AutoTokenizer.from_pretrained(model_name, skip_special_tokens=False)



In [18]:
dataset = load_dataset("neulab/conala", "mined")

In [19]:
dataset_df = pd.DataFrame(dataset["train"])
dataset_df['snippet_len'] = dataset_df.snippet.str.len()
dataset_df['intent_len'] = dataset_df.intent.str.len()
print(dataset_df.shape)

(593891, 8)


In [20]:
dataset_df = dataset_df.loc[(dataset_df.snippet_len>=SNIPPET_LEN_LOWER_BOUND) & (dataset_df.snippet_len<=SNIPPET_LEN_UPPER_BOUND), :]
dataset_df = dataset_df.loc[(dataset_df.intent_len>=INTENT_LEN_LOWER_BOUND) & (dataset_df.snippet_len<=INTENT_LEN_UPPER_BOUND), :]
print(dataset_df.shape)

(364115, 8)


In [21]:
dataset_df["snippet_token_n"] = dataset_df.snippet.progress_apply(lambda x: len(tokenizer.tokenize(x)))
dataset_df["intent_token_n"] = dataset_df.intent.progress_apply(lambda x: len(tokenizer.tokenize(x)))

100%|██████████| 364115/364115 [00:19<00:00, 19033.75it/s]
100%|██████████| 364115/364115 [00:21<00:00, 16811.46it/s]


In [22]:
dataset_df = dataset_df.loc[(dataset_df.snippet_token_n>=SNIPPET_TOKEN_N_LOWER_BOUND), :]
dataset_df = dataset_df.loc[(dataset_df.intent_token_n>=INTENT_TOKEN_N_LOWER_BOUND), :]
print(dataset_df.shape)

(325432, 10)


In [27]:
dataset_df["cluster"] = 0
dataset_df.loc[(dataset_df.intent_token_n>=8) & (dataset_df.intent_token_n<13), "cluster"] = 1
dataset_df.loc[(dataset_df.intent_token_n>=13) & (dataset_df.intent_token_n<16), "cluster"] = 2
dataset_df.loc[dataset_df.intent_token_n>=16, "cluster"] = 3

CLUSTER_LEN_INTENT = True

In [28]:
if not CLUSTER_LEN_INTENT:
    if not os.path.exists(f"../data/processed/conala/{DATE_STR}/conala_mined_clustered.csv"):
        # TOPIC MODELING
        # we do the topic modeling based on the semantic meaning of the intent

        model = SentenceTransformer(model_name)

        # Our sentences to encode
        sentences = dataset_df.snippet.values

        # Sentences are encoded by calling model.encode()
        embeddings = model.encode(sentences)

        # Print the embeddings
        for sentence, embedding in zip(sentences, embeddings):
            print("Sentence:", sentence)
            print("Embedding:", embedding)
            print("")
            break
    else: 
        print("Embeddings already created")

In [29]:
if not CLUSTER_LEN_INTENT:
    if not CLUSTER_N:
        km_silhouette = []
        km_db = []
        n_clusters = [3, 5, 8, 10 , 12, 14]

        for i in tqdm(n_clusters):
            cluster = KMeans(n_clusters=i,          
                            random_state=42).fit(embeddings)
            
            preds = cluster.predict(embeddings) 
            db_score = davies_bouldin_score(embeddings, preds)
            km_db.append(db_score)

        plt.figure(figsize=(10,4))
        plt.scatter(x=[i for i in n_clusters], y=km_db, s=150, edgecolor='k')
        plt.xlabel("Number of clusters", fontsize=14)
        plt.ylabel("Davies Bouldin score", fontsize=15)
        plt.xticks([3, 5, 8, 10 , 12, 14], fontsize=14)
        plt.yticks(fontsize=15)

    # WE IDENTIFIED 5 GROUPS AS THE OPTIMAL NUMBER OF CLUSTERS
    CLUSTER_N = 5

In [31]:
if not os.path.exists(f"../data/processed/conala/{DATE_STR}/conala_mined_clustered.csv"):
    if not os.path.exists(f"../data/processed/conala/{DATE_STR}/"):
        os.mkdir(f"../data/processed/conala/{DATE_STR}/")
        
    if not CLUSTER_LEN_INTENT:
        cluster = KMeans(n_clusters=CLUSTER_N,          
                            random_state=RS).fit(embeddings)
            
        preds = cluster.predict(embeddings)
        dataset_df["cluster"] = preds

    # SAVE DATASET AND EMEDDINGS
    dataset_df.to_csv(f"../data/processed/conala/{DATE_STR}/conala_mined_clustered.csv", index=False)
    if not CLUSTER_LEN_INTENT:
        with open(f"../data/processed/conala/{DATE_STR}/conala_mined_embeddings.pkl", "wb") as f:
            pickle.dump(embeddings, f)