In [None]:
import pandas as pd
import sys
import umap
import pickle
import importlib
import os
import math
from sentence_transformers import SentenceTransformer

sys.path.append("../")

import src

importlib.reload(src)

from src.data_prep_utils import (  # noqa: E402
    load_time_sorted_conala,
    time_batches
)

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.mixture import GaussianMixture

from sentence_transformers import SentenceTransformer
import umap
import matplotlib.pyplot as plt
import re
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer


# Date stamp used as the sub-folder name under ../data/processed/conala/ for every output file.
DATE_STR = "20240327"
# Number of KMeans clusters (topics); a falsy value enables the cluster-count search cell.
CLUSTER_N = 10
# Number of batches after the initial training batch (drift batch ids run 1..N_BATCHES).
N_BATCHES = 65
# Number of rows assigned to the initial training batch (batch 0) in the drift scenarios.
TRAIN_SIZE = 379
# Random seed passed to DataFrame.sample and to the final KMeans fit.
RS = 42

In [None]:
# Load the time-sorted CoNaLa data; rows without a rewritten intent fall back
# to their original intent. The index is kept as an explicit `idx` column.
dataset = load_time_sorted_conala("../data/raw/conala")
missing_rewrite = dataset["rewritten_intent"].isna()
dataset.loc[missing_rewrite, "rewritten_intent"] = dataset.loc[missing_rewrite, "intent"]
dataset["idx"] = dataset.index

if os.path.exists(f"../data/processed/conala/{DATE_STR}/temporal/"):
    # NOTE(review): time_batches is not called on this path, so `dataset` only
    # carries a `time_batch` column if the loader already provides one — confirm
    # that downstream cells do not rely on it after a cached run.
    print("Temporal batches already created")
    first_batch = pd.read_csv(f"../data/processed/conala/{DATE_STR}/temporal/conala_batch_1.csv")
    batch_size = first_batch.shape[0]
    print(batch_size)
else:
    # One output folder per drift scenario.
    for drift_t in ("temporal", "recurring", "gradual", "sudden"):
        drift_dir = f"../data/processed/conala/{DATE_STR}/{drift_t}"
        if not os.path.exists(drift_dir):
            os.makedirs(drift_dir)

    # TEMPORAL DRIFT (as it is observed in the data)
    dataset, batch_size = time_batches(dataset, TRAIN_SIZE, N_BATCHES)

    # Write every temporal batch to its own CSV, then the full annotated frame.
    for batch in dataset["time_batch"].unique():
        print(f"Batch {batch}")

        in_batch = dataset["time_batch"] == batch
        batch_df = dataset[in_batch]
        batch_df.to_csv(f"../data/processed/conala/{DATE_STR}/temporal/conala_batch_{batch}.csv", index=False)

    dataset.to_csv(f"../data/processed/conala/{DATE_STR}/temporal/full.csv", index=False)


In [None]:
if not os.path.exists(f"../data/processed/conala/{DATE_STR}/conala_clustered.csv"):
    # TOPIC MODELING
    # Topics are derived from the semantic meaning of the (rewritten) intent text.
    model = SentenceTransformer("all-MiniLM-L6-v2")

    # Sentences to encode, one per dataset row.
    sentences = dataset.rewritten_intent.values

    # model.encode() returns one embedding vector per sentence.
    embeddings = model.encode(sentences)

    # Report a compact summary only: printing every sentence together with its
    # full embedding vector floods the notebook output and bloats the file.
    print(f"Encoded {len(sentences)} sentences; embeddings shape: {embeddings.shape}")
else:
    # The clustered CSV already exists, so encoding is skipped here; the
    # embeddings are read back from conala_embeddings.pkl further down.
    print("Embeddings already created")

In [None]:
# Optional model-selection step: runs only when CLUSTER_N is falsy.
# Fits KMeans for k = 3..49 and scores each fit so a cluster count can be chosen.
# NOTE(review): requires `embeddings` to be in scope; when the clustered CSV
# already exists the encoding cell does not create it — confirm before enabling.
if not CLUSTER_N:
    km_silhouette = []
    km_db = []
    n_clusters = list(range(3, 50))

    for k in n_clusters:
        # Use the shared seed constant so this search matches the final fit.
        cluster = KMeans(n_clusters=k, random_state=RS).fit(embeddings)

        preds = cluster.predict(embeddings)

        s_score = silhouette_score(embeddings, preds)
        db_score = davies_bouldin_score(embeddings, preds)
        km_silhouette.append(s_score)
        km_db.append(db_score)

    # Silhouette score per candidate cluster count (Davies-Bouldin scores are
    # collected in km_db but not plotted).
    plt.figure(figsize=(10,4))
    plt.scatter(x=n_clusters, y=km_silhouette, s=150, edgecolor='k')
    plt.xlabel("Number of clusters", fontsize=14)
    plt.ylabel("Silhouette score", fontsize=15)
    plt.xticks([10, 20, 30, 40, 50], fontsize=14)
    plt.yticks(fontsize=15)

# WE IDENTIFIED 10 GROUPS AS THE OPTIMAL NUMBER OF CLUSTERS

In [None]:
if not os.path.exists(f"../data/processed/conala/{DATE_STR}/conala_clustered.csv"):
    # Fit the final KMeans model and label every row with its cluster id.
    cluster = KMeans(n_clusters=CLUSTER_N, random_state=RS)
    cluster.fit(embeddings)

    preds = cluster.predict(embeddings)
    dataset["cluster"] = preds

    # SAVE DATASET AND EMBEDDINGS
    dataset.to_csv(f"../data/processed/conala/{DATE_STR}/conala_clustered.csv", index=False)
    with open(f"../data/processed/conala/{DATE_STR}/conala_embeddings.pkl", "wb") as f:
        pickle.dump(embeddings, f)

    # Disabled diagnostic: histogram of time_batch for each cluster.
    if False:
        for cluster_id in sorted(dataset.cluster.unique()):
            cluster_rows = dataset[dataset.cluster == cluster_id]
            plt.figure(figsize=(10,4))
            sns.histplot(data=cluster_rows, x="time_batch")
            plt.title(f"Cluster {cluster_id}")
            plt.show()

0 = string interactions <br>
1 = dictionary operations <br>
2 = date operations <br>
3 = pandas and df operations <br>
4 = os, http, process operations <br>
5 = working with numbers, casting, precisions <br>
6 = numpy interactions <br>
7 = list, zip, range interactions, sorting etc <br>
8 = string encodings <br>
9 = working with files <br>

# Different Drift Types

In [None]:
# Reload the clustered dataset and its embeddings from disk so the drift cells
# below start from the persisted state.
clustered_csv = f"../data/processed/conala/{DATE_STR}/conala_clustered.csv"
embeddings_pkl = f"../data/processed/conala/{DATE_STR}/conala_embeddings.pkl"

dataset = pd.read_csv(clustered_csv)
# pickle.load executes arbitrary code from the file: only load a pickle this notebook produced.
with open(embeddings_pkl, "rb") as fh:
    embeddings = pickle.load(fh)

In [None]:
# Relative frequency of every cluster except 4, 7 and 9, ordered from the
# rarest to the most common cluster.
held_out = dataset["cluster"].isin((4, 7, 9))
cluster_percentages = (
    dataset.loc[~held_out, "cluster"]
    .value_counts(normalize=True)
    .sort_values(ascending=True)
)
cluster_percentages_dict = cluster_percentages.to_dict()

In [None]:
# GRADUAL DRIFT
# Seven of the ten topics are present in the initial training batch; the other
# three (clusters 4, 7 and 9) are introduced gradually in later batches.
gradual_to_sample = {}
sampled = 0
dataset["gradual_batch"] = -1  # -1 marks rows not yet assigned to a batch

# NOTE(review): per-cluster quotas are scaled by batch_size while the total is
# TRAIN_SIZE, so the last (largest-share) cluster absorbs the whole remainder —
# confirm this is intended rather than scaling by TRAIN_SIZE.
last_position = len(cluster_percentages.index) - 1
for position, cluster_id in enumerate(cluster_percentages.index):
    print(cluster_id)

    if position == last_position:
        # The final cluster fills whatever is left of the training batch.
        to_sample = TRAIN_SIZE - sampled
    else:
        to_sample = math.ceil(cluster_percentages_dict[cluster_id] * batch_size)
    print(to_sample)

    cluster_rows = dataset[dataset["cluster"] == cluster_id]
    sampled_idx = cluster_rows.sample(n=to_sample, random_state=RS).index
    dataset.loc[sampled_idx, "gradual_batch"] = 0
    sampled += to_sample

print(sampled, " ", TRAIN_SIZE)

# Flag rows that belong to the gradually introduced topics.
is_new_topic = dataset["cluster"].isin((4, 7, 9))
dataset["gradual_new"] = 0
dataset.loc[is_new_topic, "gradual_new"] = 1

In [None]:
# Fraction of each batch drawn from the new topics while they are phased in.
topic_per_dict = {
    11: 0.1,
    12: 0.2,
    13: 0.3,
    14: 0.4,
    15: 0.5,
}

for batch_id in range(1, N_BATCHES + 1):
    # Masks over rows still unassigned at the start of this batch; the old- and
    # new-topic pools are disjoint, so assigning one never changes the other.
    unassigned = dataset["gradual_batch"] == -1
    old_pool = (dataset["gradual_new"] == 0) & unassigned
    new_pool = (dataset["gradual_new"] == 1) & unassigned

    if batch_id < 11:
        # Before the drift starts: old topics only.
        sampled_idx = dataset[old_pool].sample(n=batch_size, random_state=RS).index
        dataset.loc[sampled_idx, "gradual_batch"] = batch_id

    elif batch_id in topic_per_dict:
        # Ramp-up phase: a growing share of the batch comes from the new topics.
        to_sample_new = math.ceil(batch_size * topic_per_dict[batch_id])
        to_sample_old = batch_size - to_sample_new
        sampled_old_idx = dataset[old_pool].sample(n=to_sample_old, random_state=RS).index
        dataset.loc[sampled_old_idx, "gradual_batch"] = batch_id
        sampled_new_idx = dataset[new_pool].sample(n=to_sample_new, random_state=RS).index
        dataset.loc[sampled_new_idx, "gradual_batch"] = batch_id
    elif batch_id == N_BATCHES:
        # Last batch: takes every row that is still unassigned.
        sampled_idx = dataset[unassigned].index
        dataset.loc[sampled_idx, "gradual_batch"] = batch_id
    else:
        # After the ramp-up: sample from all remaining rows regardless of topic.
        sampled_idx = dataset[unassigned].sample(n=batch_size, random_state=RS).index
        dataset.loc[sampled_idx, "gradual_batch"] = batch_id

In [None]:
# Write one CSV per gradual-drift batch.
for batch in dataset["gradual_batch"].unique():
    print(f"Batch {batch}")

    in_batch = dataset["gradual_batch"] == batch
    batch_df = dataset.loc[in_batch]
    batch_df.to_csv(f"../data/processed/conala/{DATE_STR}/gradual/conala_batch_{batch}.csv", index=False)

In [None]:
# RECURRING DRIFT
# Nine of the ten topics are present in the initial training batch; the
# remaining topic (cluster 4) shows up only in three separate windows of batches.

is_recurring_topic = dataset["cluster"].isin([4])
cluster_percentages = (
    dataset.loc[~is_recurring_topic, "cluster"]
    .value_counts(normalize=True)
    .sort_values(ascending=True)
)
cluster_percentages_dict = cluster_percentages.to_dict()

sampled = 0
dataset["recurring_batch"] = -1  # -1 marks rows not yet assigned to a batch

# NOTE(review): per-cluster quotas are scaled by batch_size while the total is
# TRAIN_SIZE, so the last (largest-share) cluster absorbs the whole remainder —
# confirm this is intended rather than scaling by TRAIN_SIZE.
last_position = len(cluster_percentages.index) - 1
for position, cluster_id in enumerate(cluster_percentages.index):
    if position == last_position:
        to_sample = TRAIN_SIZE - sampled
    else:
        to_sample = math.ceil(cluster_percentages_dict[cluster_id] * batch_size)

    cluster_rows = dataset[dataset["cluster"] == cluster_id]
    sampled_idx = cluster_rows.sample(n=to_sample, random_state=RS).index
    dataset.loc[sampled_idx, "recurring_batch"] = 0
    sampled += to_sample

print(sampled, " ", TRAIN_SIZE)

# Flag rows that belong to the recurring topic.
dataset["rec_new"] = 0
dataset.loc[dataset["cluster"].isin([4]), "rec_new"] = 1

# Number of full-size batches the recurring topic could fill (not referenced
# again in the visible cells).
new_topic_batch = math.ceil(dataset[dataset["rec_new"] == 1].shape[0] / batch_size)

# Batches in which the recurring topic appears: three contiguous windows.
recurrent_batch_ids = [10, 11, 12, 25, 26, 27, 28, 45, 46, 47, 48]
max_batch = max(recurrent_batch_ids)


for batch_id in range(1, N_BATCHES + 1):
    # Masks over rows still unassigned at the start of this batch; the two
    # pools are disjoint, so assigning one never changes the other.
    unassigned = dataset["recurring_batch"] == -1
    new_pool = (dataset["rec_new"] == 1) & unassigned
    old_pool = (dataset["rec_new"] == 0) & unassigned

    if batch_id == max_batch:
        # Final recurring batch: use up the rest of the recurring topic and
        # top the batch up with old-topic rows.
        sampled_new_idx = dataset[new_pool].index
        dataset.loc[sampled_new_idx, "recurring_batch"] = batch_id
        to_sample_old = batch_size - len(sampled_new_idx)
        sampled_old_idx = dataset[old_pool].sample(n=to_sample_old, random_state=RS).index
        dataset.loc[sampled_old_idx, "recurring_batch"] = batch_id
    elif batch_id in recurrent_batch_ids:
        # Recurring window: the whole batch comes from the recurring topic.
        sampled_new_idx = dataset[new_pool].sample(n=batch_size, random_state=RS).index
        dataset.loc[sampled_new_idx, "recurring_batch"] = batch_id
    elif batch_id == N_BATCHES:
        # Last batch: takes every row that is still unassigned.
        sampled_idx = dataset[unassigned].index
        dataset.loc[sampled_idx, "recurring_batch"] = batch_id
    else:
        # Regular batch: old topics only.
        sampled_idx = dataset[old_pool].sample(n=batch_size, random_state=RS).index
        dataset.loc[sampled_idx, "recurring_batch"] = batch_id

In [None]:
# Write one CSV per recurring-drift batch.
for batch in dataset["recurring_batch"].unique():
    print(f"Batch {batch}")

    in_batch = dataset["recurring_batch"] == batch
    batch_df = dataset.loc[in_batch]
    batch_df.to_csv(f"../data/processed/conala/{DATE_STR}/recurring/conala_batch_{batch}.csv", index=False)

In [None]:
# SUDDEN TOPIC DRIFT
# Nine topics are present in the initial training batch; one topic (cluster 2)
# is introduced suddenly, starting at batch 10.

is_sudden_topic = dataset["cluster"].isin([2])
cluster_percentages = (
    dataset.loc[~is_sudden_topic, "cluster"]
    .value_counts(normalize=True)
    .sort_values(ascending=True)
)
cluster_percentages_dict = cluster_percentages.to_dict()

sampled = 0
dataset["sudden_batch"] = -1  # -1 marks rows not yet assigned to a batch

# NOTE(review): per-cluster quotas are scaled by batch_size while the total is
# TRAIN_SIZE, so the last (largest-share) cluster absorbs the whole remainder —
# confirm this is intended rather than scaling by TRAIN_SIZE.
last_position = len(cluster_percentages.index) - 1
for position, cluster_id in enumerate(cluster_percentages.index):
    if position == last_position:
        to_sample = TRAIN_SIZE - sampled
    else:
        to_sample = math.ceil(cluster_percentages_dict[cluster_id] * batch_size)

    cluster_rows = dataset[dataset["cluster"] == cluster_id]
    sampled_idx = cluster_rows.sample(n=to_sample, random_state=RS).index
    dataset.loc[sampled_idx, "sudden_batch"] = 0
    sampled += to_sample

print(sampled, " ", TRAIN_SIZE)

# Flag rows that belong to the suddenly introduced topic.
dataset["sudden_new"] = 0
dataset.loc[dataset["cluster"].isin([2]), "sudden_new"] = 1

# Number of full-size batches the new topic could fill (not referenced again
# in the visible cells).
new_topic_batch = math.ceil(dataset[dataset["sudden_new"] == 1].shape[0] / batch_size)

# Batches that carry the new topic.
sudden_batch_ids = [10, 11, 12]
max_batch = max(sudden_batch_ids)


for batch_id in range(1, N_BATCHES + 1):
    # Masks over rows still unassigned at the start of this batch; the two
    # pools are disjoint, so assigning one never changes the other.
    unassigned = dataset["sudden_batch"] == -1
    new_pool = (dataset["sudden_new"] == 1) & unassigned
    old_pool = (dataset["sudden_new"] == 0) & unassigned

    if batch_id == max_batch:
        # Final new-topic batch: use up the rest of the new topic and top the
        # batch up with old-topic rows.
        sampled_new_idx = dataset[new_pool].index
        dataset.loc[sampled_new_idx, "sudden_batch"] = batch_id
        to_sample_old = batch_size - len(sampled_new_idx)
        sampled_old_idx = dataset[old_pool].sample(n=to_sample_old, random_state=RS).index
        dataset.loc[sampled_old_idx, "sudden_batch"] = batch_id
    elif batch_id in sudden_batch_ids:
        # The whole batch comes from the new topic.
        sampled_new_idx = dataset[new_pool].sample(n=batch_size, random_state=RS).index
        dataset.loc[sampled_new_idx, "sudden_batch"] = batch_id
    elif batch_id == N_BATCHES:
        # Last batch: takes every row that is still unassigned.
        sampled_idx = dataset[unassigned].index
        dataset.loc[sampled_idx, "sudden_batch"] = batch_id
    else:
        # Regular batch: old topics only.
        sampled_idx = dataset[old_pool].sample(n=batch_size, random_state=RS).index
        dataset.loc[sampled_idx, "sudden_batch"] = batch_id

In [None]:
# Write one CSV per sudden-drift batch.
for batch in dataset["sudden_batch"].unique():
    print(f"Batch {batch}")

    in_batch = dataset["sudden_batch"] == batch
    batch_df = dataset.loc[in_batch]
    batch_df.to_csv(f"../data/processed/conala/{DATE_STR}/sudden/conala_batch_{batch}.csv", index=False)

In [None]:
# Persist the full frame with the batch-assignment columns of every drift scenario.
all_drifts_csv = f"../data/processed/conala/{DATE_STR}/all_drifts.csv"
dataset.to_csv(all_drifts_csv, index=False)