In [1]:
import os
from tqdm.autonotebook import tqdm

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import MiniBatchKMeans as KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from tqdm import tqdm

import config

  from tqdm.autonotebook import tqdm


In [2]:
class CfgTemp:
    style=["bert", "gpt", "fast_text", "glove", "tfidf", "concatenated"]
    bert_size = 768
    gpt_size = 768
    fast_text_size = 300
    glove_size = 300
    dataset_cls_num = {"google":152, "tweets":89, "stackOverflow":20}

In [3]:
def get_data_cfg(targeted_data='google'):
    data_path, label_map = None, None
    if targeted_data == 'google':
        data_path = config.googel_news
        label_map = config.goole_label_map
    elif targeted_data == 'tweets':
        data_path = config.tweets
        label_map = config.tweet_label_map
    elif targeted_data == 'stackOverflow':
        data_path = config.stack_overflow
        label_map = config.stackOverflow_label_map
    else:
        raise ValueError(
            'targeted_data must be google ,stackOverflow or tweets')
    return data_path, label_map

In [4]:
ROOT_IMAGE_DIR = os.path.join(config.ROOT_DATA, "figures/obj1_pca_kmeans")
if not os.path.exists(ROOT_IMAGE_DIR):
    os.makedirs(ROOT_IMAGE_DIR)

In [5]:
def visualize_kmeans_clustering(embeddings, target="google", alg_name="bert"):
    X = np.array(embeddings)
    n_clusters = CfgTemp.dataset_cls_num.get(target, 0)
    prep_pca = Pipeline(
        [
            ("scaler", MinMaxScaler()),
            ("pca", PCA(n_components=2, random_state=42)),
        ]
    )

    clusterer = KMeans(
        n_clusters=n_clusters,
        init="k-means++",
        n_init=50,
        max_iter=500,
        random_state=42,
    )

    # Preprocess the data
    X_scaled_pca = prep_pca.fit_transform(X)

    # Perform clustering
    cluster_labels = clusterer.fit_predict(X_scaled_pca)

    # Visualize the results
    plt.figure(figsize=(12, 8))
    scatter = plt.scatter(
        X_scaled_pca[:, 0], X_scaled_pca[:, 1], c=cluster_labels, cmap='viridis')
    plt.colorbar(scatter)
    plt.title(f'{str(alg_name).upper()}')
    # plt.xlabel('First Principal Component')
    # plt.ylabel('Second Principal Component')

    # Add centroids to the plot
    centroids = clusterer.cluster_centers_
    plt.scatter(centroids[:, 0], centroids[:, 1],
                marker='x', s=200, linewidths=3, color='r')

    plt.tight_layout()

    out_path = os.path.join(ROOT_IMAGE_DIR, f"{target}_{alg_name}.pdf")
    print(f"out put to: {out_path}")
    plt.savefig(out_path, format="pdf")

    plt.close()

In [6]:
def split_embedding(embedding):
    """
        0:768
        768:1536
        1536:1836
        1836:2136
        2136:
    """
    start = 0
    end = CfgTemp.bert_size
    bert_embedding = embedding[start:end]

    start = end
    end += CfgTemp.gpt_size
    gpt_embedding = embedding[start:end]

    start = end
    end += CfgTemp.fast_text_size
    fast_text_embedding = embedding[start:end]

    start = end
    end += CfgTemp.glove_size
    glove_embedding = embedding[start:end]
    start = end
    tfid_embedding = embedding[start:]
    return bert_embedding, gpt_embedding, fast_text_embedding, glove_embedding, tfid_embedding, embedding

In [7]:
tqdm.pandas()
for target in tqdm(["tweets", "google", "stackOverflow"]):
    print("-"*30 + target + "-"*80)
    concatenated_vector = pd.read_json(os.path.join(
        config.embed_path, f"{target}_data_embedded.json"))[:-1]["concatenated_vector"]
    # Apply concatenated embeddings to each row in the DataFrame
    print("================================")
    print(
        f"concatenated_vector type: {type(concatenated_vector)}, shape:{concatenated_vector.shape}")
    print(f"concatenated_vector sample shape: {concatenated_vector[0]}")
    print("================================")
    concatenated_vector = concatenated_vector.apply(lambda x: np.array(x))
    splitted_embeddings = concatenated_vector.apply(split_embedding)
    splitted_df = pd.DataFrame(
        splitted_embeddings.tolist(), columns=CfgTemp.style)

    print(f"df head: {splitted_df.head()}")

    # info
    print("Shapes of the first row's embeddings and plt show k-means:")
    for ite in CfgTemp.style:
        print(f"{ite} embedding shape: {len(splitted_df[ite].iloc[0])}")
        # plt_show(splitted_df[ite].tolist(), target, ite)
        visualize_kmeans_clustering(splitted_df[ite].tolist(), target, ite)

  0%|          | 0/3 [00:00<?, ?it/s]

------------------------------tweets--------------------------------------------------------------------------------
concatenated_vector type: <class 'pandas.core.series.Series'>, shape:(2471,)
concatenated_vector sample shape: [0.1389111727, -0.0464272462, -0.3343537748, -0.2641819715, 0.4239065051, 0.0034178435, 0.3417643011, 0.6236883402, -0.2391759604, 0.0044607231, -0.12250669300000001, -0.3079218566, 0.0610172451, 0.1081449538, -0.3827938139, 0.1551609784, 0.1271389723, 0.0434453562, -0.0924241766, 0.0841459185, 0.21584741770000002, -0.0674438998, -0.20835599300000002, 0.37389907240000003, 0.1803355962, -0.4375417233, -0.4782920778, 0.1247312799, -0.0394260548, -0.3387620747, -0.044955104600000004, 0.19241561000000001, -0.21649894120000002, 0.0250327699, -0.077484332, 0.3139640689, -0.0507458225, -0.1180468947, -0.1414526254, 0.2135856599, -0.7438241243, -0.2580295801, -0.2499556839, 0.08087079230000001, -0.1082389504, 0.24464674290000002, 0.015952915, 0.09311507640000001, -0.081

 33%|███▎      | 1/3 [00:21<00:42, 21.41s/it]

out put to: /Users/zhouyf/Documents/data/majid/drive/MyDrive/project2/data/figures/obj1_pca_kmeans/tweets_concatenated.pdf
------------------------------google--------------------------------------------------------------------------------
concatenated_vector type: <class 'pandas.core.series.Series'>, shape:(11107,)
concatenated_vector sample shape: [-0.0950101465, -0.3668429852, 0.4380231202, 0.1539948136, 0.17293414470000001, -0.0355908126, 0.2930252552, 0.4269005954, -0.056787394000000005, -0.11701562260000001, 0.0071709082, -0.282694459, 0.24094462390000002, 0.33266827460000004, -0.2018690705, 0.2617349923, -0.2895553112, 0.018894048400000002, 0.3082212508, 0.21819503610000002, -0.07115974280000001, -0.2898958921, -0.19528353210000002, 0.3694991171, -0.0425080955, -0.1595191061, 0.1656820625, -0.08458998050000001, 0.1200369149, -0.078764528, -0.0667308569, 0.20687475800000002, -0.2174566984, -0.18105182050000002, 0.34494102, -0.0475732721, 0.07365287100000001, 0.0416938066, -0.0614

 67%|██████▋   | 2/3 [00:58<00:30, 30.44s/it]

------------------------------stackOverflow--------------------------------------------------------------------------------
concatenated_vector type: <class 'pandas.core.series.Series'>, shape:(16406,)
concatenated_vector sample shape: [-0.17703057830000002, -0.32281768320000004, 0.4629489183, -0.0526342057, 0.5023549199, -0.11806390430000001, -0.13967721160000002, -0.1182534471, -0.1918967813, -0.0308347512, -0.27709561590000004, -0.31723868850000003, -0.248184815, 0.2577259839, -0.1834786385, 0.1765830666, -0.0541802347, 0.2600778341, -0.0896128789, -0.14116594200000002, 0.3825683892, -0.2866398394, -0.48310747740000004, -0.18900005520000002, 0.7210383415, -0.2432029694, -0.38335669040000003, -0.0215340126, -0.561209321, -0.0341496207, -0.1091443524, 0.18943859640000002, -0.1729008406, -0.0103511708, -0.18474642930000001, 0.3034331799, -0.12757097180000002, 0.5237317085000001, -0.3467777669, 0.016592348, -0.2565497458, -0.436465174, 0.1265654117, 0.14433039720000002, -0.0672414601, -

100%|██████████| 3/3 [01:26<00:00, 28.69s/it]
