In [1]:
import os
import numpy as np

import time
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import davies_bouldin_score

from cuml import IncrementalPCA as cumlPCA
from cuml import TSNE as cumlTSNE


import joblib
import pandas as pd

import json

import plotly
import plotly.express as px
import plotly.graph_objects as go
import tqdm

# reset GPU
import subprocess
from numba import cuda
import threading


In [2]:
# Location of the saved feature matrix (.npy) that this notebook analyses.
data_path = "/mnt/f/cluster_analysis/distilbert-test.npy"

In [3]:
# Read the whole array from disk into memory as a NumPy ndarray.
data = np.load(data_path)

In [4]:
# Sanity check of the loaded array's dimensions (rows, columns).
data.shape

(584, 294912)

In [5]:
# Peek at the raw values; NumPy abbreviates the repr of large arrays.
data

array([[-1.03429747,  0.75285625, -0.12534927, ...,  0.69523805,
        -1.36313701, -0.57561493],
       [-1.03681886,  0.76581448, -0.11867484, ...,  0.68454576,
        -1.3793354 , -0.55683678],
       [-1.04473364,  0.75014418, -0.11938943, ...,  0.68804699,
        -1.37801635, -0.55951768],
       ...,
       [-1.05106902,  0.75740474, -0.11918234, ...,  0.68745422,
        -1.37183642, -0.5624885 ],
       [-1.03620577,  0.75632888, -0.12579644, ...,  0.6876002 ,
        -1.37288237, -0.56787789],
       [-1.03971851,  0.75834543, -0.11320455, ...,  0.69876885,
        -1.37027085, -0.5637297 ]])

In [6]:
# Fit a RobustScaler on the raw matrix and apply it in one step.
# The fitted `scaler` object is kept so the same transform can be reused.
scaler = RobustScaler()
scaled_data = scaler.fit_transform(data)

In [25]:
# Fixed seed so the stochastic t-SNE embedding (and the KL divergence shown
# below) can be reproduced after a kernel restart.
TSNE_SEED = 42

# Stage 1: reduce the scaled features to 25 principal components on the GPU.
# batch_size=32 is the mini-batch size handed to the incremental PCA fit.
pca_gpu = cumlPCA(n_components=25, batch_size=32)
pca_processed = pca_gpu.fit_transform(scaled_data)

# Stage 2: embed the 25-D PCA output into 2-D with exact (non-approximated) t-SNE.
cumltsne_final = cumlTSNE(
    n_components=2,
    perplexity=5,
    n_neighbors=50,
    method="exact",
    random_state=TSNE_SEED,
)
cuml_processed = cumltsne_final.fit_transform(pca_processed)

# Display the KL divergence reported by the fitted t-SNE model.
cumltsne_final.kl_divergence_

0.3680535554885864

In [26]:
# Re-display the fitted model's KL divergence (same attribute the previous cell ends with).
cumltsne_final.kl_divergence_

0.3680535554885864

In [27]:
# Check the embedding's dimensions: one row per input sample, two columns.
cuml_processed.shape

(584, 2)

In [28]:
# Wrap the 2-D embedding in a DataFrame with named axes and a 0..n-1 integer index.
column_axis = ['x', 'y']
df_manifold = pd.DataFrame(
    data=cuml_processed,
    index=np.arange(len(cuml_processed)),
    columns=column_axis,
)
df_manifold

Unnamed: 0,x,y
0,-41.682705,43.882267
1,35.175560,19.549891
2,51.287228,-53.525631
3,-26.423721,-63.671886
4,-37.171738,-56.850910
...,...,...
579,-30.582468,-63.839230
580,41.885387,25.668541
581,-13.974753,5.951581
582,-80.353561,-0.445030


In [29]:
# Create the output directory (including parents). exist_ok=True makes
# re-running this cell a no-op when the directory is already there, while
# genuine failures (e.g. permission errors) are raised instead of being
# silently swallowed by a bare `except`.
os.makedirs("/mnt/f/cluster_analysis/distilBERT/test/", exist_ok=True)

In [30]:
def plot_dimension_reduction(
    manifold_processed_DataFrame,
    output_dir='/mnt/f/cluster_analysis/distilBERT/test/',
    basename='dtest_manifold',
    title='DistilBERT, Korean Food Test Dataset, Hidden State Output in 2D',
):
    """Scatter-plot a 2-D embedding and save it as .webp, .png and .html.

    Parameters
    ----------
    manifold_processed_DataFrame : pandas.DataFrame
        Frame with an ``'x'`` and a ``'y'`` column holding the 2-D coordinates.
    output_dir : str, optional
        Directory the three files are written to; created if missing.
        Defaults to the location this notebook has always used.
    basename : str, optional
        File name without extension shared by the three outputs.
    title : str, optional
        Title shown at the top of the figure.

    Returns
    -------
    None
        The function is called for its side effect of writing files.
    """
    # Make the function usable on its own: do not rely on an earlier cell
    # having created the directory.
    os.makedirs(output_dir, exist_ok=True)
    stem = os.path.join(output_dir, basename)

    fig = go.Figure()

    fig.add_trace(
        go.Scatter(
            x=manifold_processed_DataFrame['x'],
            y=manifold_processed_DataFrame['y'],
            mode='markers',
            marker=dict(
                color='LightSkyBlue',
                size=12,
                line=dict(color='MediumPurple', width=2),
            ),
            name='reals',
        )
    )

    fig.update_layout(autosize=True, width=900, height=900)
    fig.update_layout(title=title)

    # Static images for reports, plus a standalone interactive HTML page.
    fig.write_image(stem + '.webp')
    fig.write_image(stem + '.png')
    plotly.offline.plot(fig, filename=stem + '.html', auto_open=False)


In [31]:
# Build the scatter plot of the embedding and write the .webp/.png/.html files.
plot_dimension_reduction(df_manifold)

In [32]:
# Persist the 2-D coordinates; index=False leaves the row-number index out of the file.
df_manifold.to_csv('/mnt/f/cluster_analysis/distilBERT/test/dtest_manifold.csv', index=False)