In [1]:
import os
import numpy as np

import time
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import davies_bouldin_score

from cuml import IncrementalPCA as cumlPCA
from cuml import TSNE as cumlTSNE


import joblib
import pandas as pd

import json

import plotly
import plotly.express as px
import plotly.graph_objects as go
import tqdm

# reset GPU
import subprocess
from numba import cuda
import threading


In [2]:
# Location of the saved .npy array that is loaded with np.load in the following cell.
# NOTE(review): absolute, machine-specific path — consider deriving it from a configurable data directory.
data_path = "/mnt/f/cluster_analysis/kobert_test.npy"

In [3]:
# Read the .npy file at data_path into a NumPy array.
data = np.load(data_path)

In [4]:
# Sanity check: dimensions of the loaded array as (rows, columns).
data.shape

(584, 294912)

In [5]:
# Preview the loaded values; NumPy's repr elides the middle of a large array.
data

array([[-0.00673927, -0.27696738,  0.18406919, ..., -0.26043224,
        -0.51055956,  0.2976166 ],
       [-0.30848643,  0.0253956 ,  0.2490128 , ..., -0.39589956,
        -0.61496603,  0.32088938],
       [-0.1203639 , -0.17493483,  0.49172568, ..., -0.22080015,
        -0.43350857,  0.31536001],
       ...,
       [-0.11639293, -0.11735741,  0.19944194, ..., -0.23721074,
        -0.61880887,  0.16988194],
       [-0.06783877, -0.13779117, -0.04867897, ..., -0.28151396,
        -0.51182514,  0.69420105],
       [ 0.05839941, -0.33118621,  0.0842925 , ..., -0.10381245,
        -0.35972151,  0.18938683]])

In [6]:
# Learn the RobustScaler statistics from the raw array, then apply the fitted
# scaler to that same array to obtain the scaled copy used downstream.
scaler = RobustScaler().fit(data)
scaled_data = scaler.transform(data)

In [7]:
# Two-stage dimensionality reduction:
#   1) IncrementalPCA projects scaled_data onto 25 components (batch_size=32);
#   2) exact t-SNE embeds the PCA output into 2 dimensions.
# t-SNE is stochastic, so a fixed random_state is passed to make repeated runs
# start from the same seed.
# NOTE(review): cuML documents its t-SNE as not completely deterministic even with
# a seed — confirm how closely repeated embeddings agree.
TSNE_RANDOM_STATE = 42

pca_gpu = cumlPCA(n_components=25, batch_size=32)
pca_processed = pca_gpu.fit_transform(scaled_data)

cumltsne_final = cumlTSNE(
    n_components=2,
    perplexity=5,
    n_neighbors=50,
    method="exact",
    random_state=TSNE_RANDOM_STATE,
)
cuml_processed = cumltsne_final.fit_transform(pca_processed)

# Last expression of the cell: display the KL divergence of the fitted embedding.
cumltsne_final.kl_divergence_

0.25436070561408997

In [8]:
# Show the KL divergence reported by the fitted t-SNE model.
cumltsne_final.kl_divergence_

0.25436070561408997

In [9]:
# Confirm the embedding's dimensions as (rows, embedding dimensions).
cuml_processed.shape

(584, 2)

In [10]:
# Wrap the 2-D embedding in a DataFrame: one named column per axis and an
# explicit 0..n-1 integer index, then display it as the cell output.
column_axis = ['x', 'y']
df_manifold = pd.DataFrame(
    data=cuml_processed,
    index=np.arange(len(cuml_processed)),
    columns=column_axis,
)
df_manifold

Unnamed: 0,x,y
0,-35.619770,58.024300
1,-43.282856,6.999091
2,-12.151334,18.501585
3,-54.887043,4.461901
4,34.845520,12.508245
...,...,...
579,37.783928,3.886666
580,-37.135948,-50.223747
581,-36.071941,-44.177242
582,7.665291,10.419816


In [None]:
# Create the output directory (and any missing parents). exist_ok=True makes
# re-running this cell harmless when the directory already exists, while genuine
# failures (e.g. permissions, bad mount) still raise instead of being swallowed.
os.makedirs("/mnt/f/cluster_analysis/kobert/test/", exist_ok=True)

In [11]:
def plot_dimension_reduction(
    manifold_processed_DataFrame,
    output_dir='/mnt/f/cluster_analysis/kobert/test/',
    title='KoBERT, Korean Food Training Dataset, Hidden State Output in 2D',
    basename='dtest_manifold',
):
    """Scatter-plot a 2-D embedding and save it as WebP, PNG and standalone HTML.

    Parameters
    ----------
    manifold_processed_DataFrame
        Object indexable by ``'x'`` and ``'y'`` (e.g. a pandas DataFrame) giving
        the point coordinates.
    output_dir : str, optional
        Directory that receives the three output files. It is created if it
        does not exist. Defaults to the location the notebook originally used.
    title : str, optional
        Figure title. Defaults to the title the notebook originally used.
        NOTE(review): the default title says "Training Dataset" while the default
        paths and the input file name refer to "test" — confirm which is intended.
    basename : str, optional
        File name stem; ``.webp``, ``.png`` and ``.html`` are appended to it.

    Returns
    -------
    None
        The function is called for its side effect of writing files.
    """
    # Make sure the target directory exists so the writes below cannot fail
    # merely because a separate setup cell was not executed.
    os.makedirs(output_dir, exist_ok=True)

    fig = go.Figure()
    fig.add_trace(
        go.Scatter(
            x=manifold_processed_DataFrame['x'],
            y=manifold_processed_DataFrame['y'],
            mode='markers',
            marker=dict(
                color='LightSkyBlue',
                size=12,
                line=dict(color='MediumPurple', width=2),
            ),
            name='reals',
        )
    )

    fig.update_layout(autosize=True, width=900, height=900)
    fig.update_layout(title=title)

    # All three artefacts share one stem and differ only by extension.
    base_path = os.path.join(output_dir, basename)
    fig.write_image(base_path + '.webp')
    fig.write_image(base_path + '.png')
    # auto_open=False: write the HTML file without launching a browser.
    plotly.offline.plot(fig, filename=base_path + '.html', auto_open=False)

In [12]:
# Plot the 2-D embedding and write the image and HTML files to disk.
plot_dimension_reduction(df_manifold)

In [13]:
# Persist the 2-D coordinates as CSV text, omitting the index column.
# NOTE(review): the target file name has no ".csv" extension, unlike the sibling image/HTML outputs — confirm this is intentional.
df_manifold.to_csv('/mnt/f/cluster_analysis/kobert/test/dtest_manifold', index=False)