In [1]:
import os
import numpy as np

import time
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import davies_bouldin_score

from cuml import IncrementalPCA as cumlPCA
from cuml import TSNE as cumlTSNE


import joblib
import pandas as pd

import json

import plotly
import plotly.express as px
import plotly.graph_objects as go
import tqdm

# reset GPU
import subprocess
from numba import cuda
import threading


In [2]:
# Location of the .npy file that the next cell loads with np.load.
# NOTE(review): this is an absolute, machine-specific path; consider deriving it
# from a configurable data directory so the notebook runs on other machines.
data_path = "/mnt/f/cluster_analysis/kobert_train.npy"

In [3]:
# Read the entire array stored at data_path into memory as a NumPy ndarray.
data = np.load(data_path)

In [4]:
# Show the dimensions of the loaded array.
data.shape

(2337, 294912)

In [5]:
# Preview the raw values; NumPy abbreviates the repr of large arrays with "...".
data

array([[ 0.12607618, -0.2177927 , -0.02630411, ..., -0.18577793,
        -0.65449542,  0.20252991],
       [-0.19878492,  0.1088952 ,  0.16065916, ..., -0.27704975,
        -0.90432692,  0.07277443],
       [ 0.03491082,  0.04019241,  0.25021282, ..., -0.28724536,
        -0.81275618,  0.34508607],
       ...,
       [ 0.09750635, -0.19071175,  0.05225878, ..., -0.16661298,
        -0.55150944, -0.16517413],
       [ 0.03690892, -0.36995983,  0.06538674, ...,  0.02063052,
        -0.6465764 ,  0.2242703 ],
       [ 0.03537651, -0.25579143,  0.07226921, ...,  0.01541212,
        -0.54696864, -0.17482601]])

In [6]:
# Rescale the features with a RobustScaler left at its default settings.
# The scaler is fitted on `data` and then applied to that same array.
scaler = RobustScaler()
scaled_data = scaler.fit(data).transform(data)

In [7]:
# Seed for the stochastic t-SNE step so that repeated runs are comparable.
# NOTE: cuML documents that its parallel t-SNE is not bit-for-bit deterministic
# even with a fixed seed, so expect small run-to-run differences.
RANDOM_STATE = 42

# Stage 1: compress the scaled features to 25 principal components with
# cuML's IncrementalPCA, fitted in batches of 32 rows.
pca_gpu = cumlPCA(n_components=25, batch_size=32)
pca_processed = pca_gpu.fit_transform(scaled_data)

# Stage 2: embed the 25-D PCA output into 2-D with exact t-SNE.
cumltsne_final = cumlTSNE(
    n_components=2,
    perplexity=5,
    n_neighbors=50,
    method="exact",
    random_state=RANDOM_STATE,
)
cuml_processed = cumltsne_final.fit_transform(pca_processed)

# Display the KL divergence reported by the fitted t-SNE model.
cumltsne_final.kl_divergence_

0.17721471190452576

In [8]:
# Display the KL divergence of the fitted t-SNE model (the previous cell's
# final expression shows the same attribute).
cumltsne_final.kl_divergence_

0.17721471190452576

In [9]:
# Show the dimensions of the t-SNE output (rows, embedding components).
cuml_processed.shape

(2337, 2)

In [10]:
# Wrap the 2-D embedding in a DataFrame with named axes and an explicit
# 0..n-1 integer index, then display it.
column_axis = ['x', 'y']
n_points = cuml_processed.shape[0]
df_manifold = pd.DataFrame(
    data=cuml_processed,
    index=np.arange(n_points),
    columns=column_axis,
)
df_manifold

Unnamed: 0,x,y
0,-91.874916,8.868083
1,15.114115,94.223305
2,23.729103,-46.067886
3,71.267189,-117.083344
4,-42.724987,142.073807
...,...,...
2332,10.695359,119.545364
2333,-8.111209,125.669495
2334,-15.123107,43.457623
2335,-114.006973,-17.471989


In [11]:
# Make sure the output directory exists. exist_ok=True tolerates a directory
# that is already there, while real failures (permissions, missing mount,
# a file occupying the path) still raise instead of being silently swallowed.
os.makedirs("/mnt/f/cluster_analysis/kobert/train/", exist_ok=True)

In [12]:
def plot_dimension_reduction(
    manifold_processed_DataFrame: "pd.DataFrame",
    output_dir: str = '/mnt/f/cluster_analysis/kobert/train/',
    title: str = 'KoBERT, Korean Food Training Dataset, Hidden State Output in 2D',
    basename: str = 'dtrain_manifold',
) -> None:
    """Scatter-plot a 2-D embedding and save it as WebP, PNG and HTML.

    Parameters
    ----------
    manifold_processed_DataFrame : pandas.DataFrame
        Frame with an ``'x'`` and a ``'y'`` column holding the point coordinates.
    output_dir : str, optional
        Directory that receives the output files. It is created if missing.
        Defaults to the location the notebook originally hard-coded.
    title : str, optional
        Figure title. Defaults to the original hard-coded title.
    basename : str, optional
        File name stem; ``.webp``, ``.png`` and ``.html`` are appended to it.

    Returns
    -------
    None
        The function is called for its side effect of writing three files.
    """
    # Create the target directory here so the function does not depend on
    # another cell having been run first.
    os.makedirs(output_dir, exist_ok=True)

    fig = go.Figure()
    fig.add_trace(
        go.Scatter(
            x=manifold_processed_DataFrame['x'],
            y=manifold_processed_DataFrame['y'],
            mode='markers',
            marker=dict(
                color='LightSkyBlue',
                size=12,
                line=dict(color='MediumPurple', width=2),
            ),
            name='reals',
        )
    )

    # Fixed 900x900 canvas plus title, applied in a single layout update.
    fig.update_layout(autosize=True, width=900, height=900, title=title)

    # Static images first, then the interactive HTML page (not auto-opened).
    stem = os.path.join(output_dir, basename)
    fig.write_image(stem + '.webp')
    fig.write_image(stem + '.png')
    plotly.offline.plot(fig, filename=stem + '.html', auto_open=False)

In [13]:
# Plot the 2-D embedding held in df_manifold and write the image/HTML files.
plot_dimension_reduction(df_manifold)

In [14]:
# Persist the embedding as CSV, omitting the index column.
# NOTE(review): the target file name has no ".csv" extension; confirm that
# downstream readers expect this exact name before renaming it.
manifold_csv_path = '/mnt/f/cluster_analysis/kobert/train/dtrain_manifold'
df_manifold.to_csv(manifold_csv_path, index=False)