In [1]:
import os
import numpy as np

import time
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import davies_bouldin_score

from cuml import IncrementalPCA as cumlPCA
from cuml import TSNE as cumlTSNE


import joblib
import pandas as pd

import json

import plotly
import plotly.express as px
import plotly.graph_objects as go
import tqdm

# reset GPU
import subprocess
from numba import cuda
import threading


In [2]:
# Location of the saved NumPy array that this notebook loads and embeds.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
data_path = "/mnt/f/cluster_analysis/distilbert-train.npy"

In [3]:
# Read the .npy file at data_path into an in-memory NumPy array.
data = np.load(data_path)

In [4]:
# Show the array dimensions; presumably (samples, flattened hidden-state features) — confirm.
data.shape

(2337, 294912)

In [5]:
# Echo the array for a quick visual check; NumPy abbreviates large arrays in the repr.
data

array([[ 1.02084911,  0.8680743 ,  1.69840884, ..., -0.69521117,
         1.85809588, -0.25243106],
       [ 0.81879854,  1.09136426,  1.68719244, ..., -0.51915038,
         2.05521703, -0.17780054],
       [ 0.91335052,  1.02614546,  1.91160619, ..., -0.71842301,
         1.86320925,  0.06191902],
       ...,
       [ 0.90670973,  1.2899338 ,  1.70845807, ..., -0.51992625,
         1.92611516, -0.11209759],
       [ 0.86309361,  1.0453341 ,  1.64250541, ..., -0.562617  ,
         1.95709324,  0.03385187],
       [ 0.77229321,  1.2219336 ,  1.70545399, ..., -0.44734398,
         1.92916894,  0.11711369]])

In [6]:
# Fit a RobustScaler on the full array, then apply it to that same array.
# The fitted scaler stays available as `scaler` for later cells.
scaler = RobustScaler().fit(data)
scaled_data = scaler.transform(data)

In [7]:
# Two-stage reduction with cuML: incremental PCA compresses the scaled
# features to 25 components, then exact t-SNE embeds those into 2-D.
pca_gpu = cumlPCA(n_components=25, batch_size=32)
pca_processed = pca_gpu.fit_transform(scaled_data)

# Seed t-SNE so that Restart & Run All starts from the same random state.
# NOTE(review): cuML documents that its t-SNE can still differ slightly
# between runs even with a fixed seed; any output recorded from an unseeded
# run will not match exactly after this change.
TSNE_SEED = 42
cumltsne_final = cumlTSNE(
    n_components=2,
    perplexity=5,
    n_neighbors=50,
    method="exact",
    random_state=TSNE_SEED,
)
cuml_processed = cumltsne_final.fit_transform(pca_processed)

# Display the KL divergence stored on the fitted t-SNE model.
cumltsne_final.kl_divergence_

0.3129844069480896

In [8]:
# Show the KL divergence stored on the fitted t-SNE model.
cumltsne_final.kl_divergence_

0.3129844069480896

In [9]:
# Check the embedding dimensions (t-SNE was configured with n_components=2).
cuml_processed.shape

(2337, 2)

In [10]:
# Wrap the 2-D embedding in a DataFrame with named axes and an explicit
# 0..n-1 integer index, then display it.
column_axis = ['x', 'y']
df_manifold = pd.DataFrame(
    data=cuml_processed,
    index=np.arange(cuml_processed.shape[0]),
    columns=column_axis,
)
df_manifold

Unnamed: 0,x,y
0,-117.752182,-44.765297
1,3.223371,94.655083
2,-80.087158,-69.563362
3,-75.593559,65.009048
4,15.957397,125.094986
...,...,...
2332,22.260605,-0.802777
2333,-12.615958,-71.338539
2334,-17.663586,16.931108
2335,-31.762358,-33.692554


In [11]:
# Ensure the output directory exists. exist_ok=True tolerates a directory that
# is already there, while real failures (permissions, missing mount) raise
# here instead of being silently ignored and surfacing later at write time.
os.makedirs("/mnt/f/cluster_analysis/distilBERT/train/", exist_ok=True)

In [12]:
def plot_dimension_reduction(
    manifold_processed_DataFrame,
    output_dir='/mnt/f/cluster_analysis/distilBERT/train/',
    file_stem='dtrain_manifold',
    title='DistilBERT, Korean Food Training Dataset, Hidden State Output in 2D',
):
    """Scatter-plot a 2-D embedding and export it as WebP, PNG and HTML.

    Parameters
    ----------
    manifold_processed_DataFrame : pandas.DataFrame or similar mapping
        Must provide 'x' and 'y' entries holding the point coordinates.
    output_dir : str, optional
        Directory that receives the exported files; created if it is missing.
    file_stem : str, optional
        File name (without extension) shared by the three exported files.
    title : str, optional
        Title shown on the figure.

    Returns
    -------
    None
        The figure is written to disk only.

    Notes
    -----
    With the default arguments the files are
    ``dtrain_manifold.webp``, ``dtrain_manifold.png`` and
    ``dtrain_manifold.html`` under ``/mnt/f/cluster_analysis/distilBERT/train/``.
    """
    # Make the function usable with any output_dir, not only a pre-created one.
    os.makedirs(output_dir, exist_ok=True)
    target = os.path.join(output_dir, file_stem)

    fig = go.Figure()
    fig.add_trace(
        go.Scatter(
            x=manifold_processed_DataFrame['x'],
            y=manifold_processed_DataFrame['y'],
            mode='markers',
            marker=dict(
                color='LightSkyBlue',
                size=12,
                line=dict(color='MediumPurple', width=2),
            ),
            name='reals',
        )
    )
    fig.update_layout(autosize=True, width=900, height=900, title=title)

    # Static images first, then a standalone HTML page (not opened in a browser).
    fig.write_image(target + '.webp')
    fig.write_image(target + '.png')
    plotly.offline.plot(fig, filename=target + '.html', auto_open=False)


In [13]:
# Plot the embedding and write the image/HTML exports to disk.
plot_dimension_reduction(df_manifold)

In [14]:
# Persist the x/y coordinates as CSV; index=False leaves the row index out of the file.
df_manifold.to_csv('/mnt/f/cluster_analysis/distilBERT/train/dtrain_manifold.csv', index=False)