In [1]:
import os
import numpy as np

import time
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import davies_bouldin_score

from cuml import IncrementalPCA as cumlPCA
from cuml import TSNE as cumlTSNE


import joblib
import pandas as pd

import json

import plotly
import plotly.express as px
import plotly.graph_objects as go
import tqdm

# reset GPU
import subprocess
from numba import cuda
import threading


In [2]:
# Location of the saved NumPy array (.npy) that the rest of the notebook analyses.
data_path = "/mnt/f/cluster_analysis/gpt2-test.npy"

In [3]:
# Read the array stored at data_path into memory.
data = np.load(data_path)

In [4]:
# Check the dimensions of the loaded array.
data.shape

(584, 786432)

In [5]:
# Peek at the raw values; NumPy abbreviates the repr of a large array.
data

array([[-0.38934901, -0.48881131, -1.67311108, ..., -0.92481565,
         2.05773044,  1.51792395],
       [-0.15043834, -0.4164905 , -2.0979259 , ..., -1.51937842,
         2.33411598,  1.25936997],
       [-0.13826902, -0.6510036 , -2.002563  , ..., -1.31498754,
         2.67430091,  0.86448747],
       ...,
       [-1.12969232, -0.86213106, -1.73689806, ..., -1.39270914,
         2.00119495,  1.35081303],
       [-0.33993092, -1.06993973, -1.93917382, ..., -1.61120725,
         2.25441217,  1.5238806 ],
       [-0.80413491, -1.12395978, -1.19840598, ..., -1.20117712,
         2.20153475,  0.92693681]])

In [6]:
# Learn the scaling statistics from the loaded array, then apply them to the
# same array. The fitted scaler stays available under `scaler`.
scaler = RobustScaler().fit(data)
scaled_data = scaler.transform(data)

In [None]:
# Stage 1: compress the scaled features to 25 components with cuML's
# IncrementalPCA, configured with batch_size=32.
pca_gpu = cumlPCA(batch_size=32, n_components=25)
pca_processed = pca_gpu.fit_transform(scaled_data)

# Stage 2: embed the PCA output in two dimensions with cuML's exact t-SNE.
cumltsne_final = cumlTSNE(
    n_components=2,
    perplexity=5,
    n_neighbors=50,
    method="exact",
)
cuml_processed = cumltsne_final.fit_transform(pca_processed)

# Show the KL divergence reported by the fitted t-SNE model.
cumltsne_final.kl_divergence_

0.2875140905380249

In [None]:
# Re-display the KL divergence reported by the fitted t-SNE model.
cumltsne_final.kl_divergence_

0.2875140905380249

In [None]:
# Check the shape of the t-SNE output.
cuml_processed.shape

(584, 2)

In [None]:
# Wrap the 2-D embedding in a DataFrame with named axes and an explicit
# 0..n-1 integer row index, then display it.
column_axis = ['x', 'y']
df_manifold = pd.DataFrame(
    cuml_processed,
    index=np.arange(len(cuml_processed)),
    columns=column_axis,
)
df_manifold

Unnamed: 0,x,y
0,-58.533691,14.197122
1,-40.949867,-27.241144
2,6.880807,-27.278839
3,-35.222725,23.408892
4,-30.935741,-17.908953
...,...,...
579,-38.426418,20.732286
580,16.499466,6.286759
581,-14.612539,-12.627235
582,25.279703,-11.077900


In [None]:
# Ensure the output directory exists before any files are written into it.
# exist_ok=True tolerates a directory that is already there, while real
# failures (e.g. permission errors or a missing mount) are still raised
# instead of being silently swallowed by a bare `except`.
os.makedirs("/mnt/f/cluster_analysis/gpt2/test/", exist_ok=True)

In [None]:
def plot_dimension_reduction(manifold_processed_DataFrame,
                             output_dir='/mnt/f/cluster_analysis/gpt2/test/',
                             file_stem='dtest_manifold',
                             title='GPT2, Korean Food Test Dataset, Hidden State Output in 2D'):
    """Draw a 2-D scatter plot of an embedding and save it as WebP, PNG and HTML.

    Parameters
    ----------
    manifold_processed_DataFrame
        Object indexable by the keys ``'x'`` and ``'y'`` (for example a pandas
        DataFrame with those two columns) holding the point coordinates.
    output_dir : str, optional
        Directory that receives the output files. It is created if it does
        not exist yet. Defaults to the notebook's original output location.
    file_stem : str, optional
        File name without extension; ``.webp``, ``.png`` and ``.html`` are
        appended to it.
    title : str, optional
        Title shown at the top of the figure.

    Returns
    -------
    None
        The function is called for its side effect of writing three files.
    """
    # Make the function self-sufficient: do not rely on another cell having
    # created the destination directory beforehand.
    os.makedirs(output_dir, exist_ok=True)

    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=manifold_processed_DataFrame['x'],
        y=manifold_processed_DataFrame['y'],
        mode='markers',
        marker=dict(color='LightSkyBlue',
                    size=12,
                    line=dict(color='MediumPurple', width=2)),
        name='reals'))
    fig.update_layout(autosize=True, width=900, height=900, title=title)

    output_base = os.path.join(output_dir, file_stem)
    # Static exports first, in the same order as before (WebP, then PNG).
    for extension in ('.webp', '.png'):
        fig.write_image(output_base + extension)
    # Stand-alone interactive HTML; auto_open=False keeps a browser from opening.
    plotly.offline.plot(fig, filename=output_base + '.html', auto_open=False)

In [None]:
# Generate the scatter plot of the 2-D embedding and write its output files.
plot_dimension_reduction(df_manifold)

In [13]:
# Persist the 2-D coordinates as CSV; index=False leaves the row index out of the file.
df_manifold.to_csv('/mnt/f/cluster_analysis/gpt2/test/dtest_manifold.csv', index=False)