In [1]:
import os
import numpy as np

import time
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import davies_bouldin_score

from cuml import IncrementalPCA as cumlPCA
from cuml import TSNE as cumlTSNE


import joblib
import pandas as pd

import json

import plotly
import plotly.express as px
import plotly.graph_objects as go
import tqdm

# reset GPU
import subprocess
from numba import cuda
import threading


In [2]:
# Absolute, machine-specific location of the .npy array that the next cell loads.
data_path = "/mnt/f/cluster_analysis/gpt2-train.npy"

In [3]:
# Read the array stored at `data_path` fully into memory (no memory-mapping is requested).
data = np.load(data_path)

In [4]:
# Sanity-check the dimensions of the loaded array.
data.shape

(2337, 786432)

In [5]:
# Echo the array; NumPy abbreviates the repr of large arrays, so only the corners are shown.
data

array([[-0.46568674, -1.02676618, -2.10928679, ..., -1.23934782,
         2.38638377,  1.14557326],
       [-0.39376995, -0.37493441, -1.8621099 , ..., -1.14678025,
         2.08672857,  1.53005564],
       [-0.95601022, -0.47106016, -1.42170763, ..., -1.31075263,
         2.62891436,  1.2803489 ],
       ...,
       [-0.64854562,  0.03179695, -1.74919176, ..., -0.87320691,
         2.14874005,  1.53177893],
       [-0.48164964, -0.51160353, -1.94577396, ..., -1.23118579,
         2.29757762,  0.91173029],
       [-0.43425503, -0.67103899, -2.09265566, ..., -1.23825562,
         2.31448531,  1.29377663]])

In [6]:
# Scale the raw matrix with a default-configured RobustScaler: the scaler is
# fitted on `data` and then applied to that same array.
scaler = RobustScaler()
scaled_data = scaler.fit(data).transform(data)

In [7]:
# Seed passed to t-SNE so that re-running this cell yields comparable embeddings.
# NOTE(review): cuML documents that its parallel t-SNE is not guaranteed to be
# bit-for-bit deterministic even with a fixed seed — confirm for your version.
TSNE_RANDOM_STATE = 42

# Stage 1: reduce the scaled features to 25 principal components on the GPU
# using the incremental PCA estimator with batch_size=32.
pca_gpu = cumlPCA(n_components=25, batch_size=32)
pca_processed = pca_gpu.fit_transform(scaled_data)

# Stage 2: embed the 25-D PCA output into 2-D with t-SNE (method="exact").
cumltsne_final = cumlTSNE(
    n_components=2,
    perplexity=5,
    n_neighbors=50,
    method="exact",
    random_state=TSNE_RANDOM_STATE,
)
cuml_processed = cumltsne_final.fit_transform(pca_processed)

# Show the KL divergence reported by the fitted t-SNE model.
cumltsne_final.kl_divergence_

0.24280664324760437

In [8]:
# Re-display the KL divergence stored on the fitted t-SNE model.
cumltsne_final.kl_divergence_

0.24280664324760437

In [9]:
# Confirm the embedding has one row per input sample and two output dimensions.
cuml_processed.shape

(2337, 2)

In [10]:
# Wrap the 2-D embedding in a DataFrame: one row per sample, columns named
# after the plot axes, and an explicit 0..n-1 integer index.
column_axis = ['x', 'y']
n_points = cuml_processed.shape[0]
df_manifold = pd.DataFrame(
    data=cuml_processed,
    index=np.arange(n_points),
    columns=column_axis,
)
df_manifold

Unnamed: 0,x,y
0,38.576649,-39.150810
1,-54.216434,139.096512
2,104.826767,-55.286514
3,58.810375,-108.206070
4,-97.672928,-6.757593
...,...,...
2332,3.264577,-107.511368
2333,29.842676,19.792850
2334,2.137388,-94.250923
2335,40.597595,55.810890


In [11]:
# Create the output directory (and any missing parents) for the exports below.
# exist_ok=True keeps the cell safe to re-run when the directory already exists,
# while genuine failures (e.g. permissions, unavailable mount) still raise
# instead of being silently swallowed.
os.makedirs("/mnt/f/cluster_analysis/gpt2/train/", exist_ok=True)

In [12]:
def plot_dimension_reduction(
    manifold_processed_DataFrame,
    output_dir='/mnt/f/cluster_analysis/gpt2/train/',
    basename='dtrain_manifold',
    title='GPT2, Korean Food Training Dataset, Hidden State Output in 2D',
):
    """Scatter-plot a 2-D embedding and export it as WebP, PNG and HTML.

    Parameters
    ----------
    manifold_processed_DataFrame : mapping-like (e.g. pandas.DataFrame)
        Must provide 'x' and 'y' entries holding the point coordinates.
    output_dir : str, optional
        Directory the three export files are written to. It is not created
        here, so it must already exist.
    basename : str, optional
        File name (without extension) shared by the three exports.
    title : str, optional
        Figure title.

    Returns
    -------
    None
        The figure is only written to disk; nothing is displayed or returned.

    Notes
    -----
    Static export goes through ``fig.write_image``.
    NOTE(review): this presumably needs plotly's static-image backend
    (e.g. kaleido) installed — confirm in the target environment.
    """
    fig = go.Figure()

    # Single marker-only trace: light-blue points with a purple outline.
    fig.add_trace(
        go.Scatter(
            x=manifold_processed_DataFrame['x'],
            y=manifold_processed_DataFrame['y'],
            mode='markers',
            marker=dict(
                color='LightSkyBlue',
                size=12,
                line=dict(color='MediumPurple', width=2),
            ),
            name='reals',
        )
    )

    # Fixed 900x900 canvas plus the title, set in one layout update.
    fig.update_layout(autosize=True, width=900, height=900, title=title)

    # All exports share the same path stem and differ only by extension.
    stem = os.path.join(output_dir, basename)
    fig.write_image(stem + '.webp')
    fig.write_image(stem + '.png')
    plotly.offline.plot(fig, filename=stem + '.html', auto_open=False)

In [13]:
# Render the embedding and write its image/HTML exports to disk.
plot_dimension_reduction(df_manifold)

In [14]:
# Persist the 2-D coordinates as CSV; index=False leaves the row index out of the file.
df_manifold.to_csv('/mnt/f/cluster_analysis/gpt2/train/dtrain_manifold.csv', index=False)