In [None]:
import jupyter_fix
import warnings
import copy
import cv2
import matplotlib.pyplot as plt
import numpy as np
from umap import UMAP
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.mixture import GaussianMixture
from src.utility.symbols_loader import load_kuzushiji_pages, load_emnist_pages, load_kuzushiji_pages_with_spaces, load_emnist_pages_with_spaces
from tqdm import tqdm

# Project helper; presumably adjusts the working directory / import path for notebook runs.
# NOTE(review): it is called after the `src.utility` import above — confirm that ordering is intended.
jupyter_fix.fix_jupyter_path()
# Silence every warning for the remainder of the notebook (library deprecation notices included).
warnings.filterwarnings("ignore")

# Directory prefix (with trailing slash) for the encoded prediction archives.
path = "data/encoded_data/"

In [None]:
# Read the encoded predictions of both datasets from their .npz archives.
# np.load returns an NpzFile that keeps the archive open; the context manager
# closes it as soon as the array has been read into memory.
with np.load(path + "emnist_preds.npz") as archive:
    emnist_pred = archive['arr_0']
with np.load(path + "kuzushiji_preds.npz") as archive:
    kuzushiji_pred = archive['arr_0']

# Decoded EMNIST representation (not referenced by the visible cells below).
with np.load("data/encoded_data/decoded_emnist_representation.npz") as archive:
    decoded_emnist_pred = archive['arr_0']

In [None]:
# Sanity check: for each dataset the first-axis length of the loaded pages
# must equal the first-axis length of the encoded predictions.
emnist_symbol_count = load_emnist_pages(5, trial='trial_0').shape[0]
assert emnist_symbol_count == emnist_pred.shape[0]

kuzushiji_symbol_count = load_kuzushiji_pages(5, trial='trial_0').shape[0]
assert kuzushiji_symbol_count == kuzushiji_pred.shape[0]

In [None]:
# Fixed seed so the 2-D embeddings (and every scatter plot drawn from them)
# are identical after Restart & Run All.
# NOTE: umap-learn gives up some parallelism when random_state is set, so
# fitting may take longer than the unseeded version.
umap_seed = 42

# *_reduced: 2-D UMAP embedding, used only for plotting.
# *_scaled:  standardized full-dimensional features, used for the mixture models.
reducer_emnist = UMAP(n_components=2, random_state=umap_seed)
emnist_reduced = reducer_emnist.fit_transform(emnist_pred)
emnist_scaled = StandardScaler().fit_transform(emnist_pred)

reducer_kuzushiji = UMAP(n_components=2, random_state=umap_seed)
kuzushiji_reduced = reducer_kuzushiji.fit_transform(kuzushiji_pred)
kuzushiji_scaled = StandardScaler().fit_transform(kuzushiji_pred)

In [None]:
plt.rcParams["figure.figsize"] = (16, 8)
fig, (ax1, ax2) = plt.subplots(1, 2, sharey=True)

# Side-by-side view of the two UMAP embeddings. No `c=` values are supplied,
# so a colormap would be ignored (matplotlib only emits a warning, which this
# notebook suppresses); every point is drawn in the default color.
for ax, embedding, title in ((ax1, emnist_reduced, 'Emnist'),
                             (ax2, kuzushiji_reduced, 'Kuzushiji')):
    ax.scatter(embedding[:, 0], embedding[:, 1], s=5, alpha=0.75)
    ax.axis('off')
    ax.set_title(title)
plt.show()

In [None]:
n_components = range(20, 70)
# covariance_type = ['spherical', 'tied', 'diag', 'full']


def _bic_scan(data, component_counts):
    """Fit one spherical Gaussian mixture per component count and collect its BIC.

    Args:
        data: feature matrix the mixtures are fitted and scored on.
        component_counts: iterable of component counts to try.

    Returns:
        List of ``(n_components, bic)`` tuples in the order of ``component_counts``.
    """
    scores = []
    for n_comp in tqdm(component_counts):
        gmm = GaussianMixture(n_components=n_comp, covariance_type='spherical')
        gmm.fit(data)
        scores.append((n_comp, gmm.bic(data)))
    return scores


score_emnist = _bic_scan(emnist_scaled, n_components)
score_kuzushiji = _bic_scan(kuzushiji_scaled, n_components)

# Report the BIC change from each component count to the next one.
# A positive difference means the BIC got worse when a component was added:
#   "!!!!!!!!!" -> worse for both datasets, "????????" -> worse for exactly one.
# The last entry has no successor, so the loop stops one short of the end
# (the original relied on a bare `except: pass` to hide that IndexError,
# which would also have hidden any unrelated failure).
for idx in range(len(score_kuzushiji) - 1):
    n_clusters = score_kuzushiji[idx][0]
    emnist_difference = round(score_emnist[idx+1][1] - score_emnist[idx][1])
    kuzushiji_difference = round(score_kuzushiji[idx+1][1] - score_kuzushiji[idx][1])
    if emnist_difference > 0 and kuzushiji_difference > 0:
        print(f'{n_clusters, emnist_difference, kuzushiji_difference} !!!!!!!!!')
    elif emnist_difference > 0 or kuzushiji_difference > 0:
        print(f'{n_clusters, emnist_difference, kuzushiji_difference} ????????')
    else:
        print(f'{n_clusters, emnist_difference, kuzushiji_difference}')

In [None]:
# Cluster both datasets with a Gaussian mixture and color the UMAP embeddings by cluster.
# NOTE(review): the original note here said the elbow method suggests 50 clusters
# per dataset, but the code fits 46 components — confirm which value is intended.
# NOTE(review): these mixtures use scikit-learn's default covariance type ('full'),
# while the BIC scan was run with 'spherical' — confirm this is deliberate.
n_mixture_components = 46
# Fixed seed: the cluster labels drive everything downstream, so they must be
# reproducible across kernel restarts.
clustering_seed = 42

plt.rcParams["figure.figsize"] = (16, 8)
fig, (ax1, ax2) = plt.subplots(1,2, sharey=True)

clustering_emnist = GaussianMixture(
    n_components=n_mixture_components, random_state=clustering_seed
).fit_predict(emnist_scaled)
ax1.scatter(emnist_reduced[:, 0], emnist_reduced[:, 1], c=clustering_emnist, s=5, alpha=0.75, cmap='nipy_spectral')
ax1.axis('off')
ax1.title.set_text('Emnist')

clustering_kuzushiji = GaussianMixture(
    n_components=n_mixture_components, random_state=clustering_seed
).fit_predict(kuzushiji_scaled)
ax2.scatter(kuzushiji_reduced[:, 0], kuzushiji_reduced[:, 1], c=clustering_kuzushiji, s=5, alpha=0.75, cmap='nipy_spectral')
ax2.axis('off')
ax2.title.set_text('Kuzushiji')
plt.show()

In [None]:
# Rank the clusters of each dataset by size, largest first.
# Results are parallel tuples: cluster ids (as strings) and their member counts.
kuzushiji_ids, kuzushiji_sizes = np.unique(clustering_kuzushiji, return_counts=True)
emnist_ids, emnist_sizes = np.unique(clustering_emnist, return_counts=True)

# A stable sort on the negated sizes gives descending order while keeping
# equally sized clusters in ascending-id order.
kuzushiji_order = np.argsort(-kuzushiji_sizes, kind='stable')
emnist_order = np.argsort(-emnist_sizes, kind='stable')

unique_kuzushiji = tuple(kuzushiji_ids.astype('str')[kuzushiji_order])
counts_kuzushiji = tuple(kuzushiji_sizes[kuzushiji_order])

unique_emnist = tuple(emnist_ids.astype('str')[emnist_order])
counts_emnist = tuple(emnist_sizes[emnist_order])

In [None]:
# plt.rcParams["figure.figsize"] = (16, 8)
# fig, (ax1, ax2) = plt.subplots(1,2, sharey=True)
#
# ax1.bar(unique_emnist, counts_emnist, color='blue')
# ax1.axis('off')
# ax1.title.set_text('Emnist')
#
# ax2.bar(unique_kuzushiji, counts_kuzushiji, color='red')
# ax2.axis('off')
# ax2.title.set_text('Kuzushiji')
# plt.show()

In [None]:
plt.rcParams["figure.figsize"] = (8, 8)

# Overlay the size-ranked cluster counts of both datasets (semi-transparent bars)
# to compare the two size distributions rank by rank.
plt.axis('off')
plt.bar(range(len(counts_emnist)), counts_emnist, color='blue', label='emnist', alpha=0.6)
plt.bar(range(len(counts_kuzushiji)), counts_kuzushiji, color='red', label='kuzushiji', alpha=0.6)
# The bars carry labels, so draw the legend that tells the two colors apart.
plt.legend()
plt.show()

In [None]:
# Load the symbols together with their space information for both datasets.
# NOTE(review): the meaning of the positional argument 5 is defined by the project
# loaders and is not visible here — confirm against src.utility.symbols_loader.
# Later cells index *_pages by symbol position and test slot indices for membership in kuzushiji_spaces.
kuzushiji_pages, kuzushiji_spaces = load_kuzushiji_pages_with_spaces(5, trial='trial_0')
emnist_pages, emnist_spaces = load_emnist_pages_with_spaces(5, trial='trial_0')

In [None]:
def _indices_by_cluster(cluster_labels, n_items):
    """Group the positions ``0 .. n_items - 1`` by their cluster label.

    Args:
        cluster_labels: indexable sequence holding one cluster label per position.
        n_items: number of leading positions to group.

    Returns:
        Dict mapping each label to the list of positions carrying it, in
        ascending position order.
    """
    members = dict()
    for idx in range(n_items):
        # setdefault creates the list on first sight of a label; no exception
        # handling is needed (the original used a bare `except:` for this).
        members.setdefault(cluster_labels[idx], list()).append(idx)
    return members


emnist_char_clusters = _indices_by_cluster(clustering_emnist, len(emnist_pages))
kuzushiji_char_clusters = _indices_by_cluster(clustering_kuzushiji, len(kuzushiji_pages))

# Clusters are paired by size rank: the i-th largest Kuzushiji cluster is mapped to
# the i-th largest EMNIST cluster. Every Kuzushiji cluster needs a partner, so fail
# with a clear message instead of an IndexError halfway through the loop.
if len(unique_emnist) < len(unique_kuzushiji):
    raise ValueError(
        f"cannot pair clusters by rank: {len(unique_kuzushiji)} kuzushiji clusters "
        f"but only {len(unique_emnist)} emnist clusters"
    )

kuzushiji_to_emnist = dict()
emnist_to_kuzushiji = dict()

# zip stops at the shorter sequence, i.e. after the last Kuzushiji cluster.
for kuzushiji_id, emnist_id in zip(unique_kuzushiji, unique_emnist):
    kuzushiji_to_emnist[kuzushiji_id] = emnist_id
    emnist_to_kuzushiji[emnist_id] = kuzushiji_id

In [None]:
# Replace every Kuzushiji symbol's cluster label with the id of its paired EMNIST cluster.
# The lookup table is keyed by string ids, hence the str() conversion.
kuzushiji_translated_emnist = [
    kuzushiji_to_emnist[str(label)] for label in clustering_kuzushiji
]

In [None]:
# For each translated symbol, draw one random member of the target EMNIST cluster
# and use that symbol's image. Cluster ids arrive as strings; the member table is
# keyed by integer labels.
picture_vector = [
    emnist_pages[np.random.choice(emnist_char_clusters[int(cluster_id)])]
    for cluster_id in kuzushiji_translated_emnist
]

In [None]:
# Page geometry: a grid of `rows` x `columns` square cells, each `cell` pixels wide.
cell = 32
columns, rows = 80, 114
max_chars = rows * columns

# Blank page template: (rows * cell) pixels high, (columns * cell) pixels wide, all zeros.
empty_sheet = np.zeros((rows * cell, columns * cell))

In [None]:
# Lay the translated symbol images out on pages, row by row, leaving the slots
# listed in kuzushiji_spaces blank.
l_idx = 0
translated_emnist_sheets = list()
iter_picture_vector = iter(picture_vector)

# Total number of slots to fill: one per picture plus one per space.
total_slots = len(picture_vector) + len(kuzushiji_spaces)
# Set lookup instead of scanning the collection once per slot.
# Assumes kuzushiji_spaces is a flat collection of integer slot indices — TODO confirm.
space_slots = set(kuzushiji_spaces)

# Loop until every slot is placed. The original exited only when l_idx hit the
# total exactly, which requires the total to be a multiple of the page size;
# otherwise next() raised StopIteration and the final, partly filled page was lost.
while l_idx < total_slots:
    sheet = empty_sheet.copy()

    slot_origins = (
        (i, j)
        for i in range(0, sheet.shape[0], cell)
        for j in range(0, sheet.shape[1], cell)
    )
    for i, j in slot_origins:
        if l_idx >= total_slots:
            # Everything is placed; the rest of this page stays blank.
            break
        if l_idx not in space_slots:
            sheet[i:i + cell, j:j + cell] = next(iter_picture_vector)
        # Space slots keep the zeros they already have from the template.
        l_idx += 1

    translated_emnist_sheets.append(sheet)

In [None]:
# NOTE(review): this rebinds `path`, which earlier pointed at the encoded-data
# directory; re-running the loading cell after this one would read from the wrong place.
path = 'data/translated_pages/emnist/'

# Write every page as an 8-bit PNG. convertScaleAbs multiplies by 255 and
# saturates to uint8 (assumes sheet values lie in [0, 1] — TODO confirm).
for idx, sheet in enumerate(translated_emnist_sheets):
    img = cv2.convertScaleAbs(sheet, alpha=255.0)
    # `path` already ends with a slash, so no extra separator is added.
    target = path + "emnist_" + str(idx) + ".png"
    # cv2.imwrite signals failure (e.g. missing directory) through its return value.
    if not cv2.imwrite(target, img):
        raise OSError(f"could not write {target}; check that the output directory exists")