In [1]:
import panel as pn

In [2]:
pn.extension()

In [3]:
md = """
## Abstract
This paper demonstrates that inconsistent scaling between training and test sets can cause key data patterns to disappear, harming machine learning performance. Proper normalization---applying scaling parameters learned solely from the training data---is essential to preserve meaningful features and ensure reliable model performance.
"""

abstract = pn.pane.Markdown(md)
abstract

BokehModel(combine_events=True, render_bundle={'docs_json': {'f55d0ef2-664b-47ba-ab86-3b3e6711e7f4': {'version…

In [4]:
md = """
## Introduction
Normalizing numerical variables in a dataset is essential, especially when employing distance-based machine learning algorithms. This paper does not focus on normalization itself, but rather on the consistent scaling across different data splits, such as training and test sets.
"""

introduction = pn.pane.Markdown(md)
introduction

BokehModel(combine_events=True, render_bundle={'docs_json': {'b3957e8f-eee3-427b-a442-8a55db4f8bd3': {'version…

In [5]:
md = """
## A Visual Introduction
Given a dataset consisting of pixel information from an image, each observation corresponds to a single pixel related to a specific layer. Different layers may overlap; that is, two observations can share the same coordinates on the two-dimensional canvas. In addition to the coordinates, each pixel has an RGBA value, separated into individual columns, along with a layer ID.
"""

visual_intro = pn.pane.Markdown(md)

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from matplotlib.colors import ListedColormap

In [7]:
def get_image_array(X, canvas_template):
	array = canvas_template.copy()
	ii = X['i'].to_numpy().astype(int)
	jj = X['j'].to_numpy().astype(int)
	rgba = X[['r', 'g', 'b', 'a']].to_numpy().astype(array.dtype)
	array[jj, ii] = rgba
	return array

def get_image_array_scaled(X, canvas_template, mono=False, binary=False):
	width, height = canvas_template.shape[1], canvas_template.shape[0]

	ii = (np.round(X['i'] * (width - 1))).to_numpy().astype(int)
	jj = (np.round(X['j'] * (height - 1))).to_numpy().astype(int)
	
	if binary:
		array = np.full((height, width), np.nan)
		array[jj, ii] = 1
		array = np.ma.masked_invalid(array)

	elif mono:
		array = np.full((height, width), np.nan)
		rgba = X[['r', 'g', 'b', 'a']].to_numpy().astype(array.dtype)
		for x, y, rgb in zip(ii, jj, rgba[:, :3]):
			gray = np.dot(rgb, [0.2989, 0.5870, 0.1140]) / 255.0
			array[y, x] = gray

	else:
		array = canvas_template.copy()
		rgba = X[['r', 'g', 'b', 'a']].to_numpy().astype(array.dtype)
		array[jj, ii] = rgba

	return array

def show_layer(X, canvas_template, ax=None, hide_axes=False, title="Layer"):
	if ax is None:
		ax = plt.gca()

	ax.imshow(get_image_array(X, canvas_template), interpolation="none")
	ax.set_title(title)

	if hide_axes:
		ax.axes.set_axis_off()

def show_layer_positions(X, canvas_template, ax=None, hide_axes=False, title="Layer", marker=".", color="black", s=.025):
	if ax is None:
		ax = plt.gca()

	ax.scatter(X["i"], X["j"], marker=marker, color=color, s=s)
	ax.set_title(title)

	ax.axis("scaled")
	ax.set_xlim([0, canvas_template.shape[1] - 1])
	ax.set_ylim([0, canvas_template.shape[0] - 1])
	ax.yaxis.set_inverted(True)

	if hide_axes:
		ax.axes.set_axis_off()

def show_original(x_train, x_test, canvas_template, ax=None, hide_axes=False):
	if ax is None:
		ax = plt.gca()

	img_arrays = []
	img_arrays.append(get_image_array(x_train, canvas_template))
	img_arrays.append(get_image_array(x_test, canvas_template))

	ax.imshow(img_arrays[0], interpolation="none")
	ax.imshow(img_arrays[1], interpolation="none")
	ax.set_title("Original scale")

	if hide_axes:
		ax.axes.set_axis_off()

def show_scaled(x_train, x_test, canvas_template, features_to_scale, ax=None, hide_axes=False):
	if ax is None:
		ax = plt.gca()

	scaler = MinMaxScaler()
	scaler.fit(x_train[features_to_scale])

	x_train_scaled = x_train.copy()
	x_train_scaled[features_to_scale] = scaler.transform(x_train_scaled[features_to_scale])

	x_test_scaled = x_test.copy()
	x_test_scaled[features_to_scale] = scaler.transform(x_test_scaled[features_to_scale])

	img_arrays = []
	img_arrays.append(get_image_array_scaled(x_train_scaled, canvas_template))
	img_arrays.append(get_image_array_scaled(x_test_scaled, canvas_template))

	ax.imshow(img_arrays[0], interpolation="none", extent=(0, 1, 1, 0), aspect='equal')
	ax.imshow(img_arrays[1], interpolation="none", extent=(0, 1, 1, 0), aspect='equal')
	
	ax.set_xticks([0, 1])
	ax.set_yticks([0, 1])
	ax.set_title("Normalized scale")

	if hide_axes:
		ax.axes.set_axis_off()

def show_badly_scaled(x_train, x_test, canvas_template, features_to_scale, ax=None, hide_axes=False, extent=(0, 1, 1, 0), aspect="equal", interpolation="none",
					  mono=False, mono_cmap="binary", highlight=False, highlight_color="red"):
	if ax is None:
		ax = plt.gca()

	scaler = MinMaxScaler()

	x_train_scaled = x_train.copy()
	x_train_scaled[features_to_scale] = scaler.fit_transform(x_train_scaled[features_to_scale])

	x_test_scaled = x_test.copy()
	x_test_scaled[features_to_scale] = scaler.fit_transform(x_test_scaled[features_to_scale])

	img_arrays = []
	img_arrays.append(get_image_array_scaled(x_train_scaled, canvas_template, mono=mono))
	img_arrays.append(get_image_array_scaled(x_test_scaled, canvas_template, mono=mono, binary=highlight))

	ax.imshow(img_arrays[0], interpolation="none", extent=extent, aspect=aspect, cmap=mono_cmap)

	if highlight:
		mono_cmap = ListedColormap([highlight_color])

	ax.imshow(img_arrays[1], interpolation=interpolation, extent=extent, aspect=aspect, cmap=mono_cmap)

	ax.set_xticks([0, 1])
	ax.set_yticks([0, 1])
	ax.set_title("Improperly normalized scale")

	if hide_axes:
		ax.axes.set_axis_off()

In [8]:
import imageio.v3 as iio

In [9]:
dirurl = "./resources"
file_names = ["background_medium.png", "object_medium.png"]

images = []
image_pixels = []

for image_name in file_names:
	img = iio.imread(f"{dirurl}/{image_name}")
	images.append(img)
	image_pixels.append(img)

In [10]:
column_names = ["i", "j", "r", "g", "b", "a", "lid"]
data = pd.DataFrame(columns=column_names)


for (img_i, img), pixels in zip(enumerate(images), image_pixels):
	rows = []
	h, w = img.shape[:2]

	for (y, x) in np.ndindex((h, w)):
		r, g, b, a = pixels[y, x]
		transparent = (np.array([r, g, b, a]) == 0).all()
		
		if not transparent:
			rows.append({"i": y, "j": x, "r": r, "g": g, "b": b, "a": a, "lid": img_i})

	data = pd.concat((data, pd.DataFrame(rows, columns=column_names)), ignore_index=True)

In [11]:
data

Unnamed: 0,i,j,r,g,b,a,lid
0,0,0,78,70,53,255,0
1,0,1,79,72,57,255,0
2,0,2,85,79,63,255,0
3,0,3,85,78,61,255,0
4,0,4,84,75,59,255,0
...,...,...,...,...,...,...,...
670258,432,436,0,0,0,1,1
670259,433,432,55,46,36,28,1
670260,433,433,51,45,38,40,1
670261,433,434,36,36,36,7,1


In [12]:
data_card = pn.Card(title="Sample data.", collapsible=False)
data_card.append(pn.pane.DataFrame(data, max_rows=10, width=400))

data_card

BokehModel(combine_events=True, render_bundle={'docs_json': {'cec67f90-54fe-493b-9967-b4e9278c4a43': {'version…

## App

In [13]:
app = pn.Column(
    abstract,
    introduction,
    visual_intro,
    data_card
)

app.servable()

BokehModel(combine_events=True, render_bundle={'docs_json': {'35f1d467-e74f-4c52-88e9-63ece6022353': {'version…