In [1]:
import arrow
import pandas as pd

# Locations of the digit-recognizer train/test CSVs under /kaggle/input.
TRAIN = '/kaggle/input/digit-recognizer/train.csv'
TEST = '/kaggle/input/digit-recognizer/test.csv'

# Read the test file first, then the train file, and report the elapsed wall time.
time_start = arrow.now()
test_df, train_df = (
    pd.read_csv(filepath_or_buffer=csv_path) for csv_path in (TEST, TRAIN)
)
print('{}: data load complete.'.format(arrow.now() - time_start))

0:00:06.734670: data load complete.


In [2]:
from plotly import express
# Total each pixel column over all training rows (label excluded) and show the
# totals laid out as a 28x28 image.
pixel_totals = train_df.drop(columns=['label']).sum()
express.imshow(img=pixel_totals.to_numpy().reshape(28, 28))

In [3]:
# Names of the non-label columns whose values sum to zero over the training set,
# in their original column order.
pixel_sums = train_df.drop(columns=['label']).sum()
drop_columns = pixel_sums[pixel_sums == 0].index.tolist()

In [4]:
# Drop the same zero-sum columns from both frames so train and test keep
# identical feature columns.
slim_train_df, slim_test_df = (
    frame.drop(columns=drop_columns) for frame in (train_df, test_df)
)

In [5]:
# Display (rows, columns) of the slimmed train frame followed by the slimmed test frame.
tuple(frame.shape for frame in (slim_train_df, slim_test_df))

((42000, 709), (28000, 708))

In [6]:
from arrow import now
from umap import UMAP

time_start = now()

# Feature columns are exactly those named 'pixel...'; this excludes 'label' and,
# when the cell is re-run, the 'x'/'y' columns added below.
columns = [column for column in slim_train_df.columns if column.startswith('pixel')]

# Fit UMAP on the training pixels and keep the fitted model in `umap` so later
# cells can project other data with it.
umap = UMAP(random_state=2024, verbose=True, n_jobs=1, low_memory=False, n_epochs=100)

# Assigning a DataFrame to columns aligns on index labels. Build the embedding
# frame on the training frame's own index so the assignment stays row-for-row
# even if that index is not the default 0..n-1 range (otherwise rows would be
# silently filled with NaN or misaligned).
embedding = pd.DataFrame(
    data=umap.fit_transform(X=slim_train_df[columns]),
    index=slim_train_df.index,
)
slim_train_df[['x', 'y']] = embedding

# 'label' is an integer column, which Plotly Express would render with a
# continuous colour scale. The digits are classes, so plot them as strings to
# get one discrete colour (and legend entry) per digit, in sorted order.
plot_df = slim_train_df[['x', 'y']].assign(label=slim_train_df['label'].astype(str))
express.scatter(
    data_frame=plot_df,
    x='x',
    y='y',
    color='label',
    category_orders={'label': sorted(plot_df['label'].unique())},
).show()
print('done with UMAP in {}'.format(now() - time_start))

2024-03-29 17:24:10.188926: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-29 17:24:10.189072: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-29 17:24:10.375719: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


UMAP(low_memory=False, n_epochs=100, n_jobs=1, random_state=2024, verbose=True)
Fri Mar 29 17:24:25 2024 Construct fuzzy simplicial set
Fri Mar 29 17:24:25 2024 Finding Nearest Neighbors
Fri Mar 29 17:24:25 2024 Building RP forest with 15 trees
Fri Mar 29 17:24:33 2024 NN descent for 15 iterations
	 1  /  15
	 2  /  15
	 3  /  15
	 4  /  15
	 5  /  15
	Stopping threshold met -- exiting after 5 iterations
Fri Mar 29 17:25:01 2024 Finished Nearest Neighbor Search
Fri Mar 29 17:25:06 2024 Construct embedding


Epochs completed:   0%|            0/100 [00:00]

	completed  0  /  100 epochs
	completed  10  /  100 epochs
	completed  20  /  100 epochs
	completed  30  /  100 epochs
	completed  40  /  100 epochs
	completed  50  /  100 epochs
	completed  60  /  100 epochs
	completed  70  /  100 epochs
	completed  80  /  100 epochs
	completed  90  /  100 epochs
Fri Mar 29 17:25:25 2024 Finished embedding


done with UMAP in 0:01:01.020389


In [7]:
# Project the test pixels with the already-fitted `umap` model. The embedding
# frame is built on the test frame's own index because DataFrame-to-column
# assignment aligns on index labels; with a fresh default index the rows would
# only line up while slim_test_df still has a 0..n-1 index.
slim_test_df[['x', 'y']] = pd.DataFrame(
    data=umap.transform(X=slim_test_df[columns]),
    index=slim_test_df.index,
)

# Scatter the projected test points; reset_index() exposes the row index as an
# 'index' column so it can be shown as the hover name.
express.scatter(data_frame=slim_test_df.reset_index(), x='x', y='y', hover_name='index').show()

Fri Mar 29 17:25:30 2024 Worst tree score: 0.61926190
Fri Mar 29 17:25:30 2024 Mean tree score: 0.62758730
Fri Mar 29 17:25:30 2024 Best tree score: 0.63571429
Fri Mar 29 17:25:36 2024 Forward diversification reduced edges from 630000 to 265896
Fri Mar 29 17:25:41 2024 Reverse diversification reduced edges from 265896 to 265896
Fri Mar 29 17:25:45 2024 Degree pruning reduced edges from 289530 to 289530
Fri Mar 29 17:25:45 2024 Resorting data and graph based on tree order
Fri Mar 29 17:25:45 2024 Building and compiling search function


Epochs completed:   0%|            0/33 [00:00]

	completed  0  /  33 epochs
	completed  3  /  33 epochs
	completed  6  /  33 epochs
	completed  9  /  33 epochs
	completed  12  /  33 epochs
	completed  15  /  33 epochs
	completed  18  /  33 epochs
	completed  21  /  33 epochs
	completed  24  /  33 epochs
	completed  27  /  33 epochs
	completed  30  /  33 epochs


In [8]:
import arrow
from sklearn.neighbors import KNeighborsClassifier

# Fit a 3-nearest-neighbour classifier on the pixel columns, predict the test
# set, and write the predictions as a zipped CSV with ImageId/Label columns.
target = 'label'
best_file = '/kaggle/working/best.csv.zip'
time_start = arrow.now()

best = KNeighborsClassifier(n_neighbors=3)
best.fit(X=slim_train_df[columns], y=slim_train_df[target])
predicted_labels = best.predict(X=slim_test_df[columns])

# ImageId is the 1-based position of each prediction in test-frame row order.
best_df = pd.DataFrame(
    data={
        'ImageId': range(1, len(predicted_labels) + 1),
        'Label': predicted_labels,
    }
)

print('{} : writing best so far guess to {}'.format(arrow.now() - time_start, best_file))
best_df.to_csv(path_or_buf=best_file, index=False, compression='zip')
print('{} : done.'.format(arrow.now() - time_start))

0:00:49.351859 : writing best so far guess to /kaggle/working/best.csv.zip
0:00:49.427152 : done.
