In [1]:
import pandas as pd

FOUR = '/kaggle/input/isolet-spoken-letters-of-alphabet/isolet1234.data'
FIVE = '/kaggle/input/isolet-spoken-letters-of-alphabet/isolet5.data'


def load_isolet(path):
    """Read one headerless ISOLET data file and return it as a labelled frame.

    The last column of the file is renamed to 'target' and its numeric class
    code is converted to a lowercase letter (1 -> 'a', 2 -> 'b', ...). All
    other columns keep their positional integer names.

    Parameters
    ----------
    path : str
        Location of the comma-separated .data file.

    Returns
    -------
    pandas.DataFrame
        The feature columns followed by a string 'target' column.
    """
    frame = pd.read_csv(path, header=None)
    frame = frame.rename(columns={frame.columns[-1]: 'target'})
    # chr(code + 96) in the original: 96 is ord('a') - 1, so code 1 becomes 'a'.
    frame['target'] = frame['target'].astype(int).map(lambda code: chr(code + ord('a') - 1))
    return frame


train_df = load_isolet(FOUR)
test_df = load_isolet(FIVE)

# Both files are indexed from 0, so renumber the combined frame; otherwise
# index labels would be duplicated between the train and test rows.
df = pd.concat(objs=[train_df, test_df], axis='index', ignore_index=True)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,608,609,610,611,612,613,614,615,616,target
0,-0.4394,-0.093,0.1718,0.462,0.6226,0.4704,0.3578,0.0478,-0.1184,-0.231,...,0.4102,0.2052,0.3846,0.359,0.5898,0.3334,0.641,0.5898,-0.4872,a
1,-0.4348,-0.1198,0.2474,0.4036,0.5026,0.6328,0.4948,0.0338,-0.052,-0.1302,...,0.0,0.2954,0.2046,0.4772,0.0454,0.2046,0.4318,0.4546,-0.091,a
2,-0.233,0.2124,0.5014,0.5222,-0.3422,-0.584,-0.7168,-0.6342,-0.8614,-0.8318,...,-0.1112,-0.0476,-0.1746,0.0318,-0.0476,0.1112,0.254,0.1588,-0.4762,b
3,-0.3808,-0.0096,0.2602,0.2554,-0.429,-0.6746,-0.6868,-0.665,-0.841,-0.9614,...,-0.0504,-0.036,-0.1224,0.1366,0.295,0.0792,-0.0072,0.0936,-0.151,b
4,-0.3412,0.0946,0.6082,0.6216,-0.1622,-0.3784,-0.4324,-0.4358,-0.4966,-0.5406,...,0.1562,0.3124,0.25,-0.0938,0.1562,0.3124,0.3124,0.2188,-0.25,c


In [2]:
# Count the rows per letter (a missing label would be counted too) and print
# the tally as a plain dict so that it fits on one line.
letter_counts = df['target'].value_counts(dropna=False)
print(letter_counts.to_dict())

{'a': 300, 'b': 300, 'y': 300, 'x': 300, 'w': 300, 'v': 300, 'u': 300, 't': 300, 's': 300, 'r': 300, 'q': 300, 'p': 300, 'o': 300, 'n': 300, 'l': 300, 'k': 300, 'j': 300, 'i': 300, 'h': 300, 'g': 300, 'e': 300, 'd': 300, 'c': 300, 'z': 300, 'm': 299, 'f': 298}


Our classes are essentially balanced: every letter has 300 instances except m (299) and f (298). We have a fair number of instances and a fairly large number of classes, so let's use dimensionality reduction to see how our data clusters.

In [3]:
import arrow
from umap import UMAP

# Feature columns only. The label and the embedding coordinates that this cell
# writes back into `df` are excluded explicitly, so re-running the cell does not
# feed a previous embedding into UMAP as two extra features.
NON_FEATURE_COLUMNS = ('target', 'x', 'y')
COLUMNS = [column for column in df.columns if column not in NON_FEATURE_COLUMNS]

time_start = arrow.now()
# random_state is fixed so that the embedding can be reproduced between runs.
umap = UMAP(random_state=2024, verbose=True, n_jobs=1, low_memory=False, n_epochs=201)
# Store the two embedding coordinates next to the features for plotting.
df[['x', 'y']] = umap.fit_transform(X=df[COLUMNS])
print('done with UMAP in {}'.format(arrow.now() - time_start))

2024-07-12 14:41:15.544435: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-12 14:41:15.544568: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-12 14:41:15.685063: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


UMAP(low_memory=False, n_epochs=201, n_jobs=1, random_state=2024, verbose=True)
Fri Jul 12 14:41:25 2024 Construct fuzzy simplicial set
Fri Jul 12 14:41:25 2024 Finding Nearest Neighbors
Fri Jul 12 14:41:25 2024 Building RP forest with 9 trees
Fri Jul 12 14:41:31 2024 NN descent for 13 iterations
	 1  /  13
	 2  /  13
	 3  /  13
	 4  /  13
	 5  /  13
	Stopping threshold met -- exiting after 5 iterations
Fri Jul 12 14:41:48 2024 Finished Nearest Neighbor Search
Fri Jul 12 14:41:52 2024 Construct embedding


Epochs completed:   0%|            0/201 [00:00]

	completed  0  /  201 epochs
	completed  20  /  201 epochs
	completed  40  /  201 epochs
	completed  60  /  201 epochs
	completed  80  /  201 epochs
	completed  100  /  201 epochs
	completed  120  /  201 epochs
	completed  140  /  201 epochs
	completed  160  /  201 epochs
	completed  180  /  201 epochs
	completed  200  /  201 epochs
Fri Jul 12 14:41:58 2024 Finished embedding
done with UMAP in 0:00:32.832248


Let's visualize a random sample of the data rather than all of it: some clusters are mixes of different letters, and the proportions of letters within those clusters are hard to see when every point is plotted.

In [4]:
import warnings
from plotly import express

# Suppress FutureWarning messages from here on (this filter is process-wide).
warnings.filterwarnings(action='ignore', category=FutureWarning)

# Draw a fixed-seed sample of 1000 rows and colour each point by its letter.
sample_df = df.sample(n=1000, random_state=2024)
figure = express.scatter(data_frame=sample_df, x='x', y='y', color='target', height=800)
figure.show()

For the most part we see that letters cluster according to the primary vowel sound in their name: the letters with names that have an eee sound all cluster together, as do the ehs and the ays. Maybe it's a surprise that the n/m cluster is distinct from the s/t cluster.

Let's build a simple model.

In [5]:
import arrow
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

TARGET = 'target'

time_start = arrow.now()

# Split each frame into features and label once, using the TARGET constant
# instead of repeating the column name as a string literal.
X_train, y_train = train_df.drop(columns=[TARGET]), train_df[TARGET]
X_test, y_test = test_df.drop(columns=[TARGET]), test_df[TARGET]

# Fit on the isolet1234 frame; max_iter is set very high so that the solver
# stops on the tolerance rather than on the iteration cap.
model = LogisticRegression(max_iter=100000, tol=1e-4).fit(X=X_train, y=y_train)
print('model fit in {} iterations took {}'.format(model.n_iter_[0], arrow.now() - time_start))

# Score on the isolet5 frame, which the model never saw during fitting.
print('accuracy: {:5.4f}'.format(accuracy_score(y_true=y_test, y_pred=model.predict(X=X_test))))
print('model done in {}'.format(arrow.now() - time_start))

model fit in 1493 iterations took 0:01:08.122947
accuracy: 0.9622
model done in 0:01:08.179808
