In [1]:
!pip install --quiet umap-learn
print('UMAP install successful.')

UMAP install successful.


Let's load up our data and take a look. What exactly are we dealing with here?

In [2]:
import pandas as pd

DATA = '/kaggle/input/indian-sign-language-hand-landmarks-dataset/Indian Sign Language Gesture Landmarks.csv'

# Read the landmark table, turn the numeric two-hands flag into a boolean,
# and one-hot encode that flag into a pair of indicator columns.
df = (
    pd.read_csv(filepath_or_buffer=DATA)
    .assign(uses_two_hands=lambda frame: frame['uses_two_hands'].eq(1.0))
    .pipe(pd.get_dummies, columns=['uses_two_hands'])
)

# Every column except the label is treated as a model input.
columns = [name for name in df.columns if name != 'target']
df.head()

Unnamed: 0,target,left_hand_x_0,left_hand_y_0,left_hand_z_0,left_hand_x_1,left_hand_y_1,left_hand_z_1,left_hand_x_2,left_hand_y_2,left_hand_z_2,...,right_hand_y_18,right_hand_z_18,right_hand_x_19,right_hand_y_19,right_hand_z_19,right_hand_x_20,right_hand_y_20,right_hand_z_20,uses_two_hands_False,uses_two_hands_True
0,0,0.253815,0.683926,-5.182993e-07,0.334322,0.628995,-0.036808,0.395115,0.536168,-0.059675,...,0.636123,-0.077313,0.728371,0.649146,-0.074293,0.740377,0.632456,-0.069335,False,True
1,0,0.239443,0.705979,-4.334263e-07,0.328341,0.643167,-0.034284,0.39172,0.545824,-0.054809,...,0.631766,-0.067937,0.72932,0.651621,-0.06584,0.74523,0.634763,-0.060055,False,True
2,0,0.235131,0.710337,-4.203069e-07,0.326313,0.646435,-0.033129,0.391156,0.544003,-0.052426,...,0.634359,-0.065477,0.730437,0.652772,-0.064716,0.748273,0.63301,-0.060026,False,True
3,0,0.238261,0.711585,-4.149002e-07,0.32771,0.649296,-0.0326,0.392238,0.548713,-0.051402,...,0.635711,-0.064914,0.731558,0.652528,-0.06477,0.750294,0.631503,-0.060443,False,True
4,0,0.240304,0.714531,-4.118573e-07,0.329382,0.651202,-0.032841,0.392812,0.550876,-0.052709,...,0.637619,-0.066194,0.731997,0.653156,-0.0665,0.75149,0.631916,-0.062321,False,True


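We have 128 columns of input data to produce one target column: 126 landmark coordinates (21 landmarks × 3 axes × 2 hands) plus the two one-hot `uses_two_hands` columns. That seems like a lot of features.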

Are the classes in our target column balanced?

In [3]:
from plotly import express

# Count of rows per class label, to check how balanced the target is.
target_histogram = express.histogram(df, x='target')
target_histogram

Maybe surprisingly, they are more or less balanced. The smallest class is about 85% the size of the largest class.

Let's make a scatter plot of all of our data using UMAP for dimension reduction.

In [4]:
import arrow
from umap import UMAP

# Project all input columns to two dimensions with a seeded UMAP and
# report how long the fit took.
time_start = arrow.now()
umap = UMAP(
    n_epochs=201,
    low_memory=False,
    n_jobs=1,
    random_state=2024,
    verbose=False,
)
embedding = umap.fit_transform(X=df[columns])
df[['umap_x', 'umap_y']] = embedding
elapsed = arrow.now() - time_start
print('done with UMAP in {}'.format(elapsed))

done with UMAP in 0:01:27.403864


We have a lot of datapoints, so in the interest of performance let's just visualize a sample.

In [5]:
# Plot a fixed (seeded) random subset of 5000 embedded rows, coloured by class label.
umap_sample = df.sample(n=5000, random_state=2024)
umap_scatter = express.scatter(umap_sample, x='umap_x', y='umap_y', color='target')
umap_scatter

We do see some clustering, but we also see a lot of seemingly isolated points and a fair number of points whose nearest neighbors are in another class. We should probably have moderate expectations for how accurate our model will be.

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split


# Hold out 20% of the rows for evaluation, stratifying on the label.
X_train, X_test, y_train, y_test = train_test_split(
    df[columns],
    df['target'],
    test_size=0.2,
    random_state=2024,
    stratify=df['target'],
)

logreg = LogisticRegression(max_iter=10000, tol=1e-12).fit(X_train, y_train)
print('model fit in {} iterations'.format(logreg.n_iter_[0]))

# Predict on the held-out rows once and reuse the result for every metric,
# instead of re-running the same prediction for each report.
y_pred = logreg.predict(X=X_test)
print('accuracy: {:5.4f} f1: {:5.4f}'.format(
    accuracy_score(y_true=y_test, y_pred=y_pred),
    f1_score(average='weighted', y_true=y_test, y_pred=y_pred, zero_division=0),
))
print(classification_report(y_true=y_test, y_pred=y_pred, zero_division=0))

model fit in 2158 iterations
accuracy: 0.9698 f1: 0.9697
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       400
           1       1.00      1.00      1.00       393
           2       0.93      0.95      0.94       399
           3       0.96      0.97      0.97       392
           4       0.98      1.00      0.99       399
           5       1.00      1.00      1.00       400
           6       1.00      0.98      0.99       374
           7       0.99      1.00      1.00       367
           8       0.84      0.89      0.87       399
           9       0.99      1.00      1.00       394
          10       0.96      0.99      0.98       400
          11       0.93      0.86      0.89       400
          12       1.00      0.99      1.00       382
          13       1.00      1.00      1.00       386
          14       0.98      1.00      0.99       390
          15       0.99      0.97      0.98       400
          16       0.98 

Wow.

In [7]:
# Coefficients of the first 64 input columns, taken from the first row of
# logreg.coef_ (the row belonging to logreg.classes_[0]).
# These are one value per feature, not a distribution, so a bar chart is used:
# it maps each feature name straight to its coefficient and lets the axes be labelled.
express.bar(
    x=columns[:64],
    y=logreg.coef_[0][:64],
    labels={'x': 'feature', 'y': 'coefficient'},
    title='Logistic regression coefficients for class {} (first 64 features)'.format(logreg.classes_[0]),
)

In [8]:
# Coefficients of the input columns from position 64 onward, taken from the
# first row of logreg.coef_ (the row belonging to logreg.classes_[0]).
# These are one value per feature, not a distribution, so a bar chart is used:
# it maps each feature name straight to its coefficient and lets the axes be labelled.
express.bar(
    x=columns[64:],
    y=logreg.coef_[0][64:],
    labels={'x': 'feature', 'y': 'coefficient'},
    title='Logistic regression coefficients for class {} (features 64 onward)'.format(logreg.classes_[0]),
)

In [9]:
# One row per held-out sample: the model's highest class probability, the
# predicted and actual labels, whether they agree, and the sample's position
# in the previously fitted UMAP embedding.
test_probabilities = logreg.predict_proba(X=X_test)
probability_df = pd.DataFrame(
    data={
        'probability': test_probabilities.max(axis=1),
        'prediction': logreg.predict(X=X_test),
        'actual': y_test.tolist(),
    }
)
probability_df['correct'] = probability_df['actual'].eq(probability_df['prediction'])
test_embedding = umap.transform(X=X_test)
probability_df[['x', 'y']] = test_embedding

probability_df.head()

Unnamed: 0,probability,prediction,actual,correct,x,y
0,0.693913,23,23,True,-3.476283,0.495043
1,0.935544,14,14,True,-1.091689,13.939263
2,0.813324,3,3,True,1.559419,-2.118929
3,0.48806,19,19,True,-1.005898,2.146498
4,0.785987,16,16,True,-5.829111,2.200757


How much test data do we have? 

In [10]:
# Number of rows in the held-out test set.
X_test.shape[0]

10172

What do our model probabilities look like? We don't have a lot of test data, so let's visualize all of it.

In [11]:
# Test rows in UMAP space, coloured by the model's top probability and split
# into one panel per value of the 'correct' column.
probability_scatter = express.scatter(
    data_frame=probability_df,
    x='x',
    y='y',
    color='probability',
    facet_col='correct',
    hover_name='actual',
)
probability_scatter

Our model rarely produces a high probability when it is wrong, which is nice.