In [47]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import xgboost as xgb
import sklearn
from tensorflow import keras
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import pickle

In [48]:
language = 'fsl'

In [49]:
datasets = {chr(letter):pd.read_csv(f'processed_data/{language}_mp01021/{chr(letter)}_Data.csv') for letter in range(ord('A'), ord('Z') + 1)}

In [50]:
datasets['A'].head()

Unnamed: 0,Landmarks,0,0.1,0.2,1,1.1,1.2,2,2.1,2.2,...,17.2,18,18.1,18.2,19,19.1,19.2,20,20.1,20.2
0,Coordinates,x,y,z,x,y,z,x,y,z,...,z,x,y,z,x,y,z,x,y,z
1,Frame,,,,,,,,,,...,,,,,,,,,,
2,1,-0.328798770904541,0.9440675973892212,-0.025055259466171265,0.09019581228494644,0.9056023955345154,-0.2417997121810913,0.46820488572120667,0.5142858624458313,-0.30139783024787903,...,-0.08161065727472305,-0.49492689967155457,-0.06241708993911743,-0.1844290941953659,-0.4510268270969391,0.20504318177700043,-0.10821167379617691,-0.46734902262687683,0.2073359340429306,-0.024087782949209213
3,2,-0.2267197072505951,0.9729245901107788,-0.044898826628923416,0.20533159375190735,0.8290771842002869,-0.22340598702430725,0.5266153812408447,0.4613969922065735,-0.29489830136299133,...,-0.09364113956689835,-0.4352213144302368,-0.06646435707807541,-0.1759088933467865,-0.39210861921310425,0.18160800635814667,-0.09695502370595932,-0.4101670980453491,0.21136802434921265,-0.012231195345520973
4,3,-0.29781967401504517,0.9545851945877075,-0.008398245088756084,0.12096328288316727,0.8573551177978516,-0.1993054896593094,0.4726623594760895,0.46044933795928955,-0.22897109389305115,...,-0.008828305639326572,-0.4492706060409546,-0.06148320436477661,-0.11127342283725739,-0.42576828598976135,0.19160698354244232,-0.04531983286142349,-0.4427775740623474,0.2054169476032257,0.03030148148536682


In [51]:
for key, value in datasets.items():
    datasets[key] = datasets[key].drop([0,1])
    datasets[key] = datasets[key].drop(columns=['Landmarks'])
    datasets[key]['Letter'] = key
    datasets[key] = datasets[key].sample(frac=1).reset_index(drop=True)

In [52]:
datasets['B'].head()

Unnamed: 0,0,0.1,0.2,1,1.1,1.2,2,2.1,2.2,3,...,18,18.1,18.2,19,19.1,19.2,20,20.1,20.2,Letter
0,-0.2160150557756424,0.9738113880157472,0.0709148868918418,0.0934617817401886,0.815496027469635,-0.1321834772825241,0.2883219122886657,0.4936845600605011,-0.2058582305908203,0.1804180592298507,...,-0.3152645230293274,-0.2375402599573135,-0.0856349319219589,-0.2894911766052246,-0.4183290898799896,-0.1071999594569206,-0.275509625673294,-0.5794624090194702,-0.1291219741106033,B
1,0.0473339967429637,0.9986663460731506,-0.0206151474267244,0.2659691870212555,0.708798885345459,-0.0773437619209289,0.2931068539619446,0.3462115228176117,-0.1060476526618003,0.1503421366214752,...,-0.3579603135585785,-0.0944850593805313,-0.1517567783594131,-0.3652352690696716,-0.2766677439212799,-0.1512681394815445,-0.3506331443786621,-0.4345927834510803,-0.1607088595628738,B
2,-0.2353502660989761,0.9714571833610536,0.029685866087675,0.0698206648230552,0.7957343459129333,-0.1218542084097862,0.2416078448295593,0.4736809134483337,-0.1781067252159118,0.1214927807450294,...,-0.3361363112926483,-0.2017539590597152,-0.1249743998050689,-0.3097947239875793,-0.3969786167144775,-0.1655750125646591,-0.2781464457511902,-0.5613037943840027,-0.2040757536888122,B
3,-0.2825384140014648,0.9558480381965636,0.0807869210839271,0.0123580740764737,0.8269078731536865,-0.0752541199326515,0.2093178480863571,0.5194309949874878,-0.1453175991773605,0.090830385684967,...,-0.3459891974925995,-0.2348772138357162,-0.1153746917843818,-0.3257926702499389,-0.4264694452285766,-0.1421770900487899,-0.3046497106552124,-0.5919105410575867,-0.1724811792373657,B
4,-0.2166042774915695,0.972685694694519,0.0834570750594139,0.0800075680017471,0.8180012702941895,-0.1025720387697219,0.2874310612678528,0.5286237597465515,-0.1805153489112854,0.2001450359821319,...,-0.3202137351036072,-0.2416366040706634,-0.1002151146531105,-0.2970669865608215,-0.4221753478050232,-0.1086134016513824,-0.2812380790710449,-0.5861000418663025,-0.1184395030140876,B


In [53]:
data = pd.concat(datasets.values(), ignore_index=True)
data = data.sample(frac=1).reset_index(drop=True)
data.head()

Unnamed: 0,0,0.1,0.2,1,1.1,1.2,2,2.1,2.2,3,...,18,18.1,18.2,19,19.1,19.2,20,20.1,20.2,Letter
0,0.211717739701271,0.9767455458641052,0.0338186137378215,0.3350772559642792,0.6495612263679504,-0.0794727951288223,0.2409679293632507,0.2873315811157226,-0.0877183079719543,-0.0342430844902992,...,-0.2779718339443207,0.0723707303404808,-0.2070066034793853,-0.1903930604457855,0.1443158686161041,-0.2636958360671997,-0.1036644130945205,0.2782131731510162,-0.3136678636074066,W
1,0.2378988564014434,0.9710900187492372,-0.0197066981345415,0.3232544660568237,0.5948726534843445,-0.0493496842682361,0.2877626419067383,0.1963613331317901,-0.0561339370906353,0.1292117834091186,...,-0.397878348827362,0.1620409339666366,-0.2172173708677292,-0.3221866190433502,0.273086816072464,-0.1809097379446029,-0.2279220521450042,0.3547385931015014,-0.1338849365711212,Z
2,-0.2009689211845398,0.751974880695343,0.6278098821640015,0.065991923213005,0.3326540887355804,0.5956430435180664,0.2725592851638794,-0.1122938841581344,0.4567232429981231,0.3784763216972351,...,0.5222320556640625,0.613762617111206,-0.2137074023485183,0.7651430368423462,0.582495927810669,-0.1896494328975677,0.9263443350791932,0.530733585357666,-0.1745564937591552,J
3,-0.2009121775627136,0.979238212108612,0.0269603505730628,0.1195202842354774,0.8078165650367737,-0.1117513924837112,0.4115965366363525,0.5200336575508118,-0.1732639968395233,0.6729715466499329,...,-0.4504903554916382,-0.1702746003866195,-0.196145698428154,-0.3891325294971466,-0.0536008104681968,-0.1739224642515182,-0.3532902002334595,0.0792094022035598,-0.1340995728969574,L
4,-0.4875460267066955,0.8718947172164917,0.0458091646432876,-0.12789186835289,0.8026905655860901,-0.0472122058272361,0.2172506749629974,0.4997550547122955,-0.0841501727700233,0.198943942785263,...,-0.4127297401428222,-0.3878060281276703,-0.1659405678510666,-0.3909347355365753,-0.5976364016532898,-0.1667268425226211,-0.3465766310691833,-0.8242152333259583,-0.147739753127098,I


In [54]:
y = data['Letter']
X = data.drop(columns=['Letter'])

In [None]:
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.3, random_state=42)

# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_val = scaler.transform(X_val)
# X_test = scaler.transform(X_test)
X_train = X_train.apply(pd.to_numeric, errors='coerce')
X_train = X_train.dropna()
X_train = X_train.to_numpy()
X_val = X_val.apply(pd.to_numeric, errors='coerce')
X_val = X_val.dropna()
X_val = X_val.to_numpy()
X_test = X_test.apply(pd.to_numeric, errors='coerce')
X_test = X_test.dropna()
X_test = X_test.to_numpy()


le = LabelEncoder()
y_train = le.fit_transform(y_train)  
y_val = le.transform(y_val)  
y_test = le.transform(y_test)

In [56]:
X_train.shape

(6502, 63)

In [57]:
# Define model function
def create_keras_model():
    model = keras.Sequential([
        keras.layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
        keras.layers.BatchNormalization(),
        keras.layers.Dropout(0.3),  # Helps prevent overfitting

        keras.layers.Dense(64, activation='relu'),
        keras.layers.BatchNormalization(),
        keras.layers.Dropout(0.3),

        keras.layers.Dense(32, activation='relu'),

        keras.layers.Dense(26, activation='softmax')  # 26 classes
    ])

    # Compile the model
    model.compile(optimizer='adam',
                loss='sparse_categorical_crossentropy',  # Use if labels are integers (0-25)
                metrics=['accuracy'])
    return model


In [58]:
keras_model = create_keras_model()
keras_model.fit(X_train, y_train, epochs=20, batch_size=32, verbose=0, validation_data=(X_val, y_val))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


<keras.src.callbacks.history.History at 0x1ce4eba8a70>

In [59]:
val_loss, val_acc = keras_model.evaluate(X_val, y_val, verbose=0)
print(f"Final Validation Accuracy: {val_acc:.4f}")

Final Validation Accuracy: 0.9921


In [60]:
test_loss, test_acc = keras_model.evaluate(X_test, y_test, verbose=0)
print(f"Final Test Accuracy: {test_acc:.4f}")

Final Test Accuracy: 0.9875


In [61]:
keras_model.save(f"models/{language}/keras_ANN.keras")

In [62]:
def create_xgb_model():
    model = xgb.XGBClassifier(
        objective="multi:softmax",  # Multi-class classification
        num_class=26,               # 26 different classes
        eval_metric="mlogloss",      # Multiclass log loss
        max_depth=6,                 # Depth of trees (adjust based on experiments)
        learning_rate=0.1,           # Step size shrinkage (adjust for tuning)
        n_estimators=100,            # Number of trees (increase for better results)
        use_label_encoder=False      # Required for newer versions of XGBoost
    )
    return model

In [63]:
xgb_model = create_xgb_model()
xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True)

Parameters: { "use_label_encoder" } are not used.



[0]	validation_0-mlogloss:2.29116
[1]	validation_0-mlogloss:1.91630
[2]	validation_0-mlogloss:1.66046
[3]	validation_0-mlogloss:1.46657
[4]	validation_0-mlogloss:1.30844
[5]	validation_0-mlogloss:1.17836
[6]	validation_0-mlogloss:1.06737
[7]	validation_0-mlogloss:0.97171
[8]	validation_0-mlogloss:0.88723
[9]	validation_0-mlogloss:0.81215
[10]	validation_0-mlogloss:0.74622
[11]	validation_0-mlogloss:0.68657
[12]	validation_0-mlogloss:0.63387
[13]	validation_0-mlogloss:0.58552
[14]	validation_0-mlogloss:0.54217
[15]	validation_0-mlogloss:0.50267
[16]	validation_0-mlogloss:0.46628
[17]	validation_0-mlogloss:0.43319
[18]	validation_0-mlogloss:0.40361
[19]	validation_0-mlogloss:0.37625
[20]	validation_0-mlogloss:0.35135
[21]	validation_0-mlogloss:0.32841
[22]	validation_0-mlogloss:0.30761
[23]	validation_0-mlogloss:0.28842
[24]	validation_0-mlogloss:0.27112
[25]	validation_0-mlogloss:0.25515
[26]	validation_0-mlogloss:0.24053
[27]	validation_0-mlogloss:0.22698
[28]	validation_0-mlogloss:0.2

In [64]:
# Make predictions
y_pred_val = xgb_model.predict(X_val)

# Compute accuracy
accuracy = accuracy_score(y_val, y_pred_val)
print(f"Evaluation Accuracy: {accuracy:.4f}")

Evaluation Accuracy: 0.9885


In [65]:
# Make predictions
y_pred_test = xgb_model.predict(X_test)

# Compute accuracy
accuracy = accuracy_score(y_test, y_pred_test)
print(f"Test Accuracy: {accuracy:.4f}")

Test Accuracy: 0.9828


In [66]:
xgb_model.save_model(f"models/{language}/xgboost_model.json")

In [67]:
X_test

array([[ 0.07117779,  0.99634153,  0.04730054, ..., -0.23675866,
         0.39642996, -0.11813724],
       [-0.2177745 ,  0.97414345,  0.06015664, ..., -0.35216853,
         0.09757519, -0.07609792],
       [-0.14845969,  0.98777109,  0.04762341, ...,  0.68911678,
         0.10101629, -0.14755474],
       ...,
       [-0.3642002 ,  0.92572355, -0.10195162, ..., -0.45715696,
         0.17325392, -0.04867852],
       [-0.09692637,  0.99232531,  0.07678401, ..., -0.12298288,
         0.52345651, -0.13003486],
       [-0.1798263 ,  0.97497064,  0.13074669, ..., -0.48591673,
        -0.05828797, -0.12163033]])

In [68]:
X_test = X_test[0].reshape(1, -1)
X_test

array([[ 0.07117779,  0.99634153,  0.04730054,  0.27384338,  0.72468346,
        -0.11597569,  0.2595152 ,  0.37949711, -0.1675832 , -0.0186281 ,
         0.16900374, -0.21901874, -0.27504542,  0.0476688 , -0.25372189,
         0.23688878, -0.02123114,  0.00586876,  0.29533234, -0.43121073,
        -0.10775971,  0.31384465, -0.69979954, -0.17469797,  0.32242078,
        -0.92468435, -0.20549747,  0.        ,  0.        ,  0.        ,
        -0.10455015, -0.46015909, -0.12994802, -0.19490489, -0.73943728,
        -0.19671227, -0.27647993, -0.95855331, -0.20890903, -0.18620028,
         0.12179077, -0.03532141, -0.29292914, -0.04322465, -0.25835791,
        -0.18804888,  0.17595395, -0.28775382, -0.0998941 ,  0.33021483,
        -0.22406849, -0.33081546,  0.29185092, -0.08118614, -0.43182147,
         0.1150532 , -0.22437821, -0.33023369,  0.26445407, -0.19043458,
        -0.23675866,  0.39642996, -0.11813724]])

In [69]:
t = xgb_model.predict(X_test[0].reshape(1, -1))

In [70]:
t[0]

21