In [184]:
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import tensorflow as tf
import xgboost as xgb
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from tensorflow import keras

In [185]:
# Read one landmark CSV per letter A-Z into a dict keyed by that letter.
datasets = dict(
    (chr(letter), pd.read_csv(f'processed_data_mp01021/{chr(letter)}_Data.csv'))
    for letter in range(ord('A'), ord('Z') + 1)
)

In [186]:
# Peek at the first five rows of the raw 'A' frame.
datasets['A'].iloc[:5]

Unnamed: 0,Landmarks,0,0.1,0.2,1,1.1,1.2,2,2.1,2.2,...,17.2,18,18.1,18.2,19,19.1,19.2,20,20.1,20.2
0,Coordinates,x,y,z,x,y,z,x,y,z,...,z,x,y,z,x,y,z,x,y,z
1,Frame,,,,,,,,,,...,,,,,,,,,,
2,1,-0.10191626101732254,0.9925596117973328,0.06662236154079437,0.2784864604473114,0.8289384841918945,-0.07882609963417053,0.6251492500305176,0.3535401523113251,-0.11874441802501678,...,-0.09184916317462921,-0.28185784816741943,0.07575207948684692,-0.28906363248825073,-0.2340974658727646,0.4078732132911682,-0.257875919342041,-0.22813771665096283,0.6210938096046448,-0.17181435227394104
3,2,-0.11209727078676224,0.99217689037323,0.05494748428463936,0.25864800810813904,0.7781158685684204,-0.08616439998149872,0.5868735313415527,0.33302342891693115,-0.13564430177211761,...,-0.07887183874845505,-0.29231786727905273,0.06680270284414291,-0.23227861523628235,-0.2517852783203125,0.38482457399368286,-0.17785505950450897,-0.248565673828125,0.5935962796211243,-0.07973474264144897
4,3,0.025396086275577545,0.9988321661949158,0.04110104590654373,0.3819988965988159,0.7106916308403015,-0.09989170730113983,0.6305642127990723,0.20811787247657776,-0.14538881182670593,...,-0.09832348674535751,-0.27134260535240173,0.0629710778594017,-0.28379887342453003,-0.20002099871635437,0.370392769575119,-0.237660214304924,-0.16897597908973694,0.6012681722640991,-0.13964223861694336


In [187]:
# Clean each per-letter frame: drop rows 0 and 1 (the 'Coordinates'/'Frame'
# sub-header rows visible in the preview of 'A'), drop the 'Landmarks' column,
# tag every row with its letter, then shuffle the rows.
for key, frame in datasets.items():
    # Idempotence guard: a frame that already carries 'Letter' was cleaned by
    # an earlier run of this cell. Cleaning it again would drop two real data
    # rows and then raise KeyError on the missing 'Landmarks' column.
    if 'Letter' in frame.columns:
        continue
    cleaned = frame.drop(index=[0, 1]).drop(columns=['Landmarks'])
    cleaned['Letter'] = key
    # Seeded shuffle so this step produces the same row order on every run.
    datasets[key] = cleaned.sample(frac=1, random_state=42).reset_index(drop=True)

In [188]:
# Peek at the first five rows of the cleaned 'B' frame.
datasets['B'].iloc[:5]

Unnamed: 0,0,0.1,0.2,1,1.1,1.2,2,2.1,2.2,3,...,18,18.1,18.2,19,19.1,19.2,20,20.1,20.2,Letter
0,-0.2207265943288803,0.9706849455833436,0.095134362578392,0.0217392612248659,0.7742661833763123,-0.084729753434658,0.1724144667387008,0.4414305686950683,-0.1481259763240814,0.0067980336025357,...,-0.3365214765071869,-0.1492941677570343,-0.1120033040642738,-0.3100425004959106,-0.3395440876483917,-0.1541227996349334,-0.2848142981529236,-0.5114002227783203,-0.1858625560998916,B
1,-0.2024701833724975,0.97892028093338,0.0268497876822948,0.0514155067503452,0.7501966953277588,-0.0784542709589004,0.1866015344858169,0.3737694919109344,-0.0968809872865676,0.0606736242771148,...,-0.2968710064888,-0.1418548822402954,-0.1349426209926605,-0.286550760269165,-0.3353334963321686,-0.1688061356544494,-0.2958759069442749,-0.5093653798103333,-0.1981541216373443,B
2,-0.0712907910346984,0.9955935478210448,0.0609191320836544,0.1572277396917343,0.7893714904785156,-0.0647079572081565,0.268802672624588,0.4028640389442444,-0.0947571247816085,0.12796550989151,...,-0.2835652828216553,-0.0757291838526725,-0.1586699187755584,-0.3036950528621673,-0.2650743722915649,-0.2053084671497345,-0.3452711999416351,-0.4360535144805908,-0.2394093126058578,B
3,-0.1520264744758606,0.9869832396507264,0.0524604059755802,0.093762382864952,0.7781801819801331,-0.0761689916253089,0.241716593503952,0.3992753326892853,-0.1033237874507904,0.108475774526596,...,-0.3137594759464264,-0.1075241938233375,-0.1442088633775711,-0.3078997731208801,-0.2955151796340942,-0.1899056434631347,-0.3138746619224548,-0.4701763093471527,-0.2260983884334564,B
4,-0.2542685270309448,0.9648824334144592,0.0659497827291488,0.0082327248528599,0.7797635197639465,-0.0483196750283241,0.1721972376108169,0.4189990758895874,-0.0797091126441955,0.0797830447554588,...,-0.2905497550964355,-0.139352485537529,-0.1513841897249221,-0.2710233628749847,-0.3216945230960846,-0.1821233183145523,-0.2713964283466339,-0.488657146692276,-0.209016278386116,B


In [189]:
# Stack all per-letter frames into a single table, then shuffle the rows.
data = pd.concat(datasets.values(), ignore_index=True)
# Seed the shuffle so this step does not add run-to-run randomness of its own;
# the train/test splits further down use a fixed random_state, which only
# helps when the row order fed into them is stable.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)
data.head()

Unnamed: 0,0,0.1,0.2,1,1.1,1.2,2,2.1,2.2,3,...,18,18.1,18.2,19,19.1,19.2,20,20.1,20.2,Letter
0,-0.0814270824193954,0.992375671863556,0.0925208404660224,0.2214431911706924,0.7998363375663757,0.1128211468458175,0.4199620485305786,0.508604884147644,0.0838860124349594,0.5590741634368896,...,-0.2238252460956573,-0.1812013834714889,-0.3240731656551361,-0.0393175929784774,-0.3214283287525177,-0.3725730776786804,0.1363149136304855,-0.3283331990242004,-0.3902526795864105,C
1,-0.0188851058483123,0.9858821630477904,0.1663723289966583,0.251800537109375,0.7381811738014221,0.0287408102303743,0.3737737238407135,0.2936705350875854,-0.0271249711513519,0.2091236263513565,...,-0.208985298871994,0.0627595409750938,-0.2911833524703979,-0.0805940702557563,0.2509395778179168,-0.2752704322338104,-0.0693030282855033,0.4208160638809204,-0.2123123407363891,U
2,-0.4363015294075012,0.8898440599441528,0.1334859132766723,-0.1635554730892181,0.7757006883621216,-0.046483363956213,0.0827075466513633,0.5467744469642639,-0.1437119096517563,0.1164417713880539,...,-0.4168643951416015,-0.0602707751095294,-0.2210739403963089,-0.3934091925621032,0.137828066945076,-0.1763492971658706,-0.3785313367843628,0.2643734216690063,-0.0964627563953399,Z
3,0.5751295685768127,0.8037475943565369,0.1523670703172683,0.6690853834152222,0.4786679148674011,-0.0041204215958714,0.5347392559051514,0.1114061400294303,-0.0632250159978866,0.2505123019218445,...,0.0179674532264471,0.3368670642375946,-0.278446614742279,0.2501538395881653,0.3949203491210937,-0.2933789789676666,0.4171738922595978,0.4737691879272461,-0.2447002381086349,H
4,-0.2935364842414856,0.9515162110328674,0.0919413119554519,-0.0334199108183383,0.7704428434371948,-0.0914120525121688,0.1522606611251831,0.4557228684425354,-0.1560986787080764,0.0201210472732782,...,-0.3272735476493835,-0.1836843788623809,-0.1099257692694664,-0.2873494923114776,-0.3707571923732757,-0.1521399021148681,-0.249105766415596,-0.5426187515258789,-0.1828824132680893,B


In [190]:
# Separate the feature columns from the 'Letter' target column.
X = data.drop('Letter', axis=1)
y = data['Letter']

In [191]:
def _numeric_xy(features, labels):
    """Coerce ``features`` to numeric and drop incomplete rows from both inputs.

    Args:
        features: DataFrame of feature columns (values may be strings).
        labels: Series of labels sharing the index of ``features``.

    Returns:
        Tuple ``(X, y)``. ``X`` is a NumPy array holding the rows that contain
        no NaN after coercion; ``y`` is the matching subset of ``labels``.
    """
    numeric = features.apply(pd.to_numeric, errors='coerce')
    # Same rows DataFrame.dropna() would keep, but the mask is applied to the
    # labels too, so X and y can never end up with different lengths.
    complete = numeric.notna().all(axis=1)
    return numeric.loc[complete].to_numpy(), labels.loc[complete]


# Hold out 20% as the test set, then split the remainder 70/30 into train/val.
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.3, random_state=42)

# No feature scaling is applied; the features are only converted to floats.
X_train, y_train = _numeric_xy(X_train, y_train)
X_val, y_val = _numeric_xy(X_val, y_val)
X_test, y_test = _numeric_xy(X_test, y_test)

# Encode the letter labels as integer class ids; the encoder is fitted on the
# training labels and reused for the validation and test labels.
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_val = le.transform(y_val)
y_test = le.transform(y_test)

In [207]:
# (rows, features) of the training matrix.
np.shape(X_train)

(14728, 63)

In [193]:
# Define model function
def create_keras_model(input_dim=None, num_classes=26):
    """Build and compile the dense softmax classifier.

    Architecture: Dense(128) -> BatchNorm -> Dropout(0.3) -> Dense(64) ->
    BatchNorm -> Dropout(0.3) -> Dense(32) -> Dense(num_classes, softmax).

    Args:
        input_dim: Number of input features. When omitted, falls back to
            ``X_train.shape[1]`` from the notebook namespace, which is what
            the function always used before this parameter existed.
        num_classes: Width of the softmax output layer (default 26).

    Returns:
        A compiled ``keras.Sequential`` model using the Adam optimizer,
        sparse categorical cross-entropy loss and an accuracy metric.
    """
    if input_dim is None:
        input_dim = X_train.shape[1]

    model = keras.Sequential([
        # Explicit Input layer rather than Dense(..., input_shape=...); recent
        # Keras versions warn about the latter in Sequential models.
        keras.Input(shape=(input_dim,)),
        keras.layers.Dense(128, activation='relu'),
        keras.layers.BatchNormalization(),
        keras.layers.Dropout(0.3),  # Helps prevent overfitting

        keras.layers.Dense(64, activation='relu'),
        keras.layers.BatchNormalization(),
        keras.layers.Dropout(0.3),

        keras.layers.Dense(32, activation='relu'),

        keras.layers.Dense(num_classes, activation='softmax'),
    ])

    # Sparse loss: labels are integer class ids, not one-hot vectors.
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model


In [194]:
# Seed Python, NumPy and the Keras backend before the model is created so that
# weight initialisation, dropout and batch shuffling are repeatable (as far as
# the backend allows) from one run to the next.
keras.utils.set_random_seed(42)
keras_model = create_keras_model()
keras_model.fit(X_train, y_train, epochs=20, batch_size=32, verbose=0, validation_data=(X_val, y_val))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


<keras.src.callbacks.history.History at 0x2429ce69880>

In [195]:
# Score the trained network on the validation split; evaluate() yields the
# loss followed by the compiled accuracy metric.
val_loss, val_acc = keras_model.evaluate(x=X_val, y=y_val, verbose=0)
print(f"Final Validation Accuracy: {val_acc:.4f}")

Final Validation Accuracy: 0.9967


In [196]:
# Score the trained network on the held-out test split; evaluate() yields the
# loss followed by the compiled accuracy metric.
test_loss, test_acc = keras_model.evaluate(x=X_test, y=y_test, verbose=0)
print(f"Final Test Accuracy: {test_acc:.4f}")

Final Test Accuracy: 0.9973


In [197]:
# Make sure the output directory exists so saving also works on a fresh
# checkout where models/ has not been created yet.
os.makedirs("models", exist_ok=True)
keras_model.save("models/keras_ANN.keras")

In [198]:
def create_xgb_model(num_classes=26):
    """Create an untrained multi-class XGBoost classifier.

    Args:
        num_classes: Number of target classes passed to XGBoost as
            ``num_class`` (default 26).

    Returns:
        An unfitted ``xgb.XGBClassifier`` configured with the softmax
        objective, multi-class log loss as the evaluation metric, depth-6
        trees, learning rate 0.1 and 100 estimators.
    """
    # `use_label_encoder` is intentionally not passed: the installed XGBoost
    # ignores it and prints a "Parameters ... are not used" warning.
    model = xgb.XGBClassifier(
        objective="multi:softmax",  # Multi-class classification
        num_class=num_classes,      # Number of target classes
        eval_metric="mlogloss",     # Multiclass log loss
        max_depth=6,                # Depth of trees (adjust based on experiments)
        learning_rate=0.1,          # Step size shrinkage (adjust for tuning)
        n_estimators=100,           # Number of trees
    )
    return model

In [199]:
# Fit the boosted-tree classifier, logging the validation metric each round.
xgb_model = create_xgb_model()
xgb_model.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    verbose=True,
)

[0]	validation_0-mlogloss:2.09876
[1]	validation_0-mlogloss:1.73070


Parameters: { "use_label_encoder" } are not used.



[2]	validation_0-mlogloss:1.47860
[3]	validation_0-mlogloss:1.28654
[4]	validation_0-mlogloss:1.13236
[5]	validation_0-mlogloss:1.00366
[6]	validation_0-mlogloss:0.89470
[7]	validation_0-mlogloss:0.80094
[8]	validation_0-mlogloss:0.71922
[9]	validation_0-mlogloss:0.64742
[10]	validation_0-mlogloss:0.58404
[11]	validation_0-mlogloss:0.52768
[12]	validation_0-mlogloss:0.47744
[13]	validation_0-mlogloss:0.43262
[14]	validation_0-mlogloss:0.39246
[15]	validation_0-mlogloss:0.35652
[16]	validation_0-mlogloss:0.32410
[17]	validation_0-mlogloss:0.29496
[18]	validation_0-mlogloss:0.26857
[19]	validation_0-mlogloss:0.24479
[20]	validation_0-mlogloss:0.22324
[21]	validation_0-mlogloss:0.20384
[22]	validation_0-mlogloss:0.18632
[23]	validation_0-mlogloss:0.17048
[24]	validation_0-mlogloss:0.15599
[25]	validation_0-mlogloss:0.14294
[26]	validation_0-mlogloss:0.13112
[27]	validation_0-mlogloss:0.12034
[28]	validation_0-mlogloss:0.11053
[29]	validation_0-mlogloss:0.10160
[30]	validation_0-mlogloss:0

In [200]:
# Predict class ids for the validation split and report the share that match.
y_pred_val = xgb_model.predict(X_val)
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred_val)
print(f"Evaluation Accuracy: {accuracy:.4f}")

Evaluation Accuracy: 0.9978


In [201]:
# Predict class ids for the held-out test split and report the share that match.
y_pred_test = xgb_model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_test)
print(f"Test Accuracy: {accuracy:.4f}")

Test Accuracy: 0.9970


In [202]:
# Make sure the output directory exists so saving also works on a fresh
# checkout where models/ has not been created yet.
os.makedirs("models", exist_ok=True)
xgb_model.save_model("models/xgboost_model.json")

In [203]:
# Display the test feature matrix (NumPy abbreviates large arrays with "...").
X_test

array([[-1.02237448e-01,  9.89239454e-01,  1.04655616e-01, ...,
         1.74758155e-02,  3.82115394e-01, -2.23726198e-01],
       [ 8.40613768e-02,  9.83648717e-01,  1.59276232e-01, ...,
         2.97183084e-04,  5.75120151e-01, -2.29534343e-01],
       [-2.50038236e-01,  9.68009591e-01, -2.09370777e-02, ...,
        -4.39779162e-01, -8.43960881e-01, -1.76155865e-01],
       ...,
       [-9.67272893e-02,  9.87501383e-01,  1.24438114e-01, ...,
         2.39147376e-02,  3.89859766e-01, -1.93353578e-01],
       [-1.09583504e-01,  9.92640495e-01,  5.15393503e-02, ...,
        -3.62617791e-01,  3.03899199e-01,  9.57416545e-04],
       [ 9.88415837e-01,  3.68006006e-02,  1.47240788e-01, ...,
         1.91839971e-02,  7.13500738e-01, -1.13028035e-01]])

In [204]:
# Pull out the first test row as a 1-row batch under its own name. Rebinding
# X_test here would leave a single-row array behind, and re-running any of the
# evaluation cells afterwards would pair it with the full y_test.
X_sample = X_test[0].reshape(1, -1)
X_sample

array([[-0.10223745,  0.98923945,  0.10465562,  0.17893231,  0.79261518,
        -0.01392179,  0.34385887,  0.36800259, -0.06897634,  0.30114663,
         0.04339335, -0.13323432,  0.13481562, -0.15997395, -0.17533274,
         0.16651745,  0.01042212,  0.02770009,  0.12728798, -0.43586773,
        -0.09187035,  0.08827616, -0.6980114 , -0.15954748,  0.0390434 ,
        -0.9147734 , -0.20138982,  0.        ,  0.        ,  0.        ,
         0.03888666, -0.47333592, -0.16008905,  0.14496537, -0.72616202,
        -0.22094873,  0.20420197, -0.9234761 , -0.23718078, -0.1604946 ,
         0.05212369, -0.04848878,  0.01976278, -0.2745901 , -0.26545796,
         0.14106901, -0.0029481 , -0.24873146,  0.15021388,  0.20543793,
        -0.17611538, -0.26615071,  0.16940615, -0.10952435,  0.007667  ,
         0.03699892, -0.27983391,  0.07229839,  0.23496749, -0.27070817,
         0.01747582,  0.38211539, -0.2237262 ]])

In [205]:
# Predict the class id for the first test row, passed as a 1-row batch.
t = xgb_model.predict(X_test[:1])

In [206]:
# Integer class id predicted for the single row. `le`, the LabelEncoder fitted
# on the training labels, can map it back to a letter via inverse_transform.
t[0]

17