### Import and install required dependencies

In [None]:
#install dependencies
#!pip install tensorflow==2.13.0rc1 opencv-python scikit-learn matplotlib

In [None]:
#install mediapipe
#!pip install mediapipe

In [None]:
#import openCV
import cv2 

#import numPy
import numpy as np 

#import os
import os 

#import matplotlib
from matplotlib import pyplot as plt

#import time
import time 

#import mediaPipe
import mediapipe as mp 

# 1. Keypoint Extraction using MediaPipe

##### Creating variables and functions for keypoint Extraction

In [None]:
# Short aliases for the two MediaPipe sub-modules used throughout the notebook:
# the drawing utilities and the Holistic solution.
mediapipe_drawing = mp.solutions.drawing_utils
mediapipe_holistic = mp.solutions.holistic

In [None]:
# creating detection function
def detection_function(image, model):
    """Run ``model`` on a single BGR frame.

    Args:
        image: frame in BGR channel order (it is converted with
            ``cv2.COLOR_BGR2RGB`` before being handed to the model).
        model: object exposing ``process(rgb_image)``.

    Returns:
        Tuple ``(image, detected_landmarks)``: the frame converted back to BGR
        and the value returned by ``model.process``.
    """
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # The buffer is flagged read-only only for the duration of the prediction
    rgb_frame.flags.writeable = False
    detected_landmarks = model.process(rgb_frame)
    rgb_frame.flags.writeable = True

    return cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR), detected_landmarks

In [None]:
# function to visualize the landmarks using 'mediapipe_drawing' variable
def draw_styled_landmarks(image, detected_landmarks):
    """Draw the left- and right-hand landmarks of ``detected_landmarks`` onto ``image``.

    Each hand is drawn with ``mediapipe_drawing.draw_landmarks`` using the
    ``HAND_CONNECTIONS`` map and its own pair of drawing specs. Nothing is
    returned.

    Args:
        image: frame handed to ``draw_landmarks``.
        detected_landmarks: object exposing ``left_hand_landmarks`` and
            ``right_hand_landmarks``.
    """
    make_spec = mediapipe_drawing.DrawingSpec

    # (attribute holding the hand, colour of the first spec, colour of the second spec)
    hand_styles = (
        ('left_hand_landmarks', (1,255,255), (255,15,10)),
        ('right_hand_landmarks', (5,255,3), (9,9,255)),
    )

    for attribute, first_color, second_color in hand_styles:
        mediapipe_drawing.draw_landmarks(
            image,
            getattr(detected_landmarks, attribute),
            mediapipe_holistic.HAND_CONNECTIONS,
            make_spec(color=first_color, thickness=2, circle_radius=4),
            make_spec(color=second_color, thickness=2, circle_radius=2),
        )

##### Extracting keypoints values from captured video frames

In [None]:
# access webcam (video capture device 0)
cam = cv2.VideoCapture(0)

try:
    # Set mediapipe model
    with mediapipe_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:

        # Runs until 'q' is pressed or the camera stops delivering frames
        while cam.isOpened():

            # Read feed; a failed read returns no frame, so stop instead of
            # passing it on to the detection step
            return_value, frame = cam.read()
            if not return_value:
                break

            # Only successfully read frames are kept, so the cells below can
            # still inspect the last valid frame
            image_frame = frame

            # Make detections: get the 'image' and 'detected_landmarks'
            image, detected_landmarks = detection_function(image_frame, holistic)

            # Draw the landmarks
            draw_styled_landmarks(image, detected_landmarks)

            # Show on screen
            cv2.imshow('OpenCV window', image)

            # Quit when 'q' is pressed
            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
finally:
    # Always free the camera and close the window, even if a frame raised an error
    cam.release()
    cv2.destroyAllWindows()

In [None]:
#accessing the last frame to display landmark values (left-hand)
detected_landmarks.left_hand_landmarks

In [None]:
# length of the detected landmarks (left hand)
len(detected_landmarks.left_hand_landmarks.landmark)

In [None]:
# latest video frame in the form of an array
image_frame

In [None]:
# applying draw_styled_landmarks to current frame
draw_styled_landmarks(image_frame, detected_landmarks)

In [None]:
# visualize the current captured frame in RGB format using matplotlib
plt.imshow(cv2.cvtColor(image_frame, cv2.COLOR_BGR2RGB))

##### Store extracted Keypoints into numPy array 

In [None]:
# Flatten each hand's (x, y, z) landmark values into a 1-D array; when a hand
# is not available, use a zero vector of length 21*3 instead.
if detected_landmarks.left_hand_landmarks:
    left_hand = np.array(
        [[point.x, point.y, point.z] for point in detected_landmarks.left_hand_landmarks.landmark]
    ).flatten()
else:
    left_hand = np.zeros(21*3)

if detected_landmarks.right_hand_landmarks:
    right_hand = np.array(
        [[point.x, point.y, point.z] for point in detected_landmarks.right_hand_landmarks.landmark]
    ).flatten()
else:
    right_hand = np.zeros(21*3)

In [None]:
# displaying keypoint values for left hand
left_hand

In [None]:
# shape of left_hand array # 21*3 = 63
left_hand.shape 

In [None]:
# keypoint values for right hand 
right_hand

In [None]:
# shape of right_hand array shape #np.zeros(21*3) = 63
right_hand.shape 

###### Function to extract keypoints and concatenate into a single array 

In [None]:
# function to extract keypoints and concatenate into single array 
def mediapipe_keypoints(detected_landmarks):
    """Return the left- and right-hand keypoints as one flat array.

    Args:
        detected_landmarks: object exposing ``left_hand_landmarks`` and
            ``right_hand_landmarks``; each is either falsy or has a
            ``landmark`` iterable whose items carry ``x``, ``y`` and ``z``.

    Returns:
        1-D numpy array: the flattened left-hand values followed by the
        flattened right-hand values. A missing hand contributes ``21*3`` zeros.
    """
    def flatten_hand(hand_landmarks):
        # Missing hand -> zero vector, so the output keeps its layout
        if not hand_landmarks:
            return np.zeros(21*3)
        coordinates = [[point.x, point.y, point.z] for point in hand_landmarks.landmark]
        return np.array(coordinates).flatten()

    left_hand = flatten_hand(detected_landmarks.left_hand_landmarks)
    right_hand = flatten_hand(detected_landmarks.right_hand_landmarks)
    return np.concatenate([left_hand, right_hand])

In [None]:
#checking the final shape of the concatenated array
mediapipe_keypoints(detected_landmarks).shape 

#expected result: 
# (left-hand keypoints * (x,y,z co-ordinates)) + (right-hand keypoints * (x,y,z co-ordinates))
# 21*3 + 21*3 = 126

In [None]:
# storing the resultant array in a variable
total_keypoints = mediapipe_keypoints(detected_landmarks)

In [None]:
# Displaying the concatenated array
total_keypoints

# 2. Create Datasets for BSL fingerspelling

##### Set variables for Dataset creation

In [None]:
# Root folder for the exported keypoint data (numpy arrays)
DATA_PATH = os.path.join('MP_Data')

# BSL fingerspelling alphabet: one class per letter, A to Z
alphabets = np.array(list('ABCDEFGHIJKLMNOPQRSTUVWXYZ'))

# Number of video sequences recorded for each alphabet
no_sequences = 30

# Number of frames in each video sequence
sequence_length = 20


##### Create Folders for Datasets

In [None]:
# create one folder per (alphabet, sequence) pair under DATA_PATH
for alphabet in alphabets:
    for sequence in range(no_sequences):
        # exist_ok=True makes the cell safe to re-run, while genuine failures
        # (e.g. a permission error) are raised instead of silently swallowed
        os.makedirs(os.path.join(DATA_PATH, alphabet, str(sequence)), exist_ok=True)

##### Collecting datasets using openCV and mediaPipe

In [None]:
# access webcam (video capture device 1)
cam = cv2.VideoCapture(1)

# Becomes True when the user presses 'q' or the camera stops delivering frames.
# A plain `break` only leaves the innermost loop, so this flag is what lets the
# sequence and alphabet loops terminate as well.
stop_collection = False

try:
    # Set mediapipe model
    with mediapipe_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:

        # Loop through each alphabet
        for alphabet in alphabets:
            # Loop through each video sequence
            for sequence in range(no_sequences):
                # Loop through sequence length of each video
                for frame_number in range(sequence_length):

                    # Read feed; stop cleanly when no frame is delivered
                    return_value, image_frame = cam.read()
                    if not return_value:
                        print('Could not read a frame from the camera; stopping data collection.')
                        stop_collection = True
                        break

                    # Make detections
                    image, detected_landmarks = detection_function(image_frame, holistic)

                    # Draw landmarks
                    draw_styled_landmarks(image, detected_landmarks)

                    # The first frame of every sequence also gets a start banner
                    if frame_number == 0:
                        cv2.putText(image, 'STARTING COLLECTION', (120,200),
                                   cv2.FONT_HERSHEY_SIMPLEX, 1, (0,255, 0), 4, cv2.LINE_AA)
                    cv2.putText(image, 'Collecting frames for {} Video Number {}'.format(alphabet, sequence), (15,12),
                               cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv2.LINE_AA)

                    # Show to screen
                    cv2.imshow('OpenCV Data Collection', image)
                    if frame_number == 0:
                        # Keep the start banner on screen for 2 seconds before continuing
                        cv2.waitKey(2000)

                    # Export keypoints (np.save appends the .npy extension)
                    keypoints = mediapipe_keypoints(detected_landmarks)
                    npy_path = os.path.join(DATA_PATH, alphabet, str(sequence), str(frame_number))
                    np.save(npy_path, keypoints)

                    # Quit on 'q'
                    if cv2.waitKey(10) & 0xFF == ord('q'):
                        stop_collection = True
                        break

                if stop_collection:
                    break
            if stop_collection:
                break
finally:
    # Always free the camera and close the window, even after an error
    cam.release()
    cv2.destroyAllWindows()

In [None]:
cam.release()
cv2.destroyAllWindows()

##### labeling datasets

In [None]:
# create a label dictionary mapping each alphabet to its integer class index
alphabet_labels = dict(zip(alphabets, range(len(alphabets))))

In [None]:
#display labels
alphabet_labels

##### Combine all data together

In [None]:
# Bring all saved frames together:
#   sequences[i] -> list with the keypoint array of every frame of one video
#   labels[i]    -> integer class index of that video's alphabet
sequences, labels = [], []
for alphabet in alphabets:
    for sequence in range(no_sequences):
        sequence_dir = os.path.join(DATA_PATH, alphabet, str(sequence))
        window = [
            np.load(os.path.join(sequence_dir, "{}.npy".format(frame_number)))
            for frame_number in range(sequence_length)
        ]
        sequences.append(window)
        labels.append(alphabet_labels[alphabet])

In [None]:
# checking shape of final array
np.array(sequences).shape

In [None]:
# checking shape of the labels
np.array(labels).shape 

##### preprocess data for training

In [None]:
# import dependencies for splitting dataset and convert data using one-hot encoding
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.utils import to_categorical

In [None]:
# storing the sequences in 'X'
X = np.array(sequences)

In [None]:
# checking shape of 'X'
X.shape

In [None]:
# converting the labels into binary flat using one-hot encoding
Y = to_categorical(labels).astype(int)

In [None]:
Y

In [None]:
Y.shape

##### Split dataset into Train and Test categories

In [None]:
# splitting the dataset into training and testing (training data = 90%, testing data = 10%)
# - stratify=labels keeps every alphabet represented in both splits; the
#   per-class metric cells further down index their scores by class position
# - random_state fixes the shuffle so the split is reproducible after a kernel restart
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.1, stratify=labels, random_state=42
)

In [None]:
# Fit a MinMaxScaler on the training data only, then apply the same fitted
# scaler to the test data. Each array is reshaped to 2-D (rows x last-axis
# size) for the scaler and reshaped back to its original shape afterwards.
# NOTE(review): X_train_scaled / X_test_scaled are not passed to the
# model.fit / model.predict cells in this notebook, which receive
# X_train / X_test — confirm whether the scaled arrays were meant to be used.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train.reshape(-1, X_train.shape[-1])).reshape(X_train.shape)
X_test_scaled = scaler.transform(X_test.reshape(-1, X_test.shape[-1])).reshape(X_test.shape)

In [None]:
# checking the shapes of training and testing data after the splitting of datasets
X_train.shape

In [None]:
X_test.shape 

In [None]:
Y_train.shape

In [None]:
Y_test.shape

# 3. Training dataset using LSTM

##### Build LSTM architecture

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.layers import LeakyReLU
from tensorflow.keras.optimizers import legacy as keras_legacy_optimizer

In [None]:
#storing TensorBoard logs
log_dir = os.path.join('Logs') 

#integrating TensorBoard with the model training process.
tb_callback = TensorBoard(log_dir=log_dir)

In [None]:
# initializing an empty Sequential model
model = Sequential()

# Input: one timestep per frame of a sequence (sequence_length) and 126 values
# per frame (21*3 for the left hand + 21*3 for the right hand). Using
# sequence_length instead of a literal keeps the model in step with the dataset settings.
# First LSTM layer: 64 units, 'tanh' activation, returns the full sequence
model.add(LSTM(64, return_sequences=True, activation='tanh', input_shape=(sequence_length, 126)))
# Second LSTM layer: 128 units, 'tanh' activation, returns the full sequence
model.add(LSTM(128, return_sequences=True, activation='tanh'))
# Third LSTM layer: 64 units, 'tanh' activation, returns only the last output
model.add(LSTM(64, return_sequences=False, activation='tanh'))

# dense layer with 64 units
model.add(Dense(64, activation='tanh'))
# dense layer with 32 units
model.add(Dense(32, activation='tanh'))
# output layer: one unit per category (alphabet) with 'softmax' activation
model.add(Dense(alphabets.shape[0], activation='softmax'))

In [None]:
#checking the number of outputs in the final layer
alphabets.shape[0] 

In [None]:
res = [0.7, 0.2, 0.1]

In [None]:
alphabets[np.argmax(res)]

In [None]:
#configuring the training process of the model
model.compile(optimizer=keras_legacy_optimizer.Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['categorical_accuracy'])

In [None]:
#display the summary of the LSTM model built
model.summary()

##### Train LSTM Neural Network

In [None]:
# training model. 
model.fit(X_train, Y_train, epochs=2000, callbacks=[tb_callback])

##### save model

In [None]:
# saving the model
model.save('action.h5')

In [None]:
# loading the saved model
model.load_weights('action.h5')

# 4. Evaluation and Testing

##### Make predictions

In [None]:
# storing results inside 'res' variable
res = model.predict(X_test)

In [None]:
alphabets[np.argmax(res[0])]

In [None]:
alphabets[np.argmax(Y_test[0])]

In [None]:
alphabets[np.argmax(res[1])]

In [None]:
alphabets[np.argmax(Y_test[1])]

In [None]:
alphabets[np.argmax(res[2])]

In [None]:
alphabets[np.argmax(Y_test[2])]

In [None]:
alphabets[np.argmax(res[3])]

In [None]:
alphabets[np.argmax(Y_test[3])]

##### Test data

In [None]:
# making predictions on test data using trained LSTM model
ypredict = model.predict(X_test)

In [None]:
# extracting true labels from 'Y_test' (index of the 1 in each one-hot row)
ytrue = np.argmax(Y_test, axis=1).tolist()

# extracting predicted labels: index of the highest predicted probability per
# test sample. Predicting here, instead of overwriting a probability array
# stored under the same name, keeps this cell safe to re-run.
ypredict = np.argmax(model.predict(X_test), axis=1).tolist()

##### Evaluate using confusion matrix

In [None]:
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score

import seaborn as sns

In [None]:
# calculating multilabel confusion matrix
multilabel_confusion_matrix(ytrue, ypredict)

In [None]:
# Define the class labels (same order as the integer class indices)
class_labels = alphabets.tolist()

# storing confusion matrix in variable
# labels= pins one 2x2 matrix per class, in class order, so matrix i always
# belongs to class_labels[i] even if a class is absent from the test split
multilabel_cm = multilabel_confusion_matrix(
    ytrue, ypredict, labels=list(range(len(class_labels)))
)

# Create a function to plot a confusion matrix heatmap
def plot_confusion_matrix(conf_matrix, class_labels):
    """Plot one small heatmap per class on a 5x6 grid of subplots.

    Args:
        conf_matrix: iterable of per-class confusion matrices; this notebook
            passes the output of sklearn's ``multilabel_confusion_matrix``.
        class_labels: sequence whose i-th entry names the class of the i-th matrix.

    The matrices follow the scikit-learn layout (rows = true labels, columns =
    predicted labels). A heatmap puts rows on the y axis and columns on the
    x axis, hence y is labelled "True" and x "Predicted".
    """
    plt.figure(figsize=(10, 8))
    for i, cm in enumerate(conf_matrix):
        plt.subplot(5, 6, i + 1)
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False)
        plt.title(f"Class {class_labels[i]}")
        plt.xlabel("Predicted")
        plt.ylabel("True")
    plt.tight_layout()
    plt.show()

# Plot the confusion matrix heatmap
plot_confusion_matrix(multilabel_cm, class_labels)

In [None]:
confusion_matrix(ytrue, ypredict)

In [None]:
# variable storing confusion matrix
# labels= fixes the matrix to one row/column per alphabet, in alphabet order,
# so it always lines up with the tick labels below
class_indices = list(range(len(alphabets)))
cm = confusion_matrix(ytrue, ypredict, labels=class_indices)

# Visualize the confusion matrix using a heatmap.
# scikit-learn puts true labels on the rows (heatmap y axis) and predicted
# labels on the columns (heatmap x axis).
tick_labels = alphabets.tolist()
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=tick_labels,
            yticklabels=tick_labels)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()

##### Evaluate using evaluation metrics (accuracy, f1-score, precision, recall)

In [None]:
# calculating the accuracy, F1, precision, recall for all classes
accuracy = accuracy_score(ytrue, ypredict) 
f1 = f1_score(ytrue, ypredict, average='macro')
precision = precision_score(ytrue, ypredict, average='macro')
recall = recall_score(ytrue, ypredict, average='macro')

print(f"accuracy: {accuracy}")
print(f"F1-score: {f1}")
print(f"precision-score: {precision}")
print(f"recall-score: {recall}")

In [None]:
import pandas as pd

# Creating a DataFrame to store the metrics
metrics_df = pd.DataFrame({
    'Metric': ['Accuracy', 'F1-Score', 'Precision', 'Recall'],
    'Score': [accuracy, f1, precision, recall]
})

# Plotting the metrics using Seaborn
plt.figure(figsize=(8, 6))
sns.barplot(x='Metric', y='Score', data=metrics_df, palette='viridis')
plt.title('Overall Performance Metrics')
plt.ylim(0, 1)
plt.show()

In [None]:
# Calculating precision score for each class (alphabet)
# labels= guarantees one score per alphabet, in alphabet order, so indexing by
# idx below stays valid even if a class is missing from the test split;
# zero_division=0 scores a class that was never predicted as 0 without a warning
precision_scores = precision_score(
    ytrue, ypredict, labels=list(range(len(alphabets))), average=None, zero_division=0
)

# Printing precision scores for each class
for idx, alphabet in enumerate(alphabets):
    print(f"Precision for Alphabet {alphabet}: {precision_scores[idx]:.4f}")

In [None]:
import matplotlib.pyplot as plt
# Plotting the bar graph
plt.figure(figsize=(10, 6))
plt.bar(alphabets, precision_scores)
plt.xlabel('Alphabets')
plt.ylabel('Precision Score')
plt.title('Precision Scores for Each Alphabet')
plt.xticks(rotation=45)
plt.tight_layout()

# Display the plot
plt.show()

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

# Per-class accuracy: for each alphabet, the fraction of its test samples that
# were predicted as that alphabet.
alphabet_accuracy = {}

for idx, alphabet in enumerate(alphabets):
    # predictions made for the samples whose true class is this alphabet
    class_predictions = [pred for true, pred in zip(ytrue, ypredict) if true == idx]
    if class_predictions:
        class_accuracy = sum(pred == idx for pred in class_predictions) / len(class_predictions)
    else:
        # no test sample of this class: the score is undefined
        class_accuracy = float('nan')
    alphabet_accuracy[alphabet] = class_accuracy

# Printing accuracy for each class
# (loop names differ from the global `accuracy`, which holds the overall score
# and is read by the metrics DataFrame cell)
for alphabet, class_accuracy in alphabet_accuracy.items():
    print(f"Accuracy for Alphabet {alphabet}: {class_accuracy:.4f}")

# Plotting the bar graph
plt.figure(figsize=(10, 6))
plt.bar(list(alphabet_accuracy.keys()), list(alphabet_accuracy.values()))
plt.xlabel('Alphabets')
plt.ylabel('Accuracy Score')
plt.title('Accuracy Scores for Each Alphabet')
plt.xticks(rotation=45)
plt.tight_layout()

# Display the plot
plt.show()

In [None]:
# Calculating F1 score for each class (alphabet)
# labels= guarantees one score per alphabet, in alphabet order, so indexing by
# idx below stays valid even if a class is missing from the test split;
# zero_division=0 scores an undefined F1 as 0 without a warning
f1_scores = f1_score(
    ytrue, ypredict, labels=list(range(len(alphabets))), average=None, zero_division=0
)

# Printing F1 scores for each class
for idx, alphabet in enumerate(alphabets):
    print(f"F1 Score for Alphabet {alphabet}: {f1_scores[idx]:.4f}")

In [None]:
import matplotlib.pyplot as plt
# Plotting the bar graph
plt.figure(figsize=(10, 6))
plt.bar(alphabets, f1_scores)
plt.xlabel('Alphabets')
plt.ylabel('F1-scores')
plt.title('F1-score for Each Alphabet')
plt.xticks(rotation=45)
plt.tight_layout()

# Display the plot
plt.show()

In [None]:
# Calculating recall score for each class (alphabet)
# labels= guarantees one score per alphabet, in alphabet order, so indexing by
# idx below stays valid even if a class is missing from the test split;
# zero_division=0 scores a class with no test samples as 0 without a warning
recall_scores = recall_score(
    ytrue, ypredict, labels=list(range(len(alphabets))), average=None, zero_division=0
)

# Printing recall scores for each class
for idx, alphabet in enumerate(alphabets):
    print(f"Recall for Alphabet {alphabet}: {recall_scores[idx]:.4f}")

In [None]:
import matplotlib.pyplot as plt
# Plotting the bar graph
plt.figure(figsize=(10, 6))
plt.bar(alphabets, recall_scores)
plt.xlabel('Alphabets')
plt.ylabel('Recall scores')
plt.title('Recall score for Each Alphabet')
plt.xticks(rotation=45)
plt.tight_layout()

# Display the plot
plt.show()

##### Train Data

In [None]:
# making predictions on train data using trained model
# yhat = model.predict(X_train)

In [None]:
# ytrue = np.argmax(Y_train, axis=1).tolist()
# yhat = np.argmax(yhat, axis=1).tolist()

# 5. Realtime testing

In [None]:
# function to perform real-time testing
# Bar colours for the probability overlay: a 6-colour palette repeated to give
# 26 entries (one per alphabet)
_palette = [(255,153,153), (255,178,102), (255, 255, 102), (153, 255,51), (51,153,255), (255,153,204)]
colors = (_palette * 5)[:26]

def prob_viz(res, alphabets, input_frame, colors):
    """Return a copy of ``input_frame`` with one probability bar per class.

    Args:
        res: iterable of probabilities, one per class.
        alphabets: sequence of class names; ``alphabets[i]`` labels bar ``i``.
        input_frame: frame to annotate; it is copied, not modified.
        colors: non-empty sequence of colour tuples; reused cyclically when
            there are more classes than colours.

    Returns:
        The annotated copy. Bar ``i`` starts 25 + 25*i pixels from the top and
        is ``int(prob*100)`` pixels wide.
    """
    output_frame = input_frame.copy()
    for num, prob in enumerate(res):
        top = 25 + num*25
        # Wrap around the palette so extra classes cannot raise IndexError
        bar_color = colors[num % len(colors)]
        cv2.rectangle(output_frame, (0, top), (int(prob*100), top + 20), bar_color, -1)
        cv2.putText(output_frame, alphabets[num], (0, top + 15), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0,0,0), 1, cv2.LINE_AA)

    return output_frame

In [None]:
# making real time predictions
sequence = []    # keypoints of the most recent frames (sliding window)
sentence = []    # recently recognised alphabets, shown in the top banner
threshold = 0.9  # minimum probability for a prediction to be accepted

cam = cv2.VideoCapture(0)
try:
    # Set mediapipe model
    with mediapipe_holistic.Holistic(min_detection_confidence=0.9, min_tracking_confidence=0.9) as holistic:
        while cam.isOpened():

            # Read feed; stop cleanly when no frame is delivered
            return_value, image_frame = cam.read()
            if not return_value:
                break

            # Make detections
            image, detected_landmarks = detection_function(image_frame, holistic)

            # Draw landmarks
            draw_styled_landmarks(image, detected_landmarks)

            # Keep a sliding window of keypoints. The model is built and trained
            # on windows of `sequence_length` frames, so the window fed to
            # predict must have exactly that length.
            keypoints = mediapipe_keypoints(detected_landmarks)
            sequence.append(keypoints)
            sequence = sequence[-sequence_length:]

            if len(sequence) == sequence_length:
                # verbose=0 keeps Keras from printing a progress bar for every frame
                res = model.predict(np.expand_dims(sequence, axis=0), verbose=0)[0]
                predicted_alphabet = alphabets[np.argmax(res)]
                print(predicted_alphabet)

                # Accept a confident prediction unless it repeats the last accepted alphabet
                if res[np.argmax(res)] > threshold:
                    if not sentence or predicted_alphabet != sentence[-1]:
                        sentence.append(predicted_alphabet)

                # Show at most the 5 most recent alphabets
                sentence = sentence[-5:]

                # displaying probability bar
                image = prob_viz(res, alphabets, image, colors)

            cv2.rectangle(image, (0,0), (1280, 20), (245, 117, 16), -1)
            cv2.putText(image, ' '.join(sentence), (3,20),
                           cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1, cv2.LINE_AA)

            # displaying on screen
            cv2.imshow('Test model', image)

            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
finally:
    # Always free the camera and close the window, even after an error
    cam.release()
    cv2.destroyAllWindows()

In [None]:
plt.figure(figsize=(18,18))
# The frame comes from OpenCV in BGR order; convert it to RGB (as the earlier
# matplotlib preview cell does) so the colours are displayed correctly
plt.imshow(cv2.cvtColor(prob_viz(res, alphabets, image, colors), cv2.COLOR_BGR2RGB))