# 1- Determine the words that we will work on

In [2]:
# libraries
import os
import cv2
import mediapipe as mp
import numpy as np
import pickle
import matplotlib.pyplot as plt
import random

In [7]:
# Build the list of class (word) names: one per entry of the "english"
# directory, keeping only the part of the entry name before its first dot.
class_names = [entry.split(".")[0] for entry in os.listdir("english")]
print(class_names)

['A', 'About', 'After', 'At', 'B', 'Before', 'Black', 'C', 'Can', 'Choose', 'Coffee', 'Cold', 'Congratulation', 'D', 'Doctor', 'Drink', 'E', 'Eight', 'F', 'Favourite', 'Five', 'Four', 'Friday', 'G', 'Go', 'Goodbye', 'H', 'Happy', 'Has_or_Have', 'Hearing_Aid', 'Hello', 'Help', 'How', 'I', 'I_Love_You', 'I_or_Me', 'J', 'K', 'L', 'Late', 'Live', 'Love', 'M', 'Monday', 'Month', 'My', 'My_Self', 'N', 'Name', 'Near', 'Nice', 'No', 'Now_or_Today', 'O', 'One', 'P', 'Professor', 'Q', 'R', 'Red', 'S', 'Saturday', 'Seven', 'Sit', 'Sorry', 'Stand', 'Sunday', 'T', 'Ten', 'Thank_You', 'Then', 'This', 'Three', 'Ticket', 'To', 'Tuesday', 'Two', 'U', 'V', 'W', 'Warm', 'Weather', 'Week', 'What', 'When', 'Where', 'white', 'Work', 'X', 'Y', 'Yellow', 'Yes', 'You', 'Your']


## Create folder for each class in english data directory

In [7]:
# Create one sub-folder per class inside "english_data".
# os.makedirs(..., exist_ok=True) also creates "english_data" itself when it is
# missing and keeps the cell safe to re-run (os.mkdir raises FileExistsError
# as soon as a folder already exists).
for class_name in os.listdir("english"):
    os.makedirs(os.path.join("english_data", class_name.split(".")[0]), exist_ok=True)

# 2- Collect images for each class [word] -> using web camera
- We will collect 2000 images for each class and store them in the folders created in the `english_data` directory

In [8]:
# MediaPipe helpers: the hands solution, the drawing utilities and the default drawing styles
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles

# build the hand-landmark detector used by every later cell
# (static image mode, minimum detection confidence of 0.5)
hands = mp_hands.Hands(static_image_mode=True, min_detection_confidence=0.5)

In [277]:
# setup my camera [my camera has the index (0)]
my_camera = cv2.VideoCapture(0)

# counter gives each saved frame a unique file name
counter = 0

# try/finally guarantees that the camera and the preview window are released
# even when an error interrupts the capture loop
try:
    # frames are taken from the camera until [q] is pressed or 2000 frames are saved
    while my_camera.isOpened():

        # status -> bool telling whether a frame was grabbed
        # frame  -> the captured BGR image (np.ndarray)
        status, frame = my_camera.read()
        if not status:
            # no frame was delivered, so frame is None; stop here instead of
            # crashing inside cv2.flip
            break

        # mirror the frame; the clean mirrored frame is what gets saved,
        # the copy is only used for the preview with the landmarks drawn on it
        frame = cv2.flip(frame, 1)
        frame_copy = np.copy(frame)

        # the hand detector is fed an RGB image
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        result = hands.process(frame_rgb)

        if result.multi_hand_landmarks:
            for hand_landmark in result.multi_hand_landmarks:
                # draw the landmarks on the preview copy
                mp_drawing.draw_landmarks(
                    frame_copy,
                    hand_landmark,
                    mp_hands.HAND_CONNECTIONS,
                    mp_drawing_styles.get_default_hand_landmarks_style(),
                    mp_drawing_styles.get_default_hand_connections_style())

        # display the captured frame
        cv2.imshow('Captured Frame', frame_copy)

        # every frame is saved (no key press needed)
        counter += 1
        cv2.imwrite(os.path.join("english_data", "Cold", f"new_left_Cold_frame_{counter}.jpg"), frame)

        # stop on pressing [q] or once 2000 frames have been written
        if (cv2.waitKey(1) & 0xFF == ord('q')) or counter == 2000:
            break
finally:
    my_camera.release()
    cv2.destroyAllWindows()

## Remove images in which the Google (MediaPipe) model detects no hands

In [278]:
# Delete every image of the "Cold" class in which the hand detector finds no hand.
cold_dir = os.path.join("english_data", "Cold")
for i in os.listdir(cold_dir):
    image_path = os.path.join(cold_dir, i)
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread returns None for files it cannot decode; leave such
        # files alone instead of crashing in cv2.cvtColor
        continue

    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    result = hands.process(image_rgb)

    if not result.multi_hand_landmarks:
        # report and remove the image without any detected hand
        print(i)
        os.remove(image_path)

## Randomly remove frames until only 2000 remain

In [151]:
# Randomly delete files of the "Yes" class until at most 2000 remain.
# The directory is listed once and all surplus files are drawn in a single
# random.sample call (sampling without replacement), instead of re-listing
# the directory twice for every removed file.
yes_dir = os.path.join("english_data", "Yes")
frame_names = os.listdir(yes_dir)
excess = len(frame_names) - 2000
if excess > 0:
    for random_image in random.sample(frame_names, k=excess):
        os.remove(os.path.join(yes_dir, random_image))

## Check that 2 hands words are detected correctly

In [1]:
# list of all 2 hands words
l = ["Sunday", "Has_or_Have", "Can", "Live", "Love", "Sit", "Doctor", "Stand", "Work", "What", "When",
     "Ticket", "To", "Then", "This", "How", "Near", "Now_or_Today", "Professor", "Name", "Month", "Happy",
     "Help", "About", "After", "At", "Coffee", "Cold"]

# For every two-hand class, delete the images in which hands are detected but
# the flattened x/y landmark list does not contain exactly 84 values.
# Images without any detection are left untouched by this cell.
for two_hand_class_name in l:
    two_hand_class_dir = os.path.join("english_data", two_hand_class_name)
    for i in os.listdir(two_hand_class_dir):
        image_path = os.path.join(two_hand_class_dir, i)
        image = cv2.imread(image_path)
        if image is None:
            # unreadable file: cv2.imread returned None, skip it instead of
            # crashing in cv2.cvtColor
            continue

        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        result = hands.process(image_rgb)
        if not result.multi_hand_landmarks:
            continue

        # flatten the x/y coordinates of every landmark of every detected hand
        extracted_frame_features = []
        for hand_landmark in result.multi_hand_landmarks:
            for landmark in hand_landmark.landmark:
                extracted_frame_features.append(landmark.x)
                extracted_frame_features.append(landmark.y)

        if len(extracted_frame_features) != 84:
            print(i)
            print(len(extracted_frame_features))
            os.remove(image_path)
    

## Check the balancing of the dataset

In [2]:
# Print the name and the number of files of every non-empty class folder.
for class_name in os.listdir(os.path.join('english_data')):
    file_count = len(os.listdir(os.path.join('english_data', class_name)))
    if file_count == 0:
        continue
    print(class_name)
    print(file_count)

# 3- Extract Features from the images

## Process : extract features and save it in .npy file for each image

In [3]:
# For every .jpg image of every non-empty class folder: detect the hands,
# flatten the x/y landmark coordinates into one list, zero-pad it to 84 values
# and store it next to the image as "<image_name>_features.npy"
# (np.save appends the ".npy" extension).
for class_name in os.listdir(os.path.join('english_data')):
    class_dir = os.path.join('english_data', class_name)
    image_names = os.listdir(class_dir)
    if len(image_names) == 0:
        continue

    print(f"------------------------------------- {class_name} ---------------------------------")
    for image_name in image_names:
        if image_name.split(".")[-1] != "jpg":
            continue
        print(image_name)

        # bgr image; cv2.imread returns None when the file cannot be decoded
        image = cv2.imread(os.path.join(class_dir, image_name))
        if image is None:
            continue

        # the detector is fed an rgb image
        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        result = hands.process(image_rgb)

        # images without any detected hand produce no feature file
        if not result.multi_hand_landmarks:
            continue

        # x and y value of each landmark of each detected hand
        current_image_landmarks = []
        for hand_landmark in result.multi_hand_landmarks:
            for landmark in hand_landmark.landmark:
                current_image_landmarks.append(landmark.x)
                current_image_landmarks.append(landmark.y)

        # pad with zeros so every image yields the same number of features
        if len(current_image_landmarks) < 84:
            current_image_landmarks = current_image_landmarks + [0]*(84-len(current_image_landmarks))

        # save the extracted image features as a numpy file
        save_path = os.path.join(class_dir, f"{image_name}_features")
        np.save(save_path, current_image_landmarks)
                    

## Create label map

In [12]:
# Names of all non-empty class folders, in the order os.listdir returns them.
# NOTE(review): the label indices built from this list depend on that order.
classes = [
    class_name
    for class_name in os.listdir(os.path.join('english_data'))
    if len(os.listdir(os.path.join('english_data', class_name))) != 0
]

print(len(classes))
print(classes)

90
['A', 'About', 'After', 'At', 'B', 'Before', 'Black', 'C', 'Can', 'Choose', 'Coffee', 'Cold', 'D', 'Doctor', 'Drink', 'E', 'Eight', 'F', 'Favourite', 'Five', 'Four', 'Friday', 'G', 'Goodbye', 'H', 'Happy', 'Has_or_Have', 'Hearing_Aid', 'Hello', 'Help', 'How', 'I', 'I_Love_You', 'I_or_Me', 'J', 'K', 'L', 'Late', 'Live', 'Love', 'M', 'Monday', 'Month', 'My', 'My_Self', 'N', 'Name', 'Near', 'No', 'Now_or_Today', 'O', 'One', 'P', 'Professor', 'Q', 'R', 'S', 'Saturday', 'Seven', 'Sit', 'Sorry', 'Stand', 'Sunday', 'T', 'Ten', 'Thank_You', 'Then', 'This', 'Three', 'Ticket', 'To', 'Tuesday', 'Two', 'U', 'V', 'W', 'Warm', 'Weather', 'Week', 'What', 'When', 'Where', 'White', 'Work', 'X', 'Y', 'Yellow', 'Yes', 'You', 'Your']


In [13]:
# map every class name to its position in the classes list
class_to_index = dict(zip(classes, range(len(classes))))
print(class_to_index)

{'A': 0, 'About': 1, 'After': 2, 'At': 3, 'B': 4, 'Before': 5, 'Black': 6, 'C': 7, 'Can': 8, 'Choose': 9, 'Coffee': 10, 'Cold': 11, 'D': 12, 'Doctor': 13, 'Drink': 14, 'E': 15, 'Eight': 16, 'F': 17, 'Favourite': 18, 'Five': 19, 'Four': 20, 'Friday': 21, 'G': 22, 'Goodbye': 23, 'H': 24, 'Happy': 25, 'Has_or_Have': 26, 'Hearing_Aid': 27, 'Hello': 28, 'Help': 29, 'How': 30, 'I': 31, 'I_Love_You': 32, 'I_or_Me': 33, 'J': 34, 'K': 35, 'L': 36, 'Late': 37, 'Live': 38, 'Love': 39, 'M': 40, 'Monday': 41, 'Month': 42, 'My': 43, 'My_Self': 44, 'N': 45, 'Name': 46, 'Near': 47, 'No': 48, 'Now_or_Today': 49, 'O': 50, 'One': 51, 'P': 52, 'Professor': 53, 'Q': 54, 'R': 55, 'S': 56, 'Saturday': 57, 'Seven': 58, 'Sit': 59, 'Sorry': 60, 'Stand': 61, 'Sunday': 62, 'T': 63, 'Ten': 64, 'Thank_You': 65, 'Then': 66, 'This': 67, 'Three': 68, 'Ticket': 69, 'To': 70, 'Tuesday': 71, 'Two': 72, 'U': 73, 'V': 74, 'W': 75, 'Warm': 76, 'Weather': 77, 'Week': 78, 'What': 79, 'When': 80, 'Where': 81, 'White': 82, 'Wor

## Collect all the features in one list called "all_features" with their labels in "all_labels"

In [4]:
# Gather the saved feature vectors of every class together with their label index.
all_features = []
all_labels = []

for class_name in os.listdir(os.path.join('english_data')):
    print(f"-------------------------------- {class_name} -----------------------------")
    class_dir = os.path.join('english_data', class_name)
    for numpy_file in os.listdir(class_dir):
        # only the .npy feature files are of interest here
        if numpy_file.split(".")[-1] != "npy":
            continue
        print(numpy_file)
        # load the feature vector and record the index of its class
        all_features.append(np.load(os.path.join(class_dir, numpy_file)))
        all_labels.append(class_to_index[class_name])

## Save all_features and all_labels lists to external file called "english_data.pickle"

In [15]:
# save all_features and all_labels in the file english_data.pickle;
# the with-block closes the file even when pickle.dump raises
with open('english_data.pickle', 'wb') as data_file:
    pickle.dump({'all_features':all_features, 'all_labels':all_labels}, data_file)

## Load all_features and all_labels to use them

In [4]:
# load the values from the file; the with-block closes it even when loading fails
# NOTE: pickle.load executes code embedded in the file - only load files you created yourself
with open('english_data.pickle', 'rb') as data_file:
    data = pickle.load(data_file)

all_features = data['all_features']
all_labels = data['all_labels']

## Convert all_features and all_labels to arrays

In [5]:
# turn the python lists into numpy arrays
all_features_array = np.array(all_features)
all_labels_array = np.array(all_labels)

# report the type of both arrays first, then their shapes
for converted in (all_features_array, all_labels_array):
    print(type(converted))

for converted in (all_features_array, all_labels_array):
    print(converted.shape)

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(180000, 84)
(180000,)


# 4- Dataset Splitting

In [36]:
from sklearn.model_selection import train_test_split

In [37]:
# Hold out 20% of the samples for testing, stratified on the labels.
# random_state pins the shuffle so the very same split is reproduced when the
# notebook is re-run; without it, a model loaded from disk would be evaluated
# on a different random split that can contain rows it was trained on.
x_train, x_test, y_train, y_test = train_test_split(
    all_features_array,
    all_labels_array,
    test_size=0.2,
    shuffle=True,
    stratify=all_labels_array,
    random_state=42,
)

# 5- Model Building 

In [38]:
from sklearn.ensemble import RandomForestClassifier

In [39]:
# random forest classifier created without arguments, i.e. with the library's default hyper-parameters
model_random_forest = RandomForestClassifier()

In [40]:
# train the classifier on the training split
model_random_forest.fit(x_train, y_train)

## Save the model 

In [41]:
# save the model; the with-block closes the file even when pickle.dump raises
with open('english_model.pickle', 'wb') as model_file:
    pickle.dump({'model':model_random_forest}, model_file)

## Load the model

In [3]:
# load model; the with-block closes the file (the handle used to stay open)
# NOTE: pickle.load executes code embedded in the file - only load model files you trust
with open('english_model.pickle', 'rb') as model_file:
    model_dict = pickle.load(model_file)
model = model_dict['model']

# 6-Model Evaluation 

In [27]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, precision_score, recall_score, f1_score, classification_report

## Calculate y_predict

In [5]:
# predict the class index of every sample of the test split
y_predict = model.predict(x_test)
print(y_predict)

## Calculate confusion matrix

In [6]:
# confusion matrix of true vs. predicted labels, printed one row per line
cm = confusion_matrix(y_test, y_predict)
for row in cm:
    print(row)

## Calculate Accuracy

In [7]:
# share of test samples whose predicted label equals the true label
accuracy = accuracy_score(y_test, y_predict)
print(accuracy)

0.9986388888888889


## Calculate Precision

In [20]:
# precision with average='micro'
# NOTE(review): the recorded output is identical to the accuracy above;
# consider average='macro' if a per-class view is wanted - confirm intent
precision = precision_score(y_test, y_predict, average='micro')
print(precision)

0.9986388888888889


## Calculate F1-Score

In [21]:
# F1 score with average='micro' (recorded output identical to the accuracy above)
f1 = f1_score(y_test, y_predict, average='micro')
print(f1)

0.9986388888888889


## Calculate Recall

In [22]:
# recall with average='micro' (recorded output identical to the accuracy above)
recall = recall_score(y_test, y_predict, average='micro')
print(recall)

0.9986388888888889


## Classification Report

In [23]:
# text report with precision, recall, f1-score and support for every class index
report = classification_report(y_test, y_predict)
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       400
           1       1.00      1.00      1.00       400
           2       1.00      1.00      1.00       400
           3       1.00      1.00      1.00       400
           4       1.00      1.00      1.00       400
           5       1.00      1.00      1.00       400
           6       1.00      1.00      1.00       400
           7       1.00      1.00      1.00       400
           8       1.00      1.00      1.00       400
           9       1.00      1.00      1.00       400
          10       1.00      1.00      1.00       400
          11       1.00      1.00      1.00       400
          12       1.00      1.00      1.00       400
          13       1.00      1.00      1.00       400
          14       1.00      1.00      1.00       400
          15       1.00      0.99      1.00       400
          16       1.00      1.00      1.00       400
          17       1.00    

# 7- Test model on videos

## Load google model

In [2]:
# MediaPipe hands solution module
mp_hands = mp.solutions.hands

# hand-landmark detector (static image mode, minimum detection confidence of 0.5)
hands = mp_hands.Hands(static_image_mode=True, min_detection_confidence=0.5)

## Load my model

In [3]:
# load model; the with-block closes the file even when pickle.load raises
# NOTE: pickle.load executes code embedded in the file - only load model files you trust
with open('english_model.pickle', 'rb') as model_file:
    model_dict = pickle.load(model_file)
model = model_dict['model']

## Create label map

In [4]:
# Names of all non-empty class folders, in the order os.listdir returns them.
classes = [
    class_name
    for class_name in os.listdir(os.path.join('english_data'))
    if len(os.listdir(os.path.join('english_data', class_name))) != 0
]
print(classes)

['A', 'About', 'After', 'At', 'B', 'Before', 'Black', 'C', 'Can', 'Choose', 'Coffee', 'Cold', 'D', 'Doctor', 'Drink', 'E', 'Eight', 'F', 'Favorite', 'Five', 'Four', 'Friday', 'G', 'Goodbye', 'H', 'Happy', 'Has_or_Have', 'Hearing_Aid', 'Hello', 'Help', 'How', 'I', 'I_Love_You', 'I_or_Me', 'J', 'K', 'L', 'Late', 'Live', 'Love', 'M', 'Monday', 'Month', 'My', 'My_Self', 'N', 'Name', 'Near', 'No', 'Now_or_Today', 'O', 'One', 'P', 'Professor', 'Q', 'R', 'S', 'Saturday', 'Seven', 'Sit', 'Sorry', 'Stand', 'Sunday', 'T', 'Ten', 'Thank_You', 'Then', 'This', 'Three', 'Ticket', 'To', 'Tuesday', 'Two', 'U', 'V', 'W', 'Warm', 'Weather', 'Week', 'What', 'When', 'Where', 'White', 'Work', 'X', 'Y', 'Yellow', 'Yes', 'You', 'Your']


In [5]:
# map every position in the classes list back to its class name
index_to_class = dict(enumerate(classes))
print(index_to_class)

{0: 'A', 1: 'About', 2: 'After', 3: 'At', 4: 'B', 5: 'Before', 6: 'Black', 7: 'C', 8: 'Can', 9: 'Choose', 10: 'Coffee', 11: 'Cold', 12: 'D', 13: 'Doctor', 14: 'Drink', 15: 'E', 16: 'Eight', 17: 'F', 18: 'Favorite', 19: 'Five', 20: 'Four', 21: 'Friday', 22: 'G', 23: 'Goodbye', 24: 'H', 25: 'Happy', 26: 'Has_or_Have', 27: 'Hearing_Aid', 28: 'Hello', 29: 'Help', 30: 'How', 31: 'I', 32: 'I_Love_You', 33: 'I_or_Me', 34: 'J', 35: 'K', 36: 'L', 37: 'Late', 38: 'Live', 39: 'Love', 40: 'M', 41: 'Monday', 42: 'Month', 43: 'My', 44: 'My_Self', 45: 'N', 46: 'Name', 47: 'Near', 48: 'No', 49: 'Now_or_Today', 50: 'O', 51: 'One', 52: 'P', 53: 'Professor', 54: 'Q', 55: 'R', 56: 'S', 57: 'Saturday', 58: 'Seven', 59: 'Sit', 60: 'Sorry', 61: 'Stand', 62: 'Sunday', 63: 'T', 64: 'Ten', 65: 'Thank_You', 66: 'Then', 67: 'This', 68: 'Three', 69: 'Ticket', 70: 'To', 71: 'Tuesday', 72: 'Two', 73: 'U', 74: 'V', 75: 'W', 76: 'Warm', 77: 'Weather', 78: 'Week', 79: 'What', 80: 'When', 81: 'Where', 82: 'White', 83: '

## Function: take image and output label and its probability

In [6]:
# define method to predict image with the random forest model
# image -> label, probability of it

def model_image_predict(image):
    """Predict the sign shown in a single BGR image.

    The image is converted to RGB and passed to the module-level ``hands``
    detector. The x/y coordinates of every detected landmark are flattened
    into one feature row, zero-padded to 84 values, and classified by the
    module-level ``model``.

    Args:
        image: BGR image array, e.g. from ``cv2.imread`` or ``VideoCapture.read``.

    Returns:
        ``{'class': <predicted label>, 'probability': <its probability>}``
        when at least one hand is detected, otherwise ``None``.
    """
    # the detector is fed an rgb image
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    result = hands.process(image_rgb)

    # nothing to classify when no hand was detected
    if not result.multi_hand_landmarks:
        return None

    # x and y value of each landmark of each detected hand
    current_image_landmarks = []
    for hand_landmark in result.multi_hand_landmarks:
        for landmark in hand_landmark.landmark:
            current_image_landmarks.append(landmark.x)
            current_image_landmarks.append(landmark.y)

    # pad with zeros so the row has the same length as the training rows
    if len(current_image_landmarks) < 84:
        current_image_landmarks = current_image_landmarks + [0]*(84-len(current_image_landmarks))

    # the model expects a 2d array: one row per sample
    all_features_array = np.array([current_image_landmarks])

    # run the forest once: the predicted label is the class with the highest
    # probability, looked up by position in model.classes_ (calling predict()
    # as well would evaluate the whole forest a second time)
    probabilities = model.predict_proba(all_features_array)[0]
    best_position = int(np.argmax(probabilities))

    return {'class':model.classes_[best_position], 'probability':probabilities[best_position]}

## Test model prediction on images

In [7]:
# run the prediction function on one saved image of class "A" and show the result and its type
image_path = os.path.join("english_data", "A", "new_left_A_frame_1.jpg")
image = cv2.imread(image_path)
prediction = model_image_predict(image)
print(prediction)
print(type(prediction))

{'class': 0, 'probability': 0.98}
<class 'dict'>


## Function: Convert the video to text
Pay attention to 2 important things:
- Frame Per Second [FPS]
- Video Status: 
    - 1 sign language 
    - Multiple sign language 

In [8]:
def video_to_text_prediction(video_path):
    """Translate the signs shown in a video file into text.

    About two frames per second are sampled from the video and classified
    with ``model_image_predict``. Predictions with a probability above 0.25
    are kept, consecutive repetitions of the same class are collapsed, and
    the remaining class names are joined with spaces.

    Args:
        video_path: path of the video file to read.

    Returns:
        The recognised class names separated by single spaces; an empty
        string when nothing was recognised.
    """
    # read the video
    video = cv2.VideoCapture(video_path)

    # sample one frame every half second (e.g. every 15th frame at 30 fps);
    # max(1, ...) avoids a ZeroDivisionError below when the reported frame
    # rate rounds down to 0
    fps = max(1, round(round(video.get(cv2.CAP_PROP_FPS))/2))

    # minimum probability a prediction needs to be accepted
    threshold = 0.25

    frame_counter = 0
    predicted_classes = []

    # try/finally releases the video even when a prediction raises
    try:
        while True:
            status, frame = video.read()
            if not status:
                # end of the video (or it could not be read)
                break

            # only every fps-th frame is used for prediction
            frame_counter += 1
            if frame_counter % fps != 0:
                continue

            # prediction is None when no hand was detected in the frame
            prediction = model_image_predict(frame)
            if prediction and prediction['probability'] > threshold:
                predicted_classes.append(index_to_class[prediction['class']])
    finally:
        video.release()

    # make unique classes [cancel consecutive repetition]
    previous_class = None
    final_classes = []
    for class_name in predicted_classes:
        if class_name != previous_class:
            final_classes.append(class_name)
            previous_class = class_name

    return " ".join(final_classes)

## Test model prediction on videos

In [9]:
# run the video-to-text function on one test video and print the recognised text
print(video_to_text_prediction(os.path.join("english_test_videos", "i_love_you.mp4")))

I_Love_You


# 8- Video to text model deployment

## Endpoint video to text

In [10]:
import mediapipe as mp
import cv2
import numpy as np
import pickle
import os
from flask import Flask, request, jsonify
from celery import Celery

In [8]:
app = Flask(__name__)

@app.route("/video_to_text_english", methods=['POST'])
def video_to_text():
    """Receive an uploaded video and return the recognised text as JSON.

    The video is expected in the multipart form field ``video``. It is saved
    into the ``uploaded`` folder and passed to ``video_to_text_prediction``.

    Returns:
        ``[{"text": <recognised text>}]`` on success, or
        ``[{"error": ...}]`` with status 400 when the upload has no usable
        file name.
    """
    # this is the video itself
    video = request.files['video']

    # The file name is client-controlled: keep only its last path component.
    # Backslashes are normalised first so a Windows-style path cannot smuggle
    # directories past the check.
    file_name = os.path.basename((video.filename or "").replace("\\", "/"))
    if not file_name or file_name in (".", ".."):
        return jsonify([{"error": "invalid or missing file name"}]), 400

    print("this is the path of the video", file_name)

    # save the video so the detection can read it from disk;
    # make sure the target folder exists first
    os.makedirs('uploaded', exist_ok=True)
    video_path = os.path.join('uploaded', file_name)
    video.save(video_path)

    # apply model on it
    output = video_to_text_prediction(video_path)

    return jsonify([{"text":output}])


app.run(port=5000)

# 9- Test in real time

In [34]:
import mediapipe as mp
import cv2
import numpy as np
import pickle
import os

# load model; the with-block closes the file even when pickle.load raises
# NOTE: pickle.load executes code embedded in the file - only load model files you trust
with open('english_model.pickle', 'rb') as model_file:
    model_dict = pickle.load(model_file)
model = model_dict['model']

In [35]:
# Names of all non-empty class folders, in the order os.listdir returns them.
classes = [
    class_name
    for class_name in os.listdir(os.path.join('english_data'))
    if len(os.listdir(os.path.join('english_data', class_name))) != 0
]
print(classes)

['A', 'About', 'After', 'At', 'B', 'Before', 'Black', 'C', 'Can', 'Choose', 'Coffee', 'Cold', 'D', 'Doctor', 'Drink', 'E', 'Eight', 'F', 'Favourite', 'Five', 'Four', 'Friday', 'G', 'Goodbye', 'H', 'Happy', 'Has_or_Have', 'Hearing_Aid', 'Hello', 'Help', 'How', 'I', 'I_Love_You', 'I_or_Me', 'J', 'K', 'L', 'Late', 'Live', 'Love', 'M', 'Monday', 'Month', 'My', 'My_Self', 'N', 'Name', 'Near', 'No', 'Now_or_Today', 'O', 'One', 'P', 'Professor', 'Q', 'R', 'S', 'Saturday', 'Seven', 'Sit', 'Sorry', 'Stand', 'Sunday', 'T', 'Ten', 'Thank_You', 'Then', 'This', 'Three', 'Ticket', 'To', 'Tuesday', 'Two', 'U', 'V', 'W', 'Warm', 'Weather', 'Week', 'What', 'When', 'Where', 'White', 'Work', 'X', 'Y', 'Yellow', 'Yes', 'You', 'Your']


In [36]:
# map every position in the classes list back to its class name
index_to_class = dict(enumerate(classes))
print(index_to_class)

{0: 'A', 1: 'About', 2: 'After', 3: 'At', 4: 'B', 5: 'Before', 6: 'Black', 7: 'C', 8: 'Can', 9: 'Choose', 10: 'Coffee', 11: 'Cold', 12: 'D', 13: 'Doctor', 14: 'Drink', 15: 'E', 16: 'Eight', 17: 'F', 18: 'Favourite', 19: 'Five', 20: 'Four', 21: 'Friday', 22: 'G', 23: 'Goodbye', 24: 'H', 25: 'Happy', 26: 'Has_or_Have', 27: 'Hearing_Aid', 28: 'Hello', 29: 'Help', 30: 'How', 31: 'I', 32: 'I_Love_You', 33: 'I_or_Me', 34: 'J', 35: 'K', 36: 'L', 37: 'Late', 38: 'Live', 39: 'Love', 40: 'M', 41: 'Monday', 42: 'Month', 43: 'My', 44: 'My_Self', 45: 'N', 46: 'Name', 47: 'Near', 48: 'No', 49: 'Now_or_Today', 50: 'O', 51: 'One', 52: 'P', 53: 'Professor', 54: 'Q', 55: 'R', 56: 'S', 57: 'Saturday', 58: 'Seven', 59: 'Sit', 60: 'Sorry', 61: 'Stand', 62: 'Sunday', 63: 'T', 64: 'Ten', 65: 'Thank_You', 66: 'Then', 67: 'This', 68: 'Three', 69: 'Ticket', 70: 'To', 71: 'Tuesday', 72: 'Two', 73: 'U', 74: 'V', 75: 'W', 76: 'Warm', 77: 'Weather', 78: 'Week', 79: 'What', 80: 'When', 81: 'Where', 82: 'White', 83: 

## Process

In [28]:
# MediaPipe helpers: the hands solution, the drawing utilities and the default drawing styles
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles

# get the model that detects hand landmarks
hands = mp_hands.Hands(static_image_mode=True, min_detection_confidence=0.5)

# setup webcam
camera = cv2.VideoCapture(0)

# try/finally guarantees that the camera and the window are released even
# when an error interrupts the loop
try:
    # read frames from the webcam until [q] is pressed
    while camera.isOpened():

        status, frame = camera.read()
        if not status:
            # no frame was delivered, so frame is None; stop here instead of
            # crashing inside cv2.flip
            break

        # mirror the frame
        frame = cv2.flip(frame, 1)

        # the detector is fed an rgb image
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        result = hands.process(frame_rgb)

        if result.multi_hand_landmarks:
            current_image_landmarks = []
            for hand_landmark in result.multi_hand_landmarks:
                # draw the landmarks on the displayed frame
                mp_drawing.draw_landmarks(
                    frame,
                    hand_landmark,
                    mp_hands.HAND_CONNECTIONS,
                    mp_drawing_styles.get_default_hand_landmarks_style(),
                    mp_drawing_styles.get_default_hand_connections_style())

                for landmark in hand_landmark.landmark:
                    current_image_landmarks.append(landmark.x)
                    current_image_landmarks.append(landmark.y)

            # pad with zeros so the row has the same length as the training rows
            if len(current_image_landmarks) < 84:
                current_image_landmarks = current_image_landmarks + [0]*(84-len(current_image_landmarks))

            # the model expects a 2d array: one row per sample
            all_features_array = np.array([current_image_landmarks])

            # run the forest once per frame: the predicted label is the class
            # with the highest probability, looked up by position in model.classes_
            probabilities = model.predict_proba(all_features_array)[0]
            best_position = int(np.argmax(probabilities))

            class_label = index_to_class[model.classes_[best_position]]
            full_text = f'{class_label}, {str(probabilities[best_position] * 100)[:4]} %'

            cv2.putText(frame, full_text, (100,100), cv2.FONT_HERSHEY_SIMPLEX, 1.3, (0,0,255), 3, cv2.LINE_AA)

        cv2.imshow('Window', frame)
        if cv2.waitKey(1) &  0xFF == ord('q'):
            break
finally:
    camera.release()
    cv2.destroyAllWindows()

In [43]:
# manual cleanup cell: release the webcam and close all OpenCV windows
camera.release()
cv2.destroyAllWindows()

## 10- Convert text to video

In [5]:
import cv2
import os 
import string 

In [6]:
# list of exist videos called mapping videos
mapping_video_list = []
mapping_videos = os.listdir(os.path.join("mapping_videos_english"))
for video in mapping_videos:
    mapping_video_list.append(video.split(".")[0])

print(mapping_video_list)

['0', '1', '10', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'about', 'after', 'at', 'b', 'before', 'black', 'c', 'can', 'choose', 'coffee', 'cold', 'd', 'doctor', 'drink', 'e', 'eight', 'f', 'favorite', 'five', 'four', 'friday', 'g', 'goodbye', 'h', 'happy', 'has', 'have', 'hearing_aid', 'hello', 'help', 'how', 'i', 'i_love_you', 'j', 'k', 'l', 'late', 'live', 'love', 'm', 'me', 'monday', 'month', 'my', 'myself', 'n', 'name', 'near', 'nine', 'no', 'now', 'o', 'one', 'p', 'professor', 'q', 'r', 's', 'saturday', 'seven', 'sit', 'six', 'sorry', 'stand', 'sunday', 't', 'ten', 'thank_you', 'then', 'this', 'three', 'ticket', 'to', 'today', 'tuesday', 'u', 'v', 'w', 'warm', 'weather', 'week', 'what', 'when', 'where', 'white', 'work', 'x', 'y', 'yellow', 'yes', 'you', 'your', 'z', 'zero']


In [8]:
def _clip_names_for_words(words_list):
    """Return the mapping-video names that express the given words, in order."""
    clip_names = []
    for word in words_list:
        if word in mapping_video_list:
            # there is a video for the whole word
            clip_names.append(word)
        else:
            # no video for the word: spell it letter by letter, skipping
            # every character that has no video of its own
            clip_names.extend(letter for letter in word if letter in mapping_video_list)
    return clip_names


def _iter_video_frames(video_path):
    """Yield ``(fps, frame)`` for every frame of the video and release it afterwards."""
    video = cv2.VideoCapture(video_path)
    try:
        fps = video.get(cv2.CAP_PROP_FPS)
        status, frame = video.read()
        while status:
            yield fps, frame
            status, frame = video.read()
    finally:
        video.release()


def text_to_video_prediction(text:str):
    """Build one sign-language video that expresses the given text.

    The text is lower-cased, stripped of punctuation and split into words.
    A word with a video in ``mapping_videos_english`` uses that video; any
    other word is spelled with the videos of its individual characters.
    All frames are concatenated into
    ``created_videos/new_created_video.avi``.

    Args:
        text: the sentence to translate.

    Returns:
        The path of the created video, or ``None`` when no frame could be
        written (for example for an empty text).
    """
    # convert all text to lowercase
    text = text.lower()

    # delete any punctuation in the text
    translator = str.maketrans('', '', string.punctuation)
    text = text.translate(translator)

    # join the multi-word signs so each of them is treated as a single word
    text = text.replace('thank you', 'thank_you')
    text = text.replace('hearing aid', 'hearing_aid')
    text = text.replace('i love you', 'i_love_you')

    print(text)

    # split the whole text into separated words (empty pieces are dropped)
    words_list = text.split()

    # define codec
    fourcc = cv2.VideoWriter_fourcc(*'XVID')

    # define the save path and make sure its folder exists
    os.makedirs("created_videos", exist_ok=True)
    output_video_path = os.path.join("created_videos", "new_created_video.avi")

    # The writer is created lazily from the first real frame, so a missing or
    # unreadable clip can never initialise it with a 0x0 size or 0 fps.
    video_writer = None
    frame_size = None

    try:
        for clip_name in _clip_names_for_words(words_list):
            video_path = os.path.join("mapping_videos_english", f"{clip_name}.mp4")

            for fps, frame in _iter_video_frames(video_path):
                current_size = (frame.shape[1], frame.shape[0])  # (width, height)
                if video_writer is None:
                    frame_size = current_size
                    video_writer = cv2.VideoWriter(output_video_path, fourcc, fps, frame_size)
                elif current_size != frame_size:
                    # bring frames of differently sized clips to the size the
                    # writer was opened with
                    frame = cv2.resize(frame, frame_size)

                # insert the frame in the new created video
                video_writer.write(frame)
    finally:
        # close the created video safely, also when an error occurred
        if video_writer is not None:
            video_writer.release()

    if video_writer is None:
        return None
    return output_video_path
    

# 11- Test text to video model

In [9]:
# run the text-to-video function on a sample sentence (mixed case, punctuation, digits) and print the returned path
print(text_to_video_prediction('My Name Is Mohamed, Iam 22 Years old. I Love Python!!!,#$%@^^ what about you?!@'))

my name is mohamed iam 22 years old i love python what about you
created_videos\new_created_video.avi


# 12- Text to video model deployment

## Endpoint text to video 

In [10]:
import os 
from flask import Flask, request, send_file

In [11]:
app = Flask(__name__)

@app.route("/text_to_video_english", methods=['POST'])
def text_to_video():
    """Receive raw text in the request body and answer with the generated video.

    Returns:
        The created video file on success, or a plain-text message with
        status 400 when the body is not valid UTF-8 or no video could be
        generated for the text.
    """
    # request the text as raw data
    try:
        text_value = str(request.data ,encoding='utf-8')
    except UnicodeDecodeError:
        # answer with a client error instead of an unhandled exception
        return "The request body must be UTF-8 encoded text", 400

    print(type(text_value))
    print(text_value)

    # apply model on the text
    output_video_path = text_to_video_prediction(text_value)
    print(output_video_path)

    # text_to_video_prediction returns None when nothing could be written;
    # send_file(None) would fail with a server error
    if output_video_path is None:
        return "No sign video could be generated for the given text", 400

    return send_file(output_video_path)


app.run(port=5000)