In [1]:
from os import listdir
from os.path import isfile, join
import json
import re
from sklearn.model_selection import train_test_split
import pickle
import torch
import random
import pickle
from tqdm import tqdm

# Placeholder value stored in the "signer" and "gloss" fields of every sample.
irrelevant_const = "IRRELEVANT"

# Directory whose files are each loaded as a skeleton/subtitle JSON document.
sign_subtitle_json_file_root = "../extractedbdhandspeakskeletons/skeleton_subtitle_jsons"
# video_embeddings_root = "../extractedbdhandspeakskeletons/video_embeddings"

# os.listdir returns entries in arbitrary (filesystem-dependent) order. Sorting
# gives the seeded train/test split a stable input, so the same files land in
# the same subset on every machine.
onlyfiles = sorted(
    f
    for f in listdir(sign_subtitle_json_file_root)
    if isfile(join(sign_subtitle_json_file_root, f))
)

def generate_sequence_id(line):
    """Return `line` reduced to its ASCII letters, digits and hyphens.

    Every other character (whitespace, punctuation, non-ASCII text) is dropped;
    the kept characters stay in their original order.

    Args:
        line: the string to sanitise.

    Returns:
        A string made only of characters matching ``[a-zA-Z0-9-]``; empty when
        `line` contains none of them.
    """
    # Raw string: '\-' in a normal string literal is an invalid escape sequence
    # and triggers a warning on current Python versions.
    p = re.compile(r'[a-zA-Z0-9\-]')
    return "".join(p.findall(line))




# Ordered list of the coordinate keys ("<group>_<index>_x" / "<group>_<index>_y")
# that are read from every skeleton frame.
keys_of_skeleton = []

# Keypoint groups to include and how many keypoints each one contributes.
properties_of_consideration = [{
    'name': 'pose_keypoints_2d', 'count': 25
}, {
    'name': 'face_keypoints_2d', 'count': 70
}, {
    'name': 'hand_left_keypoints_2d', 'count': 21
}, {
    'name': 'hand_right_keypoints_2d', 'count': 21
}]

total_body_keypoint_count = 0
# The loop variable is deliberately not called `property`: that name would
# shadow the builtin for the rest of the kernel session.
for keypoint_group in properties_of_consideration:
    # Drop the last two "_"-separated tokens: 'hand_left_keypoints_2d' -> 'hand_left'.
    property_name = "_".join(keypoint_group['name'].split('_')[:-2])
    for i in range(keypoint_group['count']):
        keys_of_skeleton.append(f"{ property_name }_{i}_x")
        keys_of_skeleton.append(f"{ property_name }_{i}_y")

# Hold out 10% of the video files, then cut that hold-out in half to obtain
# the test and validation subsets. Both splits use the same fixed seed.
train_set_videos, not_train_set_videos = train_test_split(
    onlyfiles,
    test_size=0.1,
    random_state=42,
)
test_set_videos, validation_set_videos = train_test_split(
    not_train_set_videos,
    test_size=0.5,
    random_state=42,
)

In [2]:
# Number of video files in the train, test and validation subsets, one per line.
print(
    len(train_set_videos),
    len(test_set_videos),
    len(validation_set_videos),
    sep="\n",
)

242
13
14


In [3]:
def generate_final_array(list_of_videos):
    """Build one sample dict per sentence found in the given JSON files.

    Each file is read from ``sign_subtitle_json_file_root`` and parsed as JSON.
    Every entry of its ``skeleton_data`` list becomes one sample.

    Args:
        list_of_videos: iterable of JSON file names located directly inside
            ``sign_subtitle_json_file_root``.

    Returns:
        A list of dicts with the keys ``name`` (video name, English sentence and
        time range joined by ``%%``), ``signer`` and ``gloss`` (both set to
        ``irrelevant_const``), ``sign`` (a ``torch.Tensor`` built from one row
        per skeleton, each row holding the values of ``keys_of_skeleton`` in
        order) and ``text`` (the entry's ``bengali`` value).

    Raises:
        OSError: if a file cannot be opened.
        json.JSONDecodeError: if a file is not valid JSON.
        KeyError: if an expected key is missing from the file, a sentence entry
            or a skeleton.
    """
    final_array = []
    for json_file_name in tqdm(list_of_videos):
        # The context manager closes the handle even when parsing fails, and
        # releases it before the (slow) tensor construction starts.
        with open(f"{sign_subtitle_json_file_root}/{json_file_name}", encoding='utf-8') as json_file_object:
            json_dict = json.load(json_file_object)

        # video_embedding_pickle_file_path = f"{video_embeddings_root}/{json_file_name.split('.')[0]}.mp4.pickle"
        # with open(video_embedding_pickle_file_path, 'rb') as f:
        #     video_embeddings = pickle.load(f)

        for single_sentence_data in json_dict['skeleton_data']:
            sequence_id = f"{json_dict['video_name']}%%{single_sentence_data['english']}%%{single_sentence_data['start_time']}-{single_sentence_data['end_time']}%%"

            # sign_embeddings = torch.clone(video_embeddings[ single_sentence_data['start_time']*10 : (single_sentence_data['end_time']+1)*10 ])

            # One row per skeleton; columns follow the order of keys_of_skeleton.
            sign_array_of_single_sentence = [
                [single_skeleton[key_of_skeleton] for key_of_skeleton in keys_of_skeleton]
                for single_skeleton in single_sentence_data['skeletons']
            ]

            final_array.append({
                "name": sequence_id,
                "signer": irrelevant_const,
                "gloss": irrelevant_const,
                "sign": torch.Tensor(sign_array_of_single_sentence),
                # "sign": sign_embeddings,
                "text": single_sentence_data['bengali']
            })

    return final_array

In [4]:
train_set = generate_final_array(train_set_videos)
test_set = generate_final_array(test_set_videos)
validation_set = generate_final_array(validation_set_videos)

# Shuffle in place with a dedicated, seeded generator so the order of the
# training samples is identical on every run and the global `random` state is
# left untouched.
random.Random(42).shuffle(train_set)

100%|██████████| 242/242 [02:15<00:00,  1.78it/s]
100%|██████████| 13/13 [00:08<00:00,  1.49it/s]
100%|██████████| 14/14 [00:09<00:00,  1.50it/s]


In [5]:
# Number of sentence-level samples in the train, test and validation sets, one per line.
print(
    len(train_set),
    len(test_set),
    len(validation_set),
    sep="\n",
)

8879
562
598


In [6]:
# Binary output files for the three sets; each is opened for writing, which
# truncates any existing file of the same name.
train_array_pickle_handler = open(
    "data_for_stochastic/phoenix14t.pami0.train", mode="wb"
)
validation_array_pickle_handler = open(
    "data_for_stochastic/phoenix14t.pami0.dev", mode="wb"
)
test_array_pickle_handler = open(
    "data_for_stochastic/phoenix14t.pami0.test", mode="wb"
)

In [7]:
# Serialise each set into its already-open output file. The handles are closed
# in a `finally` block so that they are released (and buffered data flushed)
# even when one of the dumps raises.
try:
    print("dumping train pickles")
    pickle.dump(train_set, train_array_pickle_handler)
    print("dumping test pickles")
    pickle.dump(test_set, test_array_pickle_handler)
    print("dumping dev pickles")
    pickle.dump(validation_set, validation_array_pickle_handler)
finally:
    train_array_pickle_handler.close()
    validation_array_pickle_handler.close()
    test_array_pickle_handler.close()

dumping train pickles
dumping test pickles
dumping dev pickles


In [8]:
def print_video_names_of_set(file_name, list_of_video_names):
    """Append every name in `list_of_video_names` to `file_name`, one per line.

    The file is opened in append mode with UTF-8 encoding and is created when
    it does not exist yet.

    Args:
        file_name: path of the text file to append to.
        list_of_video_names: iterable of names; each one is written with
            ``print`` and therefore followed by a newline.

    Raises:
        OSError: if the file cannot be opened for appending.
    """
    # NOTE(review): append mode means re-running the notebook adds the names a
    # second time to an existing file — confirm that this is intended.
    # The context manager closes the file even if writing a name fails.
    with open(file_name, 'a', encoding='utf-8') as text_file:
        for name in list_of_video_names:
            print(name, file=text_file)

In [9]:
# Record which video files ended up in each subset, one text file per subset.
for names_file, video_names in (
    ('data_for_stochastic/train_video_names.txt', train_set_videos),
    ('data_for_stochastic/test_video_names.txt', test_set_videos),
    ('data_for_stochastic/validation_video_names.txt', validation_set_videos),
):
    print_video_names_of_set(names_file, video_names)