In [None]:
# Helper Function to rotate image
def rotate_image(image, angle):
    """Rotate ``image`` about its centre and return the result on a same-sized canvas.

    Args:
        image: Array whose first two ``shape`` entries are (height, width).
        angle: Rotation angle, passed unchanged to ``cv2.getRotationMatrix2D``.

    Returns:
        The warped image, with the same (width, height) as the input.
    """
    import cv2
    import numpy as np
    rows, cols = image.shape[:2]
    pivot = (cols / 2, rows / 2)
    # Scale factor 1.0: pure rotation, no zoom.
    affine = cv2.getRotationMatrix2D(pivot, angle, 1.0)
    return cv2.warpAffine(image, affine, (cols, rows))

In [None]:
def make_directory(path):
    """Create ``path`` together with any missing parents; do nothing if it already exists."""
    import os
    if os.path.exists(path):
        return
    os.makedirs(path)

In [None]:
# Step 1: Extract Frames and save in different directories
def extract_frames(k, directory):
    """Extract usable frames for the first ``k`` glosses and build a per-video report.

    The gloss metadata is read from ``final_train.json``. For every instance of
    the first ``k`` entries, the video ``{directory}/{video_id}.mp4`` is decoded
    and each frame is run through MediaPipe Holistic. A frame is kept only when
    a pose and at least one hand are detected; kept frames are written to
    ``frames/{split}/{gloss}/{variation_id}/{video_id}_{NNNN}.jpg``.

    Args:
        k: Number of leading entries of ``final_train.json`` to process.
        directory: Directory that holds the ``{video_id}.mp4`` files.

    Returns:
        pandas.DataFrame with one row per video that was found on disk (gloss,
        variation, video id, signer id, split, number of kept frames, bounding
        box). The same table is written to ``frames/frame_report.csv``.

    Raises:
        FileNotFoundError: If ``directory`` does not exist.
    """
    import os
    import cv2
    import json
    import mediapipe as mp
    from tqdm.notebook import tqdm
    import pandas as pd

    bad_sample = [121]  # signer ids whose videos are skipped
    report_stat = []
    # Close the metadata file deterministically instead of leaking the handle.
    with open('final_train.json') as metadata_file:
        content = json.load(metadata_file)
    prog_gloss = tqdm()
    if not os.path.exists(directory):
        raise FileNotFoundError("Directory does not exist.")

    mp_holistic = mp.solutions.holistic
    progress = tqdm(total=k)
    with mp_holistic.Holistic(min_detection_confidence=0.7, min_tracking_confidence=0.7) as holistic:
        for i, entry in enumerate(content):
            if not i < k:
                # Reached the top-k; stop before touching the progress bar so it ends at exactly k.
                break
            gloss = entry['gloss']
            instances = entry['instances']
            progress.update(1)
            progress.set_description(f"Working on Gloss {gloss}")
            prog_gloss.reset(total=len(instances))
            for inst in instances:
                prog_gloss.update(1)
                video_id = inst["video_id"]
                prog_gloss.set_description(f"Getting {video_id}")
                # Read from the directory the caller passed in (previously hard-coded to 'videos/').
                video_directory = os.path.join(directory, f'{video_id}.mp4')
                sign_id = inst["signer_id"]
                split = inst["split"]  # Train, Test or Validation
                variation_id = inst["variation_id"]
                bbox = inst['bbox']
                target_directory = f'frames/{split}/{gloss}/{variation_id}'

                # Create Directory If not exist
                make_directory(target_directory)

                # Skip Bad Samples
                if sign_id in bad_sample:
                    print(f"Skip Signer {sign_id}")
                    continue

                if not os.path.exists(video_directory):
                    continue

                cap = cv2.VideoCapture(video_directory)
                frame_counter = 0
                try:
                    while True:
                        ret, frame = cap.read()
                        # No frame retrieved: end of the video.
                        if not ret:
                            break

                        # OpenCV decodes to BGR; Holistic is fed the channel-swapped copy
                        # while the untouched BGR frame is what gets written to disk.
                        results = holistic.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

                        # Drop frames without a pose or without any hand.
                        if results.pose_landmarks is None:
                            continue
                        if results.left_hand_landmarks is None and results.right_hand_landmarks is None:
                            continue

                        frame_name = f'{video_id}_{str(frame_counter).zfill(4)}.jpg'
                        cv2.imwrite(os.path.join(target_directory, frame_name), frame)
                        frame_counter += 1
                finally:
                    # Release the capture even if detection or writing fails midway.
                    cap.release()
                report_stat.append([gloss, variation_id, video_id, sign_id, split, frame_counter, bbox])
    report = pd.DataFrame(report_stat, columns=["gloss", "variation","video id","signer id", "split", "number of frames", "Bounding Box"])
    # Make sure the report's parent directory exists even when nothing was processed.
    make_directory('frames')
    report.to_csv("frames/frame_report.csv", index = False)
    return report
    
                          



In [None]:
# Only run this cell the first time, or whenever k is changed.
# Extracts frames for the first 100 glosses from the "videos" directory and shows the resulting report.
report_df = extract_frames(100, "videos")
display(report_df)

In [None]:
import pandas as pd
# Merge two reports
# report_df = pd.read_csv("frames/frame_report.csv")
# letter_df = pd.read_csv("frames/frame_report_new.csv")
# report_df = pd.concat([report_df, letter_df])
# report_df.to_csv('frames/frame_report.csv')

In [None]:
import pandas as pd
# Reload the report written by extract_frames.
report_df = pd.read_csv("frames/frame_report.csv")
# Per (gloss, variation): True when the group has more than 10 non-null "number of frames" entries.
display(report_df.groupby(['gloss','variation'])['number of frames'].count()>10)
# Cell output: shape of the report restricted to (gloss, variation) groups with at least 14 rows.
report_df.groupby(['gloss','variation']).filter(lambda x: len(x) >= 14).shape

In [None]:
def remove_empty_dirs(path):
    """Delete every empty directory below ``path``.

    The tree is walked bottom-up so that a directory which only becomes empty
    once its empty children are removed is deleted in the same pass. ``path``
    itself is never removed.

    Args:
        path: Root directory to clean.
    """
    import os
    # topdown=False visits children before their parents, so os.listdir on a
    # parent already reflects the removals performed deeper in the tree.
    for root, dirs, _ in os.walk(path, topdown=False):
        for name in dirs:
            full_path = os.path.join(root, name)
            if not os.listdir(full_path):
                os.rmdir(full_path)
                print(f"Removed empty directory: {full_path}")

# Only keep Gloss, Variation with at least 10 samples in total (train, test, val)
def remove_minor_samples(df, min_samples = 10, dry_run = True):
    """Report, and optionally delete, (gloss, variation) groups with too few samples.

    Rows whose "number of frames" is 0 are ignored. The remaining rows are
    grouped by (gloss, variation); groups with fewer than ``min_samples`` rows
    are printed.

    Args:
        df: Frame report with at least the columns "gloss", "variation" and
            "number of frames" (all report columns are needed when
            ``dry_run`` is False).
        min_samples: Minimum number of rows a (gloss, variation) group needs
            across all splits to be kept.
        dry_run: When True (the default, matching the previous behaviour of
            this function) only print the groups that would be removed and
            return None. When False, delete their frame directories, prune
            empty directories under ``frames`` and rewrite
            ``frames/frame_report.csv``.

    Returns:
        None in dry-run mode, otherwise the DataFrame of retained rows.
    """
    import os
    import shutil
    split_ls = ['train', 'test', 'val']
    # remove empty samples
    df = df[df['number of frames']!=0]

    grouped = df.groupby(['gloss','variation'])
    df_remain = grouped.filter(lambda x: len(x) >= min_samples)
    to_remove = grouped.filter(lambda x: len(x) < min_samples)
    print(to_remove)
    if dry_run:
        # Historical behaviour: stop after reporting, nothing on disk is touched.
        return

    for gloss in to_remove['gloss'].unique():
        if len(gloss) == 1: # Skip letter
            continue
        variations_to_remove = to_remove[to_remove['gloss']==gloss].groupby('variation').filter(lambda x: len(x)<min_samples)['variation'].unique()
        for variation in variations_to_remove:
            for split in split_ls:
                to_remove_dir = f'frames/{split}/{gloss}/{variation}'
                # A group does not have to be present in every split.
                if not os.path.isdir(to_remove_dir):
                    continue
                try:
                    shutil.rmtree(to_remove_dir)
                except OSError as error:
                    # Best effort: report the failure and keep going with the other directories.
                    print(f"Could not remove {to_remove_dir}: {error}")
    remove_empty_dirs('frames')
    df_remain = df_remain[['gloss','variation','video id','signer id','split','number of frames','Bounding Box']]
    df_remain.to_csv("frames/frame_report.csv", index = False)
    return df_remain

import pandas as pd
# Reload the frame report and list the under-sampled (gloss, variation) groups.
# NOTE(review): as written, remove_minor_samples returns right after printing those rows,
# so this call reports only and deletes nothing.
report_df = pd.read_csv("frames/frame_report.csv")
remove_minor_samples(report_df)

In [None]:

def remove_letter_frames(): # All non v2 versions are damaged
    """Delete unwanted letter frames under ``frames/train``, ``frames/test`` and ``frames/val``.

    A file is deleted when its name contains ``test`` or ``val``, or when the
    whole name is one ASCII letter, an underscore, exactly three digits and
    ``.jpg``. The name of every deleted file is printed.
    """
    import os
    import re
    from tqdm.notebook import tqdm
    legacy_name = re.compile(r'^[a-zA-Z]_\d{3}\.jpg$')

    def should_delete(filename):
        # Same three conditions as the inline check, evaluated in the same order.
        return 'test' in filename or 'val' in filename or legacy_name.match(filename) is not None

    for split_name in ('train', 'test', 'val'):
        split_root = os.path.join('frames', split_name)
        for folder, _, filenames in tqdm(os.walk(split_root)):
            for filename in filenames:
                if not should_delete(filename):
                    continue
                print(filename)
                os.remove(os.path.join(folder, filename))

# Runs the cleanup immediately; matching files under frames/ are deleted with os.remove.
remove_letter_frames()

In [None]:
def make_directory(path):
    """Create ``path`` together with any missing parents; do nothing if it already exists."""
    import os
    if os.path.exists(path):
        return
    os.makedirs(path)
        
# Space-separated lowercase alphabet; letter_redistribute() splits it to get the per-letter directory names.
letters = 'a b c d e f g h i j k l m n o p q r s t u v w x y z'
# All non-V2 versions are damaged; redistribute the V2 frames across the splits
# (the split list used below is train, train, val).
def letter_redistribute(): 
    """Rename the V2 frames of every letter into three consecutive groups.

    For each letter in ``letters`` the files in ``frames/train/<letter>/0``
    whose name contains ``V2`` are counted. The source files
    ``<LETTER>_V2_<NNNN>.jpg`` are then walked by index and renamed:

    * indices below 30 % of the count -> ``frames/train/<letter>/0/<LETTER>_<NNNN>.jpg``
    * indices below 60 % of the count -> ``frames/train/<letter>/0/<LETTER>_V2_<NNNN>.jpg``
    * the remainder                  -> ``frames/val/<letter>/0/<LETTER>_val_<NNNN>.jpg``

    The target numbering restarts at 0 for each group. A failed rename is
    printed and skipped.
    """
    import os
    import re  # imported by the original cell; not used in this function
    from tqdm.notebook import tqdm  # imported by the original cell; not used in this function
    destination_splits = ['train', 'train', 'val']
    for letter in letters.split(' '):
        source_dir = f'frames/train/{letter}/0'
        v2_total = len([entry for entry in os.listdir(source_dir) if 'V2' in entry])

        # Cumulative cut-offs on the source index (floored): 30 %, 60 %, everything.
        cutoffs = [v2_total*0.3//1, v2_total*0.6//1, v2_total]
        upper = letter.upper()
        prefixes = [f'{upper}', f'{upper}_V2', f'{upper}_val']

        # The source index keeps running across the three groups.
        source_index = 0
        for prefix, split_name, cutoff in zip(prefixes, destination_splits, cutoffs):
            destination_dir = f'frames/{split_name}/{letter}/0'
            renamed = 0
            while source_index < cutoff:
                source_file = os.path.join(source_dir, f'{upper}_V2_{str(source_index).zfill(4)}.jpg')
                destination_file = os.path.join(destination_dir, f'{prefix}_{str(renamed).zfill(4)}.jpg')
                make_directory(destination_dir)
                try:
                    os.rename(source_file, destination_file)
                except Exception as error:
                    print(error)
                source_index += 1
                renamed += 1
        


# Caution: do not run more than once! The result will not be as expected.
letter_redistribute()

# If accidentally run twice, it should raise an error in the iteration for letter A.
# To recover, remove all files in train/test/val of A, then put all files from the backup A.rar into train of A.

In [None]:
def move_frames(source, dst, video_id):
    """Move every frame that belongs to ``video_id`` from ``source`` into ``dst``.

    Frames are matched on the prefix ``"{video_id}_"``. The trailing
    underscore matters: matching on the bare id would also pick up frames of
    longer ids that merely start with the same characters (id ``12`` would
    capture ``123_0000.jpg``).

    Args:
        source: Directory currently holding the frames.
        dst: Destination directory; created if it does not exist.
        video_id: Identifier of the video; converted with ``str()`` semantics.
    """
    import os
    import shutil

    # Check if target directory exists, if not, create it
    if not os.path.exists(dst):
        os.makedirs(dst)

    prefix = f'{video_id}_'
    for filename in os.listdir(source):
        if filename.startswith(prefix):
            shutil.move(os.path.join(source, filename), dst)

# Redistribute Samples, So All Minor Samples at least have some data
def redistribute_samples(df, at_least_amount = 2, reduce_least_amount = 6):
    """Move videos out of the largest split so the other splits hold at least a few samples.

    For every (gloss, variation) pair whose largest split has more than
    ``reduce_least_amount`` videos, videos are taken from that split (in report
    order) and handed to each other split until it has ``at_least_amount``
    videos. The frames are moved on disk with ``move_frames`` and the "split"
    column of ``df`` is updated in place. Single-character glosses are skipped.
    The updated report is written to ``frames/frame_report.csv``.

    Args:
        df: Frame report with the columns "gloss", "variation", "split" and
            "video id". Modified in place.
        at_least_amount: Number of videos each non-largest split should end up with.
        reduce_least_amount: A pair is only rebalanced when its largest split
            has more than this many videos. Must be greater than
            ``2 * at_least_amount``.

    Returns:
        None. When the parameters are inconsistent a message is printed and
        nothing is changed.
    """
    split_ls = ['train', 'val', 'test']
    if at_least_amount * 2 >= reduce_least_amount:
        print("Error in parameters")
        return
    for gloss in df['gloss'].unique():
        if len(gloss) == 1:  # letters are handled separately
            continue
        for variation in df[df['gloss']==gloss]['variation'].unique():
            df_focus = df[(df['gloss']==gloss) & (df['variation']==variation)]
            split_sample_count = [int((df_focus['split']==split).sum()) for split in split_ls]
            # First split holding the maximum, i.e. the same pick as a strict ">" scan.
            largest_count = max(split_sample_count)
            largest_set = split_sample_count.index(largest_count)
            # Already balanced (e.g. this function was run before): nothing to donate.
            if largest_count <= reduce_least_amount:
                continue
            donor_split = split_ls[largest_set]
            video_id_to_move_ls = df_focus[df_focus['split']==donor_split]['video id'].to_list()
            source_dir = f'frames/{donor_split}/{gloss}/{variation}'
            move_count = 0  # index of the next donor video to hand out
            for i, split in enumerate(split_ls):
                if i == largest_set:
                    continue
                dst_dir = f'frames/{split}/{gloss}/{variation}'
                # A negative difference yields an empty range: the split already has enough.
                for _ in range(at_least_amount - split_sample_count[i]):
                    video_id_to_move = video_id_to_move_ls[move_count]
                    # Boolean-mask assignment belongs on .loc, not on the positional .iloc.
                    df.loc[df['video id']==video_id_to_move, 'split'] = split
                    move_frames(source_dir, dst_dir, video_id_to_move)
                    move_count += 1
    df.to_csv("frames/frame_report.csv", index = False)
    
import pandas as pd
# Reload the frame report from disk and rebalance the splits with the default thresholds.
# redistribute_samples moves frame files and rewrites frames/frame_report.csv.
report_df = pd.read_csv("frames/frame_report.csv")
redistribute_samples(report_df)

In [None]:
# Define two hands frame > 50 % as two hands gesture
def count_hands():
    """Classify every train (gloss, variation) as two-handed or not and save the result.

    Each image under ``frames/train/<gloss>/<variation>`` is run through
    MediaPipe Holistic. Images with no detected hand are ignored; a variation
    is flagged as two-handed when the number of images with both hands
    detected exceeds half (floored) of the images with at least one hand. The
    resulting ``{gloss: {variation: bool}}`` mapping is written to
    ``is_two_hands.json``. Files that OpenCV cannot decode are skipped.
    """
    import json
    import mediapipe as mp
    import cv2
    import os
    from tqdm.notebook import tqdm
    mp_holistic = mp.solutions.holistic # Mediapipe Solutions
    train_root = 'frames/train'
    result_dict = {}
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        for gloss in tqdm(os.listdir(train_root)):
            gloss_dict = {}
            for variation in os.listdir(f'{train_root}/{gloss}'):
                variation_dir = f'{train_root}/{gloss}/{variation}'
                two_hands_count = 0
                total_frame_count = 0
                for image_name in os.listdir(variation_dir):
                    image = cv2.imread(f'{variation_dir}/{image_name}')
                    # imread returns None for unreadable / non-image files; cvtColor would crash on it.
                    if image is None:
                        continue
                    # imread yields BGR; swap channels before handing the image to Holistic.
                    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
                    results = holistic.process(image)

                    has_left = results.left_hand_landmarks is not None
                    has_right = results.right_hand_landmarks is not None
                    # Skip Empty Frames
                    if not has_left and not has_right:
                        continue
                    if has_left and has_right:
                        two_hands_count += 1
                    total_frame_count += 1
                print(f'Gloss: {gloss} Variation: {variation} Two Hands Count: {two_hands_count}, total_frame_count: {total_frame_count}')
                gloss_dict[variation] = two_hands_count > (total_frame_count//2)
            # Glosses without any variation directory are left out of the table, as before.
            if gloss_dict:
                result_dict[gloss] = gloss_dict
    with open('is_two_hands.json', 'w', encoding='utf-8') as f:
        json.dump(result_dict, f, ensure_ascii=False, indent=4)
        
# Scan frames/train and write the one-/two-handed classification to is_two_hands.json.
count_hands()

In [None]:
def video_reconstruct(split): #Reconstruct Video To speed up landmark fetching by reducing File I/O
    """Re-encode the extracted frames of one split into one video per video id.

    Every directory below ``frames/<split>`` is mirrored under
    ``videos_reconstruct/<split>/...``. Inside a directory the frames are
    processed in sorted name order; consecutive frames that share a video id
    are written to ``<video id>.mp4`` (codec ``mp4v``, 30 fps, frame size taken
    from the first frame of that video). Files OpenCV cannot decode are
    skipped.

    Args:
        split: Name of the sub-directory of ``frames`` to process, e.g. ``'train'``.
    """
    import os
    import cv2
    from tqdm.notebook import tqdm
    file_prog = tqdm()
    split_path = os.path.join('frames', split)
    for root, _, files in tqdm(os.walk(split_path)):
        file_prog.reset(total=len(files))
        # Path of this directory relative to 'frames', independent of the OS path separator.
        frame_dirs = os.path.relpath(root, 'frames')
        video_path = os.path.join('videos_reconstruct', frame_dirs)
        make_directory(video_path)

        video_id = None
        out = None
        try:
            # Sorted order keeps the frames of one video contiguous and in frame order.
            for file in sorted(files):
                file_prog.update(1)
                name_parts = file.split("_")
                if len(name_parts) == 2:
                    video_id_current = name_parts[0]
                else:
                    video_id_current = "_".join(name_parts[0:2])
                frame = cv2.imread(os.path.join(root, file))
                if frame is None:
                    continue

                if video_id_current != video_id:
                    # Finalise the previous video before starting the next one.
                    if out is not None:
                        out.release()
                    video_id = video_id_current
                    # Define the codec using VideoWriter_fourcc and create a VideoWriter object
                    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
                    out = cv2.VideoWriter(os.path.join(video_path,f'{video_id}.mp4'), fourcc, 30.0, (frame.shape[1], frame.shape[0]))

                out.write(frame)
        finally:
            # Without release() the last container may never be flushed to disk.
            if out is not None:
                out.release()
            
            
            
        
# Rebuild per-video files from the extracted frames of the train and val splits.
video_reconstruct('train')
video_reconstruct('val')

In [None]:
def test_holistic():
    """Visual smoke test of MediaPipe Holistic on one extracted frame.

    Loads ``frames/train/all/0/01987_0000.jpg``, resizes it to a height of 540
    px and then stretches the width by 1.5, runs Holistic, draws the hand and
    pose landmarks, prints the pose landmarks normalised by landmark 0 and the
    x-distance between landmarks 11 and 12 (the shoulders in MediaPipe's pose
    model), and shows the annotated image until a key is pressed.

    Raises:
        FileNotFoundError: If the test image cannot be read.
    """
    import mediapipe as mp
    import cv2
    mp_drawing = mp.solutions.drawing_utils # Drawing helpers
    mp_holistic = mp.solutions.holistic # Mediapipe Solutions

    def standardize_landmarks(results):
        """Print x/y of each pose landmark relative to landmark 0, scaled by the landmark 11-12 x-distance."""
        pose_landmarks = results.pose_landmarks if results is not None else None
        if pose_landmarks is None:
            print("No pose landmarks detected")
            return
        pivot = pose_landmarks.landmark[0]
        scale = pose_landmarks.landmark[11].x - pose_landmarks.landmark[12].x
        if scale == 0:
            print("Landmarks 11 and 12 share the same x; cannot normalise")
            return
        for i, pose in enumerate(pose_landmarks.landmark):
            print(f"X{i}: ", (pose.x-pivot.x)/scale, "Origin: ",(pose.x))
            # Normalise Y with the y coordinates (previously the x expression was printed here).
            print(f"Y{i}: ", (pose.y-pivot.y)/scale, "Origin: ",(pose.y))

    with mp_holistic.Holistic(min_detection_confidence=0.7, min_tracking_confidence=0.7) as holistic:
        test_path = "frames/train/all/0/01987_0000.jpg"
        print(test_path)
        image = cv2.imread(test_path)
        if image is None:
            raise FileNotFoundError(f"Could not read test image: {test_path}")

        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

        print(image.shape)

        height = 540
        width_cut = int(height/1.33)
        print(width_cut)
        width = int(image.shape[1] * height/image.shape[0] * 1)
        print(width)
        image = cv2.resize(image, (width, height))
        # Deliberately distort the aspect ratio to see how detection copes.
        image = cv2.resize(image, (int(image.shape[1]*1.5), height))

        results = holistic.process(image)
        print(dir(results.pose_landmarks))
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # (landmarks, connections, landmark colour, connection colour) for right hand, left hand, pose
        overlays = [
            (results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS, (80,22,10), (80,44,121)),
            (results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS, (121,22,76), (121,44,250)),
            (results.pose_landmarks, mp_holistic.POSE_CONNECTIONS, (245,117,66), (245,66,230)),
        ]
        for landmarks, connections, point_color, line_color in overlays:
            mp_drawing.draw_landmarks(image, landmarks, connections,
                                     mp_drawing.DrawingSpec(color=point_color, thickness=2, circle_radius=4),
                                     mp_drawing.DrawingSpec(color=line_color, thickness=2, circle_radius=2)
                                     )

        if results.pose_landmarks is not None:
            print(type(results.pose_landmarks.landmark[0]))

        standardize_landmarks(results)

        cv2.imshow('Test Holistic', image)
        cv2.waitKey(0)
        cv2.destroyAllWindows()
# Manual check that the frames were really extracted: opens an OpenCV window and waits for a key press.

test_holistic()



In [25]:
import cv2
import json
import numpy as np
import mediapipe as mp
import pandas as pd
import os
from tqdm.notebook import tqdm
from mediapipe.framework.formats.landmark_pb2 import NormalizedLandmark

class dataset:

    # Gloss metadata parsed from final_train.json when the class body is executed.
    # NOTE(review): the file objects opened on the next two statements are never explicitly closed.
    content = json.load(open('final_train.json'))
    # Parsed contents of is_two_hands.json.
    is_two_hands_table = json.load(open('is_two_hands.json'))
    # Zero (x, y, z) triple; tiled to fill the slot of a hand that was not detected.
    empty_fill = np.array([0, 0, 0])
    # All-zero landmark used as the pivot when landmark scaling is disabled.
    empty_pivot = NormalizedLandmark()
    empty_pivot.x = 0
    empty_pivot.y = 0
    empty_pivot.z = 0
    empty_pivot.visibility = 0

    # keep_pose = [0,2,5,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24]

    def __init__(self, k, split, partition, name, test=False):
        """Store the configuration and load the frame report.

        Args:
            k: Stored as ``self.k``.  # presumably the top-k gloss count - TODO confirm
            split: Stored as ``self.split``.
            partition: Stored as ``self.partition``.
            name: Stored as ``self.name``.
            test: When truthy, landmark extraction draws the detected landmarks
                and shows a preview window for every processed image.
        """
        self.k, self.split = k, split
        self.partition, self.name = partition, name
        self.test = test
        # Per-video frame report loaded from frames/frame_report.csv.
        self.df = pd.read_csv('frames/frame_report.csv')

    def get_variation(self, gloss_instances):
        """Return the number of variations of a gloss.

        Variation ids start at 0, so the count is the highest ``variation_id``
        found in ``gloss_instances`` plus one (1 when the list is empty).
        """
        highest_id = max([0] + [inst['variation_id'] for inst in gloss_instances])
        return highest_id + 1 # Offset from 0

    def _crop_image(self, image, midpt, max_min_diff, hands_in, ratio, long_scale, short_scale):
        """Crop ``image`` horizontally around ``midpt``; the full height is kept.

        Args:
            image: Image array indexed as (rows, columns, ...).
            midpt: X coordinate (pixels) of the crop centre; truncated to int.
            max_min_diff: Horizontal extent used to derive the shoulder scale
                ``1 - max_min_diff / 2``.
            hands_in: 0 = left hand, 1 = right hand, 2 = two hands. Decides
                which side of the centre gets the long and which the short
                half-width.
            ratio: Height/width ratio; the base width is ``height / ratio``.
            long_scale: Multiplier for the wider side.
            short_scale: Multiplier for the narrower side.

        Returns:
            The column slice of ``image`` clamped to the image borders.
        """
        centre_x = int(midpt)
        shoulder_scale = 1 - (max_min_diff/2) # Keep Half Shoulder Spaces
        frame_height = image.shape[0]

        # Pick the multiplier applied to the left and to the right of the centre.
        if hands_in == 0: # Left
            left_factor, right_factor = long_scale, short_scale
        elif hands_in == 1: # Right
            left_factor, right_factor = short_scale, long_scale
        elif hands_in == 2: # Two Hands
            left_factor, right_factor = short_scale, short_scale

        half_left, half_right = (
            int((frame_height / ratio) / shoulder_scale / 2 * factor)
            for factor in (left_factor, right_factor)
        )

        left_edge = max(int(centre_x - half_left), 0)
        right_edge = min(int(centre_x + half_right), image.shape[1])
        return image[:, left_edge:right_edge]

    def _standard_scaling(self, image):
        """Resize ``image`` so that its shorter side is 540 px, keeping the aspect ratio.

        An image with zero height or width (e.g. an empty crop) is returned
        unchanged. The check happens before any arithmetic, because the aspect
        ratio computation divides by the height or the width.
        """
        height = image.shape[0]
        width = image.shape[1]
        if height == 0 or width == 0:
            return image

        if width > height:
            # Landscape: fix the height, derive the width.
            new_height = 540
            new_width = int(width * new_height/height)
        else:
            # Portrait or square: fix the width, derive the height.
            new_width = 540
            new_height = int(height * new_width/width)

        return cv2.resize(image, (new_width, new_height))

    # Helper Function to rotate image
    def _rotate_image(self, image, angle):
        """Rotate ``image`` about its centre by ``angle`` and return it on a same-sized canvas."""
        rows, cols = image.shape[:2]
        pivot = (cols / 2, rows / 2)
        # Scale factor 1.0: pure rotation, no zoom.
        affine = cv2.getRotationMatrix2D(pivot, angle, 1.0)
        return cv2.warpAffine(image, affine, (cols, rows))


    def _get_midpoint(self, video_path, holistic):
        """Locate the horizontal centre of the signer's hands in one image.

        Args:
            video_path: Path of the image file to analyse (despite the name it
                is read with ``cv2.imread``).
            holistic: MediaPipe Holistic instance used for detection.

        Returns:
            ``(hands_in, midpt, max_min_diff)`` where ``hands_in`` is 0 (left
            hand only), 1 (right hand only) or 2 (both hands), ``midpt`` is the
            x coordinate in pixels halfway between the smallest and largest
            normalised x over pose landmark 0 and all detected hand landmarks,
            and ``max_min_diff`` is that normalised x range. Returns
            ``(-1, None, None)`` when the file is missing or unreadable, or
            when no hand or no pose is detected.
        """
        if not os.path.exists(video_path):
            return -1, None, None
        image = cv2.imread(video_path)
        if image is None:
            # Exists but cannot be decoded as an image.
            return -1, None, None
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
        results = holistic.process(image)

        left_hand = results.left_hand_landmarks
        right_hand = results.right_hand_landmarks
        if left_hand is None and right_hand is None:
            return -1, None, None
        if results.pose_landmarks is None:
            return -1, None, None

        # Pose landmark 0 seeds the range so a single detected hand still yields a span.
        min_x = max_x = results.pose_landmarks.landmark[0].x
        for hand in (left_hand, right_hand):
            if hand is None:
                continue
            for landmark in hand.landmark:
                min_x = min(landmark.x, min_x)
                max_x = max(landmark.x, max_x)

        if left_hand is not None and right_hand is not None:
            # Previously a typo assigned this to a throw-away name, so 2 was never returned.
            hands_in = 2
        elif right_hand is not None:
            hands_in = 1
        else:
            hands_in = 0

        midpt = (min_x+max_x) /2 * image.shape[1]
        max_min_diff = max_x-min_x
        return hands_in, midpt, max_min_diff


    # Fetch Landmarks with flipped/ rotated image, merge into a list with its status and result
    # Further unpack in later code
    def _get_landmarks_merge(self, image, holistic, midpt, hands_in, max_min_diff, visibility=False, scale=False,
                             random_factors=None, flip_ls=None):
        """Extract landmarks from the original image and from its augmented variants.

        The first pass uses the image as given; each of the following
        ``random_factors['steps']`` passes starts from a fresh copy and applies,
        in this order and only when the corresponding key is present: rotation
        (``'rotation'``), crop around the hand midpoint (``'hw_ratio'``,
        ``'midpt_shift'``, ``'width_scaling_short'``, ``'width_scaling_long'``),
        constant border (``'random_border_tb'``, ``'random_border_lr'``) and
        anisotropic rescale (``'scale_random'``). Every variant is brought to
        the standard size and processed unflipped plus once per entry of
        ``flip_ls``.

        Args:
            image: Source image; not modified.
            holistic: MediaPipe Holistic instance.
            midpt: Crop centre in pixels, as returned by ``_get_midpoint``.
            hands_in: Hand code from ``_get_midpoint`` (-1 means not detected).
            max_min_diff: Normalised x range from ``_get_midpoint``.
            visibility: Forwarded to ``_get_flatten_landmarks``.
            scale: Forwarded to ``_get_flatten_landmarks``.
            random_factors: Mapping with the augmentation parameters described
                above; each per-step value is indexed by the step number.
                ``None`` or an empty value means "no augmentation" (only the
                original image is processed).
            flip_ls: Flip codes for ``cv2.flip``; ``None`` means no flips.

        Returns:
            List of ``{'status': ..., 'landmark': ...}`` dicts, one per
            processed (variant, flip) combination. Empty when cropping is
            requested but ``hands_in`` is -1.
        """
        result = []
        flip_ls = [] if flip_ls is None else flip_ls
        # The former default (an empty list) could not be indexed by key; treat
        # "nothing supplied" as an empty configuration instead.
        random_factors = random_factors if random_factors else {}

        image_backup = image.copy()
        steps = random_factors.get('steps', 0)

        crop_midpt = random_factors.get('hw_ratio') is not None
        border_random = random_factors.get('random_border_lr') is not None
        scale_random = random_factors.get('scale_random') is not None
        rotation_random = random_factors.get('rotation') is not None

        if hands_in == -1 and crop_midpt:
            return []

        for i in range(-1, steps):
            if i != -1: # Keep Original Image in first iteration
                image = image_backup.copy() # Reset image change done by last iteration
                if rotation_random:
                    rotation_angle = random_factors['rotation'][i]
                    image = self._rotate_image(image, rotation_angle)

                if crop_midpt:
                    hw_ratio = random_factors['hw_ratio'][i]
                    midpt_shift = random_factors['midpt_shift'][i]
                    short_scale = random_factors['width_scaling_short'][i]
                    long_scale = random_factors['width_scaling_long'][i]
                    image=self._crop_image(image, midpt=midpt*midpt_shift, max_min_diff=max_min_diff,hands_in=hands_in, ratio=hw_ratio, long_scale=long_scale, short_scale=short_scale)
                if border_random:
                    top = random_factors['random_border_tb'][0][i]
                    bot = random_factors['random_border_tb'][1][i]
                    left = random_factors['random_border_lr'][0][i]
                    right = random_factors['random_border_lr'][1][i]
                    image = cv2.copyMakeBorder(image,top,bot,left,right,cv2.BORDER_CONSTANT)

                if scale_random:
                    width_scale =  random_factors['scale_random'][0][i]
                    height_scale =  random_factors['scale_random'][1][i]

                    # An empty image cannot be resized; note this stops all remaining steps.
                    if image.shape[0] == 0 or image.shape[1] == 0:
                        break
                    image = cv2.resize(image,dsize=None,fx=width_scale,fy=height_scale,interpolation=cv2.INTER_LINEAR)

            image = self._standard_scaling(image)
            # None stands for the unflipped image; 0 is a valid cv2 flip code.
            for flip_code in [None] + list(flip_ls):
                # Work on a copy: landmark extraction may draw on the image it receives.
                candidate = image.copy()
                if flip_code is not None:
                    candidate = cv2.flip(candidate, flip_code)
                status, landmark = self._get_flatten_landmarks(candidate, holistic, visibility, scale)
                result.append({'status': status, 'landmark': landmark})
        return result

    def _distance(self, pt1, pt2):
        """Return the Euclidean distance between two points exposing ``x``, ``y`` and ``z``."""
        delta_x = pt1.x - pt2.x
        delta_y = pt1.y - pt2.y
        delta_z = pt1.z - pt2.z
        squared_norm = delta_x**2 + delta_y**2 + delta_z**2
        return squared_norm**0.5

    # Return: status, Landmarks, left vaild landmark, right valid landmark
    # Valid landmark: landmarks that are detected, else empty list
    # status: -1: not detected, 0: two hands, 1: left hand only, 2: right hand only
    # TODO: Handle Rotation/ Flip values here later
    def _get_flatten_landmarks(self, image, holistic, use_visibility=False, scale=False):
        if image is None:
            return -1, [None]
        if(image.shape[0]==0 or image.shape[1]==0):
            return -1, [None]
        results = holistic.process(image)

        status = -1

        if self.test:
            mp_holistic = mp.solutions.holistic
            mp_drawing = mp.solutions.drawing_utils
            # Right hand
            mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                                     mp_drawing.DrawingSpec(color=(80,22,10), thickness=2, circle_radius=4),
                                     mp_drawing.DrawingSpec(color=(80,44,121), thickness=2, circle_radius=2)
                                     )

            # Left Hand
            mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                                     mp_drawing.DrawingSpec(color=(121,22,76), thickness=2, circle_radius=4),
                                     mp_drawing.DrawingSpec(color=(121,44,250), thickness=2, circle_radius=2)
                                     )

            # Pose Detections
            mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
                                     mp_drawing.DrawingSpec(color=(245,117,66), thickness=2, circle_radius=4),
                                     mp_drawing.DrawingSpec(color=(245,66,230), thickness=2, circle_radius=2)
                                     )
            cv2.imshow("preview",image)
            cv2.waitKey(0)

        # Extract landmarks
        pose = results.pose_landmarks
        left_hand = results.left_hand_landmarks
        right_hand = results.right_hand_landmarks

        # Counting this to prevent the image show a tiny part of hands (20 is the max)
        left_boundary_count = 0
        right_boundary_count = 0

        if pose is not None:
            if scale:
                pivot = pose.landmark[0]
                a, b = pose.landmark[11], pose.landmark[12]
                scale_length = self._distance(a,b)
            else:
                pivot = dataset.empty_pivot
                scale_length = 1

            pose_row = None
            for i, landmark in enumerate(pose.landmark):
#                 if i not in dataset.keep_pose:
#                     continue # Skip Non-informative Landmarks
                landmark_x = (landmark.x-pivot.x)/scale_length
                landmark_y = (landmark.y-pivot.y)/scale_length
                landmark_z = (landmark.z-pivot.z)/scale_length
                visibility = landmark.visibility

                pose_row = landmark_x if pose_row is None else np.append(pose_row, landmark_x)
                pose_row = np.append(pose_row, landmark_y)
                pose_row = np.append(pose_row, landmark_z)
                if use_visibility:
                    pose_row = np.append(pose_row, visibility)
        else:
            status = -1
            # Invalid Image
            return status, [None]

        if left_hand is not None:
            if scale:
                pivot_left = left_hand.landmark[0]
                a, b = left_hand.landmark[5], left_hand.landmark[0]
                scale_length_left = self._distance(a,b)
                
            left_row = None
            for landmark in left_hand.landmark:
                landmark_x_relative_hand = (landmark.x-pivot_left.x)/scale_length_left
                landmark_y_relative_hand = (landmark.y-pivot_left.y)/scale_length_left
                landmark_z_relative_hand = (landmark.z-pivot_left.z)/scale_length_left
                
                landmark_x = (landmark.x-pivot.x)/scale_length
                landmark_y = (landmark.y-pivot.y)/scale_length
                landmark_z = (landmark.z-pivot.z)/scale_length

                left_row = landmark_x if left_row is None else np.append(left_row, landmark_x)
                left_row = np.append(left_row, landmark_y)
                left_row = np.append(left_row, landmark_z)

                left_row = np.append(left_row, landmark_x_relative_hand)
                left_row = np.append(left_row, landmark_y_relative_hand)
                left_row = np.append(left_row, landmark_z_relative_hand)

                if (not (landmark.x > 0  and landmark.x < 1)) or not (landmark.y > 0  and landmark.y < 1):
                    left_boundary_count+=1

        if right_hand is not None:
            if scale:
                pivot_right = right_hand.landmark[0]
                a, b = right_hand.landmark[5], right_hand.landmark[0]
                scale_length_right = self._distance(a,b)

            right_row = None
            for landmark in right_hand.landmark:
                landmark_x_relative_hand = (landmark.x-pivot_right.x)/scale_length_right
                landmark_y_relative_hand = (landmark.y-pivot_right.y)/scale_length_right
                landmark_z_relative_hand = (landmark.z-pivot_right.z)/scale_length_right
                
                landmark_x = (landmark.x-pivot.x)/scale_length
                landmark_y = (landmark.y-pivot.y)/scale_length
                landmark_z = (landmark.z-pivot.z)/scale_length

                right_row = landmark_x if right_row is None else np.append(right_row, landmark_x)
                right_row = np.append(right_row, landmark_y)
                right_row = np.append(right_row, landmark_z)
                
                right_row = np.append(right_row, landmark_x_relative_hand)
                right_row = np.append(right_row, landmark_y_relative_hand)
                right_row = np.append(right_row, landmark_z_relative_hand)
                
                if (not (landmark.x > 0  and landmark.x < 1)) or not (landmark.y > 0  and landmark.y < 1):
                    right_boundary_count+=1

        # Two Hands Detected, but could be only showing a very tiny part
        if left_hand is not None and right_hand is not None:
            result = np.concatenate([pose_row,left_row,right_row])
            result = result.reshape((1,result.shape[0]))
            #print('both',result.shape)
            status = 0
            if left_boundary_count > 10: # Left Hand Invalid
                status = 2
            if right_boundary_count > 10: # Right Hand Invalid
                status = 1
            if left_boundary_count > 10 and right_boundary_count > 10: # Two Hands Are Invalid
                status = -1
        else:
            # x->0.5 = middle, y->1 = bottom

            #Single Hand Model
            if left_hand is not None:
                empty_arr = np.tile(dataset.empty_fill, 21*2)
                result = np.concatenate([pose_row,left_row,empty_arr])
                result = result.reshape((1,result.shape[0]))
                #print('left',result.shape)
                status = 1
            elif right_hand is not None:
                empty_arr = np.tile(dataset.empty_fill, 21*2)
                result = np.concatenate([pose_row, empty_arr, right_row])
                result = result.reshape((1,result.shape[0]))
                #print('right',result.shape)
                status = 2
            else:
                result = [None]
                status = -1
        # print(result.shape)
        if self.test:
            print(result)

        return status, result

    def _save_npy(self, save_directory, gloss, dataset):
        for j, partition in enumerate(dataset):
            index_col = None
            gloss_col = None
            label_col = None
            variation_col = None
            for i, data_variation in enumerate(partition):
                if data_variation is None:
                    print(f"No data for {gloss}, partition:{j}")
                    continue
                for row_idx in range(data_variation.shape[0]):
                    gloss_directory = f'np_arrays/{save_directory}/{self.split}_{j}/{gloss}'
                    row = data_variation[row_idx]
                    # Create Directory If not exist
                    self._make_directory(gloss_directory)

                    to_save_path = f'np_arrays/{save_directory}/{self.split}_{j}/{gloss}/{i}_{row_idx}.npy'
                    with open(to_save_path, 'wb') as f:
                        np.save(to_save_path, row)

                label_name = f'{gloss}_{i}'
                variation_full = np.full(data_variation.shape[0], i)
                index = np.arange(data_variation.shape[0])
                gloss_full = np.full(data_variation.shape[0], gloss)
                label = np.full(data_variation.shape[0], label_name)

                index_col = index if index_col is None else np.append(index_col, index)
                gloss_col = gloss_full if gloss_col is None else np.append(gloss_col, gloss_full)
                variation_col = variation_full if variation_col is None else np.append(variation_col, variation_full)
                label_col = label if label_col is None else np.append(label_col, label)
            if index_col is not None:
                result = pd.DataFrame()
                result['Index'] = index_col
                result['Gloss'] = gloss_col
                result['Variation'] = variation_col
                result['Label'] = label_col
                result.to_csv(f'np_arrays/{save_directory}/{self.split}_{j}/{gloss}/meta.csv',index=False)


    def _rng_upsampling(self, sample_set, window, max_duplicate=4):
        result = []
        remain = sample_set
        shape_remain = sample_set.shape[0]
        for i in range(sample_set.shape[0]):
            target_row = sample_set[i]
            # Window - (Shape remain) - Current Index
            # Shape remain = sample shape - (i + 1)
            shape_remain = sample_set.shape[0]-i
            random_range = min(window - shape_remain - len(result), max_duplicate) + 1 # Range of random value could be
            random_duplicate = np.random.randint(0, random_range + 1) # The random value to determine how much a sample be duplicated
            for j in range(random_duplicate):
                result.append(target_row)
                if len(result)+shape_remain == window:
                    break
            if len(result)+shape_remain == window:
                break
        result = np.array(result)
        result = np.vstack([result, remain[i:]])
        if result.shape[0] != window: # Fill last row if still some empty space
            to_fill = np.full((window-result.shape[0], result.shape[1]), result[-1])
            result = np.vstack([result, to_fill])
        return result


    def _extract_lstm(self, sample_set, window=20, upsampling_times=5):
        # Shape 0: How many samples
        # Shape 1: How many Landmarks
        # Output: Extra shape 2: Time of Landmark
        result = []
        if len(sample_set.shape) != 2:
            # Empty sample set
            return np.array(result)
        if sample_set.shape[0] < 10:
            # Not Enough Samples
            return np.array(result)

        if sample_set.shape[0] < window :
            upsampling_times = upsampling_times * 3 # Upsample more for minor samples
            upsampling_time = 1 if upsampling_times == 0 else upsampling_times# At least resample once to prevent lost of data
            for i in range(upsampling_times):
                rng_samp = self._rng_upsampling(sample_set, window)
                result.append(rng_samp)
            return np.array(result)

        for i in range(sample_set.shape[0]-window):
            temp_result = sample_set[i:i+window]
            temp_result = np.array(temp_result)
            result.append(temp_result)
            for i in range(upsampling_times):
                rng_samp = self._rng_upsampling(sample_set[i:i+(window//2)], window)
                result.append(rng_samp)
        result = np.array(result)
        return np.array(result)


    def _make_directory(self, path):
        if not os.path.exists(path):
            os.makedirs(path)

    def fetch_data(self, progress=0, window=20, rotation=False, visibility=False, scale=False, keep_noise_frame=False, flip=None, flatten=False, random_augmentation_steps=0, upsampling_times=10,midpt_crop=False,random_boarder=False):
        """Extract landmark sequences for every selected gloss and save them to disk.

        Walks ``dataset.content`` (entries with a ``'gloss'`` and an ``'instances'``
        list), skipping the first ``progress`` entries, stopping at index
        ``self.k`` and ignoring glosses that are not listed in
        ``self.df["gloss"]``.  For every instance whose ``split`` equals
        ``self.split`` and that has an entry in ``dataset.is_two_hands_table`` the
        video ``videos_reconstruct/{split}/{gloss}/{variation_id}/{video_id}.mp4``
        is read frame by frame (at most 300 frames), each frame is passed to
        ``self._get_landmarks_merge`` and the returned landmark rows are collected
        per returned item.  Videos whose share of two-hand frames contradicts the
        table entry are skipped.  The surviving rows are turned into sequences
        with ``self._extract_lstm`` (unless ``flatten``), stacked into
        ``result[partition][variation_id]`` and, once per gloss, written out via
        ``self._save_npy(self.name, gloss, result)``.

        Args:
            progress: Index of the first ``dataset.content`` entry to process.
            window: Sequence length forwarded to ``_extract_lstm``.
            rotation: When true, random angles in [-20, 20] are drawn into
                ``random_factors['rotation']``.
            visibility: Forwarded to ``_get_landmarks_merge``.
            scale: Forwarded to ``_get_landmarks_merge``.
            keep_noise_frame: Also keep frames whose hand count does not match
                the expected one (they still count towards the skip statistics).
            flip: Forwarded to ``_get_landmarks_merge`` as ``flip_ls``; ``None``
                becomes an empty list.
            flatten: Keep per-frame rows instead of building windowed sequences.
            random_augmentation_steps: Number of random values drawn per
                augmentation factor; also stored as ``random_factors['steps']``.
            upsampling_times: Forwarded to ``_extract_lstm``.
            midpt_crop: Estimate a midpoint with ``_get_midpoint`` from
                ``frames/...jpg`` images before reading frames, and draw the
                crop-related random factors.
            random_boarder: Draw random border sizes into ``random_factors``.

        Returns:
            None.  Output is produced through ``_save_npy``.
        """
        flip = [] if flip is None else flip
        k = self.k
        gloss_pbar = tqdm(total=self.k)
        pbar = tqdm()
        labels_to_fetch = self.df["gloss"].to_list()
        for i, entry in enumerate(dataset.content):
            gloss_pbar.update(1)
            gloss = entry['gloss']
            if i < progress:
                continue # Skip to current progress
            if not i < k:
                # Reached the top-k glosses
                break
            gloss = entry['gloss']
            if gloss not in labels_to_fetch:
                continue

            # NOTE(review): `i` is reused by loops further down; that is harmless
            # only because the gloss index is not read again after this line.
            title = f"Gloss {i+1} {gloss}, {i}/{k}"
            gloss_pbar.set_description(title)
            instances = entry['instances']
            variations = self.get_variation(instances)
            video_count = [0] * variations # Accepted videos per variation
            result = [[None] * variations for i in range(self.partition)] # result[partition][variation]
            mp_holistic = mp.solutions.holistic # Mediapipe Solutions
            # A fresh Holistic instance is created for every gloss.
            with mp_holistic.Holistic(min_detection_confidence=0.7, min_tracking_confidence=0.7) as holistic:
                # Per video of the gloss
                pbar.total = len(instances)
                pbar.refresh()
                for inst in instances:
                    pbar.update(1)
                    pbar.set_description('Fetching Videos')
                    split = inst["split"] # Train, Test or Validation
                    if split != self.split: # Skip other splits
                        continue

                    hands_in = -1
                    midpt = None
                    max_min_diff = None # To scale by shoulder difference
                    video_id = inst["video_id"]
                    variation_id = inst["variation_id"]
                    frame_count = 0
                    temp_data = {} # index within landmark_results -> stacked landmark rows
                    # Accepted videos of a variation are spread round-robin over the partitions.
                    to_partition = video_count[variation_id] % self.partition

                    # Statistics used to validate the expected number of hands
                    two_hands_count = 0
                    total_frame_count = 0
                    is_two_hands_gesture = dataset.is_two_hands_table.get(gloss,{}).get(str(variation_id))
                    # No table entry for this gloss/variation: label was removed, skip it
                    if is_two_hands_gesture is None:
                        continue

                    # One random value per augmentation step for every enabled factor
                    random_factors = {}
                    random_factors['steps'] = random_augmentation_steps
                    if midpt_crop:
                        random_factors['hw_ratio'] = np.around(np.random.uniform(1.25,1.75,random_augmentation_steps),2)
                        random_factors['midpt_shift'] = np.around(np.random.uniform(0.9,1.1,random_augmentation_steps),2)
                        random_factors['width_scaling_long'] = np.around(np.random.uniform(1.2,1.4,random_augmentation_steps),2)
                        random_factors['width_scaling_short'] = np.around(np.random.uniform(1.1,1.2,random_augmentation_steps),2)
                        random_factors['scale_random'] = [np.around(np.random.uniform(0.8,1.2,random_augmentation_steps),2) for i in range(2)]
                    if random_boarder:
                        random_factors['random_border_lr'] = [np.random.randint(0,120,random_augmentation_steps) for i in range(2)]
                        random_factors['random_border_tb'] = [np.random.randint(0,40,random_augmentation_steps) for i in range(2)]
                    if rotation:
                        random_factors['rotation'] = np.around(np.random.uniform(-20,20,random_augmentation_steps),2)

                    video_path = f'videos_reconstruct/{split}/{gloss}/{variation_id}/{video_id}.mp4'

                    if not os.path.exists(video_path):
                        frame_count = -1 # Sentinel: leave the frame loop instantly
                    else:
                        cap = cv2.VideoCapture(video_path)
                    # Per frame of the video (capped at 300 iterations)
                    while frame_count < 300:
                        if frame_count == -1:
                            break
                        if midpt is None and midpt_crop:
                            # Smoothing: average over up to five frames instead of
                            # relying on a single one
                            midpt_sum = 0
                            max_min_diff_ls = []
                            hands_in_ls = []
                            valid_count = 0
                            # NOTE(review): reuses `i` (see note on `title` above)
                            for i in range(5):
                              # Probe frames frame_count+15 .. frame_count+19 in case one of them is missing
                              video_path_15shift = f'frames/{split}/{gloss}/{variation_id}/{video_id}_{str(frame_count+15+i).zfill(4)}.jpg'
                              hands_in_temp, midpt_temp, max_min_diff_temp = self._get_midpoint(video_path_15shift, holistic)
                              if hands_in_temp != -1:
                                valid_count += 1
                                midpt_sum+=midpt_temp
                                max_min_diff_ls.append(max_min_diff_temp)
                                hands_in_ls.append(hands_in_temp)
                            if valid_count != 0:
                                midpt = midpt_sum/ valid_count
                                max_min_diff=max(max_min_diff_ls)
                                # Most frequent hands_in value among the valid probes
                                hands_in = max(hands_in_ls, key= hands_in_ls.count)

                        if midpt is None and midpt_crop:
                            # Midpoint search failed: advance the counter (without
                            # reading a frame) and retry with the next probe range
                            # print("Mid point error")
                            frame_count+=1
                            continue

                        ret, image = cap.read()
                        if image is None:
                            break
                        # NOTE(review): the constant is named RGB2BGR; presumably the
                        # intent is BGR -> RGB for mediapipe - confirm. Left unchanged.
                        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

                        frame_count += 1
                        landmark_results = self._get_landmarks_merge(image, holistic, visibility=visibility, scale=scale, random_factors=random_factors, midpt=midpt, max_min_diff=max_min_diff, hands_in=hands_in, flip_ls=flip)

                        # Each item carries a 'status' and a 'landmark' entry; items are
                        # tracked separately by their position in landmark_results.
                        for i, landmark_result in enumerate(landmark_results):
                            status = landmark_result['status']
                            landmark = landmark_result['landmark']
                            if status == -1: # Error
                                continue

                            total_frame_count += 1
                            if is_two_hands_gesture: # Only add two-hand frames
                                if status == 0:
                                    two_hands_count += 1
                                    temp_data[i] = landmark if i not in temp_data else np.vstack([temp_data[i], landmark])
                                else:
                                    if keep_noise_frame:
                                        temp_data[i] = landmark if i not in temp_data else np.vstack([temp_data[i], landmark])
                                    continue
                            else:
                                if status == 0: # Two hands seen although one hand is expected
                                    two_hands_count += 1
                                    if keep_noise_frame:
                                        temp_data[i] = landmark if i not in temp_data else np.vstack([temp_data[i], landmark])
                                    continue
                                else:
                                    temp_data[i] = landmark if i not in temp_data else np.vstack([temp_data[i], landmark])
                    # End of the per-frame loop for this video
                    if frame_count != -1:
                        cap.release()
                    pbar.set_description(f"{title}")
                    temp_result = None
                    # Skip videos whose two-hand frame share contradicts the table entry
                    if is_two_hands_gesture:
                        if two_hands_count < (total_frame_count * 0.7):
                            #print(f"Two hands frame: {two_hands_count}")
                            pbar.set_description(f"{title}\tSkip: Unexpected two hand count\n")
                            continue
                    else:
                        if two_hands_count > (total_frame_count * 0.7):
                            pbar.set_description(f"{title}\tSkip: Unexpected two hand count\n")
                            continue

                    if total_frame_count == 0: # No usable frame (e.g. file does not exist)
                        pbar.set_description(f"{title}\tSkip: File Not Exist\n")
                        continue

                    # Merge the data of all landmark_results positions, windowed unless `flatten`
                    for key in temp_data:
                        if not flatten:
                            result_data = self._extract_lstm(temp_data[key], window=window, upsampling_times=upsampling_times)
                            check_non_empty = len(result_data.shape) == 3
                        else:
                            result_data = temp_data[key]
                            check_non_empty = len(result_data.shape) == 2
                        if check_non_empty: # Prevent empty data
                            temp_result = result_data if temp_result is None else np.vstack([temp_result, result_data])
                    # Not enough data remains
                    if temp_result is not None:
                        check_empty_result = len(temp_result.shape) != 3 if not flatten else len(temp_result.shape) != 2
                    if temp_result is None or check_empty_result:
                        pbar.set_description(f"{title}\tSkip: Not Enough Data Remain\n")
                        continue

                    # NOTE(review): if the slot is still None and temp_result.shape[1] == 0,
                    # the else branch stacks None with temp_result - confirm this cannot happen.
                    result[to_partition][variation_id] = temp_result if result[to_partition][variation_id] is None and not temp_result.shape[1]==0 \
                                                        else np.vstack([result[to_partition][variation_id], temp_result])
                    video_count[variation_id] += 1 # For partitioning
                # End of the per-gloss instance loop
            pbar.set_description(f'Saving Files...')
            self._save_npy(self.name, gloss, result)




In [26]:
# Extraction settings for this run.
dir_name = 'dataset_standardize(refine2)_boarder_rotation_20'
num_gloss = 100 # If over max gloss, get max gloss, include the letters
progress = 0
flatten = False
flip = [1]
window = 20
random_augmentation_steps = 10
upsampling_times = 0
rotation = True
visibility = False
scale = True
midpt_crop = False
keep_noise_frame = True
random_boarder = True

# Single source of truth for the fetch_data arguments, so every split is
# extracted with exactly the same settings.
fetch_kwargs = dict(
    window=window,
    progress=progress,
    visibility=visibility,
    scale=scale,
    rotation=rotation,
    flip=flip,
    flatten=flatten,
    random_augmentation_steps=random_augmentation_steps,
    keep_noise_frame=keep_noise_frame,
    upsampling_times=upsampling_times,
    midpt_crop=midpt_crop,
    random_boarder=random_boarder,
)

train_ds = dataset(num_gloss,'train',1,dir_name, test=False)
train_ds.fetch_data(**fetch_kwargs)

# The test split is currently not extracted. To enable it:
# test_ds = dataset(num_gloss,'test',1,dir_name)
# test_ds.fetch_data(**fetch_kwargs)

val_ds = dataset(num_gloss,'val',1,dir_name)
val_ds.fetch_data(**fetch_kwargs)

  0%|          | 0/100 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

0it [00:00, ?it/s]

In [27]:
import pandas as pd
import os
from tqdm.notebook import tqdm
import numpy as np
import pickle
from sklearn.preprocessing import StandardScaler
import h5py
from sklearn.preprocessing import OneHotEncoder
def make_directory(path):
    """Create ``path`` together with any missing parents; no-op if it already exists.

    Same helper as the ``make_directory`` defined in an earlier cell of this notebook.
    """
    # exist_ok avoids the race between a separate exists() check and makedirs();
    # `os` is imported at the top of this cell, so no local import is needed.
    os.makedirs(path, exist_ok=True)
# Integrate all meta df
# Get Scaler and Save
def integrate(dir_path, split):
    """Bundle the ``.npy`` samples of one split into per-label HDF5 files and fit a scaler.

    Every directory below ``{dir_path}/{split}`` is scanned for ``*.npy`` files.
    Files are grouped by the part of the file name before the first underscore
    (the variation id) and stacked; each group is written to
    ``{dir_path}/h5/{split}/{gloss}_{variation}.h5`` with the datasets ``data``
    and ``label``, where ``gloss`` is the name of the directory holding the
    files.  Finally a ``StandardScaler`` is fitted on the unique feature rows
    (last axis) of all groups and pickled to ``{dir_path}/{split}_scaler.pkl``.

    Args:
        dir_path: Root directory of the extracted dataset.
        split: Name of the split sub-directory, e.g. ``'train_0'``.

    Raises:
        ValueError: If no ``.npy`` file is found below ``{dir_path}/{split}``.
    """
    split_path = os.path.join(dir_path, split)
    # Divide h5 into multiple files (in the same directory, one per label)
    h5_dir = os.path.join(dir_path, 'h5', split)
    make_directory(h5_dir)

    npy_stack = {}    # variation id -> list of samples of the directory being visited
    flatten_set = []  # unique feature rows per label, used to fit the scaler
    gloss_parts = []
    index_parts = []
    file_prog = tqdm()

    for root, _, files in tqdm(os.walk(split_path)):
        file_prog.reset()
        file_prog.total = len(files)
        for file in files:
            file_prog.update(1)
            # Match on the suffix; a substring test would also pick up names
            # such as 'x.npy.bak'.
            if not file.endswith('.npy'):
                continue
            variation = file.split('_')[0]
            row = np.load(os.path.join(root, file))
            npy_stack.setdefault(variation, []).append(np.expand_dims(row, axis=0))

        if not npy_stack:
            continue

        # basename works with both '/' and '\\' separators; splitting on '\\'
        # only produced the gloss name on Windows.
        gloss = os.path.basename(os.path.normpath(root))
        for variation, samples in npy_stack.items():
            # Construct np array
            gloss_set = np.vstack(samples)

            # Unique feature rows of this label
            flat = np.unique(gloss_set.reshape(-1, gloss_set.shape[-1]), axis=0)
            flatten_set.append(flat)

            # Generate meta
            label_name = f'{gloss}_{variation}'
            gloss_col_temp = np.full(gloss_set.shape[0], label_name)
            print(gloss)

            gloss_parts.append(gloss_col_temp)
            index_parts.append(np.arange(gloss_set.shape[0]))

            # np.string_ was removed in NumPy 2.0; np.bytes_ is the same type.
            # NOTE(review): confirm what np.bytes_ yields for a whole array (one
            # string per row vs. a single scalar) and that readers of 'label'
            # expect that form.
            label_bytes = np.bytes_(gloss_col_temp)
            with h5py.File(os.path.join(h5_dir, f"{label_name}.h5"), "w") as out:
                out.create_dataset('data', data=gloss_set)
                out.create_dataset('label', data=label_bytes)
        npy_stack.clear()

    if not flatten_set:
        # Fail with a clear message instead of an opaque np.vstack error below.
        raise ValueError(f"No .npy samples found under {split_path!r}")

    merge_df = pd.DataFrame()
    merge_df['Gloss'] = np.concatenate(gloss_parts)
    merge_df['Index'] = np.concatenate(index_parts)
    print(merge_df)
    # NOTE(review): writing meta.csv is disabled, yet a later cell reads
    # h5/<split>/meta.csv - confirm where that file is supposed to come from.
#     merge_df.to_csv(os.path.join(h5_dir, f'meta.csv'))

    print("Getting Scaler")
    full_set = np.vstack(flatten_set)
    scaler = StandardScaler()
    scaler.fit(full_set)
    with open(os.path.join(dir_path, f'{split}_scaler.pkl'), 'wb') as to_write:
        pickle.dump(scaler, to_write)
        
# Root of the extracted dataset; later cells read `path` as well.
path = 'np_arrays/dataset_standardize(refine2)_boarder_rotation_20'

# Integrate the splits one after another ('train_1' is currently disabled).
for split_name in ('train_0', 'val_0'):
    integrate(path, split_name)

0it [00:00, ?it/s]

0it [00:00, ?it/s]

yes
      Gloss  Index
0     yes_0      0
1     yes_0      1
2     yes_0      2
3     yes_0      3
4     yes_0      4
...     ...    ...
2677  yes_0   2677
2678  yes_0   2678
2679  yes_0   2679
2680  yes_0   2680
2681  yes_0   2681

[2682 rows x 2 columns]
Getting Scaler


0it [00:00, ?it/s]

0it [00:00, ?it/s]

yes
      Gloss  Index
0     yes_0      0
1     yes_0      1
2     yes_0      2
3     yes_0      3
4     yes_0      4
...     ...    ...
1824  yes_0   1824
1825  yes_0   1825
1826  yes_0   1826
1827  yes_0   1827
1828  yes_0   1828

[1829 rows x 2 columns]
Getting Scaler


In [None]:
# Compare the labels listed in the train and validation meta files.
train_meta = pd.read_csv(os.path.join(path, 'h5', "train_0", 'meta.csv'))
val_meta = pd.read_csv(os.path.join(path, 'h5', "val_0", 'meta.csv'))
label0 = train_meta['Gloss'].unique()
label1 = val_meta['Gloss'].unique()

# Labels that occur in train_0 but not in val_0.
print("label 1 missing")
for train_label in label0:
    if train_label in label1:
        continue
    print(train_label)

# Labels that occur in val_0 but not in train_0.
print("label 0 missing")
for val_label in label1:
    if val_label in label0:
        continue
    print(val_label)

In [30]:
# def add_missing(split):
#     to_delete = ['go_1', 'finish_1' , 'later_0', 'all_0' ,'right_0', 'school_0', 'son_0','cow_1','yes_0']
#     h5_path = os.path.join('np_arrays','dataset_standardize(refine2)_boarder_rotation_20','h5',split)
#     df_origin = pd.read_csv(os.path.join(h5_path,'meta.csv'),index_col=0)
#     df_origin = df_origin[~df_origin['Gloss'].isin(to_delete)]
#     df_append = pd.read_csv(os.path.join(h5_path,'meta_append2.csv'),index_col=0)
#     df_result = pd.concat([df_origin,df_append])
#     df_result.reset_index(inplace=True)
#     df_result.drop(columns=['index'],inplace=True)
#     df_result.to_csv(os.path.join(h5_path,'meta.csv'))
                     
#     display(df_result)
    
# add_missing('train_0')
# add_missing('val_0')

Unnamed: 0,Gloss,Index
0,a_0,0
1,a_0,1
2,a_0,2
3,a_0,3
4,a_0,4
...,...,...
389799,yes_0,2677
389800,yes_0,2678
389801,yes_0,2679
389802,yes_0,2680


Unnamed: 0,Gloss,Index
0,a_0,0
1,a_0,1
2,a_0,2
3,a_0,3
4,a_0,4
...,...,...
149699,yes_0,1824
149700,yes_0,1825
149701,yes_0,1826
149702,yes_0,1827
