Create TFRecord files, split into 6 folds.

Include only the landmarks of the left/right hands, the left/right upper-body pose (pose landmarks 11–22), and the lips.

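Each phrase is encoded into a sequence of integer character indices (stored in an `int64_list` feature; they can be cast to int32 when parsing).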

Since the same phrase can be signed by several different people, the data is split by phrase so that all videos of the same phrase are stored in the same fold.

In [1]:
import numpy as np 
import pandas as pd 
import os
import tensorflow as tf
import json
from tqdm.notebook import tqdm
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import json
# Load the character -> prediction-index vocabulary shipped with the dataset,
# then derive the reverse (index -> character) lookup from it.
with open("/kaggle/input/asl-fingerspelling/character_to_prediction_index.json", "r") as vocab_file:
    char2id = json.load(vocab_file)
id2char = dict((index, character) for character, index in char2id.items())

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [2]:
# Sequence-level metadata: one row per signed sequence, giving the parquet file
# that holds its landmarks (path), its sequence_id, the participant and the phrase.
train_csv_path = '/kaggle/input/asl-fingerspelling/train.csv'
df = pd.read_csv(train_csv_path)
df.head()

Unnamed: 0,path,file_id,sequence_id,participant_id,phrase
0,train_landmarks/5414471.parquet,5414471,1816796431,217,3 creekhouse
1,train_landmarks/5414471.parquet,5414471,1816825349,107,scales/kuhaylah
2,train_landmarks/5414471.parquet,5414471,1816909464,1,1383 william lanier
3,train_landmarks/5414471.parquet,5414471,1816967051,63,988 franklin lane
4,train_landmarks/5414471.parquet,5414471,1817123330,89,6920 northeast 661st road


In [3]:
def _xyz_columns(group, landmark_ids):
    """Return the parquet column names of ``group`` for the given landmark ids.

    Names are landmark-major: x, y, z of the first landmark, then x, y, z of the
    next one, and so on.
    """
    return [f'{axis}_{group}_{landmark}' for landmark in landmark_ids for axis in 'xyz']


# Face-mesh landmark ids kept for the lips.
_LIP_LANDMARK_IDS = [
    61, 185, 40, 39, 37, 0, 267, 269, 270, 409,
    291, 146, 91, 181, 84, 17, 314, 405, 321, 375,
    78, 191, 80, 81, 82, 13, 312, 311, 310, 415,
    95, 88, 178, 87, 14, 317, 402, 318, 324, 308,
]

RIGHT_HAND_COLS = _xyz_columns('right_hand', range(21))
LEFT_HAND_COLS = _xyz_columns('left_hand', range(21))
LIP_COLS = _xyz_columns('face', _LIP_LANDMARK_IDS)
# Odd pose ids go to the "left" group, even pose ids to the "right" group.
LPOSE_COLS = _xyz_columns('pose', [11, 13, 15, 17, 19, 21])
RPOSE_COLS = _xyz_columns('pose', [12, 14, 16, 18, 20, 22])

LIST_COLS_LIST = [RIGHT_HAND_COLS, LEFT_HAND_COLS, LIP_COLS, LPOSE_COLS, RPOSE_COLS]
# Every selected column, flattened in the same group order as LIST_COLS_LIST.
SEL_COLS = [column for group_cols in LIST_COLS_LIST for column in group_cols]

# Load the character -> prediction-index vocabulary used to encode phrases.
# The context manager closes the file even if json.load raises.
with open('/kaggle/input/asl-fingerspelling/character_to_prediction_index.json') as f:
    char2id = json.load(f)

In [7]:
# One row per distinct phrase, in order of first appearance in df; this is the
# table the folds are assigned on so that a phrase never spans two folds.
distinct_phrases = df['phrase'].unique()
phrase_df = pd.DataFrame({'phrase': distinct_phrases})
phrase_df.head()

Unnamed: 0,phrase
0,3 creekhouse
1,scales/kuhaylah
2,1383 william lanier
3,988 franklin lane
4,6920 northeast 661st road


In [9]:
# Sanity check: the phrase table has exactly one row per distinct phrase in df
# (the two printed numbers should match).
num_phrase_rows = len(phrase_df)
num_distinct_phrases = df['phrase'].nunique()
print(num_phrase_rows)
print(num_distinct_phrases)

46478
46478


In [10]:
# Assign every distinct phrase to exactly one of 6 folds (shuffled with a fixed seed).
kfold = KFold(n_splits=6, shuffle=True, random_state=42)

# KFold.split yields *positional* indices, so the assignment is done by position
# on a plain array. Writing them through label-based .loc is only correct while
# phrase_df happens to carry a default RangeIndex.
fold_of_phrase = np.full(len(phrase_df), -1, dtype=np.int64)
for fold_idx, (train_idx, val_idx) in enumerate(kfold.split(phrase_df.index)):
    fold_of_phrase[val_idx] = fold_idx
phrase_df['fold'] = fold_of_phrase

# Every phrase must have landed in the validation split of exactly one fold.
assert (phrase_df['fold']==-1).sum()==0
phrase_df.head(10)

Unnamed: 0,phrase,fold
0,3 creekhouse,1
1,scales/kuhaylah,0
2,1383 william lanier,4
3,988 franklin lane,3
4,6920 northeast 661st road,0
5,www.freem.ne.jp,4
6,https://jsi.is/hukuoka,2
7,239613 stolze street,0
8,242-197-6202,2
9,271097 bayshore boulevard,5


In [11]:
# Attach each sequence's fold by joining on its phrase.
# Dropping a pre-existing 'fold' column first keeps this cell idempotent: re-running
# it would otherwise yield 'fold_x'/'fold_y' columns and break the later df['fold'] filter.
# validate='many_to_one' makes pandas raise if a phrase ever appears twice in phrase_df,
# which would silently duplicate sequences.
df = df.drop(columns='fold', errors='ignore').merge(phrase_df, on='phrase', validate='many_to_one')
df.head()

Unnamed: 0,path,file_id,sequence_id,participant_id,phrase,fold
0,train_landmarks/5414471.parquet,5414471,1816796431,217,3 creekhouse,1
1,train_landmarks/5414471.parquet,5414471,1816825349,107,scales/kuhaylah,0
2,train_landmarks/175396851.parquet,175396851,105856225,121,scales/kuhaylah,0
3,train_landmarks/5414471.parquet,5414471,1816909464,1,1383 william lanier,4
4,train_landmarks/1134756332.parquet,1134756332,137139884,236,1383 william lanier,4


In [14]:
# All fold files are written GZIP-compressed at the maximum compression level.
write_options = tf.io.TFRecordOptions(compression_type='GZIP', compression_level=9)


def _landmark_feature(lm_df, cols):
    """Pack the ``cols`` landmarks of one sequence into a bytes ``tf.train.Feature``.

    The values are viewed as (frames, len(cols)//3, 3) and serialized with
    ``tobytes()`` in whatever dtype the parquet columns have, so the reader has to
    decode with that same dtype (presumably float32 -- TODO confirm against the data).
    """
    coords = lm_df[cols].values.reshape(-1, len(cols)//3, 3)
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[coords.tobytes()]))


def write_fold(df, file_name, data_dir='/kaggle/input/asl-fingerspelling/'):
    """Write every sequence listed in ``df`` to one compressed TFRecord file.

    Each record holds five bytes features ('right_hand', 'left_hand', 'left_pose',
    'right_pose', 'lip') with the raw landmark values of the sequence, plus
    'encoded_phrase', the phrase's characters mapped through ``char2id`` and stored
    as an int64 list.

    Args:
        df: Metadata rows to export; must provide the 'path', 'sequence_id' and
            'phrase' columns.
        file_name: Destination path of the TFRecord file.
        data_dir: Directory that the values of ``df['path']`` are relative to.

    Prints the file name and the number of records written.
    """
    # Phrase of the first row of each sequence_id, built once so the inner loop
    # does a dict lookup instead of re-scanning the whole DataFrame per sequence.
    first_rows = df.drop_duplicates('sequence_id')
    phrase_by_sid = dict(zip(first_rows['sequence_id'], first_rows['phrase']))

    num_samples = 0
    with tf.io.TFRecordWriter(file_name,options=write_options) as file_writer:
        for file in tqdm(df.path.unique()):
            # Read each parquet file once, restricted to the selected landmark columns.
            pq_df = pd.read_parquet(os.path.join(data_dir, file), columns=SEL_COLS)
            for sid in df.loc[df['path']==file, 'sequence_id'].unique():
                phrase = phrase_by_sid[sid]
                # Plain Python ints are what Int64List expects; no tensor round-trip needed.
                encoded_phrase = [char2id[char] for char in phrase]
                # The parquet index carries the sequence id; keep only this sequence's frames.
                lm_df = pq_df[pq_df.index==sid]

                record_bytes = tf.train.Example(features=tf.train.Features(feature={
                    'right_hand': _landmark_feature(lm_df, RIGHT_HAND_COLS),
                    'left_hand': _landmark_feature(lm_df, LEFT_HAND_COLS),
                    'left_pose': _landmark_feature(lm_df, LPOSE_COLS),
                    'right_pose': _landmark_feature(lm_df, RPOSE_COLS),
                    'lip': _landmark_feature(lm_df, LIP_COLS),
                    'encoded_phrase': tf.train.Feature(int64_list=tf.train.Int64List(value=encoded_phrase)),
                    })).SerializeToString()

                file_writer.write(record_bytes)
                num_samples+=1
    print(file_name,f'has {num_samples} samples')

In [None]:
# Export one TFRecord file per fold: fold0.tfrecords ... fold5.tfrecords.
for fold_id in range(6):
    fold_rows = df[df['fold']==fold_id]
    write_fold(fold_rows, f'fold{fold_id}.tfrecords')

In [5]:
# Count the sequences that have no usable frame at all, i.e. every right-hand and
# left-hand value is NaN in every frame of the sequence.
hand_cols = RIGHT_HAND_COLS+LEFT_HAND_COLS
no_good_frame = 0
for parquet_path in tqdm(df.path.unique()):
    hands_df = pd.read_parquet('/kaggle/input/asl-fingerspelling/'+parquet_path,
                               columns = hand_cols)
    sequence_ids = df[df['path']==parquet_path]['sequence_id'].unique()
    for seq_id in sequence_ids:
        frames = hands_df[hands_df.index==seq_id].values
        # A frame is "good" when at least one hand coordinate is present.
        frame_is_good = (~np.isnan(frames)).any(axis=1)
        if not frame_is_good.any():
            no_good_frame += 1

print(no_good_frame)

  0%|          | 0/68 [00:00<?, ?it/s]

0


In [4]:
# Find the largest number of frames any sequence keeps once the frames whose
# hand landmarks are entirely NaN are dropped.
hand_cols = RIGHT_HAND_COLS+LEFT_HAND_COLS
max_good_frame = 0
for parquet_path in tqdm(df.path.unique()):
    hands_df = pd.read_parquet('/kaggle/input/asl-fingerspelling/'+parquet_path,
                               columns = hand_cols)
    sequence_ids = df[df['path']==parquet_path]['sequence_id'].unique()
    for seq_id in sequence_ids:
        frames = hands_df[hands_df.index==seq_id].values
        # Frames with at least one non-NaN hand coordinate count as good.
        good_frames = np.sum(np.any(~np.isnan(frames),axis=1))
        if good_frames > max_good_frame:
            max_good_frame = good_frames

print(max_good_frame)


  0%|          | 0/68 [00:00<?, ?it/s]

598
