# Data Description
**Files**
- `index.json`: a dict of index-character pairs
- `train.npy`: train data, a NumPy float array of shape `(F, 75, 3)`
    - `F` is the total number of frames in the train dataset
    - `75` is the number of skeleton points (face points excluded)
    - `3` is the `[x,y,z]` coordinates
- `train.pickle`: train labels, a dict
    1. `label_list`: a list of labels; each label is a list of character indices
    2. `sequence_id_list`: a list of sequence ids
    3. `start_list`: for each sequence, the index of its first frame in `train.npy`
    4. `length_list`: for each sequence, its number of frames

# Config

In [None]:
import json
import pickle
import pandas as pd
import numpy as np
from tqdm import tqdm

# Data Process

In [None]:
# Load the competition metadata table; later cells read its `path`,
# `sequence_id` and `phrase` columns.
train_df = pd.read_csv("/kaggle/input/asl-fingerspelling/train.csv")
# Last expression of the cell: displays the first rows as a quick sanity check.
train_df.head()

## Character Index

In [None]:
# Load the character -> prediction-index mapping shipped with the dataset.
# A context manager guarantees the file handle is closed after parsing.
with open(
    "/kaggle/input/asl-fingerspelling/character_to_prediction_index.json",
    encoding="utf-8",
) as f:
    char_index = json.load(f)
print(char_index)

# Invert the mapping (index -> character) so predictions can be decoded.
index_char = {index: char for char, index in char_index.items()}
print(index_char)

# Persist the inverse mapping. JSON object keys are always strings, so any
# non-string index (presumably ints here) is written as a string and has to
# be converted back by whoever loads index.json.
with open('index.json', 'w') as f:
    json.dump(index_char, f)

## Sequence List

In [None]:
# Sequence ids as a plain Python list, in the same row order as train.csv.
sequence_id_list = train_df["sequence_id"].tolist()

## Label List

In [None]:
# Encode every phrase as the list of prediction indices of its characters.
# A character missing from char_index raises KeyError.
phrase_list = train_df.phrase.to_list()
label_list = [[char_index[ch] for ch in phrase] for phrase in phrase_list]

## Start List & Length List

In [None]:
# Root folder of the dataset; the `path` values from train.csv are appended
# to it when the parquet files are read.
dataset_path = "/kaggle/input/asl-fingerspelling"

# Name of the parquet file currently held in memory ("" = nothing loaded yet).
cur_file = ""
# Per-sequence start offsets and frame counts, filled by the scan loop.
start_list, len_list = [], []
# Running total of frames seen so far; doubles as the next start offset.
length = 0

In [None]:
# Walk train.csv once and record, for every sequence, how many frames it has
# (len_list) and at which offset its frames start in the flat frame array
# (start_list). `length` is the running frame total and ends up as the
# overall number of frames.
for file_name, sequence_id in tqdm(
    zip(train_df.path, train_df.sequence_id), total=len(train_df)
):
    # Reload only when this row points at a different parquet file than the
    # previous row did.
    if cur_file != file_name:
        # reset_index exposes the index (presumably sequence_id) as a regular
        # column so it can be compared below.
        file_df = pd.read_parquet(f"{dataset_path}/{file_name}").reset_index()
        cur_file = file_name

    # Count matching rows straight from the boolean mask instead of
    # materialising the whole sub-frame just to take its length.
    n_frames = int((file_df.sequence_id == sequence_id).sum())
    len_list.append(n_frames)
    start_list.append(length)
    length += n_frames

In [None]:
# Hard-coded frame count; presumably the total number of frames in the train
# set (the scan loop computes the same quantity as `length`) — TODO confirm.
LENGTH = 10749578
# Number of landmarks stored per frame for each coordinate axis.
LANDMARKS = 543
# Offset of the first landmark that is kept; presumably everything before it
# is a face point — verify against the parquet column order.
HAND_INDEX = 468
# Number of landmarks kept per frame (derived rather than repeated literals).
LANDMARK_LENGTH = LANDMARKS - HAND_INDEX

In [None]:
# Bundle the per-sequence metadata into one dict and pickle it.
dicts = {
    'label_list': label_list,
    'sequence_id_list': sequence_id_list,
    'start_list': start_list,
    'length_list': len_list,
}
with open('train.pickle', 'wb') as f:
    pickle.dump(dicts, f)

In [None]:
# Pre-allocate the output array: one entry per frame (`length` frames in
# total), LANDMARK_LENGTH landmarks per frame, 3 coordinates per landmark.
# NOTE(review): np.zeros defaults to float64; if `length` is around LENGTH
# (~10.7M frames) this is roughly 19 GB. Consider dtype=np.float32 if the
# source data is float32 and downstream code accepts it — confirm first.
train_npy = np.zeros((length, LANDMARK_LENGTH, 3))
train_npy.shape

In [None]:
# Fill train_npy: for every sequence, copy the kept landmarks of each of its
# frames into the slot reserved for it (start offset and frame count come
# from start_list / len_list), one coordinate axis per slice of the last
# dimension.
for start, n_frames, file_name, sequence_id in tqdm(
    zip(start_list, len_list, train_df.path, train_df.sequence_id),
    total=len(train_df),
):
    # Reload only when this row points at a different parquet file than the
    # previous row did.
    if cur_file != file_name:
        file_df = pd.read_parquet(f"{dataset_path}/{file_name}").reset_index()
        cur_file = file_name

    frames_df = file_df[file_df.sequence_id == sequence_id]
    # Drop the two leading non-coordinate columns (presumably sequence_id and
    # frame — verify against the parquet schema).
    frames = np.array(frames_df.iloc[:, 2:])

    # Assumes the remaining columns hold LANDMARKS values per axis, grouped
    # as all x, then all y, then all z — TODO confirm. Within each group the
    # first HAND_INDEX columns are skipped.
    for axis in range(3):
        first = LANDMARKS * axis + HAND_INDEX
        train_npy[start:start + n_frames, :, axis] = frames[:, first:first + LANDMARK_LENGTH]

print(train_npy.shape)

In [None]:
# Write the assembled landmark array to train.npy in the working directory.
np.save('train.npy', train_npy)