In [1]:
import pandas as pd 
import os
import json
import csv
import gzip
import pickle
# Raise the csv module's per-field size cap to 10 MiB so very long "Value"
# cells can be read back; the call evaluates to the previous limit.
csv.field_size_limit(10 * 2 ** 20)

131072

In [2]:

def load_csv_to_dict(csv_file):
    """Load a ``Key``/``Value`` CSV into a dict of decoded JSON values.

    Each row's ``Value`` cell is decoded with ``json.loads``. If that fails,
    a second attempt is made after replacing every single quote with a
    double quote (a crude repair for values written with single-quoted
    strings). A row that still cannot be decoded is reported on stdout and
    skipped; all other rows are still loaded.

    Args:
        csv_file: Path to a UTF-8 CSV file whose header contains the
            ``Key`` and ``Value`` columns.

    Returns:
        dict mapping each row's ``Key`` to its decoded ``Value``. When a key
        occurs more than once, the last decodable row wins.

    Raises:
        KeyError: If the ``Key`` or ``Value`` column is missing.
    """
    data = {}
    # newline='' leaves newline handling to the csv module, as its docs advise.
    with open(csv_file, 'r', encoding='utf-8', newline='') as f:
        for row in csv.DictReader(f):
            key = row['Key']
            raw_value = row['Value']
            try:
                # First try the cell exactly as stored.
                value = json.loads(raw_value)
            except json.JSONDecodeError:
                # Fallback: swap single quotes for double quotes and retry.
                # This also rewrites apostrophes inside text, so it can fail.
                corrected_value = raw_value.replace("'", '"')
                try:
                    value = json.loads(corrected_value)
                except json.JSONDecodeError:
                    print(f"无法解析行：{corrected_value}")
                    # Skip only this row; stopping the loop here would
                    # silently drop every later (valid) row as well.
                    continue
            data[key] = value
    return data



def save_dict_to_gz(data, filename):
    """Pickle ``data`` into ``filename`` as a gzip-compressed file.

    Args:
        data: Object to serialise (the notebook passes a dict).
        filename: Destination path; it is opened for binary writing, so an
            existing file is overwritten.
    """
    with gzip.open(filename, mode='wb') as gz_stream:
        pickle.dump(data, gz_stream)
    
def _relative_frame_paths(folder, rel_prefix, extension):
    """List files in ``folder`` ending with ``extension`` as sorted, '/'-separated paths under ``rel_prefix``."""
    # NOTE(review): the sort is lexicographic on the file name, so frame
    # order is only chronological if the names are zero-padded -- confirm.
    names = sorted(name for name in os.listdir(folder) if name.endswith(extension))
    return [os.path.join(rel_prefix, name).replace('\\', '/') for name in names]


def collect_frame_paths(dtype, data):
    """Index the extracted frames of one split and write them to ``./{dtype}.csv``.

    For every video directory under ``./how2sign/frame/{dtype}/`` the ``.jpg``
    files are listed, together with the ``.png`` files of the directory with
    the same name under ``./how2sign/frame/key/{dtype}/``. The sentence is
    taken from the first row of ``data`` whose ``SENTENCE_NAME`` equals the
    directory name. Videos without such a row are reported on stdout and
    skipped.

    The output CSV has the columns ``Key`` (``"{dtype}/{video}"``) and
    ``Value``, a JSON object with the fields ``name``, ``gloss`` (always
    empty), ``text``, ``length`` (number of ``.jpg`` frames), ``imgs_path``
    and ``kps_path``. All paths use '/' separators.

    Args:
        dtype: Split name used in the folder names, the path prefixes and
            the output file name (the notebook passes 'train', 'val', 'test').
        data: pandas DataFrame with ``SENTENCE_NAME`` and ``SENTENCE`` columns.

    Raises:
        FileNotFoundError: If the split's frame folder, or the key folder of
            a matched video, does not exist.
        KeyError: If ``data`` lacks one of the two required columns.
    """
    output_base_folder = f"./how2sign/frame/{dtype}/"
    key_base_folder = f"./how2sign/frame/key/{dtype}/"
    prefix = f"{dtype}"
    kprefix = f"key/{dtype}"

    # Build the name -> sentence lookup once instead of scanning the whole
    # frame per video; setdefault keeps the first row for duplicated names.
    sentence_by_name = {}
    for name, sentence in zip(data["SENTENCE_NAME"].values, data["SENTENCE"].values):
        sentence_by_name.setdefault(name, sentence)

    rows = []
    for video_name in os.listdir(output_base_folder):
        video_folder = os.path.join(output_base_folder, video_name)
        if not os.path.isdir(video_folder):
            # Stray files next to the video folders cannot be listed.
            continue
        if video_name not in sentence_by_name:
            print(f"Warning: No matching record found for video {video_name}")
            continue

        key_folder = os.path.join(key_base_folder, video_name)
        imgs_path = _relative_frame_paths(
            video_folder, os.path.join(prefix, video_name), '.jpg')
        kps_path = _relative_frame_paths(
            key_folder, os.path.join(kprefix, video_name), '.png')

        video_name_with_prefix = os.path.join(prefix, video_name).replace('\\', '/')
        value = {
            "name": video_name_with_prefix,
            "gloss": "",  # placeholder; add further annotation here if available
            "text": sentence_by_name[video_name],
            "length": len(imgs_path),
            "imgs_path": imgs_path,
            "kps_path": kps_path,
        }
        rows.append([video_name_with_prefix, json.dumps(value, ensure_ascii=False)])

    output_csv = f'./{dtype}.csv'
    os.makedirs(os.path.dirname(output_csv), exist_ok=True)

    # Write once through pandas so the JSON cell is quoted as a single CSV
    # field; no hand-written "key,value" lines that must be re-parsed.
    labels_df = pd.DataFrame(rows, columns=['Key', 'Value'])
    labels_df.to_csv(output_csv, index=False, encoding='utf-8')

def escape_string(s):
    """Return ``s`` with a backslash inserted before every single quote."""
    single_quote = "'"
    escaped_quote = "\\'"
    return s.replace(single_quote, escaped_quote)

In [3]:
# Load the per-split sentence tables; collect_frame_paths reads their
# SENTENCE_NAME and SENTENCE columns.
train_data = pd.read_csv('./filter_train_Data.csv')
val_data = pd.read_csv('./filter_val_Data.csv')
test_data = pd.read_csv('./filter_test_Data.csv')

In [4]:
# Validation split: write ./val.csv, parse it back into a dict, and pickle
# that dict (gzip-compressed) as 'labels.dev'.
collect_frame_paths('val', val_data)
data_dict = load_csv_to_dict('./val.csv')
save_dict_to_gz(data_dict, 'labels.dev')


In [5]:
# Training split: write ./train.csv, parse it back into a dict, and pickle
# that dict (gzip-compressed) as 'labels.train'.
collect_frame_paths('train', train_data)
data_dict = load_csv_to_dict('./train.csv')
save_dict_to_gz(data_dict, 'labels.train')


In [6]:
# Test split: write ./test.csv, parse it back into a dict, and pickle that
# dict (gzip-compressed) as 'labels.test'.
collect_frame_paths('test', test_data)
data_dict = load_csv_to_dict('./test.csv')
save_dict_to_gz(data_dict, 'labels.test')

In [7]:
# Sanity check: display the generated training CSV (Key / Value columns).
pd.read_csv('./train.csv')

Unnamed: 0,Key,Value
0,train/fZK9hW81XCU_10-10-rgb_front,"{""name"": ""train/fZK9hW81XCU_10-10-rgb_front"", ..."
1,train/fZK9hW81XCU_9-10-rgb_front,"{""name"": ""train/fZK9hW81XCU_9-10-rgb_front"", ""..."
2,train/fZM3IcM2Xs4_0-10-rgb_front,"{""name"": ""train/fZM3IcM2Xs4_0-10-rgb_front"", ""..."
3,train/fZM3IcM2Xs4_10_11_12_13-10-rgb_front,"{""name"": ""train/fZM3IcM2Xs4_10_11_12_13-10-rgb..."
4,train/fZM3IcM2Xs4_14_15_16-10-rgb_front,"{""name"": ""train/fZM3IcM2Xs4_14_15_16-10-rgb_fr..."
5,train/fZM3IcM2Xs4_17-10-rgb_front,"{""name"": ""train/fZM3IcM2Xs4_17-10-rgb_front"", ..."
6,train/fZM3IcM2Xs4_2-10-rgb_front,"{""name"": ""train/fZM3IcM2Xs4_2-10-rgb_front"", ""..."
7,train/fZM3IcM2Xs4_3-10-rgb_front,"{""name"": ""train/fZM3IcM2Xs4_3-10-rgb_front"", ""..."
8,train/fZM3IcM2Xs4_4-10-rgb_front,"{""name"": ""train/fZM3IcM2Xs4_4-10-rgb_front"", ""..."
9,train/fZM3IcM2Xs4_5-10-rgb_front,"{""name"": ""train/fZM3IcM2Xs4_5-10-rgb_front"", ""..."
