# Data Cleaning
Unify the data format, remove any sensitive information from data

### Environment Setup

In [5]:
import os
import sys
from PIL import Image
import pillow_heif
from concurrent.futures import ThreadPoolExecutor

data_path = 'C:\\Users\\tfgmo\\OneDrive - Virginia Tech\\Mahjong CV\\mahjong_data'
output_path = 'C:\\Users\\tfgmo\\OneDrive - Virginia Tech\\Mahjong CV\\cleaned_data'

In [6]:
# 注册 HEIF 解码器
pillow_heif.register_heif_opener()

def process_image(input_path, output_path, index):
    try:
        image = Image.open(input_path)
        # 移除元数据
        image_without_metadata = Image.new(image.mode, image.size)
        image_without_metadata.putdata(list(image.getdata()))
        # 保存为 PNG 格式，使用规范化文件名
        output_filename = f"{index:06}.png"
        output_full_path = os.path.join(output_path, output_filename)
        image_without_metadata.save(output_full_path, 'PNG')
        print(f"已处理文件：{input_path} -> {output_full_path}")
    except Exception as e:
        print(f"处理文件时出错：{input_path}，错误信息：{e}")

def normalize_and_convert(input_dir, output_dir):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # 收集所有待处理文件
    files = []
    for root, dirs, filenames in os.walk(input_dir):
        for filename in filenames:
            if filename.lower().endswith(('.jpg', '.jpeg', '.heic', '.png')):
                files.append(os.path.join(root, filename))

    # 使用多线程处理
    with ThreadPoolExecutor() as executor:
        for index, file_path in enumerate(files, start=1):
            executor.submit(process_image, file_path, output_dir, index)


normalize_and_convert(data_path, output_path)

已处理文件：C:\Users\tfgmo\OneDrive - Virginia Tech\Mahjong CV\mahjong_data\16_IMG_6528.jpeg -> C:\Users\tfgmo\OneDrive - Virginia Tech\Mahjong CV\cleaned_data\000012.png
已处理文件：C:\Users\tfgmo\OneDrive - Virginia Tech\Mahjong CV\mahjong_data\12_mmexport1723838944712.jpg -> C:\Users\tfgmo\OneDrive - Virginia Tech\Mahjong CV\cleaned_data\000004.png
已处理文件：C:\Users\tfgmo\OneDrive - Virginia Tech\Mahjong CV\mahjong_data\16_e97f4fa54a6a7786dafa8c5683943c38.jpeg -> C:\Users\tfgmo\OneDrive - Virginia Tech\Mahjong CV\cleaned_data\000011.png
已处理文件：C:\Users\tfgmo\OneDrive - Virginia Tech\Mahjong CV\mahjong_data\10_mmexport1729156073387.jpg -> C:\Users\tfgmo\OneDrive - Virginia Tech\Mahjong CV\cleaned_data\000001.png
已处理文件：C:\Users\tfgmo\OneDrive - Virginia Tech\Mahjong CV\mahjong_data\16_IMG_6577.jpeg -> C:\Users\tfgmo\OneDrive - Virginia Tech\Mahjong CV\cleaned_data\000013.png
已处理文件：C:\Users\tfgmo\OneDrive - Virginia Tech\Mahjong CV\mahjong_data\18_IMG_20210228_140245.jpg -> C:\Users\tfgmo\OneDrive - V