# 从网盘中下载数据

下载数据使用的脚本是 copy_image_data_to_local.py。

# 数据转换成训练数据

## 首先根据类别情况转换成正规的类别

In [1]:
import csv
import json


def get_all_age_gender_set(file_name):
    """Collect the distinct raw age and gender annotations of an export file.

    Args:
        file_name: Path to a JSON file containing a sequence of annotation
            records; every record is indexed with the keys '年龄' (age) and
            '性别' (gender).

    Returns:
        A tuple ``(age_set, gender_set)`` holding the distinct values found.

    Raises:
        KeyError: If a record lacks '年龄' or '性别'.
    """
    # Decode as UTF-8 explicitly: the export holds Chinese text, and relying on
    # the locale's default encoding fails on systems that are not UTF-8.
    with open(file_name, 'r', encoding='utf-8') as f:
        animal_action = json.load(f)
    print(f'total image num: {len(animal_action)}')

    age_set = set()
    gender_set = set()
    for item in animal_action:
        age_set.add(item['年龄'])
        gender_set.add(item['性别'])
    return age_set, gender_set


def get_clean_species_statistic_info(file_name, output_file="species_statistic_info_new.csv"):
    """Normalize a species statistics CSV and write the merged label table.

    The first line of ``file_name`` is skipped as a header. Every other
    non-blank line is split on ',' and must yield at least six fields; fields
    3-6 are read as species, gender, age and count. Two normalizations are
    applied before counts are merged per (species, gender, age):

    * rows whose age is '幼体' get their gender forced to '无法区分';
    * species containing '指名亚种' are cut down to the text before the
      first space.

    The merged rows are written to ``output_file`` sorted by species, gender
    and age, with a running ID starting at 1 and a label made of the species
    followed by the gender and age when those are not '无法区分'.

    Args:
        file_name: Path of the input CSV.
        output_file: Path of the CSV to write. Defaults to the file name the
            function has always used (relative to the working directory).

    Raises:
        ValueError: If a count field is not an integer.
    """
    unknown = "无法区分"
    species_map_to_count = {}
    # Explicit UTF-8: the table holds Chinese text and must not depend on the
    # locale's default encoding.
    with open(file_name, 'r', encoding='utf-8') as file:
        next(file, None)  # skip the header row; an empty file is tolerated
        for line in file:
            line = line.strip()
            if not line:
                continue
            data = line.split(",")
            if len(data) < 6:
                print(f"error format {line}")
                continue
            _, _, species, gender, age, count = data[:6]
            if age == "幼体" and gender != unknown:
                print(f"{species} age is 幼体 need transfer gender {gender} to 无法区分")
                gender = unknown
            if "指名亚种" in species:
                base_species = species.split(" ")[0]
                print(f"orig species {species},{gender},{age} transfer to {base_species},{gender},{age}")
                species = base_species
            key = (species, gender, age)
            species_map_to_count[key] = species_map_to_count.get(key, 0) + int(count)

    # Tuple keys sort by species, then gender, then age.
    sorted_rows = sorted(species_map_to_count.items(), key=lambda entry: entry[0])

    total_label_num = 0
    with open(output_file, "w", newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["ID", "标签", "物种", "性别", "年龄", '数量'])
        for row_id, ((species, gender, age), value) in enumerate(sorted_rows, start=1):
            species_detail = species
            if gender != unknown:
                species_detail += gender
            if age != unknown:
                species_detail += age
            writer.writerow([row_id, species_detail, species, gender, age, value])
            total_label_num += value
    print(f"total_label_num is {total_label_num}")

In [2]:
# Inspect every raw age and gender annotation value present in the image export
file_name = "/home/yuzhong/nndata/export/20240428/物种图片1.json"
age_set, gender_set = get_all_age_gender_set(file_name)
print("年龄标注情况:", age_set)
print("性别标注情况:", gender_set)

total image num: 643250
年龄标注情况: {'', '成年;亚成体', '亚成体', '成年', '幼体'}
性别标注情况: {'', '难以鉴定', '雄，另外一头不确定', '群体', '雌带幼仔', '难以辨认', '幼', '雄', '雌雄都有', '幼仔', '雌', '雄+', '幼体'}


In [3]:
# Raw gender annotation -> normalized gender label.
# Any raw value that is not a key of this map is treated as '无法区分' (indistinguishable).
gender_map = {"雄": "雄性", "雄+": "雄性", '雌带幼仔': '雌性', "雌": "雌性", "雄，另外一头不确定": "雄性", '': '无法区分'}
# Raw age annotation -> normalized age label.
# Any raw value that is not a key of this map is treated as '无法区分' (indistinguishable).
age_map = {'': '无法区分' ,'成年': '成体', '亚成体': '亚成体', '幼体': "幼体", '成年;亚成体': '成体'}

In [4]:
def check_label_data(file_name, species_to_another_species, species_map_to_count):
    """Validate a label code table and collect the label renames it implies.

    The first line of ``file_name`` is skipped as a header. Every other
    non-blank line is split on ',' and must yield at least five fields, read
    as ID, label, species, gender and age. For each row:

    * the key 'species,gender,age' is registered in ``species_map_to_count``
      with a count of 0;
    * when gender and age are both '无法区分' and the label differs from the
      species, the rename label -> species is recorded;
    * a message is printed when a known gender or age does not occur as a
      substring of the label.

    Args:
        file_name: Path of the code-table CSV.
        species_to_another_species: Dict updated in place with label -> species.
        species_map_to_count: Dict updated in place; each key found is set to 0.
    """
    unknown = "无法区分"
    # Explicit UTF-8: the table holds Chinese text and must not depend on the
    # locale's default encoding.
    with open(file_name, 'r', encoding='utf-8') as file:
        next(file, None)  # skip the header row; an empty file is tolerated
        for line in file:
            line = line.strip()
            if not line:
                continue
            data = line.split(",")
            if len(data) < 5:
                print(f"error format {line}")
                continue
            row_id, label, species, gender, age = data[:5]
            species_map_to_count[species + ',' + gender + ',' + age] = 0
            if gender == unknown and age == unknown and label != species:
                print(f"ID {row_id},species {label} to {species}")
                species_to_another_species[label] = species
            if gender != unknown and gender not in label:
                print(f"ID {row_id},species name {label} not macth gender {gender}")
            if age != unknown and age not in label:
                # Report the label, which is the value actually checked, just
                # like the gender message above (previously the species column
                # was printed here).
                print(f"ID {row_id},species name {label} not macth age {age}")

In [5]:
# Label -> species renames, filled in by check_label_data.
species_to_another_species = {}
# 'species,gender,age' -> count; check_label_data registers each code-table entry with 0.
species_map_to_count = {}
# Check the dataset's classes against the expert-provided code tables and collect the classes that must be converted to another class
check_label_data("/home/yuzhong/data2/code/object_detection/process_data/物种分类码表-野生动物.csv", species_to_another_species, species_map_to_count)
check_label_data("/home/yuzhong/data2/code/object_detection/process_data/物种分类码表-家养动物.csv", species_to_another_species, species_map_to_count)

ID 253,species name 熊猴成体 not macth gender 雄性
ID 387,species name 灰头鸫 not macth age 幼体
ID 472,species name 北树鼩亚成体 not macth gender 雄性
ID 1635,species 鬣羚 to 中华鬣羚
ID 1643,species 斑羚 to 中华斑羚
ID D095,species 牦牛 to 家牦牛


In [6]:
# Display the label -> species renames collected from the code tables
species_to_another_species

{'鬣羚': '中华鬣羚', '斑羚': '中华斑羚', '牦牛': '家牦牛'}

In [14]:
def get_species_num(file_name, species_to_another_species, species_map_to_count, gender_map, age_map, not_found_species):
    """Count annotated images per normalized 'species,gender,age' key.

    For every record of the JSON export the species, gender and age are
    normalized as follows:

    * a species containing '指名亚种' is cut down to the text before the
      first space, then renamed through ``species_to_another_species``;
    * the age is mapped through ``age_map`` ('无法区分' when unmapped);
    * the gender is forced to '无法区分' when the mapped age is '幼体',
      then mapped through ``gender_map`` ('无法区分' when unmapped).

    Args:
        file_name: Path to a JSON file containing a sequence of records
            indexed with the keys '年龄', '性别' and '物种名称'.
        species_to_another_species: Dict of species renames.
        species_map_to_count: Dict updated in place; the count of each key
            seen is incremented, and unseen keys are created.
        gender_map: Dict from raw gender annotation to normalized gender.
        age_map: Dict from raw age annotation to normalized age.
        not_found_species: Set updated in place with every key that was not
            already present in ``species_map_to_count``.

    Returns:
        The set of normalized species names encountered.

    Raises:
        KeyError: If a record lacks one of the three required keys.
    """
    unknown = "无法区分"
    # Decode as UTF-8 explicitly: the export holds Chinese text, and relying on
    # the locale's default encoding fails on systems that are not UTF-8.
    with open(file_name, 'r', encoding='utf-8') as f:
        animal_action = json.load(f)
    print(f'total image num: {len(animal_action)}')

    all_species = set()
    for item in animal_action:
        age = item['年龄']
        gender = item['性别']
        species = item['物种名称']
        if "指名亚种" in species:
            old_species = species
            species = species.split(" ")[0]
            print(f"orig species {old_species} transfer to {species}")
        species = species_to_another_species.get(species, species)
        all_species.add(species)

        age = age_map.get(age, unknown)
        if age == "幼体":
            # Gender is not recorded for juveniles; it is reset before the
            # gender_map lookup, exactly as the original two-step logic did.
            gender = unknown
        gender = gender_map.get(gender, unknown)

        key = species + ',' + gender + ',' + age
        # Keys present in the annotations but missing from the expert table
        # are still kept as training data; they are only recorded as missing.
        if key not in species_map_to_count:
            not_found_species.add(key)
            species_map_to_count[key] = 0
        species_map_to_count[key] += 1
    return all_species

def species_statistic_info_to_csv(species_map_to_count, filter_num=0,
                                  csv_path="species_statistic_info_240916.csv",
                                  txt_path="species_name_info_240916.txt"):
    """Write the per-label statistics CSV and the index -> label name list.

    Keys of ``species_map_to_count`` have the form 'species,gender,age'. Rows
    are emitted sorted by species, gender and age. A row is skipped when its
    key is empty, its count is below ``filter_num``, or its species is '' or
    '不认识'. The label is the species followed by the gender and age when
    those are not '无法区分'. CSV IDs start at 1; the text file uses ID - 1.

    Args:
        species_map_to_count: Dict from 'species,gender,age' to image count.
        filter_num: Minimum count a label needs in order to be written.
        csv_path: Path of the statistics CSV. Defaults to the file name the
            function has always used (relative to the working directory).
        txt_path: Path of the '<index>: <label>' text file, same default rule.
    """
    unknown = "无法区分"
    parsed_rows = []
    for key, value in species_map_to_count.items():
        if key == "":
            continue
        # Split from the right so a comma inside the species name stays in the
        # species field instead of breaking the three-way unpacking.
        fields = key.rsplit(",", 2)
        if len(fields) != 3:
            # Previously such a key crashed the sort with IndexError.
            print(f"error format {key}")
            continue
        parsed_rows.append((fields[0], fields[1], fields[2], value))
    parsed_rows.sort(key=lambda row: row[:3])

    with open(csv_path, "w", newline='', encoding='utf-8') as csvfile, \
            open(txt_path, "w", encoding='utf-8') as txtfile:
        writer = csv.writer(csvfile)
        writer.writerow(["ID", "标签", "物种", "性别", "年龄", '数量'])
        row_id = 1
        for species, gender, age, value in parsed_rows:
            if value < filter_num:
                continue
            if species == '' or species == "不认识":
                continue
            species_detail = species
            if gender != unknown:
                species_detail += gender
            if age != unknown:
                species_detail += age
            writer.writerow([row_id, species_detail, species, gender, age, value])
            txtfile.write(f"{row_id - 1}: {species_detail}\n")
            row_id += 1

In [8]:
# Keys found in the annotations that were not yet present in species_map_to_count.
not_found_animal = set()
# Count the images of the export per normalized 'species,gender,age' key
file_name = "/home/yuzhong/nndata/export/20240428/物种图片1.json"
all_species = get_species_num(file_name, species_to_another_species, species_map_to_count, gender_map, age_map, not_found_animal)

total image num: 643250
orig species 眼纹噪鹛 眼纹噪鹛指名亚种 transfer to 眼纹噪鹛
orig species 白鹇 白鹇指名亚种 transfer to 白鹇
orig species 白鹇 白鹇指名亚种 transfer to 白鹇
orig species 白鹇 白鹇指名亚种 transfer to 白鹇
orig species 白鹇 白鹇指名亚种 transfer to 白鹇
orig species 白鹇 白鹇指名亚种 transfer to 白鹇
orig species 白鹇 白鹇指名亚种 transfer to 白鹇
orig species 橙翅噪鹛 橙翅噪鹛指名亚种 transfer to 橙翅噪鹛
orig species 橙翅噪鹛 橙翅噪鹛指名亚种 transfer to 橙翅噪鹛
orig species 灰头鸫 灰头鸫指名亚种 transfer to 灰头鸫
orig species 灰头鸫 灰头鸫指名亚种 transfer to 灰头鸫
orig species 灰头鸫 灰头鸫指名亚种 transfer to 灰头鸫
orig species 灰头鸫 灰头鸫指名亚种 transfer to 灰头鸫
orig species 白鹇 白鹇指名亚种 transfer to 白鹇
orig species 黑鹇 黑鹇指名亚种 transfer to 黑鹇
orig species 黑鹇 黑鹇指名亚种 transfer to 黑鹇
orig species 黑鹇 黑鹇指名亚种 transfer to 黑鹇
orig species 白鹇 白鹇指名亚种 transfer to 白鹇
orig species 蓝喉拟啄木鸟 蓝喉拟啄木鸟指名亚种 transfer to 蓝喉拟啄木鸟
orig species 灰眶雀鹛 灰眶雀鹛指名亚种 transfer to 灰眶雀鹛


In [15]:
# Summary: number of label keys, number of keys newly added by the annotations, number of distinct species
print(f'len(species_map_to_count):{len(species_map_to_count)},len(not_found_animal):{len(not_found_animal)},all species:{len(all_species)}')

len(species_map_to_count):2494,len(not_found_animal):732,all species:618


In [16]:
# Write the label statistics CSV and the label-name list using the function's default settings
species_statistic_info_to_csv(species_map_to_count)

## 制作训练数据

根据上面的统计信息生成对应的 yaml 文件，可以参考 process_data/species_43.yaml。

In [None]:
import json
import hashlib
import argparse
import shutil
import os
from PIL import Image
from pathlib import Path
from tqdm import tqdm

# Failure log: one JSON record per line for every image that could not be
# opened or copied. Opened as UTF-8 explicitly because the records are dumped
# with ensure_ascii=False and contain Chinese keys, which a non-UTF-8 locale
# default encoding cannot write.
copy_file_failed_fd = open("copy_to_local_image_failed.txt", "w", encoding="utf-8")


def get_file_path(content, root_dir):
    """Locate the stored image file of a record below ``root_dir``.

    The directory is derived from the MD5 hex digest of ``content["图片id"]``:
    ``root_dir/<digest[0]>/<digest[1:3]>/<digest[3:6]>``. Inside it the file
    is looked up first under the picture id, then under the full digest.

    Args:
        content: Record dict; must contain "图片id". It is updated in place:
            "有效路径" receives the path found, otherwise "无效路径" receives
            both candidate paths joined by a tab.
        root_dir: Root directory of the sharded file store.

    Returns:
        The existing file path, or None when neither candidate is a file.
    """
    pic_id = content["图片id"]
    digest = hashlib.md5(pic_id.encode('utf-8')).hexdigest()

    # Shard directory: 1 + 2 + 3 leading hex characters of the digest.
    shard_dir = "/".join([root_dir, digest[0], digest[1:3], digest[3:6]])
    candidates = (shard_dir + "/" + pic_id, shard_dir + "/" + digest)

    for candidate in candidates:
        if Path(candidate).is_file():
            content["有效路径"] = candidate
            return candidate

    content["无效路径"] = "\t".join(candidates)
    return None


def copy_image_to_local(content, root_dir):
    """Copy the image of a record into ``root_dir`` after checking it opens.

    Args:
        content: Record dict; must contain "有效路径" (source file path). On
            the way "本地路径" is set to the destination path.
        root_dir: Destination directory (must already exist).

    Returns:
        True when the image was opened and copied. False when anything
        failed; in that case the record is appended as one JSON line to the
        module-level failure log ``copy_file_failed_fd``.
    """
    try:
        source_path = content["有效路径"]
        # Image.open keeps the underlying file open; the context manager
        # releases the handle immediately instead of leaking one per image.
        # NOTE(review): Image.open only identifies the file from its header;
        # truncated image data is not detected here — confirm whether a full
        # verify()/load() is wanted.
        with Image.open(source_path):
            pass
        content["本地路径"] = root_dir + "/" + source_path.split("/")[-1]
        shutil.copyfile(source_path, content["本地路径"])
        return True
    except Exception:
        # Deliberate best-effort boundary: any failure is logged, not raised.
        json.dump(content, copy_file_failed_fd, ensure_ascii=False)
        copy_file_failed_fd.write("\n")
        return False


def write_json_file(file_name, data):
    """Write ``data`` to ``file_name`` as JSON Lines (one object per line).

    Args:
        file_name: Path of the UTF-8 file to create or overwrite.
        data: Iterable of JSON-serializable items; non-ASCII characters are
            written as-is rather than escaped.
    """
    with open(file_name, "w", encoding="utf-8") as json_file:
        json_file.writelines(
            json.dumps(item, ensure_ascii=False) + '\n' for item in data
        )


def json_contents_to_path(actions, root_dir, dir_name, fs_dir):
    """Copy the images of ``actions`` into ``root_dir/dir_name`` and log results.

    Each record is resolved with ``get_file_path`` and copied with
    ``copy_image_to_local``. Successfully copied records are appended, one
    JSON object per line, to ``root_dir/<dir_name>.json``; records whose file
    was not found or could not be copied are written as one JSON array to
    ``root_dir/<dir_name>_failed.json``. A summary line is printed.

    Args:
        actions: Iterable of record dicts (updated in place by the helpers).
        root_dir: Output root directory.
        dir_name: Name of the image sub-directory and stem of the JSON files.
        fs_dir: Root of the file store passed to ``get_file_path``.
    """
    # Build every path with os.path.join so the JSON files land inside
    # root_dir whether or not it ends with a slash (plain string concatenation
    # produced e.g. "/data/imageshuman_images.json" without one).
    target_dir = os.path.join(root_dir, dir_name)
    json_file = os.path.join(root_dir, dir_name + ".json")
    failed_file = os.path.join(root_dir, dir_name + "_failed.json")
    os.makedirs(target_dir, exist_ok=True)

    invalid_json = []
    path_not_exist = 0
    image_corrupted = 0
    write_images = 0
    # Append mode keeps records from earlier runs; the context manager closes
    # the file even if a record raises. UTF-8 is required because records are
    # dumped with ensure_ascii=False.
    with open(json_file, 'a', encoding='utf-8') as json_file_fd:
        for content in tqdm(actions):
            if not get_file_path(content, fs_dir):
                invalid_json.append(content)
                path_not_exist += 1
            elif copy_image_to_local(content, root_dir=target_dir):
                json.dump(content, json_file_fd, ensure_ascii=False)
                json_file_fd.write("\n")
                write_images += 1
            else:
                invalid_json.append(content)
                image_corrupted += 1

    print(f"{dir_name},成功写入的照片数量:{write_images},路径非法总数:{path_not_exist},照片损坏总数:{image_corrupted}.")

    with open(failed_file, "w", encoding='utf-8') as failed_fd:
        json.dump(invalid_json, failed_fd, ensure_ascii=False)

def get_image_to_local_dir(image_json_actions, fs_dir, root_dir = "/home/yuzhong/data3/image_data/"):
    """Copy the images referenced by ``image_json_actions`` into ``root_dir``.

    Creates ``root_dir`` if needed, then hands the records to
    ``json_contents_to_path`` with the sub-directory name "human_images".

    Args:
        image_json_actions: Records passed through to ``json_contents_to_path``.
        fs_dir: Root of the source file store, passed through unchanged.
        root_dir: Local output root directory.
    """
    os.makedirs(root_dir, exist_ok=True)
    json_contents_to_path(image_json_actions, root_dir, "human_images", fs_dir)