In [1]:
import json
import csv

In [2]:
def get_all_age_gender_set(file_name):
    age_set = set()
    gender_set = set()
    with open(file_name, 'r') as f:
        animal_action = json.load(f)
    print(f'animal size: {len(animal_action)}')
    for item in animal_action:
        age_set.add(item['年龄'])
        gender_set.add(item['性别'])
    return age_set, gender_set

In [3]:
file_name = "/home/yuzhong/nndata/export/20240428/物种图片1.json"
age_set, gender_set = get_all_age_gender_set(file_name)

animal size: 643250


In [4]:
print(age_set)

{'', '成年', '亚成体', '幼体', '成年;亚成体'}


In [5]:
print(gender_set)

{'', '难以辨认', '难以鉴定', '雄', '幼', '雌雄都有', '雌', '雄+', '雌带幼仔', '幼仔', '幼体', '雄，另外一头不确定', '群体'}


In [6]:
# 未在这个map中有的都是无法区分
gender_map = {"雄": "雄性", "雄+": "雄性", '雌带幼仔': '雌性', "雌": "雌性", "雄，另外一头不确定": "雄性"}
# 未在这个map中有的都是无法区分
age_map = {'': '无法区分' ,'成年': '成体', '亚成体': '亚成体', '幼体': "幼体", '成年;亚成体': '成体'}

In [7]:
def check_label_data(file_name, species_to_another_species, species_map_to_count):
    with open(file_name, 'r') as file:
        next(file)  # 跳过第一行
        for line in file:
            line = line.strip()
            if line == "":
                continue
            data = line.split(",")
            if len(data) < 5:
                print(f"error format {line}")
                continue
            species_map_to_count[data[2] + ',' + data[3] + ',' + data[4]] = 0
            if data[3] == "无法区分" and data[4] == "无法区分":
                if data[1] != data[2]:
                    print(f"ID {data[0]},species {data[1]} to {data[2]}")
                    species_to_another_species[data[1]] = data[2]
            if data[3] != "无法区分":
                if data[3] not in data[1]:
                    print(f"ID {data[0]},species name {data[1]} not macth gender {data[3]}")
            if data[4]!= "无法区分":
                if data[4] not in data[1]:
                    print(f"ID {data[0]},species name {data[2]} not macth age {data[4]}")

In [8]:
species_to_another_species = {}
species_map_to_count = {}
check_label_data("/home/yuzhong/data2/code/object_detection/process_data/物种分类码表-野生动物.csv", species_to_another_species, species_map_to_count)
check_label_data("/home/yuzhong/data2/code/object_detection/process_data/物种分类码表-家养动物.csv", species_to_another_species, species_map_to_count)

ID 253,species name 熊猴成体 not macth gender 雄性
ID 387,species name 灰头鸫 not macth age 幼体
ID 472,species name 北树鼩亚成体 not macth gender 雄性
ID 1635,species 鬣羚 to 中华鬣羚
ID 1643,species 斑羚 to 中华斑羚
ID D095,species 牦牛 to 家牦牛


In [9]:
print(species_to_another_species)
print(len(species_map_to_count))

{'鬣羚': '中华鬣羚', '斑羚': '中华斑羚', '牦牛': '家牦牛'}
1762


In [10]:
def get_species_num(file_name, species_to_another_species, species_map_to_count, gender_map, age_map):
    all_species = set()
    not_found_species = set()
    with open(file_name, 'r') as f:
        animal_action = json.load(f)
    print(f'file size: {len(animal_action)}')
    for item in animal_action:
        age = item['年龄']
        gender = item['性别']
        species = item['物种名称']
        all_species.add(species)
        if species in species_to_another_species:
            species = species_to_another_species[species]
        if age not in age_map:
            age = "无法区分"
        else:
            age = age_map[age]
        if gender not in gender_map:
            gender = "无法区分"
        else:
            gender = gender_map[gender]
        key = species + ',' + gender + ',' + age
        if key not in species_map_to_count:
            not_found_species.add(species)
            # species_map_to_count[key] = 0
            print(f"{key} not found in species_map_to_count")
            continue  # 跳过该数据项
        species_map_to_count[key] += 1
    for k in not_found_species:
        print(f"species {k} not found")
    return all_species

def species_statistic_info_to_csv(species_map_to_count):
    split_key_separator = ","
    species_map_to_count_sort = dict(sorted(species_map_to_count.items(), key=lambda item: 
                                            (item[0].split(split_key_separator)[0], 
                                             item[0].split(split_key_separator)[1], 
                                             item[0].split(split_key_separator)[2])))
    with open("species_statistic_info.csv", "w", newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["ID",	"标签",	"物种",	"性别",	"年龄", '数量'])
        id = 1
        for key, value in species_map_to_count_sort.items():
            species, gender, age = key.split(',')
            species_detail = species
            if gender != '无法区分':
                species_detail += gender
            if age != '无法区分':
                species_detail += age
            writer.writerow([id, species_detail, species, gender, age, value])
            id += 1

In [11]:
all_species = get_species_num(file_name, species_to_another_species, species_map_to_count, gender_map, age_map)

file size: 643250
野猪,无法区分,成体 not found in species_map_to_count
野猪,无法区分,成体 not found in species_map_to_count
野猪,无法区分,成体 not found in species_map_to_count
野猪,无法区分,成体 not found in species_map_to_count
野猪,无法区分,成体 not found in species_map_to_count
野猪,无法区分,成体 not found in species_map_to_count
野猪,无法区分,成体 not found in species_map_to_count
野猪,无法区分,成体 not found in species_map_to_count
野猪,无法区分,成体 not found in species_map_to_count
野猪,无法区分,成体 not found in species_map_to_count
野猪,无法区分,成体 not found in species_map_to_count
野猪,无法区分,成体 not found in species_map_to_count
野猪,无法区分,成体 not found in species_map_to_count
野猪,无法区分,成体 not found in species_map_to_count
野猪,无法区分,成体 not found in species_map_to_count
野猪,无法区分,成体 not found in species_map_to_count
野猪,无法区分,成体 not found in species_map_to_count
野猪,无法区分,成体 not found in species_map_to_count
野猪,无法区分,成体 not found in species_map_to_count
野猪,雄性,成体 not found in species_map_to_count
野猪,雄性,成体 not found in species_map_to_count
野猪,雄性,成体 not found in species_map_to_coun

In [12]:
species_statistic_info_to_csv(species_map_to_count)

In [9]:
import csv
def get_clean_species_statistic_info(file_name):
    species_map_to_count = {}
    with open(file_name, 'r') as file:
        next(file)  # 跳过第一行
        for line in file:
            line = line.strip()
            if line == "":
                continue
            data = line.split(",")
            if len(data) < 6:
                print(f"error format {line}")
                continue
            _, _, species, gender, age, count = data[0:6]
            if age == "幼体":
                if gender != "无法区分":
                    print(f"{species} age is 幼体 need transfer gender {gender} to 无法区分")
                    gender = "无法区分"
            if "指名亚种" in species:
                species_column = species.split(" ")
                new_species = species_column[0]
                key = new_species + ',' + gender + ',' + age
                print(f"orig species {species},{gender},{age} transfer to {key}")
            else:
                key = species + ',' + gender + ',' + age
            if key not in species_map_to_count:
                species_map_to_count[key] = 0
            species_map_to_count[key] += int(count)
    split_key_separator = ","
    species_map_to_count_sort = dict(sorted(species_map_to_count.items(), key=lambda item: 
                                            (item[0].split(split_key_separator)[0], 
                                             item[0].split(split_key_separator)[1], 
                                             item[0].split(split_key_separator)[2])))
    total_label_num = 0
    with open("species_statistic_info_new.csv", "w", newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["ID",	"标签",	"物种",	"性别",	"年龄", '数量'])
        id = 1
        for key, value in species_map_to_count_sort.items():
            species, gender, age = key.split(',')
            species_detail = species
            if gender != '无法区分':
                species_detail += gender
            if age != '无法区分':
                species_detail += age
            writer.writerow([id, species_detail, species, gender, age, value])
            id += 1
            total_label_num += value
    print(f"total_label_num is {total_label_num}")

In [10]:
get_clean_species_statistic_info("/home/yuzhong/data2/code/object_detection/process_data/物种分类码表-训练数据包含的动物情况.csv")

中华斑羚 age is 幼体 need transfer gender 雌性 to 无法区分
家牛 age is 幼体 need transfer gender 雄性 to 无法区分
家牛 age is 幼体 need transfer gender 雌性 to 无法区分
小麂 age is 幼体 need transfer gender 雄性 to 无法区分
小麂 age is 幼体 need transfer gender 雌性 to 无法区分
山羊 age is 幼体 need transfer gender 雌性 to 无法区分
岩羊 age is 幼体 need transfer gender 雌性 to 无法区分
梅花鹿 age is 幼体 need transfer gender 雌性 to 无法区分
orig species 橙翅噪鹛 橙翅噪鹛指名亚种,无法区分,成体 transfer to 橙翅噪鹛,无法区分,成体
毛冠鹿 age is 幼体 need transfer gender 雄性 to 无法区分
毛冠鹿 age is 幼体 need transfer gender 雌性 to 无法区分
水鹿 age is 幼体 need transfer gender 雌性 to 无法区分
orig species 灰头鸫 灰头鸫指名亚种,无法区分,成体 transfer to 灰头鸫,无法区分,成体
orig species 灰头鸫 灰头鸫指名亚种,雄性,成体 transfer to 灰头鸫,雄性,成体
orig species 灰眶雀鹛 灰眶雀鹛指名亚种,无法区分,成体 transfer to 灰眶雀鹛,无法区分,成体
狍 age is 幼体 need transfer gender 雌性 to 无法区分
白头鹤 age is 幼体 need transfer gender 雄性 to 无法区分
白尾鹞 age is 幼体 need transfer gender 雌性 to 无法区分
白鹇 age is 幼体 need transfer gender 雄性 to 无法区分
白鹇 age is 幼体 need transfer gender 雌性 to 无法区分
orig species 白鹇 白鹇指名亚种,雄性,亚成体 transfer to 白鹇

In [15]:
def filter_species_less_data(file_name, less_num):
    with open(file_name, 'r') as file, open(f"species_statistic_info_new_filter_less_num_{less_num}.csv", "w", newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["ID",	"标签",	"物种",	"性别",	"年龄", '数量'])
        id = 1
        original_id = 0
        next(file)  # 跳过第一行
        for line in file:
            line = line.strip()
            if line == "":
                continue
            data = line.split(",")
            if len(data) < 6:
                print(f"error format {line}")
                continue
            _, label, species, gender, age, count = data[0:6]
            original_id += 1
            if int(count) <= less_num:
                continue
            writer.writerow([id, label, species, gender, age, count])
            id += 1
    print(f"original id num {original_id}, id num {id - 1}")

In [24]:
filter_species_less_data("species_statistic_info_new.csv", 9)

original id num 2474, id num 986
