# YOLOv5 on SageMaker--数据准备

## 1 说明
本章内容主要是把labelme数据格式转化为YOLOv5格式。
## 2 运行环境
Kernel 选择pytorch_latest_p36。 
## 3 已有YOLOv5格式数据
如果已有YOLOv5格式的数据，可跳过数据准备，把数据放入S3即可。  
### 3.1 S3目录存放格式
```
training
├── cfg
│   └── data.yaml
├── images
│   ├── train
│   │   ├── image001.jpg
│   │   ├── image002.jpg
│   │   └── ...
│   └── val
│       ├── image101.jpg
│       ├── image102.jpg
│       └── ...
└── labels
    ├── train
    │   ├── image001.txt
    │   ├── image002.txt
    │   └── ...
    └── val
        ├── image101.txt
        ├── image102.txt
        └── ...
```
### 3.2 SageMaker输入数据根目录
运行SageMaker时，SageMaker会从S3拷贝数据到运行容器的`/opt/ml/input/data/training/`下，即`data.yaml`对应的全路径为`/opt/ml/input/data/training/cfg/data.yaml`。
### 3.3 文件说明
- cfg/data.yaml YOLOv5 train --data的值，必须使用该名称
- images/*.jpg 需要训练的图片，分训练目录和验证目录
- labels/*.txt YOLOv5 txt标注数据，文件名与对应图片一致（仅扩展名不同）

### 3.4 YOLOv5 txt数据示例
```
45 0.736516 0.247188 0.498875 0.476417
50 0.637063 0.732938 0.494125 0.510583
```
第1列为name索引（从0开始），后4列为标注范围，采用xywh格式，即中心点位置(xy)和宽高(wh)，数值均按图片宽高归一化到0~1。  


## 4 没有YOLOv5格式数据
### 4.1 准备labelme格式数据

In [None]:
!aws s3 cp s3://nowfox/data/yumaoqiu-labelme.zip ./

In [None]:
!unzip -qo yumaoqiu-labelme.zip

### 4.2 创建输出目录

In [None]:
import os
import shutil

In [None]:
# Output layout: <data_dir>/{images,labels}/{train,val} plus <data_dir>/cfg.
data_dir = "training"
data_types = ["images", "labels"]
stage_types = ["train", "val"]
cfg_dir = os.path.join(data_dir, "cfg")

# Discard the output of any previous run so stale files never end up in the
# freshly generated dataset.
if os.path.isdir(data_dir):
    shutil.rmtree(data_dir)

# One directory per (data type, stage) pair, followed by the config directory.
output_dirs = [
    os.path.join(data_dir, kind, stage)
    for kind in data_types
    for stage in stage_types
]
output_dirs.append(cfg_dir)
for output_dir in output_dirs:
    os.makedirs(output_dir)

### 4.3 设置names
根据自身业务，设置names，其他内容不用修改

In [None]:
import yaml
# Class names in class-id order: a name's position in this list is the id
# written to the label files. Adjust to your own dataset.
names = ["faqiu", "jianqiu", "jiqiu", "serve_down", "serve_up", "pickup", "play_up", "play_down"]

# Content of data.yaml. The image paths are container paths under
# /opt/ml/input/data/training/, not paths on this machine.
data = {
    "train": "/opt/ml/input/data/training/images/train/",
    "val": "/opt/ml/input/data/training/images/val/",
    "names": names,
    "nc": len(names),
}

yaml_filename = "data.yaml"
yaml_file = os.path.join(cfg_dir, yaml_filename)
# The context manager closes the file even if yaml.dump raises.
with open(yaml_file, "w", encoding="utf-8") as file:
    yaml.dump(data, file)

In [None]:
# Map every class name to its class id, i.e. its position in `names`.
name_index = {name: index for index, name in enumerate(names)}
# Bare expression: shows the mapping as the notebook cell output.
name_index

### 4.4 转化格式

In [None]:
def xyxy2xywh(xyxy, width, height):
    """Convert a two-corner box into a normalized center/size box.

    Args:
        xyxy: Sequence whose first four items are ``x1, y1, x2, y2``, two
            opposite corners of the box. The corners may come in either order.
        width: Image width, used to normalize the x center and the box width.
        height: Image height, used to normalize the y center and the box height.

    Returns:
        A list ``[x_center, y_center, w, h]`` where x_center and w are divided
        by ``width`` and y_center and h are divided by ``height``.
    """
    x1, y1, x2, y2 = xyxy[:4]
    return [
        (x1 + x2) / 2 / width,
        (y1 + y2) / 2 / height,
        # abs() keeps the size positive when the second corner lies left of or
        # above the first one (e.g. a box drawn from bottom-right to top-left).
        abs(x2 - x1) / width,
        abs(y2 - y1) / height,
    ]

In [None]:
import json
def convert_format(source_json, stage_type):
    """Convert one labelme JSON file into a YOLOv5 label file and copy its image.

    The label file is written to ``<data_dir>/labels/<stage_type>/`` and named
    after the JSON file; the image referenced by the JSON's ``imagePath`` is
    copied to ``<data_dir>/images/<stage_type>/``.
    NOTE(review): the label is named after the JSON file and the image after
    ``imagePath``; this assumes both share the same stem - confirm for your data.

    Args:
        source_json: Path of the labelme JSON annotation file.
        stage_type: Name of the split sub-directory to write into.

    Raises:
        KeyError: If a shape uses a label that is missing from ``name_index``.
    """
    # os.path.split also copes with a bare file name (no directory part).
    source_path, source_file_name = os.path.split(source_json)
    label_output_dir = os.path.join(data_dir, data_types[1], stage_type)
    image_output_dir = os.path.join(data_dir, data_types[0], stage_type)
    with open(source_json, "r", encoding="utf8") as f:
        json_data = json.load(f)
    width = json_data["imageWidth"]
    height = json_data["imageHeight"]

    xywh_result = []
    for shape in json_data["shapes"]:
        label = shape["label"]
        if label not in name_index:
            raise KeyError(
                "label %r in %s is not listed in names" % (label, source_json)
            )
        # Only the first two points are used, as the two corners of the box.
        # Sort them into (min, max) order so the box size is never negative.
        first, second = shape["points"][0], shape["points"][1]
        xyxy = [
            min(first[0], second[0]),
            min(first[1], second[1]),
            max(first[0], second[0]),
            max(first[1], second[1]),
        ]
        xywh = xyxy2xywh(xyxy, width, height)
        xywh_result.append(("%g " * 5) % (name_index[label], *xywh))

    # splitext drops only the final extension, so "img.v2.json" -> "img.v2.txt".
    label_stem = os.path.splitext(source_file_name)[0]
    result_txt = os.path.join(label_output_dir, label_stem + ".txt")
    with open(result_txt, "w", encoding="utf-8") as f:
        f.writelines(line + "\n" for line in xywh_result)

    # imagePath is resolved relative to the JSON file's directory; the copy is
    # placed flat in the output directory even if imagePath has a directory part.
    source_img = os.path.join(source_path, json_data["imagePath"])
    result_img = os.path.join(
        image_output_dir, os.path.basename(json_data["imagePath"])
    )
    shutil.copyfile(source_img, result_img)

In [None]:
# Directory that holds the labelme annotations to convert.
input_dir = "biaozhu"
json_files = []
for folder, _subfolders, filenames in os.walk(input_dir):
    # Ignore anything below a .ipynb_checkpoints directory.
    if ".ipynb_checkpoints" in folder:
        continue
    json_files.extend(
        os.path.join(folder, filename)
        for filename in filenames
        if filename.endswith(".json")
    )

In [None]:
import random
# Fraction of the annotation files held out for validation; tune as needed.
val_rate = 0.1
files_count = len(json_files)
val_count = int(files_count * val_rate)
# int() truncates, so a small dataset (fewer than 1/val_rate files) would get
# no validation file at all. Hold out at least one file whenever a validation
# split was requested and at least one training file would still remain.
if val_count == 0 and val_rate > 0 and files_count > 1:
    val_count = 1
random.shuffle(json_files)
val_json_files = json_files[:val_count]
train_json_files = json_files[val_count:]
print("总JSON文件数：" + str(len(json_files)))
print("val JSON文件数：" + str(len(val_json_files)))
print("train JSON文件数：" + str(len(train_json_files)))

In [None]:
def deal_files(files, stage_type):
    """Run convert_format on every path in *files* for the given split."""
    for json_path in files:
        convert_format(json_path, stage_type)


# First entry of stage_types receives the training files, second the
# validation files.
deal_files(train_json_files, stage_types[0])
deal_files(val_json_files, stage_types[1])

### 4.5 上传数据到S3

In [None]:
# S3 bucket and key prefix under which the prepared dataset is stored.
bucket = "junzhong"
pre_key = "yolov5"

In [None]:
!aws s3 cp --recursive --quiet training/ s3://{bucket}/{pre_key}/training/

### 4.6 移动结果文件到训练目录

In [None]:
import os.path
# Destination directory for the generated YAML config.
yaml_path = "../1-training/container/local_test/input/data/training/cfg/"
target_file = os.path.join(yaml_path, yaml_filename)
# Create the destination first so the move does not fail when it is missing.
os.makedirs(yaml_path, exist_ok=True)
shutil.move(yaml_file, target_file)

In [None]:
local_training_dir = "../1-training/container/local_test/input/data/training/"
# The destination must already be a directory: otherwise shutil.move would
# turn the first source directory into this path instead of moving it inside.
os.makedirs(local_training_dir, exist_ok=True)
for data_type in data_types:
    current_dir = os.path.join(data_dir, data_type)
    print(current_dir)
    previous_copy = os.path.join(local_training_dir, data_type)
    # shutil.move refuses to move into a directory that already contains an
    # entry of the same name, so drop the copy left by an earlier run. Only do
    # so when there is new data to replace it with.
    if os.path.isdir(current_dir) and os.path.isdir(previous_copy):
        shutil.rmtree(previous_copy)
    shutil.move(current_dir, local_training_dir)