In [1]:
from toai.imports import *
import xmltodict



In [2]:
DATA_DIR = Path('data/raw/images')
DATA_DEST = Path('data/images')
DATA_DEST.mkdir(parents=True, exist_ok=True)

In [79]:
all_files = glob(str(DATA_DIR / '**/*.xml'), recursive=True)

In [80]:
json_files = []
for xml_file in all_files:
    with open(xml_file, encoding='utf-8') as f:
        json_files.append(json.dumps(xmltodict.parse(f.read())))

Copy files from raw files dir

In [81]:
print(f"Processing {len(json_files)}")
for json_file in json_files:
    annotation = json.loads(json_file)['annotation']
    
    #copy photo
    folder = annotation['folder'].replace(" – ", "-").replace("–", "-")
    filename = annotation['filename']
    photo_src = DATA_DIR / folder / filename
    

    with open(photo_src, "rb") as imageFile:
        if imageFile.read().startswith(b"RIFF"):
            print(f"Invalid file found. Skipping. {photo_src}")
        else:
            dest = DATA_DEST / folder
            dest.mkdir(parents=True, exist_ok=True)
            shutil.copy2(photo_src, dest)

            # copy xml
            xml_src = photo_src.with_suffix(".xml")
            shutil.copy2(xml_src, dest)

print(f"Done! {len(glob(str(DATA_DEST / '**/*.xml'), recursive=True))} files copied.")

Processing 1618
Invalid file found. Skipping. data/raw/images/9K72 SS-1 SCUD/14.1251505-20234-89-pristine.jpg
Invalid file found. Skipping. data/raw/images/9K72 SS-1 SCUD/7.1252313-20234-62-pristine.jpg
Invalid file found. Skipping. data/raw/images/BMP-1/12.HTB1klmLX.z1gK0jSZLeq6z9kVXaJ.jpg
Invalid file found. Skipping. data/raw/images/9K330 TOR/8.1254883-21625-17-720.jpg
Invalid file found. Skipping. data/raw/images/2S23 NONA-SVK/14.tsm9559__40089.1539179135.jpg
Invalid file found. Skipping. data/raw/images/BMP-97/9.1085972-19161-65-720.jpg
Done! 1596 files copied.


In [3]:
all_files = glob(str(DATA_DEST / '**/*.xml'), recursive=True)
json_files = []
for xml_file in all_files:
    with open(xml_file, encoding='utf-8') as f:
        json_files.append(json.dumps(xmltodict.parse(f.read())))

Inspect annotations

In [4]:
print(json.dumps(json.loads(json_files[0]), indent=2))

{
  "annotation": {
    "folder": "PTS-M",
    "filename": "14.pts-m-v24.04.18-mudrunner-4.jpg",
    "path": "/Users/martynas/ai_bootcamp/capstone/images/PTS-M/14.pts-m-v24.04.18-mudrunner-4.jpg",
    "source": {
      "database": "Unknown"
    },
    "size": {
      "width": "1717",
      "height": "1085",
      "depth": "3"
    },
    "segmented": "0",
    "object": {
      "name": "PTS-M",
      "pose": "Unspecified",
      "truncated": "0",
      "difficult": "0",
      "bndbox": {
        "xmin": "107",
        "ymin": "575",
        "xmax": "1056",
        "ymax": "982"
      }
    }
  }
}


In [5]:
def parse_bboxes(obj):
    """Return list of [xmin, ymin, xmax, ymax]"""
    if isinstance(obj, list):
        res = []
        for single in obj:
            bndbox = single['bndbox']
            res.append(list(bndbox.values()))
        return res
    else:
        bndbox = obj['bndbox']
        return [bndbox.values()]

In [6]:
result = []
for json_file in json_files:
    annotation = json.loads(json_file)['annotation']
    folder = annotation['folder'].replace(" – ", "-").replace("–", "-")
    filename = annotation['filename']
    size = annotation['size']
    width, height = size['width'], size['height']
        
    try:
        bndboxs = parse_bboxes(annotation['object'])
        for bbox in bndboxs:
            result.append([folder, f"images/{folder}/{filename}", width, height, *bbox])
    except:
        print(annotation)

df = pd.DataFrame(result, columns=['class', 'path', 'width', 'height', 'xmin', 'ymin', 'xmax', 'ymax'])

{'folder': '9K57 BM-22V', 'filename': '11.Katjuscha_1938_Moscow.jpg', 'path': '/Users/martynas/ai_bootcamp/capstone/images/9K57 BM-22V/11.Katjuscha_1938_Moscow.jpg', 'source': {'database': 'Unknown'}, 'size': {'width': '3008', 'height': '2000', 'depth': '3'}, 'segmented': '0'}


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1668 entries, 0 to 1667
Data columns (total 8 columns):
class     1668 non-null object
path      1668 non-null object
width     1668 non-null object
height    1668 non-null object
xmin      1668 non-null object
ymin      1668 non-null object
xmax      1668 non-null object
ymax      1668 non-null object
dtypes: object(8)
memory usage: 104.4+ KB


In [8]:
df['class'].nunique()

106

In [9]:
df = df.groupby('class').filter(lambda x: len(x) > 20)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 353 entries, 50 to 1379
Data columns (total 8 columns):
class     353 non-null object
path      353 non-null object
width     353 non-null object
height    353 non-null object
xmin      353 non-null object
ymin      353 non-null object
xmax      353 non-null object
ymax      353 non-null object
dtypes: object(8)
memory usage: 24.8+ KB


In [11]:
df['class'].nunique()

14

In [12]:
df['class'].value_counts()

BMP-3M              31
BMP-97              31
9P149 STURM-S       29
2S19 MSTA-S         27
BTR-ZD              26
BMP-2               25
2S23 NONA-SVK       25
BMP-1               24
2S4 TIULPAN         24
2B9 VASILIOK        23
ZRK-SD 2K11 KRUG    23
T-90S               22
T-80UM2             22
BMP-3               21
Name: class, dtype: int64

In [13]:
df.to_csv('data/annotations.csv', index=False)

In [14]:
with open('data/classes.txt', "w") as f:
    for clazz in df['class'].unique():
        f.write(f"{clazz}\n")

In [15]:
df_train, df_val = train_test_split(df, test_size=0.2, stratify=df['class'])

In [16]:
len(df_train), len(df_val)

(282, 71)

In [17]:
with open('data/images/data_train.txt', "w") as f:
    for path in df_train['path'].values:
        f.write(f"{path}\n")

In [18]:
with open('data/images/data_val.txt', "w") as f:
    for path in df_val['path'].values:
        f.write(f"{path}\n")