In [1]:
from toai.imports import *
import xmltodict



In [2]:
DATA_DIR = Path('data/raw/images')
DATA_DEST = Path('data/images')
DATA_DEST.mkdir(parents=True, exist_ok=True)

In [3]:
all_files = glob(str(DATA_DIR / '**/*.xml'), recursive=True)

In [4]:
json_files = []
for xml_file in all_files:
    with open(xml_file, encoding='utf-8') as f:
        json_files.append(json.dumps(xmltodict.parse(f.read())))

Copy files from raw files dir

In [5]:
print(f"Processing {len(json_files)}")
for json_file in json_files:
    annotation = json.loads(json_file)['annotation']
    
    #copy photo
    folder = annotation['folder'].replace(" – ", "-").replace("–", "-")
    filename = annotation['filename']
    photo_src = DATA_DIR / folder / filename
    

    with open(photo_src, "rb") as imageFile:
        if imageFile.read().startswith(b"RIFF"):
            print(f"Invalid file found. Skipping. {photo_src}")
        else:
            dest = DATA_DEST / folder
            dest.mkdir(parents=True, exist_ok=True)
            shutil.copy2(photo_src, dest)

            # copy xml
            xml_src = photo_src.with_suffix(".xml")
            shutil.copy2(xml_src, dest)

print(f"Done! {len(glob(str(DATA_DEST / '**/*.xml'), recursive=True))} files copied.")

Processing 2305
Invalid file found. Skipping. data/raw/images/9K72 SS-1 SCUD/14.1251505-20234-89-pristine.jpg
Invalid file found. Skipping. data/raw/images/9K72 SS-1 SCUD/7.1252313-20234-62-pristine.jpg
Invalid file found. Skipping. data/raw/images/9K330 TOR/8.1254883-21625-17-720.jpg
Done! 2302 files copied.


In [6]:
all_files = glob(str(DATA_DEST / '**/*.xml'), recursive=True)
json_files = []
for xml_file in all_files:
    with open(xml_file, encoding='utf-8') as f:
        json_files.append(json.dumps(xmltodict.parse(f.read())))

Inspect annotations

In [7]:
print(json.dumps(json.loads(json_files[0]), indent=2))

{
  "annotation": {
    "folder": "PTS-M",
    "filename": "14.pts-m-v24.04.18-mudrunner-4.jpg",
    "path": "/Users/martynas/ai_bootcamp/capstone/images/PTS-M/14.pts-m-v24.04.18-mudrunner-4.jpg",
    "source": {
      "database": "Unknown"
    },
    "size": {
      "width": "1717",
      "height": "1085",
      "depth": "3"
    },
    "segmented": "0",
    "object": {
      "name": "PTS-M",
      "pose": "Unspecified",
      "truncated": "0",
      "difficult": "0",
      "bndbox": {
        "xmin": "107",
        "ymin": "575",
        "xmax": "1056",
        "ymax": "982"
      }
    }
  }
}


In [8]:
def parse_bboxes(obj):
    """Return list of [xmin, ymin, xmax, ymax]"""
    if isinstance(obj, list):
        res = []
        for single in obj:
            bndbox = single['bndbox']
            res.append(list(bndbox.values()))
        return res
    else:
        bndbox = obj['bndbox']
        return [bndbox.values()]

In [9]:
result = []
for json_file in json_files:
    annotation = json.loads(json_file)['annotation']
    folder = annotation['folder'].replace(" – ", "-").replace("–", "-")
    filename = annotation['filename']
    size = annotation['size']
    width, height = size['width'], size['height']
        
    try:
        bndboxs = parse_bboxes(annotation['object'])
        for bbox in bndboxs:
            result.append([folder, f"images/{folder}/{filename}", width, height, *bbox])
    except:
        print(annotation)

df = pd.DataFrame(result, columns=['class', 'path', 'width', 'height', 'xmin', 'ymin', 'xmax', 'ymax'])

{'folder': '9K57 BM-22V', 'filename': '11.Katjuscha_1938_Moscow.jpg', 'path': '/Users/martynas/ai_bootcamp/capstone/images/9K57 BM-22V/11.Katjuscha_1938_Moscow.jpg', 'source': {'database': 'Unknown'}, 'size': {'width': '3008', 'height': '2000', 'depth': '3'}, 'segmented': '0'}
{'folder': 'BMP-97', 'filename': 'i (10)_11.jpeg', 'path': '/home/martynas/ml/ai_bootcamp_capstone/data/raw/images/BMP-97/i (10)_11.jpeg', 'source': {'database': 'Unknown'}, 'size': {'width': '480', 'height': '320', 'depth': '3'}, 'segmented': '0'}


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2444 entries, 0 to 2443
Data columns (total 8 columns):
class     2444 non-null object
path      2444 non-null object
width     2444 non-null object
height    2444 non-null object
xmin      2444 non-null object
ymin      2444 non-null object
xmax      2444 non-null object
ymax      2444 non-null object
dtypes: object(8)
memory usage: 152.9+ KB


In [11]:
df['class'].nunique()

104

In [16]:
df = df.groupby('class').filter(lambda x: len(x) > 34)

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1109 entries, 50 to 2155
Data columns (total 8 columns):
class     1109 non-null object
path      1109 non-null object
width     1109 non-null object
height    1109 non-null object
xmin      1109 non-null object
ymin      1109 non-null object
xmax      1109 non-null object
ymax      1109 non-null object
dtypes: object(8)
memory usage: 78.0+ KB


In [18]:
df['class'].nunique()

12

In [19]:
df['class'].value_counts()

TOS-1 BURATINO      199
BMP-2               126
2S19 MSTA-S         115
T-90S                92
BMP-1                86
2S23 NONA-SVK        83
ZRK-SD 2K11 KRUG     79
BMP-3                75
2S4 TIULPAN          75
2B9 VASILIOK         72
BMP-97               55
9P149 STURM-S        52
Name: class, dtype: int64

In [20]:
df.to_csv('data/annotations.csv', index=False)

In [21]:
with open('data/classes.txt', "w") as f:
    for clazz in df['class'].unique():
        f.write(f"{clazz}\n")

In [22]:
df_train, df_val = train_test_split(df, test_size=0.1, stratify=df['class'])

In [23]:
len(df_train), len(df_val)

(998, 111)

In [24]:
with open('data/images/data_train.txt', "w") as f:
    for path in df_train['path'].values:
        f.write(f"{path}\n")

In [25]:
with open('data/images/data_val.txt', "w") as f:
    for path in df_val['path'].values:
        f.write(f"{path}\n")