In [15]:
import os
from random import shuffle
import numpy as np
import pandas as pd
import cv2
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from tqdm import tqdm

In [16]:
path = 'Dataset/Manga109'

with open('{}/{}'.format(path, 'books.txt')) as file:
    book_list = file.readlines()
book_list = [bookname.strip() for bookname in book_list]
shuffle(book_list)

In [17]:
records = []
# For each book
for count, bookname in enumerate(tqdm(book_list)):
    # Process annotiation
    annotation_path = '{}/annotations/{}.xml'.format(path, bookname)
    with open(annotation_path, encoding='utf8') as a_file:
        content = ''.join(a_file.readlines())
        
    soup = BeautifulSoup(content, 'xml')
    
    # For each page in book
    for page in soup.find_all('page'):
        # Get classes from page
        texts = page.findChildren('text')
        bodies = page.findChildren('body')
        faces = page.findChildren('face')

        # Check if classes exist in page, if not then continue to next page
        if not texts and not bodies and not faces:
            continue

        filepath = '{}/images/{}/{}.jpg'.format(path, bookname, page.get('index').zfill(3))
        width, height = int(page.get('width')), int(page.get('height'))

        for text in texts:
            (xmin,ymin,xmax,ymax) = text.get('xmin'), text.get('ymin'), text.get('xmax'), text.get('ymax')
            data = (filepath,width,height,'text',xmin,ymin,xmax,ymax)
            records.append(data)

        for body in bodies:
            (xmin,ymin,xmax,ymax) = body.get('xmin'), body.get('ymin'), body.get('xmax'), body.get('ymax')
            data = (filepath,width,height,'body',xmin,ymin,xmax,ymax)
            records.append(data)

        for face in faces:
            (xmin,ymin,xmax,ymax) = face.get('xmin'), face.get('ymin'), face.get('xmax'), face.get('ymax')
            data = (filepath,width,height,'face',xmin,ymin,xmax,ymax)
            records.append(data)

        # process 7 pages per book
        if int(page.get('index')) >= 1:
            break
            
    if count >= 5:
        break

  5%|███▊                                                                              | 5/109 [00:00<00:12,  8.32it/s]


In [18]:
train_records, test_records = train_test_split(records, test_size=0.2, shuffle=False, random_state=42)

In [19]:
len(train_records), len(test_records)

(48, 13)

In [20]:
df = pd.DataFrame(train_records, columns=['filename', 'width', 'height', 'class', 'xmin', 'ymin', 'xmax', 'ymax'])
df.to_csv('Object Detection/data/train_labels.csv', index=False)

In [21]:
df = pd.DataFrame(test_records, columns=['filename', 'width', 'height', 'class', 'xmin', 'ymin', 'xmax', 'ymax'])
df.to_csv('Object Detection/data/test_labels.csv', index=False)