# Data Preprocessing

In [2]:
import pandas as pd
import PIL
from glob import glob
import os
import re
import random
import chitra
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from chitra.image import Chitra

In [3]:

def load_data(typ):

    """Load image data and join with annotation information."""

    df_annotation = pd.read_csv(f"data/{typ}/_annotations.csv")
    df_img = pd.DataFrame()

    base_dir = rf'data/{typ}/*.jpg'
    files = glob(base_dir)

    imgs = []
    for i in files:
        imgs.append(plt.imread(i))

    files = [re.sub(f'data/{typ}/', '', i) for i in files]

    df_img['filename'] = files
    df_img['image'] = imgs
    df_train = pd.merge(df_annotation, df_img, on='filename')
    return df_train


In [None]:
df_train = load_data('train')
#df_valid = load_data('valid')
#df_test = load_data('test')

print(df_train.info())
#print(df_valid.info())
#print(df_test.info())

In [None]:
# write data to pickle:
#df_train.to_pickle('data/chess_new.pickle')

In [None]:
# check ranges of bounding boxes
#df_train[['xmin', 'ymin', 'xmax', 'ymax']].agg(['max', 'min'])

In [None]:
# Display sample image with annotation

sample_img = df_train.iloc[1111,:] #4801


figsize = sample_img['width'] / float(100), sample_img['height']/ float(100)
fig, ax = plt.subplots(figsize=figsize)
#new_width = sample_img['width'] / 6.4
#new_height = sample_img['height'] / 4.8

#ax.axis('off')
ax.imshow(sample_img['image'])
rect = patches.Rectangle((sample_img['xmin'], sample_img['ymin']), 
                         sample_img['xmax']-sample_img['xmin'], 
                         sample_img['ymax']-sample_img['ymin'],
                         edgecolor='red',
                         facecolor='none',
                         linewidth=3,
                         )

ax.add_patch(rect)
annot_height = sample_img['ymin'] + (sample_img['ymax']-sample_img['ymin']) + 20
ax.annotate(sample_img['class'], (sample_img['xmin'], annot_height), color='red',
            size=20)

Some annotations are incorrect (e.g. axes transposed). Manually check if annotations are correct in `check_annotations.ipynb`.

## Load cheked data

In [None]:
# Add togehther annotations
annot_1 = pd.read_csv('data/checked_annot/top_184_anot.csv')
annot_2 = pd.read_csv('data/checked_annot/bottom_annot.csv')
annot_3 = pd.read_csv('data/checked_annot/bottom_annot_1602_downwards.csv')

In [None]:
# Concatenate sheets
annot_total = pd.concat([annot_1, annot_2, annot_3])
annot_total.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1025 entries, 0 to 371
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Unnamed: 0    1025 non-null   int64 
 1   filename      1025 non-null   object
 2   correct_anot  1015 non-null   object
dtypes: int64(1), object(2)
memory usage: 32.0+ KB


In [None]:
# Filter for correctly annotated
annot_correct = annot_total[annot_total['correct_anot'] == 'y']

# get length of correctly annotated:
annot_img_list = list(annot_correct['filename'].unique())
print(len(annot_img_list))

In [None]:
# get new dataset
chess_df = df_train[df_train['filename'].isin(annot_img_list)]
#print(chess_df.info())

# Check unique pictures
chess_df['filename'].nunique()

In [None]:
# Write data to csv
#chess_df.to_pickle('data/chess_new.p')

## Resizing images

Given that `image` is a `Chitra` object:

* Get ndarray of image with `image.numpy()`
* Get resized bounding box xmin, ymin with `image.bboxes[0][0]`
* Get resized bounding box xmax, ymax with `image.bboxes[0][1]`
* Get resized bounding box label with `image.bboxes[0].label`

In [None]:
# laod data:
chess_df = pd.read_pickle('data/chess_new.p')

In [None]:
# display sample image with chitra:

import chitra
from chitra.image import Chitra

tryout = chess_df.iloc[7]

ymin = tryout['ymin']
ymax = tryout['ymax']
xmin = tryout['xmin']
xmax = tryout['xmax']

label = tryout['class']

bbox = [xmin, ymin, xmax, ymax]

filename = tryout['filename']

img_path = f'data/train/{filename}'
image = Chitra(img_path, bboxes=bbox, labels=label, box_format= 'BoundingBoxes.CORNER')
image.resize_image_with_bbox((224, 224))

figsize = tryout['width'] / float(100), tryout['height']/ float(100)

#plt.figure(figsize=figsize)
plt.imshow(image.draw_boxes())

In [None]:
def resize_images(samp, resize = (224, 224)):
    
    """Function to convert one row of chess df to dictionary with resized information."""

    bbox = [samp['xmin'], samp['ymin'], samp['xmax'], samp['ymax']]
    
    filename = samp['filename']
    img_path = f'data/train/{filename}'
    
    image = Chitra(img_path, bboxes=bbox, labels=samp['class'], box_format='BoundingBoxes.CORNER')
    image.resize_image_with_bbox(resize)

    return {'filename': filename, 
            'width': image.shape[0], 
            'height': image.shape[1], 
            'class': image.bboxes[0].label, 
            'xmin': image.bboxes[0][0][0], 
            'ymin': image.bboxes[0][0][1], 
            'xmax' : image.bboxes[0][1][0], 
            'ymax': image.bboxes[0][1][1], 
            'image': image.numpy()}

In [None]:
# Iterate through dataset and append resized images:

df_chess_resized= pd.DataFrame()

for i in range(chess_df.shape[0]):
    
    df_add = pd.DataFrame.from_dict(resize_images(chess_df.iloc[i]), orient='index').T
    df_chess_resized = pd.concat([df_chess_resized, df_add], axis=0, ignore_index=True)

In [None]:
#df_chess_resized.to_pickle('data/chess_resized.p')