In [47]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from PIL import Image
import cv2
import json
import sys
import easyocr
import os
from shutil import move, copy
from ultralytics import YOLO
sys.path.append('../src')

%load_ext autoreload
%autoreload 2
from extract_data.barplots import detect_graph_space

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
# Load the merged annotation/image table.
df = pd.read_csv('../data/interim/annots_imgs_merged.csv')

# Source columns kept for the bar-chart work, mapped to the short names used
# throughout the rest of the notebook.
column_names = {
    'chart-type': 'type',
    'data-series': 'data_series',
    'visual-elements.bars': 'data',
    'file_name': 'name',
    'image_path': 'path',
}

# Keep only vertical bar charts, renumber the rows from 0 and apply the short names.
is_vertical_bar = df['chart-type'] == 'vertical_bar'
bar = (
    df.loc[is_vertical_bar, list(column_names)]
    .reset_index(drop=True)
    .rename(columns=column_names)
)
bar.head()


Unnamed: 0,type,data_series,data,name,path
0,vertical_bar,"[{'x': 'Jordan', 'y': 88.35159235668789}, {'x'...","[{'height': 103, 'width': 19, 'x0': 77, 'y0': ...",75c0449f6917,../data/raw/train/images/75c0449f6917.jpg
1,vertical_bar,"[{'x': 'Cuba', 'y': 75695.38632268383}, {'x': ...","[{'height': 124, 'width': 12, 'x0': 104, 'y0':...",58595c30beab,../data/raw/train/images/58595c30beab.jpg
2,vertical_bar,"[{'x': 'Jamaica', 'y': 36.43411033235605}, {'x...","[{'height': 75, 'width': 21, 'x0': 62, 'y0': 9...",5022600d52d7,../data/raw/train/images/5022600d52d7.jpg
3,vertical_bar,"[{'x': '1', 'y': 62.45402663284719}, {'x': '2'...","[{'height': 187, 'width': 11, 'x0': 60, 'y0': ...",32176a89b822,../data/raw/train/images/32176a89b822.jpg
4,vertical_bar,"[{'x': 'Harney', 'y': 75.91000641635438}, {'x'...","[{'height': 167, 'width': 14, 'x0': 71, 'y0': ...",3d895e511690,../data/raw/train/images/3d895e511690.jpg


In [18]:
bar_samples = bar.head(500)

dataframes = []
# Iterate over the row values themselves instead of bar_samples[col][i]: label
# lookups only work while the index happens to be exactly 0..n-1.
for coords, path, name in bar_samples[['data', 'path', 'name']].itertuples(index=False, name=None):
    # The 'data' column holds each chart's bars as a single-quoted list-of-dicts
    # string; swap the quotes so it parses as JSON.
    coords_list = json.loads(coords.replace("'", "\""))

    # The image size is needed to normalise the pixel boxes.
    img = cv2.imread(path)
    if img is None:
        # cv2.imread returns None instead of raising when a file is missing or
        # unreadable; fail here with the offending path rather than with an
        # AttributeError on `.shape`.
        raise FileNotFoundError(f'Could not read image: {path}')
    height, width = img.shape[:2]

    # Convert every bar from pixel (x0, y0, width, height) to
    # (x_center, y_center, width, height), each divided by the image size.
    bbox = []
    for bar_box in coords_list:
        x1 = bar_box['x0']
        y1 = bar_box['y0']
        w = bar_box['width']
        h = bar_box['height']
        x2 = x1 + w
        y2 = y1 + h

        x_center = (x1 + x2) / (2 * width)
        y_center = (y1 + y2) / (2 * height)
        yolo_width = w / width
        yolo_height = h / height
        bbox.append((x_center, y_center, yolo_width, yolo_height))

    # One small frame per image; every box gets class id 0.
    yolo = pd.DataFrame(bbox, columns=['x', 'y', 'w', 'h'])
    yolo['class'] = 0
    yolo['path'] = path
    yolo['name'] = name
    yolo = yolo[['path', 'name', 'class', 'x', 'y', 'w', 'h']]
    dataframes.append(yolo)

# Concat dataframes and check that the number of unique images is correct
yolo_df = pd.concat(dataframes)
yolo_df['path'].nunique()

500

In [19]:
# Inspect the combined table: one row per bar bounding box, with coordinates
# divided by the image width/height.
yolo_df

Unnamed: 0,path,name,class,x,y,w,h
0,../data/raw/train/images/75c0449f6917.jpg,75c0449f6917,0,0.184829,0.534173,0.040598,0.370504
1,../data/raw/train/images/75c0449f6917.jpg,75c0449f6917,0,0.268162,0.485612,0.040598,0.467626
2,../data/raw/train/images/75c0449f6917.jpg,75c0449f6917,0,0.351496,0.579137,0.040598,0.280576
3,../data/raw/train/images/75c0449f6917.jpg,75c0449f6917,0,0.434829,0.669065,0.040598,0.100719
4,../data/raw/train/images/75c0449f6917.jpg,75c0449f6917,0,0.518162,0.505396,0.040598,0.428058
...,...,...,...,...,...,...,...
8,../data/raw/train/images/b38361166664.jpg,b38361166664,0,0.682900,0.723183,0.015152,0.221453
9,../data/raw/train/images/b38361166664.jpg,b38361166664,0,0.748918,0.536332,0.017316,0.595156
10,../data/raw/train/images/b38361166664.jpg,b38361166664,0,0.816017,0.527682,0.017316,0.612457
11,../data/raw/train/images/b38361166664.jpg,b38361166664,0,0.882035,0.607266,0.015152,0.453287


In [20]:
# Split at the image level so that all boxes of one chart end up in the same subset.
unique_images = yolo_df['path'].unique()
img_df = pd.DataFrame(unique_images, columns=['path'])

# Fixed seed: without it every kernel restart draws a different 80/20 split, so
# the files exported further down would not be reproducible.
SPLIT_SEED = 42
img_train = tuple(img_df.sample(frac=0.8, random_state=SPLIT_SEED)['path'])

# Everything not sampled for training is the test set. A boolean mask avoids
# interpolating the whole tuple into a query string, which would break on any
# path containing a quote character.
img_test = tuple(img_df.loc[~img_df['path'].isin(img_train), 'path'])
len(img_train), len(img_test)

(400, 100)

In [21]:
# Row-level subsets: keep each annotation whose image belongs to the matching split.
in_train = yolo_df['path'].isin(img_train)
in_test = yolo_df['path'].isin(img_test)
train_df = yolo_df[in_train]
test_df = yolo_df[in_test]

In [22]:
# Output folders for the two YOLO dataset splits.
test_folder = '../data/processed/YOLO/Barplots/test/'
train_folder = '../data/processed/YOLO/Barplots/train/'

# Folder holding the raw chart images.
src_folder = '../data/raw/train/images/'

In [23]:
# Label columns, grouped by image name so that each image's boxes can be
# retrieved together.
cols = ['name', 'class', 'x', 'y', 'w', 'h']
groupby_obj_train, groupby_obj_test = [
    split_df[cols].groupby('name') for split_df in (train_df, test_df)
]

In [24]:
def save_data(filename, folder_path, group_obj, src_dir=None):
    """Copy one image into a dataset folder and write its label file beside it.

    Parameters
    ----------
    filename : str
        Image name without extension; also the group key looked up in ``group_obj``.
    folder_path : str
        Destination folder. It is created if it does not exist yet.
    group_obj : pandas.core.groupby.DataFrameGroupBy
        Annotations grouped by ``'name'``. The group's remaining columns are
        written in their existing order, one row per line.
    src_dir : str, optional
        Folder containing ``<filename>.jpg``. Defaults to the notebook-level
        ``src_folder``.

    Returns
    -------
    None
        The function is used for its side effects only.
    """
    if src_dir is None:
        src_dir = src_folder

    # `copy` does not create missing parent folders, so make sure the target
    # exists before writing anything into it.
    os.makedirs(folder_path, exist_ok=True)

    # copy image
    image_name = filename + '.jpg'
    copy(os.path.join(src_dir, image_name), os.path.join(folder_path, image_name))

    # save labels: space-separated, no header, and without the 'name' column
    text_filename = os.path.join(folder_path, filename + '.txt')
    labels = group_obj.get_group(filename).drop(columns='name')
    labels.to_csv(text_filename, index=False, header=False, sep=' ')

In [32]:
# Export every training image with its label file: one save_data call per image name.
filename_series = pd.Series(list(groupby_obj_train.groups))
filename_series.apply(
    lambda image_name: save_data(image_name, train_folder, groupby_obj_train)
)

0      None
1      None
2      None
3      None
4      None
       ... 
395    None
396    None
397    None
398    None
399    None
Length: 400, dtype: object

In [33]:
# Export every test image with its label file: one save_data call per image name.
filename_series_test = pd.Series(list(groupby_obj_test.groups))
filename_series_test.apply(
    lambda image_name: save_data(image_name, test_folder, groupby_obj_test)
)

0     None
1     None
2     None
3     None
4     None
      ... 
95    None
96    None
97    None
98    None
99    None
Length: 100, dtype: object

In [2]:
# Model definition handed to YOLO; the layer table in this cell's output is
# printed by the constructor.
# NOTE(review): a .yaml config presumably builds the network without pretrained
# weights; confirm against the ultralytics docs.
model_config = 'yolov8n.yaml'
model = YOLO(model_config)


                   from  n    params  module                                       arguments                     
  0                  -1  1       464  ultralytics.nn.modules.conv.Conv             [3, 16, 3, 2]                 
  1                  -1  1      4672  ultralytics.nn.modules.conv.Conv             [16, 32, 3, 2]                
  2                  -1  1      7360  ultralytics.nn.modules.block.C2f             [32, 32, 1, True]             
  3                  -1  1     18560  ultralytics.nn.modules.conv.Conv             [32, 64, 3, 2]                
  4                  -1  2     49664  ultralytics.nn.modules.block.C2f             [64, 64, 2, True]             
  5                  -1  1     73984  ultralytics.nn.modules.conv.Conv             [64, 128, 3, 2]               
  6                  -1  2    197632  ultralytics.nn.modules.block.C2f             [128, 128, 2, True]           
  7                  -1  1    295424  ultralytics.nn.modules.conv.Conv             [128

In [3]:
# Train on the dataset described by the Barplots data.yaml for a fixed number of epochs.
dataset_config = '../data/processed/YOLO/Barplots/data.yaml'
n_epochs = 50
model.train(data=dataset_config, epochs=n_epochs)

New https://pypi.org/project/ultralytics/8.0.132 available 😃 Update with 'pip install -U ultralytics'
Ultralytics YOLOv8.0.124 🚀 Python-3.11.3 torch-2.0.1 CPU
[34m[1myolo/engine/trainer: [0mtask=detect, mode=train, model=yolov8n.yaml, data=../data/processed/YOLO/Barplots/data.yaml, epochs=50, patience=50, batch=16, imgsz=640, save=True, save_period=-1, cache=False, device=None, workers=8, project=None, name=None, exist_ok=False, pretrained=True, optimizer=auto, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=0, resume=False, amp=True, fraction=1.0, profile=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, save_hybrid=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, show=False, save_txt=False, save_conf=False, save_crop=False, show_labels=True, show_conf=True, vid_stride=1, line_width=None, visualize=False, augment=False, agnostic_nms=False, classes=None,