In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import cv2

import json
import os
from shutil import move, copy

from ultralytics import YOLO

# Setting up the Data

In [2]:
# Load the merged annotation/image table, then build the scatter-plot subset
# in a single chain: filter rows, cap at 4000, keep four columns, rename them
# and renumber the index from 0.
data = pd.read_csv('../data/interim/annots_imgs_merged.csv')
scatterplots = (
    data.loc[data['chart-type'] == 'scatter']
    .head(4000)
    .loc[:, ['data-series', 'visual-elements.scatter points', 'file_name', 'image_path']]
    .rename(columns={
        'data-series': 'data_series',
        'visual-elements.scatter points': 'data_coords',
        'file_name': 'name',
        'image_path': 'path',
    })
    .reset_index(drop=True)
)
scatterplots.head()

Unnamed: 0,data_series,data_coords,name,path
0,"[{'x': 1949.4200576738224, 'y': 66.68303958393...","[[{'x': 89.33333333333334, 'y': 75.59999999999...",e91e28111e86,../data/interim/Scatterplots/e91e28111e86.jpg
1,"[{'x': 0.9445463278316278, 'y': 52.13870797394...","[[{'x': 68.76190476190474, 'y': 188.2023809523...",66dd2a250237,../data/interim/Scatterplots/66dd2a250237.jpg
2,"[{'x': 0.8933632249058832, 'y': 5.440194292653...","[[{'x': 93.33333333333333, 'y': 210.9666666666...",497a547454d7,../data/interim/Scatterplots/497a547454d7.jpg
3,"[{'x': 0.9873584566532869, 'y': 30.64333251170...","[[{'x': 67.83333333333334, 'y': 167.15}, {'x':...",07fb50377c3c,../data/interim/Scatterplots/07fb50377c3c.jpg
4,"[{'x': 1989.854169237731, 'y': 31.103360811667...","[[{'x': 105.83333333333334, 'y': 212.799999999...",daa43320159b,../data/interim/Scatterplots/daa43320159b.jpg


In [3]:
# Side length, in pixels, of the square box placed around every scatter point.
BOX_SIZE_PX = 3

dataframes = []
for row in scatterplots.itertuples(index=False):
    # data_coords holds a single-quoted, Python-repr style string; swap the
    # quotes so it can be parsed as JSON.
    coords_list = json.loads(row.data_coords.replace("'", "\""))

    # extract important image info
    path = row.path
    name = row.name
    img = cv2.imread(path)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising;
        # fail here with the offending path rather than on `.shape` below.
        raise OSError(f'cv2.imread could not read image: {path}')
    height, width = img.shape[:2]

    # Transform the xy coords into YOLO bboxes (x, y, w, h), each divided by
    # the image width/height.
    # NOTE(review): only coords_list[0] is converted; any further entries in
    # data_coords are ignored -- confirm this is intended.
    # NOTE(review): int() truncates the pixel coordinates before normalising,
    # which can shift a box by up to one pixel -- confirm this is intended.
    bboxes = [
        (
            int(data_point['x']) / width,
            int(data_point['y']) / height,
            BOX_SIZE_PX / width,
            BOX_SIZE_PX / height,
        )
        for data_point in coords_list[0]
    ]

    # One frame per image: a row per point, single class id 0.
    yolo = pd.DataFrame(bboxes, columns=['x', 'y', 'w', 'h'])
    yolo['class'] = 0
    yolo['path'] = path
    yolo['name'] = name
    dataframes.append(yolo[['path', 'name', 'class', 'x', 'y', 'w', 'h']])

# Concat dataframes and check that the number of unique images is correct
yolo_df = pd.concat(dataframes)
yolo_df['path'].nunique()
    


4000

In [4]:
# Preview the first five label rows.
yolo_df.head(n=5)

Unnamed: 0,path,name,class,x,y,w,h
0,../data/interim/Scatterplots/e91e28111e86.jpg,e91e28111e86,0,0.176938,0.218659,0.005964,0.008746
1,../data/interim/Scatterplots/e91e28111e86.jpg,e91e28111e86,0,0.212724,0.221574,0.005964,0.008746
2,../data/interim/Scatterplots/e91e28111e86.jpg,e91e28111e86,0,0.250497,0.227405,0.005964,0.008746
3,../data/interim/Scatterplots/e91e28111e86.jpg,e91e28111e86,0,0.282306,0.244898,0.005964,0.008746
4,../data/interim/Scatterplots/e91e28111e86.jpg,e91e28111e86,0,0.314115,0.268222,0.005964,0.008746


# Train/Test Split

In [5]:
# Fixed seed so the 80/20 image split is the same on every Restart & Run All.
SPLIT_SEED = 42

# Split on unique image paths (not on label rows) so that all boxes of one
# image land on the same side of the split.
unique_images = yolo_df['path'].unique()
img_df = pd.DataFrame(unique_images, columns=['path'])
img_train = tuple(img_df.sample(frac=0.8, random_state=SPLIT_SEED)['path'])

# Everything that was not sampled for training becomes the test set. A boolean
# mask avoids interpolating thousands of paths into a query string.
img_test = tuple(img_df.loc[~img_df['path'].isin(img_train), 'path'])

# The two sides must be disjoint and together cover every image.
assert not set(img_train) & set(img_test)
assert len(img_train) + len(img_test) == len(img_df)
len(img_train), len(img_test)

(3200, 800)

In [6]:
# Select the label rows belonging to each side of the image-level split.
train_df = yolo_df[yolo_df['path'].isin(img_train)]
test_df = yolo_df[yolo_df['path'].isin(img_test)]

In [7]:
# Display the training label rows (pandas abbreviates the middle of a long frame).
train_df

Unnamed: 0,path,name,class,x,y,w,h
0,../data/interim/Scatterplots/66dd2a250237.jpg,66dd2a250237,0,0.143763,0.696296,0.006342,0.011111
1,../data/interim/Scatterplots/66dd2a250237.jpg,66dd2a250237,0,0.179704,0.588889,0.006342,0.011111
2,../data/interim/Scatterplots/66dd2a250237.jpg,66dd2a250237,0,0.211416,0.548148,0.006342,0.011111
3,../data/interim/Scatterplots/66dd2a250237.jpg,66dd2a250237,0,0.245243,0.737037,0.006342,0.011111
4,../data/interim/Scatterplots/66dd2a250237.jpg,66dd2a250237,0,0.276956,0.222222,0.006342,0.011111
...,...,...,...,...,...,...,...
27,../data/interim/Scatterplots/08181d5dbf44.jpg,08181d5dbf44,0,0.863454,0.795222,0.006024,0.010239
28,../data/interim/Scatterplots/08181d5dbf44.jpg,08181d5dbf44,0,0.893574,0.679181,0.006024,0.010239
29,../data/interim/Scatterplots/08181d5dbf44.jpg,08181d5dbf44,0,0.921687,0.252560,0.006024,0.010239
30,../data/interim/Scatterplots/08181d5dbf44.jpg,08181d5dbf44,0,0.951807,0.563140,0.006024,0.010239


In [8]:
# Folder the source .jpg images are copied from (read by save_data).
src_folder = '../data/interim/Scatterplots/'
# Destination folders for the train and test images and their label files.
train_folder = '../data/YOLO/Scatterplots/train/'
test_folder = '../data/YOLO/Scatterplots/test/'

In [9]:
# Keep only the image name and the label fields, then group the rows per image
# so each group can be written out as one label file.
cols = ['name', 'class', 'x', 'y', 'w', 'h']
groupby_obj_train, groupby_obj_test = (
    split_df.loc[:, cols].groupby('name') for split_df in (train_df, test_df)
)

In [10]:
def save_data(filename, folder_path, group_obj, source_folder=None):
    """Copy one image and write its label file into ``folder_path``.

    Parameters
    ----------
    filename : str
        Image name without extension; also the group key in ``group_obj``.
    folder_path : str
        Destination folder. It is created if it does not exist yet.
    group_obj : pandas groupby object
        Label rows grouped by ``'name'``; the group for ``filename`` must
        contain a ``'name'`` column plus the label columns to write.
    source_folder : str, optional
        Folder holding ``<filename>.jpg``. Defaults to the notebook-level
        ``src_folder``.

    Returns
    -------
    None

    Raises
    ------
    FileNotFoundError
        If the source image does not exist.
    KeyError
        If ``filename`` is not a group in ``group_obj``.
    """
    if source_folder is None:
        source_folder = src_folder

    # Without this, copying fails on a fresh checkout where the destination
    # folders have not been created by hand.
    os.makedirs(folder_path, exist_ok=True)

    # copy image
    image_file = filename + '.jpg'
    copy(os.path.join(source_folder, image_file), os.path.join(folder_path, image_file))

    # save labels: one space-separated row per box, without the name column,
    # header or index
    text_filename = os.path.join(folder_path, filename + '.txt')
    labels = group_obj.get_group(filename).drop(columns='name')
    labels.to_csv(text_filename, index=False, header=False, sep=' ')

In [11]:
# Export every training image together with its label file.
filename_series = pd.Series(list(groupby_obj_train.groups))
filename_series.apply(lambda fname: save_data(fname, train_folder, groupby_obj_train))

0       None
1       None
2       None
3       None
4       None
        ... 
3195    None
3196    None
3197    None
3198    None
3199    None
Length: 3200, dtype: object

In [12]:
# Export every test image together with its label file.
filename_series_test = pd.Series(list(groupby_obj_test.groups))
filename_series_test.apply(lambda fname: save_data(fname, test_folder, groupby_obj_test))


0      None
1      None
2      None
3      None
4      None
       ... 
795    None
796    None
797    None
798    None
799    None
Length: 800, dtype: object

# Training the Model

In [17]:
# Build the network from the yolov8s-p2 config, then load the yolov8s.pt
# weights into it.
model = YOLO('yolov8s-p2.yaml')
model = model.load('yolov8s.pt')


                   from  n    params  module                                       arguments                     
  0                  -1  1       928  ultralytics.nn.modules.conv.Conv             [3, 32, 3, 2]                 
  1                  -1  1     18560  ultralytics.nn.modules.conv.Conv             [32, 64, 3, 2]                
  2                  -1  1     29056  ultralytics.nn.modules.block.C2f             [64, 64, 1, True]             
  3                  -1  1     73984  ultralytics.nn.modules.conv.Conv             [64, 128, 3, 2]               
  4                  -1  2    197632  ultralytics.nn.modules.block.C2f             [128, 128, 2, True]           
  5                  -1  1    295424  ultralytics.nn.modules.conv.Conv             [128, 256, 3, 2]              
  6                  -1  2    788480  ultralytics.nn.modules.block.C2f             [256, 256, 2, True]           
  7                  -1  1   1180672  ultralytics.nn.modules.conv.Conv             [256

 11             [-1, 6]  1         0  ultralytics.nn.modules.conv.Concat           [1]                           
 12                  -1  1    591360  ultralytics.nn.modules.block.C2f             [768, 256, 1]                 
 13                  -1  1         0  torch.nn.modules.upsampling.Upsample         [None, 2, 'nearest']          
 14             [-1, 4]  1         0  ultralytics.nn.modules.conv.Concat           [1]                           
 15                  -1  1    148224  ultralytics.nn.modules.block.C2f             [384, 128, 1]                 
 16                  -1  1         0  torch.nn.modules.upsampling.Upsample         [None, 2, 'nearest']          
 17             [-1, 2]  1         0  ultralytics.nn.modules.conv.Concat           [1]                           
 18                  -1  1     37248  ultralytics.nn.modules.block.C2f             [192, 64, 1]                  
 19                  -1  1     36992  ultralytics.nn.modules.conv.Conv             [64, 

2023-07-17 12:20:25,276 - clearml.model - INFO - Selected model id: a66c01fc59c8440199f8fbb7ae883e3e


Transferred 219/437 items from pretrained weights


In [18]:
# Train the model using the settings in data.yaml.
# NOTE(review): this cell requests epochs=30, but the recorded trainer log
# below reports epochs=50 -- the saved output appears to come from a different
# run; re-run the cell or align the value.
results = model.train(data='../data/YOLO/Scatterplots/data.yaml', epochs=30)

New https://pypi.org/project/ultralytics/8.0.136 available 😃 Update with 'pip install -U ultralytics'
Ultralytics YOLOv8.0.124 🚀 Python-3.11.3 torch-2.0.1 CPU
[34m[1myolo/engine/trainer: [0mtask=detect, mode=train, model=yolov8s-p2.yaml, data=/Users/matt/Desktop/graphs-capstone/data/YOLO/Scatterplots/data.yaml, epochs=50, patience=50, batch=16, imgsz=640, save=True, save_period=-1, cache=False, device=None, workers=8, project=None, name=None, exist_ok=False, pretrained=True, optimizer=auto, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=0, resume=False, amp=True, fraction=1.0, profile=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, save_hybrid=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, show=False, save_txt=False, save_conf=False, save_crop=False, show_labels=True, show_conf=True, vid_stride=1, line_width=None, visualize=False, augment=False, agn