In [1]:
import os
import pickle
import random
import shutil

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

### Reading json files

In [2]:
from SATM.preprocess import merge_aves_df

In [3]:
# The original train and val annotation files are merged into a single frame
# so that a fresh train/val/test split can be drawn from the union further down.
# (An earlier variant concatenated json2df() of each file by hand.)
annotation_files = ("train_2017_bboxes.json", "val_2017_bboxes.json")
df = merge_aves_df(*annotation_files)
df.head()

Unnamed: 0,license,file_name,rights_holder,height,width,id,area,iscrowd,image_id,bbox,category_id,id_y,identifier,category_name,super_category_name
139706,3,train_val_images/Aves/Bubulcus ibis/26a9157b48...,greglasley,545,800,213252,37238.0,0,213252,"[230, 199, 433, 172]",2912,153173,26a9157b48f66f71032f75ac70a11db7.jpg,Bubulcus ibis,Aves
214110,3,train_val_images/Insecta/Feltia herilis/8e7ecc...,leplady0209,600,800,318689,53133.5,0,318689,"[194, 138, 323, 329]",4169,235593,8e7ecc6f1bf06ad53acb8326b8696740.jpg,Feltia herilis,Insecta
429742,3,train_val_images/Aves/Tringa solitaria/a766afd...,J. Maughn,591,800,598727,6950.0,0,598727,"[113, 274, 139, 100]",3890,468382,a766afd4faa3b87cece8760ebaa6d9d1.jpg,Tringa solitaria,Aves
191185,3,train_val_images/Insecta/Manduca sexta/4f86fdb...,hobiecat,732,800,284974,194892.0,0,284974,"[146, 56, 654, 596]",3771,210301,4f86fdb62ca07b7257a1ab9a72816d98.jpg,Manduca sexta,Insecta
458018,3,train_val_images/Aves/Haemorhous mexicanus/e28...,cuskelly,600,800,631689,22000.0,0,631689,"[269, 129, 125, 352]",4506,498071,e28ff6dc89c379783c08bba85f66371c.jpg,Haemorhous mexicanus,Aves


### Removing unnecessary images

In [4]:
from SATM.preprocess import clean_aves
# Every entry of the "Aves" folder is treated as one category; later cells read
# images from "Aves/<category>" and match the entry against df.category_name.
category_list = os.listdir("Aves")
# NOTE(review): clean_aves is a project helper not visible here; presumably it strips
# ".ipynb_checkpoints" entries from category_list and/or the folders — confirm in SATM.preprocess.
clean_aves(cat_list = category_list, path = "Aves", element = ".ipynb_checkpoints")

In [5]:
from SATM.preprocess import encode_df
# Collect the file names of all downloaded images, category by category.
img_list = []
for cat in category_list:
    img_list.extend(os.listdir("Aves/" + cat))

# Keep only annotation rows whose image is actually on disk, then run the
# project's encoding step on the reduced frame.
on_disk = df.identifier.isin(img_list)
df = encode_df(df[on_disk])

In [6]:
# Some downloaded images have no bounding box. They are identified as the files
# on disk whose name does not appear in the dataframe's `identifier` column.
img_list = [i for cat in category_list for i in os.listdir("Aves/"+cat)]
identifiers = list(df.identifier)  # identifiers of every annotated picture
# Set membership keeps the scan linear; `identifiers` has one entry per
# bounding box, so scanning the list for each file would be quadratic.
known_identifiers = set(identifiers)
to_remove = [i for i in tqdm(img_list) if i not in known_identifiers]  # pictures missing from the dataframe
print(f'There are {len(to_remove)} pictures to be removed')

  0%|          | 0/22257 [00:00<?, ?it/s]

There are 0 pictures to be removed


In [7]:
# Delete from disk every picture listed in `to_remove`, whichever category folder holds it.
path = "Aves"
rem_count = 0
# De-duplicated set: a name listed twice would otherwise be removed twice from
# the same folder and raise FileNotFoundError on the second attempt.
pending_removal = set(to_remove)
for i in category_list:
    temp_path = os.path.join(path, i)
    # Only the files of this folder that are scheduled for removal.
    for k in pending_removal.intersection(os.listdir(temp_path)):
        os.remove(os.path.join(temp_path, k))
        rem_count += 1
print(f'{rem_count} pictures removed!')

0 pictures removed!


In [8]:
# Re-list the folders after the clean-up and compare the number of files with
# the number of distinct images referenced by the dataframe.
new_img_list = [i for cat in category_list for i in os.listdir(path+"/"+cat)]
print(f'From {len(img_list)} to {len(new_img_list)} images')
n_annotated_images = len(df.drop_duplicates("image_id"))
if len(new_img_list) == n_annotated_images:
    print("The number of rows in the dataframe is the same as the number of available images")
else:
    # Report the mismatch instead of staying silent, so a failed check is visible.
    print(f"WARNING: {len(new_img_list)} images on disk but {n_annotated_images} distinct images in the dataframe")

From 22257 to 22257 images
The number of rows in the dataframe is the same as the number of available images


### Train Val Test split

In [9]:
# Seed both RNGs used in this cell (the shuffles at the bottom use `random`).
seed = 810
random.seed(seed)
np.random.seed(seed)

# Work at image level: one row per image, regardless of its number of boxes.
images_df = df.drop_duplicates("image_id")
val_identifiers, test_identifiers = [], []
for cat in category_list:
    # Hold out 20% of each category (fixed random_state), then cut the held-out
    # list in two: first half -> validation, second half -> test.
    held_out = images_df[images_df.category_name == cat].sample(frac = 0.2, random_state = 0)
    temp_list = list(held_out.identifier)
    half = len(temp_list) // 2
    val_identifiers.extend(temp_list[:half])
    test_identifiers.extend(temp_list[half:])

random.shuffle(val_identifiers)
random.shuffle(test_identifiers)

In [10]:
# Route every bounding-box row to the split its image belongs to;
# whatever is neither validation nor test is training data.
val_test_identifiers = val_identifiers + test_identifiers
in_val = df.identifier.isin(val_identifiers)
in_test = df.identifier.isin(test_identifiers)
val_df = df[in_val]
test_df = df[in_test]
train_df = df[~(in_val | in_test)]

In [11]:
# One identifier per training image (rows are per bounding box, hence the de-duplication).
train_images = train_df.drop_duplicates("image_id")
train_identifiers = list(train_images.identifier)

In [12]:
# The three splits must account for every row of df exactly once.
n_split_rows = len(train_df) + len(val_df) + len(test_df)
if n_split_rows == len(df):
    print("Split done correctly!")
else:
    # Make a failed check visible instead of printing nothing.
    print(f"WARNING: train+val+test hold {n_split_rows} rows but df has {len(df)}")

Split done correctly!


In [13]:
# Per-category recap of the number of bounding-box rows in the full frame and
# in each split; "Sum" is train + val + test and should equal "Original".
recap_split_box = pd.DataFrame(columns = ["Original", "Train", "Val", "Test", "Sum"])
for cat in category_list:
    per_split = [len(part[part.category_name == cat]) for part in (train_df, val_df, test_df)]
    values = [len(df[df.category_name == cat]), *per_split, sum(per_split)]
    recap_split_box.loc[cat,:] = values

recap_split_box

Unnamed: 0,Original,Train,Val,Test,Sum
Melospiza melodia,2098,1676,213,209,2098
Ardea alba,3640,2917,356,367,3640
Pandion haliaetus,1999,1588,201,210,1999
Cardinalis cardinalis,2207,1758,220,229,2207
Zenaida macroura,2502,1997,250,255,2502
Agelaius phoeniceus,2348,1856,237,255,2348
Junco hyemalis,1385,1113,134,138,1385
Ardea herodias,4299,3445,422,432,4299
Buteo jamaicensis,3612,2874,372,366,3612
Picoides pubescens,1546,1240,154,152,1546


In [14]:
# Per-category recap of the number of distinct images in the full frame and in
# each split; "Sum" is train + val + test and should equal "Original".
recap_split_image = pd.DataFrame(columns = ["Original", "Train", "Val", "Test", "Sum"])

# De-duplicate each frame once: this does not depend on the category, so doing
# it inside the loop repeated the same work for every category.
all_images, train_images, val_images, test_images = (
    frame.drop_duplicates("image_id") for frame in (df, train_df, val_df, test_df)
)

for cat in category_list:
    per_split = [len(part[part.category_name == cat])
                 for part in (train_images, val_images, test_images)]
    values = [len(all_images[all_images.category_name == cat]), *per_split, sum(per_split)]
    recap_split_image.loc[cat,:] = values

recap_split_image

Unnamed: 0,Original,Train,Val,Test,Sum
Melospiza melodia,2050,1640,205,205,2050
Ardea alba,2848,2278,285,285,2848
Pandion haliaetus,1794,1435,179,180,1794
Cardinalis cardinalis,2006,1605,200,201,2006
Zenaida macroura,1932,1546,193,193,1932
Agelaius phoeniceus,1888,1510,189,189,1888
Junco hyemalis,1281,1025,128,128,1281
Ardea herodias,3627,2902,362,363,3627
Buteo jamaicensis,3328,2662,333,333,3328
Picoides pubescens,1503,1202,150,151,1503


In [15]:
# Persist the three split dataframes as pickles under pickles_df/.
split_pickles = (
    ('pickles_df/train.pickle', train_df),
    ('pickles_df/val.pickle', val_df),
    ('pickles_df/test.pickle', test_df),
)
for pickle_path, split_frame in split_pickles:
    with open(pickle_path, 'wb') as handle:
        pickle.dump(split_frame, handle)

In [None]:
# Copy every image from "Aves/<category>" into data/images/{Train,Val,Test},
# following the membership lists built from train_df, val_df and test_df.
# File names that fail to copy, or that belong to no split, end up in `errors`.
copy_counts = {"Train": 0, "Val": 0, "Test": 0}
errors = []

# Sets make each membership test O(1); the identifier lists hold thousands of names.
train_set = set(train_identifiers)
val_set = set(val_identifiers)
test_set = set(test_identifiers)

# Make sure the destination folders exist before copying into them.
for split in copy_counts:
    os.makedirs("data/images/" + split, exist_ok=True)

for cat in tqdm(category_list):
    temp_path = "Aves/"+cat
    for i in tqdm(os.listdir(temp_path)):
        # Same precedence as before: train, then val, then test.
        if i in train_set:
            split = "Train"
        elif i in val_set:
            split = "Val"
        elif i in test_set:
            split = "Test"
        else:
            errors.append(i)
            print(i, "non c'è la foto nei dataframes")
            continue

        try:
            shutil.copyfile(temp_path+"/"+i, "data/images/"+split+"/"+i)
        except OSError:
            # Only file-system failures are tolerated; a bare `except` also hid
            # programming errors (e.g. a missing import) behind "copy error".
            errors.append(i)
            print(i, f"{split.lower()} error")
        else:
            copy_counts[split] += 1

# Keep the individual counters available under their original names.
count_train = copy_counts["Train"]
count_val = copy_counts["Val"]
count_test = copy_counts["Test"]

### Generating labels for YOLO

In [16]:
from SATM.preprocess import convert_to_yolov5
# Run the project's YOLOv5 label conversion once per split.
for split_name, split_frame in (("Train", train_df), ("Val", val_df), ("Test", test_df)):
    convert_to_yolov5(which = split_name, df = split_frame)

In [16]:
from SATM.preprocess import clean_data
# NOTE(review): clean_data is a project helper called with its defaults; what it
# modifies is not visible in this notebook — confirm in SATM.preprocess before re-running.
clean_data()

### Generating the greyscale dataset (3 channels)

In [95]:
from torchvision.utils import save_image
# Build a 3-channel greyscale version of every image: average the channels,
# then repeat that average three times so the tensor keeps a 3-channel layout.
# NOTE(review): `transforms`, `Image` and `torch` are not imported by any cell
# above this one; they presumably come from kernel state left by a later cell — confirm.
transform = transforms.Compose([transforms.ToTensor()])
for folder in tqdm(["Train", "Val", "Test"]):
    source_p = os.listdir("data/images/"+folder)
    for i in tqdm(source_p):
        # Context manager closes the file handle; without it, one handle per
        # image stays open for the whole run.
        with Image.open(f"data/images/{folder}/{i}") as image_file: # open colour image
            tensor = transform(image_file)
        avg_tensor = tensor.mean(dim = 0)
        # Stack directly in torch instead of going through a numpy array.
        new_image = torch.stack([avg_tensor, avg_tensor, avg_tensor])
        print(new_image.shape)
        # Saving is currently disabled: this cell only reports the resulting shapes.
        #save_image(new_image, f'data_bw/images/{folder}/{i}')

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/17806 [00:00<?, ?it/s]

torch.Size([3, 532, 800])


  0%|          | 0/2224 [00:00<?, ?it/s]

torch.Size([3, 800, 800])


  0%|          | 0/2228 [00:00<?, ?it/s]

torch.Size([3, 800, 713])


### Generating the greyscale dataset (1 channel)

In [39]:
from torchvision.utils import save_image
# Build the single-channel greyscale dataset: average the channels of every
# image and save the resulting 2-D tensor under data_1channel/.
# NOTE(review): `transforms` and `Image` are not imported by any cell above this
# one; they presumably come from kernel state left by a later cell — confirm.
# NOTE(review): torchvision's save_image may replicate a single-channel tensor to
# three channels when writing — confirm the saved files really are 1-channel.
transform = transforms.Compose([transforms.ToTensor()])
for folder in tqdm(["Train", "Val", "Test"]):
    source_p = os.listdir("data/images/"+folder)
    for i in tqdm(source_p):
        # Context manager closes the file handle; without it, one handle per
        # image stays open for the whole run.
        with Image.open(f"data/images/{folder}/{i}") as image_file: # open colour image
            tensor = transform(image_file)
        avg_tensor = tensor.mean(dim = 0)
        save_image(avg_tensor, f'data_1channel/images/{folder}/{i}')



  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/17805 [00:00<?, ?it/s]

  0%|          | 0/2224 [00:00<?, ?it/s]

  0%|          | 0/2228 [00:00<?, ?it/s]

### Colorizing 1-channel greyscale images

In [9]:
#!git clone https://github.com/jantic/DeOldify.git DeOldify 

In [6]:
# Enter the DeOldify checkout: the following cells use paths relative to it
# (./models/..., ../data/...). Explicit %cd does not depend on automagic being enabled.
%cd DeOldify

/home/labuser/Project/DeOldify


In [7]:
#NOTE:  This must be the first call in order to work properly!
# NOTE(review): presumably the device has to be selected before torch is
# imported/initialised, which is why `import torch` only comes afterwards — confirm
# against the DeOldify documentation before reordering anything in this cell.
from deoldify import device
from deoldify.device_id import DeviceId
#choices:  CPU, GPU0...GPU7
device.set(device=DeviceId.GPU0)

import torch

# Only reports a missing GPU; execution continues either way.
if not torch.cuda.is_available():
    print('GPU not available.')

In [13]:
# !pip install -r requirements-colab.txt

In [8]:
import fastai
# NOTE(review): star import — presumably the source of get_image_colorizer used
# below, and possibly of other names this notebook never imports explicitly — confirm.
from deoldify.visualize import *
import warnings
# Hide the UserWarning whose message matches "Your ... set is empty".
warnings.filterwarnings("ignore", category=UserWarning, message=".*?Your .*? set is empty.*?")

In [15]:
# Download the DeOldify "artistic" generator weights into ./models.
# -p keeps the cell re-runnable: plain mkdir fails once the folder exists.
!mkdir -p models
!wget https://data.deepai.org/deoldify/ColorizeArtistic_gen.pth -O ./models/ColorizeArtistic_gen.pth

--2022-12-02 18:19:11--  https://data.deepai.org/deoldify/ColorizeArtistic_gen.pth
Resolving data.deepai.org... 5.9.140.253
Connecting to data.deepai.org|5.9.140.253|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 255144681 (243M) [application/octet-stream]
Saving to: ‘./models/ColorizeArtistic_gen.pth’


2022-12-02 18:19:13 (108 MB/s) - ‘./models/ColorizeArtistic_gen.pth’ saved [255144681/255144681]



In [9]:
# Build the DeOldify colorizer (artistic variant) used by the colorization loop.
# NOTE(review): presumably loads ./models/ColorizeArtistic_gen.pth fetched above — confirm.
colorizer = get_image_colorizer(artistic=True)

  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


In [11]:
# Colorize every Test image with DeOldify and save the result under ../colorized.
# NOTE(review): the input folder is ../data/images/Test (the original images),
# not the data_1channel greyscale copies — confirm this is intended.
source_dir = "../data/images/Test/"
target_dir = "../colorized/images/Test/"
for img in tqdm(os.listdir(source_dir)):
    img_pil = colorizer.get_transformed_image(source_dir + img, render_factor = 35)
    img_pil.save(target_dir + img)

  0%|          | 0/2228 [00:00<?, ?it/s]

In [12]:
# Return to the project root. Explicit %cd does not depend on automagic being enabled.
%cd ..

/home/labuser/Project


### Undersampling

In [63]:
# Size of the smallest training class, counted in distinct images.
# .min() replaces sort_values().head(1)[0]: integer [0] on a Series indexed by
# category name relies on positional fallback, which pandas has deprecated.
min_threshold = train_df.drop_duplicates("image_id").groupby("category_name").size().min()

In [None]:
# (Stub neutralised) This cell contained only a bare `for cat in category_list:`
# header with no body, which is a SyntaxError on Restart & Run All.
# The actual undersampling loop lives in the following cell.

In [78]:
# Undersampling: copy `min_threshold` training images (and their label files)
# per category into data_under/, so that every class ends up with the same size.
np.random.seed(810)

# Make sure the destination folders exist before copying into them.
os.makedirs("data_under/images/Train", exist_ok=True)
os.makedirs("data_under/labels/Train", exist_ok=True)

# One row per training image; computed once because it does not depend on the category.
temp_df = train_df.drop_duplicates("image_id")
for cat in tqdm(category_list):
    identifiers = temp_df[temp_df.category_name == cat].identifier.values
    # replace=False: np.random.choice samples WITH replacement by default, so the
    # same picture could be drawn several times and the class would end up with
    # fewer than min_threshold distinct images. min_threshold is the size of the
    # smallest class, so every class has enough images to sample without repeats.
    chosen = np.random.choice(identifiers, min_threshold, replace=False)
    for pic in tqdm(chosen):
        shutil.copyfile("data/images/Train/"+pic, "data_under/images/Train/"+pic)
        # Label file: same name with the 3-character extension swapped for "txt".
        shutil.copyfile("data/labels/Train/"+pic[:-3]+"txt", "data_under/labels/Train/"+pic[:-3]+"txt")
    
    

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/1025 [00:00<?, ?it/s]

  0%|          | 0/1025 [00:00<?, ?it/s]

  0%|          | 0/1025 [00:00<?, ?it/s]

  0%|          | 0/1025 [00:00<?, ?it/s]

  0%|          | 0/1025 [00:00<?, ?it/s]

  0%|          | 0/1025 [00:00<?, ?it/s]

  0%|          | 0/1025 [00:00<?, ?it/s]

  0%|          | 0/1025 [00:00<?, ?it/s]

  0%|          | 0/1025 [00:00<?, ?it/s]

  0%|          | 0/1025 [00:00<?, ?it/s]

### Oversampling

In [31]:
# Size of the largest training class, counted in distinct images.
# .max() replaces sort_values().tail(1)[0]: integer [0] on a Series indexed by
# category name relies on positional fallback, which pandas has deprecated.
max_threshold = train_df.drop_duplicates("image_id").groupby("category_name").size().max()

In [32]:
# Display the largest per-class training image count (the oversampling target).
max_threshold

2902

In [33]:
# Number of extra images each class needs to reach the size of the largest class.
train_images_per_class = train_df.drop_duplicates("image_id").groupby("category_name").size()
to_add = max_threshold - train_images_per_class

In [34]:
# Display, per class, how many extra training images are needed to reach max_threshold.
to_add

category_name
Agelaius phoeniceus      1392
Ardea alba                624
Ardea herodias              0
Buteo jamaicensis         240
Cardinalis cardinalis    1297
Junco hyemalis           1877
Melospiza melodia        1262
Pandion haliaetus        1467
Picoides pubescens       1700
Zenaida macroura         1356
dtype: int64

In [18]:
from SATM.preprocess import generate_txt

In [153]:
# Oversampling: for every class, create `to_add[cat]` augmented copies of its
# training images (image + label file) under data_over/.
#   * If the class has more images than copies needed, a random subset is
#     horizontally flipped (suffix "_v1").
#   * Otherwise every image is flipped once ("_v1") and a random subset is
#     additionally rotated ("_v2") to cover the remaining difference.
# NOTE(review): `albumentations`, `cv2`, `generate_target`, `torch` and `save_image`
# are not imported in this cell; they must already be in the kernel — confirm.
np.random.seed(810)


def _bbox_augmenter(transform):
    """Wrap a single albumentations transform so it also updates the bounding boxes."""
    return albumentations.Compose([transform],
                                  bbox_params = albumentations.BboxParams(format='pascal_voc',
                                                                          label_fields=['labels']))


def _sample_identifiers(pool, k):
    """Draw `k` identifiers from `pool`, without repeats whenever the pool is large enough.

    Output file names are derived from the identifier, so a picture drawn twice
    would simply overwrite its own augmented copy and the class would end up
    with fewer new images than requested.
    """
    return np.random.choice(pool, k, replace=bool(k > len(pool)))


def _augment_and_save(img, transform, suffix, results):
    """Augment one training image and write the image and label file with `suffix`.

    The transformed sample is appended to `results`; identifiers that cannot be
    read or processed are appended to the global `wrong` list instead of
    aborting the whole run.
    """
    image = cv2.imread("data/images/Train/"+img)
    if image is None:
        # cv2.imread returns None for a missing/unreadable file; cvtColor would crash on it.
        wrong.append(img)
        return
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    targets = generate_target(train_df[train_df.identifier == img])
    try:
        transformed = transform(image = image,
                                bboxes = targets['boxes'],
                                labels = targets['labels'])
        results.append(transformed)
        generate_txt(img, transformed, suffix)
        save_image(torch.tensor(transformed["image"]).cpu().permute(2,0,1)/255,
                   "data_over/images/Train/"+img[:-4]+suffix+".jpg")
    except Exception:
        # Best effort: remember the failing picture and carry on. `Exception`
        # (rather than a bare except) still lets KeyboardInterrupt stop the loop.
        wrong.append(img)


bbox_transform_flip = _bbox_augmenter(albumentations.HorizontalFlip(p=1))
bbox_transform_rotate = _bbox_augmenter(albumentations.Rotate(p=1))

wrong = []
# NOTE(review): trans_dic keeps every transformed image in memory; drop it if
# nothing downstream reads it and memory becomes a problem.
trans_dic = {}

# One row per training image; computed once because it does not depend on the category.
temp_df = train_df.drop_duplicates("image_id")

for cat in tqdm(category_list):
    identifiers = temp_df[temp_df.category_name == cat].identifier.values
    pota  = to_add[cat]
    trans_dic[cat] = []
    if len(identifiers) > pota:
        # Enough originals: flip a random subset of them.
        to_transform = _sample_identifiers(identifiers, pota)
        for img in tqdm(to_transform):
            _augment_and_save(img, bbox_transform_flip, "_v1", trans_dic[cat])
    else:
        # Not enough originals: flip all of them, then rotate a random subset
        # to cover what is still missing.
        difference = pota - len(identifiers)
        for img in tqdm(identifiers):
            _augment_and_save(img, bbox_transform_flip, "_v1", trans_dic[cat])

        to_transform = _sample_identifiers(identifiers, difference)
        for img in tqdm(to_transform):
            _augment_and_save(img, bbox_transform_rotate, "_v2", trans_dic[cat])

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/1262 [00:00<?, ?it/s]

  0%|          | 0/624 [00:00<?, ?it/s]

  0%|          | 0/1435 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/1297 [00:00<?, ?it/s]

  0%|          | 0/1356 [00:00<?, ?it/s]

  0%|          | 0/1392 [00:00<?, ?it/s]

  0%|          | 0/1025 [00:00<?, ?it/s]

  0%|          | 0/852 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/1202 [00:00<?, ?it/s]

  0%|          | 0/498 [00:00<?, ?it/s]