# Generalized code to prepare and combine (both synthetic and real) Image Fruad Datasets for use with YOLOv5
## Example DB: 1. COVERAGE, 2. IEEE IFS-TC

### imports

In [2]:
import cv2
import re
import yaml
import numpy as np
import matplotlib.pyplot as plt
import zipfile
import glob
from random import randint
import os
import torch
import tensorflow as tf
from torchvision.io import read_image
import torchvision.transforms.functional as F
import torchvision.transforms as T
from torchvision.ops import masks_to_boxes

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Unzip/extract images and masks files (uploaded as zip files)

### COVERAGE

### Direct zip upload:

In [None]:
with zipfile.ZipFile(root +'masks.zip' , 'r') as zip_ref:
  zip_ref.extractall(root)

In [None]:
with zipfile.ZipFile(root + 'images.zip' , 'r') as zip_ref:
  zip_ref.extractall(root)

### Read DB files from Drive directly (faster)

In [3]:
# path where images and masks folder(s) are located 
db_root = '/content/drive/MyDrive/IF_DBs/COVERAGE/'

### set name of DB, test/train/val sizes, and folder where masks located

In [4]:
# name of DB
DB = 'coverage'
# COVERAGE is a very small DB
val_size = test_size = 15
# name 
files = 'masks'

### IEEE IFS-TC ImageForensics Challenge

### Direct zip upload:

In [None]:
with zipfile.ZipFile(root +'fake 2.zip' , 'r') as zip_ref:
  zip_ref.extractall(root)

In [None]:
with zipfile.ZipFile(root + 'pristine.zip' , 'r') as zip_ref:
  zip_ref.extractall(root)

### Read DB files from Drive directly (faster)

In [4]:
# path where images and masks folder(s) are located 
db_root = '/content/drive/MyDrive/IF_DBs/IFS-TC/'

In [5]:
# name of DB
DB = 'ifs-tc'
# COVERAGE is a very small DB
val_size = test_size = 150
# name 
files = 'fake'

### other constants

In [6]:
# YOLO requires images to be squares 
# choose appropriate side length
im_size = 416

In [None]:
%rm -rf /content/current
%mkdir /content/current

In [23]:
root = '/content/current/'

In [24]:
# Create Labeled Train/Val/Test Sets in Proper File Structure
# train 
os.mkdir(root + 'train')
os.mkdir(root + 'train/' + 'labels')
os.mkdir(root + 'train/' + 'images')
# test
os.mkdir(root + 'test')
os.mkdir(root + 'test/' + 'labels')
os.mkdir(root + 'test/' + 'images')
# valid
os.mkdir(root + 'valid')
os.mkdir(root + 'valid/' + 'labels')
os.mkdir(root + 'valid/' + 'images')

## Create and label synthetic Tampered Dataset (and divide into train/valid)
- either save the weights (trained with synthetic data) to retrain or fine-tune with non-synthetic dataset or run add them to the same training set




## TODO:

Datasets:
- try other image fraud db
- compare synthetic Pascal Voc images (download masks)
- compare synthetic COCO images (download masks)

Testing:
- add in untampered images
- add more augmented data from actual training set since there is a class imbalance

Preprossing:
- try resizing via padding only to avoid adding additional "tampering" or artifacts
- move around masks pasted



##image fraud DBs only include masks (no BBs) so this `createLabeledSets` function extracts BBs from masks to create the labels while placing properly formatted images and labels in proper YOLO file structures (for each train/val/test set)


**root:** where directories with test/train/val will be created 
and where the original tampered/authentic images and masks will be unzipped

**DB:** name of image fraud database

**files:** name of folder holding all masks and/or corresponding tampered images

**test_size:** number of test images

**val_size:** number of validation images

**im_size:** side length of square images

**synthetic:** if True then a synthetically crop object from another image and paste onto untampered image

**num_real** = number of real (authentic) images to add to the training test and validation sets
**augmentation** = 'None'
**n_rotations** = 1
**bb_pad** = 0

In [25]:
def createLabeledSets(root, DB, files, test_size, val_size, im_size, synthetic, num_real = 0, augmentation = 'None', n_rotations = 1, bb_pad = 0):

  # get all mask names in a list (sorted so first images will always be in test set)
  all_files = list(sorted(os.listdir(os.path.join(db_root, files))))

  subset = 'test'
  count = num_real
  group = ['test', 'train', 'valid']

  # def crop_resize(im, im_size, bb = None):
  #   if bb != None:
  #     x0, x0, w, h = bb
  #     # amount to subtract from each coordinate x and y of the topleft corner of the BB to get new image corner
  #     d_x, d_y = randint( 0, min(x0, im_size-w)), randint(0, min(y0, im_size-h)) 
    
  #     new_im = im[x0 - d_x: ]

  # iterate through each mask 
  for i in range(len(all_files)):
    
    if test_size <= i < (test_size + val_size):
      subset = 'valid' 
    elif i >= (test_size + val_size):
      subset = 'train'

    if i < test_size:
      num = randint(0,test_size)
    else:
      num = randint(test_size,len(all_files)-1)

    if DB == 'coverage':
      ext = '.jpeg'
      rev = False
      mode = cv2.THRESH_BINARY
      # remove forgery type info from image number so it matches with the corresponding image
      # for COVERAGE dataset matching numbers in names mean corresponding image/mask pair
      auth_name = re.sub("[^0-9]", "", all_files[i])
      
      if all_files[i] != auth_name + 'paste.jpeg':
        continue

      tamp_name = auth_name + 't' 
      # make sure to not paste on tampered image for synthetic generation
      auth2_name = re.sub("[^0-9]", "", all_files[num]) 

      # get image and mask using above
      tamp_im_i = cv2.imread(os.path.join(db_root, "images", tamp_name + ext))
      mask_im_i = cv2.imread(os.path.join(db_root, files, all_files[i]))
      # COVERAGE contains a corresponding 'real' image for each tampered image
      auth_im = cv2.imread(os.path.join(db_root, "images", auth_name + ext))
      auth2_im = cv2.imread(os.path.join(db_root, "images", auth2_name + ext)) 
    
    if DB == 'ifs-tc':
      ext = '.png'
      rev = True
      mode = cv2.THRESH_BINARY_INV
      if all_files[i] == all_files[i].split('.mask')[0]:
        continue
      tamp_name = all_files[i].split('.mask')[0]
      auth_name = list(sorted(os.listdir(os.path.join(db_root, 'pristine'))))[i].split(ext)[0]#all_files[i].split(ext)[0]
      auth2_name = list(sorted(os.listdir(os.path.join(db_root, 'pristine'))))[num].split(ext)[0]

      tamp_im_i = cv2.imread(os.path.join(db_root, files, tamp_name + ext))
      mask_im_i = cv2.imread(os.path.join(db_root, files, all_files[i]))
      auth_im = cv2.imread(os.path.join(db_root, "pristine", auth_name + ext)) 
      auth2_im = cv2.imread(os.path.join(db_root, "pristine", auth2_name + ext)) 


    if num_real > 0 and count > 0:
        count -= 1
        subset2 = group[i % 3]
        # write empty file meaning 'no object' for untampered images (ones without the 't' are untampered)
        with open(os.path.join(root, subset2, "labels", auth_name + '.txt'), 'w') as f:     
          f.write(' ')
        # then add the real image to the set folder

        ### HERE
        auth_im = cv2.resize(auth_im, (im_size,im_size))
        cv2.imwrite(os.path.join(root, subset2, "images", auth_name + ext), auth_im)
      

    for j in range(n_rotations):
      mask_im = mask_im_i
      tamp_im = tamp_im_i
      # rotate both the mask and the tamped image its extracted from 
      if j == 0:
        # no rotation (full 360)
        mask_im = cv2.rotate(mask_im, cv2.ROTATE_180)
        tamp_im = cv2.rotate(tamp_im, cv2.ROTATE_180)
        rotate = cv2.ROTATE_180
      elif j == 1:
        if subset == 'test':
          break
        rotate = cv2.ROTATE_90_COUNTERCLOCKWISE
      elif j == 2:
        rotate = cv2.ROTATE_180
      else:
        rotate = cv2.ROTATE_90_CLOCKWISE
      
      # get BBs from mask
      # first rotate and resize
   
      mask_im = cv2.rotate(mask_im, rotate)
      ### HERE
      mask_im = cv2.resize(mask_im, (im_size, im_size))

      # make sure image is binary and threshold
      gray = cv2.cvtColor(mask_im, cv2.COLOR_BGR2GRAY)
      thresh = cv2.threshold(gray,250,255, mode)[1]
      cnts = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
      cnts = cnts[0] if len(cnts) == 2 else cnts[1]
      for c in cnts:
        x, y, w1, h1 = cv2.boundingRect(c)
      if x + w1 >= im_size:
        continue
      if y + h1 >= im_size:
        continue        
      # YOLO requires coordinates in x,y center, x,y height/width
      c_x = x + (w1 // 2) 
      c_y = y + (h1 // 2)
      # YOLO requires normalization of coordinates      
      c_x /= im_size
      c_y /= im_size 
      w = (w1 + bb_pad) / im_size 
      h = (h1 + bb_pad)/ im_size

      # rotate and resize image (same ask mask)
      im_r = cv2.rotate(tamp_im, rotate)

      ### HERE
      out_im = cv2.resize(im_r, (im_size,im_size))

      if synthetic == True:  
        # read in random image to paste extracted masked region onto it
        # and keep BG and FG images in same sets:

        ### HERE
        auth2_im = cv2.resize(auth2_im, (im_size, im_size))
      
        # mask of im is 0/black or white
        # (depending on DB) everywhere where im2 will go   
        if rev == True:
          out_im = np.where(mask_im == 0, out_im, auth2_im)
        else:
          out_im = np.where(mask_im == 0, auth2_im, out_im)
        # im = np.where(mask >= 250, im, im2)

      if augmentation != 'None':
        # imagaug and albumentations (augmentations for BB included)
        if subset == 'test':
          break
        if augmentation == 'brightness':
          b = np.ones(out_im.shape, dtype='uint8')*70
          out_im = cv2.add(out_im,b)

        elif augmentation == 'dullness':
          d = np.ones(out_im.shape, dtype='uint8')*70
          out_im = cv2.subtract(out_im,d)

        elif augmentation == 'sharpness':
          s = np.array([[-1,-1,-1],[-1,10,-1],[-1,-1,-1]])
          out_im = cv2.filter2D(out_im, -1,s)

      # debug:
      #out_im = cv2.rectangle(out_im, (x- (bb_pad//2),y- (bb_pad//2)), (x+w1 + (bb_pad//2),y+h1+ (bb_pad//2)), (255, 255, 255), 2)
      
      cv2.imwrite(os.path.join(root, subset, "images", tamp_name + '_' + augmentation + str(j) + ext), out_im)

      with open(os.path.join(root, subset, "labels", tamp_name + '_' + augmentation + str(j) + '.txt'), 'w') as f:
        f.write(str(0) + ' ' + str(c_x) + ' ' + str(c_y) + ' ' + str(w) + ' ' + str(h))


## Label actual Tampered Dataset (and divide into test/train/valid)

### synthetic db all 4 rotations and 2 augmentation types

In [26]:
createLabeledSets(root, DB, files, test_size, val_size, im_size, True, 0, 'None', 4, 4)

In [12]:
createLabeledSets(root, DB, files, test_size, val_size, im_size, True, 0, 'dullness', 1, 10)

In [13]:
createLabeledSets(root, DB, files, test_size, val_size, im_size, True, 0, 'brightness', 1, 10)

### Create sets (non-synthetic)

In [None]:
createLabeledSets(root, DB, files, test_size, val_size, im_size, False, 0, 'None', 4, 12)

In [None]:
createLabeledSets(root, DB, files, test_size, val_size, im_size, False, 0, 'brightness', 1, 12)

In [None]:
createLabeledSets(root, DB, files, test_size, val_size, im_size, False, 0, 'dullness', 1, 12)

In [14]:
%%writefile /content/data.yaml
train: /content/current/train/images
val: /content/current/valid/images
test: /content/current/test/images


nc: 1
names: ['tampered']

Writing /content/data.yaml


# Setup

Clone repo, install dependencies and check PyTorch and GPU.

In [20]:
!git clone https://github.com/ultralytics/yolov5  # clone
%cd yolov5
%pip install -qr requirements.txt  # install

#import torch
import utils
display = utils.notebook_init()  # checks

YOLOv5 🚀 v6.2-23-g4a8ab3b Python-3.7.13 torch-1.12.1+cu113 CUDA:0 (Tesla P100-PCIE-16GB, 16281MiB)


Setup complete ✅ (4 CPUs, 25.5 GB RAM, 40.6/166.8 GB disk)


In [None]:
%%time
%cd /content/yolov5/
!python train.py --img 416 --batch 16 --epochs 240 --data '/content/data.yaml' --cfg /content/yolov5/models/yolov5m.yaml --weights '' --name yolov5s_results  --cache

In [33]:
%%time
%cd /content/yolov5/
!python val.py --weights /content/yolov5/runs/train/yolov5s_results2/weights/best.pt --data /content/data.yaml --img 416 --conf 0.5 --task test --save-txt


/content/yolov5
[34m[1mval: [0mdata=/content/data.yaml, weights=['/content/yolov5/runs/train/yolov5s_results2/weights/best.pt'], batch_size=32, imgsz=416, conf_thres=0.5, iou_thres=0.6, task=test, device=, workers=8, single_cls=False, augment=False, verbose=False, save_txt=True, save_hybrid=False, save_conf=False, save_json=False, project=runs/val, name=exp, exist_ok=False, half=False, dnn=False
YOLOv5 🚀 v6.2-23-g4a8ab3b Python-3.7.13 torch-1.12.1+cu113 CUDA:0 (Tesla P100-PCIE-16GB, 16281MiB)

Fusing layers... 
YOLOv5m summary: 290 layers, 20852934 parameters, 0 gradients, 47.9 GFLOPs
[34m[1mtest: [0mScanning '/content/current/test/labels.cache' images and labels... 59 found, 0 missing, 0 empty, 0 corrupt: 100% 59/59 [00:00<?, ?it/s]
               Class     Images     Labels          P          R     mAP@.5 mAP@.5:.95: 100% 2/2 [00:01<00:00,  1.52it/s]
                 all         59         59      0.395      0.288      0.334      0.254
Speed: 0.1ms pre-process, 3.8ms inference

In [None]:
%%time
%cd /content/yolov5/classify/
!python predict.py --weights /content/yolov5/runs/train/yolov5s_results2/weights/best.pt 


In [None]:
%cd /content/yolov5/
!python detect.py --weights runs/train/yolov5s_results2/weights/best.pt --img 416 --conf 0.2 --source ../current/test/images

## Freeze backbone

In [None]:
%%time
%cd /content/yolov5/
!python train.py --freeze 10 --img 416 --batch 16 --epochs 240 --data '/content/data.yaml' --cfg /content/yolov5/models/yolov5m.yaml --weights '/content/syn240.pt' --name yolov5s_results  --cache