# Extract content and composite features
- Color-name code: https://osf.io/ts4an/?view_only=c2bc70802e534f14b1e73efc567462b3
- Face detection: https://github.com/timesler/facenet-pytorch
- Object detection: YOLO v8

In [9]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image, ImageDraw
import cv2
# os.environ['PATH'] += ':/work/tvision/bin'
# os.environ['PATH'] += ':/work/tvision/bin/bin'


# Root directory of the extracted frames; the cells below list it and treat
# each entry as a per-video sub-directory.
scene_frame_dir = '../eyetracking-frames-new/'
# A `.DS_Store` file in the root is not a video directory — presumably it is
# deleted here so that the later directory listing does not trip over it.
ds_store_file = os.path.join(scene_frame_dir, '.DS_Store')
try:
    # EAFP: attempt the delete directly rather than checking for existence
    # first, so there is no gap between the check and the removal.
    os.remove(ds_store_file)
except FileNotFoundError:
    pass  # nothing to clean up
    
import warnings
warnings.filterwarnings("ignore")

import features
from ultralytics import YOLO
# YOLOv8 detector built from the 'yolov8x.pt' weights file.
yolomodel = YOLO(model='yolov8x.pt')
from segment_anything import SamAutomaticMaskGenerator, sam_model_registry, SamPredictor
# Segment Anything (SAM): look up the constructor for the chosen model type in
# the registry and build the model from the checkpoint path below.
sam_checkpoint = "sam_vit_b_01ec64.pth"
model_type = "vit_b"
sam = sam_model_registry[model_type](checkpoint=sam_checkpoint)
# Automatic mask generator wrapping `sam`; this single instance (with these
# threshold / minimum-area settings) is reused for every frame in the notebook.
mask_generator = SamAutomaticMaskGenerator(sam, pred_iou_thresh=0.9, min_mask_region_area=200)

import subprocess
import tqdm
from math import sqrt

import torch
import torchvision
# Report library versions, then move SAM to the GPU when one is available
# and to the CPU otherwise.
cuda_available = torch.cuda.is_available()
print("PyTorch version:", torch.__version__)
print("Torchvision version:", torchvision.__version__)
print("CUDA is available:", cuda_available)
device = "cuda" if cuda_available else "cpu"

sam.to(device=device)

from scripts import content
from scripts import composite
from scripts import utils
from skimage.color import rgb2hsv
import re

PyTorch version: 2.0.1+cu117
Torchvision version: 0.15.2+cu117
CUDA is available: True


In [10]:
# Print how many CUDA devices PyTorch sees and which one is currently selected.
print(f"Number of available CUDA devices: {torch.cuda.device_count()}")
current_cuda_index = torch.cuda.current_device()
print(f"Current CUDA device index: {current_cuda_index}")
print(f"Name of current CUDA device: {torch.cuda.get_device_name(current_cuda_index)}")

Number of available CUDA devices: 1
Current CUDA device index: 0
Name of current CUDA device: Tesla V100-SXM2-32GB


# Read video frames
- for each shot, get the beginning, middle, and last frame
- obtain each feature in the three frames

In [11]:
os.listdir(os.path.join(scene_frame_dir,os.listdir(scene_frame_dir)[0]))

['000_2i.jpg',
 '001_2m.jpg',
 '002_2f.jpg',
 '003_3i.jpg',
 '004_3m.jpg',
 '005_3f.jpg',
 '006_4i.jpg',
 '007_4m.jpg',
 '008_4f.jpg',
 '009_5i.jpg',
 '010_5m.jpg',
 '011_5f.jpg',
 '012_6i.jpg',
 '013_6m.jpg',
 '016_7m.jpg',
 '017_7f.jpg',
 '018_8i.jpg']

In [12]:
# Collect the path of every frame image, walking one sub-directory per video.
# `os.listdir` returns entries in arbitrary order, so both levels are sorted
# to keep `scene_frames` (and any index or slice into it) reproducible
# between runs and machines.
scene_frames = []
for vdir in sorted(os.listdir(scene_frame_dir)):
    video_dir = os.path.join(scene_frame_dir, vdir)
    if not os.path.isdir(video_dir):
        # Stray files in the root (e.g. a recreated .DS_Store) are not video
        # directories and cannot be listed.
        continue
    for impath in sorted(os.listdir(video_dir)):
        scene_frames.append(os.path.join(scene_frame_dir, vdir, impath))

In [13]:
len(scene_frames)

4181

In [None]:
i = 40

In [None]:
frame = np.array(Image.open(scene_frames[i]))

In [None]:
# Display the frame with a dashed green rule-of-thirds grid on top of it.
plt.imshow(frame)
plt.axis('off')  # hide the axes
image_height, image_width, _ = frame.shape
third_width = image_width / 3
third_height = image_height / 3

# Vertical guides at 1/3 and 2/3 of the width.
for x_pos in (third_width, 2 * third_width):
    plt.axvline(x=x_pos, color='g', linestyle='--')

# Horizontal guides at 1/3 and 2/3 of the height.
for y_pos in (third_height, 2 * third_height):
    plt.axhline(y=y_pos, color='g', linestyle='--')

plt.show()

In [None]:
content.face_count_mtcnn(frame)

In [None]:
composite.inner_rectangle_features(frame)

In [None]:
results = yolomodel.predict(scene_frames[i], imgsz=640, conf=0.5, iou=0.25, save=True)  

In [None]:
# Count every detection in the first YOLO result, and separately the
# detections whose class index is 0.
# NOTE(review): in the COCO label set class 0 is 'person', so the second
# number looks like a person count rather than a face count — confirm.
detected_classes = results[0].boxes.cls
objects_count_yolo = detected_classes.shape[0]
face_count_yolo = sum(1 for cls_id in detected_classes if cls_id == 0)

In [None]:
objects_count_yolo, face_count_yolo 

In [None]:
masks = mask_generator.generate(frame)

In [None]:
len(masks)

In [None]:
res = content.get_sam_stats(masks, frame.shape[0]*frame.shape[1])

In [None]:
res

In [None]:
composite.analyze_rule_of_thirds(frame, masks[3]['bbox'], form='xywh')

In [None]:
features.visualize_boxes(scene_frames[i], [masks[3]['bbox']], form='xywh')

In [None]:
# Show the frame and hand the SAM masks to features.show_anns, which
# presumably draws them on the current axes — confirm against features.py.
plt.figure(figsize=(10,5))
plt.imshow(frame)
# NOTE(review): the second argument is 10x the frame's pixel count; its
# meaning is defined by features.show_anns, which is not visible here.
features.show_anns(masks,frame.shape[0]*frame.shape[1]*10)
plt.axis('off')
plt.show() 

# Extract and save features

In [17]:
def objects_yolo(im_path):
    """Run the YOLO detector on one image and summarise its detections.

    Args:
        im_path: path of the image file passed to ``yolomodel.predict``.

    Returns:
        dict with two keys:
        ``'objects_count_yolo'`` — number of detected boxes, and
        ``'face_count_yolo'`` — number of detected boxes with class index 0.

    NOTE(review): in the COCO label set class 0 is 'person', so
    ``face_count_yolo`` looks like a person count — confirm the naming.
    """
    # predict() returns one result per input image; only one image is passed.
    detection = yolomodel.predict(
        im_path, imgsz=640, conf=0.5, iou=0.25, save=False, verbose=False
    )[0]
    class_ids = detection.boxes.cls

    return {
        'objects_count_yolo': class_ids.shape[0],
        'face_count_yolo': sum(1 for cls_id in class_ids if cls_id == 0),
    }

In [18]:
def sam_details(frame):
    """Segment a frame with SAM and return mask statistics plus a thirds score.

    Args:
        frame: image array; ``frame.shape[0] * frame.shape[1]`` is used as the
            pixel count.

    Returns:
        The dict produced by ``content.get_sam_stats`` with one extra key,
        ``'rule_of_thirds'``: among masks covering more than 1% of the frame,
        the fraction whose bounding box has a truthy ``'rot_at_intersections'``
        result, rounded to 4 decimals (0 when no mask is large enough).
    """
    pixel_count = frame.shape[0] * frame.shape[1]
    segments = mask_generator.generate(frame)
    stats = content.get_sam_stats(segments, pixel_count)

    # Only masks larger than 1% of the frame take part in the thirds score.
    min_area = pixel_count / 100
    thirds_hits = [
        composite.analyze_rule_of_thirds(frame, seg['bbox'], form='xywh')['rot_at_intersections']
        for seg in segments
        if seg['area'] > min_area
    ]

    if thirds_hits:
        thirds_share = sum(1 for hit in thirds_hits if hit) / len(thirds_hits)
    else:
        thirds_share = 0

    stats.update({'rule_of_thirds': round(thirds_share, 4)})
    return stats

In [25]:
# Run every extractor on every frame and append one JSON row per frame.
os.makedirs('../features', exist_ok=True)

outpath = '../features/content-composite-complexity-eyetracking.json'

# Frame files are named '<im_num>_<shot><position>.jpg', e.g. '000_2i.jpg'.
# `\d+` captures the whole shot number; a single `\d` keeps only its first
# digit, which would record shot 12 as shot 1.
frame_name_pattern = re.compile(r'(\d+)([a-zA-Z])\.jpg')

for im_path in tqdm.tqdm(scene_frames):
    # The ad id is the name of the directory that holds the frame; taking it
    # from the path structure avoids depending on how deep the root is.
    ad_id = os.path.basename(os.path.dirname(im_path))
    # To resume an interrupted run, rows for ad ids already written could be
    # skipped here (previously: `if ad_id not in contentf['ad_id'].unique():`).

    name_parts = os.path.basename(im_path).split('_')
    im_num = name_parts[0]
    name_match = frame_name_pattern.search(name_parts[1])
    if name_match is None:
        raise ValueError(f"Unexpected frame file name: {im_path}")
    shot_num, position_in_shot = name_match.groups()

    results = {'ad_id':ad_id, 'shot':shot_num, 'position_in_shot':position_in_shot, 'im_num':im_num}

    # Close the image file as soon as its pixels are copied into the array,
    # instead of leaving thousands of handles for the garbage collector.
    with Image.open(im_path) as image_file:
        frame = np.array(image_file)

    re_face = content.face_count_mtcnn(frame)
    re_inner = composite.inner_rectangle_features(frame)
    re_yolo = objects_yolo(im_path)
    re_sam = sam_details(frame)

    # Merge in the same order as before so later extractors win on key clashes.
    for extracted in (re_face, re_inner, re_yolo, re_sam):
        results.update(extracted)

    utils.write_json_row(outpath,results)

100%|██████████| 2000/2000 [1:12:33<00:00,  2.18s/it]
