#### Reload images and videos datasets

In [1]:
import pandas as pd
import json

df_images = pd.read_csv('data/datasets/images_clean.csv')
df_videos = pd.read_csv('data/datasets/videos.csv')

df_videos['tags'] = df_videos['tags'].apply(json.loads)

print(df_videos.columns)
print()
display(df_videos.index[:10])

Index(['_id', 'blob_name', 'blob_size', 'bucket_name', 'file_name', 'code',
       'n_folders', 'timestamp', 'folder_structure', 'folder', 'tags', 'url',
       'api_url', 'bucket', 'seen'],
      dtype='object')



RangeIndex(start=0, stop=10, step=1)

#### Check existence of images in folder

In [7]:
import os
def get_nested_files(folder_path):
    file_paths = []
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            file_path = os.path.join(root, file)
            file_paths.append(file_path.replace('\\', '/'))
    return file_paths


base_path = 'data/images'
images_paths_found = get_nested_files(base_path)

images_paths_sample = df_images['file_path'].apply(lambda file_path: f'{base_path}/{file_path}'.replace('\\', '/'))
files_exist_prct = images_paths_sample.isin(images_paths_found).mean()

print('Files found:', round(files_exist_prct * 100, 2) + ' %')

'100.0 %'

---

## Run predictions with YOLO

#### Load model with Yolo

In [2]:
from ultralytics import YOLO

# Path to the folder you want to zip
# model_path = f'models/sgkf-8-1-1/weights/best.pt'
# model_path = f'models/sgkf-50-25-25-size-2024-rs-2/weights/best.pt'
model_path = f'models/full-imbalanced-train/weights/best.pt'

# Load a model
model = YOLO(model_path)  # load a partially trained model

#### Compare time to predict using batches

In [None]:
import time

path_field = 'full_file_path'

df_images[path_field] = f'{base_path}/' + df_images['file_path']

n = 15
batch = 16
batch_paths = df_images[path_field].iloc[:batch].tolist()

s = time.time()
for i in range(n):
    for img_path in batch_paths:
        model.predict(img_path, verbose=False)
    diff = time.time() - s
    avg = round(diff / n, 3)

print('Avg-Time Op. 1: ', avg, 's')

s = time.time()
for i in range(n):
    model.predict(batch_paths, batch=1, verbose=False)
    diff = time.time() - s
    avg = round(diff / n, 3)

print('Avg-Time Op. 2: ', avg, 's')

s = time.time()
for i in range(n):
    model.predict(batch_paths, batch=batch, verbose=False)
    diff = time.time() - s
    avg = round(diff / n, 3)

print('Avg-Time Op. 3: ', avg, 's')

#### Run predictions with yolo in batch progressively

In [3]:
import os
import json
import time
import pandas as pd
import cv2
from IPython.display import clear_output as co
import traceback

def yolo_classify_dataset_batches(
    model, df, path_field,
    batch=8, save_each=10, save_path=None, save_error=True,
    predict_args={'imgsz': 640, 'device': 'cpu'},
):
        
    index_start = 0
    preds = []
    errors = []

    if save_path is not None:
        pred_path = f'{save_path}/images_pred.csv'
        error_path = f'{save_path}/errors.json'

        if not os.path.exists(save_path):
            os.makedirs(save_path)

        if os.path.exists(pred_path):
            preds_df = pd.read_csv(pred_path)
            index_start = len(preds_df)
            preds = preds_df.values.tolist()

        if save_error and os.path.exists(error_path):
            errors = json.load(open(error_path, 'r'))

    sources = df[path_field]  # .iloc[index_start:]
    n_imgs = len(sources)
    s_time = time.time()
    
    for i in range(index_start, n_imgs, batch):

        e_time = time.time() - s_time
        e_time_round = round(e_time / 60, 2)
        avg_time = e_time / max(1, i - index_start)
        expected_finish_time = round((n_imgs - i) * avg_time / 60, 2)
        expected_total_time = round(n_imgs * avg_time  / 60, 2)
    
        co(True)
        print(f'image-results-saved: {i} / {n_imgs} | time-running: {e_time_round} min / {expected_total_time} min | time-left: {expected_finish_time} min')    
    
        batch_index = int(i / batch)
        sources_batch = sources[i: i + batch]
        
        try:
            if batch_index % 3 == 0:
                a = 1 / 0
            pred = model.predict(sources_batch.tolist(), **predict_args)
            pred = [[idx, pred_i.probs.data[1].item(), pred_i.probs.top1] for idx, pred_i in zip(sources_batch.index, pred)]
            preds.extend(pred)

        except KeyboardInterrupt:
            co(True)
            print(f'Process Interrupted | image-results-saved: {i} / {n_imgs} | time-running: {e_time_round} min / {expected_total_time} min | time-left: {expected_finish_time} min')    
            return preds
        
        except Exception as e:            
            traceback_str = traceback.format_exc()
            errors.append({'index_start': i, 'index_end': i + batch - 1, 'batch_index': batch_index, 'error': str(e), 'traceback': traceback_str})
            
            if save_path is not None and save_error:
                with open(error_path, 'w') as fw:
                    fw.write(json.dumps(errors))
                
            pred = [[idx, None, None] for idx in sources_batch.index]
            preds.extend(pred)
        
        if save_path is not None:
            if batch_index % save_each == 0:
                pd.DataFrame(preds, columns=['index', 'prob', 'pred']).to_csv(pred_path, index=False)
                print(f'Dataset with results saved | IMAGES: {i + batch} / {n_imgs} | BATCH: {batch_index} | PATH: {save_path}')
    
    co(True)
    print(f'image-results-saved: {i + batch} / {n_imgs} | time-running: {e_time_round} min / {expected_total_time} min | time-left: {expected_finish_time} min')
    if save_path is not None:
        print(f'Dataset with results saved to: {save_path}')
    

    return preds


base_path = 'data/images'
path_field = 'full_file_path'

df_images[path_field] = f'{base_path}/' + df_images['file_path']

df = df_images.sort_values(['code', 'initial_timestamp', 'id_video', 'frame_index'])
batch = 12
save_each = 10
save_path = 'data/datasets'
save_error = True
predict_args = {'imgsz': 640, 'device': 'cpu', 'verbose': False}

preds = yolo_classify_dataset_batches(model, df, path_field, batch, save_each, save_path, save_error, predict_args)

preds[:10], f'Size: {len(preds)}'

Process Interrupted | image-results-saved: 2796 / 247784 | time-running: 3.55 min / 314.58 min | time-left: 311.03 min


([[60851, None, None],
  [60852, None, None],
  [60853, None, None],
  [60854, None, None],
  [60855, None, None],
  [60856, None, None],
  [60857, None, None],
  [60858, None, None],
  [60859, None, None],
  [60860, None, None]],
 'Size: 2796')

#### Convert image labels to video labels

In [10]:
preds_path = 'data/datasets/images_pred.csv'

import pandas as pd

preds = pd.read_csv(preds_path)

df_images_predicted = df_images.loc[preds['index']]

df_images_predicted[['prob', 'pred']] = preds[['prob', 'pred']].values

df_images_predicted

Unnamed: 0,id_video,code,folder,file_name,file_path,frame_index,timestamp,initial_timestamp,seen,tags,tag,flood,full_file_path,prob,pred
60851,650691d1d8b8affa4bef4643,1.0,comando/bolsao/99525/1/CODE1 2023-07-19 14-55-03,CODE1 2023-07-19 14-55-03-0.jpg,comando/bolsao/99525/1/CODE1 2023-07-19 14-55-...,0,2023-07-19 14:55:03.0,2023-07-19 14:55:03,True,[],normal,0,data/images/comando/bolsao/99525/1/CODE1 2023-...,,
60852,650691d1d8b8affa4bef4643,1.0,comando/bolsao/99525/1/CODE1 2023-07-19 14-55-03,CODE1 2023-07-19 14-55-03-3.jpg,comando/bolsao/99525/1/CODE1 2023-07-19 14-55-...,1,2023-07-19 14:55:03.3,2023-07-19 14:55:03,True,[],normal,0,data/images/comando/bolsao/99525/1/CODE1 2023-...,,
60853,650691d1d8b8affa4bef4643,1.0,comando/bolsao/99525/1/CODE1 2023-07-19 14-55-03,CODE1 2023-07-19 14-55-03-6.jpg,comando/bolsao/99525/1/CODE1 2023-07-19 14-55-...,2,2023-07-19 14:55:03.6,2023-07-19 14:55:03,True,[],normal,0,data/images/comando/bolsao/99525/1/CODE1 2023-...,,
60854,650691d1d8b8affa4bef4643,1.0,comando/bolsao/99525/1/CODE1 2023-07-19 14-55-03,CODE1 2023-07-19 14-55-03-9.jpg,comando/bolsao/99525/1/CODE1 2023-07-19 14-55-...,3,2023-07-19 14:55:03.9,2023-07-19 14:55:03,True,[],normal,0,data/images/comando/bolsao/99525/1/CODE1 2023-...,,
60855,650691d1d8b8affa4bef4643,1.0,comando/bolsao/99525/1/CODE1 2023-07-19 14-55-03,CODE1 2023-07-19 14-55-04-3.jpg,comando/bolsao/99525/1/CODE1 2023-07-19 14-55-...,4,2023-07-19 14:55:04.3,2023-07-19 14:55:03,True,[],normal,0,data/images/comando/bolsao/99525/1/CODE1 2023-...,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37545,650691d1d8b8affa4bef4300,1.0,comando/bolsao/101543/1/CODE1 2023-08-28 11-05-03,CODE1 2023-08-28 11-05-11-3.jpg,comando/bolsao/101543/1/CODE1 2023-08-28 11-05...,25,2023-08-28 11:05:11.3,2023-08-28 11:05:03,True,[],normal,0,data/images/comando/bolsao/101543/1/CODE1 2023...,,
37546,650691d1d8b8affa4bef4300,1.0,comando/bolsao/101543/1/CODE1 2023-08-28 11-05-03,CODE1 2023-08-28 11-05-11-6.jpg,comando/bolsao/101543/1/CODE1 2023-08-28 11-05...,26,2023-08-28 11:05:11.6,2023-08-28 11:05:03,True,[],normal,0,data/images/comando/bolsao/101543/1/CODE1 2023...,,
37547,650691d1d8b8affa4bef4300,1.0,comando/bolsao/101543/1/CODE1 2023-08-28 11-05-03,CODE1 2023-08-28 11-05-11-9.jpg,comando/bolsao/101543/1/CODE1 2023-08-28 11-05...,27,2023-08-28 11:05:11.9,2023-08-28 11:05:03,True,[],normal,0,data/images/comando/bolsao/101543/1/CODE1 2023...,,
37548,650691d1d8b8affa4bef4300,1.0,comando/bolsao/101543/1/CODE1 2023-08-28 11-05-03,CODE1 2023-08-28 11-05-12-3.jpg,comando/bolsao/101543/1/CODE1 2023-08-28 11-05...,28,2023-08-28 11:05:12.3,2023-08-28 11:05:03,True,[],normal,0,data/images/comando/bolsao/101543/1/CODE1 2023...,,


#### Post results to mongo

In [None]:
import requests

def add_tags(tags, blob_name, bucket)
    baseUrl = ''
    url_tags = f'{baseUrl}/tags';
    data = {
        'tags': tags,
        'b': bucket,
        'blob_name': blob_name,
    };
    res = requests.post(url_tags, json=data)
    if not res.ok:
        return None
    return res.json()
    
# tags = ['acúmulo-ia']
# bucket = ''
# blob_name = ''
