#### Reload images and videos datasets

In [1]:
import pandas as pd
import json

# Load the images and videos datasets from their CSV exports.
df_images = pd.read_csv('data/datasets/images_clean.csv')
df_videos = pd.read_csv('data/datasets/videos.csv')

# Each entry of the videos `tags` column is parsed with json.loads.
# The same decoding for the images table is currently disabled:
# df_images['tags'] = df_images['tags'].apply(json.loads)
df_videos['tags'] = df_videos['tags'].map(json.loads)

# Show the video columns, one blank separator line, then the first index entries.
print(df_videos.columns, end='\n\n')
display(df_videos.index[:10])

Index(['_id', 'blob_name', 'blob_size', 'bucket_name', 'file_name', 'code',
       'n_folders', 'timestamp', 'folder_structure', 'folder', 'tags', 'url',
       'api_url', 'bucket', 'seen'],
      dtype='object')



RangeIndex(start=0, stop=10, step=1)

In [2]:
# Count the rows whose index label differs from the one found at the same
# position after sorting by video id and frame index.
sorted_index = df_images.sort_values(['id_video', 'frame_index']).index
(df_images.index != sorted_index).sum()


212640

#### Check existence of images in folder

In [2]:
import os
def get_nested_files(folder_path):
    """Recursively list every file found under ``folder_path``.

    Args:
        folder_path: Root folder to walk with ``os.walk``.

    Returns:
        A list with one path per file (the walked directory joined with the
        file name), with every backslash replaced by a forward slash.
        The list is empty when nothing is found.
    """
    return [
        os.path.join(current_dir, file_name).replace('\\', '/')
        for current_dir, _sub_dirs, file_names in os.walk(folder_path)
        for file_name in file_names
    ]


base_path = 'data/images'

# Every file currently present under the images folder.
images_paths_found = get_nested_files(base_path)

# Paths expected by the dataset, with backslashes replaced by forward slashes
# so they compare against the listing built above.
images_paths_sample = df_images['file_path'].map(
    lambda relative_path: f'{base_path}/{relative_path}'.replace('\\', '/')
)

# Fraction of expected paths that appear in the folder listing.
found_mask = images_paths_sample.isin(images_paths_found)
files_exist_prct = found_mask.mean()

f'{round(files_exist_prct * 100, 2)} %'

'100.0 %'

In [None]:
base_path = 'data/images'

# Expected location of every image listed in the dataset.
image_paths = f'{base_path}/' + df_images['file_path']

# Fraction of those paths that exist on disk, checked one by one.
files_exist_prct = image_paths.map(os.path.exists).mean()

f'{round(files_exist_prct * 100, 2)} %'

---

## Run predictions with YOLO

#### Load model with YOLO

In [3]:
from ultralytics import YOLO

# Weights file of the classifier to load. Alternative checkpoints (disabled):
#   models/sgkf-8-1-1/weights/best.pt
#   models/sgkf-50-25-25-size-2024-rs-2/weights/best.pt
model_path = 'models/full-imbalanced-train/weights/best.pt'

# Build the YOLO model from the selected weights file.
model = YOLO(model_path)

#### Run predictions with YOLO in batches, saving results progressively

#### Compare time to predict using batches

In [None]:
import time

path_field = 'full_file_path'

df_images[path_field] = f'{base_path}/' + df_images['file_path']

n = 15       # number of timed repetitions per strategy
batch = 16   # number of images predicted in each repetition
batch_paths = df_images[path_field].iloc[:batch].tolist()


def avg_predict_seconds(run_once, repeats):
    """Return the mean duration, in seconds, of ``repeats`` calls to ``run_once``.

    Args:
        run_once: Zero-argument callable executing one repetition.
        repeats: How many times ``run_once`` is called; must be positive.

    Returns:
        Total elapsed time divided by ``repeats``, rounded to 3 decimals.
    """
    # perf_counter is a monotonic, high-resolution clock meant for measuring intervals.
    start = time.perf_counter()
    for _ in range(repeats):
        run_once()
    return round((time.perf_counter() - start) / repeats, 3)


def predict_one_by_one():
    """Option 1: one ``model.predict`` call per image path."""
    for img_path in batch_paths:
        model.predict(img_path, verbose=False)


# Option 1: separate predict call for each image.
avg = avg_predict_seconds(predict_one_by_one, n)
print('Avg-Time Op. 1: ', avg, 's')

# Option 2: a single predict call over all paths, with batch size 1.
avg = avg_predict_seconds(lambda: model.predict(batch_paths, batch=1, verbose=False), n)
print('Avg-Time Op. 2: ', avg, 's')

# Option 3: a single predict call over all paths, with the full batch size.
avg = avg_predict_seconds(lambda: model.predict(batch_paths, batch=batch, verbose=False), n)
print('Avg-Time Op. 3: ', avg, 's')

In [14]:
import os
import json
import time
import pandas as pd
import cv2
from IPython.display import clear_output as co

def yolo_classify_dataset_batches(model, df, path_field, batch=8, save_each=10, save_path=None):
    """Classify the images listed in ``df[path_field]`` in batches, checkpointing to CSV.

    Args:
        model: Object whose ``predict(sources, imgsz=640)`` returns one result per
            source, each exposing ``probs.data[1].item()`` and ``probs.top1``.
        df: DataFrame holding the image paths.
        path_field: Name of the column of ``df`` that contains the image paths.
        batch: Number of images passed to ``model.predict`` per call.
        save_each: The checkpoint is written whenever ``i // batch`` is a multiple
            of this value (``i`` being the batch start position).
        save_path: Folder where the ``pred.csv`` checkpoint is kept; it is created
            when missing. If ``pred.csv`` already exists, its rows are loaded and
            prediction resumes right after them. ``None`` disables checkpointing.

    Returns:
        A list of ``[prob, pred]`` pairs, one per processed image, in the order of
        ``df[path_field]``. ``prob`` is ``probs.data[1].item()`` and ``pred`` is
        ``probs.top1``. When interrupted with Ctrl+C / kernel interrupt during a
        prediction, the pairs collected so far are saved and returned.
    """
    pred_path = None
    index_start = 0
    preds = []

    if save_path is not None:
        pred_path = f'{save_path}/pred.csv'
        os.makedirs(save_path, exist_ok=True)
        if os.path.exists(pred_path):
            # Resume: rows already in the checkpoint are kept and skipped.
            preds = pd.read_csv(pred_path).values.tolist()
            index_start = len(preds)

    sources = df[path_field].tolist()
    n_imgs = len(sources)
    s_time = time.time()

    def status(done):
        """Build the progress line for ``done`` images, with time estimates in minutes."""
        e_time = time.time() - s_time
        e_time_round = round(e_time / 60, 2)
        # max(1, ...) avoids dividing by zero before the first batch of this run.
        avg_time = e_time / max(1, done - index_start)
        expected_finish_time = round((n_imgs - done) * avg_time / 60, 2)
        expected_total_time = round(n_imgs * avg_time / 60, 2)
        return (
            f'image-results-saved: {done}/{n_imgs} | time-running: {e_time_round} min / '
            f'{expected_total_time} min | time-left: {expected_finish_time} min'
        )

    def save_checkpoint():
        """Write all collected predictions to ``pred.csv`` (no-op without ``save_path``)."""
        if pred_path is not None:
            pd.DataFrame(preds, columns=['prob', 'pred']).to_csv(pred_path, index=False)

    done = index_start
    for i in range(index_start, n_imgs, batch):
        co(True)
        print(status(i))

        sources_batch = sources[i: i + batch]

        try:
            results = model.predict(sources_batch, imgsz=640)
            batch_preds = [[res.probs.data[1].item(), res.probs.top1] for res in results]
        except KeyboardInterrupt:
            # Only complete batches are in `preds`, so the checkpoint stays aligned with `sources`.
            save_checkpoint()
            co(True)
            print(f'Process Interrupted: {status(i)}')
            return preds

        preds.extend(batch_preds)
        # The last batch may be shorter than `batch`; never report more than n_imgs.
        done = min(i + batch, n_imgs)

        batch_index = i // batch
        if pred_path is not None and batch_index % save_each == 0:
            save_checkpoint()
            print(f'Dataset with results saved | IMAGES: {done} | BATCH: {batch_index} | PATH: {save_path}')

    # Persist whatever was predicted after the last periodic save.
    save_checkpoint()

    co(True)
    print(status(done))

    return preds


base_path = 'data/images'
path_field = 'full_file_path'

# Full path of every image: base folder + relative file path.
df_images[path_field] = base_path + '/' + df_images['file_path']

# Run settings: frame to classify, images per predict call,
# checkpoint frequency (in batches) and checkpoint folder.
df = df_images
batch = 12
save_each = 1
save_path = 'data/images-pred'

preds = yolo_classify_dataset_batches(
    model,
    df,
    path_field,
    batch=batch,
    save_each=save_each,
    save_path=save_path,
)

# Preview the first predictions together with the number of results.
(preds[:10], f'Size: {len(preds)}')

Process Interrupted: 1896/247784 | 1.82 min / 516.14 min | time-left: 512.19 min


([[5.495740581368258e-12, 0.0],
  [1.3343072306770676e-11, 0.0],
  [2.3521630823641893e-11, 0.0],
  [2.2271998481593336e-11, 0.0],
  [3.1580519449514455e-12, 0.0],
  [8.035421828790634e-13, 0.0],
  [1.1554737201768804e-11, 0.0],
  [1.863289939008084e-10, 0.0],
  [3.139693366405183e-11, 0.0],
  [2.5321860927518536e-11, 0.0]],
 'Size: 1896')

#### Convert image labels to video labels

In [18]:
# Load the predictions checkpoint written by the classification run.
preds_path = 'data/images-pred/pred.csv'

preds = pd.read_csv(preds_path)

# Predictions are matched to images by position: row k of the CSV goes with row k of df_images.
# .copy() makes an independent frame, so the column assignment below does not
# write into a slice of df_images (which triggers SettingWithCopyWarning).
df_images_pred = df_images.iloc[:len(preds)].copy()

df_images_pred[['flood-prob', 'flood-pred']] = preds.values.tolist()

#### Post results to MongoDB

In [None]:
import requests

def add_tags(tags, blob_name, bucket, base_url=''):
    """POST a set of tags for a blob to the ``/tags`` endpoint.

    Args:
        tags: Value sent under the ``'tags'`` key of the JSON body.
        blob_name: Value sent under the ``'blob_name'`` key.
        bucket: Value sent under the ``'b'`` key.
        base_url: Prefix placed before ``/tags`` to build the request URL.
            Defaults to an empty string, so callers must pass the API address
            for the request to reach a server.

    Returns:
        The decoded JSON body of the response, or ``None`` when the response
        status is not OK.
    """
    url_tags = f'{base_url}/tags'
    data = {
        'tags': tags,
        'b': bucket,
        'blob_name': blob_name,
    }
    res = requests.post(url_tags, json=data)
    if not res.ok:
        return None
    return res.json()
    
# tags = ['acúmulo-ia']
# bucket = ''
# blob_name = ''
