#### Reload images and videos datasets

In [1]:
import pandas as pd
import json

# Load the per-image table and the per-video table from the full dataset folder.
df_images = pd.read_csv('data/datasets-full/images_clean.csv')
df_videos = pd.read_csv('data/datasets-full/videos.csv')

# `tags` is stored in the CSV as JSON text; decode every cell into a Python object.
df_videos['tags'] = df_videos['tags'].map(json.loads)

# Quick look at the video schema and the first index labels.
print(df_videos.columns, end='\n\n')
display(df_videos.index[:10])

Index(['_id', 'blob_name', 'blob_size', 'bucket_name', 'file_name', 'code',
       'n_folders', 'timestamp', 'folder_structure', 'folder', 'tags', 'url',
       'api_url', 'bucket', 'seen'],
      dtype='object')



RangeIndex(start=0, stop=10, step=1)

#### Check existence of images in folder

In [7]:
import os
def get_nested_files(folder_path):
    """Recursively list every file found under `folder_path`.

    Returns a list of file paths (each one prefixed with `folder_path`) in
    `os.walk` traversal order, with backslashes replaced by forward slashes.
    """
    return [
        os.path.join(current_dir, name).replace('\\', '/')
        for current_dir, _subdirs, names in os.walk(folder_path)
        for name in names
    ]


base_path = 'data/images'
images_paths_found = get_nested_files(base_path)

# Expected on-disk path of every dataset image, normalised to forward slashes so it
# is comparable with the paths returned by `get_nested_files`.
images_paths_sample = df_images['file_path'].apply(lambda file_path: f'{base_path}/{file_path}'.replace('\\', '/'))

# Fraction of dataset images that are present on disk (set lookup keeps `isin` cheap).
files_exist_prct = images_paths_sample.isin(set(images_paths_found)).mean()

# Format the number inside the string: the previous `round(...) + ' %'` added a float
# to a str and raised TypeError.
print('Files found:', f'{round(files_exist_prct * 100, 2)} %')

'100.0 %'

---

## Run predictions with YOLO

#### Load the YOLO model

In [2]:
from ultralytics import YOLO

# Checkpoint file to load. The commented lines are other checkpoints that can be
# swapped in instead.
# model_path = f'models/sgkf-8-1-1/weights/best.pt'
# model_path = f'models/sgkf-50-25-25-size-2024-rs-2/weights/best.pt'
model_name = 'full-imbalanced-train'
model_path = f'models/{model_name}/weights/best.pt'

# Build the YOLO model from the checkpoint at `model_path`.
model = YOLO(model_path)

#### Compare time to predict using batches

In [None]:
import time

path_field = 'full_file_path'

df_images[path_field] = f'{base_path}/' + df_images['file_path']

n = 15      # timed repetitions per option
batch = 16  # number of images predicted per repetition
batch_paths = df_images[path_field].iloc[:batch].tolist()


def _avg_seconds(run_once, repeats):
    """Return the mean wall-clock seconds per call of `run_once` over `repeats` calls."""
    # perf_counter is monotonic and high-resolution, unlike time.time().
    start = time.perf_counter()
    for _ in range(repeats):
        run_once()
    return round((time.perf_counter() - start) / repeats, 3)


def _predict_one_by_one():
    """Option 1: one `predict` call per image."""
    for img_path in batch_paths:
        model.predict(img_path, verbose=False)


# Untimed warm-up call so that any one-off setup done on the first prediction is not
# charged to option 1 (NOTE(review): presumably the first call is slower — confirm).
model.predict(batch_paths[:1], verbose=False)

avg = _avg_seconds(_predict_one_by_one, n)
print('Avg-Time Op. 1: ', avg, 's')

# Option 2: a single call with the whole list, processed one image per batch.
avg = _avg_seconds(lambda: model.predict(batch_paths, batch=1, verbose=False), n)
print('Avg-Time Op. 2: ', avg, 's')

# Option 3: a single call with the whole list, processed as one batch.
avg = _avg_seconds(lambda: model.predict(batch_paths, batch=batch, verbose=False), n)
print('Avg-Time Op. 3: ', avg, 's')

#### Run YOLO predictions progressively in batches

In [4]:
import os
import json
import time
import pandas as pd
import cv2
from IPython.display import clear_output as co
import traceback

def yolo_classify_dataset_batches(
    model, df, path_field,
    batch=8, save_each=10, save_path=None, save_error=True,
    predict_args=None,
):
    """Classify the images listed in `df[path_field]` batch by batch, with checkpoints.

    Args:
        model: Object exposing `predict(list_of_paths, **predict_args)` and returning one
            result per path, each with `probs.data[1]` and `probs.top1`.
            NOTE(review): index 1 is presumably the positive class — confirm.
        df: DataFrame whose row order defines the processing order.
        path_field: Name of the column of `df` that holds the image paths.
        batch: Number of images passed to `model.predict` per call.
        save_each: Write a checkpoint every `save_each` batches.
        save_path: Folder holding `images_pred.csv` and `errors.json`. If the CSV already
            exists its rows are reloaded and processing resumes at row `len(csv)` of
            `df`, so `df` must be in the same order as in the previous run. `None`
            disables all disk access.
        save_error: Whether the error log is reloaded from and written to `save_path`.
        predict_args: Extra keyword arguments for `model.predict`. Defaults to
            `{'imgsz': 640, 'device': 'cpu'}`.

    Returns:
        List of `[index_label, prob, pred]` rows, including rows reloaded from a previous
        run. Rows of a batch whose prediction raised are `[index_label, None, None]`.
    """
    # Built per call instead of using a shared mutable default argument.
    if predict_args is None:
        predict_args = {'imgsz': 640, 'device': 'cpu'}

    index_start = 0
    preds = []
    errors = []
    pred_path = None
    error_path = None

    if save_path is not None:
        pred_path = f'{save_path}/images_pred.csv'
        error_path = f'{save_path}/errors.json'

        os.makedirs(save_path, exist_ok=True)

        # Resume: rows already on disk are kept and skipped.
        if os.path.exists(pred_path):
            preds_df = pd.read_csv(pred_path)
            index_start = len(preds_df)
            preds = preds_df.values.tolist()

        if save_error and os.path.exists(error_path):
            with open(error_path, 'r') as fr:
                errors = json.load(fr)

    sources = df[path_field]
    n_imgs = len(sources)
    s_time = time.time()

    def _status(done):
        """Progress line for `done` processed rows, with time estimates in minutes."""
        elapsed = time.time() - s_time
        avg_time = elapsed / max(1, done - index_start)
        running = round(elapsed / 60, 2)
        left = round(max(0, n_imgs - done) * avg_time / 60, 2)
        total = round(max(0, n_imgs - index_start) * avg_time / 60, 2)
        return (
            f'image-results-saved: {done} / {n_imgs} | '
            f'time-running: {running} min / {total} min | '
            f'time-left: {left} min'
        )

    def _save():
        """Write the predictions (and optionally the errors) gathered so far."""
        if save_path is None:
            return
        pd.DataFrame(preds, columns=['index', 'prob', 'pred']).to_csv(pred_path, index=False)
        if save_error:
            with open(error_path, 'w') as fw:
                fw.write(json.dumps(errors))

    for i in range(index_start, n_imgs, batch):
        status = _status(i)
        co(True)
        print(status)

        batch_index = i // batch
        sources_batch = sources.iloc[i: i + batch]

        try:
            results = model.predict(sources_batch.tolist(), **predict_args)
            # Build the whole batch first so a failure midway never leaves partial rows.
            rows = [
                [idx, res.probs.data[1].item(), res.probs.top1]
                for idx, res in zip(sources_batch.index, results)
            ]
            preds.extend(rows)

        except KeyboardInterrupt:
            # Keep every completed batch so the next call resumes from here.
            _save()
            co(True)
            print(f'Process Interrupted | {status}')
            return preds

        except Exception as e:
            errors.append({
                'index_start': i,
                # Clamp so the last (possibly shorter) batch reports its real end.
                'index_end': min(i + batch, n_imgs) - 1,
                'batch_index': batch_index,
                'error': str(e),
                'traceback': traceback.format_exc(),
            })
            # Placeholder rows keep `preds` aligned with `df` for resuming.
            preds.extend([idx, None, None] for idx in sources_batch.index)

        if batch_index % save_each == 0:
            _save()

    # Final checkpoint: batches after the last periodic save were previously lost.
    _save()

    co(True)
    print(_status(n_imgs))

    if save_path is not None:
        print(f'\nDataset with results saved to: {save_path}')

    return preds


# ---

# Run configuration
base_path = 'data/images'
path_field = 'full_file_path'
batch = 12
save_each = 100
save_path = 'data/datasets/images_pred'
save_error = True
predict_args = {'imgsz': 640, 'device': 'cpu', 'verbose': False}

# Path of every image relative to the working directory.
df_images['full_file_path'] = base_path + '/' + df_images['file_path']

# Fixed processing order, so that a resumed run continues at the same rows.
df = df_images.sort_values(by=['code', 'initial_timestamp', 'id_video', 'frame_index'])

preds = yolo_classify_dataset_batches(
    model,
    df,
    path_field,
    batch=batch,
    save_each=save_each,
    save_path=save_path,
    save_error=save_error,
    predict_args=predict_args,
)

(preds[:10], f'Size: {len(preds)}')

Process Interrupted | image-results-saved: 240 / 2330639 | time-running: 0.37 min / 7960.41 min | time-left: 7959.59 min


([[541984.0, nan, nan],
  [541985.0, nan, nan],
  [541986.0, nan, nan],
  [541987.0, nan, nan],
  [541988.0, nan, nan],
  [542660.0, nan, nan],
  [542661.0, nan, nan],
  [542662.0, nan, nan],
  [542663.0, nan, nan],
  [542664.0, nan, nan]],
 'Size: 240')

#### Convert image labels to video labels

In [15]:
import pandas as pd

# Per-image predictions written by the batch prediction run.
preds_path = 'data/datasets-full/images_pred/images_pred.csv'
preds = pd.read_csv(preds_path)
preds_new = preds.copy()

# Pick the predicted images by index label and attach their probability and label.
df_images_predicted = df_images.loc[preds['index']]
df_images_predicted[['prob', 'pred']] = preds[['prob', 'pred']].to_numpy()

# Aggregate frames per video: mean/std of the probability, max of the predicted label.
per_video = df_images_predicted.groupby(['id_video'])
video_prob_stats = per_video['prob'].agg(prob_mean='mean', prob_std='std')
video_pred = per_video['pred'].max()

videos_predicted = pd.concat([video_prob_stats, video_pred], axis=1)

# Merge additional video data
videos_predicted = df_videos[['_id', 'blob_name']].join(videos_predicted, on='_id', how='right')

display(videos_predicted.head())

print('Shape:', videos_predicted.shape, end='\n\n')
print(preds['pred'].value_counts().rename('Images'), end='\n\n')
print(videos_predicted['pred'].value_counts().rename('Videos'))

Unnamed: 0,_id,blob_name,prob_mean,prob_std,pred
84,65040934f049f672e58adc3b,comando/alagamento/101579/1461/CODE1461 2023-0...,9.272117e-11,2.388217e-10,0.0
85,65040934f049f672e58adc3c,comando/alagamento/101579/1461/CODE1461 2023-0...,1.495736e-11,2.246259e-11,0.0
149,65040934f049f672e58adc3d,comando/alagamento/101579/1461/CODE1461 2023-0...,2.834461e-11,6.127894e-11,0.0
263,65040934f049f672e58adc3e,comando/alagamento/101579/1461/CODE1461 2023-0...,8.243967e-13,1.995352e-12,0.0
264,65040934f049f672e58adc3f,comando/alagamento/101579/1461/CODE1461 2023-0...,2.571478e-12,3.292951e-12,0.0


Shape: (43113, 5)

pred
0.0    1539798
1.0      59216
Name: Images, dtype: int64

pred
0.0    40806
1.0     2119
Name: Videos, dtype: int64


In [3]:
import pandas as pd

# Same aggregation as for the current predictions, applied to the previous prediction
# file for comparison. Every result uses an `_old` suffix: this cell used to overwrite
# `preds`, `df_images_predicted` and `videos_predicted`, so running the notebook top to
# bottom made the posting cell tag videos from the old predictions.
preds_old_path = 'data/datasets-full/images_pred/images_pred_old.csv'

preds_old = pd.read_csv(preds_old_path)

# Pick the predicted images by index label and attach their probability and label.
df_images_predicted_old = df_images.loc[preds_old['index']]

df_images_predicted_old[['prob', 'pred']] = preds_old[['prob', 'pred']].values

# Aggregate frames per video: mean/std of the probability, max of the predicted label.
video_prob_stats_old = df_images_predicted_old.groupby(['id_video'])['prob'].agg(['mean', 'std'])
video_prob_stats_old.columns = ['prob_mean', 'prob_std']
video_pred_old = df_images_predicted_old.groupby(['id_video'])['pred'].max()

videos_predicted_old = pd.concat([video_prob_stats_old, video_pred_old], axis=1)

# Merge additional video data
videos_predicted_old = df_videos[['_id', 'blob_name']].join(videos_predicted_old, on='_id', how='right')

display(videos_predicted_old.head())

print('Shape:', videos_predicted_old.shape)
print()
print(videos_predicted_old['pred'].value_counts())

Unnamed: 0,_id,blob_name,prob_mean,prob_std,pred
89,65040934f049f672e58adc46,comando/alagamento/93901/1026/CODE1026 2023-03...,0.0009371731,0.0001059198,0.0
2,65040934f049f672e58adc47,comando/alagamento/93901/1026/CODE1026 2023-03...,0.0009260222,0.0001356357,0.0
90,65040934f049f672e58adc48,comando/alagamento/93901/1026/CODE1026 2023-03...,2.225615e-09,2.815121e-09,0.0
200,65040934f049f672e58adc49,comando/alagamento/93901/1026/CODE1026 2023-03...,1.630687e-07,8.887154e-07,0.0
91,65040934f049f672e58adc4a,comando/alagamento/93901/1026/CODE1026 2023-03...,8.380167e-09,1.136922e-08,0.0


Shape: (20751, 5)

pred
0.0    19149
1.0     1489
Name: count, dtype: int64


#### Analyze errors

In [37]:
# Error log written by the batch prediction run (one record per failed batch).
errors_path = 'data/datasets-full/images_pred/errors.json'

# Context manager closes the file; the previous bare `open()` left the handle open.
with open(errors_path, 'r') as fr:
    errors = pd.DataFrame(json.load(fr))

display(errors.head())
print('Shape:', errors.shape, '\n')

Unnamed: 0,index_start,index_end,batch_index,error,traceback
0,56672,56687,3542,cannot identify image file 'data/images/polygo...,"Traceback (most recent call last):\n File ""C:..."
1,56688,56703,3543,cannot identify image file 'data/images/polygo...,"Traceback (most recent call last):\n File ""C:..."
2,56704,56719,3544,cannot identify image file 'data/images/polygo...,"Traceback (most recent call last):\n File ""C:..."
3,57584,57599,3599,cannot identify image file 'data/images/polygo...,"Traceback (most recent call last):\n File ""C:..."
4,57600,57615,3600,cannot identify image file 'data/images/polygo...,"Traceback (most recent call last):\n File ""C:..."


Shape: (593, 5) 



#### Post video classification results

In [20]:
import os
import json
import time
from IPython.display import clear_output as co

bucket = 'flood-videos-stamped'
progress_path = 'data/datasets-full/images_pred_post/progress.json'
error_path = 'data/datasets-full/images_pred_post/errors.json'
save_each = 25
id_field = '_id'
# Only videos predicted as class 1 are posted.
df = videos_predicted[videos_predicted['pred'] == 1].copy()

base_url = 'https://watch-bucket-veuei2iu4q-uc.a.run.app'
# base_url = 'http://localhost:8080'
# NOTE(review): `VideoTags` is not defined or imported in the cells visible here —
# confirm it is defined earlier so this cell survives Restart & Run All.
video_tags = VideoTags(base_url)

# Make sure the checkpoint folders exist before the first write.
for out_path in (progress_path, error_path):
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

# Posting state per video id; previously saved state takes precedence.
progress = {idx: False for idx in df[id_field]}
if os.path.exists(progress_path):
    with open(progress_path, 'r') as fr:
        progress_saved = json.load(fr)
    progress = {**progress, **progress_saved}

# Count only the videos of this run; the saved file may hold ids that are not in `df`.
done = sum(bool(progress[idx]) for idx in df[id_field])
done_start = done

errors = []
if os.path.exists(error_path):
    with open(error_path, 'r') as fr:
        errors = json.load(fr)

n_videos = len(df)
s_time = time.time()


def _post_status(processed):
    """Progress line for `processed` handled rows, with time estimates in minutes."""
    e_time = time.time() - s_time
    e_time_round = round(e_time / 60, 2)
    avg_time = e_time / max(1, done - done_start)
    expected_finish_time = round((n_videos - done) * avg_time / 60, 2)
    expected_total_time = round((n_videos - done_start) * avg_time / 60, 2)
    return f'PROCESSED: {processed} / {n_videos} | POSTED: {done} / {n_videos} | TIME-RUNNING: {e_time_round} min / {expected_total_time} min | TIME-LEFT: {expected_finish_time} min | ERRORS: {len(errors)}'


def _save_post_state():
    """Write the posting progress and the collected errors to disk."""
    with open(progress_path, 'w') as fw:
        fw.write(json.dumps(progress))

    with open(error_path, 'w') as fw:
        fw.write(json.dumps(errors))


try:
    for i, (idx, row) in enumerate(df.iterrows()):

        co(True)
        print(_post_status(i))

        id_video = row[id_field]
        blob_name = row['blob_name']
        pred = row['pred']

        # `df` is filtered to pred == 1 above, so this currently always yields 'acúmulo-ia'.
        tags = ['acúmulo-ia' if pred is not None and pred else 'normal-ia']

        if not progress[id_video]:
            data = video_tags.post(tags, blob_name, bucket)

            success = data['status'] and data['data'] is not None and 'message' in data['data'] and 'successfully' in data['data']['message']

            if success:
                progress[id_video] = True
                done += 1

            else:
                errors.append({'id_video': id_video, **data})

        # Periodic checkpoint.
        if (i + 1) % save_each == 0:
            _save_post_state()

        co(True)
        print(_post_status(i + 1))

finally:
    # Always persist the state: at normal completion, and also on an exception or a
    # manual interrupt, so posts made since the last checkpoint are not lost.
    _save_post_state()

PROCESSED: 2119 / 2119 | POSTED: 2119 / 2119 | TIME-RUNNING: 1.38 min / 34.12 min | TIME-LEFT: 0.0 min | ERRORS: 0
