# Post image classification results to MongoDB database collection

#### Reload the image and video datasets

In [1]:
import pandas as pd
import json

# Image table: loaded as-is from the full-dataset folder.
df_images = pd.read_csv('data/datasets-full/images_clean.csv')

# Video table: the `tags` column is decoded with json.loads cell by cell,
# so each entry becomes a Python object instead of raw text.
df_videos = (
    pd.read_csv('data/datasets-full/videos.csv')
    .assign(tags=lambda frame: frame['tags'].map(json.loads))
)

#### Aggregate image-level predictions into video-level labels

In [14]:
import pandas as pd

# Per-image predictions; the `index` column holds the df_images row labels
# the predictions belong to.
preds = pd.read_csv('data/datasets-full/images_pred/images_pred.csv')

# Pick the predicted image rows and attach probability / label positionally.
# The two columns are taken as one 2-D array so they share a common dtype.
pred_values = preds[['prob', 'pred']].to_numpy()
df_images_predicted = df_images.loc[preds['index']].assign(
    prob=pred_values[:, 0],
    pred=pred_values[:, 1],
)

# Summarise per video: mean / std of the image probabilities, and the
# maximum image label (a video is positive if any of its images is).
by_video = df_images_predicted.groupby(['id_video'])
video_prob_stats = by_video['prob'].agg(['mean', 'std']).rename(
    columns={'mean': 'prob_mean', 'std': 'prob_std'}
)
video_pred = by_video['pred'].max()

videos_predicted = pd.concat([video_prob_stats, video_pred], axis=1)

# Bring in the video id and blob name; the right join keeps every
# aggregated video row.
video_meta = df_videos[['_id', 'blob_name']]
videos_predicted = video_meta.join(videos_predicted, on='_id', how='right')

display(videos_predicted.head())
print('Shape:', videos_predicted.shape, end='\n\n')
print(preds['pred'].value_counts().rename('Images'), end='\n\n')
print(videos_predicted['pred'].value_counts().rename('Videos'))

Unnamed: 0,_id,blob_name,prob_mean,prob_std,pred
84,65040934f049f672e58adc3b,comando/alagamento/101579/1461/CODE1461 2023-0...,9.272117e-11,2.388217e-10,0.0
85,65040934f049f672e58adc3c,comando/alagamento/101579/1461/CODE1461 2023-0...,1.495736e-11,2.246259e-11,0.0
149,65040934f049f672e58adc3d,comando/alagamento/101579/1461/CODE1461 2023-0...,2.834461e-11,6.127894e-11,0.0
263,65040934f049f672e58adc3e,comando/alagamento/101579/1461/CODE1461 2023-0...,8.243967e-13,1.995352e-12,0.0
264,65040934f049f672e58adc3f,comando/alagamento/101579/1461/CODE1461 2023-0...,2.571478e-12,3.292951e-12,0.0


Shape: (61950, 5)

pred
0.0    2253074
1.0      64708
Name: Images, dtype: int64

pred
0.0    59195
1.0     2537
Name: Videos, dtype: int64


#### Analyze errors

In [12]:
errors_path = 'data/datasets-full/images_pred/errors.json'

# Load the error records written next to the image predictions.
# A context manager is used so the file handle is closed right after parsing
# (the bare `json.load(open(...))` form leaves it open until garbage collection).
with open(errors_path, 'r') as fr:
    errors = pd.DataFrame(json.load(fr))

display(errors.head())
print('Shape:', errors.shape, '\n')

Unnamed: 0,index_start,index_end,batch_index,error,traceback
0,56672,56687,3542,cannot identify image file 'data/images/polygo...,"Traceback (most recent call last):\n File ""C:..."
1,56688,56703,3543,cannot identify image file 'data/images/polygo...,"Traceback (most recent call last):\n File ""C:..."
2,56704,56719,3544,cannot identify image file 'data/images/polygo...,"Traceback (most recent call last):\n File ""C:..."
3,57584,57599,3599,cannot identify image file 'data/images/polygo...,"Traceback (most recent call last):\n File ""C:..."
4,57600,57615,3600,cannot identify image file 'data/images/polygo...,"Traceback (most recent call last):\n File ""C:..."


Shape: (1192, 5) 



#### Post video classification results

In [13]:
import os
import json
import time
from IPython.display import clear_output as co
from modules.mongo_util import VideoTags

# --- Configuration -----------------------------------------------------------
bucket = 'flood-videos-stamped'
progress_path = 'data/datasets-full/images_pred_post/progress.json'
error_path = 'data/datasets-full/images_pred_post/errors.json'
save_each = 1        # persist progress/errors every `save_each` processed rows
id_field = '_id'     # column of `df` that identifies a video
base_url = 'https://watch-bucket-veuei2iu4q-uc.a.run.app'
# base_url = 'http://localhost:8080'  # alternative endpoint for local runs

# Only videos predicted as positive (pred == 1) are posted.
df = videos_predicted[videos_predicted['pred'] == 1].copy()

video_tags = VideoTags(base_url)


def _load_json(path, default):
    """Return the parsed JSON content of `path`, or `default` if the file is missing."""
    if not os.path.exists(path):
        return default
    with open(path, 'r') as fr:
        return json.load(fr)


def _save_json(path, payload):
    """Write `payload` to `path` as JSON, creating the parent folder if needed."""
    os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
    with open(path, 'w') as fw:
        json.dump(payload, fw)


def _print_status(processed):
    """Replace the cell output with a one-line progress report.

    Reads the loop's module-level counters and timing values at call time.
    """
    co(True)
    print(f'PROCESSED: {processed} / {n_videos} | POSTED: {done} / {n_videos} | TIME-RUNNING: {e_time_round} min / {expected_total_time} min | TIME-LEFT: {expected_finish_time} min | ERRORS: {len(errors)}')


# --- Resume state from previous runs ----------------------------------------
# Start with every video of this run marked as pending, then overlay whatever
# a previous run saved (entries for other videos are kept in the file).
progress = {video_id: False for video_id in df[id_field]}
progress = {**progress, **_load_json(progress_path, {})}

# Count only the videos that belong to this run; stale ids from an older
# progress file must not inflate the POSTED counter or the time estimates.
current_ids = set(df[id_field])
done = sum(1 for video_id, posted in progress.items() if posted and video_id in current_ids)
done_start = done

errors = _load_json(error_path, [])

n_videos = len(df)
s_time = time.time()

# --- Post tags ---------------------------------------------------------------
try:
    for i, (_, row) in enumerate(df.iterrows()):

        # Timing estimates are based on the videos posted during this run only.
        e_time = time.time() - s_time
        e_time_round = round(e_time / 60, 2)
        avg_time = e_time / max(1, done - done_start)
        expected_finish_time = round((n_videos - done) * avg_time / 60, 2)
        expected_total_time = round((n_videos - done_start) * avg_time / 60, 2)

        _print_status(i)

        id_video = row[id_field]
        blob_name = row['blob_name']
        pred = row['pred']

        # `df` is filtered to pred == 1 above, so this resolves to 'acúmulo-ia'
        # for this run; the fallback is kept for use with an unfiltered frame.
        tags = ['acúmulo-ia' if pred is not None and pred else 'normal-ia']

        if not progress[id_video]:
            data = video_tags.post(tags, blob_name, bucket)

            # A post counts as successful only when the response carries a
            # truthy status and a message containing 'successfully'.
            success = data['status'] and data['data'] is not None and 'message' in data['data'] and 'successfully' in data['data']['message']

            if success:
                progress[id_video] = True
                done += 1
            else:
                errors.append({'id_video': id_video, **data})

        if (i + 1) % save_each == 0:
            _save_json(progress_path, progress)
            _save_json(error_path, errors)

        _print_status(i + 1)
finally:
    # Always persist the latest state, including after the last row, an
    # interrupt, or an exception, so the next run can resume from here.
    _save_json(progress_path, progress)
    _save_json(error_path, errors)

PROCESSED: 2537 / 2537 | POSTED: 2537 / 2537 | TIME-RUNNING: 1.03 min / 1.03 min | TIME-LEFT: 0.0 min | ERRORS: 0
