# Flood videos from city cameras - Dataset Exploratory Data Analysis

---

In [1]:
import pandas as pd, matplotlib.pyplot as plt, seaborn as sns; sns.set()

  from pandas.core.computation.check import NUMEXPR_INSTALLED


## Reload flood videos dataset

In [2]:
# Data folder and path of the videos control CSV.
data_path = '../Dados'
control_path = f'{data_path}/Controle de vídeos/videos_control_19-04.csv'

# Read the control table and convert `timestamp` to datetimes in one chain.
control = pd.read_csv(control_path).assign(
    timestamp=lambda frame: pd.to_datetime(frame['timestamp'])
)

control.columns

Index(['blob_name', 'blob_size', 'bucket_name', 'file_name', 'code',
       'n_folders', 'timestamp', 'folder_structure'],
      dtype='object')

## Exploratory data analysis

In [None]:
def event_info(blob_name):
    """Return ``[event_type, event_id]`` parsed from `blob_name`.

    `blob_name` is expected to match the folder structure
    `{source}/{type}/{event}/{code}`. The event type is the first two path
    segments joined back as `{source}/{type}`; the event id is the third
    segment.
    """
    segments = blob_name.split('/')
    event_type = '/'.join(segments[:2])
    event_id = segments[2]
    return [event_type, event_id]

#### Cameras already recorded

In [None]:
# Number of distinct camera codes present in the control table.
camera_total = control['code'].nunique()
print('Cameras recorded:', camera_total)

#### Folder structure count

In [None]:
# Rows per (n_folders, folder_structure) combination, ordered by index.
control.value_counts(subset=['n_folders', 'folder_structure']).sort_index()

#### Total events per type count

In [None]:
# Rows whose blob path follows the event folder layout.
folder_structure_msk = control['folder_structure'].eq('{source}/{type}/{event}/{code}')

# One [event_type, event_id] row per video under that layout.
event_cnt = pd.DataFrame(
    control.loc[folder_structure_msk, 'blob_name'].map(event_info).tolist(),
    columns=['event_type', 'event_id'],
)

# De-duplicate so each (type, id) pair is counted once, then count per type.
event_type_cnt = event_cnt.drop_duplicates()['event_type'].value_counts()

print(f'\n{event_type_cnt}\n')

### Videos per camera

In [None]:
# Number of control-table rows (videos) for each camera code.
videos_per_camera = control['code'].value_counts()
print(f'Videos per camera:\n\n{videos_per_camera}')

#### Megabytes (MB) per camera

In [None]:
# Total blob size per camera code, largest first, scaled by 1e6 (bytes -> MB).
bytes_per_camera = control.groupby('code')['blob_size'].sum().sort_values(ascending=False)
mb_per_camera = bytes_per_camera / 1e6
print(f'Total MB per câmera:\n\n{mb_per_camera}')

#### Total videos and bytes per day and cumulative

In [None]:
# Per-timestamp number of videos and total blob size.
time_cnt = control.groupby('timestamp')['blob_name'].count()
time_size_cnt = control.groupby('timestamp')['blob_size'].sum()

# Daily totals, computed once and reused for the per-day and cumulative lines.
videos_per_day = time_cnt.resample('1D').sum()
bytes_per_day = time_size_cnt.resample('1D').sum()

fig, axs = plt.subplots(1, 2, figsize=(12, 3))
legend_labels = ['per day', 'cumulative']

# Left panel: number of video files.
videos_per_day.plot(ax=axs[0])
videos_per_day.cumsum().plot(ax=axs[0])
axs[0].set(title='Video files per day', ylabel='Videos', xlabel='time')
axs[0].legend(legend_labels)

# Right panel: stored volume, scaled by 1e9 (bytes -> GB).
(bytes_per_day / 1e9).plot(ax=axs[1])
(bytes_per_day.cumsum() / 1e9).plot(ax=axs[1])
axs[1].set(title='Video bytes per day', ylabel='Giga Bytes (GB)', xlabel='time')
axs[1].legend(legend_labels)
plt.show()

### Recently recorded ·  Past 30 days

In [None]:
# Size of the trailing window, in daily bins.
days = 30

# Daily video totals for the last `days` daily bins. The title is built from
# `days` so it stays correct when the window size is changed.
ax = time_cnt.resample('1D').sum().tail(days).plot(
    marker='o', ms=7, title=f'Videos Recorded Per Day Past {days} Days'
)

### Build videos datetime index

In [None]:
# Copy of the control table indexed by `timestamp` (the column is moved into
# the index), so the time-based resampling below can be applied to it.
ts = control.set_index('timestamp', drop=True)

### Events per day

#### Extract event videos dataset

In [None]:
# Rows whose blob path follows the event folder layout.
folder_structure_msk = ts['folder_structure'].eq('{source}/{type}/{event}/{code}')

# Blob names of those rows, still indexed by timestamp.
event_blob_names = ts.loc[folder_structure_msk, 'blob_name']

# One (event_type, event_id) row per event video, keeping the datetime index.
ts_events = pd.DataFrame(
    [event_info(name) for name in event_blob_names],
    index=event_blob_names.index,
    columns=['event_type', 'event_id'],
)

ts_events.head()

#### Events per type per day count

In [None]:
# Resampling bin width.
freq = '1D'

# One column per event type: number of distinct event ids in each `freq` bin.
event_type_cnt = pd.concat(
    [
        ts_events.loc[ts_events['event_type'] == event_type, 'event_id']
        .resample(freq)
        .nunique()
        .rename(event_type)
        for event_type in ts_events['event_type'].unique()
    ],
    axis=1,
)

# Bins in which a type has no rows are missing after the concat; count them
# as 0. 'rivers/manual' is left out of the table; errors='ignore' keeps the
# cell working when the dataset has no such event type.
event_type_cnt = event_type_cnt.fillna(0.0).drop('rivers/manual', axis=1, errors='ignore')

event_type_cnt.head()

#### Events' videos per type per day count

In [None]:
# Resampling bin width.
freq = '1D'

# One column per event type: number of event videos (rows) in each `freq` bin.
event_videos_type_cnt = pd.concat(
    [
        ts_events.loc[ts_events['event_type'] == event_type, 'event_id']
        .resample(freq)
        .count()
        .rename(event_type)
        for event_type in ts_events['event_type'].unique()
    ],
    axis=1,
)

# Bins in which a type has no rows are missing after the concat; count them
# as 0. 'rivers/manual' is left out of the table; errors='ignore' keeps the
# cell working when the dataset has no such event type.
event_videos_type_cnt = event_videos_type_cnt.fillna(0.0).drop(
    'rivers/manual', axis=1, errors='ignore'
)

event_videos_type_cnt.head()

#### Events and videos per type per day · Line Chart

In [None]:
# Column groups and line style shared by every chart in this cell.
waze_types = ['waze/flood']
comando_types = ['comando/bolsão', 'comando/alagamento']
all_types = waze_types + comando_types
line_style = dict(marker='o', ms=5, xlabel='Day')

# Row 1: distinct events per day, Waze (left) and Sistema Comando (right).
fig, axs = plt.subplots(1, 2, figsize=(16, 4))
event_type_cnt[waze_types].plot(
    ax=axs[0], title='Waze Flood Events Recorded per Day', **line_style
)
event_type_cnt[comando_types].plot(
    ax=axs[1], title='Events from `Sistema Comando` Recorded per Day', **line_style
)
plt.show()

# Row 2: running totals of the same daily event counts. The titles say
# "Cumulative" so these charts are not mistaken for the per-day ones.
fig, axs = plt.subplots(1, 2, figsize=(16, 4))
event_type_cnt[waze_types].cumsum().plot(
    ax=axs[0], title='Cumulative Waze Flood Events Recorded', **line_style
)
event_type_cnt[comando_types].cumsum().plot(
    ax=axs[1], title='Cumulative Events from `Sistema Comando` Recorded', **line_style
)
plt.show()

# Row 3: event videos per day (left) and their running total (right).
fig, axs = plt.subplots(1, 2, figsize=(16, 4))
event_videos_type_cnt[all_types].plot(
    ax=axs[0],
    title='Event Videos from Waze and `Sistema Comando` Recorded per Day',
    **line_style,
)
event_videos_type_cnt[all_types].cumsum().plot(
    ax=axs[1],
    title='Cumulative Event Videos from Waze and `Sistema Comando` Recorded',
    **line_style,
)
plt.show()

Note: Events are counted per day, so an event with videos recorded both before and after midnight is counted once on each day. The per-day and cumulative event counts in the charts may therefore be slightly higher than the actual number of events.

---
## Events and videos per type per period count

In [None]:
def event_info(blob_name):
    """Split `blob_name` into its event type and event id.

    `blob_name` is expected to match the folder structure
    `{source}/{type}/{event}/{code}`. Returns the two-item list
    ``['{source}/{type}', '{event}']``.
    """
    parts = blob_name.split('/')
    return [f'{parts[0]}/{parts[1]}', parts[2]]

#### 2023-04-07 and 2023-04-08

#### Events count

In [None]:
# Videos recorded on 2023-04-07 and 2023-04-08 (upper bound is exclusive).
in_period = (ts.index >= '2023-04-07') & (ts.index < '2023-04-09')
ts_cut = ts[in_period]

# Rows whose blob path follows the event folder layout.
folder_structure_msk = ts_cut['folder_structure'].eq('{source}/{type}/{event}/{code}')

# One [event_type, event_id] row per event video in the period.
period_videos = pd.DataFrame(
    ts_cut.loc[folder_structure_msk, 'blob_name'].map(event_info).tolist(),
    columns=['event_type', 'event_id'],
)

# Videos per type, and distinct (type, id) pairs per type.
video_cnt = period_videos['event_type'].value_counts().rename('video_count')
event_cnt = period_videos.drop_duplicates()['event_type'].value_counts().rename('event_count')

event_cnts = pd.concat([event_cnt, video_cnt], axis=1)

print(f'\n{event_cnts}\n')

#### 2023-03-30 and 2023-03-31

#### Events count

In [None]:
# Videos recorded on 2023-03-30 and 2023-03-31. The upper bound is exclusive
# so that a video stamped exactly 2023-04-01 00:00:00 is not pulled into the
# period (same convention as the 2023-04-07 / 2023-04-08 cell).
ts_cut = ts[(ts.index >= '2023-03-30') & (ts.index < '2023-04-01')]

# Rows whose blob path follows the event folder layout.
folder_structure_msk = ts_cut['folder_structure'] == '{source}/{type}/{event}/{code}'

# One [event_type, event_id] row per event video in the period.
period_videos = pd.DataFrame(
    ts_cut.loc[folder_structure_msk, 'blob_name'].apply(event_info).tolist(),
    columns=['event_type', 'event_id'],
)

# Videos per type, and distinct (type, id) pairs per type.
video_cnt = period_videos['event_type'].value_counts().rename('video_count')
event_cnt = period_videos.drop_duplicates()['event_type'].value_counts().rename('event_count')

event_cnts = pd.concat([event_cnt, video_cnt], axis=1)

print(f'\n{event_cnts}\n')

#### 2023-04-18 and 2023-04-19

#### Events count

In [None]:
# Videos recorded on 2023-04-18 and 2023-04-19, as stated in the section
# heading. The window previously started at 2023-04-19 and ended inclusively
# at 2023-04-20, which skipped the first day. The upper bound is exclusive,
# matching the 2023-04-07 / 2023-04-08 cell.
# NOTE(review): confirm 2023-04-18 is the intended start date.
ts_cut = ts[(ts.index >= '2023-04-18') & (ts.index < '2023-04-20')]

# Rows whose blob path follows the event folder layout.
folder_structure_msk = ts_cut['folder_structure'] == '{source}/{type}/{event}/{code}'

# One [event_type, event_id] row per event video in the period.
period_videos = pd.DataFrame(
    ts_cut.loc[folder_structure_msk, 'blob_name'].apply(event_info).tolist(),
    columns=['event_type', 'event_id'],
)

# Videos per type, and distinct (type, id) pairs per type.
video_cnt = period_videos['event_type'].value_counts().rename('video_count')
event_cnt = period_videos.drop_duplicates()['event_type'].value_counts().rename('event_count')

event_cnts = pd.concat([event_cnt, video_cnt], axis=1)

print(f'\n{event_cnts}\n')

---
## Cameras with highest number of flood events

In [None]:
import pandas as pd, matplotlib.pyplot as plt, seaborn as sns; sns.set()

#### Get event types

In [None]:
def get_event_info(blob_name):
    """Return ``[event_type, event_id]`` for `blob_name`, or ``['', '']``.

    A blob name is parsed only when it has exactly five `/`-separated
    segments and its first segment is not `polygons`. In that case the event
    type is the first two segments joined as `{source}/{type}` and the event
    id is the third segment. Any other blob name yields two empty strings.
    """
    parts = blob_name.split('/')
    is_event_blob = len(parts) == 5 and parts[0] != 'polygons'
    if not is_event_blob:
        return ['', '']
    return ['/'.join(parts[:2]), parts[2]]

#### Events recorded per camera

In [None]:
# Parse every blob name; blobs that `get_event_info` does not recognise get
# ['', '']. The result is deliberately NOT stored under the name `event_info`:
# that name belongs to the helper function defined earlier in the notebook,
# and overwriting it with a list breaks those cells when they are re-run.
event_info_rows = control['blob_name'].apply(get_event_info).tolist()
control[['event_type', 'event_id']] = event_info_rows

# Keep only videos of Sistema Comando events and count the distinct events
# recorded by each camera code.
control_comando = control[control['event_type'].isin(['comando/bolsão', 'comando/alagamento'])]
events_per_code = control_comando.groupby('code')['event_id'].nunique()

# Cameras that recorded more than one distinct event.
cams_recurrence = events_per_code[events_per_code > 1]
display(cams_recurrence.sort_values(ascending=False).to_frame('Sistema Comando Events recorded').T)

print('Number of cameras with more than one recorded event from Sistema Comando database:', len(cams_recurrence))