# Timestamp-based video labeling from a single camera

In [1]:
# Move the working directory one level up (presumably so the relative 'Dados/...' paths used below resolve — confirm).
# NOTE(review): re-running this cell moves up one more level each time.
cd ../

C:\Users\luisr\Desktop\Repositories\Data Science Projects\Hackaton COR IV - Centro de Operações do RJ\INCUBAÇÃO\Cameras


### Load video control dataset

In [55]:
import numpy as np, pandas as pd

# Read the video control table, parse its 'timestamp' column as datetimes,
# then index and sort the table chronologically by that column.

video_control_path = 'Dados/Controle de vídeos/videos_control_19-04.csv'

control = (
    pd.read_csv(video_control_path)
    .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp']))
    .set_index('timestamp', drop=True)
    .sort_index()
)

## Set video timestamp labeling parameters

In [65]:
# set parameters

# bounds of the time window to label
start, end = '2023-02-07', '2023-03-02'

# allowed values per column of the video dataset
query = dict(
    code=[1475],
    folder_structure=[
        'polygons/{type}/{polygon}/{code}',
        '{source}/{type}/{event}/{code}',
    ],
)

# (timestamp, label) pairs listed chronologically
time_label = dict([
    ('2023-02-07 19:25:00', 'alagamento'),   # start 07/02
    ('2023-02-08 00:35:00', 'bolsão'),       # start 08/02
    ('2023-02-08 01:15:00', 'lâmina'),
    ('2023-02-08 01:52:30', 'poça'),
    ('2023-02-08 02:32:30', 'normalidade'),
    ('2023-02-11 20:50:00', 'alagamento'),   # start 11/02
    ('2023-02-11 20:51:00', 'normalidade'),  # end 11/02
    ('2023-03-02 00:00:00', 'normalidade'),  # after end of 2023-03-01 ?
])

## Timestamp-based video labeling from a single camera

In [66]:
# take video dataset

df = control.copy()

# query video dataset: keep rows whose column value is among the allowed ones

for key, allowed in query.items():
    df = df[df[key].isin(allowed)]

# cut video dataset to the [start, end] window (both bounds inclusive);
# copy so that the 'tag' column below is written to an independent frame
# rather than to a filtered slice of `control`

df = df[(df.index >= start) & (df.index <= end)].copy()

# label videos by category timestamps

time_label = pd.Series(time_label).sort_index()

n_labels = len(time_label)
stamps = time_label.index

# object dtype: the series starts as missing values and receives string labels
labels = pd.Series(np.nan, index=df.index, dtype=object)

# Each consecutive pair of stamps bounds one interval labeled with the
# earlier stamp's category. Both bounds are inclusive, so a video exactly on
# a boundary is re-labeled by the following interval (later iterations
# overwrite earlier ones). The category of the last stamp is never assigned;
# that stamp only closes the final interval.
for i in range(n_labels - 1):
    t1, t2 = stamps[i], stamps[i + 1]
    msk_t = (df.index >= t1) & (df.index <= t2)
    labels.loc[msk_t] = time_label.loc[t1]

# videos outside every interval keep a missing tag
df['tag'] = labels

# display result label count

display(df.head()[['blob_name', 'code', 'tag']])
display(df['tag'].value_counts().to_frame('Video tag count'))

Unnamed: 0_level_0,blob_name,code,tag
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2023-02-07 19:25:00,polygons/flood-unlabeled/1/1475/CODE1475 2023-...,1475,alagamento
2023-02-07 19:30:00,polygons/flood-unlabeled/1/1475/CODE1475 2023-...,1475,alagamento
2023-02-07 19:35:00,polygons/flood-unlabeled/1/1475/CODE1475 2023-...,1475,alagamento
2023-02-07 19:40:00,polygons/flood-unlabeled/1/1475/CODE1475 2023-...,1475,alagamento
2023-02-07 19:50:00,polygons/flood-unlabeled/1/1475/CODE1475 2023-...,1475,alagamento


Unnamed: 0_level_0,Video tag count
tag,Unnamed: 1_level_1
normalidade,120
alagamento,28
poça,8
lâmina,6
bolsão,3


### Check result's unique dates

In [67]:
# distinct calendar dates present in the labeled videos' timestamp index
unique_dates = np.unique(df.index.date)
print(unique_dates)

[datetime.date(2023, 2, 7) datetime.date(2023, 2, 8)
 datetime.date(2023, 2, 11) datetime.date(2023, 2, 28)
 datetime.date(2023, 3, 1)]


### Save labeled dataset as csv

In [68]:
# write the labeled videos to csv, with the timestamp index turned back into a column
labeled_videos = df.reset_index()
labeled_videos.to_csv('Dados/Rotulos/1475_2023-02-07.csv', index=False)

## Load video frames as labeled images

In [242]:
import os, cv2, numpy as np
from IPython.display import clear_output as co


class VideoLoader:
    """Read video files with OpenCV and return their frames as arrays.

    The `dim` attribute selects the per-frame conversion:
      * 1 -- frame converted to grayscale and flattened to a 1D array
      * 2 -- frame converted to grayscale (2D array)
      * any other value -- frame kept exactly as returned by the capture
    """

    def __init__(self, dim=3):
        """Store the frame conversion mode (see class docstring)."""
        self.dim = dim

    def frames_from_labeled_videos(self, paths, labels, print_each=None):
        """Load the frames of several videos and repeat each video's label per frame.

        Parameters
        ----------
        paths : iterable of video file paths.
        labels : sized iterable with one label per path.
        print_each : int or None. When truthy, a progress line is printed
            every `print_each` opened videos; None or 0 disables progress.

        Returns
        -------
        (numpy.ndarray, numpy.ndarray) : all frames, and one label per frame.
        """
        x, y = [], []
        n = len(labels)
        for i, (path, label) in enumerate(zip(paths, labels), start=1):
            frames = self.frames_from_video(path)
            x.extend(frames)
            y.extend([label] * len(frames))
            # truthiness check also guards against a modulo by zero
            if print_each and i % print_each == 0:
                co(True)  # clear the previous progress line
                print(f'CAPTURE LABELED VIDEOS · OPEN: {i}/{n}')
        return np.array(x), np.array(y)

    def frames_from_video(self, path):
        """Return the list of frames of the video at `path`.

        Prints a message and returns an empty list when the capture cannot
        be opened. The capture is always released, including when reading
        or converting a frame raises.
        """
        cap = cv2.VideoCapture(path)
        try:
            if not cap.isOpened():
                print(f"CANNOT OPEN VIDEO CAPTURE · PATH: {path}")
                return []
            frames = []
            while True:
                ret, frame = cap.read()
                if not ret:
                    break  # stream finished
                if self.dim in (1, 2):  # gray scale frame
                    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
                if self.dim == 1:  # 1D flat frame
                    frame = np.reshape(frame, -1)
                frames.append(frame)
            return frames
        finally:
            # this method never creates a display window, so releasing the
            # capture is the only cleanup required
            cap.release()

#### Reload labeled videos dataset

In [256]:
import pandas as pd

# Read the labeled videos back from csv, parse 'timestamp' as datetimes,
# then index and sort chronologically by it.
videos = (
    pd.read_csv('Dados/Rotulos/1475_2023-02-07.csv')
    .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp']))
    .set_index('timestamp', drop=True)
    .sort_index()
)

videos[['blob_name', 'tag']].head()

Unnamed: 0_level_0,blob_name,tag
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2023-02-07 19:25:00,polygons/flood-unlabeled/1/1475/CODE1475 2023-...,alagamento
2023-02-07 19:30:00,polygons/flood-unlabeled/1/1475/CODE1475 2023-...,alagamento
2023-02-07 19:35:00,polygons/flood-unlabeled/1/1475/CODE1475 2023-...,alagamento
2023-02-07 19:40:00,polygons/flood-unlabeled/1/1475/CODE1475 2023-...,alagamento
2023-02-07 19:50:00,polygons/flood-unlabeled/1/1475/CODE1475 2023-...,alagamento


## Split videos into train and test sets

#### Binarize categories of target variable

In [257]:
# binary class -> original tags grouped under it
replace_tags = {
    'acúmulo': ['lâmina', 'bolsão', 'alagamento'],
    'normalidade': ['poça', 'normalidade'],
}

# inverted lookup: original tag -> binary class
tag_to_class = {
    tag: key
    for key, values in replace_tags.items()
    for tag in values
}

# one binary class per video; tags absent from the lookup (including missing
# tags) map to None so that `y` always has the same length as `videos`
y = pd.Series(
    [tag_to_class.get(tag) for tag in videos['tag']],
    index=videos.index,
)

# fail with the offending tags instead of an index length mismatch
unmapped = y.isna().to_numpy()
if unmapped.any():
    unknown_tags = list(videos['tag'][unmapped].unique())
    raise ValueError(f'Tags without a binary class in replace_tags: {unknown_tags}')

display(y.value_counts().to_frame('Video Samples'))

Unnamed: 0,Video Samples
normalidade,128
acúmulo,37


#### Undersampling video samples · Drop excess majority-class members

In [258]:
from imblearn.under_sampling import RandomUnderSampler

# Balance the two classes by randomly dropping majority-class videos
# (sampling_strategy=1.0 requests a 1:1 minority/majority ratio).
# random_state is fixed so the selected videos are the same on every run.
rus = RandomUnderSampler(sampling_strategy=1.0, random_state=0)
x_res, y_res = rus.fit_resample(videos, y)

display(y_res.value_counts().to_frame('Videos Under-Sampled'))

Unnamed: 0,Videos Under-Sampled
acúmulo,37
normalidade,37


#### Train test split video samples

In [266]:
# train_test_split is not imported by any other cell of this notebook
from sklearn.model_selection import train_test_split

# fractions of the under-sampled videos assigned to each set
t_size = 0.25
e_size = 0.25

# stratified, shuffled and seeded split of the video samples
xt, xe, yt, ye = train_test_split(
    x_res, y_res, train_size=t_size, test_size=e_size,
    shuffle=True, stratify=y_res, random_state=0
)

display(pd.concat([
    yt.value_counts().to_frame('Train set'),
    ye.value_counts().to_frame('Test set')
], axis=1))

# selected videos and their classes, train rows first and test rows after
yy = pd.concat([yt, ye], axis=0)
xx = pd.concat([xt, xe], axis=0)

Unnamed: 0,Train set,Test set
acúmulo,9,9
normalidade,9,10


#### Load images and class labels from selected videos

In [None]:
# local root folder of the video collection
folder = 'Dados/flood-video-collection'

# local path of each selected video: root folder joined with the blob name,
# with every ':' in the resulting path replaced by '-'
videos_paths = []
for blob_name in xx['blob_name']:
    local_path = f'{folder}/{blob_name}'
    videos_paths.append(local_path.replace(':', '-'))

videos_labels = yy.tolist()

# dim=1 -> every frame is loaded as a flattened gray scale vector
video_loader = VideoLoader(dim=1)

frames, labels = video_loader.frames_from_labeled_videos(
    videos_paths, videos_labels, print_each=1
)

CAPTURE LABELED VIDEOS · OPEN: 32/37


In [262]:
# size summary of the loaded data, followed by the per-class frame count
n_videos = len(xx)
n_frames = len(frames)
n_items = len(frames[0])

print(f'\nVideos: {n_videos}')
print(f'Frames: {n_frames}')
print(f'Items in first dimension: {n_items}')
print()

label_counts = pd.Series(labels).value_counts()
display(label_counts.to_frame('Image Samples'))


Videos: 45
Frames: 4291
Items in first dimension: 409920



Unnamed: 0,Image Samples
acúmulo,3485
normalidade,806


#### Undersampling image samples · Drop excess majority-class members

In [264]:
from imblearn.under_sampling import RandomUnderSampler

# Balance the two classes by randomly dropping majority-class frames
# (sampling_strategy=1.0 requests a 1:1 minority/majority ratio).
# random_state is fixed so the selected frames are the same on every run.
rus = RandomUnderSampler(sampling_strategy=1.0, random_state=0)
X_res, Y_res = rus.fit_resample(frames, labels)

Y_res = pd.Series(Y_res)

display(Y_res.value_counts().to_frame('Images Under-Sampled'))

Unnamed: 0,Images Under-Sampled
acúmulo,806
normalidade,806


#### Train test split image samples

In [251]:
# train_test_split is not imported by any other cell of this notebook
from sklearn.model_selection import train_test_split

# fractions of the under-sampled frames assigned to each set
t_size = 0.3
e_size = 0.3

# NOTE(review): this split is made per frame, so frames of the same video can
# fall in both the train and the test set — confirm this is intended before
# trusting the evaluation below.
X_train, X_test, y_train, y_test = train_test_split(
    X_res, Y_res, train_size=t_size, test_size=e_size,
    random_state=0, shuffle=True,
    stratify=Y_res
)

display(pd.concat([
    pd.Series(y_train).value_counts().to_frame('Train set'),
    pd.Series(y_test).value_counts().to_frame('Test set')
], axis=1))

Unnamed: 0,Train set,Test set
normalidade,301,300
acúmulo,300,301


#### Fit, predict and evaluate base model

In [214]:
from sklearn.naive_bayes import GaussianNB

# fit a Gaussian naive Bayes model on the train frames and predict the test frames
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

# number of test frames whose predicted class differs from the true class
n_test = X_test.shape[0]
n_mislabeled = (y_test != y_pred).sum()

print("Number of mislabeled points out of a total %d points : %d"
      % (n_test, n_mislabeled))

Number of mislabeled points out of a total 675 points : 24


---
## Exploratory data analysis 

#### Reload labeled videos dataset

In [30]:
# Read the labeled videos back from csv, parse 'timestamp' as datetimes,
# then index and sort chronologically by it.
videos = (
    pd.read_csv('Dados/Rotulos/1475_2023-02-07.csv')
    .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp']))
    .set_index('timestamp', drop=True)
    .sort_index()
)

videos.head()

Unnamed: 0_level_0,blob_name,blob_size,bucket_name,file_name,code,n_folders,folder_structure,tag
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2023-02-07 19:25:00,polygons/flood-unlabeled/1/1475/CODE1475 2023-...,206926,flood-video-collection,CODE1475 2023-02-07 19:25:00.mp4,1475,5,polygons/{type}/{polygon}/{code},alagamento
2023-02-07 19:30:00,polygons/flood-unlabeled/1/1475/CODE1475 2023-...,261337,flood-video-collection,CODE1475 2023-02-07 19:30:00.mp4,1475,5,polygons/{type}/{polygon}/{code},alagamento
2023-02-07 19:35:00,polygons/flood-unlabeled/1/1475/CODE1475 2023-...,234652,flood-video-collection,CODE1475 2023-02-07 19:35:00.mp4,1475,5,polygons/{type}/{polygon}/{code},alagamento
2023-02-07 19:40:00,polygons/flood-unlabeled/1/1475/CODE1475 2023-...,287397,flood-video-collection,CODE1475 2023-02-07 19:40:00.mp4,1475,5,polygons/{type}/{polygon}/{code},alagamento
2023-02-07 19:50:00,polygons/flood-unlabeled/1/1475/CODE1475 2023-...,211736,flood-video-collection,CODE1475 2023-02-07 19:50:00.mp4,1475,5,polygons/{type}/{polygon}/{code},alagamento
