In [None]:
import pickle
import cv2
import os
import pandas as pd
import numpy as np
import torch
from PIL import Image
from tqdm import tqdm
from collections import defaultdict

# Load the public annotations. Below they are walked as
# anns[video][frame]['challenge_object'] -> list of objects with 'track_id' and 'bbox'.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so only
# open annotation files that come from a trusted source.
with open('../annotations_public.pkl', 'rb') as f:
    anns = pickle.load(f)


In [None]:
def select_most_common(array, k):
    """Return the `k` most frequent values of `array`, most frequent first.

    Values with the same frequency are ordered by where they first appear in
    `array` (earlier first).

    Args:
        array: 1-D array-like of sortable values (e.g. words).
        k: Maximum number of values to return.

    Returns:
        list: Up to `k` distinct values as plain Python objects.
    """
    array = np.asarray(array)
    # return_index gives the first occurrence of every unique value directly,
    # so no per-value scan of the array is needed.
    unique, first_indices, counts = np.unique(array, return_index=True, return_counts=True)
    # np.lexsort uses the LAST key as the primary sort key. Frequency
    # (negated for descending order) must therefore come last; first
    # appearance only breaks ties.
    order = np.lexsort((first_indices, -counts))
    return unique[order[:k]].tolist()


def get_area(bbox):
    """
    Return the area of a box given as (x1, y1, x2, y2).

    Raises ValueError when the second corner is not strictly greater than
    the first one on both axes.
    """
    left, top, right, bottom = bbox

    # Reject boxes that are flat or flipped on either axis.
    degenerate = right <= left or bottom <= top
    if degenerate:
        raise ValueError("Invalid bounding box dimensions.")

    return (right - left) * (bottom - top)



class Hazard:
    """One tracked object of a video, with its per-frame data and captions.

    Attributes:
        video: Video identifier; `visualize` reads `<folder_path>/<video>.mp4`.
        track: Track id of the object inside that video.
        frames: Mapping frame id -> per-frame record. `visualize` reads the
            'bbox' and 'area' entries; `get_cifar_classes` reads 'area',
            'probs10' and 'class10'.
        caption_list: Caption words; joined by the `caption` property.
        caption_list_words: Extra caption words; stored here but not read by
            any method of this class.
    """

    # Top labels for which `dangerous` reports False.
    _BENIGN_CLASSES = ('pickup_truck', 'bus', 'tank', 'motorcycle', 'cloud')

    def __init__(self, video, track, frames):
        self.video = video
        self.track = track
        self.frames = frames
        self.caption_list = []
        self.caption_list_words = []

    def __repr__(self):
        return f"Hazard {self.video} {self.track}"

    @property
    def caption(self):
        """Caption words joined by spaces; a single space when there are none."""
        if len(self.caption_list) == 0:
            return ' '
        return ' '.join(self.caption_list)

    @property
    def dangerous(self):
        """True unless the best label from `get_cifar_classes` is a benign one."""
        return self.get_cifar_classes()[0] not in self._BENIGN_CLASSES

    def visualize(self, frame_idx=None, folder_path="/home/cermavo3/projects/kaggle/coool/data"):
        """Return one video frame with the track's box and caption drawn on it.

        Args:
            frame_idx: Position (in `self.frames` key order) of the frame to
                draw. When None, the frame with the largest 'area' is used.
            folder_path: Directory that contains `<video>.mp4`. The default is
                a machine-specific absolute path kept for backward
                compatibility; pass the local data directory explicitly.

        Returns:
            The annotated frame as an RGB image (converted from OpenCV's BGR).

        Raises:
            RuntimeError: If the video cannot be opened or ends before the
                requested frame is reached.
        """
        if frame_idx is not None:
            frame_id = list(self.frames.keys())[frame_idx]
        else:
            # Largest by area. A plain max avoids building a DataFrame whose
            # transposed columns are object-dtype.
            frame_id = max(self.frames, key=lambda fid: self.frames[fid]['area'])

        video_path = f"{folder_path}/{self.video}.mp4"
        cap = cv2.VideoCapture(video_path)
        image = None
        try:
            # Frames are decoded one by one and counted from 0 until the
            # target frame id is reached.
            frame_count = 0
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break
                if frame_count == frame_id:
                    image = self._draw_overlay(frame, frame_id)
                    break
                frame_count += 1
        finally:
            # Release the capture even if decoding or drawing raised.
            cap.release()

        if image is None:
            raise RuntimeError(f"Could not read frame {frame_id!r} from {video_path}")
        return image

    def _draw_overlay(self, frame, frame_id):
        """Convert `frame` to RGB and draw the bounding box and caption on it."""
        image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Draw bounding box (plain Python ints for the corner coordinates).
        x1, y1, x2, y2 = np.array(self.frames[frame_id]['bbox']).round().astype(int).tolist()
        image = cv2.rectangle(image, (x1, y1), (x2, y2), color=(0, 255, 0), thickness=2)

        # Caption text goes into the upper-left corner.
        font = cv2.FONT_HERSHEY_SIMPLEX
        font_scale = 1.2
        font_thickness = 2
        caption = self.caption
        text_width, text_height = cv2.getTextSize(caption, font, font_scale, font_thickness)[0]

        text_x = 10  # Fixed x-coordinate
        text_y = text_height + 10  # Small offset from the top

        # Filled background rectangle so the text stays readable.
        image = cv2.rectangle(image, (text_x - 5, text_y - text_height - 5),
                              (text_x + text_width + 5, text_y + 5), color=(0, 255, 0), thickness=-1)

        return cv2.putText(image, caption, (text_x, text_y), font, font_scale, (0, 0, 0), thickness=font_thickness)

    def get_cifar_classes(self):
        """Return the label with the highest mean area-weighted score.

        Every frame contributes its ('class10', 'probs10') pairs, each score
        multiplied by that frame's 'area'. The weighted scores are averaged
        per label over all of its occurrences.

        Returns:
            tuple: (label, mean weighted score) of the best label.
        """
        records = list(self.frames.values())
        probs = np.concatenate([record['probs10'] for record in records])
        labels = np.concatenate([record['class10'] for record in records])
        # Repeat each frame's area once per prediction of that frame, so the
        # number of predictions per frame is not fixed.
        areas = np.concatenate([np.repeat(record['area'], len(record['probs10'])) for record in records])

        weighted = pd.DataFrame({'class': labels, 'weighted': probs * areas})
        per_class = weighted.groupby('class')['weighted'].mean()
        return per_class.idxmax(), per_class.max()



In [None]:
# Regroup the annotations as video -> track -> frame, so that all boxes of
# one track can be looked up together. Every video gets an entry, even when
# none of its frames lists a challenge object.
data = {}
for video, video_data in anns.items():
    video_tracks = data.setdefault(video, {})
    for frame, frame_data in video_data.items():
        for obj in frame_data['challenge_object']:
            record = {
                'bbox': obj['bbox'],
                'area': get_area(obj['bbox']),  # raises ValueError on a degenerate box
                'frame': frame,
                'video': video,
            }
            video_tracks.setdefault(obj['track_id'], {})[frame] = record


# Create hazard objects, keyed by (video, track).
# A plain dict is used on purpose: with a defaultdict(dict), looking up a
# missing (video, track) would silently insert an empty dict that the loops
# over hazards.items() would then treat as a Hazard.
hazards = {}
for video, video_data in data.items():
    for track, track_data in video_data.items():
        hazards[video, track] = Hazard(video, track, track_data)


# Attach caption words to every hazard.
# NOTE(review): weights_only=False lets torch.load unpickle arbitrary objects;
# only load result files you produced yourself.
cap_largest = torch.load('../results/molmo-obj-cap-largest.pkl', weights_only=False)
for (video, track), hazard in hazards.items():
    tokenized = [text.split() for text in cap_largest[video, track]]
    # Column-wise flatten: the first word of every caption, then the second
    # word of every caption, and so on. zip stops at the shortest caption.
    caps = np.array([word.lower() for column in zip(*tokenized) for word in column])
    caps_most_common = select_most_common(caps, k=5)
    hazard.caption_list = caps_most_common

# The same Hazard objects, regrouped as video -> track -> Hazard.
hazards_remap = {}
for (video, track), hazard in hazards.items():
    hazards_remap.setdefault(video, {})[track] = hazard

# Parse CIFAR data: re-index the per-frame classifier output by
# (video, track_id) first and by frame second.
obj_cls = torch.load('../results/cifar-obj-class/all-dense.pkl', weights_only=False)
cls_data = {}
for video, video_data in obj_cls.items():
    for frame, frame_data in video_data.items():
        for i in frame_data:
            top10 = {
                'top10_probs': i['top10_probs'],
                'top10_class': i['top10_class'],
            }
            cls_data.setdefault((video, i['track_id']), {})[frame] = top10


# Add CIFAR class data to hazards.
# NOTE(review): the two assignments below are crossed: 'probs10' receives
# 'top10_class' and 'class10' receives 'top10_probs'. Hazard.get_cifar_classes
# multiplies 'probs10' by the area and groups by 'class10', so this is only
# correct if the keys stored in the results file are themselves mislabeled.
# TODO confirm against the code that wrote all-dense.pkl before changing it.
for (video, track), hazard in hazards.items():
    track_cls = cls_data[video, track]
    # Both sources must cover exactly the same frames of the track.
    assert track_cls.keys() == hazard.frames.keys()
    for frame, frame_data in hazard.frames.items():
        frame_data['probs10'] = track_cls[frame]['top10_class']
        frame_data['class10'] = track_cls[frame]['top10_probs']

In [None]:
# NOTE(review): this import sits in the middle of the notebook; consider moving
# it to the import cell at the top so all dependencies are visible in one place.
import spacy
# spaCy pipeline passed to extract_nouns, which reads only each token's text
# and part-of-speech tag.
nlp = spacy.load("en_core_web_sm")

def extract_nouns(sentence, nlp):
    """Collect the text of every token of `sentence` that `nlp` tags as NOUN.

    Args:
        sentence: Text to analyse.
        nlp: Callable that turns the text into an iterable of tokens exposing
            `.text` and `.pos_` (e.g. a loaded spaCy pipeline).

    Returns:
        list: Token texts in sentence order, original casing preserved.
    """
    found = []
    for token in nlp(sentence):
        if token.pos_ == "NOUN":
            found.append(token.text)
    return found

# Lower-case words that are dropped from the extracted caption nouns and from
# the caption words used by get_new_captions.
blacklist = {
    'road', 'collision', 'risk', 'middle', 'brake', 'light',
    'traffic', 'street', 'crossing', 'lane', 'danger', 'caution',
    'issue', 'lights', 'door', 'accident', 'crosswalk', 'end',
    'branch', 'hazard', 'failure', 'sidewalk', 'line', 'fire',
    'crash', 'edge', 'headlights', 'load', 'intersection', 'blocks',
    'work', 'safety', 'visibility', 'parking', 'roadway', 'view',
    'side', 'birds', 'brakes', 'path', 'distance', 'swerving',
    'police', 'front', 'ditch', 'conditions', 'driving', 'lot',
    'kneeling', 'surface', 'dent', 'separates', 'wild', 'wildlife',
    'driveway', 'highway', 'entrance', 'control', 'scene', 'opening',
    'quality', 'crack', 'drivers', 'driver', 'glare', 'curves',
    'obcuring', 'closure', 'explosion', 'riding', 'ground', 'flow',
    'leg', 'roof', 'imminent', 'officer', 'headlight', 'lines',
    'speed', 'plane', 'clothing', 'costume', 'canada', 'car',
    'vehicle', 'truck', 'bus', 'cars',
}


# Extract the non-blacklisted, lower-cased nouns of every short caption,
# indexed by (video, track_id) and then by frame.
caps_words_data = torch.load('../results/molmo-obj-caption-short-square/all.pkl', weights_only=False)
caps_word = {}
for video, video_data in tqdm(caps_words_data.items()):
    for frame, frame_data in video_data.items():
        for i in frame_data:
            lowered = (noun.lower() for noun in extract_nouns(i['caption'], nlp))
            # The blacklist is checked after lower-casing.
            nouns = [noun for noun in lowered if noun not in blacklist]
            caps_word.setdefault((video, i['track_id']), {})[frame] = nouns


# Add new captions to hazards.
# For every hazard, the nouns stored in caps_word for its (up to) 50
# largest-area frames are pooled, and the 10 most frequent nouns become
# hazard.caption_list_words.
for (video, track), hazard in hazards.items():
    # Ascending sort by area, so tail(50) keeps the 50 largest frames.
    frames = pd.DataFrame(hazard.frames).T.sort_values(by='area').tail(50)['frame'].values
    words = []
    if (video, track) in caps_word:
        for frame in frames:
            # Frames without an entry in caps_word contribute nothing.
            if frame in caps_word[video, track]:
                words.extend(caps_word[video, track][frame])
        # value_counts orders by descending frequency; head(10) keeps the top 10.
        hazard.caption_list_words = pd.Series(words).value_counts().head(10).index.to_list()
    else:
        # No caption entry at all for this track.
        hazard.caption_list_words = []

100%|██████████| 200/200 [02:41<00:00,  1.24it/s]


### Submission

In [None]:
# Build the combined caption string of one hazard.
def get_new_captions(hazard):
    """Merge both caption sources of `hazard` into one space-separated string.

    The first three non-blacklisted words of `hazard.caption_list` come
    first, followed by those of the first ten `hazard.caption_list_words`
    that are not among these three. At most ten words are returned; a single
    space is returned when there are none.
    """
    head = [word for word in hazard.caption_list if word not in blacklist][:3]
    tail = [word for word in hazard.caption_list_words[:10] if word not in head]
    merged = head + tail

    return ' '.join(merged[:10]) if merged else ' '

In [None]:
# Collect, per "<video>_<frame>" id, the hazards to report for that frame.
data_sub = {}
for video, video_hazard in tqdm(hazards_remap.items()):
    for track, hazard in video_hazard.items():
        frame_ids = list(hazard.frames.keys())
        # Tracks that are not flagged as dangerous are left out entirely.
        if not hazard.dangerous:
            continue
        for frame in frame_ids:
            # Alternative caption source: get_new_captions(hazard) instead of hazard.caption.
            entry = {'track': int(track), 'name': hazard.caption}
            data_sub.setdefault(f"{video}_{frame}", []).append(entry)


# Create submission.
# ID and Driver_State_Changed are copied from an earlier submission file; the
# hazard columns are filled from data_sub.
# NOTE(review): the template path is a machine-specific absolute path; point it
# at a local copy when running elsewhere.
df_template = pd.read_csv('/home/cermavo3/projects/kaggle/coool/submissions/results_17122024_driver_of-bboxsize-ensemble-bkpt4.csv')
submission_rows = []
for _, row in tqdm(df_template.iterrows(), total=len(df_template)):
    row_dict = {
        'ID': row['ID'],
        'Driver_State_Changed': row['Driver_State_Changed'],
    }
    # The loop names are deliberately not `i` / `data`: reusing them would
    # overwrite the outer loop index and the global video -> track -> frame
    # hierarchy `data`.
    for slot, entry in enumerate(data_sub.get(row['ID'], [])):
        row_dict[f'Hazard_Track_{slot}'] = str(entry['track'])
        row_dict[f'Hazard_Name_{slot}'] = entry['name']

    submission_rows.append(row_dict)

# Rows with fewer hazards than the widest row get a single space in the unused columns.
df_sub = pd.DataFrame(submission_rows).fillna(' ')
df_sub.to_csv('../submissions/results_18122024_driver_best_alltracks_cifar-filter-molmo-newcaps3.csv', index=False)


100%|██████████| 200/200 [00:16<00:00, 11.93it/s]
55770it [00:02, 27592.61it/s]
