In [None]:
import ast
import os
import re
from zipfile import ZipFile

import cv2
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [None]:
# Mount Google Drive at /content/drive; every dataset path used in this
# notebook lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Define paths
# Drive folder that holds the video ZIP archives and the crash annotation file.
# NOTE(review): the visible cells hard-code full paths under this folder
# instead of referencing zip_dir.
zip_dir = "/content/drive/MyDrive/CarCrash/videos"
# Root folder for extracted frames; train/ and test/ subfolders are used beneath it.
output_dir = "/content/drive/MyDrive/CarCrashYOLO/images"

In [None]:
# Function to generate training and testing data split
def generate_train_test_split(test_size=0.3, random_state=42):
    """Split the 3000 video file names into stratified train and test sets.

    Videos 000001-001500 are labelled positive (1) and videos 001501-003000
    negative (0); the split keeps that class ratio in both subsets.

    Args:
        test_size: Share of the videos assigned to the test set, forwarded
            to ``train_test_split``.
        random_state: Seed forwarded to ``train_test_split``. It is fixed by
            default so that re-running the notebook assigns every video to
            the same split; frames are written into persistent train/ and
            test/ folders, so a different split on a re-run would leave the
            same video in both. Pass ``None`` for a fresh random split.

    Returns:
        Tuple ``(train_files, test_files, y_train, y_test)`` of file-name
        lists and their matching 0/1 label lists.
    """
    # Zero-padded six-digit names; the first 1500 are positive, the rest negative
    positive_files = [f"{i:06d}.mp4" for i in range(1, 1501)]
    negative_files = [f"{i:06d}.mp4" for i in range(1501, 3001)]

    # Combine positive and negative files
    all_files = positive_files + negative_files
    labels = [1] * len(positive_files) + [0] * len(negative_files)  # 1 for positive, 0 for negative

    # Stratified, seeded split of file names and labels
    train_files, test_files, y_train, y_test = train_test_split(
        all_files,
        labels,
        test_size=test_size,
        stratify=labels,
        random_state=random_state,
    )

    return train_files, test_files, y_train, y_test

# Function to extract videos from a ZIP file
def extract_videos(zip_path, folder_name, start_index=None, end_index=None):
    """Extract numbered ``NNNNNN.mp4`` members of a ZIP archive into a folder.

    Candidate numbers run from 1 to 3000. A number is extracted only when it
    is at least ``start_index`` and at most ``end_index``; a bound left as
    ``None`` is not applied.

    Args:
        zip_path: Path of the ZIP archive to read.
        folder_name: Destination folder for the extracted files.
        start_index: Lowest video number to extract, or ``None`` for no lower bound.
        end_index: Highest video number to extract, or ``None`` for no upper bound.
    """
    def _wanted(number):
        # Guard clauses: reject as soon as a supplied bound is violated
        if not (start_index is None or number >= start_index):
            return False
        return end_index is None or number <= end_index

    with ZipFile(zip_path, 'r') as archive:
        # filter() is lazy, so selection and extraction stay interleaved
        for number in filter(_wanted, range(1, 3001)):
            archive.extract(f"{number:06d}.mp4", folder_name)

# Function to save frames as jpg
def save_frames(files, output_folder, split_type):
    """Save the first 50 frames of each video as JPEG images.

    Videos are read from the local ``videos`` folder. Frames are written to
    ``<output_folder>/<split_type>/<video_name>_<frame_num>.jpg`` with frame
    numbers starting at 1. A video that yields fewer than 50 frames
    contributes only the frames that could be read.

    Args:
        files: Iterable of video file names (e.g. ``"000001.mp4"``).
        output_folder: Root folder for the extracted frames.
        split_type: Subfolder name under ``output_folder`` (e.g. ``"train"``).
    """
    import warnings

    # The destination is the same for every video, so resolve and create it
    # once. cv2.imwrite does not create missing directories; it only reports
    # failure through its return value.
    save_folder = os.path.join(output_folder, split_type)
    os.makedirs(save_folder, exist_ok=True)

    for file in tqdm(files):
        video_path = os.path.join("videos", file)
        video_name = os.path.splitext(file)[0]

        # Open the video file
        cap = cv2.VideoCapture(video_path)
        try:
            if not cap.isOpened():
                warnings.warn(f"Could not open video: {video_path}")
                continue

            # Read and save each frame
            for frame_num in range(1, 51):
                ret, frame = cap.read()
                if not ret:
                    # No further frame could be read from this video
                    break

                frame_filename = f"{video_name}_{frame_num}.jpg"
                frame_path = os.path.join(save_folder, frame_filename)
                if not cv2.imwrite(frame_path, frame):
                    # Surface failed writes instead of silently losing frames
                    warnings.warn(f"Could not write frame: {frame_path}")
        finally:
            # Release the capture even if reading or writing raised
            cap.release()

# Function to create DataFrame with information
def create_dataframe(image_folder):
    """Index the extracted frame images of both splits in a DataFrame.

    Scans ``<image_folder>/train`` and ``<image_folder>/test`` for files
    named ``<video_id>_<frame_num>.jpg``. Entries that do not follow that
    pattern are skipped rather than aborting the scan.

    Args:
        image_folder: Root folder containing the ``train`` and ``test`` subfolders.

    Returns:
        DataFrame with columns ``Video_ID`` (string taken from the file
        name), ``Split_Type`` (``"train"`` or ``"test"``) and ``Frame_Num`` (int).
    """
    data = []

    for split_type in ["train", "test"]:
        folder_path = os.path.join(image_folder, split_type)

        for image_name in tqdm(os.listdir(folder_path)):
            stem, extension = os.path.splitext(image_name)

            # Ignore anything that is not a frame image
            if extension.lower() != ".jpg" or "_" not in stem:
                continue

            # Split on the LAST underscore so the frame number is always the
            # trailing component, even if the video name contains underscores
            video_id, frame_num = stem.rsplit('_', 1)
            if not frame_num.isdigit():
                continue

            data.append([video_id, split_type, int(frame_num)])

    columns = ["Video_ID", "Split_Type", "Frame_Num"]
    return pd.DataFrame(data, columns=columns)

In [None]:
# Build the train/test frame dataset from the positive and negative video archives
def generate_video_frame_split(zip_path_pos, zip_path_neg, test_size=0.3):
    """Extract the videos, split them, dump their frames and index the result.

    Args:
        zip_path_pos: ZIP archive from which videos up to number 1500 are extracted.
        zip_path_neg: ZIP archive from which videos 1501-3000 are extracted.
        test_size: Forwarded to ``generate_train_test_split``.

    Returns:
        The DataFrame produced by ``create_dataframe`` for ``output_dir``.
    """
    # Unpack both archives into the local "videos" folder, positives first
    archive_bounds = (
        (zip_path_pos, {"end_index": 1500}),
        (zip_path_neg, {"start_index": 1501, "end_index": 3000}),
    )
    for archive_path, bounds in archive_bounds:
        extract_videos(archive_path, "videos", **bounds)

    # Only the file names are needed here; the label lists are discarded
    train_videos, test_videos, _, _ = generate_train_test_split(test_size)

    # Write frames for the training videos, then for the test videos
    for subset_name, subset_videos in (("train", train_videos), ("test", test_videos)):
        save_frames(subset_videos, output_dir, subset_name)

    # Index everything that now sits under the output folder
    return create_dataframe(output_dir)

In [None]:
# Archives on the mounted Drive: crash videos (used as the positive class)
# and normal videos (used as the negative class).
zip_path_pos = "/content/drive/MyDrive/CarCrash/videos/Crash-1500.zip"
zip_path_neg = "/content/drive/MyDrive/CarCrash/videos/Normal.zip"

# Extract the videos, write the frames of both splits and index them.
# In the recorded run the two frame-writing passes took about 1h04 and 28min.
df = generate_video_frame_split(zip_path_pos, zip_path_neg, test_size=0.3)
df.head()

100%|██████████| 2100/2100 [1:04:06<00:00,  1.83s/it]
100%|██████████| 900/900 [27:49<00:00,  1.85s/it]
100%|██████████| 105000/105000 [00:01<00:00, 86107.54it/s]
100%|██████████| 44999/44999 [00:00<00:00, 140043.13it/s]


Unnamed: 0,Video_ID,Split_Type,Frame_Num
0,16,train,1
1,16,train,2
2,16,train,3
3,16,train,4
4,16,train,5


In [None]:
# Display the frame index (pandas truncates the rendering of long frames)
df

Unnamed: 0,Video_ID,Split_Type,Frame_Num
0,000016,train,1
1,000016,train,2
2,000016,train,3
3,000016,train,4
4,000016,train,5
...,...,...,...
149994,002780,test,46
149995,002780,test,47
149996,002780,test,48
149997,002780,test,49


In [None]:
# Number of extracted frames per split.
# NOTE(review): the recorded output shows 44999 test frames, one fewer than
# 900 videos x 50 frames, so one frame was not read or not written.
df['Split_Type'].value_counts()

train    105000
test      44999
Name: Split_Type, dtype: int64

In [None]:
# Persist the frame index without the row index column.
# NOTE(review): Video_ID is a zero-padded string; a default pd.read_csv will
# presumably parse it as an integer, so pass dtype={"Video_ID": str} when reloading.
df.to_csv("/content/drive/MyDrive/CarCrashYOLO/df.csv", index=False)

In [None]:
# Inspect the 'labels' entry stored in the feature file of positive video 000001
data = np.load('/content/drive/MyDrive/CarCrash/vgg16_features/positive/000001.npz')
data['labels']

In [None]:
# Order rows by video and frame, then add a numeric copy of the video ID
df = (
    df
    .sort_values(by=['Video_ID', 'Frame_Num'])
    .assign(Video_ID_int=lambda frame: frame['Video_ID'].astype(int))
)
df

Unnamed: 0,Video_ID,Split_Type,Frame_Num,Video_ID_int
30600,000001,train,1,1
30601,000001,train,2,1
30602,000001,train,3,1
30603,000001,train,4,1
30604,000001,train,5,1
...,...,...,...,...
22495,003000,train,46,3000
22496,003000,train,47,3000
22497,003000,train,48,3000
22498,003000,train,49,3000


In [None]:
# Read the crash annotation file into a list with one string per line.
# Later cells index it as lines[video_number - 1]; the recorded output shows 1500 lines.
with open('/content/drive/MyDrive/CarCrash/videos/Crash-1500.txt', 'r') as file:
    lines = file.readlines()
len(lines)

1500

In [None]:
# Sanity check on the first annotation line: pull out the first bracketed
# list and parse it. (`re` and `ast` are imported in the notebook's import cell.)
match = re.search(r'\[.*?\]', lines[0])
# literal_eval only accepts Python literals, so text read from the annotation
# file cannot execute code the way eval() would.
extracted_list = ast.literal_eval(match.group())
extracted_list

0

In [None]:
# assign frame label for each frame
for index, row in df.iterrows():
    if row['Video_ID_int'] <= 1500:
        video_index = row['Video_ID_int'] - 1

        match = re.search(r'\[.*?\]', lines[video_index])
        extracted_list = eval(match.group())

        frame_index = int(row['Frame_Num']) - 1
        df.at[index, 'label'] = extracted_list[frame_index]

    else:
        df.at[index, 'label'] = 0

In [None]:
# Store the labels as integers and count the frames per label value
df['label'] = df['label'].astype(int)
df['label'].value_counts()

0    130783
1     19216
Name: label, dtype: int64

In [None]:
# Reload the feature file of positive video 000001 and show the first element of its 'det' entry.
# NOTE(review): rows look like [x1, y1, x2, y2, ...] bounding boxes — confirm against the dataset docs.
data = np.load('/content/drive/MyDrive/CarCrash/vgg16_features/positive/000001.npz')
data['det'][0]

array([[5.670e+02, 3.090e+02, 6.400e+02, 3.690e+02, 0.000e+00, 0.000e+00],
       [1.158e+03, 4.170e+02, 1.278e+03, 5.810e+02, 0.000e+00, 0.000e+00],
       [9.740e+02, 8.000e+00, 1.105e+03, 1.290e+02, 0.000e+00, 0.000e+00],
       [3.610e+02, 3.160e+02, 4.240e+02, 3.590e+02, 0.000e+00, 0.000e+00],
       [7.000e+00, 4.410e+02, 1.630e+02, 5.950e+02, 0.000e+00, 0.000e+00],
       [1.081e+03, 2.620e+02, 1.275e+03, 4.090e+02, 0.000e+00, 0.000e+00],
       [2.100e+01, 2.650e+02, 2.480e+02, 4.260e+02, 0.000e+00, 0.000e+00],
       [7.000e+00, 7.700e+01, 1.660e+02, 2.450e+02, 0.000e+00, 0.000e+00],
       [3.610e+02, 3.160e+02, 4.240e+02, 3.580e+02, 0.000e+00, 1.000e+00],
       [2.700e+01, 1.670e+02, 3.380e+02, 7.180e+02, 0.000e+00, 1.000e+00],
       [8.420e+02, 2.630e+02, 1.231e+03, 5.860e+02, 0.000e+00, 1.000e+00],
       [5.670e+02, 3.090e+02, 6.400e+02, 3.680e+02, 0.000e+00, 1.000e+00],
       [3.610e+02, 3.160e+02, 4.240e+02, 3.580e+02, 0.000e+00, 2.000e+00],
       [8.420e+02, 2.630e

In [None]:
def calculate_bbox_properties(x1, y1, x2, y2):
    c1 = (x1 + x2) / 2
    c2 = (y1 + y2) / 2
    h = abs(y2 - y1)
    w = abs(x2 - x1)