# Video Data Preprocessing 

## 0. Package Import

In [None]:
import pandas as pd
from trim_videos import trim_videos
from generate_neg_samples import generate_bad_samples

## 1. Convert txt annotations to csv
The original annotations are stored in a `&`-delimited txt file, which is awkward to work with, so we convert it to a CSV file.

In [21]:
# Parse the '&'-delimited annotation file into a list of row dicts.
annotations = []

# An explicit encoding keeps parsing independent of the platform's default locale.
with open('../data_samples/dataset info/Annotations.txt', 'r', encoding='utf-8') as file:
    # Skip the header line; the default avoids StopIteration if the file is empty.
    next(file, None)
    for line in file:
        # Split the line into components
        parts = line.strip().split('&')
        if len(parts) == 5:  # Lines without exactly 5 fields are ignored
            category, video_id, quality, start_time, end_time = parts
            annotations.append({
                'Category': category,
                'VideoID': video_id,
                'Quality': quality,
                'StartTime': float(start_time),
                'EndTime': float(end_time)
            })

# Naming the columns explicitly keeps them present even when no rows were parsed,
# so later column lookups such as df['Quality'] do not raise KeyError.
df = pd.DataFrame(
    annotations,
    columns=['Category', 'VideoID', 'Quality', 'StartTime', 'EndTime'],
)

print(df.head())

      Category      VideoID Quality  StartTime  EndTime
0  Church bell  RUhOCu3LNXM    good        0.0     10.0
1  Church bell  MH3m4AwEcRY    good        6.0      8.0
2  Church bell  2An2JMGMbbs    good        0.0     10.0
3  Church bell  WM5R44UMLq0    good        0.0     10.0
4  Church bell  5wHd0VafUAA    good        0.0     10.0


In [22]:
# List the distinct labels found in the 'Quality' column.
unique_qualities = pd.unique(df['Quality'])
print("Unique Quality Values:", unique_qualities)

Unique Quality Values: ['good']


In [23]:
def preprocess_annotations(input_file: str, output_csv: str) -> None:
    """
    Parse the AVE dataset annotations file and save it as a CSV.

    The first line of the input is treated as a header and skipped. Every
    following non-blank line is split on '&' and must yield exactly five
    fields: category, video id, quality, start time and end time. Lines with
    a different number of fields are skipped and counted; the count is
    printed if it is non-zero. The output CSV has the columns Category,
    VideoID, Quality, StartTime, EndTime and Duration (EndTime - StartTime),
    and the header row is written even when no data rows were parsed.

    Args:
        input_file (str): Path to the input `annotations.txt` file.
        output_csv (str): Path to save the output CSV file.

    Raises:
        OSError: If the input file cannot be opened or the CSV cannot be written.
        ValueError: If a start or end time field cannot be converted to float.
    """
    columns = ['Category', 'VideoID', 'Quality', 'StartTime', 'EndTime', 'Duration']
    annotations = []
    skipped_lines = 0

    # An explicit encoding keeps parsing independent of the platform's default locale.
    with open(input_file, 'r', encoding='utf-8') as file:
        # Skip the header line; the default avoids StopIteration on an empty file.
        next(file, None)

        for line in file:
            stripped = line.strip()
            if not stripped:
                # Blank lines carry no annotation and are not reported as malformed.
                continue

            parts = stripped.split('&')
            if len(parts) != 5:
                skipped_lines += 1
                continue

            category, video_id, quality, start_text, end_text = parts

            # Convert once and reuse the values for both the row and the duration.
            start_time = float(start_text)
            end_time = float(end_text)

            annotations.append({
                'Category': category,
                'VideoID': video_id,
                'Quality': quality,
                'StartTime': start_time,
                'EndTime': end_time,
                'Duration': end_time - start_time
            })

    # Passing the column list guarantees a header row even with zero parsed rows,
    # so the resulting CSV can always be loaded back with pd.read_csv.
    df = pd.DataFrame(annotations, columns=columns)

    # Save to CSV
    df.to_csv(output_csv, index=False)
    if skipped_lines:
        print(f"Skipped {skipped_lines} malformed line(s) in: {input_file}")
    print(f"Preprocessed data saved to: {output_csv}")

In [24]:
# Write the parsed annotations to a CSV file for easier use.
output_csv = "ave_annotations_preprocessed.csv"  # destination CSV path
input_file = "../data_samples/dataset info/Annotations.txt"  # raw annotation file
preprocess_annotations(input_file, output_csv)


Preprocessed data saved to: ave_annotations_preprocessed.csv


## 2. Trim videos according to annotations
The annotations contain the `StartTime` and `EndTime` of the YouTube videos used for AVE training; however, the videos in the given dataset are not trimmed. For training and testing purposes, we need to trim each video to the good section given by its annotation.

In [25]:
# Input and output folders for the trimming step.
video_dir = "../original_videos"  # downloaded YouTube videos
output_dir = "../data_samples/trimmed_clips"  # trimmed segments are saved here

# Annotations produced by the preprocessing step above.
df = pd.read_csv("ave_annotations_preprocessed.csv")

trim_videos(df, video_dir, output_dir)

Successfully trimmed: ../data_samples/trimmed_clips/RUhOCu3LNXM_0.0_10.0.mp4
Successfully trimmed: ../data_samples/trimmed_clips/MH3m4AwEcRY_6.0_8.0.mp4
Successfully trimmed: ../data_samples/trimmed_clips/2An2JMGMbbs_0.0_10.0.mp4
Successfully trimmed: ../data_samples/trimmed_clips/WM5R44UMLq0_0.0_10.0.mp4
Successfully trimmed: ../data_samples/trimmed_clips/5wHd0VafUAA_0.0_10.0.mp4
Successfully trimmed: ../data_samples/trimmed_clips/AiJTCFtN0BY_1.0_4.0.mp4
Successfully trimmed: ../data_samples/trimmed_clips/Zlwu4AROYzg_0.0_10.0.mp4
Successfully trimmed: ../data_samples/trimmed_clips/gjCew0Kp0iM_0.0_10.0.mp4
Successfully trimmed: ../data_samples/trimmed_clips/eRF4KAXdn0w_6.0_10.0.mp4
Successfully trimmed: ../data_samples/trimmed_clips/3IQHzT5O89g_0.0_10.0.mp4
Successfully trimmed: ../data_samples/trimmed_clips/Hhqvvc4qu2Y_0.0_6.0.mp4
Successfully trimmed: ../data_samples/trimmed_clips/Gavpgq1WWh4_0.0_2.0.mp4
Successfully trimmed: ../data_samples/trimmed_clips/bzWIuMC8Vl8_0.0_10.0.mp4
Suc

## 3. Generate synthetic bad samples from existing videos

In [None]:
# Paths for the sample-generation step: the trimmed clips are the input videos.
video_dir = "../data_samples/trimmed_clips"
output_dir = "../data_samples/generated_samples"
output_csv = "generated_samples_metadata.csv"

# Reload the preprocessed annotations so this cell does not depend on earlier state.
df = pd.read_csv("ave_annotations_preprocessed.csv")

generate_bad_samples(df, video_dir, output_dir, output_csv)