# Video Data Preprocessing 

## 0. Package Import

In [None]:
import pandas as pd
from trim_videos import trim_videos
from generate_neg_samples import generate_samples

## 1. Convert txt annotations to csv
   - Input: Original AVE dataset annotations in `../data_samples/dataset info/Annotations.txt` 
   - Output: `ave_annotations_preprocessed.csv` with columns: Category, VideoID, Quality, StartTime, EndTime, Duration
   - Purpose: Makes the annotations more accessible and easier to work with in Python

In [None]:
# Field names, in the order the '&'-separated values appear on each line
field_names = ['Category', 'VideoID', 'Quality', 'StartTime', 'EndTime']

# One dictionary per successfully parsed annotation line
annotations = []

with open('../data_samples/dataset info/Annotations.txt', 'r') as handle:
    next(handle)  # the first line is the header, not data
    for raw_line in handle:
        fields = raw_line.strip().split('&')
        # Lines that do not carry exactly 5 fields are ignored
        if len(fields) != 5:
            continue
        record = dict(zip(field_names, fields))
        # Times are stored as text in the file; keep them numeric in memory
        record['StartTime'] = float(record['StartTime'])
        record['EndTime'] = float(record['EndTime'])
        annotations.append(record)

# Build a DataFrame from the parsed records and preview the first rows
df = pd.DataFrame(annotations)

print(df.head())

In [None]:
# Inspect which distinct labels occur in the Quality column
quality_column = df['Quality']
unique_qualities = quality_column.unique()
print("Unique Quality Values:", unique_qualities)

In [None]:
def preprocess_annotations(input_file: str, output_csv: str) -> None:
    """
    Parse an '&'-separated annotations file and save it as a CSV.

    The first line of the input is treated as a header and skipped. Every
    following line is split on '&'; lines that do not contain exactly 5
    fields (category, video id, quality, start time, end time) are ignored.
    A `Duration` column (EndTime - StartTime) is added to each row.

    The output CSV always contains the header row
    `Category,VideoID,Quality,StartTime,EndTime,Duration`, even when the
    input is empty or contains no valid lines.

    Args:
        input_file (str): Path to the input annotations text file.
        output_csv (str): Path to save the output CSV file.

    Raises:
        ValueError: If a 5-field line has a start or end time that cannot
            be converted to float.
    """
    columns = ['Category', 'VideoID', 'Quality', 'StartTime', 'EndTime', 'Duration']
    rows = []

    with open(input_file, 'r') as file:
        # Skip the header line; the default keeps an empty file from
        # raising StopIteration.
        next(file, None)

        for line in file:
            parts = line.strip().split('&')

            # Ignore blank or malformed lines
            if len(parts) != 5:
                continue

            category, video_id, quality, start_raw, end_raw = parts

            # Convert each time once and reuse it for the duration
            start_time = float(start_raw)
            end_time = float(end_raw)

            rows.append({
                'Category': category,
                'VideoID': video_id,
                'Quality': quality,
                'StartTime': start_time,
                'EndTime': end_time,
                'Duration': end_time - start_time,
            })

    # Passing `columns` explicitly keeps the CSV header stable even when
    # `rows` is empty.
    df = pd.DataFrame(rows, columns=columns)

    df.to_csv(output_csv, index=False)
    print(f"Preprocessed data saved to: {output_csv}")

In [None]:
# Export the parsed annotations to a CSV file so they are easier to reuse
input_file, output_csv = (
    "../data_samples/dataset info/Annotations.txt",  # '&'-separated source file
    "ave_annotations_preprocessed.csv",  # Output CSV path
)
preprocess_annotations(input_file=input_file, output_csv=output_csv)


## 2. Trim videos according to annotations
The annotations contain the `StartTime` and `EndTime` of the YouTube videos used for AVE training; however, the videos in the given dataset are not trimmed.

We saved the trimmed clips in `../data_samples/dataset info/trimmed_clips.zip`.

In [None]:
# NOTE(review): this cell is left commented out, presumably because the trimmed
# clips are already provided in `../data_samples/dataset info/trimmed_clips.zip`.
# Uncomment the lines below to regenerate the clips from the full videos.
# df = pd.read_csv("ave_annotations_preprocessed.csv")
# video_dir = "../original_videos"  # Folder with downloaded YouTube videos
# output_dir = "../data_samples/trimmed_clips"   # Folder to save trimmed segments

# trim_videos(df, video_dir, output_dir)

## 3. Generate synthetic bad samples using existing videos
To create a balanced dataset with both positive (aligned) and negative (misaligned) samples, we used the original videos and the preprocessed annotations to:
    - Generate video samples in the `generated_samples` directory
    - Save metadata in `generated_samples_metadata.csv`
  
Types of misalignments we used:
   - Time shift: Audio delay relative to video
   - Noise: Added white noise to audio
   - Mute: Removed audio track
   - Distort: Audio waveform distortion


In [None]:
# Reload the preprocessed annotations written by the earlier step
df = pd.read_csv("ave_annotations_preprocessed.csv")

# Input/output locations handed to the sample generator
sample_settings = {
    "video_dir": "../original_videos",  # Full videos (not pre-trimmed)
    "output_dir": "../data_samples/generated_samples",
    "output_csv": "generated_samples_metadata.csv",
}
generate_samples(df, **sample_settings)