In [1]:
import pandas as pd

In [2]:
# Accumulates one record (dict) per well-formed annotation row
annotations = []

# The annotation file is '&'-delimited text with a single header row
with open('data_samples/dataset info/Annotations.txt', 'r') as handle:
    next(handle)  # discard the header row before reading data rows
    for raw_line in handle:
        fields = raw_line.strip().split('&')
        if len(fields) != 5:
            # Rows without exactly five fields are ignored
            continue
        label, vid, grade, t_start, t_end = fields
        record = {
            'Category': label,
            'VideoID': vid,
            'Quality': grade,
            'StartTime': float(t_start),
            'EndTime': float(t_end)
        }
        annotations.append(record)

# Build the tabular view from the collected records
df = pd.DataFrame(annotations)

print(df.head())

      Category      VideoID Quality  StartTime  EndTime
0  Church bell  RUhOCu3LNXM    good        0.0     10.0
1  Church bell  MH3m4AwEcRY    good        6.0      8.0
2  Church bell  2An2JMGMbbs    good        0.0     10.0
3  Church bell  WM5R44UMLq0    good        0.0     10.0
4  Church bell  5wHd0VafUAA    good        0.0     10.0


In [3]:
# Which distinct values does the Quality column take?
unique_qualities = df.loc[:, 'Quality'].unique()
print("Unique Quality Values:", unique_qualities)

Unique Quality Values: ['good']


In [5]:
def preprocess_annotations(input_file: str, output_csv: str) -> None:
    """
    Parse the AVE dataset annotations file and save as a CSV.

    The input is read as text with one header line followed by rows of five
    '&'-separated fields: category, video id, quality, start time, end time.
    Rows that do not have exactly five fields (including blank lines) are
    skipped. The output CSV always carries the full header, even when no
    rows were parsed.

    Args:
        input_file (str): Path to the input `annotations.txt` file.
        output_csv (str): Path to save the output CSV file.

    Raises:
        ValueError: If a start or end time cannot be converted to float.
    """
    # Fixed column order; also guarantees a header when no rows are parsed
    columns = ['Category', 'VideoID', 'Quality', 'StartTime', 'EndTime', 'Duration']

    # Initialize a list to store parsed data
    annotations = []

    # Explicit encoding keeps parsing independent of the platform default
    with open(input_file, 'r', encoding='utf-8') as file:
        # Skip the header line; the default avoids StopIteration on an empty file
        next(file, None)

        for line in file:
            # Split each line into components
            parts = line.strip().split('&')

            # Ignore rows that do not have all 5 fields
            if len(parts) != 5:
                continue

            category, video_id, quality, start_time, end_time = parts

            # Convert each timestamp once and reuse it for the duration
            start = float(start_time)
            end = float(end_time)

            annotations.append({
                'Category': category,
                'VideoID': video_id,
                'Quality': quality,
                'StartTime': start,
                'EndTime': end,
                'Duration': end - start
            })

    # Convert to DataFrame with an explicit schema
    df = pd.DataFrame(annotations, columns=columns)

    # Save to CSV
    df.to_csv(output_csv, index=False)
    print(f"Preprocessed data saved to: {output_csv}")

In [6]:
# Source annotations file and destination CSV for the preprocessing run
input_file = "data_samples/dataset info/Annotations.txt"
output_csv = "ave_annotations_preprocessed.csv"  # Output CSV path
preprocess_annotations(input_file=input_file, output_csv=output_csv)


Preprocessed data saved to: ave_annotations_preprocessed.csv
