In [3]:
import json
import os

import ffmpeg
import pandas as pd
from IPython.display import Video, display

In [4]:
# Function to cut and save video segments
def cut_video_segment(video_path, start_time, end_time, output_path):
    """Cut one time span out of a video and re-encode it with libx264.

    Failures are printed instead of raised so that a batch run can carry on
    with the next segment.

    Args:
        video_path: Path of the source video.
        start_time: Segment start, forwarded to ffmpeg as the input ``ss``
            option (presumably seconds -- confirm against the annotations).
        end_time: Segment end, forwarded to ffmpeg as the input ``to`` option.
        output_path: Destination file; an existing file is overwritten.

    Returns:
        bool: True if ffmpeg finished without raising, False otherwise.
    """
    try:
        print(f"Attempting to load video: {video_path}")
        (
            ffmpeg
            .input(video_path, ss=start_time, to=end_time)
            # vcodec, not codec: a bare -codec applies to every output stream,
            # and libx264 is rejected as an encoder for audio streams.
            .output(output_path, vcodec='libx264')
            # Capture stderr so ffmpeg's own diagnostics can be shown on failure.
            .run(overwrite_output=True, capture_stderr=True)
        )
    except ffmpeg.Error as e:
        # ffmpeg.Error carries the captured stderr as bytes.
        detail = e.stderr.decode(errors='replace').strip() if e.stderr else e
        print(f"Error loading or processing video {video_path}: {detail}")
        return False
    except Exception as e:
        # E.g. the ffmpeg executable is missing; keep the best-effort behaviour.
        print(f"Error loading or processing video {video_path}: {e}")
        return False

    print(f"Video segment saved: {output_path}")
    return True

# Function to generate the "name" column with a custom unique identifier
def generate_name_column(df):
    """Build one name per row in the form ``<video_id>_<label>_<n>``.

    ``n`` starts at 1 and counts how many rows with the same
    (video_id, label) pair have been seen so far. Counting per pair, rather
    than per run of adjacent rows, means two rows of the same pair never get
    the same name, even when the frame is not sorted. Spaces in the label are
    replaced with underscores.

    Args:
        df: DataFrame with 'video_id' and 'label' columns; labels must be str.

    Returns:
        list[str]: The generated names, in row order.
    """
    # An empty frame (possibly without any columns) simply has no names.
    if df.empty:
        return []

    occurrences = {}
    names = []
    for video_id, label in zip(df['video_id'], df['label']):
        pair = (video_id, label)
        occurrences[pair] = occurrences.get(pair, 0) + 1
        names.append(f"{video_id}_{label.replace(' ', '_')}_{occurrences[pair]}")

    return names


# Load the annotation JSON file
json_file = '/home/nele_pauline_suffo/projects/mmaction2/data/quantex_share/anno_train.json'
# Decode explicitly as UTF-8 instead of relying on the platform default encoding.
with open(json_file, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Folder holding the source videos, and folder that receives the cut segments
original_video_folder = '/home/nele_pauline_suffo/ProcessedData/videos_superannotate/'  # Update this path
output_folder = '/home/nele_pauline_suffo/ProcessedData/videos_superannotate/feature_split'  # Update this path

In [8]:
# Flatten the annotation dict into one record per annotated segment.
# Each value of `data` provides an 'annotations' list whose items carry a
# two-element 'segment' (start at index 0, end at index 1) and a 'label'.
records = [
    {
        'video_id': vid,
        'segment_start': ann['segment'][0],
        'segment_end': ann['segment'][1],
        'label': ann['label'],
    }
    for vid, entry in data.items()
    for ann in entry['annotations']
]

# Drop exact duplicates, then order by video and label so that segments of
# the same (video_id, label) pair sit next to each other, with a fresh index.
df = (
    pd.DataFrame(records)
    .drop_duplicates()
    .sort_values(by=['video_id', 'label'])
    .reset_index(drop=True)
)

# Attach the per-segment output name derived from video_id, label and a counter
df['name'] = generate_name_column(df)

# Show the resulting table
print(df)

  video_id  segment_start  segment_end                label  \
0   147984       0.000000   606.639367  Playing with Object   
1   183537      71.043709   632.066262  Playing with Object   
2   183537     665.281219   811.122892  Playing with Object   
3   183537    1257.072451  1259.662183  Playing with Object   
4   183537    1332.110401  1346.907723  Playing with Object   
5   189224      26.551535   699.767240              Drawing   

                           name  
0  147984_Playing_with_Object_1  
1  183537_Playing_with_Object_1  
2  183537_Playing_with_Object_2  
3  183537_Playing_with_Object_3  
4  183537_Playing_with_Object_4  
5              189224_Drawing_1  


In [None]:
# Cut every annotated segment out of its source video and preview each result.
# Make sure the destination folder exists before ffmpeg is asked to write into it.
os.makedirs(output_folder, exist_ok=True)

saved_count = 0
for index, row in df.iterrows():
    video_id = row['video_id']
    label = row['label'].replace(" ", "_")  # Replace spaces with underscores
    start_time = row['segment_start']
    end_time = row['segment_end']
    # Source files are looked up with an upper-case .MP4 extension.
    original_video_path = os.path.join(original_video_folder, f"{video_id}.MP4")
    output_video_path = os.path.join(output_folder, f"{row['name']}.mp4")

    print(f"Processing segment {start_time}-{end_time} of video {video_id}, label: {label}, output: {output_video_path}")

    # Skip early with a clear message instead of letting ffmpeg fail on a missing input.
    if not os.path.isfile(original_video_path):
        print(f"Skipping segment {start_time}-{end_time}: source video not found: {original_video_path}")
        continue

    cut_video_segment(original_video_path, start_time, end_time, output_video_path)

    # cut_video_segment prints errors rather than raising, so check that the
    # output file is actually there before claiming success or previewing it.
    if not os.path.isfile(output_video_path):
        print(f"Segment {start_time}-{end_time} of video {video_id} was not written to {output_video_path}")
        continue

    print(f"Saved segment {start_time}-{end_time} of video {video_id} as {output_video_path}")
    saved_count += 1

    # Display the output video in Jupyter Notebook
    display(Video(output_video_path))

if saved_count == len(df):
    print("All segments have been processed and saved.")
else:
    print(f"Processed {len(df)} segments: {saved_count} saved, {len(df) - saved_count} skipped or failed.")