## Open dataset

In [1]:
import pandas as pd
import os
# Get names of all files in directory
files = os.listdir("core_dataset/annotation/")

# Create an empty list to store rows
rows = []

for file_name in files:
    with open("core_dataset/annotation/" + file_name) as f:
        lines = f.readlines()
    data = [line.strip().split(',') for line in lines]
    rows.extend(data)
    
df = pd.DataFrame(rows, columns=["source_video", "target_video", "source_start", "source_end", "target_start", "target_end"])
df.shape

(9236, 6)

In [2]:
for i in range(len(df)):
    if df["source_video"][i] == df["target_video"][i]:
        # drop 
        df = df.drop(i)
df.reset_index(drop=True, inplace=True)
df.shape

(8708, 6)

In [3]:
df.head()

Unnamed: 0,source_video,target_video,source_start,source_end,target_start,target_end
0,3504e360accbaccb1580befbb441f1019664c2bb.mp4,37b31d607d31a47d347b15dae2b8aa63e57861eb.flv,00:00:03,00:00:05,00:00:33,00:00:35
1,3504e360accbaccb1580befbb441f1019664c2bb.mp4,37b31d607d31a47d347b15dae2b8aa63e57861eb.flv,00:00:03,00:00:05,00:00:41,00:00:45
2,3504e360accbaccb1580befbb441f1019664c2bb.mp4,458db5aa227ae49ceb8bc1bed5f9cf5b4bed63f2.flv,00:00:00,00:00:07,00:00:05,00:00:12
3,3504e360accbaccb1580befbb441f1019664c2bb.mp4,5c5714c0a56fd2a96f99db2f59b0d03659d77cdf.flv,00:00:00,00:00:04,00:00:06,00:00:10
4,3504e360accbaccb1580befbb441f1019664c2bb.mp4,67824b87c0698c499acad123b7498ae17f97bf6d.flv,00:00:00,00:00:06,00:00:00,00:00:06


## Parse time

In [7]:
import time
init_time = time.mktime(time.strptime("00:00:00", "%H:%M:%S"))
df["source_start"] = df["source_start"].apply(lambda x: time.mktime(time.strptime(x, "%H:%M:%S")) - init_time)
df["source_end"] = df["source_end"].apply(lambda x: time.mktime(time.strptime(x, "%H:%M:%S")) - init_time)
df["target_start"] = df["target_start"].apply(lambda x: time.mktime(time.strptime(x, "%H:%M:%S")) - init_time)
df["target_end"] = df["target_end"].apply(lambda x: time.mktime(time.strptime(x, "%H:%M:%S")) - init_time)
df.head()

Unnamed: 0,source_video,target_video,source_start,source_end,target_start,target_end
0,3504e360accbaccb1580befbb441f1019664c2bb.mp4,37b31d607d31a47d347b15dae2b8aa63e57861eb.flv,3.0,5.0,33.0,35.0
1,3504e360accbaccb1580befbb441f1019664c2bb.mp4,37b31d607d31a47d347b15dae2b8aa63e57861eb.flv,3.0,5.0,41.0,45.0
2,3504e360accbaccb1580befbb441f1019664c2bb.mp4,458db5aa227ae49ceb8bc1bed5f9cf5b4bed63f2.flv,0.0,7.0,5.0,12.0
3,3504e360accbaccb1580befbb441f1019664c2bb.mp4,5c5714c0a56fd2a96f99db2f59b0d03659d77cdf.flv,0.0,4.0,6.0,10.0
4,3504e360accbaccb1580befbb441f1019664c2bb.mp4,67824b87c0698c499acad123b7498ae17f97bf6d.flv,0.0,6.0,0.0,6.0


## Find connected components for duplicate videos

In [8]:
import networkx as nx

# Create a graph
G = nx.Graph()

# Add edges to the graph
for _, row in df.iterrows():
    source = (row['source_video'], row['source_start'], row['source_end'])
    target = (row['target_video'], row['target_start'], row['target_end'])
    G.add_edge(source, target)

# Find connected components
connected_components = list(nx.connected_components(G))

# Assign group IDs
group_mapping = {}
for group_id, component in enumerate(connected_components):
    for node in component:
        group_mapping[node] = group_id

# Add group IDs to the original dataframe
df['group'] = df.apply(
    lambda row: group_mapping[(row['source_video'], row['source_start'], row['source_end'])], axis=1
)

In [9]:
df.head()

Unnamed: 0,source_video,target_video,source_start,source_end,target_start,target_end,group
0,3504e360accbaccb1580befbb441f1019664c2bb.mp4,37b31d607d31a47d347b15dae2b8aa63e57861eb.flv,3.0,5.0,33.0,35.0,0
1,3504e360accbaccb1580befbb441f1019664c2bb.mp4,37b31d607d31a47d347b15dae2b8aa63e57861eb.flv,3.0,5.0,41.0,45.0,0
2,3504e360accbaccb1580befbb441f1019664c2bb.mp4,458db5aa227ae49ceb8bc1bed5f9cf5b4bed63f2.flv,0.0,7.0,5.0,12.0,1
3,3504e360accbaccb1580befbb441f1019664c2bb.mp4,5c5714c0a56fd2a96f99db2f59b0d03659d77cdf.flv,0.0,4.0,6.0,10.0,2
4,3504e360accbaccb1580befbb441f1019664c2bb.mp4,67824b87c0698c499acad123b7498ae17f97bf6d.flv,0.0,6.0,0.0,6.0,3


In [10]:
df.group.value_counts()

group
262     598
267     217
209     175
211     164
765     157
       ... 
1274      1
1275      1
1276      1
1277      1
1279      1
Name: count, Length: 1332, dtype: int64

## Add category and group (component) information into dataframe

In [11]:
dirs = os.listdir("../core_dataset/core_dataset/")
video2dir = {}
for dir in dirs:
    videos = os.listdir("../core_dataset/core_dataset/" + dir)
    for video in videos:
        video2dir[video] = dir

In [12]:
from moviepy import *
from tqdm import tqdm

video_lengths = {}

for video_name in tqdm(video2dir.keys()):
    dir = video2dir[video_name]
    video = VideoFileClip("../core_dataset/core_dataset/" + dir + "/" + video_name)
    duration = video.duration
    video_lengths[video_name] = duration
    # close video
    video.close()

100%|██████████| 528/528 [00:27<00:00, 18.90it/s]


In [13]:
df['category'] = df['source_video'].apply(lambda x: video2dir[x] if x in video2dir else None)
df['is_duplicate'] = True
# remove nan values
df = df.dropna()
df.reset_index(drop=True, inplace=True)
df.head()

Unnamed: 0,source_video,target_video,source_start,source_end,target_start,target_end,group,category,is_duplicate
0,3504e360accbaccb1580befbb441f1019664c2bb.mp4,37b31d607d31a47d347b15dae2b8aa63e57861eb.flv,3.0,5.0,33.0,35.0,0,baggio_penalty_1994,True
1,3504e360accbaccb1580befbb441f1019664c2bb.mp4,37b31d607d31a47d347b15dae2b8aa63e57861eb.flv,3.0,5.0,41.0,45.0,0,baggio_penalty_1994,True
2,3504e360accbaccb1580befbb441f1019664c2bb.mp4,458db5aa227ae49ceb8bc1bed5f9cf5b4bed63f2.flv,0.0,7.0,5.0,12.0,1,baggio_penalty_1994,True
3,3504e360accbaccb1580befbb441f1019664c2bb.mp4,5c5714c0a56fd2a96f99db2f59b0d03659d77cdf.flv,0.0,4.0,6.0,10.0,2,baggio_penalty_1994,True
4,3504e360accbaccb1580befbb441f1019664c2bb.mp4,67824b87c0698c499acad123b7498ae17f97bf6d.flv,0.0,6.0,0.0,6.0,3,baggio_penalty_1994,True


## Find non-intersecting parts and insert into dataframe
### Skip non-intersecting parts shorter than 5 seconds
### Divide long non-intersecting parts into subparts of 15 seconds

In [14]:
# Helper function to merge intervals
def merge_intervals(intervals):
    """Merge overlapping or adjacent intervals."""
    sorted_intervals = sorted(intervals, key=lambda x: x[0])  # Sort by start time
    merged = []
    for start, end in sorted_intervals:
        if not merged or merged[-1][1] < start:  # No overlap
            merged.append([start, end])
        else:  # Overlap, merge intervals
            merged[-1][1] = max(merged[-1][1], end)
    return merged

# Step 1: Collect all intervals for each video
intervals = {}
for _, row in df.iterrows():
    for video, start, end in [
        (row['source_video'], row['source_start'], row['source_end']),
        (row['target_video'], row['target_start'], row['target_end'])
    ]:
        if video not in intervals:
            intervals[video] = []
        intervals[video].append([start, end])

# Step 2: Merge intervals for each video
merged_intervals = {video: merge_intervals(ranges) for video, ranges in intervals.items()}

# Step 3: Find gaps in each video
non_overlapping_segments = {}
for video, length in video_lengths.items():
    video_intervals = merged_intervals.get(video, [])
    gaps = []
    current_start = 0
    for start, end in video_intervals:
        if current_start < start:  # Gap before this interval
            gaps.append([current_start, start])
        current_start = max(current_start, end)
    if current_start < length:  # Gap after the last interval
        gaps.append([current_start, length])

    # Step 4: Remove gaps where length < 5 seconds
    filtered_gaps = [gap for gap in gaps if gap[1] - gap[0] >= 5]

    # Only add to dictionary if there are valid gaps
    if filtered_gaps:
        non_overlapping_segments[video] = filtered_gaps



# Output non-overlapping segments
print(non_overlapping_segments)

{'37b31d607d31a47d347b15dae2b8aa63e57861eb.flv': [[0, 33.0], [35.0, 41.0], [45.0, 72.4]], '458db5aa227ae49ceb8bc1bed5f9cf5b4bed63f2.flv': [[0, 5.0], [12.0, 31.08]], '5c5714c0a56fd2a96f99db2f59b0d03659d77cdf.flv': [[0, 6.0], [10.0, 25.39]], '6d1a89c83d554fc6a5e39fcadb172a79baf140fd.mp4': [[0, 7.0], [12.0, 27.56]], '8084216caff6082b4e71ae4bbfe556f28a68485f.flv': [[0, 147.0], [150.0, 237.0], [241.0, 262.76]], 'bb604f57a18455867544e79c2e32bf5583c358d4.flv': [[0, 322.0], [329.0, 423.22]], '46f2e964ae16f5c27fad70d6849c76616fad7502.flv': [[0, 28.0], [161.0, 170.68]], '6171d3d87ae377e497199554033bca96a263277b.mp4': [[0, 131.73]], '0056649006c7a250cb4174f1e924768ba2c9d599.flv': [[0, 21.0], [49.0, 54.0], [58.0, 72.03]], '410ba1c253fb8d4a8ba6e59aa9e39d39cb3b245b.flv': [[0, 18.0]], '682329105c2fa291ceb13eebfe0544fb9a115c0b.flv': [[0, 7.0]], 'ab8bd55b3fefca37c7dc57e973a7cea1ae3555ad.flv': [[0, 7.0]], 'c1e67ad94259a2015304ee0b42323dd48f820309.flv': [[0, 7.0]], 'e9e65688a633e99bd3cc6e4078a2ef0157153c

In [15]:
# Helper function to split a gap into subparts of max length `max_length`
def split_gap(gap, max_length=15):
    start, end = gap
    duration = end - start
    subparts = []

    # If the gap is larger than the max length, split it
    while duration > max_length:
        subparts.append([start, start + max_length])
        start += max_length
        duration -= max_length

    # Add any remaining part that is smaller than max_length
    if duration > 0:
        subparts.append([start, end])

    return subparts

# Step 4: Update gaps splitting larger gaps into subparts of 15 seconds
final_non_overlapping_segments = {}
for video, gaps in non_overlapping_segments.items():
    subparts = []
    for gap in gaps:
        subparts.extend(split_gap(gap, max_length=15))
    final_non_overlapping_segments[video] = subparts

# Output final non-overlapping segments split into subparts of 15 seconds
print(final_non_overlapping_segments)


{'37b31d607d31a47d347b15dae2b8aa63e57861eb.flv': [[0, 15], [15, 30], [30, 33.0], [35.0, 41.0], [45.0, 60.0], [60.0, 72.4]], '458db5aa227ae49ceb8bc1bed5f9cf5b4bed63f2.flv': [[0, 5.0], [12.0, 27.0], [27.0, 31.08]], '5c5714c0a56fd2a96f99db2f59b0d03659d77cdf.flv': [[0, 6.0], [10.0, 25.0], [25.0, 25.39]], '6d1a89c83d554fc6a5e39fcadb172a79baf140fd.mp4': [[0, 7.0], [12.0, 27.0], [27.0, 27.56]], '8084216caff6082b4e71ae4bbfe556f28a68485f.flv': [[0, 15], [15, 30], [30, 45], [45, 60], [60, 75], [75, 90], [90, 105], [105, 120], [120, 135], [135, 147.0], [150.0, 165.0], [165.0, 180.0], [180.0, 195.0], [195.0, 210.0], [210.0, 225.0], [225.0, 237.0], [241.0, 256.0], [256.0, 262.76]], 'bb604f57a18455867544e79c2e32bf5583c358d4.flv': [[0, 15], [15, 30], [30, 45], [45, 60], [60, 75], [75, 90], [90, 105], [105, 120], [120, 135], [135, 150], [150, 165], [165, 180], [180, 195], [195, 210], [210, 225], [225, 240], [240, 255], [255, 270], [270, 285], [285, 300], [300, 315], [315, 322.0], [329.0, 344.0], [344.

In [16]:
new_rows = []
for video in final_non_overlapping_segments:
    # Use the length from the video_lengths dictionary for each video
    length = video_lengths.get(video, 0)

    # Add all intervals for this video (even if it's a gap)
    for gap in final_non_overlapping_segments.get(video, [[0, length]]):  # Use the whole video as one interval if no gaps
        new_rows.append({
            'source_video': video,
            'target_video': None,
            'source_start': gap[0],
            'source_end': gap[1],
            'target_start': None,
            'target_end': None,
            'group': None,  # You can set appropriate group here
            'category': video2dir[video],  # You can set appropriate category here
            'is_duplicate': False  # Mark as not a duplicate
        })

In [17]:
len(new_rows)

1690

In [18]:
df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)
df = df.sample(frac=1, random_state=42)
df.reset_index(inplace=True, drop=True)

  df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)


In [19]:
df.head()

Unnamed: 0,source_video,target_video,source_start,source_end,target_start,target_end,group,category,is_duplicate
0,2e246b45e7dfba0a7cfb2beb557c40d81dc02c99.flv,3ed4f5c0eb04c94353594e8be1a72bcc657e27c7.flv,24.0,25.0,15.0,22.0,210.0,maradona_hand_of_god,True
1,1e2598afd4d6a8728d6c0076354477db59702a5a.flv,,186.0,198.0,,,,the_last_samurai_last_battle,False
2,c6d6d37c73f364e3902407e1da07c8e354f66c13.flv,,677.0,692.0,,,,president_obama_takes_oath,False
3,458db5aa227ae49ceb8bc1bed5f9cf5b4bed63f2.flv,6d1466ebc4de7e5ddb229bde090b5c5acac15c0c.flv,8.0,10.0,4.0,6.0,0.0,baggio_penalty_1994,True
4,09b682c899b0727e9990d8e347cdce3df7c5550e.flv,d2015b438b70f022967713d6f977ebc67a16839e.flv,15.0,25.0,193.0,203.0,71.0,david_beckham_lights_the_olympic_torch,True


## Save the resulting dataframe

In [20]:
df.to_csv("data.csv", index=False)