In [1]:
# Analysis: find overlapping video URLs between test and train clean files
import pandas as pd
from urllib.parse import urlparse, parse_qsl, urlunparse


def normalize_url(u: str) -> str:
    if pd.isna(u):
        return ''
    s = str(u).strip()
    # remove common tracking/query params by parsing
    try:
        p = urlparse(s)
        # keep path and netloc, drop query but keep fragment if small
        cleaned = urlunparse((p.scheme, p.netloc, p.path, '', '', ''))
        return cleaned.lower().rstrip('/')
    except Exception:
        return s.lower().rstrip('/')

# paths
train_path = 'data/datatrain_clean.csv'
test_path = 'data/datatest.csv'
out_overlap = 'data/overlap_test_vs_train.csv'

# read
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

# normalize
train['video_norm'] = train['video'].apply(normalize_url)
test['video_norm'] = test['video'].apply(normalize_url)

# find overlap
merged = test.merge(train[['video_norm', 'id', 'emotion_clean']], on='video_norm', how='left', suffixes=('_test', '_train'))
overlap = merged[~merged['emotion_clean'].isna()].copy()

# save
overlap.to_csv(out_overlap, index=False)

# summary
print('Test rows:', len(test))
print('Train rows:', len(train))
print('Overlapping rows found:', len(overlap))

if len(overlap) > 0:
    print('\nSample overlaps:')
    print(overlap[['id_test','video','id_train','emotion_clean']].head().to_string(index=False))
else:
    print('No overlap found')


Test rows: 200
Train rows: 769
Overlapping rows found: 12

Sample overlaps:
 id_test                                                                                         video  id_train emotion_clean
      59            https://drive.google.com/file/d/1UI8IJ19DQkZuTaUtLVtAI19jGMIDJuO_/view?usp=sharing     240.0      Surprise
      61         https://drive.google.com/file/d/1c_1boySTaI5v-wbdEknkClIcsC7emkyC/view?usp=share_link     260.0      Surprise
     108            https://drive.google.com/file/d/1PMDBY0C5oekMhFDm2KcR2jLjdgonZ2Bi/view?usp=sharing     438.0         Proud
     109            https://drive.google.com/file/d/1ElvqnnKfMLU8pZyKKs_CMxwGts2O770v/view?usp=sharing     456.0      Surprise
     116 https://www.instagram.com/reel/DMh3m5rvW3x/?utm_source=ig_web_copy_link&igsh=MzRlODBiNWFlZA==     496.0      Surprise
