In [None]:
import requests
import json

# Exchange the credentials stored in spiderverse_config.json for an API token.
# `token` is read by the later cells that build the request headers.
token_url = "http://spiderverse-va.byteintl.net/openapi/token"
with open("spiderverse_config.json", "r") as f:
    data = json.load(f)

# A timeout keeps the cell from hanging forever on an unreachable service.
resp = requests.post(token_url, data, timeout=30)
# Surface HTTP errors here instead of as a confusing KeyError/JSON error below.
resp.raise_for_status()
token = resp.json()["token"]

# Do not echo the token itself: cell outputs are saved with the notebook.
print("Token acquired.")

In [None]:
def parse_url(url):
    """Normalise a YouTube watch URL and extract its video id.

    Everything from the first ``&`` onwards is dropped, and the video id is
    the text that follows ``watch?v=`` in the remaining URL.

    Args:
        url: The raw video URL.

    Returns:
        A ``(pruned_url, video_id)`` tuple. ``(None, None)`` is returned when
        ``url`` is not a string, does not start with
        ``https://www.youtube.com/``, or contains no ``watch?v=`` part, so
        callers can always unpack the result into two names.
    """
    if not isinstance(url, str) or not url.startswith("https://www.youtube.com/"):
        return None, None

    # Keep only the part before the first "&" (extra query parameters).
    pruned_url = url.split("&", 1)[0]

    parts = pruned_url.split("watch?v=")
    if len(parts) < 2:
        # A YouTube URL that is not a watch link (no id to extract).
        return None, None
    return pruned_url, parts[1]

In [None]:
# download to tos
# For every annotation file, POST one seed per YouTube video to the spider
# submit endpoint, skipping videos already reported by list_all_objects.

import time

from mmagent.utils.tos import list_all_objects

spider_url = "http://spiderverse-va.byteintl.net/openapi/submit/stream/seed"
headers = {"token": f"{token}", "Content-Type": "application/json"}

annotations_paths = [
    # "data/annotations/raw/CZ_4_refined.json",
    # "data/annotations/raw/CZ_5_refined.json",
    # "data/annotations/raw/ZZ_6_refined.json",
    # "data/annotations/raw/CZ_6_refined.json",
    # "data/annotations/raw/CZ_7_refined.json",
    # "data/annotations/raw/ZZ_7_refined.json",
    # "data/annotations/raw/ZZ_8_refined.json",
    "data/annotations/raw/ZZ_9_refined.json"
]

# Number of requests sent before pausing for one second.
qps = 5

for annotations_path in annotations_paths:
    # Derive the marker (e.g. "ZZ_9") from ".../ZZ_9_refined.json".
    # str.strip("_refined") removes a *set of characters* from both ends, not
    # the suffix, so the suffix is cut explicitly instead.
    marker = annotations_path.split("/")[-1].split(".")[0]
    if marker.endswith("_refined"):
        marker = marker[: -len("_refined")]
    target_dir = marker + "/"
    with open(annotations_path, "r") as f:
        videos = json.load(f)

    all_data = []
    # Get all existing files
    # NOTE(review): the membership test below assumes list_all_objects yields
    # entries comparable with bare video ids - confirm against its implementation.
    downloaded_files = list_all_objects(target_dir)

    for video in videos:
        url = video["video_url"]
        if not url.startswith("https://www.youtube.com/watch?v="):
            continue
        pruned_url, video_id = parse_url(url)
        if video_id in downloaded_files:
            continue
        seed_data = {
            "source_data": video,
            "__custom_args": ["-f", "bestvideo[height=720]+bestaudio"],
            "video_id": video_id,
            "store_key": video_id + ".mp4",
            "dir_name": marker,
            "url": pruned_url,
        }
        runtime_vars = {"key": "value"}
        data = {
            "seedSetId": 1614,
            "seedId": 12563,
            "data": json.dumps(seed_data),
            "runtimeVars": json.dumps(runtime_vars),
        }
        all_data.append(data)

    if len(all_data) == 0:
        print("All data downloaded.")
        # Move on to the next annotation file; `break` here would silently
        # skip every remaining file in annotations_paths.
        continue

    print(f"{marker}: downloading {len(all_data)} videos.")

    for i in range(0, len(all_data), qps):
        batch = all_data[i : i + qps]
        for data in batch:
            resp = requests.post(spider_url, json=data, headers=headers)
            if not resp.ok:
                # Report rejected submissions instead of dropping them silently.
                print(
                    f"{marker}: submit failed with HTTP {resp.status_code}: {resp.text[:200]}"
                )
        if i + qps < len(all_data):  # Don't sleep after the last batch
            time.sleep(1)  # Sleep 1 second between batches to maintain QPS

In [None]:
# download from tos

from mmagent.utils.tos import list_all_objects, download_one_sample
import os
import json
from tqdm import tqdm

# Copy every annotated YouTube video that list_all_objects reports for the
# marker folder, and that is not on disk yet, into base_save_dir/<marker>/.
annotations_paths = [
    # "data/annotations/raw/CZ_4_refined.json",
    # "data/annotations/raw/CZ_5_refined.json",
    # "data/annotations/raw/ZZ_6_refined.json",
    # "data/annotations/raw/CZ_6_refined.json",
    # "data/annotations/raw/CZ_7_refined.json",
    # "data/annotations/raw/ZZ_7_refined.json",
    # "data/annotations/raw/ZZ_8_refined.json",
    # "data/annotations/raw/CZ_3_refined.json",
    "data/annotations/raw/ZZ_9_refined.json"
]
base_save_dir = "/mnt/hdfs/foundation/longlin.kylin/mmagent/data/raw_videos"

for annotations_path in annotations_paths:
    # Derive the marker (e.g. "ZZ_9") from ".../ZZ_9_refined.json".
    # str.strip("_refined") removes a *set of characters* from both ends, not
    # the suffix, so the suffix is cut explicitly instead.
    marker = annotations_path.split("/")[-1].split(".")[0]
    if marker.endswith("_refined"):
        marker = marker[: -len("_refined")]
    target_dir = marker + "/"
    save_dir = os.path.join(base_save_dir, marker)
    os.makedirs(save_dir, exist_ok=True)
    with open(annotations_path, "r") as f:
        videos = json.load(f)

    # Get all existing files
    downloaded_files = list_all_objects(target_dir)

    for video in tqdm(videos):
        url = video["video_url"]
        if not url.startswith("https://www.youtube.com/watch?v="):
            # Show URLs that cannot be handled so they can be fetched manually.
            print(url)
            continue
        pruned_url, video_id = parse_url(url)
        save_file = os.path.join(save_dir, video_id + ".mp4")
        # Only fetch files that are missing locally but listed remotely.
        if not os.path.exists(save_file) and video_id in downloaded_files:
            download_one_sample(save_file, target_dir + video_id + ".mp4")

In [None]:
# check if all videos are downloaded

from mmagent.utils.tos import list_all_objects
import os

# For each annotation file, collect the pruned URLs of videos whose .mp4 is
# not present under base_save_dir/<marker>/ and store them per marker in
# video_to_be_downloaded.
annotations_paths = [
    # "data/annotations/raw/CZ_4_refined.json",
    # "data/annotations/raw/CZ_5_refined.json",
    # "data/annotations/raw/ZZ_6_refined.json",
    # "data/annotations/raw/CZ_6_refined.json",
    # "data/annotations/raw/CZ_7_refined.json",
    # "data/annotations/raw/ZZ_7_refined.json",
    # "data/annotations/raw/ZZ_8_refined.json",
    # "data/annotations/raw/CZ_3_refined.json",
    "data/annotations/raw/ZZ_9_refined.json"
]
base_save_dir = "/mnt/hdfs/foundation/longlin.kylin/mmagent/data/raw_videos"

video_to_be_downloaded = {}

for annotations_path in annotations_paths:
    # Derive the marker (e.g. "ZZ_9") from ".../ZZ_9_refined.json".
    # str.strip("_refined") removes a *set of characters* from both ends, not
    # the suffix, so the suffix is cut explicitly instead.
    marker = annotations_path.split("/")[-1].split(".")[0]
    if marker.endswith("_refined"):
        marker = marker[: -len("_refined")]
    target_dir = marker + "/"
    save_dir = os.path.join(base_save_dir, marker)
    with open(annotations_path, "r") as f:
        videos = json.load(f)

    url_to_be_downloaded = []
    # Get all existing files
    downloaded_files = list_all_objects(target_dir)

    for video in videos:
        url = video["video_url"]
        # Same guard as the other cells: parse_url only yields a usable
        # (url, id) pair for YouTube watch links, so anything else would crash
        # the unpacking below.
        if not url.startswith("https://www.youtube.com/watch?v="):
            print(f"{marker}: skipping unsupported URL: {url}")
            continue
        pruned_url, video_id = parse_url(url)
        # if video_id in downloaded_files:
        #     continue
        save_file = os.path.join(save_dir, video_id + ".mp4")
        if os.path.exists(save_file):
            continue
        url_to_be_downloaded.append(pruned_url)

    if len(url_to_be_downloaded) == 0:
        print("All data downloaded.")
    else:
        print(
            f"{marker} has {len(url_to_be_downloaded)} videos to be downloaded: {url_to_be_downloaded}"
        )

    video_to_be_downloaded[marker] = url_to_be_downloaded

In [None]:
from mmagent.utils.tos import list_all_objects, upload_one_sample
import os
import json
import shutil
from tqdm import tqdm

# For annotated videos missing from base_save_dir/<marker>/, look for a file
# in the supplementary folder whose name starts with the video id and upload
# it under "<marker>/<video_id>.mp4" via a temporary local copy.
annotations_paths = [
    # "data/annotations/raw/CZ_4_refined.json",
    # "data/annotations/raw/CZ_5_refined.json",
    # "data/annotations/raw/ZZ_6_refined.json",
    # "data/annotations/raw/CZ_6_refined.json",
    # "data/annotations/raw/CZ_7_refined.json",
    # "data/annotations/raw/ZZ_7_refined.json",
    # "data/annotations/raw/ZZ_8_refined.json",
    # "data/annotations/raw/CZ_3_refined.json",
    "data/annotations/raw/ZZ_9_refined.json"
]
base_save_dir = "/mnt/hdfs/foundation/longlin.kylin/mmagent/data/raw_videos"
supp_dir = "/mnt/hdfs/foundation/longlin.kylin/mmagent/data/raw_videos/supp_videos"
local_videos = os.listdir(supp_dir)

# shutil.copy(src, "data/temp") writes a *file* called "temp" when the
# directory does not exist, so make sure the staging directory is there.
temp_dir = "data/temp"
os.makedirs(temp_dir, exist_ok=True)

for annotations_path in annotations_paths:
    # Derive the marker (e.g. "ZZ_9") from ".../ZZ_9_refined.json".
    # str.strip("_refined") removes a *set of characters* from both ends, not
    # the suffix, so the suffix is cut explicitly instead.
    marker = annotations_path.split("/")[-1].split(".")[0]
    if marker.endswith("_refined"):
        marker = marker[: -len("_refined")]
    target_dir = marker + "/"
    save_dir = os.path.join(base_save_dir, marker)
    os.makedirs(save_dir, exist_ok=True)
    with open(annotations_path, "r") as f:
        videos = json.load(f)

    # Get all existing files
    downloaded_files = list_all_objects(target_dir)

    for video in tqdm(videos):
        url = video["video_url"]
        if not url.startswith("https://www.youtube.com/watch?v="):
            continue
        pruned_url, video_id = parse_url(url)
        save_file = os.path.join(save_dir, video_id + ".mp4")
        video["path"] = save_file
        if not os.path.exists(save_file):
            # Check if video exists locally first
            local_match = [f for f in local_videos if f.startswith(video_id)]
            if local_match:
                # Stage the first matching file in temp_dir, upload it, and
                # always remove the staged copy, even if the upload raises.
                local_file = os.path.join(supp_dir, local_match[0])
                temp_file = os.path.join(temp_dir, os.path.basename(local_file))
                shutil.copy(local_file, temp_dir)
                try:
                    upload_one_sample(
                        temp_file,
                        obj_key=target_dir + video_id + ".mp4",
                    )
                finally:
                    os.remove(temp_file)

In [None]:
# Submit one hand-written seed to the spider submit endpoint and print the
# JSON body of the reply. Relies on `token` from the token cell.
url = "http://spiderverse-va.byteintl.net/openapi/submit/stream/seed"
headers = {"token": f"{token}", "Content-Type": "application/json"}

seed_data = {
    # This field holds your own data
    "source_data": {
        "video_id": "CZ_1",
        "video_url": "https://www.youtube.com/watch?v=PnvZZwlN2yk",
        "video_duration": "29:04",
        "video_type": "综艺 - 游戏",
        "qa_list": [
            {
                "question": "Is Stewart Thompson a person who pursues a high-quality life?",
                "answer": "Yes.",
                "question_type": "多线索推理,人物属性建模",
                "knowledge": "",
                "reasoning": "从Stewart讲述自己旅行方式倾向于商务舱可以判断。",
            }
        ],
    },
    # This entry is required
    "__custom_args": [
        # Required
        "-f",
        # Format filter; choose it yourselves
        "bestvideo[height=720]+bestaudio",
    ],
    # YouTube video id
    "video_id": "PnvZZwlN2yk",
    # File name to store under
    "store_key": "PnvZZwlN2yk.mp4",
    # Folder to store in
    "dir_name": "test_file",
    # YouTube video link; the format is fixed
    "url": "https://www.youtube.com/watch?v=PnvZZwlN2yk",
}

runtime_vars = {"key": "value"}
# Both nested payloads are sent as JSON-encoded strings inside the JSON body.
# NOTE(review): seedId is 124 here but 12563 in the batch submission cell -
# confirm which one is current.
data = {
    "seedSetId": 1614,
    "seedId": 124,
    "data": json.dumps(seed_data),
    "runtimeVars": json.dumps(runtime_vars),
}

resp = requests.post(url, json=data, headers=headers)
print(resp.json())

In [None]:
from utils.tos import download_one_sample

# One-off fetch of a single test object.
# Argument order follows the batch download cell: local destination path
# first, then the remote object key (presumably a TOS key - verify).
# NOTE(review): this cell imports from `utils.tos` while earlier cells import
# from `mmagent.utils.tos` - confirm which package path is current.
local_path = "data/videos/raw/test/test1.mp4"
object_key = "test_file/Ahrvn7IVgHk.mp4"
download_one_sample(local_path, object_key)

In [None]:
# download from youtube
import yt_dlp
import os
from tqdm import tqdm
import json
from utils.tos import list_all_objects
import os
import json
from tqdm import tqdm

# Download every annotated YouTube video that is not on disk yet with yt-dlp,
# record its local path on the annotation entry, and write the annotations
# back to the same JSON file.
annotations_paths = [
    "data/annotations/CZ_2_refined.json",
    "data/annotations/CZ_3_refined.json",
    "data/annotations/ZZ_4_refined.json",
]
base_save_dir = "/mnt/hdfs/foundation/longlin.kylin/mmagent/data/raw_videos"

for annotations_path in annotations_paths:
    # Derive the marker (e.g. "CZ_2") from ".../CZ_2_refined.json".
    # str.strip("_refined") removes a *set of characters* from both ends, not
    # the suffix, so the suffix is cut explicitly instead.
    marker = annotations_path.split("/")[-1].split(".")[0]
    if marker.endswith("_refined"):
        marker = marker[: -len("_refined")]
    target_dir = marker + "/"
    save_dir = os.path.join(base_save_dir, marker)
    os.makedirs(save_dir, exist_ok=True)
    with open(annotations_path, "r") as f:
        videos = json.load(f)

    # Get all existing files
    downloaded_files = list_all_objects(target_dir)

    for video in tqdm(videos):
        url = video["video_url"]
        if not url.startswith("https://www.youtube.com/watch?v="):
            continue
        pruned_url, video_id = parse_url(url)
        save_file = os.path.join(save_dir, video_id + ".mp4")
        if not os.path.exists(save_file):
            options = {
                "format": "bestvideo[height=720]+bestaudio",
                "outtmpl": save_file,
                "quiet": True,
                "cookiefile": "cookies/www.youtube.com_cookies.txt",  # Path to your cookies.txt
            }
            with yt_dlp.YoutubeDL(options) as ydl:
                try:
                    ydl.download([pruned_url])
                    print(f"Successfully downloaded video to {save_file}")
                except Exception as e:
                    # Best effort: report the failure and continue with the rest.
                    print(f"Error downloading {pruned_url}: {str(e)}")
        # Record the path for files that were already on disk as well, so a
        # re-run over fresh annotations does not lose it (the upload cell sets
        # "path" unconditionally too).
        video["path"] = save_file

    # Overwrites the input annotation file with the path-augmented entries.
    with open(annotations_path, "w") as f:
        json.dump(videos, f)


# with open("data/annotations/video_list_CZ.json", "r") as f:
#     data = json.load(f)

# for video in data:
#     url = video["video_url"]
#     pruned_url, video_id = parse_url(url)
#     if pruned_url is None:
#         video["path"] = None
#     else:
#         video["path"] = f"{destination_folder}{video_id}.mp4"
#         video["video_url"] = pruned_url

# with open("data/annotations/video_list_CZ_modified.json", "w") as f:
#     json.dump(data, f, indent=4)

# # Video links
# urls = [
#     "https://www.youtube.com/watch?v=PnvZZwlN2yk&ab_channel=KevinLangue",
# ]

# # Configure download options
# # options = {
# #     # "format": f"bestvideo[height={resolution}]+bestaudio/best",
# #     "format": f"bestvideo[ext=mp4][vcodec=h264][height={resolution}]+bestaudio[ext=m4a]/best[ext=mp4]",
# #     "merge_output_format": "mp4",  # ensure the final output is MP4
# #     # "outtmpl": destination_folder + "%(title)s.%(ext)s",
# #     "postprocessors": [{"key": "FFmpegVideoConvertor", "preferedformat": "mp4"}],
# #     # "quiet": True,
# # }

# # Create the downloader object and download
# count = 0

# for video in tqdm(data):
#     if video["path"] is not None:
#         options = {
#             "format": f"bestvideo[height={resolution}]+bestaudio",
#             "outtmpl": video["path"],
#             "quiet": True,
#         }
#         with yt_dlp.YoutubeDL(options) as ydl:
#             try:
#                 # Update output template to use the specified path
#                 ydl.download([video["video_url"]])
#                 print(f"Successfully downloaded video to {video['path']}")
#                 count += 1
#             except Exception as e:
#                 print(f"Error downloading {video['video_url']}: {str(e)}")

# print(f"Total downloaded videos: {count}")

In [None]:
import os
from tqdm import tqdm
import json
from tqdm import tqdm

# Build videos_to_be_downloaded: marker -> pruned URLs of annotated YouTube
# videos whose .mp4 is not present under base_save_dir/<marker>/.
# A marker only gets a key once at least one of its videos is missing.
annotations_paths = [
    "data/annotations/CZ_1_refined.json",
    "data/annotations/ZZ_1_refined.json",
    "data/annotations/ZZ_2_refined.json",
    "data/annotations/ZZ_3_refined.json",
]
base_save_dir = "/mnt/hdfs/foundation/longlin.kylin/mmagent/data/raw_videos"

videos_to_be_downloaded = {}

for annotations_path in annotations_paths:
    # Derive the marker (e.g. "ZZ_3") from ".../ZZ_3_refined.json".
    # str.strip("_refined") removes a *set of characters* from both ends, not
    # the suffix, so the suffix is cut explicitly instead.
    marker = annotations_path.split("/")[-1].split(".")[0]
    if marker.endswith("_refined"):
        marker = marker[: -len("_refined")]
    target_dir = marker + "/"
    save_dir = os.path.join(base_save_dir, marker)
    os.makedirs(save_dir, exist_ok=True)
    with open(annotations_path, "r") as f:
        videos = json.load(f)

    for video in tqdm(videos):
        url = video["video_url"]
        if not url.startswith("https://www.youtube.com/watch?v="):
            continue
        pruned_url, video_id = parse_url(url)
        save_file = os.path.join(save_dir, video_id + ".mp4")
        if not os.path.exists(save_file):
            videos_to_be_downloaded.setdefault(marker, []).append(pruned_url)

In [None]:
# A marker is only added to videos_to_be_downloaded when it has at least one
# missing video, so fall back to an empty list instead of raising KeyError.
print(videos_to_be_downloaded.get("ZZ_3", []))

In [None]:
import os
from tqdm import tqdm
import json
from tqdm import tqdm
from utils.video_processing import get_video_info

# Inspect every already-downloaded video and append a line to
# <log_dir>/resolution_error.log for files whose resolution fails the check.
annotations_paths = [
    "data/annotations/CZ_1_refined.json",
    "data/annotations/ZZ_1_refined.json",
    "data/annotations/ZZ_2_refined.json",
    "data/annotations/ZZ_3_refined.json",
]
base_save_dir = "/mnt/hdfs/foundation/longlin.kylin/mmagent/data/raw_videos"
# Read the config through a context manager so the file handle is closed.
with open("configs/processing_config.json", "r") as f:
    processing_config = json.load(f)
log_dir = processing_config["log_dir"]
# Opening the log in append mode fails if its directory does not exist yet.
os.makedirs(log_dir, exist_ok=True)
resolution_log = os.path.join(log_dir, "resolution_error.log")

# NOTE(review): not used in this cell, and it resets the dict built by the
# previous collection cell - consider removing.
videos_to_be_downloaded = {}

for annotations_path in annotations_paths:
    # Derive the marker (e.g. "ZZ_3") from ".../ZZ_3_refined.json".
    # str.strip("_refined") removes a *set of characters* from both ends, not
    # the suffix, so the suffix is cut explicitly instead.
    marker = annotations_path.split("/")[-1].split(".")[0]
    if marker.endswith("_refined"):
        marker = marker[: -len("_refined")]
    target_dir = marker + "/"
    save_dir = os.path.join(base_save_dir, marker)
    os.makedirs(save_dir, exist_ok=True)
    with open(annotations_path, "r") as f:
        videos = json.load(f)

    for video in tqdm(videos):
        url = video["video_url"]
        if not url.startswith("https://www.youtube.com/watch?v="):
            continue
        pruned_url, video_id = parse_url(url)
        save_file = os.path.join(save_dir, video_id + ".mp4")
        if os.path.exists(save_file):
            video_info = get_video_info(save_file)
            # Flags a file only when BOTH dimensions differ from 1280x720.
            # NOTE(review): if any mismatch should count as an error this
            # needs `or` - confirm the intended rule.
            if video_info["height"] != 720 and video_info["width"] != 1280:
                with open(resolution_log, "a") as f:
                    f.write(
                        f"Resolution error detected in {save_file} with resolution of {video_info['width']}x{video_info['height']}\n"
                    )

In [None]:
import gdown

# Download the supplementary video archive from Google Drive.
# Share link, kept for reference only (it was assigned to `url` and then
# immediately overwritten by the uc?id= form below, so it was never used):
#   https://drive.google.com/file/d/1l7oU4-YjZfABmLVTwpK7YKzaxNMMYN5V/view?usp=sharing
url = "https://drive.google.com/uc?id=1l7oU4-YjZfABmLVTwpK7YKzaxNMMYN5V"
output = "data/videos/supp_0531.zip"
gdown.download(url, output, quiet=False)

In [None]:
import os
import shutil
from tqdm import tqdm

# Move every regular file from the source folder into the destination folder.

# Source folder path
source_folder = '/mnt/bn/videonasi18n/longlin.kylin/mmagent/data/videos/supp'

# Destination folder path
destination_folder = '/mnt/hdfs/foundation/longlin.kylin/mmagent/data/raw_videos/supp_videos'

# shutil.move fails when the destination directory is missing, so create it first.
os.makedirs(destination_folder, exist_ok=True)

# Walk every entry in the source folder
for filename in tqdm(os.listdir(source_folder)):
    source_path = os.path.join(source_folder, filename)

    # Move files only; sub-folders are left in place
    if os.path.isfile(source_path):
        destination_path = os.path.join(destination_folder, filename)
        shutil.move(source_path, destination_path)