# Imports

In [1]:
import os
import glob
import pandas as pd
from pathlib import Path

from config.constants import PATH
from utils.file_handler import get_file_path
from utils.duration_converter import convert_duration

In [2]:
# Destination of the consolidated video file: <parent of the working dir>/<PATH.data>/videos.json.
# os.path.join is used (rather than Path's "/" operator) so the path string is passed
# to get_file_path exactly as composed.
# NOTE(review): get_file_path is a project helper; presumably it returns the final
# file path (and may create the directory) — confirm in utils.file_handler.
project_root = Path.cwd().parent
data_dir = os.path.join(project_root, PATH.data)
output_path = get_file_path(data_dir, "videos.json")

# Load Data

In [3]:
# Collect every raw JSON export. glob returns matches in arbitrary,
# filesystem-dependent order, so sort them to keep the row order of the
# combined frame (and of the saved output) reproducible across runs and machines.
files = sorted(glob.glob(os.path.join(PATH.raw_data, "*.json")))
if not files:
    # Fail early with a clear message; pd.concat([]) would otherwise raise an
    # opaque "No objects to concatenate" error.
    raise FileNotFoundError(f"No raw JSON files found in {PATH.raw_data!r}")

# [1:] skips the first record of each file.
# NOTE(review): presumably that record is a header/metadata entry rather than
# a video — confirm against the raw export format.
df = pd.concat([pd.read_json(f)[1:] for f in files], ignore_index=True)
print(f'Total videos: {len(df)}')

Total videos: 344802


# Format and Save to JSON

In [4]:
def _as_dict(record, key):
    """Return ``record[key]`` when it is a dict, otherwise an empty dict.

    Nested sections that are absent or hold a non-dict placeholder (presumably
    NaN for rows whose source record lacked the section) are treated as empty,
    so callers can chain ``.get`` safely.
    """
    value = record.get(key)
    return value if isinstance(value, dict) else {}


def _as_count(value, default=-1):
    """Coerce a statistics counter to ``int``.

    Returns ``default`` when the value is missing or cannot be parsed as an
    integer, so every counter column holds a single numeric type instead of a
    mix of raw values and the integer sentinel.
    NOTE(review): counters are presumably strings as returned by the YouTube
    Data API — confirm against the raw files.
    """
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        return default


videos = []

# to_dict("records") yields one plain dict per row; this avoids building a
# pandas Series for every row, which is what makes iterrows() slow at this scale.
for row in df.to_dict("records"):
    # Nested API sections, each guaranteed to be a dict (possibly empty).
    snippet = _as_dict(row, "snippet")
    content_details = _as_dict(row, "contentDetails")
    status = _as_dict(row, "status")
    statistics = _as_dict(row, "statistics")
    topic_details = _as_dict(row, "topicDetails")
    file_details = _as_dict(row, "fileDetails")
    paid_details = _as_dict(row, "paidProductPlacementDetails")

    # Only call the converter when a duration is actually present.
    raw_duration = content_details.get("duration")

    video = {
        "id": row.get("id"),
        "fetchedAt": row.get("fetched_at"),

        "title": snippet.get("title"),
        "description": snippet.get("description"),
        "publishedAt": snippet.get("publishedAt"),
        "channelTitle": snippet.get("channelTitle"),
        "tags": snippet.get("tags", []),
        "categoryId": snippet.get("categoryId"),
        "defaultLanguage": snippet.get("defaultLanguage"),

        "duration": convert_duration(raw_duration) if raw_duration else None,
        "dimension": content_details.get("dimension"),
        "definition": content_details.get("definition"),
        "caption": content_details.get("caption"),
        "licensedContent": content_details.get("licensedContent"),

        "madeForKids": status.get("madeForKids"),
        "privacyStatus": status.get("privacyStatus"),

        # Statistics: always integers, -1 marks a missing/unparseable counter.
        "viewCount": _as_count(statistics.get("viewCount")),
        "likeCount": _as_count(statistics.get("likeCount")),
        "dislikeCount": _as_count(statistics.get("dislikeCount")),
        "favoriteCount": _as_count(statistics.get("favoriteCount")),
        "commentCount": _as_count(statistics.get("commentCount")),

        "hasPaidProductPlacement": paid_details.get("hasPaidProductPlacement", False),

        "topicIds": topic_details.get("topicIds", []),
        "relevantTopicIds": topic_details.get("relevantTopicIds", []),
        "topicCategories": topic_details.get("topicCategories", []),

        "fileName": file_details.get("fileName"),
        "fileSize": file_details.get("fileSize"),
        "fileType": file_details.get("fileType"),
    }

    videos.append(video)

# Build the flat frame once from the list of records.
df_videos = pd.DataFrame(videos)

# NOTE(review): if pandas parsed `fetched_at` as datetimes on load, to_json's
# default date handling writes "fetchedAt" as epoch milliseconds while
# "publishedAt" stays a string — confirm that is intended.
df_videos.to_json(
    output_path,
    orient="records",
    indent=2
)
print(f"Saved {len(df_videos)} videos to {output_path}")

Saved 344802 videos to /home/cs/grad/mazumdmm/Masud/YouTube Project/YouTube-4SE/./data/videos.json
