In [None]:
# This script is adapted from Corpus Approaches to Language in Social Media Online Companion (Di Cristofaro 2023), https://catlism.github.io and licensed under GPLv3.
# Import modules for: working on local folders and files; regular expressions; finding files in a folder; reading JSON files;
# using BeautifulSoup; working with XML files; parsing dates
import os
import re
from glob import glob
import json
from bs4 import BeautifulSoup
from lxml import etree
from datetime import datetime

In [None]:
# Create a regular expression to capture the title of the video preceding the suffixes that yt-dlp appends by default, where:
# [FILENAME].info.json = the JSON file containing the metadata details
# [FILENAME].[LL].srv3 = the XML file containing the subtitles in SRV3 format, where [LL] is the language code
#                        (typically a 2-letter ISO 639-1 code; 1 to 3 letters are accepted)
# The regex reads: capture (group 1) everything up to a literal dot that is followed by either 'info.json' or
# '[LL].srv3' at the very end of the filename. Every dot of the suffix is escaped so that it only matches a literal '.',
# and the trailing '$' prevents files that merely contain such a suffix (e.g. the '[FILENAME].en.srv3.xml' and
# '[FILENAME].en.srv3.txt' outputs of a previous run) from being treated as input files.
filename_filter = re.compile(r"(.*?)\.(info\.json|[A-Za-z]{1,3}\.srv3)$")
# Create an empty list to store all the video titles, in the order in which they are first encountered
unique_filenames_list = []
# List all filenames (containing at least one dot) present in the folder where the script resides
files = glob("*.*")

# For every single filename found in the folder, do:
for single_file in files:
    # Search for the regular expression for capturing metadata and subtitle files in the filename, and store the result
    # in the 'found_filename' variable ('None' when the file is neither a metadata nor a subtitle file)
    found_filename = re.search(filename_filter, single_file)
    # If the filename matches the regular expression, extract the filename without the extensions; then check if the cleaned
    # filename is present in the unique_filenames_list, and if not add it
    if found_filename is not None and found_filename[1] not in unique_filenames_list:
        unique_filenames_list.append(found_filename[1])

In [None]:
# For each unique filename found, build one XML file (metadata + subtitles) and one plain-text file (subtitles only).
# A video is skipped, with a message, when one of its input files or a required field is missing; nothing is written
# for a skipped video because both output files are only created after all the data has been read successfully.
for filename in unique_filenames_list:
    # Recreate the full filenames with extensions, and store each one of them into a single variable.
    # Only the English ('en') subtitles are processed.
    json_file = filename + ".info.json"
    srv3_file = filename + "." + "en.srv3"
    # Create the output filenames using the subtitle filename
    output_xml = srv3_file + ".xml"
    output_txt = srv3_file + ".txt"
    try:
        # Create the XML element <text>, root element of the final output
        text_tag = etree.Element("text")

        print(f"Processing {json_file}")
        # Open the metadata JSON file; the 'with' statement closes it as soon as its contents have been parsed
        with open(json_file, encoding="utf8") as metadata_handle:
            metadata_file = json.load(metadata_handle)
        # Read the `upload_date` datapoint in the format YYYYMMDD, and split the three values
        # for the day, the month, and the year; then assign them to three separate attributes
        date = datetime.strptime(metadata_file["upload_date"], "%Y%m%d")
        text_tag.attrib["date_d"] = str(date.day)
        text_tag.attrib["date_m"] = str(date.month)
        text_tag.attrib["date_y"] = str(date.year)

        # Assign the attribute 'format' with a value of 'srv3' to the <text> element tag
        text_tag.attrib["format"] = "srv3"

        # Add additional metadata as selected by the class. Datapoints read with square brackets are required (a missing
        # one makes the video be skipped, see the 'except KeyError' below); datapoints read with .get() are optional and
        # receive the value "na" when they are absent from the JSON file.
        # The order of the assignments is the order of the attributes in the output XML.
        text_tag.attrib["age_limit"] = str(metadata_file["age_limit"])
        text_tag.attrib["comment_count"] = str(metadata_file.get("comment_count", "na"))
        text_tag.attrib["description"] = str(metadata_file["description"])
        text_tag.attrib["fulltitle"] = str(metadata_file["fulltitle"])
        text_tag.attrib["like_count"] = str(metadata_file.get("like_count", "na"))
        text_tag.attrib["location"] = str(metadata_file.get("location", "na"))
        text_tag.attrib["playlist_title"] = str(metadata_file.get("playlist_title", "na"))
        text_tag.attrib["release_date"] = str(metadata_file.get("release_date", "na"))
        text_tag.attrib["upload_date"] = str(metadata_file["upload_date"])
        text_tag.attrib["repost_count"] = str(metadata_file.get("repost_count", "na"))
        # Merge all the tags into one string, each tag followed by "; ". A missing or null 'tags' datapoint is treated
        # as an empty list of tags (resulting in an empty attribute) instead of stopping the whole script.
        tags = "".join(f"{t}; " for t in (metadata_file.get("tags") or []))
        text_tag.attrib["tags"] = tags
        text_tag.attrib["title"] = str(metadata_file["title"])
        text_tag.attrib["uploader"] = str(metadata_file["uploader"])
        text_tag.attrib["view_count"] = str(metadata_file["view_count"])
        text_tag.attrib["webpage_url"] = str(metadata_file["webpage_url"])

        print(f"Processing {srv3_file}")
        # Open the SRV3 file and parse its XML contents using BeautifulSoup; the file is read in full while the soup
        # is being built, so it can be closed immediately afterwards
        with open(srv3_file, "r", encoding="utf8") as f:
            soup = BeautifulSoup(f, features="xml")
        # An empty or malformed subtitle file has no <body> element: report it and move on to the next video
        if soup.body is None:
            print(f"NO <body> ELEMENT FOUND in {srv3_file}")
            continue
        # If the attribute 'ac' is found (with any value) in at least one <s> element tag, the subtitles are treated as
        # the result of autocaptioning; hence assign the value 'true' to the variable 'is_ac'.
        # Otherwise assign the value 'false' to 'is_ac'
        if soup.body.find("s", attrs={"ac": True}):
            is_ac = "true"
        else:
            is_ac = "false"

        # Assign the value of 'is_ac' to the <text> element tag attribute 'autocaption'
        text_tag.attrib["autocaption"] = is_ac

        # Create an empty list to store all the subtitles sentences to be later saved to a "plain-text" file
        plain_text = []

        # For each paragraph (i.e. each line of the subtitles) in the file do:
        for sent in soup.body.find_all("p"):
            sent_text = sent.get_text()
            # Keep the paragraph only if its textual content is longer than 1 character; this avoids adding empty
            # paragraphs (and paragraphs made of a single character, such as a lone line break) to the final output
            if len(sent_text) > 1:
                # Create a <p> element tag inside the XML output
                p_tag = etree.SubElement(text_tag, "p")
                # Add the attribute 'time' (indicating the starting time of the paragraph) and assign it the value appearing in 't'
                p_tag.attrib["time"] = str(sent["t"])
                # Add the attribute 'is_ac' and assign it the value of the previously created variable 'is_ac'
                p_tag.attrib["is_ac"] = is_ac
                p_tag.text = sent_text
                plain_text.append(sent_text)

        # Write the extracted data formatted in XML to the final XML structure
        tree = etree.ElementTree(text_tag)
        # Write the XML to the output file
        tree.write(
            output_xml, pretty_print=True, xml_declaration=True, encoding="utf-8"
        )

        # Write the subtitles sentences only to a plain text file, one sentence per line
        with open(output_txt, "w", encoding="utf8") as o:
            # Merge together all the sentences collected in the 'plain_text' list using a new line ("\n")
            txt_contents = "\n".join(plain_text)
            # Write the merged sentences to the final .txt file
            o.write(txt_contents)
    except FileNotFoundError:
        print(f"FILE NOT FOUND for {json_file} and {srv3_file}")
    except KeyError as missing_key:
        # A required metadata datapoint (or the 't' attribute of a subtitle paragraph) is absent: report it and keep
        # processing the remaining videos instead of interrupting the whole batch
        print(f"MISSING FIELD {missing_key} while processing {filename}")