## YT Video to TXT file conversion

In [None]:
# Step 1: Install Required Dependencies

# Libraries for working with YouTube videos and subtitles:
# pytube: to download the video
# moviepy: to extract audio from the video
# youtube-transcript-api: to get YouTube's automatic transcription (auto-generated subtitles)
# yt-dlp: to download the video and its subtitles (used by the download function below)

# Install necessary packages using pip:
!pip install pytube moviepy youtube-transcript-api yt-dlp


------

------

In [None]:
"""
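Before running this script, check that the video has subtitles.
Run the following command in your conda environment, replacing the example
URL with the URL of the video you intend to process:

    yt-dlp --list-subs "https://www.youtube.com/watch?v=dQw4w9WgXcQ"

The command lists the subtitle tracks available for the video, so you can
confirm that it actually has subtitles before processing it.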
"""

# Step 2: Import Required Libraries
import os  # For working with the file system (directories, paths)
import re  # For regular expressions (to extract timestamps from subtitle files)
import yt_dlp  # For downloading video and subtitles

In [None]:
def _parse_vtt_cues(subtitle_content):
    """
    Extracts the cues from the text of a WebVTT subtitle file.

    Every line belonging to a cue is collected (not only the first one),
    inline markup such as <c> or <00:00:01.000> is stripped, and the lines
    are joined with a single space. Cues without any text are skipped.

    Parameters:
    - subtitle_content (str): Full text of the .vtt file.

    Returns:
    - list of (start_time, end_time, text) tuples, in file order.
    """
    timestamp_pattern = re.compile(
        r"(\d{2}:\d{2}:\d{2}\.\d{3}) --> (\d{2}:\d{2}:\d{2}\.\d{3})"
    )
    tag_pattern = re.compile(r"<[^>]+>")

    cues = []
    current = None  # (start, end, text_parts) of the cue being collected.

    def flush(cue):
        # Keep a cue only when it actually carries text.
        if cue is not None and cue[2]:
            cues.append((cue[0], cue[1], " ".join(cue[2])))

    for raw_line in subtitle_content.splitlines():
        match = timestamp_pattern.match(raw_line.strip())
        if match:
            # A timing line always starts a new cue, even when the blank
            # separator line before it is missing.
            flush(current)
            current = (match.group(1), match.group(2), [])
        elif current is not None:
            if raw_line == "":
                # A truly empty line terminates the cue. Whitespace-only
                # lines do not: auto-generated captions use them as
                # padding inside a cue.
                flush(current)
                current = None
            else:
                piece = tag_pattern.sub("", raw_line).strip()
                if piece:
                    current[2].append(piece)
    flush(current)
    return cues


def _remove_existing(paths):
    """Deletes every path in `paths` that currently exists."""
    for path in paths:
        if os.path.exists(path):
            os.remove(path)


def download_youtube_subtitles(
    url_video,
    output_directory,
    ffmpeg_location,
    video_title,
    artist_name
):
    """
    Downloads subtitles from a YouTube video and saves them to a text file 
    following the standards for a valid text file for All Karaoke Party.
    
    The output file begins with a header containing the song title and artist,
    followed by one "start - end text" line per subtitle cue. Only the
    subtitles are downloaded; the video itself is not fetched.
    
    Parameters:
    - url_video (str): URL of the YouTube video.
    - output_directory (str): Path where the output file will be saved.
      It is created if it does not exist.
    - ffmpeg_location (str): Full path to the FFmpeg executable
      (passed through to yt-dlp).
    - video_title (str): Title of the video (used for header and output filename).
    - artist_name (str): Artist name (used for header and output filename).
    
    Returns:
    - None
    
    Raises:
    - Whatever yt-dlp raises when the download fails. Errors while writing
      the output file are reported with a printed message instead.
    """
    temp_basename = 'song_temp'
    subtitle_langs = ['en', 'en-orig']  # Prefer English subtitles.

    # Set up yt-dlp options for downloading subtitles.
    ydl_opts = {
        'writeautomaticsub': True,    # Download automatic subtitles.
        'writesubtitles': True,       # Download manual subtitles if available.
        'subtitleslangs': subtitle_langs,
        'subtitlesformat': 'vtt',     # The parser below expects WebVTT.
        'skip_download': True,        # Subtitles only; do not fetch the video.
        'outtmpl': f'{temp_basename}.%(ext)s',  # Temporary filename.
        'quiet': False,
        'ffmpeg_location': ffmpeg_location,
    }

    # Ensure the output directory exists.
    os.makedirs(output_directory, exist_ok=True)

    # Create the output file path using the provided title and artist.
    # Characters that are invalid in file names are replaced so that titles
    # such as "AC/DC" do not break the path.
    safe_name = re.sub(
        r'[\\/:*?"<>|]', '_', f"song_txt_{video_title}_{artist_name}"
    )
    output_filepath = os.path.join(output_directory, f"{safe_name}.txt")

    # Temporary subtitle files yt-dlp may create, in order of preference.
    candidate_paths = [f"{temp_basename}.{lang}.vtt" for lang in subtitle_langs]

    # Drop leftovers from an earlier run so they cannot be mistaken for
    # this video's subtitles.
    _remove_existing(candidate_paths)

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url_video])

        # Look for the temporary subtitle file.
        subtitle_path = next(
            (path for path in candidate_paths if os.path.exists(path)), None
        )
        if subtitle_path is None:
            print("No subtitles found for this video.")
            return

        with open(subtitle_path, 'r', encoding='utf-8') as file:
            cues = _parse_vtt_cues(file.read())

        # Write the output file with header metadata and subtitles with timestamps.
        try:
            with open(output_filepath, 'w', encoding='utf-8') as f:
                # Write header required by All Karaoke Party.
                f.write(f"Title: {video_title}\n")
                f.write(f"Artist: {artist_name}\n")
                f.write("\n")  # Blank line after header.
                f.writelines(
                    f"{start_time} - {end_time} {subtitle_text}\n"
                    for start_time, end_time, subtitle_text in cues
                )
        except OSError as e:
            print(f"Error writing to file: {e}")
            return

        if cues:
            print(f"Subtitles saved to: {output_filepath}")
        else:
            print("Warning: No subtitle content was written to the file.")
    finally:
        # Remove every temporary subtitle file, whichever path was taken.
        _remove_existing(candidate_paths)



In [None]:
# Example usage:
if __name__ == "__main__":
    # Song metadata: written to the file header and used in the output filename.
    artist_name = "taylor_swift"
    video_title = "the_man"

    # YouTube video whose subtitles will be converted.
    video_url = "https://www.youtube.com/watch?v=F3aXpa1rQEY"

    # Local paths: where the .txt file is saved and where FFmpeg lives.
    output_dir = r"C:\Users\usuario\Desktop\MÉS\Karaoke"
    ffmpeg_path = r'C:\Users\usuario\Anaconda3\envs\songtotxt\Library\bin\ffmpeg.exe'

    # Gather the arguments in one mapping, then run the conversion.
    conversion_arguments = {
        "url_video": video_url,
        "output_directory": output_dir,
        "ffmpeg_location": ffmpeg_path,
        "video_title": video_title,
        "artist_name": artist_name,
    }
    download_youtube_subtitles(**conversion_arguments)