In [1]:
import ffmpeg
from pathlib import Path
import re
import logging

# This makes jupyter notebook output everything instead of just the last output
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

################################################################################

# Considerations:
# Not sure if we should make the directories for the users if the supplied directory doesn't
# exist yet

In [2]:
# Print error if this directory does not have a folder named 'data'
if not Path('data').exists():
    print("Error! This program requires a data folder!")
# Otherwise, confirm that the data folder exists
else:
    print("'data' folder exists!")

'data' folder exists!


In [3]:
# Function
logging.basicConfig(filename = 'extract_audio_from_video.log',
                    encoding = 'utf-8', level = logging.DEBUG,
                    filemode = 'w')

def extract_audio_from_video(video_path, to_folder = None,
                             video_extensions = ['.mp4', '.mov'],
                             log_flag = False):
    """
    Function that goes into the directory and extracts audio from all mp4 files and outputs it
    into a folder called 'extracted_audio'
    
    Arguments:
    video_path | pathlib.PosixPath
    Path to the directory that contains all of the video files.
    
    Optional Arguments:
    to_folder | pathlib.PosixPath
    Path to the directory where you want to store the extracted audio. By default,
    it will be extracted into a folder in video_path's parent directory as 'extracted_audio'
    
    video_extensions | list
    A list of strings that contains the extensions you want ffmpeg to extract audio from. The
    default includes .mp4 files and .mov files.
    
    log_flag | bool
    A boolean value that sets the logging level. If true, it will 
    
    Returns:
    It returns a string that tells you how many audio files were extracted.
    """
    # Set up the logging for this function
    # filemode = 'w' makes it overwrite the previous log
    
    # Using dictionary as switch case for using whether or not to log the logging info
    log_switch_case = {
        # The only logging functions that are used are logging.info()
        True: logging.DEBUG, # This writes logging debug output into the log file
        False: logging.WARNING # This stops writing logging debug output into the log file
    }
    # See https://docs.python.org/3/howto/logging.html for descriptions of the various levels
    
    # Making a default directory for to_folder
    if to_folder == None:
        print("Created a default to_folder as none was given.")
        video_path.parent.joinpath('extracted_audio')
    
    # Create a counter for the number of audio files processed
    audio_file_counter = 0
    
    # Go through all files in `video_path`
    for child in video_path.iterdir():
        # Debugging: show the file name
        logging.debug('Detected file: {}'.format(child.name))
        # Store the extension of the file name
        file_extension = Path(child.name).suffix
        # If the file's extension is in `video_extensions`
        if file_extension in video_extensions:
            # Do an ffmpeg probe on the currently selected file
            current_probe = ffmpeg.probe(child)
            # If the number of streams is more than 1
            # (as a proxy for if the file could have audio)
            if len(current_probe['streams']) > 1:
                # Check if `codec_type` is audio
                for stream in current_probe['streams']:
                    # Debugging: print the codec type
                    print(stream['codec_type'])
                    # If the codec type is audio
                    if stream['codec_type'] == 'audio':
                        # Debugging: print the video file's path
                        print(child)
                        # Store the video file path
                        new_child = str(child)
                        # Replace the extension with .mp3
                        new_child = re.sub(pattern = '{}$'.format(file_extension),
                                          repl = '.mp3', string = new_child)
                        
                        # Check that the audio path exists
                        if not a_path_1.exists():
                            print("to_folder directory does not exist therefore one will be made.")
                            a_path_1.mkdir(parents = True, exist_ok = False)
                        
                        # Create a variable to store the audio file's path
                        audio_file_path = a_path_1.joinpath(Path(new_child).name)
                        # Debugging: print the audio file's path
                        print(audio_file_path)
                        
                        # Information about the ffmpeg command:
                        ## ffmpeg -i input.mp4 -vn -q:a 0 -map a audio.mp3
                        ## '-i' is the input, '-vn' excludes video,
                        ## '-q:a' is the mp3 encoding,
                        ## '0' is the best quality mp3 encoding,
                        ## '-map a' only grabs audio
                        
                        # Using ffmpeg to transform the video file into a .mp3 file
                        try:
                            out, error_message = ffmpeg.input(str(child)) \
                            .output(filename = str(audio_file_path),
                                   **{'qscale:a': 0,
                                      'map': 'a'}) \
                            .overwrite_output() \
                            .run()
                        # Print the error message if the try doesn't work
                        except ffmpeg.Error as error_message:
                            print('ffmpeg stderr:', error_message.stderr())
                            raise error_message
                        
                        # Incremenet the counter for the number of video files processed
                        audio_file_counter += 1
    # Return a print message that states how many audio files were extracted
    if (audio_file_counter == 1):
        file_word = 'file'
    else:
        file_word = 'files'
    # Print how many audio files were extracted
    return (str(audio_file_counter) + ' audio ' + file_word + ' extracted!')

In [5]:
# Testing function
# Make the path to the data folder that contains the video files
v_path_1 = Path.cwd().joinpath('data')
# Make a directory that you want the extracted audio to go into
a_path_1 = Path.cwd().joinpath('extracted_audio')
# Show where the video files are coming from
print("Video path: ", v_path_1)
# Test the function
extract_audio_from_video(video_path = v_path_1, to_folder = a_path_1, log_flag = True)

Video path:  /Users/Jackie/Documents/Coding/jupyter nbs (python)/audio_extraction/data
20
log flag:  True
video
audio
/Users/Jackie/Documents/Coding/jupyter nbs (python)/audio_extraction/data/u2jzoo.mp4
/Users/Jackie/Documents/Coding/jupyter nbs (python)/audio_extraction/extracted_audio/u2jzoo.mp3
video
audio
/Users/Jackie/Documents/Coding/jupyter nbs (python)/audio_extraction/data/mrrrzic2.mov
/Users/Jackie/Documents/Coding/jupyter nbs (python)/audio_extraction/extracted_audio/mrrrzic2.mp3


'2 audio files extracted!'

In [None]:
# Note about logging:
# When the code is run from a python notebook, it will just keep logging onto the same file
# without clearing the previous logging file, regardless of the settings. However when I tried
# running the function on a python script file, it cleared the logging file properly
# before it started writing the log.