***
# 2. Audio Preprocessing
***

In [1]:
import argparse
import glob
import os
import sys

from pydub import AudioSegment

## Preparing audio tracks for upload to Google Cloud Platform (GCP)

### Rename tracks

In [2]:
folder = "../audio/tracks"

In [3]:
# rename audio tracks in folder
def main():
    """function to rename multiple files in directory"""
    for count, filename in enumerate(os.listdir(folder)):
        dst = f"{str(count)}.m4a"
        src = f"{folder}/{filename}"  # foldername/filename, if .py file is outside folder
        dst = f"{folder}/{dst}"
         
        # rename() function will rename all the files
        os.rename(src, dst)
 
# Driver Code
if __name__ == '__main__':
     
    # Calling main() function
    main()

### Convert tracks to wav format

In [4]:
formats_to_convert = ['.m4a']

for (dirpath, dirnames, filenames) in os.walk(folder):
    for filename in filenames:
        if filename.endswith(tuple(formats_to_convert)):

            filepath = dirpath + '/' + filename
            (path, file_extension) = os.path.splitext(filepath)
            file_extension_final = file_extension.replace('.', '')
            try:
                track = AudioSegment.from_file(filepath,
                        file_extension_final)
                wav_filename = filename.replace(file_extension_final, 'wav')
                wav_path = dirpath + '/' + wav_filename
                print('CONVERTING: ' + str(filepath))
                file_handle = track.export(wav_path, format='wav')
                os.remove(filepath)
            except:
                print("ERROR CONVERTING " + str(filepath))

CONVERTING: /Users/lukasiwei/Desktop/dsi25-workspace/Projects/capstone_project/audio/tracks/99.m4a
CONVERTING: /Users/lukasiwei/Desktop/dsi25-workspace/Projects/capstone_project/audio/tracks/72.m4a
CONVERTING: /Users/lukasiwei/Desktop/dsi25-workspace/Projects/capstone_project/audio/tracks/66.m4a
CONVERTING: /Users/lukasiwei/Desktop/dsi25-workspace/Projects/capstone_project/audio/tracks/8.m4a
CONVERTING: /Users/lukasiwei/Desktop/dsi25-workspace/Projects/capstone_project/audio/tracks/9.m4a
CONVERTING: /Users/lukasiwei/Desktop/dsi25-workspace/Projects/capstone_project/audio/tracks/67.m4a
CONVERTING: /Users/lukasiwei/Desktop/dsi25-workspace/Projects/capstone_project/audio/tracks/73.m4a
CONVERTING: /Users/lukasiwei/Desktop/dsi25-workspace/Projects/capstone_project/audio/tracks/98.m4a
CONVERTING: /Users/lukasiwei/Desktop/dsi25-workspace/Projects/capstone_project/audio/tracks/65.m4a
CONVERTING: /Users/lukasiwei/Desktop/dsi25-workspace/Projects/capstone_project/audio/tracks/71.m4a
CONVERTING: 

CONVERTING: /Users/lukasiwei/Desktop/dsi25-workspace/Projects/capstone_project/audio/tracks/84.m4a
CONVERTING: /Users/lukasiwei/Desktop/dsi25-workspace/Projects/capstone_project/audio/tracks/53.m4a
CONVERTING: /Users/lukasiwei/Desktop/dsi25-workspace/Projects/capstone_project/audio/tracks/1.m4a
CONVERTING: /Users/lukasiwei/Desktop/dsi25-workspace/Projects/capstone_project/audio/tracks/47.m4a
CONVERTING: /Users/lukasiwei/Desktop/dsi25-workspace/Projects/capstone_project/audio/tracks/46.m4a
CONVERTING: /Users/lukasiwei/Desktop/dsi25-workspace/Projects/capstone_project/audio/tracks/52.m4a
CONVERTING: /Users/lukasiwei/Desktop/dsi25-workspace/Projects/capstone_project/audio/tracks/0.m4a
CONVERTING: /Users/lukasiwei/Desktop/dsi25-workspace/Projects/capstone_project/audio/tracks/85.m4a
CONVERTING: /Users/lukasiwei/Desktop/dsi25-workspace/Projects/capstone_project/audio/tracks/91.m4a
CONVERTING: /Users/lukasiwei/Desktop/dsi25-workspace/Projects/capstone_project/audio/tracks/87.m4a
CONVERTING: 

### Concatenate tracks

In [2]:
# there seems to be a limit to how large/long a concatenated track can be
# a dead kernel results once the limit is reached
# additionally, the maximum duration limit for google speech-to-text API is also 480 mins
# hence, the tracks are being concatenated into 7 tracks
tracks_1 = "/Users/lukasiwei/Desktop/dsi25-workspace/Projects/capstone_project/audio/tracks1/"
tracks_2 = "/Users/lukasiwei/Desktop/dsi25-workspace/Projects/capstone_project/audio/tracks2/"
tracks_3 = "/Users/lukasiwei/Desktop/dsi25-workspace/Projects/capstone_project/audio/tracks3/"
tracks_4 = "/Users/lukasiwei/Desktop/dsi25-workspace/Projects/capstone_project/audio/tracks4/"
tracks_5 = "/Users/lukasiwei/Desktop/dsi25-workspace/Projects/capstone_project/audio/tracks5/"
tracks_6 = "/Users/lukasiwei/Desktop/dsi25-workspace/Projects/capstone_project/audio/tracks6/"
tracks_7 = "/Users/lukasiwei/Desktop/dsi25-workspace/Projects/capstone_project/audio/tracks7/"

In [3]:
# tracks_1
filenames = glob.glob(tracks_1+'*.wav')
combined_1 = AudioSegment.empty()
for filename in filenames:
    audiofilename = AudioSegment.from_wav(filename)
    
    combined_1 += audiofilename

In [4]:
# tracks_2
filenames = glob.glob(tracks_2+'*.wav')
combined_2 = AudioSegment.empty()
for filename in filenames:
    audiofilename = AudioSegment.from_wav(filename)
    
    combined_2 += audiofilename

In [5]:
# tracks_3
filenames = glob.glob(tracks_3+'*.wav')
combined_3 = AudioSegment.empty()
for filename in filenames:
    audiofilename = AudioSegment.from_wav(filename)
    
    combined_3 += audiofilename

In [6]:
# tracks_4
filenames = glob.glob(tracks_4+'*.wav')
combined_4 = AudioSegment.empty()
for filename in filenames:
    audiofilename = AudioSegment.from_wav(filename)
    
    combined_4 += audiofilename

In [7]:
# tracks_5
filenames = glob.glob(tracks_5+'*.wav')
combined_5 = AudioSegment.empty()
for filename in filenames:
    audiofilename = AudioSegment.from_wav(filename)
    
    combined_5 += audiofilename

In [8]:
# tracks_6
filenames = glob.glob(tracks_6+'*.wav')
combined_6 = AudioSegment.empty()
for filename in filenames:
    audiofilename = AudioSegment.from_wav(filename)
    
    combined_6 += audiofilename

In [9]:
# tracks_7
filenames = glob.glob(tracks_7+'*.wav')
combined_7 = AudioSegment.empty()
for filename in filenames:
    audiofilename = AudioSegment.from_wav(filename)
    
    combined_7 += audiofilename

### Split stereo into mono channels

In [10]:
audio_1 = combined_1.set_channels(1)

In [11]:
audio_2 = combined_2.set_channels(1)

In [12]:
audio_3 = combined_3.set_channels(1)

In [13]:
audio_4 = combined_4.set_channels(1)

In [14]:
audio_5 = combined_5.set_channels(1)

In [15]:
audio_6 = combined_6.set_channels(1)

In [16]:
audio_7 = combined_7.set_channels(1)

In [17]:
# check number of channels 
print("Channels:", audio_7.channels)

Channels: 1


In [18]:
# check specs and duration (<28800s)
print('audio_1')
print('\nChannels:', audio_1.channels)
print("Bits per sample:", audio_1.sample_width * 8)
print("Sampling frequency:", audio_1.frame_rate)
print("Length:", audio_1.duration_seconds, "seconds")

print('\naudio_2')
print('\nChannels:', audio_2.channels)
print("Bits per sample:", audio_2.sample_width * 8)
print("Sampling frequency:", audio_2.frame_rate)
print("Length:", audio_2.duration_seconds, "seconds")

print('\naudio_3')
print('\nChannels:', audio_3.channels)
print("Bits per sample:", audio_3.sample_width * 8)
print("Sampling frequency:", audio_3.frame_rate)
print("Length:", audio_3.duration_seconds, "seconds")

print('\naudio_4')
print('\nChannels:', audio_4.channels)
print("Bits per sample:", audio_4.sample_width * 8)
print("Sampling frequency:", audio_4.frame_rate)
print("Length:", audio_4.duration_seconds, "seconds")

print('\naudio_5')
print('\nChannels:', audio_5.channels)
print("Bits per sample:", audio_5.sample_width * 8)
print("Sampling frequency:", audio_5.frame_rate)
print("Length:", audio_5.duration_seconds, "seconds")

print('\naudio_6')
print('\nChannels:', audio_6.channels)
print("Bits per sample:", audio_6.sample_width * 8)
print("Sampling frequency:", audio_6.frame_rate)
print("Length:", audio_6.duration_seconds, "seconds")

print('\naudio_7')
print('\nChannels:', audio_7.channels)
print("Bits per sample:", audio_7.sample_width * 8)
print("Sampling frequency:", audio_7.frame_rate)
print("Length:", audio_7.duration_seconds, "seconds")

audio_1

Channels: 1
Bits per sample: 16
Sampling frequency: 44100
Length: 26060.544580498867 seconds

audio_2

Channels: 1
Bits per sample: 16
Sampling frequency: 44100
Length: 26039.437641723354 seconds

audio_3

Channels: 1
Bits per sample: 16
Sampling frequency: 44100
Length: 20768.716916099773 seconds

audio_4

Channels: 1
Bits per sample: 16
Sampling frequency: 44100
Length: 26636.817414965986 seconds

audio_5

Channels: 1
Bits per sample: 16
Sampling frequency: 44100
Length: 16475.486621315194 seconds

audio_6

Channels: 1
Bits per sample: 16
Sampling frequency: 44100
Length: 22676.07365079365 seconds

audio_7

Channels: 1
Bits per sample: 16
Sampling frequency: 44100
Length: 22072.23873015873 seconds


### Export

In [22]:
audio_1.export(
    "/Volumes/CCC Gold X/audio/CONCAT/audio_1.wav",
    format="wav")

<_io.BufferedRandom name='/Volumes/CCC Gold X/audio/CONCAT/audio_1.wav'>

In [23]:
audio_2.export(
    "/Volumes/CCC Gold X/audio/CONCAT/audio_2.wav",
    format="wav")

<_io.BufferedRandom name='/Volumes/CCC Gold X/audio/CONCAT/audio_2.wav'>

In [24]:
audio_3.export(
    "/Volumes/CCC Gold X/audio/CONCAT/audio_3.wav",
    format="wav")

<_io.BufferedRandom name='/Volumes/CCC Gold X/audio/CONCAT/audio_3.wav'>

In [25]:
audio_4.export(
    "/Volumes/CCC Gold X/audio/CONCAT/audio_4.wav",
    format="wav")

<_io.BufferedRandom name='/Volumes/CCC Gold X/audio/CONCAT/audio_4.wav'>

In [26]:
audio_5.export(
    "/Volumes/CCC Gold X/audio/CONCAT/audio_5.wav",
    format="wav")

<_io.BufferedRandom name='/Volumes/CCC Gold X/audio/CONCAT/audio_5.wav'>

In [27]:
audio_6.export(
    "/Volumes/CCC Gold X/audio/CONCAT/audio_6.wav",
    format="wav")

<_io.BufferedRandom name='/Volumes/CCC Gold X/audio/CONCAT/audio_6.wav'>

In [28]:
audio_7.export(
    "/Volumes/CCC Gold X/audio/CONCAT/audio_7.wav",
    format="wav")

<_io.BufferedRandom name='/Volumes/CCC Gold X/audio/CONCAT/audio_7.wav'>

## Upload to GCP Speech-To-Text API

The 7 audio tracks were uploaded to a [Google Cloud Storage](https://cloud.google.com/storage) bucket and routed to the [Speech-to-Text API](https://cloud.google.com/speech-to-text) with the default settings through the GCP Console. The transcripts were output to a new bucket in Google Cloud Storage overnight and retrieved manually to form the text corpus for the project.

![gcp_speechtotext](../media/gcp_speechtotext.png)

Fig. 1: List of completed transcription on GCP Speech-to_text API

![gcp_transcript](../media/gcp_transcripts.png)

Fig. 2: Example of transcripted audio track