# Libraries

In [5]:
import csv, sys, os
import re
from pydub import AudioSegment
import requests
import tarfile
import shutil



# Functions

In [40]:
def createDirectory(path): # used
    """ Create new directory, including any missing parent directories.
    Input: path of the directory to be created.
    Output: True if the directory was created by this call, False if
    something already exists at 'path'."""
    try:
        # makedirs (unlike mkdir) also builds missing parents, so a nested
        # target such as 'SplitFiles/single_shots' works in a fresh data folder.
        os.makedirs(path)
        return True
    except FileExistsError:
        return False  # 'path' already exists
    
def from_urbansound8k(dataset_path, labels, outputPath): # used
    """ Copies the UrbanSound8K clips that carry a wanted class label into one folder.
    Input: path of the UrbanSound8K dataset, class labels to keep, and output path.
    Output: Copies the matching audio files to the 'gunshots' folder inside the output path."""

    # Destination folder for the selected clips
    target_dir = os.path.join(outputPath, 'gunshots')
    createDirectory(target_dir)

    # Keep the metadata rows whose column at index 7 is one of the wanted labels
    metadata_file = os.path.join(dataset_path, "metadata", "UrbanSound8K.csv")
    with open(metadata_file) as handle:
        selected_rows = [record for record in csv.reader(handle) if record[7] in labels]

    # Copy every selected clip out of its fold directory.
    # record[0] is the file name; record[-3] completes the 'fold<...>' folder name.
    for record in selected_rows:
        clip_name = record[0]
        fold_dir = dataset_path + "/audio/fold" + record[-3] + "/"
        shutil.copy(fold_dir + clip_name, target_dir + "/" + clip_name)

def splitAudioFile(input_filepath, output_filepath, duration): # used
    """ Cuts one .wav file into consecutive pieces of 'duration' seconds.
    Input: path to the .wav audio file, output directory, and piece duration in seconds.
    Output: writes the pieces as 0.wav, 1.wav, ... to the output directory and
    returns the number of full-length pieces that fit in the recording.
    """

    # Load the recording
    recording = AudioSegment.from_wav(input_filepath)

    # Step size for the slice: seconds scaled by 1000
    step = duration * 1000
    total = len(recording)
    full_pieces = int((total - total % step) / step)

    # Export every piece produced by the stepped slice.
    # NOTE(review): the stepped slice presumably also yields a shorter final piece
    # when the recording is not a multiple of 'step', so one more file than
    # 'full_pieces' may be written - confirm against pydub.
    index = 0
    for piece in recording[::step]:
        target = os.path.join(output_filepath, str(index) + ".wav")
        with open(target, "wb") as out:
            piece.export(out, format="wav")
        index += 1

    return full_pieces

def downloadYoutubeFiles(output_path, videos, split_size): # used
    """ Splits already-downloaded YouTube audio files into fixed-length pieces.
    Despite its name, this function does not download anything: it reads
    '<output_path>YouTube/<name>.wav' for every entry in 'videos'.
    Input: path to the output directory (folder names are appended to it directly,
    so it should end with a path separator), list of (video ID, start time, name)
    entries, and split size in seconds.
    Output: the pieces of each recording in '<output_path>SplitFiles/<name>/'.
    """

    # Both locations are identical for every video, so build them once
    download_path = output_path + "YouTube"
    split_path = output_path + "SplitFiles"
    createDirectory(split_path)

    for video in videos:
        # Only the name (third field) is needed; video ID and start time are not used here
        name = video[2]

        # One sub-directory per recording for its pieces
        splitFilesDirectory = os.path.join(split_path, name).replace(os.sep, '/')
        createDirectory(splitFilesDirectory)

        # Split the .wav file in bits of split_size seconds
        source_file = os.path.join(download_path, name + ".wav").replace(os.sep, '/')
        splitAudioFile(source_file, splitFilesDirectory, split_size)

def addSilent(inputFile, outputFile, duration):
    """ Adds silent padding to one input file.
    Input: path to the input .wav file, output path, and target duration in seconds.
    Output: the output .wav file, with the signal placed between two blocks of
    silence whose lengths add up to the time missing to reach 'duration'.
    A file that is already at least 'duration' long is written without padding.
    """

    #Open the file
    audioFile = AudioSegment.from_wav(inputFile)

    #Calculate padding size in whole milliseconds; never negative
    missing = max(0, int(round(duration * 1000)) - len(audioFile))
    leadPadding = missing // 2
    tailPadding = missing - leadPadding  # takes the extra millisecond when 'missing' is odd

    # Generate the silence at the recording's own frame rate so that joining
    # the segments does not have to resample the signal.
    frameRate = audioFile.frame_rate
    leadSilence = AudioSegment.silent(duration=leadPadding, frame_rate=frameRate)
    tailSilence = AudioSegment.silent(duration=tailPadding, frame_rate=frameRate)

    newAudioFile = leadSilence + audioFile + tailSilence
    newAudioFile.export(outputFile, format="wav")

def uniformAudioFiles(inputDir, outputDir, duration):
    """ Pads every audio file of a directory to the same duration.
    Input: path to the gunshot .wav files, output path, and duration in seconds.
    Output: same amount of .wav audio files in the output path, named
    0.wav, 1.wav, ... after each file's position in the directory listing.
    """

    files = os.listdir(inputDir)
    print("files: ", files)
    createDirectory(outputDir)
    # enumerate hands out each file's position directly; no need to search the
    # list again with files.index() for every file.
    for position, file in enumerate(files):
        outputPath = os.path.join(outputDir, str(position) + ".wav") # ./single_shots/1.wav
        addSilent(os.path.join(inputDir, file), outputPath, duration)

def main_UrbanSound8K(path):
    """Collects the 'gun_shot' clips of '<path>UrbanSound8K' in a 'gunshots' directory under path"""

    # Only this class label is extracted from the dataset
    wanted_classes = ['gun_shot']
    from_urbansound8k(path + "UrbanSound8K", wanted_classes, path)

# Driver code
Comment and uncomment the three main functions in the next code block to run different parts of the code.

In [39]:
# Path to the directory where you want to store the data.
# Keep the trailing slash: the functions above append folder names directly to this string.
your_path = "C:/Users/maris/Documents/DataScience/Thesis/PinPoach_Thesis/Data/"  # "C:/Users/ ... /Data/"
# YouTube recordings to split, as (video ID, start time, name) entries.
# The name is used to locate '<name>.wav' in the 'YouTube' folder under your_path.
yt_videos = [
    ("OcVtCTBTJ-4", 0, "african_savanna_day"),
    ("Bm_Gc4MXqfQ", 0, "african_savanna_night"),
    ("Mr9T-943BnE", 0, "rain"),
    ("T9IJKwEspI8", 0, "thunder")
]
# Split size of recordings in seconds
split_size = 10

# Run this function if you want to extract gunshot recordings from UrbanSound8K
#main_UrbanSound8K(your_path)

# Run this function if you want to split the manually downloaded YouTube audio files
#downloadYoutubeFiles(your_path, yt_videos, split_size)   

# Run this function if you want to pad the gunshot signals to a length of split_size seconds
#uniformAudioFiles(your_path + 'gunshots/', your_path + 'SplitFiles/single_shots', split_size)


 uniformAudioFiles(): 
inputDir:  C:/Users/maris/Documents/DataScience/Thesis/PinPoach_Thesis/Data/gunshots/
ouputDir:  C:/Users/maris/Documents/DataScience/Thesis/PinPoach_Thesis/Data/SplitFiles/single_shots
files:  ['102305-6-0-0.wav', '106955-6-0-0.wav', '110622-6-0-0.wav', '111048-6-0-0.wav', '122690-6-0-0.wav', '131571-6-0-0.wav', '133797-6-0-0.wav', '133797-6-1-0.wav', '133797-6-2-0.wav', '135526-6-0-0.wav', '135526-6-1-0.wav', '135526-6-10-0.wav', '135526-6-11-0.wav', '135526-6-12-0.wav', '135526-6-13-0.wav', '135526-6-2-0.wav', '135526-6-3-0.wav', '135526-6-4-0.wav', '135526-6-5-0.wav', '135526-6-6-0.wav', '135526-6-7-0.wav', '135526-6-8-0.wav', '135526-6-9-0.wav', '135527-6-0-0.wav', '135527-6-1-0.wav', '135527-6-10-0.wav', '135527-6-11-0.wav', '135527-6-12-0.wav', '135527-6-13-0.wav', '135527-6-14-0.wav', '135527-6-14-1.wav', '135527-6-14-10.wav', '135527-6-14-2.wav', '135527-6-14-3.wav', '135527-6-14-4.wav', '135527-6-14-5.wav', '135527-6-14-6.wav', '135527-6-14-7.wav', '13

# Dependencies
## Downloading UrbanSound8K
1. Install the soundata package

In [17]:
!pip install soundata



In [15]:
!pip install --upgrade patsy

Collecting patsy
  Downloading patsy-0.5.3-py2.py3-none-any.whl (233 kB)
     -------------------------------------- 233.8/233.8 kB 4.9 MB/s eta 0:00:00
Installing collected packages: patsy
  Attempting uninstall: patsy
    Found existing installation: patsy 0.5.1
    Uninstalling patsy-0.5.1:
      Successfully uninstalled patsy-0.5.1
Successfully installed patsy-0.5.3


Keyring is skipped due to an exception: 'keyring.backends'

[notice] A new release of pip available: 22.2.2 -> 23.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [16]:
!python -m pip install --upgrade pip

Collecting pip
  Downloading pip-23.0.1-py3-none-any.whl (2.1 MB)
     ---------------------------------------- 2.1/2.1 MB 4.1 MB/s eta 0:00:00
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 22.2.2
    Uninstalling pip-22.2.2:
      Successfully uninstalled pip-22.2.2
Successfully installed pip-23.0.1


Keyring is skipped due to an exception: 'keyring.backends'


2. Download data

In [19]:
# This is the example code: it downloads UrbanSound8K with soundata and checks the result.
import soundata

# data_home is the folder the dataset is downloaded into; replace this
# machine-specific path with your own.
# NOTE(review): this folder differs from 'your_path' in the driver code, which
# looks for an 'UrbanSound8K' folder - confirm where the data should end up.
dataset = soundata.initialize('urbansound8k', data_home=r'C:\Users\maris\Documents\Data Science\Thesis\Previous research\Chengeta_model-master')
dataset.download()  # download the dataset
dataset.validate()  # validate that all the expected files are there

example_clip = dataset.choice_clip()  # choose a random example clip
print(example_clip)  # see the available data

INFO: NumExpr defaulting to 8 threads.
INFO: Downloading ['all'] to C:\Users\maris\Documents\Data Science\Thesis\Previous research\Chengeta_model-master
INFO: [all] downloading UrbanSound8K.tar.gz
5.61GB [34:08, 2.94MB/s]                                
100%|██████████| 1/1 [00:00<?, ?it/s]
100%|██████████| 8732/8732 [01:37<00:00, 89.66it/s] 
INFO: Success: the dataset is complete and all files are valid.
INFO: --------------------


Clip(
  audio_path="...\Documents\Data Science\Thesis\Previous research\Chengeta_model-master\audio/fold8/157868-8-0-17.wav",
  clip_id="157868-8-0-17",
  audio: The clip's audio
            * np.ndarray - audio signal
            * float - sample rate,
  class_id: The clip's class id.
            * int - integer representation of the class label (0-9). See Dataset Info in the documentation for mapping,
  class_label: The clip's class label.
            * str - string class name: air_conditioner, car_horn, children_playing, dog_bark, drilling, engine_idling, gun_shot, jackhammer, siren, street_music,
  fold: The clip's fold.
            * int - fold number (1-10) to which this clip is allocated. Use these folds for cross validation,
  freesound_end_time: The clip's end time in Freesound.
            * float - end time in seconds of the clip in the original freesound recording,
  freesound_id: The clip's Freesound ID.
            * str - ID of the freesound.org recording from which this

__Citation__:
@misc{fuentes_salamon2021soundata,
      title={Soundata: A Python library for reproducible use of audio datasets}, 
      author={Magdalena Fuentes and Justin Salamon and Pablo Zinemanas and Martín Rocamora and 
      Genís Plaja and Irán R. Román and Marius Miron and Xavier Serra and Juan Pablo Bello},
      year={2021},
      eprint={2109.12690},
      archivePrefix={arXiv},
      primaryClass={cs.SD}
}

## Dependent libraries
### youtube_dl

In [20]:
!pip install youtube_dl

Collecting youtube_dl
  Downloading youtube_dl-2021.12.17-py2.py3-none-any.whl (1.9 MB)
     ---------------------------------------- 1.9/1.9 MB 10.1 MB/s eta 0:00:00
Installing collected packages: youtube_dl
Successfully installed youtube_dl-2021.12.17


### ffmpeg
Sources used:
1. Install ffmpeg on your local machine. You can download it from the official website: https://ffmpeg.org/download.html.

2. After installing ffmpeg, you can add the path of the ffmpeg executable to your system environment variables. To do this on Windows: Open the Start menu and search for "Environment Variables". Click "Edit the system environment variables". Click the "Environment Variables" button. Under "System variables", scroll down and find the "Path" variable. Click "Edit". Click "New" and add the path to the folder containing the ffmpeg executable. For example, C:\Program Files\ffmpeg\bin. Click "OK" to close all windows.

3. After adding the ffmpeg path to your system environment variables, you can call ffmpeg from a Jupyter notebook through the subprocess module:

(Source: ChatGPT)

See also https://answers.microsoft.com/en-us/windows/forum/all/cant-edit-environment-variables-in-windows-10/29ca1561-5c4d-4c0d-ab93-808411a062a3 for getting access to the system environment-variable settings.

In [22]:
import subprocess
