In [7]:
from pyspark.sql import SparkSession

# Obtain the notebook's Spark session (reusing an existing one if present),
# configured with the "local[*]" master.
spark = SparkSession.builder.master("local[*]").getOrCreate()

Load the .wav files using Spark's binaryFile format, with a glob filter so that only *.wav paths are read

In [8]:
import os
# One row per matching file: the binaryFile reader yields the columns shown
# below (path, modificationTime, length, content).
df = (
    spark.read
    .format("binaryFile")
    .option("pathGlobFilter", "*.wav")
    .load(os.path.join("/opt/workspace", "media/audio"))
)
df.show(5)

+--------------------+-------------------+------+--------------------+
|                path|   modificationTime|length|             content|
+--------------------+-------------------+------+--------------------+
|file:/opt/workspa...|2019-09-21 03:56:22|503062|[52 49 46 46 0E A...|
|file:/opt/workspa...|2019-09-21 03:56:22|384044|[52 49 46 46 24 D...|
|file:/opt/workspa...|2019-09-21 03:56:22|380162|[52 49 46 46 FA C...|
|file:/opt/workspa...|2019-09-21 03:56:22|368044|[52 49 46 46 A4 9...|
|file:/opt/workspa...|2019-09-21 03:56:22|361644|[52 49 46 46 A4 8...|
+--------------------+-------------------+------+--------------------+


Extract audio features using pyAudioAnalysis

In [12]:
import pyspark.sql.functions as F
from pyspark.sql.pandas.functions import pandas_udf
import pyspark.sql.types as T
import pandas as pd

@pandas_udf(returnType=T.MapType(T.StringType(), T.ArrayType(T.FloatType())))
def extract_audio_features(file_paths: pd.Series) -> pd.Series:
    """Compute pyAudioAnalysis short-term features for each audio file.

    Parameters
    ----------
    file_paths : pd.Series
        File locations as strings, e.g. the ``path`` column of a
        ``binaryFile`` DataFrame (``file:/opt/...``). A leading ``file:``
        scheme is stripped before the file is opened; paths without the
        scheme are used as-is.

    Returns
    -------
    pd.Series
        One dict per input path, mapping every feature name returned by
        ``ShortTermFeatures.feature_extraction`` to that feature's sequence
        of values.
    """
    # NOTE(review): imported inside the UDF body, presumably so the import is
    # resolved in the Python worker that executes the UDF -- confirm.
    from pyAudioAnalysis import ShortTermFeatures, audioBasicIO

    scheme = "file:"

    def to_local_path(path):
        # Remove only a *leading* scheme. str.replace("file:", "") would also
        # delete any later "file:" that is part of a directory or file name
        # (e.g. ".../myfile:1.wav"), producing a path that does not exist.
        return path[len(scheme):] if path.startswith(scheme) else path

    def fn(path):
        sampling_rate, signal = audioBasicIO.read_audio_file(to_local_path(path))
        # Feature extraction operates on a single channel; down-mix stereo
        # recordings first. Mono signals are returned unchanged.
        signal = audioBasicIO.stereo_to_mono(signal)
        # Window and step are both half a second's worth of samples, so
        # consecutive analysis windows are adjacent and do not overlap.
        window = 0.5 * sampling_rate
        step = 0.5 * sampling_rate
        features, f_names = ShortTermFeatures.feature_extraction(signal, sampling_rate, window, step)
        # Pair each feature name with its row of the feature matrix.
        return dict(zip(f_names, features))

    return pd.Series([fn(path) for path in file_paths.tolist()])


# Append an "audio_features" column holding the feature map computed from
# each row's file path, then preview the first rows.
features_df = df.withColumn(
    "audio_features",
    extract_audio_features(F.col("path")),
)
features_df.show(5)

+--------------------+-------------------+------+--------------------+--------------------+
|                path|   modificationTime|length|             content|      audio_features|
+--------------------+-------------------+------+--------------------+--------------------+
|file:/opt/workspa...|2019-09-21 03:56:22|503062|[52 49 46 46 0E A...|{zcr -> [0.083885...|
|file:/opt/workspa...|2019-09-21 03:56:22|384044|[52 49 46 46 24 D...|{zcr -> [0.077509...|
|file:/opt/workspa...|2019-09-21 03:56:22|380162|[52 49 46 46 FA C...|{zcr -> [0.021502...|
|file:/opt/workspa...|2019-09-21 03:56:22|368044|[52 49 46 46 A4 9...|{zcr -> [0.079884...|
|file:/opt/workspa...|2019-09-21 03:56:22|361644|[52 49 46 46 A4 8...|{zcr -> [0.140892...|
+--------------------+-------------------+------+--------------------+--------------------+
