In [1]:
import os
import librosa
import numpy as np
import sagemaker_pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField,StringType, FloatType

In [2]:
from pyspark import SparkContext, SparkConf
from sagemaker_pyspark import classpath_jars
from pyspark.sql.functions import create_map, struct
from pydub import AudioSegment

In [3]:
# Root directory of the project's data. Set the BIGDATAMUSIC_HOME environment
# variable to run the notebook on another machine; the default keeps the
# original location.
BASE_DIR = os.environ.get('BIGDATAMUSIC_HOME', '/home/rlfo/Documents/pessoal/bigdatamusic')

# Both paths keep a trailing separator because other cells build paths by plain
# string concatenation (f'{LANDED}{file}', RAW + 'features').
LANDED = os.path.join(BASE_DIR, 'landed', '')  # input audio files
RAW = os.path.join(BASE_DIR, 'raw', '')        # extracted feature output

In [4]:
# Jars reported by sagemaker_pyspark, joined into a ':'-separated classpath string.
classpath = ":".join(sagemaker_pyspark.classpath_jars())

# Spark properties applied to the session builder, in the order listed.
spark_settings = [
    ("spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version", "2"),
    ("spark.speculation", "false"),
    ("spark.sql.parquet.compression.codec", "gzip"),
    ("spark.debug.maxToStringFields", "100"),
    ("spark.driver.extraClassPath", classpath),
    ("spark.driver.memory", "1g"),
    ("spark.driver.cores", "1"),
    # The property key is "spark.executor.memory". The hyphenated spelling
    # ("--executor-memory") is only the spark-submit command-line flag, so
    # "spark.executor-memory" was stored as an unrecognised key.
    # NOTE(review): with master local[*] there are presumably no separate
    # executors, so this value may have little effect - confirm.
    ("spark.executor.memory", "20g"),
    ("spark.executor.cores", "4"),
]

builder = SparkSession.builder.appName("MUSIC SPARK")
for key, value in spark_settings:
    # config() records the option on the builder object itself, so the
    # returned builder does not need to be re-assigned.
    builder.config(key, value)

# Run Spark locally, using all available cores.
builder.master("local[*]")


<pyspark.sql.session.SparkSession.Builder at 0x7f28bd361860>

In [5]:
def audio_to_wav(file):
    """Convert an MP3 file to WAV, writing the result next to the source file.

    Args:
        file: Path of the MP3 file to convert.

    Returns:
        The path of the exported WAV file: ``file`` with its trailing ``.mp3``
        replaced by ``.wav``. If ``file`` does not end in ``.mp3``, ``.wav`` is
        appended to the unchanged path.
    """
    # Strip only a trailing ".mp3". A blanket str.replace would also rewrite
    # ".mp3" appearing in a directory name or in the middle of the file name,
    # pointing the export at a different (possibly non-existent) location.
    suffix = '.mp3'
    stem = file[:-len(suffix)] if file.endswith(suffix) else file
    dst = stem + ".wav"

    sound = AudioSegment.from_mp3(file)
    sound.export(dst, format="wav")
    return dst
    

In [6]:
def extract_important_feature_music(file):
    """Summarise an audio file as one ';'-separated line of averaged features.

    Loads at most the first 30 seconds of ``file`` as mono audio and takes the
    mean of five librosa features over the whole result.

    Args:
        file: Path of the audio file; the text after the last '/' is used as
            the song name.

    Returns:
        A string of the form
        ``name;chroma_stft;spec_cent;spec_bw;rolloff;zcr`` where each feature
        is its mean value formatted with a precision of 4.
    """
    track_name = file.split('/')[-1]
    signal, rate = librosa.load(file, mono=True, duration=30)

    # Order matters: it defines the column order of the returned line.
    averages = (
        np.mean(librosa.feature.chroma_stft(y=signal, sr=rate)),
        np.mean(librosa.feature.spectral_centroid(y=signal, sr=rate)),
        np.mean(librosa.feature.spectral_bandwidth(y=signal, sr=rate)),
        np.mean(librosa.feature.spectral_rolloff(y=signal, sr=rate)),
        np.mean(librosa.feature.zero_crossing_rate(signal)),
    )

    rendered = [
        np.array2string(value, precision=4, separator=',', suppress_small=True)
        for value in averages
    ]

    return ';'.join([track_name] + rendered)
        

In [7]:
# Obtain the Spark session from the builder configured in the earlier cell.
spark = builder.getOrCreate()

In [8]:
# Smoke-test the feature extractor on a single track. The path is built from
# LANDED instead of repeating the directory as a literal.
# NOTE(review): this expects the .wav file to already exist in LANDED; it is
# presumably left over from an earlier audio_to_wav run - confirm on a fresh checkout.
extract_important_feature_music(LANDED + 'Stryv - ONEDUO - Illusion (Stryv Remix).wav')

'Stryv - ONEDUO - Illusion (Stryv Remix).wav;0.3036;2104.9923;2081.9164;4229.7994;0.0939'

In [9]:
# Full paths of every MP3 in LANDED. Match on the extension: a substring test
# ('.mp3' in file) would also pick up names such as 'song.mp3.part'.
all_music = [f'{LANDED}{file}' for file in os.listdir(LANDED) if file.endswith('.mp3')]

In [10]:
# Display the collected MP3 paths as a sanity check.
all_music

['/home/rlfo/Documents/pessoal/bigdatamusic/landed/Aux Fox - Ellie Goulding - Flux (Aux Fox Remix).mp3',
 '/home/rlfo/Documents/pessoal/bigdatamusic/landed/Young Nero - Beyond (Prod. Scott Storch).mp3',
 '/home/rlfo/Documents/pessoal/bigdatamusic/landed/Stryv - ONEDUO - Illusion (Stryv Remix).mp3',
 '/home/rlfo/Documents/pessoal/bigdatamusic/landed/wūsh - late nights with you.mp3',
 '/home/rlfo/Documents/pessoal/bigdatamusic/landed/PRDSEOHNO - OHNO - Lil Mama (prod. Fallen Roses and B Dom).mp3',
 '/home/rlfo/Documents/pessoal/bigdatamusic/landed/nymano - jazz and rain.mp3',
 '/home/rlfo/Documents/pessoal/bigdatamusic/landed/HIGH ON MUSIC - Danrell x Småland - Hostage.mp3',
 '/home/rlfo/Documents/pessoal/bigdatamusic/landed/Flipp Dinero - Leave Me Alone (Prod. by Young Forever x Cast Beats).mp3',
 '/home/rlfo/Documents/pessoal/bigdatamusic/landed/Cardi B - Money.mp3',
 '/home/rlfo/Documents/pessoal/bigdatamusic/landed/ZZ - ICY (feat. Thorii).mp3',
 '/home/rlfo/Documents/pessoal/bigdatam

In [11]:
# Distribute the MP3 paths, convert each file to WAV, then turn each WAV into
# one ';'-separated feature line.
mp3_paths_rdd = spark.sparkContext.parallelize(all_music)
wav_paths_rdd = mp3_paths_rdd.map(audio_to_wav)
pipe_rdd_csv = wav_paths_rdd.map(extract_important_feature_music)

In [12]:
# Display the RDD object itself (its repr), not its contents.
pipe_rdd_csv

PythonRDD[1] at RDD at PythonRDD.scala:52

In [13]:
# Column names of the feature table, in the order the fields appear in each
# ';'-separated line. Every column is a nullable string.
feature_columns = ['file_name', 'chroma', 'spec_cent', 'spec_bw', 'rolloff', 'zcr']

schema = StructType(
    [StructField(column, StringType(), True) for column in feature_columns]
)

In [14]:
# Split each feature line into its 6 fields. Splitting from the right with
# maxsplit=5 keeps any ';' inside the file name in the first field; a plain
# split(';') would produce extra fields that do not fit the 6-column schema.
# NOTE: this rebinds pipe_rdd_csv, so re-running this cell requires re-running
# the cell that builds the pipeline first.
pipe_rdd_csv = pipe_rdd_csv.map(lambda x: x.rsplit(";", 5))

# Despite its name, `rdd` is the DataFrame built from the split rows.
rdd = spark.createDataFrame(pipe_rdd_csv, schema)
rdd.show(2)

+--------------------+------+---------+---------+---------+------+
|           file_name|chroma|spec_cent|  spec_bw|  rolloff|   zcr|
+--------------------+------+---------+---------+---------+------+
|Aux Fox - Ellie G...|0.3115|1243.9466|1475.7774|2472.4184|0.0507|
|Young Nero - Beyo...|0.3283|1608.4343|1925.5995|3293.8967|0.0667|
+--------------------+------+---------+---------+---------+------+
only showing top 2 rows



In [15]:
# Display the output directory that the next cell writes into.
RAW

'/home/rlfo/Documents/pessoal/bigdatamusic/raw/'

In [16]:
# Write the feature DataFrame (`rdd`) as ';'-separated CSV to RAW + 'features',
# using mode='overwrite'.
rdd.write.csv(RAW+'features',sep=';',mode='overwrite')
