# ELECTRONIC MUSIC GENRE CLASSIFICATION

### Create PySpark instance

In [1]:
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Multiclass Classification").getOrCreate()

cores = spark._jsc.sc().getExecutorMemoryStatus().keySet().size()
print("You are working with", cores, "core(s)")
spark

You are working with 1 core(s)


### Import the dataset and others librairies

In [2]:
# Dataset
df = spark.read.csv('beatsdataset.csv', inferSchema = True, header = True)

# Librairies
import pandas as pd

from pyspark.ml.feature import *

from pyspark.sql.types import * 
from pyspark.sql.functions import *

from pyspark.ml.classification import *

from pyspark.ml.stat import *

from pyspark.ml.evaluation import *

from pyspark.ml import Pipeline

from pyspark.ml.tuning import *

### Data Preparation

In [3]:
pd.DataFrame(df.take(5), columns=df.columns)

Unnamed: 0,_c0,1-ZCRm,2-Energym,3-EnergyEntropym,4-SpectralCentroidm,5-SpectralSpreadm,6-SpectralEntropym,7-SpectralFluxm,8-SpectralRolloffm,9-MFCCs1m,...,63-ChromaVector8std,64-ChromaVector9std,65-ChromaVector10std,66-ChromaVector11std,67-ChromaVector12std,68-ChromaDeviationstd,69-BPM,70-BPMconf,71-BPMessentia,class
0,0,0.13644,0.088861,3.201201,0.262825,0.249212,1.114423,0.007003,0.256682,-22.723259,...,0.003431,0.004981,0.010818,0.024001,0.005201,0.015056,133.333333,0.132792,128.0,BigRoom
1,1,0.117039,0.108389,3.194001,0.247657,0.250288,1.065668,0.005387,0.199821,-21.775871,...,0.004461,0.006441,0.007469,0.015499,0.005589,0.019339,120.0,0.112767,126.0,BigRoom
2,2,0.085308,0.128525,3.123837,0.217205,0.228652,0.789647,0.008247,0.156822,-22.472722,...,0.001529,0.004556,0.007723,0.017482,0.002901,0.022201,133.333333,0.123373,129.0,BigRoom
3,3,0.10305,0.167042,3.15083,0.233593,0.245032,0.967082,0.006571,0.168083,-21.470751,...,0.001591,0.003514,0.009477,0.023162,0.004165,0.015379,133.333333,0.158876,129.0,BigRoom
4,4,0.15173,0.148405,3.194498,0.29373,0.267231,1.353005,0.003872,0.292055,-21.371157,...,0.003945,0.004131,0.01133,0.028188,0.002639,0.019079,133.333333,0.190708,129.0,BigRoom


In [4]:
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- 1-ZCRm: double (nullable = true)
 |-- 2-Energym: double (nullable = true)
 |-- 3-EnergyEntropym: double (nullable = true)
 |-- 4-SpectralCentroidm: double (nullable = true)
 |-- 5-SpectralSpreadm: double (nullable = true)
 |-- 6-SpectralEntropym: double (nullable = true)
 |-- 7-SpectralFluxm: double (nullable = true)
 |-- 8-SpectralRolloffm: double (nullable = true)
 |-- 9-MFCCs1m: double (nullable = true)
 |-- 10-MFCCs2m: double (nullable = true)
 |-- 11-MFCCs3m: double (nullable = true)
 |-- 12-MFCCs4m: double (nullable = true)
 |-- 13-MFCCs5m: double (nullable = true)
 |-- 14-MFCCs6m: double (nullable = true)
 |-- 15-MFCCs7m: double (nullable = true)
 |-- 16-MFCCs8m: double (nullable = true)
 |-- 17-MFCCs9m: double (nullable = true)
 |-- 18-MFCCs10m: double (nullable = true)
 |-- 19-MFCCs11m: double (nullable = true)
 |-- 20-MFCCs12m: double (nullable = true)
 |-- 21-MFCCs13m: double (nullable = true)
 |-- 22-ChromaVector1m: double (null

In [5]:
# Checking for missing values
df.toPandas()['class'].isnull().sum()

0

In [6]:
df = df.drop('_c0')
cols = df.columns

In [7]:
### Check number of classes
df.groupBy("class").count().orderBy(col("count").desc()).show(100, truncate = False)

+--------------------+-----+
|class               |count|
+--------------------+-----+
|PsyTrance           |100  |
|HardDance           |100  |
|Breaks              |100  |
|HardcoreHardTechno  |100  |
|IndieDanceNuDisco   |100  |
|Trance              |100  |
|DeepHouse           |100  |
|ElectronicaDowntempo|100  |
|ReggaeDub           |100  |
|Minimal             |100  |
|DrumAndBass         |100  |
|Dubstep             |100  |
|BigRoom             |100  |
|Techno              |100  |
|House               |100  |
|FutureHouse         |100  |
|ElectroHouse        |100  |
|GlitchHop           |100  |
|TechHouse           |100  |
|HipHop              |100  |
|FunkRAndB           |100  |
|Dance               |100  |
|ProgressiveHouse    |100  |
+--------------------+-----+



### Summary statistics for numeric variables

In [8]:
numeric_features = [t[0] for t in df.dtypes if t[1] != 'string']
df.select(numeric_features).describe().toPandas()

Unnamed: 0,summary,1-ZCRm,2-Energym,3-EnergyEntropym,4-SpectralCentroidm,5-SpectralSpreadm,6-SpectralEntropym,7-SpectralFluxm,8-SpectralRolloffm,9-MFCCs1m,...,62-ChromaVector7std,63-ChromaVector8std,64-ChromaVector9std,65-ChromaVector10std,66-ChromaVector11std,67-ChromaVector12std,68-ChromaDeviationstd,69-BPM,70-BPMconf,71-BPMessentia
0,count,2300.0,2300.0,2300.0,2300.0,2300.0,2300.0,2300.0,2300.0,2300.0,...,2300.0,2300.0,2300.0,2300.0,2300.0,2300.0,2300.0,2300.0,2300.0,2300.0
1,mean,0.1077356797961763,0.1042517446633393,3.110511156666804,0.2355830553133991,0.2414067472617003,0.9061372135071644,0.0107047924493302,0.190545112261208,-23.166806366416047,...,0.0675402742480544,0.0026319920322406,0.0049189587369445,0.0099167646074471,0.0306164108031303,0.0041410536191705,0.0198524652157839,148.22553723922272,0.1693102908973245,120.94869565217392
2,stddev,0.0371050389930113,0.0409655391882542,0.0709247404620073,0.0435852748119094,0.019559085955778,0.341511873072279,0.005386503420646,0.0801685800452624,1.1774170450040735,...,0.0204573543367274,0.0018063503004847,0.0026641147895572,0.0045582124506912,0.0115720378103814,0.0024919863674215,0.0044025429666662,46.6411599307751,0.056806153016806,17.10192490974369
3,min,0.017122317477,0.00494799624461,2.74610482676,0.0826846383739,0.151737604308,0.0341590993706,0.0026894786818,0.0118423440634,-30.3789543716,...,0.017596577864,0.000161595871092,0.000499560419907,0.00216876494351,0.00417520159112,0.000277091481056,0.00553176321265,63.1578947368,0.0733369683751,61.0
4,max,0.246658826058,0.281533479209,3.25415266086,0.373731260787,0.300685364935,1.95884041443,0.0527354211261,0.483744905594,-20.1291999839,...,0.143269579101,0.0271224146296,0.027724271782,0.0478188970252,0.0961281431734,0.0287388910107,0.0407359233789,600.0,0.4229915668,188.0


### Preparing Data for Machine Learning

In [9]:
categoricalColumns = []
stages = []

for categoricalCol in categoricalColumns:
    stringIndexer = StringIndexer(inputCol = categoricalCol, outputCol = categoricalCol + 'Index')
    encoder = OneHotEncoderEstimator(inputCols=[stringIndexer.getOutputCol()], outputCols=[categoricalCol + "classVec"])
    stages += [stringIndexer, encoder]
    
label_stringIdx = StringIndexer(inputCol = 'class', outputCol = 'label')
stages += [label_stringIdx]

numericCols = numeric_features

assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols

assembler = VectorAssembler(inputCols = assemblerInputs, outputCol = "features")
stages += [assembler]

### Pipeline

In [10]:
pipeline = Pipeline(stages = stages)

pipelineModel = pipeline.fit(df)
df = pipelineModel.transform(df)

selectedCols = ['label', 'features'] + cols
df = df.select(selectedCols)

df.printSchema()

root
 |-- label: double (nullable = false)
 |-- features: vector (nullable = true)
 |-- 1-ZCRm: double (nullable = true)
 |-- 2-Energym: double (nullable = true)
 |-- 3-EnergyEntropym: double (nullable = true)
 |-- 4-SpectralCentroidm: double (nullable = true)
 |-- 5-SpectralSpreadm: double (nullable = true)
 |-- 6-SpectralEntropym: double (nullable = true)
 |-- 7-SpectralFluxm: double (nullable = true)
 |-- 8-SpectralRolloffm: double (nullable = true)
 |-- 9-MFCCs1m: double (nullable = true)
 |-- 10-MFCCs2m: double (nullable = true)
 |-- 11-MFCCs3m: double (nullable = true)
 |-- 12-MFCCs4m: double (nullable = true)
 |-- 13-MFCCs5m: double (nullable = true)
 |-- 14-MFCCs6m: double (nullable = true)
 |-- 15-MFCCs7m: double (nullable = true)
 |-- 16-MFCCs8m: double (nullable = true)
 |-- 17-MFCCs9m: double (nullable = true)
 |-- 18-MFCCs10m: double (nullable = true)
 |-- 19-MFCCs11m: double (nullable = true)
 |-- 20-MFCCs12m: double (nullable = true)
 |-- 21-MFCCs13m: double (nullable = 

In [11]:
fin_output = df.select("label", "features", "class")
fin_output.show(10)

+-----+--------------------+-------+
|label|            features|  class|
+-----+--------------------+-------+
|  0.0|[0.136439587512,0...|BigRoom|
|  0.0|[0.117038518483,0...|BigRoom|
|  0.0|[0.0853077737447,...|BigRoom|
|  0.0|[0.103049917216,0...|BigRoom|
|  0.0|[0.151729948738,0...|BigRoom|
|  0.0|[0.127046737192,0...|BigRoom|
|  0.0|[0.123395302003,0...|BigRoom|
|  0.0|[0.140027382431,0...|BigRoom|
|  0.0|[0.117635200751,0...|BigRoom|
|  0.0|[0.137400181488,0...|BigRoom|
+-----+--------------------+-------+
only showing top 10 rows



### Randomly split data into train and test sets

In [12]:
train, test = df.randomSplit([0.7, 0.3], seed = 20)

print("Training Dataset Count: " + str(train.count()))
print("Test Dataset Count: " + str(test.count()))

Training Dataset Count: 1608
Test Dataset Count: 692


### Logistic Regression

In [13]:
lr = LogisticRegression(featuresCol = 'features', labelCol = 'label', maxIter = 10, )
lrModel = lr.fit(train)

#### Make predictions on the test set

In [14]:
predictions = lrModel.transform(test)
predictions.select('label', 'features', 'prediction').show(10)

+-----+--------------------+----------+
|label|            features|prediction|
+-----+--------------------+----------+
|  0.0|[0.0786813648231,...|       9.0|
|  0.0|[0.0992425653517,...|       6.0|
|  0.0|[0.103385431592,0...|       0.0|
|  0.0|[0.103571258,0.16...|       5.0|
|  0.0|[0.113504612825,0...|       9.0|
|  0.0|[0.113933557806,0...|      22.0|
|  0.0|[0.11784864839,0....|       6.0|
|  0.0|[0.122947412201,0...|       6.0|
|  0.0|[0.123181356863,0...|       6.0|
|  0.0|[0.123851231413,0...|       6.0|
+-----+--------------------+----------+
only showing top 10 rows



#### Evaluate our Logistic Regression model

In [15]:
evaluator = MulticlassClassificationEvaluator(labelCol = 'label', predictionCol = 'prediction', metricName = 'accuracy')

accuracy = evaluator.evaluate(predictions)
print("Accuracy of Logistic Regression: %g"%(accuracy))
print("Test Error of Logistic Regression: %g "%(1.0 - accuracy))

Accuracy of Logistic Regression: 0.452312
Test Error of Logistic Regression: 0.547688 


### Decison Tree Classifier

In [16]:
dt = DecisionTreeClassifier(featuresCol = 'features', labelCol = 'label', maxDepth = 3)
dtModel = dt.fit(train)

#### Make predictions on the test set

In [17]:
predictions = dtModel.transform(test)
predictions.select('label', 'features', 'prediction').show(10)

+-----+--------------------+----------+
|label|            features|prediction|
+-----+--------------------+----------+
|  0.0|[0.0786813648231,...|      14.0|
|  0.0|[0.0992425653517,...|       0.0|
|  0.0|[0.103385431592,0...|       0.0|
|  0.0|[0.103571258,0.16...|       0.0|
|  0.0|[0.113504612825,0...|       0.0|
|  0.0|[0.113933557806,0...|       0.0|
|  0.0|[0.11784864839,0....|       0.0|
|  0.0|[0.122947412201,0...|       0.0|
|  0.0|[0.123181356863,0...|      22.0|
|  0.0|[0.123851231413,0...|       0.0|
+-----+--------------------+----------+
only showing top 10 rows



#### Evaluate our Decison Tree Classifier

In [18]:
accuracy = evaluator.evaluate(predictions)
print("Accuracy of Decision Tree: %g"%(accuracy))
print("Test Error of Decision Tree: %g "%(1.0 - accuracy))

Accuracy of Decision Tree: 0.223988
Test Error of Decision Tree: 0.776012 


### Random Forest Model

In [19]:
rf = RandomForestClassifier(featuresCol = 'features', labelCol = 'label', numTrees = 50, maxDepth = 10)
rfModel = rf.fit(train)

#### Make predictions on the test set

In [20]:
predictions = rfModel.transform(test)
predictions.select('label', 'features', 'prediction').show(10)

+-----+--------------------+----------+
|label|            features|prediction|
+-----+--------------------+----------+
|  0.0|[0.0786813648231,...|      17.0|
|  0.0|[0.0992425653517,...|      21.0|
|  0.0|[0.103385431592,0...|       0.0|
|  0.0|[0.103571258,0.16...|       6.0|
|  0.0|[0.113504612825,0...|       0.0|
|  0.0|[0.113933557806,0...|      22.0|
|  0.0|[0.11784864839,0....|       0.0|
|  0.0|[0.122947412201,0...|       0.0|
|  0.0|[0.123181356863,0...|       0.0|
|  0.0|[0.123851231413,0...|       6.0|
+-----+--------------------+----------+
only showing top 10 rows



#### Evaluate our Random Forest Model

In [21]:
accuracy = evaluator.evaluate(predictions)
print("Accuracy of Random Forest: %g"%(accuracy))
print("Test Error of Random Forest: %g "%(1.0 - accuracy))

Accuracy of Random Forest: 0.498555
Test Error of Random Forest: 0.501445 


### One-vs-Rest

In [22]:
ovr = OneVsRest(featuresCol = 'features', labelCol = 'label', classifier = lr)
ovrModel = ovr.fit(train)

#### Make predictions on the test set

In [23]:
predictions = ovrModel.transform(test)
predictions.select('label', 'features', 'prediction', 'class').show(10)

+-----+--------------------+----------+-------+
|label|            features|prediction|  class|
+-----+--------------------+----------+-------+
|  0.0|[0.0786813648231,...|       9.0|BigRoom|
|  0.0|[0.0992425653517,...|       6.0|BigRoom|
|  0.0|[0.103385431592,0...|      12.0|BigRoom|
|  0.0|[0.103571258,0.16...|       6.0|BigRoom|
|  0.0|[0.113504612825,0...|       9.0|BigRoom|
|  0.0|[0.113933557806,0...|       0.0|BigRoom|
|  0.0|[0.11784864839,0....|       6.0|BigRoom|
|  0.0|[0.122947412201,0...|       0.0|BigRoom|
|  0.0|[0.123181356863,0...|       6.0|BigRoom|
|  0.0|[0.123851231413,0...|      14.0|BigRoom|
+-----+--------------------+----------+-------+
only showing top 10 rows



#### Evaluate our OvR Model

In [24]:
accuracy = evaluator.evaluate(predictions)
print("Accuracy of OnevsRest: %g"%(accuracy))
print("Test Error of OnevsRest: %g "%(1.0 - accuracy))

Accuracy of OnevsRest: 0.445087
Test Error of OnevsRest: 0.554913 


### Cross Validation

In [25]:
classifier = RandomForestClassifier()

paramGrid = (ParamGridBuilder()
             .addGrid(classifier.maxDepth, [2, 5, 10, 20, 30])
             .addGrid(classifier.maxBins, [10, 20, 40, 50])
             .addGrid(classifier.numTrees, [10, 25, 50])
             .build())

# Cross Val score set up with all parameters
cv = CrossValidator(estimator = classifier, estimatorParamMaps = paramGrid, evaluator = evaluator, numFolds = 3)

# Then fit the model
cvModel = cv.fit(train)

# Collect the best Model
BestModel = cvModel.bestModel

# Generate predictions
predictions = BestModel.transform(test)

# Evaluate the model
accuracy = evaluator.evaluate(predictions)
print("Accuracy of Cross Validation: %g"%(accuracy))
print("Test Error of Cross Validation: %g "%(1.0 - accuracy))

Accuracy of Cross Validation: 0.49711
Test Error of Cross Validation: 0.50289 


In [26]:
predictions.select('label', 'features', 'prediction').show(10)

+-----+--------------------+----------+
|label|            features|prediction|
+-----+--------------------+----------+
|  0.0|[0.0786813648231,...|       2.0|
|  0.0|[0.0992425653517,...|       0.0|
|  0.0|[0.103385431592,0...|       0.0|
|  0.0|[0.103571258,0.16...|       6.0|
|  0.0|[0.113504612825,0...|       0.0|
|  0.0|[0.113933557806,0...|       0.0|
|  0.0|[0.11784864839,0....|       0.0|
|  0.0|[0.122947412201,0...|       0.0|
|  0.0|[0.123181356863,0...|       0.0|
|  0.0|[0.123851231413,0...|       9.0|
+-----+--------------------+----------+
only showing top 10 rows

