# ELECTRONIC MUSIC GENRE CLASSIFICATION

### Create PySpark instance

In [1]:
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession

# Reuse an already running session if there is one, otherwise start a new one.
spark = SparkSession.builder.appName("Multiclass Classification").getOrCreate()

# defaultParallelism is the public SparkContext setting for the default number
# of parallel tasks. NOTE(review): in local mode this should match the cores
# given to the master (local[N] / local[*]); on a cluster it is derived from the
# executors' cores - confirm for your deployment.
cores = spark.sparkContext.defaultParallelism
print("You are working with", cores, "core(s)")
spark

You are working with 1 core(s)


### Import the dataset and other libraries

In [12]:
# Dataset
# Read the CSV with a header row and let Spark infer the column types.
df = spark.read.csv('beatsdataset.csv', inferSchema = True, header = True)

# Libraries
import pandas as pd

from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler, MinMaxScaler, RegexTokenizer, StopWordsRemover, CountVectorizer

# NOTE: the wildcard imports below put every exported name into the notebook
# namespace. For pyspark.sql.functions this replaces Python builtins such as
# min, max, sum, abs and round with their Spark column counterparts, and code
# later in this notebook calls several of these names unqualified.
from pyspark.sql.types import * 
from pyspark.sql.functions import *

from pyspark.ml.classification import *

from pyspark.ml.evaluation import *

from pyspark.ml import Pipeline

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

### Data Preparation

In [3]:
# Pull the first five rows to the driver and wrap them in a pandas DataFrame
# (with the Spark column names) so the notebook renders them as a table.
pd.DataFrame(df.take(5), columns=df.columns)

Unnamed: 0,_c0,1-ZCRm,2-Energym,3-EnergyEntropym,4-SpectralCentroidm,5-SpectralSpreadm,6-SpectralEntropym,7-SpectralFluxm,8-SpectralRolloffm,9-MFCCs1m,...,63-ChromaVector8std,64-ChromaVector9std,65-ChromaVector10std,66-ChromaVector11std,67-ChromaVector12std,68-ChromaDeviationstd,69-BPM,70-BPMconf,71-BPMessentia,class
0,0,0.13644,0.088861,3.201201,0.262825,0.249212,1.114423,0.007003,0.256682,-22.723259,...,0.003431,0.004981,0.010818,0.024001,0.005201,0.015056,133.333333,0.132792,128.0,BigRoom
1,1,0.117039,0.108389,3.194001,0.247657,0.250288,1.065668,0.005387,0.199821,-21.775871,...,0.004461,0.006441,0.007469,0.015499,0.005589,0.019339,120.0,0.112767,126.0,BigRoom
2,2,0.085308,0.128525,3.123837,0.217205,0.228652,0.789647,0.008247,0.156822,-22.472722,...,0.001529,0.004556,0.007723,0.017482,0.002901,0.022201,133.333333,0.123373,129.0,BigRoom
3,3,0.10305,0.167042,3.15083,0.233593,0.245032,0.967082,0.006571,0.168083,-21.470751,...,0.001591,0.003514,0.009477,0.023162,0.004165,0.015379,133.333333,0.158876,129.0,BigRoom
4,4,0.15173,0.148405,3.194498,0.29373,0.267231,1.353005,0.003872,0.292055,-21.371157,...,0.003945,0.004131,0.01133,0.028188,0.002639,0.019079,133.333333,0.190708,129.0,BigRoom


In [4]:
# Show the column names and the types inferred while reading the CSV.
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- 1-ZCRm: double (nullable = true)
 |-- 2-Energym: double (nullable = true)
 |-- 3-EnergyEntropym: double (nullable = true)
 |-- 4-SpectralCentroidm: double (nullable = true)
 |-- 5-SpectralSpreadm: double (nullable = true)
 |-- 6-SpectralEntropym: double (nullable = true)
 |-- 7-SpectralFluxm: double (nullable = true)
 |-- 8-SpectralRolloffm: double (nullable = true)
 |-- 9-MFCCs1m: double (nullable = true)
 |-- 10-MFCCs2m: double (nullable = true)
 |-- 11-MFCCs3m: double (nullable = true)
 |-- 12-MFCCs4m: double (nullable = true)
 |-- 13-MFCCs5m: double (nullable = true)
 |-- 14-MFCCs6m: double (nullable = true)
 |-- 15-MFCCs7m: double (nullable = true)
 |-- 16-MFCCs8m: double (nullable = true)
 |-- 17-MFCCs9m: double (nullable = true)
 |-- 18-MFCCs10m: double (nullable = true)
 |-- 19-MFCCs11m: double (nullable = true)
 |-- 20-MFCCs12m: double (nullable = true)
 |-- 21-MFCCs13m: double (nullable = true)
 |-- 22-ChromaVector1m: double (null

In [5]:
# The drop below is left commented out, so the `_c0` index column stays in df.
# The feature list built later in this notebook skips the first and last
# columns (input_columns[1:-1]), which keeps `_c0` out of the model inputs.
# df = df.drop('_c0')
# cols = df.columns

In [6]:
# Count the rows per class: shows how many classes there are and whether
# they are balanced (up to 100 rows, class names not truncated).
df.groupBy("class").count().show(100, truncate = False)

+--------------------+-----+
|class               |count|
+--------------------+-----+
|PsyTrance           |100  |
|HardDance           |100  |
|Breaks              |100  |
|HardcoreHardTechno  |100  |
|IndieDanceNuDisco   |100  |
|Trance              |100  |
|DeepHouse           |100  |
|ElectronicaDowntempo|100  |
|ReggaeDub           |100  |
|Minimal             |100  |
|DrumAndBass         |100  |
|Dubstep             |100  |
|BigRoom             |100  |
|Techno              |100  |
|House               |100  |
|FutureHouse         |100  |
|ElectroHouse        |100  |
|GlitchHop           |100  |
|TechHouse           |100  |
|HipHop              |100  |
|FunkRAndB           |100  |
|Dance               |100  |
|ProgressiveHouse    |100  |
+--------------------+-----+



In [7]:
# Data Prep function
def MLClassifierDFPrep(df, input_columns, dependent_var, treat_outliers = True, treat_neg_values = True, quantile_error = 0.25):
    """Turn a raw Spark DataFrame into a (features, label) frame for pyspark.ml classifiers.

    Steps:
      1. Index ``dependent_var`` into a numeric ``label`` column with StringIndexer.
      2. Index every string-typed input column into ``<name>_num``.
      3. Optionally floor/cap numeric inputs at their approximate 1st/99th
         percentiles and apply log(x + 1) (skew > 1) or exp(x) (skew < -1).
      4. Warn when the numeric inputs still contain negative values.
      5. Assemble all inputs into a ``features`` vector and optionally rescale
         it with MinMaxScaler.

    Parameters
    ----------
    df : Spark DataFrame holding the input columns and the dependent variable.
    input_columns : list of column names to use as features.
    dependent_var : name of the column holding the class to predict.
    treat_outliers : when truthy, apply the floor/cap + skewness correction.
    treat_neg_values : when truthy, rescale the assembled feature vector with
        MinMaxScaler (default output range) so no negative values remain.
    quantile_error : relative error passed to ``approxQuantile``. The default
        (0.25) keeps the previous behaviour; it is very coarse for the 1%/99%
        bounds, so pass a smaller value for tighter caps at a higher cost.

    Returns
    -------
    Spark DataFrame with the columns ``label`` and ``features`` (in the order
    ``features, label`` when ``treat_neg_values`` is falsy).
    """
    # The notebook's wildcard import of pyspark.sql.functions replaces the
    # builtin min(); import explicitly so both versions are unambiguous here.
    import builtins
    from pyspark.sql import functions as F
    from pyspark.sql.types import StringType

    # PySpark expects a zero-indexed numeric label column. Cast the dependent
    # variable to string and let StringIndexer assign the indices.
    renamed = df.withColumn("label_str", df[dependent_var].cast(StringType()))
    indexer = StringIndexer(inputCol = "label_str", outputCol = "label")
    indexed = indexer.fit(renamed).transform(renamed)
    # show() prints the table itself and returns None, so it is not wrapped in print().
    indexed.groupBy(dependent_var, "label").count().show(100)

    # String inputs are indexed to numbers; everything else is used as-is.
    # isinstance is used because the string form of a Spark data type is not
    # the same in every PySpark version.
    numeric_inputs = []
    string_inputs = []
    for column in input_columns:
        if isinstance(indexed.schema[column].dataType, StringType):
            new_col_name = column+"_num"
            indexer = StringIndexer(inputCol=column, outputCol=new_col_name)
            indexed = indexer.fit(indexed).transform(indexed)
            string_inputs.append(new_col_name)
        else:
            numeric_inputs.append(column)

    if treat_outliers:
        print("We are correcting for non normality now!")
        if numeric_inputs:
            # Approximate 1st / 99th percentile of every numeric input.
            d = {c: indexed.approxQuantile(c, [0.01, 0.99], quantile_error) for c in numeric_inputs}
            # All skewness values in one aggregation. Each column is transformed
            # at most once below, and only after its own skewness was read.
            skew_row = indexed.agg(*[F.skewness(indexed[c]) for c in numeric_inputs]).first()

            for c, skew in zip(numeric_inputs, skew_row):
                limits = d[c]
                # Skip columns without usable statistics instead of failing on them.
                if skew is None or len(limits) < 2:
                    continue
                floor_value, cap_value = limits[0], limits[1]
                # Floor and cap the column at the two bounds.
                clipped = (F.when(indexed[c] < floor_value, floor_value)
                            .when(indexed[c] > cap_value, cap_value)
                            .otherwise(indexed[c]))
                if skew > 1:
                    # log(x + 1): the +1 keeps zero values defined.
                    indexed = indexed.withColumn(c, F.log(clipped + 1))
                    print(c+" has been treated for positive (right) skewness. (skew =",skew,")")
                elif skew < -1:
                    indexed = indexed.withColumn(c, F.exp(clipped))
                    print(c+" has been treated for negative (left) skewness. (skew =",skew,")")

    # Smallest value across the numeric inputs that will actually be assembled.
    # Indexed string columns are never negative, so they are not checked.
    df_minimum = None
    if numeric_inputs:
        min_row = indexed.agg(*[F.min(indexed[c]) for c in numeric_inputs]).first()
        observed = [m for m in min_row if m is not None]
        if observed:
            df_minimum = builtins.min(observed)

    features_list = numeric_inputs + string_inputs
    assembler = VectorAssembler(inputCols=features_list,outputCol='features')
    output = assembler.transform(indexed).select('features','label')

    # Naive Bayes cannot handle negative feature values: warn about them.
    if df_minimum is not None and df_minimum < 0:
        print(" ")
        print("WARNING: The Naive Bayes Classifier will not be able to process your dataframe as it contains negative values")
        print(" ")

    if treat_neg_values:
        print("You have opted to correct that by rescaling all your features to a range of 0 to 1")
        print(" ")
        print("We are rescaling you dataframe....")
        scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")

        # Compute summary statistics and generate the MinMaxScalerModel.
        scalerModel = scaler.fit(output)

        # Rescale each feature, then expose the result under the usual column name.
        scaled_data = scalerModel.transform(output)
        final_data = scaled_data.select('label','scaledFeatures')
        final_data = final_data.withColumnRenamed('scaledFeatures','features')
        print("Done!")

    else:
        print("You have opted not to correct that therefore you will not be able to use to Naive Bayes classifier")
        print("We will return the dataframe unscaled.")
        final_data = output

    return final_data

In [8]:
def ClassTrainEval(classifier,features,classes,folds,train,test,feature_names=None):
    """Train one classifier, print its diagnostics and return its test accuracy.

    Parameters
    ----------
    classifier : an instance of a pyspark.ml classifier. Only its type and
        (for the cross-validated models) its params are used; OneVsRest and
        MultilayerPerceptronClassifier are re-instantiated internally.
    features : sequence of rows whose first field is a feature vector; only
        ``features[0][0]`` is read, to size the perceptron's input layer.
    classes : number of distinct classes in the label column.
    folds : number of cross-validation folds.
    train, test : Spark DataFrames with ``features`` and ``label`` columns.
    feature_names : optional list of names, one per position of the feature
        vector, used to label the printed coefficient / importance tables.
        Defaults to the notebook-level ``input_columns``; pass it explicitly
        when the feature vector was reduced or reordered.

    Returns
    -------
    A one-row Spark DataFrame with the columns ``Classifier`` and ``Result``.
    ``Result`` is the test accuracy in percent, cut to five characters, or
    "N/A" when the classifier could not be trained.

    Side effects
    ------------
    Prints diagnostics and stores the fitted model (and, where available, its
    coefficients / feature importances) in notebook-level globals.
    """
    global OVR_BestModel, MLPC_BestModel
    global DT_featureimportances, DT_BestModel
    global GBT_featureimportances, GBT_BestModel
    global RF_featureimportances, RF_BestModel
    global LR_coefficients, LR_BestModel
    global LSVC_coefficients, LSVC_BestModel

    BOLD, RESET = '\033[1m', '\033[0m'
    Mtype = type(classifier).__name__
    binary_only = ("LinearSVC", "GBTClassifier")
    tree_models = ("DecisionTreeClassifier", "GBTClassifier", "RandomForestClassifier")

    def show_ranked(values, value_col):
        # Pair every value with its feature name and show the table sorted from
        # the largest value down. show() prints the table itself.
        names = feature_names if feature_names is not None else input_columns
        scores = [float(v) for v in values]  # numpy floats -> plain floats
        table = spark.createDataFrame(list(zip(names, scores)), schema=['feature', value_col])
        table.orderBy(table[value_col].desc()).show(truncate=False)

    def cross_validate(estimator, grid):
        # Cross-validate `estimator` over `grid` and return the fitted CrossValidatorModel.
        crossval = CrossValidator(estimator=estimator,
                                  estimatorParamMaps=grid,
                                  evaluator=MulticlassClassificationEvaluator(),
                                  numFolds=folds)
        return crossval.fit(train)

    def fit_model():
        # Returns the fitted model, or None when this classifier cannot be trained.
        if Mtype == "OneVsRest":
            # One logistic regression per class, tuned on its regularisation.
            lr = LogisticRegression()
            ovr = OneVsRest(classifier=lr)
            grid = ParamGridBuilder().addGrid(lr.regParam, [0.1, 0.01]).build()
            return cross_validate(ovr, grid)

        if Mtype == "MultilayerPerceptronClassifier":
            # Input layer sized to the feature vector, two hidden layers
            # (features+1 and features) and one output node per class.
            # This model is fitted directly, without cross-validation.
            features_count = len(features[0][0])
            layers = [features_count, features_count+1, features_count, classes]
            mlp = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)
            return mlp.fit(train)

        if Mtype in binary_only and classes != 2:
            print(Mtype," could not be used because PySpark currently only accepts binary classification data for this algorithm")
            return None

        # Parameter grid per classifier type - add parameters of your choice here.
        # Lambdas keep the param lookups lazy, so only the matching type is touched.
        grids = {
            "LogisticRegression": lambda b: b.addGrid(classifier.maxIter, [10, 15, 20]),
            "NaiveBayes": lambda b: b.addGrid(classifier.smoothing, [0.0, 0.2, 0.4, 0.6]),
            "RandomForestClassifier": lambda b: b.addGrid(classifier.maxDepth, [2, 5, 10]),
            "GBTClassifier": lambda b: b.addGrid(classifier.maxIter, [10, 15, 50, 100]),
            "LinearSVC": lambda b: b.addGrid(classifier.maxIter, [10, 15])
                                    .addGrid(classifier.regParam, [0.1, 0.01]),
            "DecisionTreeClassifier": lambda b: b.addGrid(classifier.maxBins, [10, 20, 40, 80, 100]),
        }
        if Mtype not in grids:
            print(Mtype," is not supported by ClassTrainEval and was skipped")
            return None
        return cross_validate(classifier, grids[Mtype](ParamGridBuilder()).build())

    fitModel = fit_model()

    # Print model diagnostics and publish the fitted models as globals.
    if fitModel is not None:

        if Mtype == "OneVsRest":
            BestModel = fitModel.bestModel
            OVR_BestModel = BestModel
            print(" ")
            print(BOLD + Mtype + RESET)
            # One binary model per class.
            for model in BestModel.models:
                print(BOLD + 'Intercept: '+ RESET,model.intercept)
                print(BOLD + 'Top 20 Coefficients:'+ RESET)
                show_ranked(model.coefficients.toArray(), 'coeff')

        if Mtype == "MultilayerPerceptronClassifier":
            print("")
            print(BOLD + Mtype + RESET)
            print(BOLD + "Model Weights: "+ RESET,fitModel.weights.size)
            print("")
            MLPC_BestModel = fitModel

        if Mtype in tree_models:
            BestModel = fitModel.bestModel
            print(" ")
            print(BOLD + Mtype," Top 20 Feature Importances"+ RESET)
            print("(Scores add up to 1)")
            print("Lowest score is the least important")
            print(" ")
            show_ranked(BestModel.featureImportances.toArray(), 'score')

            # Save the feature importance values and the models.
            if Mtype == "DecisionTreeClassifier":
                DT_featureimportances = BestModel.featureImportances.toArray()
                DT_BestModel = BestModel
            if Mtype == "GBTClassifier":
                GBT_featureimportances = BestModel.featureImportances.toArray()
                GBT_BestModel = BestModel
            if Mtype == "RandomForestClassifier":
                RF_featureimportances = BestModel.featureImportances.toArray()
                RF_BestModel = BestModel

        if Mtype == "LogisticRegression":
            BestModel = fitModel.bestModel
            print(" ")
            print(BOLD + Mtype + RESET)
            print("Intercept: " + str(BestModel.interceptVector))
            print(BOLD + " Top 20 Coefficients"+ RESET)
            print("You should compares these relative to eachother")
            # Only the first row of the coefficient matrix is displayed;
            # the full matrix is kept in LR_coefficients.
            show_ranked(BestModel.coefficientMatrix.toArray()[0], 'coeff')
            LR_coefficients = BestModel.coefficientMatrix.toArray()
            LR_BestModel = BestModel

        if Mtype == "LinearSVC":
            BestModel = fitModel.bestModel
            print(" ")
            print(BOLD + Mtype + RESET)
            print("Intercept: " + str(BestModel.intercept))
            print(BOLD + "Top 20 Coefficients"+ RESET)
            print("You should compares these relative to eachother")
            show_ranked(BestModel.coefficients.toArray(), 'coeff')
            LSVC_coefficients = BestModel.coefficients.toArray()
            LSVC_BestModel = BestModel

    # Column names match the external results dataframe this row is unioned with.
    columns = ['Classifier', 'Result']

    if fitModel is None:
        # Nothing was trained (binary-only or unsupported classifier).
        return spark.createDataFrame([(Mtype, "N/A")], schema=columns)

    predictions = fitModel.transform(test)
    MC_evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
    accuracy = MC_evaluator.evaluate(predictions) * 100
    result = spark.createDataFrame([(Mtype, str(accuracy))], schema=columns)
    # Keep only the first characters of the accuracy string for display.
    return result.withColumn('Result', result.Result.substr(0, 5))
    #Also returns the fit model important scores or p values

In [9]:
# Set up independent and dependent variables.
# Features are every column except the first (row index) and the last (class).
input_columns = df.columns[1:-1]
dependent_var = 'class'

# Number of distinct classes; used later to tell binary from multiclass problems.
# The column is taken from dependent_var so the two cannot drift apart.
class_count = df.select(countDistinct(dependent_var)).collect()
classes = class_count[0][0]

In [10]:
# Experiment 1: features used as-is (no outlier treatment, no rescaling).
test1_data = MLClassifierDFPrep(df,input_columns,dependent_var,treat_outliers=False,treat_neg_values=False)
# Preview a few prepared rows.
test1_data.show(5)

# Naive Bayes is left out: it cannot be trained on negative feature values.
classifiers = [
    LogisticRegression(),
    OneVsRest(),
    LinearSVC(),
    RandomForestClassifier(),
    GBTClassifier(),
    DecisionTreeClassifier(),
    MultilayerPerceptronClassifier(),
]

# Seed the split so that re-running this cell on the same data gives the same
# train / test rows.
split_seed = 42
train,test = test1_data.randomSplit([0.7,0.3], seed=split_seed)
# ClassTrainEval only reads features[0][0] (to size the perceptron's input
# layer), so a single row is enough - no need to collect the whole column.
features = test1_data.select(['features']).take(1)
folds = 2 # because we have limited data

# Stack the one-row result of every classifier into a single table.
results = None
for classifier in classifiers:
    new_result = ClassTrainEval(classifier,features,classes,folds,train,test)
    results = new_result if results is None else results.union(new_result)
print("!!!!!Final Results!!!!!!!!")
results.show(100,False)

+--------------------+-----+-----+
|               class|label|count|
+--------------------+-----+-----+
|           ReggaeDub| 19.0|  100|
|           FunkRAndB|  8.0|  100|
|              Trance| 22.0|  100|
|              Techno| 21.0|  100|
|             Dubstep|  5.0|  100|
|   IndieDanceNuDisco| 15.0|  100|
|ElectronicaDowntempo|  7.0|  100|
|               House| 14.0|  100|
|  HardcoreHardTechno| 12.0|  100|
|              HipHop| 13.0|  100|
|           GlitchHop| 10.0|  100|
|         DrumAndBass|  4.0|  100|
|           PsyTrance| 18.0|  100|
|    ProgressiveHouse| 17.0|  100|
|        ElectroHouse|  6.0|  100|
|           HardDance| 11.0|  100|
|             Minimal| 16.0|  100|
|           TechHouse| 20.0|  100|
|           DeepHouse|  3.0|  100|
|             BigRoom|  0.0|  100|
|         FutureHouse|  9.0|  100|
|               Dance|  2.0|  100|
|              Breaks|  1.0|  100|
+--------------------+-----+-----+

None
 
 
You have opted not to correct that therefore 

+---------------------+------------------+
|feature              |coeff             |
+---------------------+------------------+
|65-ChromaVector10std |34.647110560064256|
|68-ChromaDeviationstd|32.82582395325892 |
|34-ChromaDeviationm  |31.55357634624832 |
|33-ChromaVector12m   |26.41764535271755 |
|25-ChromaVector4m    |23.154334695825334|
|29-ChromaVector8m    |21.556562317510615|
|59-ChromaVector4std  |20.420843052984882|
|67-ChromaVector12std |17.646238637733436|
|26-ChromaVector5m    |11.83492616446538 |
|27-ChromaVector6m    |10.241971045956152|
|36-Energystd         |8.95225190216495  |
|24-ChromaVector3m    |7.394787005334344 |
|53-MFCCs11std        |5.684586333861787 |
|56-ChromaVector1std  |4.587989744810132 |
|4-SpectralCentroidm  |4.255437730686536 |
|8-SpectralRolloffm   |3.96928527476933  |
|50-MFCCs8std         |3.852267559473424 |
|47-MFCCs5std         |3.794082805780173 |
|3-EnergyEntropym     |3.337335012223176 |
|31-ChromaVector10m   |3.2677294379355115|
+----------

+--------------------+------------------+
|feature             |coeff             |
+--------------------+------------------+
|27-ChromaVector6m   |85.59855246988776 |
|63-ChromaVector8std |81.37789632680416 |
|36-Energystd        |17.978688369613106|
|55-MFCCs13std       |14.64493066934984 |
|30-ChromaVector9m   |14.399345057311578|
|34-ChromaDeviationm |12.119874068737984|
|54-MFCCs12std       |8.151565653109316 |
|58-ChromaVector3std |6.521671311941728 |
|60-ChromaVector5std |5.408084598480584 |
|48-MFCCs6std        |4.773425256499594 |
|26-ChromaVector5m   |4.002547465379139 |
|3-EnergyEntropym    |3.839929655196581 |
|24-ChromaVector3m   |3.6103412287313468|
|52-MFCCs10std       |3.3794388935511486|
|23-ChromaVector2m   |3.353371748663237 |
|44-MFCCs2std        |3.2146290572832354|
|18-MFCCs10m         |2.9556027440332517|
|53-MFCCs11std       |2.3823466216036664|
|66-ChromaVector11std|2.032035010818985 |
|47-MFCCs5std        |2.007284988962482 |
+--------------------+------------

+----------------------+------------------+
|feature               |coeff             |
+----------------------+------------------+
|22-ChromaVector1m     |53.069491769609336|
|25-ChromaVector4m     |46.89714685568214 |
|30-ChromaVector9m     |44.02012354841866 |
|29-ChromaVector8m     |32.668664149996886|
|65-ChromaVector10std  |31.31797297722685 |
|41-SpectralFluxstd    |28.616673086657524|
|56-ChromaVector1std   |26.809834692832325|
|23-ChromaVector2m     |15.59257767301562 |
|61-ChromaVector6std   |14.772674315511313|
|38-SpectralCentroidstd|11.178568763800895|
|36-Energystd          |11.120295174779798|
|55-MFCCs13std         |10.274461152425618|
|66-ChromaVector11std  |8.392914961859574 |
|33-ChromaVector12m    |6.783452717212812 |
|59-ChromaVector4std   |6.331613491482039 |
|28-ChromaVector7m     |2.943534266878564 |
|19-MFCCs11m           |2.7836685611712917|
|7-SpectralFluxm       |2.532314099240266 |
|44-MFCCs2std          |2.420876952410232 |
|18-MFCCs10m           |2.066370

In [13]:
# Experiment 2: outlier / skewness treatment plus min-max rescaling.
test2_data = MLClassifierDFPrep(df,input_columns,dependent_var,treat_outliers=True,treat_neg_values=True)
# Preview a few prepared rows.
test2_data.show(5)

# Comment out Naive Bayes if your data still contains negative values.
classifiers = [
    LogisticRegression(),
    OneVsRest(),
    LinearSVC(),
    NaiveBayes(),
    RandomForestClassifier(),
    GBTClassifier(),
    DecisionTreeClassifier(),
    MultilayerPerceptronClassifier(),
]

# Seed the split so that re-running this cell on the same data gives the same
# train / test rows.
split_seed = 42
train,test = test2_data.randomSplit([0.7,0.3], seed=split_seed)
# ClassTrainEval only reads features[0][0] (to size the perceptron's input
# layer), so a single row is enough - no need to collect the whole column.
features = test2_data.select(['features']).take(1)
folds = 2

# Stack the one-row result of every classifier into a single table.
results = None
for classifier in classifiers:
    new_result = ClassTrainEval(classifier,features,classes,folds,train,test)
    results = new_result if results is None else results.union(new_result)
print("!!!!!Final Results!!!!!!!!")
results.show(100,False)

+--------------------+-----+-----+
|               class|label|count|
+--------------------+-----+-----+
|           ReggaeDub| 19.0|  100|
|           FunkRAndB|  8.0|  100|
|              Trance| 22.0|  100|
|              Techno| 21.0|  100|
|             Dubstep|  5.0|  100|
|   IndieDanceNuDisco| 15.0|  100|
|ElectronicaDowntempo|  7.0|  100|
|               House| 14.0|  100|
|  HardcoreHardTechno| 12.0|  100|
|              HipHop| 13.0|  100|
|           GlitchHop| 10.0|  100|
|         DrumAndBass|  4.0|  100|
|           PsyTrance| 18.0|  100|
|    ProgressiveHouse| 17.0|  100|
|        ElectroHouse|  6.0|  100|
|           HardDance| 11.0|  100|
|             Minimal| 16.0|  100|
|           TechHouse| 20.0|  100|
|           DeepHouse|  3.0|  100|
|             BigRoom|  0.0|  100|
|         FutureHouse|  9.0|  100|
|               Dance|  2.0|  100|
|              Breaks|  1.0|  100|
+--------------------+-----+-----+

None
We are correcting for non normality now!
7-Spectr

+----------------------+------------------+
|feature               |coeff             |
+----------------------+------------------+
|70-BPMconf            |2.6962416853416813|
|33-ChromaVector12m    |2.581656322464383 |
|36-Energystd          |2.322746890314752 |
|29-ChromaVector8m     |2.2108453146337075|
|38-SpectralCentroidstd|2.207562541263131 |
|10-MFCCs2m            |2.0850470724278547|
|23-ChromaVector2m     |1.9212703340266206|
|67-ChromaVector12std  |1.7691421000857326|
|55-MFCCs13std         |1.5593698881345828|
|22-ChromaVector1m     |1.481822488537568 |
|58-ChromaVector3std   |1.4773570922095804|
|66-ChromaVector11std  |1.2823118609107678|
|53-MFCCs11std         |1.2533327871691795|
|26-ChromaVector5m     |1.241705246595399 |
|27-ChromaVector6m     |1.141505623616189 |
|11-MFCCs3m            |1.0901710401993545|
|71-BPMessentia        |1.0027016073850852|
|47-MFCCs5std          |0.8784303055437688|
|64-ChromaVector9std   |0.8008194012211628|
|68-ChromaDeviationstd |0.710750

+---------------------+------------------+
|feature              |coeff             |
+---------------------+------------------+
|70-BPMconf           |6.083011397100815 |
|2-Energym            |2.768538130622402 |
|63-ChromaVector8std  |2.4457979839800625|
|67-ChromaVector12std |2.2929391353156783|
|69-BPM               |2.0661587373836303|
|48-MFCCs6std         |1.568700415911433 |
|13-MFCCs5m           |1.538645315663955 |
|52-MFCCs10std        |1.4687791194761275|
|5-SpectralSpreadm    |1.403153179047201 |
|71-BPMessentia       |1.393204156973851 |
|9-MFCCs1m            |1.3908360148682648|
|3-EnergyEntropym     |1.3754441558742398|
|59-ChromaVector4std  |1.173373147466203 |
|28-ChromaVector7m    |1.1584099921336426|
|14-MFCCs6m           |1.116439958404806 |
|16-MFCCs8m           |1.1071618896830557|
|61-ChromaVector6std  |1.0878428573352226|
|46-MFCCs4std         |0.9490549782252797|
|68-ChromaDeviationstd|0.8731555826590988|
|25-ChromaVector4m    |0.8132625198733868|
+----------

+---------------------+------------------+
|feature              |coeff             |
+---------------------+------------------+
|59-ChromaVector4std  |3.5026991871168875|
|47-MFCCs5std         |3.252097440536721 |
|39-SpectralSpreadstd |2.5885122445018833|
|66-ChromaVector11std |2.4952024088096314|
|54-MFCCs12std        |2.302452482915721 |
|55-MFCCs13std        |2.1345601379830668|
|63-ChromaVector8std  |2.066434515297856 |
|45-MFCCs3std         |1.8287817607940067|
|27-ChromaVector6m    |1.7744295024323518|
|37-EnergyEntropystd  |1.7020116245096966|
|15-MFCCs7m           |1.6573914256853783|
|35-ZCRstd            |1.4637837382640204|
|20-MFCCs12m          |1.4518834429113325|
|51-MFCCs9std         |1.4420917061237626|
|69-BPM               |1.3202778503042012|
|34-ChromaDeviationm  |1.284047177714124 |
|12-MFCCs4m           |1.2807797544205095|
|68-ChromaDeviationstd|1.2663490153828694|
|25-ChromaVector4m    |1.1291111653175359|
|67-ChromaVector12std |1.070151464803878 |
+----------

In [14]:
from pyspark.ml.feature import VectorSlicer
from pyspark.ml.feature import ChiSqSelector
from pyspark.ml.linalg import Vectors

classifiers = [OneVsRest()]

# Settings shared by every iteration of the sweep.
folds = 2
split_seed = 42  # seeded so re-running the cell on the same data repeats the split

# Select the top n features (n = 10, 20, ...) and view the results.
maximum = len(input_columns)
for n in range(10,maximum,10):
    print("Testing top n = ",n," features")

    # For Tree classifiers
#     best_n_features = RF_featureimportances.argsort()[-n:][::-1]
#     best_n_features= best_n_features.tolist() # convert to a list
#     vs = VectorSlicer(inputCol="features", outputCol="best_features", indices=best_n_features)
#     bestFeaturesDf = vs.transform(test2_data)

    # For Logistic regression or One vs Rest
    selector = ChiSqSelector(numTopFeatures=n, featuresCol="features",
                             outputCol="selectedFeatures", labelCol="label")
    selector_model = selector.fit(test2_data)

    # Report which columns were kept. The index -> name mapping assumes the
    # feature vector follows input_columns order, which holds while every input
    # column is numeric - verify if string inputs are added.
    # NOTE: the coefficient tables printed by ClassTrainEval pair values with
    # input_columns by position by default, so after selection their "feature"
    # labels are not the selected columns; rely on the list printed here.
    selected_names = [input_columns[i] for i in selector_model.selectedFeatures]
    print("Selected features:", selected_names)

    bestFeaturesDf = (selector_model.transform(test2_data)
                      .select("label","selectedFeatures")
                      .withColumnRenamed("selectedFeatures","features"))

    # ClassTrainEval only reads features[0][0], so one row is enough.
    features = bestFeaturesDf.select(['features']).take(1)

    # Split
    train,test = bestFeaturesDf.randomSplit([0.7,0.3], seed=split_seed)

    # Stack the one-row result of every classifier into a single table.
    results = None
    for classifier in classifiers:
        new_result = ClassTrainEval(classifier,features,classes,folds,train,test)
        results = new_result if results is None else results.union(new_result)
    results.show(100,False)

Testing top n =  10  features
 
[1mOneVsRest[0m
[1mIntercept: [0m 0.2637945579860772
[1mTop 20 Coefficients:[0m
+-------------------+-------------------+
|feature            |coeff              |
+-------------------+-------------------+
|10-MFCCs2m         |1.9821448878262002 |
|8-SpectralRolloffm |0.9059107261293992 |
|3-EnergyEntropym   |0.1836138288127672 |
|2-Energym          |-0.6534049109311282|
|1-ZCRm             |-1.56661149687291  |
|6-SpectralEntropym |-1.9407641613453712|
|7-SpectralFluxm    |-2.3157463015300856|
|4-SpectralCentroidm|-2.5818039215518884|
|9-MFCCs1m          |-2.712070235386066 |
|5-SpectralSpreadm  |-4.962541388156002 |
+-------------------+-------------------+

None
[1mIntercept: [0m -5.007903261581921
[1mTop 20 Coefficients:[0m
+-------------------+--------------------+
|feature            |coeff               |
+-------------------+--------------------+
|9-MFCCs1m          |4.2472607472003725  |
|10-MFCCs2m         |2.5473685276021976  |
|4-Sp

+-------------------+-------------------+
|feature            |coeff              |
+-------------------+-------------------+
|9-MFCCs1m          |3.8858936859284614 |
|3-EnergyEntropym   |1.2989771924441549 |
|8-SpectralRolloffm |0.6584781290860019 |
|5-SpectralSpreadm  |-0.9620786304532241|
|1-ZCRm             |-1.225794521617382 |
|10-MFCCs2m         |-1.966701639800594 |
|6-SpectralEntropym |-1.971053581129235 |
|7-SpectralFluxm    |-2.2198891899828084|
|2-Energym          |-3.5541093700360977|
|4-SpectralCentroidm|-3.838814356354671 |
+-------------------+-------------------+

None
[1mIntercept: [0m -0.1483724914689759
[1mTop 20 Coefficients:[0m
+-------------------+--------------------+
|feature            |coeff               |
+-------------------+--------------------+
|6-SpectralEntropym |1.3419543879808171  |
|10-MFCCs2m         |1.2733989521677187  |
|7-SpectralFluxm    |1.159275987079238   |
|5-SpectralSpreadm  |-0.1988402773501032 |
|2-Energym          |-0.271299654362

+-------------------+-------------------+
|feature            |coeff              |
+-------------------+-------------------+
|13-MFCCs5m         |3.2450845097142795 |
|17-MFCCs9m         |2.2905254993320088 |
|16-MFCCs8m         |1.6403111579542102 |
|1-ZCRm             |1.4822128439637272 |
|15-MFCCs7m         |1.2708996751127644 |
|2-Energym          |1.182158974050764  |
|8-SpectralRolloffm |1.0255874982263111 |
|5-SpectralSpreadm  |0.8899251172041844 |
|9-MFCCs1m          |0.5930567599824436 |
|12-MFCCs4m         |0.43946470919747677|
|6-SpectralEntropym |0.3350136916223924 |
|7-SpectralFluxm    |-0.309226415578919 |
|11-MFCCs3m         |-0.5000519717852095|
|20-MFCCs12m        |-0.6073695909820119|
|3-EnergyEntropym   |-0.6085881507155795|
|14-MFCCs6m         |-0.6098741125025351|
|10-MFCCs2m         |-0.9887678128632279|
|18-MFCCs10m        |-1.4348653080599922|
|4-SpectralCentroidm|-1.7244727535049913|
|19-MFCCs11m        |-2.5277295593577205|
+-------------------+-------------

+-------------------+---------------------+
|feature            |coeff                |
+-------------------+---------------------+
|19-MFCCs11m        |2.860274580743555    |
|14-MFCCs6m         |1.8731745795332322   |
|13-MFCCs5m         |1.8635801526291649   |
|12-MFCCs4m         |1.1961889178448237   |
|11-MFCCs3m         |1.124444540024091    |
|10-MFCCs2m         |0.9634288888822552   |
|18-MFCCs10m        |0.7765628363919344   |
|15-MFCCs7m         |0.7188621365330877   |
|7-SpectralFluxm    |0.6689669386664906   |
|1-ZCRm             |0.2809866854203015   |
|8-SpectralRolloffm |0.186960906486634    |
|6-SpectralEntropym |-0.049952867153028685|
|9-MFCCs1m          |-0.0586649930389763  |
|17-MFCCs9m         |-0.4879221963915295  |
|2-Energym          |-0.7148464051316588  |
|4-SpectralCentroidm|-0.7400616419453351  |
|3-EnergyEntropym   |-1.5890127181813916  |
|16-MFCCs8m         |-2.3609824958109105  |
|5-SpectralSpreadm  |-2.407856804998401   |
|20-MFCCs12m        |-3.66272939

+-------------------+--------------------+
|feature            |coeff               |
+-------------------+--------------------+
|20-MFCCs12m        |8.641222231176446   |
|9-MFCCs1m          |2.9538898443510666  |
|17-MFCCs9m         |2.9211321712092544  |
|18-MFCCs10m        |2.177506813349059   |
|19-MFCCs11m        |2.120843545448033   |
|6-SpectralEntropym |1.3810270325506566  |
|16-MFCCs8m         |0.46249128331083234 |
|2-Energym          |0.393363656886698   |
|1-ZCRm             |-0.41255845760187543|
|7-SpectralFluxm    |-0.42907807002471315|
|12-MFCCs4m         |-0.5240862155719425 |
|10-MFCCs2m         |-0.7784532141244647 |
|3-EnergyEntropym   |-0.8470796087620048 |
|15-MFCCs7m         |-1.8792246178120011 |
|11-MFCCs3m         |-1.8927467408447503 |
|5-SpectralSpreadm  |-1.9728626242945873 |
|4-SpectralCentroidm|-2.265166245359263  |
|14-MFCCs6m         |-3.5198392110104733 |
|8-SpectralRolloffm |-3.52013381294726   |
|13-MFCCs5m         |-6.00572523617097   |
+----------

+-------------------+--------------------+
|feature            |coeff               |
+-------------------+--------------------+
|4-SpectralCentroidm|4.363826096266432   |
|27-ChromaVector6m  |2.807400679217169   |
|15-MFCCs7m         |2.715922115890646   |
|9-MFCCs1m          |2.0761932229117774  |
|14-MFCCs6m         |1.7260087973551261  |
|28-ChromaVector7m  |1.4599776307901733  |
|30-ChromaVector9m  |1.111620535091766   |
|26-ChromaVector5m  |0.7275360063606028  |
|24-ChromaVector3m  |0.7038339219617537  |
|12-MFCCs4m         |0.48608331914708697 |
|20-MFCCs12m        |0.47621348295765686 |
|17-MFCCs9m         |0.3832886580284565  |
|22-ChromaVector1m  |0.28893805253636184 |
|16-MFCCs8m         |0.18419005474380773 |
|19-MFCCs11m        |0.07631324684254653 |
|21-MFCCs13m        |0.05562188342779955 |
|8-SpectralRolloffm |0.011489513667827684|
|10-MFCCs2m         |-0.0568481563871706 |
|13-MFCCs5m         |-0.09081080148390921|
|11-MFCCs3m         |-0.1588232746814728 |
+----------

+-------------------+--------------------+
|feature            |coeff               |
+-------------------+--------------------+
|25-ChromaVector4m  |6.589754909559608   |
|1-ZCRm             |3.312154722762035   |
|5-SpectralSpreadm  |2.4919133936904734  |
|20-MFCCs12m        |2.3135173589391336  |
|10-MFCCs2m         |2.2059559544748066  |
|27-ChromaVector6m  |2.2045584818717243  |
|30-ChromaVector9m  |2.1544589489060635  |
|12-MFCCs4m         |2.0502975046756506  |
|2-Energym          |1.7523742694525049  |
|29-ChromaVector8m  |1.4713552508154821  |
|28-ChromaVector7m  |1.4384242906432816  |
|6-SpectralEntropym |0.6324178556996747  |
|13-MFCCs5m         |0.6047438473878706  |
|4-SpectralCentroidm|0.1532578862910609  |
|26-ChromaVector5m  |-0.03012734584269084|
|22-ChromaVector1m  |-0.08719241480236335|
|23-ChromaVector2m  |-0.2132103993438243 |
|11-MFCCs3m         |-0.2543501679245427 |
|19-MFCCs11m        |-0.5053399989206397 |
|21-MFCCs13m        |-0.6671591023585408 |
+----------

+-------------------+--------------------+
|feature            |coeff               |
+-------------------+--------------------+
|19-MFCCs11m        |4.176565236208282   |
|14-MFCCs6m         |3.8786747935859394  |
|25-ChromaVector4m  |3.3497537569304345  |
|16-MFCCs8m         |3.3253130892266     |
|9-MFCCs1m          |3.30573810820549    |
|22-ChromaVector1m  |3.163032793813233   |
|23-ChromaVector2m  |3.1296452940104453  |
|10-MFCCs2m         |2.6270212256610805  |
|12-MFCCs4m         |2.464441944966569   |
|5-SpectralSpreadm  |1.907172505237948   |
|18-MFCCs10m        |1.8072987051120561  |
|28-ChromaVector7m  |1.5990797847456517  |
|6-SpectralEntropym |1.3120331508826415  |
|11-MFCCs3m         |1.0762228440388226  |
|7-SpectralFluxm    |0.7291302396955163  |
|21-MFCCs13m        |0.5787055868718485  |
|20-MFCCs12m        |0.23315302328952453 |
|29-ChromaVector8m  |0.23179677158011291 |
|26-ChromaVector5m  |-0.3676767168352131 |
|4-SpectralCentroidm|-0.45898958837591314|
+----------

+----------------------+-------------------+
|feature               |coeff              |
+----------------------+-------------------+
|5-SpectralSpreadm     |4.326206153766196  |
|20-MFCCs12m           |2.5432770471651445 |
|23-ChromaVector2m     |2.4703205727939372 |
|38-SpectralCentroidstd|2.4115255955253017 |
|21-MFCCs13m           |2.35064212947691   |
|15-MFCCs7m            |2.303949001499095  |
|13-MFCCs5m            |1.478172845651904  |
|19-MFCCs11m           |1.3862087501633569 |
|37-EnergyEntropystd   |1.354888212559843  |
|12-MFCCs4m            |1.3033933353678469 |
|36-Energystd          |1.300503718872827  |
|25-ChromaVector4m     |1.2560874820855084 |
|40-SpectralEntropystd |1.2456514339222466 |
|30-ChromaVector9m     |0.9424594894508983 |
|18-MFCCs10m           |0.8999793678184755 |
|2-Energym             |0.7633823135396518 |
|27-ChromaVector6m     |0.5794064677012282 |
|33-ChromaVector12m    |0.47832579870892566|
|8-SpectralRolloffm    |0.36653868849425014|
|17-MFCCs9

+----------------------+-------------------+
|feature               |coeff              |
+----------------------+-------------------+
|31-ChromaVector10m    |5.856757707372484  |
|39-SpectralSpreadstd  |2.561324792353527  |
|19-MFCCs11m           |1.9672517608129378 |
|32-ChromaVector11m    |1.8031481507557048 |
|30-ChromaVector9m     |1.704945696402055  |
|28-ChromaVector7m     |1.644624803325998  |
|10-MFCCs2m            |1.481818660073792  |
|13-MFCCs5m            |1.1097183826745887 |
|34-ChromaDeviationm   |1.048731303597172  |
|29-ChromaVector8m     |0.9381872110600012 |
|7-SpectralFluxm       |0.8999938667005126 |
|38-SpectralCentroidstd|0.8705231574945556 |
|18-MFCCs10m           |0.8613238961034468 |
|26-ChromaVector5m     |0.8420468936727158 |
|25-ChromaVector4m     |0.770998871997689  |
|15-MFCCs7m            |0.7194657263642653 |
|16-MFCCs8m            |0.5524820476523445 |
|23-ChromaVector2m     |0.4527876107430093 |
|37-EnergyEntropystd   |0.3212818624439418 |
|24-Chroma

+----------------------+-------------------+
|feature               |coeff              |
+----------------------+-------------------+
|5-SpectralSpreadm     |4.333217369356582  |
|16-MFCCs8m            |2.8184757934739557 |
|23-ChromaVector2m     |2.4562712535993665 |
|13-MFCCs5m            |1.7668036192138556 |
|15-MFCCs7m            |1.7278177880083962 |
|19-MFCCs11m           |1.6978638968019546 |
|24-ChromaVector3m     |1.5837028455642657 |
|6-SpectralEntropym    |1.4527600103309142 |
|35-ZCRstd             |1.3983747091436962 |
|9-MFCCs1m             |1.0291505580738414 |
|40-SpectralEntropystd |0.8042303187354267 |
|37-EnergyEntropystd   |0.7907858876575832 |
|7-SpectralFluxm       |0.7412514312692031 |
|29-ChromaVector8m     |0.6517246203348744 |
|2-Energym             |0.4792275907669793 |
|36-Energystd          |0.4262258229048202 |
|38-SpectralCentroidstd|0.4110031614438671 |
|17-MFCCs9m            |0.39739299153872104|
|12-MFCCs4m            |0.39276096609359085|
|30-Chroma

+---------------------+-------------------+
|feature              |coeff              |
+---------------------+-------------------+
|49-MFCCs7std         |4.210847053951959  |
|50-MFCCs8std         |3.264950538723056  |
|32-ChromaVector11m   |2.796569948792857  |
|35-ZCRstd            |2.5167294552442128 |
|27-ChromaVector6m    |2.3497930776973237 |
|3-EnergyEntropym     |1.9728933230069168 |
|19-MFCCs11m          |1.6809167461595993 |
|23-ChromaVector2m    |1.6026099627173283 |
|2-Energym            |1.5939006624274608 |
|33-ChromaVector12m   |1.4958223596456528 |
|45-MFCCs3std         |1.0416617314945553 |
|29-ChromaVector8m    |1.0371117929301286 |
|48-MFCCs6std         |0.9770694341504348 |
|25-ChromaVector4m    |0.6544465657528717 |
|36-Energystd         |0.6124584277483158 |
|42-SpectralRolloffstd|0.591935860953859  |
|22-ChromaVector1m    |0.5291085274733124 |
|17-MFCCs9m           |0.4973569871091599 |
|47-MFCCs5std         |0.33649497301566456|
|46-MFCCs4std         |0.2225410

+----------------------+-------------------+
|feature               |coeff              |
+----------------------+-------------------+
|5-SpectralSpreadm     |5.013876028592303  |
|2-Energym             |3.9414857844421185 |
|32-ChromaVector11m    |3.3332218064048194 |
|29-ChromaVector8m     |2.3476614586950584 |
|42-SpectralRolloffstd |1.9987371670994978 |
|47-MFCCs5std          |1.9055128229366098 |
|50-MFCCs8std          |1.7853597637524288 |
|41-SpectralFluxstd    |1.777448964218127  |
|1-ZCRm                |1.6302161190189666 |
|13-MFCCs5m            |1.3805813741333632 |
|38-SpectralCentroidstd|1.306824056958733  |
|43-MFCCs1std          |1.2392911062791658 |
|18-MFCCs10m           |1.025518743801659  |
|22-ChromaVector1m     |1.015992080997308  |
|30-ChromaVector9m     |1.0020077859760166 |
|31-ChromaVector10m    |0.8730637839217145 |
|33-ChromaVector12m    |0.7693027740570982 |
|40-SpectralEntropystd |0.36702441802047214|
|36-Energystd          |0.2660594123786412 |
|12-MFCCs4

+-------------------+------------------+
|feature            |coeff             |
+-------------------+------------------+
|6-SpectralEntropym |3.814677953960494 |
|29-ChromaVector8m  |2.873632662359028 |
|22-ChromaVector1m  |2.5436303223022754|
|25-ChromaVector4m  |2.535181592019588 |
|44-MFCCs2std       |2.0884454842333438|
|8-SpectralRolloffm |1.8629142145971784|
|37-EnergyEntropystd|1.608072209081059 |
|19-MFCCs11m        |1.6010868449833475|
|23-ChromaVector2m  |1.5473109322163243|
|31-ChromaVector10m |1.355624199685259 |
|16-MFCCs8m         |1.3330478446428922|
|45-MFCCs3std       |1.2246994811766365|
|12-MFCCs4m         |1.0921307869028956|
|47-MFCCs5std       |1.0772955752455946|
|21-MFCCs13m        |1.0618267333729037|
|14-MFCCs6m         |0.9152391070806255|
|27-ChromaVector6m  |0.8549319796453732|
|13-MFCCs5m         |0.7829033936034435|
|43-MFCCs1std       |0.7598541814107057|
|9-MFCCs1m          |0.7009016685129691|
+-------------------+------------------+
only showing top

+---------------------+------------------+
|feature              |coeff             |
+---------------------+------------------+
|2-Energym            |3.7307798220750668|
|43-MFCCs1std         |2.884959380702597 |
|8-SpectralRolloffm   |2.4048812088856026|
|29-ChromaVector8m    |2.358715573283771 |
|53-MFCCs11std        |1.466679393081336 |
|17-MFCCs9m           |1.3178163728305539|
|13-MFCCs5m           |1.317447474074658 |
|37-EnergyEntropystd  |1.296127779571622 |
|44-MFCCs2std         |1.2895737919385029|
|51-MFCCs9std         |1.212809878763418 |
|32-ChromaVector11m   |1.20721197568742  |
|42-SpectralRolloffstd|1.2062453000541025|
|57-ChromaVector2std  |1.1544409550573167|
|20-MFCCs12m          |1.0895299801028686|
|46-MFCCs4std         |1.0752102118638907|
|45-MFCCs3std         |1.0527218082918066|
|31-ChromaVector10m   |1.040886585226158 |
|6-SpectralEntropym   |1.0295040698918354|
|48-MFCCs6std         |0.7353451294666267|
|23-ChromaVector2m    |0.718230292776369 |
+----------

+---------------------+------------------+
|feature              |coeff             |
+---------------------+------------------+
|47-MFCCs5std         |4.825061702930066 |
|5-SpectralSpreadm    |2.71277142598931  |
|42-SpectralRolloffstd|2.2548656168513412|
|52-MFCCs10std        |1.89859529385033  |
|59-ChromaVector4std  |1.895641012908481 |
|37-EnergyEntropystd  |1.8142221282561375|
|35-ZCRstd            |1.5232506714625853|
|14-MFCCs6m           |1.4012083226687788|
|50-MFCCs8std         |1.3774413806874688|
|49-MFCCs7std         |1.334914883197305 |
|33-ChromaVector12m   |1.253480418310982 |
|7-SpectralFluxm      |1.151879227693784 |
|44-MFCCs2std         |1.1434565868092468|
|56-ChromaVector1std  |1.1167909647119032|
|55-MFCCs13std        |0.9510730787109375|
|16-MFCCs8m           |0.7869491155116016|
|26-ChromaVector5m    |0.7471940776363605|
|22-ChromaVector1m    |0.5660952676586241|
|43-MFCCs1std         |0.5305619835258033|
|40-SpectralEntropystd|0.5269623707951022|
+----------

+-------------------+------------------+
|feature            |coeff             |
+-------------------+------------------+
|60-ChromaVector5std|6.925595445943014 |
|4-SpectralCentroidm|3.036946366316103 |
|59-ChromaVector4std|2.711290273581628 |
|33-ChromaVector12m |2.5618735258987466|
|52-MFCCs10std      |2.540488241002853 |
|57-ChromaVector2std|2.4749586559474945|
|56-ChromaVector1std|2.2855842694958115|
|53-MFCCs11std      |2.0298945397417443|
|22-ChromaVector1m  |1.977971915371976 |
|58-ChromaVector3std|1.95599034384162  |
|5-SpectralSpreadm  |1.9550466357441143|
|25-ChromaVector4m  |1.7681023307727666|
|24-ChromaVector3m  |1.7499098094766603|
|13-MFCCs5m         |1.3793125380062488|
|27-ChromaVector6m  |1.2198321639668328|
|29-ChromaVector8m  |1.0103866861898645|
|48-MFCCs6std       |0.9073896365708002|
|8-SpectralRolloffm |0.7612512135441555|
|19-MFCCs11m        |0.6080576200427105|
|21-MFCCs13m        |0.3768258909647816|
+-------------------+------------------+
only showing top

+----------------------+------------------+
|feature               |coeff             |
+----------------------+------------------+
|29-ChromaVector8m     |3.364183290166335 |
|36-Energystd          |2.818524357803635 |
|38-SpectralCentroidstd|2.655818671959386 |
|23-ChromaVector2m     |2.350353024723878 |
|10-MFCCs2m            |2.198736222531027 |
|58-ChromaVector3std   |2.1133500971341337|
|53-MFCCs11std         |1.879212455395614 |
|67-ChromaVector12std  |1.7476995201306467|
|63-ChromaVector8std   |1.6171981393746269|
|16-MFCCs8m            |1.6029125770666608|
|42-SpectralRolloffstd |1.408264894027488 |
|55-MFCCs13std         |1.3441319721614984|
|70-BPMconf            |1.3124628086999615|
|11-MFCCs3m            |1.212182149428348 |
|40-SpectralEntropystd |1.1558524856721732|
|22-ChromaVector1m     |1.0113025433650424|
|47-MFCCs5std          |0.9314868239270859|
|64-ChromaVector9std   |0.9235564546942533|
|26-ChromaVector5m     |0.9096861408420288|
|12-MFCCs4m            |0.755795

+---------------------+-------------------+
|feature              |coeff              |
+---------------------+-------------------+
|63-ChromaVector8std  |3.452850274822714  |
|2-Energym            |3.2254191538808845 |
|9-MFCCs1m            |2.5507259676098335 |
|67-ChromaVector12std |2.4282825751440376 |
|28-ChromaVector7m    |2.3410469658820947 |
|14-MFCCs6m           |2.2135238457811215 |
|70-BPMconf           |2.061511427137626  |
|68-ChromaDeviationstd|2.026209253793006  |
|52-MFCCs10std        |1.915647013426896  |
|48-MFCCs6std         |1.802138655611243  |
|5-SpectralSpreadm    |1.7275145638966427 |
|46-MFCCs4std         |1.4424585178971054 |
|15-MFCCs7m           |1.4307303510121916 |
|69-BPM               |1.2537949329889384 |
|3-EnergyEntropym     |1.0754506715959942 |
|1-ZCRm               |0.8215908358047985 |
|59-ChromaVector4std  |0.535650400104843  |
|62-ChromaVector7std  |0.5168110259064711 |
|18-MFCCs10m          |0.4280225312741757 |
|49-MFCCs7std         |0.4081491

+---------------------+------------------+
|feature              |coeff             |
+---------------------+------------------+
|47-MFCCs5std         |3.687535923565278 |
|63-ChromaVector8std  |3.0226645733169213|
|59-ChromaVector4std  |2.6022740543856253|
|45-MFCCs3std         |2.272689396139495 |
|67-ChromaVector12std |2.0820602363458938|
|55-MFCCs13std        |2.0744786434923608|
|66-ChromaVector11std |2.0537144423174305|
|54-MFCCs12std        |2.0443037806580215|
|39-SpectralSpreadstd |1.9555287154220302|
|51-MFCCs9std         |1.7526492612229025|
|34-ChromaDeviationm  |1.6868072007688146|
|68-ChromaDeviationstd|1.649321141623386 |
|11-MFCCs3m           |1.605547326901855 |
|27-ChromaVector6m    |1.4934586373461356|
|53-MFCCs11std        |1.3696379407032389|
|16-MFCCs8m           |1.3384018364110621|
|50-MFCCs8std         |1.2590028342060724|
|12-MFCCs4m           |1.1641554543843673|
|20-MFCCs12m          |1.1086544960815061|
|25-ChromaVector4m    |1.0797373499803904|
+----------

In [15]:
from pyspark.ml.feature import VectorSlicer
from pyspark.ml.feature import ChiSqSelector
from pyspark.ml.linalg import Vectors

classifiers = [OneVsRest()]

# Fixed seed so the 70/30 split -- and therefore the score printed below and the
# `test` frame used by later cells -- is the same on each Restart & Run All.
SPLIT_SEED = 40

# Select the top n features and view results.
# NOTE(review): 71 presumably equals the total number of input features, which
# would make the selector keep everything -- confirm against input_columns.
n = 71

# For Logistic regression or One vs Rest
selector = ChiSqSelector(numTopFeatures=n, featuresCol="features",
                         outputCol="selectedFeatures", labelCol="label")
bestFeaturesDf = (
    selector.fit(test2_data)
    .transform(test2_data)
    .select("label","selectedFeatures")
    .withColumnRenamed("selectedFeatures","features")
)

# Collect features (pulled to the driver because ClassTrainEval takes them as an argument)
features = bestFeaturesDf.select(['features']).collect()

# Split, seeded for reproducibility
train,test = bestFeaturesDf.randomSplit([0.7,0.3], seed=SPLIT_SEED)

# Fold count handed to ClassTrainEval (presumably for cross-validation -- confirm
# against its definition).
folds = 2

# Set up the results table: a one-row placeholder that each classifier's result
# is unioned onto; the placeholder row is filtered out before display.
columns = ['Classifier', 'Result']
vals = [("Place Holder","N/A")]
results = spark.createDataFrame(vals, columns)

for classifier in classifiers:
    new_result = ClassTrainEval(classifier,features,classes,folds,train,test)
    results = results.union(new_result)
results = results.where("Classifier!='Place Holder'")
results.show(100,False)

 
[1mOneVsRest[0m
[1mIntercept: [0m -3.9047297292103105
[1mTop 20 Coefficients:[0m
+-------------------+------------------+
|feature            |coeff             |
+-------------------+------------------+
|9-MFCCs1m          |4.185849629518928 |
|22-ChromaVector1m  |2.242537028681456 |
|71-BPMessentia     |2.1129683986372925|
|18-MFCCs10m        |1.7193328870988267|
|12-MFCCs4m         |1.71614580982129  |
|3-EnergyEntropym   |1.7070823491204694|
|11-MFCCs3m         |1.469742911214246 |
|5-SpectralSpreadm  |1.4141929180843464|
|54-MFCCs12std      |1.1406409608797254|
|53-MFCCs11std      |1.1228655357226835|
|2-Energym          |1.0732995874249984|
|24-ChromaVector3m  |1.071268549421721 |
|52-MFCCs10std      |0.9524137976362416|
|62-ChromaVector7std|0.7983182941317039|
|63-ChromaVector8std|0.7828832757835076|
|58-ChromaVector3std|0.6773027349964431|
|6-SpectralEntropym |0.6398984952915698|
|47-MFCCs5std       |0.6312692335923455|
|51-MFCCs9std       |0.6047106792098926|
|50-MFCCs

+---------------------+------------------+
|feature              |coeff             |
+---------------------+------------------+
|11-MFCCs3m           |2.926482557270733 |
|64-ChromaVector9std  |2.7469125593870185|
|17-MFCCs9m           |2.2262388096657784|
|48-MFCCs6std         |2.030384325035533 |
|40-SpectralEntropystd|1.9598879101218205|
|62-ChromaVector7std  |1.8531540621978468|
|68-ChromaDeviationstd|1.6992535501378716|
|65-ChromaVector10std |1.6782420928803976|
|55-MFCCs13std        |1.6267153599786395|
|57-ChromaVector2std  |1.5637546163133653|
|60-ChromaVector5std  |1.5206215957600413|
|58-ChromaVector3std  |1.4978201176686132|
|49-MFCCs7std         |1.49509915537331  |
|26-ChromaVector5m    |1.4879646213004565|
|50-MFCCs8std         |1.468847143327968 |
|12-MFCCs4m           |1.4327523031256355|
|67-ChromaVector12std |1.4158241159348042|
|19-MFCCs11m          |1.2410633126581823|
|59-ChromaVector4std  |1.2290157279809197|
|63-ChromaVector8std  |1.057349945132209 |
+----------

+----------------------+------------------+
|feature               |coeff             |
+----------------------+------------------+
|29-ChromaVector8m     |4.288089571058429 |
|43-MFCCs1std          |3.361109623380768 |
|41-SpectralFluxstd    |3.2313804441128813|
|7-SpectralFluxm       |2.8508007522431003|
|21-MFCCs13m           |2.306629993717977 |
|71-BPMessentia        |2.0412447204684865|
|23-ChromaVector2m     |1.92339903411284  |
|38-SpectralCentroidstd|1.8870876215990353|
|39-SpectralSpreadstd  |1.570393694771114 |
|59-ChromaVector4std   |1.5227234013915762|
|27-ChromaVector6m     |1.3097980384805084|
|33-ChromaVector12m    |1.2256167854458642|
|28-ChromaVector7m     |1.142255269095619 |
|45-MFCCs3std          |1.1381254974225288|
|44-MFCCs2std          |1.0689630035733033|
|15-MFCCs7m            |1.046881323539903 |
|32-ChromaVector11m    |1.0007313945042697|
|48-MFCCs6std          |0.993773992325959 |
|25-ChromaVector4m     |0.9341518252841208|
|13-MFCCs5m            |0.883455

In [16]:
# Apply OVR_BestModel to the `test` split produced by the preceding cell's randomSplit.
# NOTE(review): OVR_BestModel is not assigned in any visible cell; presumably
# ClassTrainEval publishes it as a global -- confirm, since this cell fails on a
# fresh kernel unless that function has already been run with a OneVsRest classifier.
predictions = OVR_BestModel.transform(test)

In [17]:
# From the output earlier we saw that the new label for BigRoom is now 21.0
BIGROOM_LABEL = 21.0

# Rows the model predicted as BigRoom whose true label is a different genre.
# The filtered frame is built once so the count and the displayed rows are
# guaranteed to come from the same condition.
bigroom_false_positives = predictions.filter(
    "label!={0} AND prediction == {0}".format(BIGROOM_LABEL)
)

# Deliberately not named `count`: that name is provided by
# `from pyspark.sql.functions import *`, and rebinding it would break any cell
# that is re-run afterwards and relies on the Spark function.
false_positive_count = bigroom_false_positives.count()
print(false_positive_count)
bigroom_false_positives.show()
# predictions.show()
# predictions.show()

11
+-----+--------------------+--------------------+----------+
|label|            features|       rawPrediction|prediction|
+-----+--------------------+--------------------+----------+
|  1.0|[0.23439685430221...|[-3.9066832886448...|      21.0|
|  1.0|[0.34746761558915...|[-3.8610247292709...|      21.0|
|  3.0|[0.17495648917143...|[-3.1524086070662...|      21.0|
|  6.0|[0.27692463079165...|[-4.6682077266719...|      21.0|
|  9.0|[0.34635321819729...|[-2.3178524858804...|      21.0|
| 16.0|[0.17345429813292...|[-8.3841335950446...|      21.0|
| 16.0|[0.24582293877877...|[-4.2369452225033...|      21.0|
| 16.0|[0.34345894073264...|[-5.1095628232206...|      21.0|
| 17.0|[0.16983627790932...|[-7.4040761608274...|      21.0|
| 18.0|[0.32659039393572...|[-5.5452023538191...|      21.0|
| 20.0|[0.45256871026397...|[-7.3214545084438...|      21.0|
+-----+--------------------+--------------------+----------+



### Summary statistics for numeric variables

In [None]:
# numeric_features = [t[0] for t in df.dtypes if t[1] == 'int' or t[1] == 'double']
# df.select(numeric_features).describe().toPandas().transpose()

### Preparing Data for Machine Learning

In [None]:
# categoricalColumns = []

# stages = []

# for categoricalCol in categoricalColumns:
#     stringIndexer = StringIndexer(inputCol = categoricalCol, outputCol = categoricalCol + 'Index')
#     encoder = OneHotEncoderEstimator(inputCols=[stringIndexer.getOutputCol()], outputCols=[categoricalCol + "classVec"])
#     stages += [stringIndexer, encoder]
    
# label_stringIdx = StringIndexer(inputCol = 'class', outputCol = 'label')

# stages += [label_stringIdx]

# numericCols = [
#     '1-ZCRm',
#     '2-Energym',
#     '3-EnergyEntropym',
#     '4-SpectralCentroidm',
#     '5-SpectralSpreadm',
#     '6-SpectralEntropym',
#     '7-SpectralFluxm',
#     '8-SpectralRolloffm',
#     '9-MFCCs1m',
#     '22-ChromaVector1m',
#     '34-ChromaDeviationm',
#     '35-ZCRstd',
#     '36-Energystd',
#     '37-EnergyEntropystd',
#     '38-SpectralCentroidstd',
#     '39-SpectralSpreadstd',
#     '40-SpectralEntropystd',
#     '41-SpectralFluxstd',
#     '42-SpectralRolloffstd',
#     '43-MFCCs1std',
#     '56-ChromaVector1std',
#     '68-ChromaDeviationstd',
#     '69-BPM',
#     '70-BPMconf',
#     '71-BPMessentia'
# ]
# assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols

# assembler = VectorAssembler(inputCols=assemblerInputs, outputCol = "features")

# stages += [assembler]

### Pipeline

In [None]:
# pipeline = Pipeline(stages = stages)
# pipelineModel = pipeline.fit(df)
# df = pipelineModel.transform(df)
# selectedCols = ['label', 'features'] + cols
# df = df.select(selectedCols)

In [None]:
# pd.DataFrame(df.take(5), columns = df.columns).transpose()

### Randomly split data into train and test sets

In [None]:
# train, test = df.randomSplit([0.7, 0.3], seed = 40)
# print("Training Dataset Count: " + str(train.count()))
# print("Test Dataset Count: " + str(test.count()))

### Define the evaluator

In [None]:
# evaluator = MulticlassClassificationEvaluator(predictionCol = "prediction", metricName = "accuracy")

### Logistic Regression Model

In [None]:
# lr = LogisticRegression()
# lrModel = lr.fit(train)

# # Set up parameter Grid
# paramGrid = (ParamGridBuilder()
#              .addGrid(lr.regParam, [0.1, 0.3, 0.5])
#              .addGrid(lr.elasticNetParam, [0.0, 0.1, 0.2])
#              .addGrid(lr.maxIter, [10, 20, 50])
#              .build())

# # Cross Val score set up with all parameters
# cv = CrossValidator(estimator = lr, estimatorParamMaps = paramGrid, evaluator = evaluator, numFolds = 5)

# # Then fit the model
# cvModel = cv.fit(train)

# # Collect the best Model
# BestModel = cvModel.bestModel

# # Generate predictions
# predictions = cvModel.transform(test)

# # Print the accuracy rate of the model or AUC for a binary classifier
# print('Accuracy : {}'.format(evaluator.evaluate(predictions)))

In [None]:
# # Load the Summary
# trainingSummary = BestModel.summary

# # Generate confusion matrix and print (includes accuracy)
# accuracy = trainingSummary.accuracy
# falsePositiveRate = trainingSummary.weightedFalsePositiveRate
# truePositiveRate = trainingSummary.weightedTruePositiveRate
# fMeasure = trainingSummary.weightedFMeasure()
# precision = trainingSummary.weightedPrecision
# recall = trainingSummary.weightedRecall

# print("Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
#       % (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall))

### Random Forest

In [None]:
# rf = RandomForestClassifier()
# rfModel = rf.fit(train)

# # Set up parameter Grid
# paramGrid = (ParamGridBuilder()
#              .addGrid(rf.maxDepth, [2, 5, 10])
#              .build())

# # Cross Val score set up with all parameters
# cv = CrossValidator(estimator = rf, estimatorParamMaps = paramGrid, evaluator = evaluator, numFolds = 5)

# # Then fit the model
# cvModel = cv.fit(train)

# # Collect the best Model
# BestModel = cvModel.bestModel

# # Generate predictions
# predictions = cvModel.transform(test)

# # Print the accuracy rate of the model or AUC for a binary classifier
# print('Accuracy : {}'.format(evaluator.evaluate(predictions)))

In [None]:
# # Load the Summary
# trainingSummary = BestModel.summary

# # Generate confusion matrix and print (includes accuracy)
# accuracy = trainingSummary.accuracy
# falsePositiveRate = trainingSummary.weightedFalsePositiveRate
# truePositiveRate = trainingSummary.weightedTruePositiveRate
# fMeasure = trainingSummary.weightedFMeasure()
# precision = trainingSummary.weightedPrecision
# recall = trainingSummary.weightedRecall

# print("Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
#       % (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall))