In [1]:
#FeatureSelection.ipynb
#authors: Chance Stewart

import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
import CSVInterface

#build the project's data reader; as the cell output below shows, constructing it reads
#features.pkl, tracks.pkl, echonest.pkl and genres.pkl
reader = CSVInterface.featRead()

Reading features.pkl
Reading tracks.pkl
Reading echonest.pkl
Reading genres.pkl


In [2]:
#pull the feature table and the top-level genre column, both restricted to the same named subset
subset_name = 'cleanLarge'
features = reader.getSubset(reader.getFrame('features'), sub=subset_name)
genres = reader.getSubset(reader.getFrame('track')['genre_top'], sub=subset_name)

In [3]:
#wrap both selections as DataFrames: Xdf is built from the features, Ydf from the genre labels
Xdf, Ydf = pd.DataFrame(features), pd.DataFrame(genres)

In [4]:
#univariate feature ranking: score every feature column against the genre label with f_classif
res = 50  #how many top-scoring features to select and print

#flatten the single-column label frame to a 1-D array before fitting
labels = np.ravel(Ydf)
bestfeatures = SelectKBest(score_func=f_classif, k=res)
fit = bestfeatures.fit(Xdf, labels)

#pair each feature column name with its score in one two-column table
featureScores = pd.concat(
    [pd.DataFrame(Xdf.columns), pd.DataFrame(fit.scores_)],
    axis=1,
)
featureScores.columns = ['Feature', 'Score']

#print the `res` best features according to univariate selection, highest score first
print(featureScores.nlargest(res, 'Score'))

                             Feature        Score
295                 (mfcc, mean, 04)  1242.145874
315               (mfcc, median, 04)  1212.966733
312               (mfcc, median, 01)  1097.234798
292                 (mfcc, mean, 01)  1087.863106
275                  (mfcc, max, 04)  1040.706971
437  (spectral_contrast, median, 04)  1006.795188
272                  (mfcc, max, 01)  1004.792318
430    (spectral_contrast, mean, 04)   992.799450
436  (spectral_contrast, median, 03)   917.780320
429    (spectral_contrast, mean, 03)   905.337592
314               (mfcc, median, 03)   863.055983
294                 (mfcc, mean, 03)   856.079110
435  (spectral_contrast, median, 02)   822.905585
428    (spectral_contrast, mean, 02)   809.976324
426     (spectral_contrast, max, 07)   760.826713
467     (spectral_rolloff, skew, 01)   736.147639
319               (mfcc, median, 08)   734.508656
317               (mfcc, median, 06)   731.754164
297                 (mfcc, mean, 06)   719.100014


In [5]:
#create dataframe formatted for mRMR program
df = reader.mergeFrames(Ydf, Xdf)
#replace each genre name with an integer code; factorize numbers from 0 (and marks missing
#values as -1), so the +1 shifts the real classes to 1..n and missing ones to 0
df['genre_top'] = pd.factorize(df['genre_top'])[0] + 1

#output csv file to be read into C++ mRMR program. The column names contain ', ' (comma with space),
#and the mRMR program only works once those are replaced with '/' or some other non-comma separating
#character. Instead of editing the file by hand, write '/'-separated aliases as the header:
#tuple names are joined level by level, any other name has its ', ' swapped for '/'.
#Only the CSV header changes; df itself keeps its original column labels.
csv_header = [
    '/'.join(str(level) for level in col) if isinstance(col, tuple)
    else str(col).replace(', ', '/')
    for col in df.columns
]

df.to_csv(r'mRMRdata.csv', index=False, header=csv_header)

#I couldn't manage to get the mRMR python library to install on my machine, so after some troubleshooting
#I used an executable from the mRMR project website.
#The program used to calculate mRMR feature selection is found at: http://home.penglab.com/proj/mRMR/
#I used the Windows executable (but other OS versions exist), run in the same directory as mRMRdata.csv with the command:
#mrmr_win32.exe -i mRMRdata.csv -t 0.01 -n 200 -s 10000
#This command takes a long time to run; reducing -s (number of samples the program is run on) or
#-n (number of features to select), or increasing -t (threshold for converting continuous variables into
#distinct values), should decrease the time it takes to run.


#mRMR results (in column index form):
#indices = [435, 295, 412, 314, 312, 275, 437, 331,
#          384, 317, 515, 449, 323, 377, 354, 462, 450, 321,
#          509, 387, 272, 315, 294, 517, 461, 292, 430, 277,
#          508, 379, 174, 428, 411, 386, 325, 375, 319, 426,
#          208, 510, 281, 352, 297, 436, 311, 385, 451, 440,
#          330, 399, 431, 380, 507, 394, 229, 303, 318, 467,
#          388, 373, 415, 301, 285, 406, 355, 381, 444, 429,
#          250, 279, 206, 376, 358, 327, 173, 414, 398, 338,
#          460, 299, 383, 416, 198, 468, 506, 438, 287, 196,
#          433, 445, 389, 335, 305, 278, 212, 382, 172, 395,
#          404, 427, 344, 310, 211, 231, 390, 329, 274, 204,
#          372, 458, 397, 214, 505, 166, 336, 293, 169, 378,
#          320, 209, 514, 343, 328, 356, 391, 471, 159, 205,
#          441, 234, 307, 497, 464, 290, 207, 276, 403, 459,
#          176, 283, 333, 238, 288, 175, 194, 452, 324, 442,
#          232, 434, 465, 298, 210, 254, 228, 472, 309, 362,
#          213, 496, 171, 405, 448, 286, 340, 401, 193, 291,
#          280, 302, 233, 454, 243, 308, 423, 253, 164, 170,
#          215, 443, 511, 199, 289, 313, 339, 230, 200, 488,
#          498, 282, 304, 392, 479, 334, 195, 402, 75, 410,
#          177, 192]
#These are not exactly the same as the "feature numbers" that the mRMR program outputs, because those
#start at index 1. Each index here is one less than the corresponding feature number, so that the list
#matches a starting index of 0.

