# Package Imports

In [1]:
import numpy as np
import pandas as pd
import csv

from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler


from sklearn import svm


# Feature Selection

Feature Headers

In [2]:
def _stat_major(names):
    """Return '<name>_<stat>' labels grouped by statistic: every name for mean, then std, then var."""
    return [f"{name}_{stat}" for stat in ("mean", "std", "var") for name in names]


def _name_major(names):
    """Return '<name>_<stat>' labels grouped by name: mean, std, var for the first name, then the next."""
    return [f"{name}_{stat}" for name in names for stat in ("mean", "std", "var")]


# Ordered feature labels. The feature-selection cells index this array with a
# boolean mask that has one entry per feature column, so the order here is
# significant and must not be rearranged.
# NOTE(review): presumably this order mirrors the feature columns of the
# training CSVs -- confirm against the files.
headers = np.array(
    _stat_major([f"chroma{note}" for note in ("C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B")])
    + _stat_major([f"mfcc{i}" for i in range(1, 14)])
    + _name_major(["rms", "zcr"])
    + ["tempo"]
    + _name_major(["centroid", "bandwidth", "flatness", "rolloff"])
    + _stat_major([f"tonnetz{i}" for i in range(6)])
    + _stat_major([f"tgr{i}" for i in range(13)])
    + ["logattacktime", "attackstart", "attackstop", "loudness", "danceability"]
    + _name_major(["predominantpitch", "pitchsalience", "complexity", "flux", "inharmonicity"])
    + _stat_major([f"tristimulus{i}" for i in range(1, 4)])
    + _name_major(["oddevenratio"])
    + ["AMBMenergyratio", "key", "scale", "strength", "tuningfrequency"]
)


Feature selection for Half features Q1&Q2/Q3&Q4

In [3]:
# Load the training table; 'songID' and 'sentiment' are split off and every
# remaining column is used as a model input.
featuredata = pd.read_csv('Features_half_train.csv')
sentiment = np.asarray(featuredata['sentiment'])
features = np.asarray(featuredata.drop(columns=['songID', 'sentiment']))

# Min-max scale the inputs.
# NOTE(review): the scaler is fitted on the whole table before cross-validation,
# so the held-out part of every CV fold contributes to the scaling statistics.
scaler = MinMaxScaler()
features = scaler.fit_transform(features)

# Recursive feature elimination with 5-fold stratified cross-validation around
# a linear-kernel SVM: 3 features are dropped per step, never going below 40.
cv = StratifiedKFold(n_splits=5)
estimator = svm.SVC(kernel='linear')
rfecv = RFECV(
    estimator=estimator,
    cv=cv,
    step=3,
    min_features_to_select=40,
    scoring="accuracy",
    n_jobs=2,
)
rfecv.fit(features, sentiment)

# Names of the retained features, wrapped with the ID and label column names.
selectedheaders1 = ['songID', *headers[rfecv.support_], 'sentiment']



Feature selection for Top Half features Q1/Q2

In [5]:
# Load the training table; 'songID' and 'sentiment' are split off and every
# remaining column is used as a model input.
featuredata = pd.read_csv('Features_tophalf_train.csv')
sentiment = np.asarray(featuredata['sentiment'])
features = np.asarray(featuredata.drop(columns=['songID', 'sentiment']))

# Min-max scale the inputs.
# NOTE(review): the scaler is fitted on the whole table before cross-validation,
# so the held-out part of every CV fold contributes to the scaling statistics.
scaler = MinMaxScaler()
features = scaler.fit_transform(features)

# Recursive feature elimination with 5-fold stratified cross-validation around
# a linear-kernel SVM: 2 features are dropped per step, never going below 50.
cv = StratifiedKFold(n_splits=5)
estimator = svm.SVC(kernel='linear')
rfecv = RFECV(
    estimator=estimator,
    cv=cv,
    step=2,
    min_features_to_select=50,
    scoring="accuracy",
    n_jobs=2,
)
rfecv.fit(features, sentiment)

# Names of the retained features, wrapped with the ID and label column names.
selectedheaders2 = ['songID', *headers[rfecv.support_], 'sentiment']





Feature selection for Bottom Half features Q3/Q4

In [7]:
# Load the training table; 'songID' and 'sentiment' are split off and every
# remaining column is used as a model input.
featuredata = pd.read_csv('Features_bottomhalf_train.csv')
sentiment = np.asarray(featuredata['sentiment'])
features = np.asarray(featuredata.drop(columns=['songID', 'sentiment']))

# Min-max scale the inputs.
# NOTE(review): the scaler is fitted on the whole table before cross-validation,
# so the held-out part of every CV fold contributes to the scaling statistics.
scaler = MinMaxScaler()
features = scaler.fit_transform(features)

# Recursive feature elimination with 5-fold stratified cross-validation around
# a linear-kernel SVM: 2 features are dropped per step, never going below 40.
cv = StratifiedKFold(n_splits=5)
estimator = svm.SVC(kernel='linear')
rfecv = RFECV(
    estimator=estimator,
    cv=cv,
    step=2,
    min_features_to_select=40,
    scoring="accuracy",
    n_jobs=2,
)
rfecv.fit(features, sentiment)

# Names of the retained features, wrapped with the ID and label column names.
selectedheaders3 = ['songID', *headers[rfecv.support_], 'sentiment']




In [8]:
import json

# Persist the selected feature names per model. The first and last entries of
# each list ('songID' and 'sentiment') are stripped so only feature names remain.
featureheaders = {
    model_key: selected[1:-1]
    for model_key, selected in (
        ('SVM1', selectedheaders1),
        ('SVM2', selectedheaders2),
        ('SVM3', selectedheaders3),
    )
}
with open('SelectedHeaders.json', 'w') as f:
    json.dump(featureheaders, f)