# Appliying Machine learning in Carnatic Music Classification with less attributed values

# Step 1

load the libraries

In [63]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [64]:
df=pd.read_excel('music.xlsx')
df_train = df.drop(['UID','MBID of the recording','Name','Artist','Release+Volume','Lead Instrument Code','Raaga','Excerpt Start Time (s)','Excerpt End Time (s)'], axis = 1)
X = df_train .drop('Taala',axis=1).values
y = df_train ['Taala'].values

In [65]:
df_train

Unnamed: 0,Taala,Length of the excerpt (s),Length of the excerpt (min),Number of annotated beats,Number of samas
0,0,149,2.48,193,25
1,0,237,3.95,368,46
2,0,352,5.87,481,61
3,0,510,8.50,825,104
4,0,514,8.57,705,89
...,...,...,...,...,...
171,3,255,4.25,397,133
172,3,415,6.92,703,235
173,3,129,2.15,223,75
174,3,265,4.42,487,163


In [48]:
X

array([[ 149.  ,    2.48,  193.  ,   25.  ],
       [ 237.  ,    3.95,  368.  ,   46.  ],
       [ 352.  ,    5.87,  481.  ,   61.  ],
       [ 510.  ,    8.5 ,  825.  ,  104.  ],
       [ 514.  ,    8.57,  705.  ,   89.  ],
       [ 216.  ,    3.6 ,  305.  ,   39.  ],
       [ 423.  ,    7.05,  681.  ,   86.  ],
       [ 295.  ,    4.92,  520.  ,   65.  ],
       [ 173.  ,    2.88,  241.  ,   31.  ],
       [ 534.  ,    8.9 ,  881.  ,  111.  ],
       [ 507.  ,    8.45,  793.  ,  100.  ],
       [ 402.  ,    6.7 ,  496.  ,   62.  ],
       [ 120.  ,    2.  ,  194.  ,   25.  ],
       [ 183.  ,    3.05,  440.  ,   55.  ],
       [ 212.  ,    3.53,  275.  ,   35.  ],
       [ 309.  ,    5.15,  465.  ,   59.  ],
       [ 234.  ,    3.9 ,  305.  ,   39.  ],
       [ 260.  ,    4.33,  441.  ,   56.  ],
       [ 748.  ,   12.47, 1266.  ,  159.  ],
       [ 268.  ,    4.47,  376.  ,   47.  ],
       [ 215.  ,    3.58,  321.  ,   41.  ],
       [ 161.  ,    2.68,  209.  ,   27.  ],
       [ 3

In [49]:
print(y)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3]


# Step 2: Feature selection

Feature selection is a process where you automatically select those features in your data that contribute most to the prediction variable or output in which you are interested.

1) Univariate Selection

Statistical tests can be used to select those features that have the strongest relationship with the output variable.


In [66]:
from numpy import set_printoptions
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

from pandas import read_csv
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt

In [67]:
# feature selection
def select_features(X_train, y_train, X_test):
	# configure to select all features
	fs = SelectKBest(score_func=f_classif, k='all')
	# learn relationship from training data
	fs.fit(X_train, y_train)
	# transform train input data
	X_train_fs = fs.transform(X_train)
	# transform test input data
	X_test_fs = fs.transform(X_test)
	return X_train_fs, X_test_fs, fs
 

In [68]:
# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
# feature selection
X_train_fs, X_test_fs, fs = select_features(X_train, y_train, X_test)
# what are scores for the features
for i in range(len(fs.scores_)):
	print('Feature %d: %f' % (i, fs.scores_[i]))
# plot the scores
#pyplot.bar([i for i in range(len(fs.scores_))], fs.scores_)
#pyplot.show()

Feature 0: 4.857845
Feature 1: 4.856098
Feature 2: 4.891852
Feature 3: 19.320271


In [69]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 0)

# Feature Scaling

In [54]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Machine learing algorithms

# RandomForestClassifier

In [70]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=0)

In [71]:
y_pred = classifier.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[1 1]
 [1 3]
 [3 1]
 [1 1]
 [2 2]
 [2 2]
 [0 0]
 [2 3]
 [3 2]
 [3 1]
 [2 2]
 [2 2]
 [0 0]
 [3 1]
 [2 3]
 [1 1]
 [0 0]
 [2 0]
 [3 1]
 [1 1]
 [3 1]
 [0 0]
 [0 0]
 [1 2]
 [1 1]
 [2 2]
 [2 2]
 [1 3]
 [2 2]
 [1 3]
 [0 0]
 [3 1]
 [2 2]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [2 2]
 [3 1]
 [1 3]
 [0 0]
 [0 0]
 [1 1]
 [2 2]
 [1 3]
 [0 0]
 [2 2]
 [1 1]
 [2 2]
 [0 0]
 [2 2]
 [2 2]
 [0 0]]


In [72]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[14  0  1  0]
 [ 0  8  0  7]
 [ 0  1 14  1]
 [ 0  5  2  0]]


0.6792452830188679

# Training the Logistic Regression model on the Training set

In [73]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(random_state=0)

In [74]:
y_pred = classifier.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[1 1]
 [1 3]
 [3 1]
 [1 1]
 [2 2]
 [2 2]
 [0 0]
 [3 3]
 [2 2]
 [1 1]
 [2 2]
 [2 2]
 [0 0]
 [3 1]
 [1 3]
 [1 1]
 [0 0]
 [0 0]
 [3 1]
 [1 1]
 [3 1]
 [0 0]
 [0 0]
 [2 2]
 [1 1]
 [2 2]
 [2 2]
 [3 3]
 [2 2]
 [1 3]
 [0 0]
 [1 1]
 [2 2]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [2 2]
 [1 1]
 [3 3]
 [0 0]
 [0 0]
 [1 1]
 [2 2]
 [3 3]
 [0 0]
 [2 2]
 [1 1]
 [2 2]
 [0 0]
 [2 2]
 [2 2]
 [0 0]]


In [75]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[15  0  0  0]
 [ 0 11  0  4]
 [ 0  0 16  0]
 [ 0  3  0  4]]


0.8679245283018868

# Decision Tree Classification

In [76]:
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy', random_state=0)

In [77]:
y_pred = classifier.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[1 1]
 [1 3]
 [3 1]
 [1 1]
 [2 2]
 [2 2]
 [0 0]
 [1 3]
 [2 2]
 [1 1]
 [2 2]
 [2 2]
 [0 0]
 [3 1]
 [2 3]
 [1 1]
 [0 0]
 [2 0]
 [3 1]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [0 2]
 [1 1]
 [2 2]
 [2 2]
 [0 3]
 [2 2]
 [2 3]
 [0 0]
 [1 1]
 [2 2]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [2 2]
 [3 1]
 [2 3]
 [0 0]
 [0 0]
 [1 1]
 [2 2]
 [3 3]
 [0 0]
 [2 2]
 [3 1]
 [2 2]
 [0 0]
 [1 2]
 [2 2]
 [0 0]]


# Making the Confusion Matrix

In [78]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[14  0  1  0]
 [ 0 10  0  5]
 [ 1  1 14  0]
 [ 1  2  3  1]]


0.7358490566037735

# Conclusion

we used 4 features and one label for classification of carnatic music ,and apply the feature selection for finding highly correlated attribute and then apply 4 machine learning algorithms

1)Random Forest Classifier--- 70.45%

2)Logistic Regression Classifier= 82%

3)Decision Tree Classifier=80%

# here we conclude that the best ml algorithm for carnatic music classification is LR and Decision Tree after reducing the attributes