In [1]:
import ast

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from tensorflow.keras.preprocessing.sequence import pad_sequences

## -------------------------------------------------------------
## Keep only if running locally on Intel CPUs and/or GPUs. Comment for rest
from sklearnex import patch_sklearn
patch_sklearn()
## -------------------------------------------------------------


from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, ShuffleSplit
from sklearn.metrics import classification_report, confusion_matrix

from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


## Preprocessing

In [2]:
# Load the tab-separated dataset; `dfx` is kept as the untouched raw frame.
dfx = pd.read_csv("out.tsv", delimiter='\t')

In [3]:
# Quick look at the raw frame: a `notes` column followed by one 0/1 column per
# composer name (the printed output below elides the middle rows with "...").
print(dfx)

                                                  notes  Alban Berg  \
0     [67, 72, 67, 72, 78, 71, 61, 67, 67, 61, 78, 7...           1   
1     [67, 72, 67, 78, 61, 67, 71, 72, 79, 71, 78, 6...           1   
2     [67, 72, 67, 78, 61, 71, 67, 72, 79, 78, 71, 6...           1   
3     [70, 66, 58, 48, 49, 48, 53, 58, 70, 54, 49, 7...           0   
4     [52, 62, 58, 80, 62, 52, 58, 80, 81, 81, 82, 6...           0   
...                                                 ...         ...   
1271  [73, 56, 53, 73, 72, 72, 73, 73, 72, 73, 72, 7...           0   
1272  [77, 81, 65, 77, 72, 81, 65, 72, 76, 79, 70, 7...           0   
1273  [72, 72, 70, 69, 70, 67, 69, 65, 67, 69, 65, 6...           0   
1274  [72, 72, 70, 69, 70, 69, 67, 67, 65, 69, 65, 6...           0   
1275  [79, 48, 48, 52, 52, 48, 77, 79, 76, 77, 48, 7...           0   

      Alexander Scriabin  Antonio Soler  Carl Maria von Weber  \
0                      0              0                     0   
1                

In [4]:
# Work on a copy so the raw frame `dfx` is never modified.
df = dfx.copy()

# Extract features (notes) and labels (composers).
# The `notes` column is read from the TSV as text such as "[67, 72, ...]", so
# each cell is parsed into a real Python list here. Splitting the unparsed
# strings would hand text, not integer sequences, to the later padding step.
# `df` itself is left unchanged so later cells can still read the raw strings.
features = [ast.literal_eval(notes_text) for notes_text in df['notes']]
# Every column other than `notes` is kept as a label column.
labels = df.drop('notes', axis=1)

# Split the data into training and testing sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)


In [5]:
# Parse each stringified note list in `df['notes']` into a real Python list.
# `ast` is imported in the notebook's top import cell rather than mid-notebook.
features = [ast.literal_eval(notes_text) for notes_text in df['notes']]

In [6]:
# 80/20 train/test split of the current `features` and `labels`, seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Padding: the longest *training* sequence fixes the common length, and both
# splits are padded to it with padding='post'.
max_length = max(map(len, X_train))
X_train_padded, X_test_padded = (
    pad_sequences(sequences, maxlen=max_length, padding='post')
    for sequences in (X_train, X_test)
)

# Shuffle and Split: five seeded random 80/20 resamplings, reused by every
# grid search in this notebook via `cv=cv`.
cv = ShuffleSplit(random_state=42, test_size=0.2, n_splits=5)

## XGBoost

In [9]:
# Candidate hyper-parameters for XGBoost (3 x 3 x 3 = 27 combinations).
param_grid = dict(
    max_depth=[3, 5, 7],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 200],
)

xgb_classifier = XGBClassifier()

# Exhaustive search over the grid, scored by accuracy on the shared `cv` splits.
grid_search = GridSearchCV(
    estimator=xgb_classifier,
    param_grid=param_grid,
    cv=cv,
    scoring='accuracy',
)
grid_search.fit(X_train_padded, y_train)

# Evaluate the winning configuration on the held-out test split.
best_estimator = grid_search.best_estimator_
y_pred = best_estimator.predict(X_test_padded)

# `report1` is a second name for the same XGBoost report text.
report = report1 = classification_report(y_test, y_pred, target_names=labels.columns)

## Decision Tree, Random Forest, and ANN (MLP)

In [None]:
def tune_and_report(estimator, param_grid):
    """Grid-search ``estimator`` and evaluate the winning model on the test split.

    Reads the notebook-level ``cv``, ``X_train_padded``, ``y_train``,
    ``X_test_padded``, ``y_test`` and ``labels``.

    Parameters
    ----------
    estimator
        Unfitted classifier to tune.
    param_grid : dict
        Mapping of parameter name to the list of candidate values.

    Returns
    -------
    tuple
        ``(fitted GridSearchCV, best estimator, test-set predictions,
        classification report text)``.
    """
    search = GridSearchCV(estimator=estimator, param_grid=param_grid, cv=cv, scoring='accuracy')
    search.fit(X_train_padded, y_train)

    best = search.best_estimator_
    predictions = best.predict(X_test_padded)
    report_text = classification_report(y_test, predictions, target_names=labels.columns)
    return search, best, predictions, report_text


# NOTE: every estimator below is seeded with random_state=42, matching the seed
# already used for the train/test split and the ShuffleSplit CV, so that a
# fresh "Restart & Run All" reproduces the same reports.

# DECISION TREE

dt_param_grid = {
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10]
}

dt_classifier = DecisionTreeClassifier(random_state=42)

# Grid search for Decision Tree, then predict and report on the test split
dt_grid_search, best_dt_estimator, dt_y_pred, dt_report = tune_and_report(dt_classifier, dt_param_grid)

# RANDOM FOREST

rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10]
}

rf_classifier = RandomForestClassifier(random_state=42)

# Grid search for Random Forest, then predict and report on the test split
rf_grid_search, best_rf_estimator, rf_y_pred, rf_report = tune_and_report(rf_classifier, rf_param_grid)


# ANN MLP - Artificial Neural Network
ann_param_grid = {
    'hidden_layer_sizes': [(64,), (128,), (256,)],
    'activation': ['relu', 'tanh', 'logistic'],
    'alpha': [0.0001, 0.001, 0.01]
}

ann_classifier = MLPClassifier(max_iter=1000, random_state=42)

# Grid search for Artificial Neural Network, then predict and report on the test split
ann_grid_search, best_ann_estimator, ann_y_pred, ann_report = tune_and_report(ann_classifier, ann_param_grid)

In [16]:
# Print each model's classification report under its own heading.
print("\n\nXGBoost Classification Report:")
# `report1` holds the XGBoost report; the Decision Tree report was previously
# printed here by mistake, so XGBoost's results never appeared.
print(report1)
print("\nDecision Tree Classification Report:")
print(dt_report)
print("\nRandom Forest Classification Report:")
print(rf_report)
print("\nANN Classification Report:")
print(ann_report)


XGBoost Classification Report:
              precision    recall  f1-score   support

           0       0.56      0.53      0.54        19
           1       0.64      0.80      0.71        40
           2       0.69      0.59      0.63        41
           3       0.65      0.57      0.61        35
           4       0.76      0.76      0.76        34
           5       0.61      0.71      0.66        31
           6       0.74      0.64      0.69        45
           7       0.69      0.72      0.70        43

    accuracy                           0.67       288
   macro avg       0.67      0.67      0.66       288
weighted avg       0.68      0.67      0.67       288

Accuracy: 0.67

Decision Tree Classification Report:
              precision    recall  f1-score   support

           0       0.12      0.16      0.14        19
           1       0.64      0.53      0.58        40
           2       0.29      0.22      0.25        41
           3       0.17      0.26      0.20    