In [1]:
from xgboost import XGBClassifier
import mlflow
import ast
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MultiLabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import class_weight
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [2]:
generic = lambda x: ast.literal_eval(x)
conv = {'artists': generic}

In [3]:
X_data = pd.read_csv("preprocessed/X_data.csv")
target = pd.read_csv("preprocessed/target.csv", converters=conv)

In [4]:
X_data.shape, target.shape

((159068, 13), (159068, 1))

In [5]:
X_data.head()

Unnamed: 0,acousticness,danceability,duration_ms,instrumentalness,key,liveness,loudness,mode,popularity,speechiness,tempo,valence,year
0,0.976,0.261,203720,0.88,0,0.129,-33.903,1,12,0.0387,140.647,0.0878,1958
1,0.991,0.251,126173,0.929,1,0.299,-24.261,1,2,0.0335,73.85,0.229,1953
2,0.963,0.265,737067,7.3e-05,1,0.119,-17.544,1,0,0.0515,103.444,0.0476,1948
3,0.597,0.714,281400,1.2e-05,2,0.0591,-9.912,1,54,0.046,111.583,0.672,1995
4,0.0757,0.185,222453,0.102,9,0.232,-6.791,1,31,0.0621,180.539,0.644,1981


In [6]:
target

Unnamed: 0,artists
0,[mamie smith]
1,[screamin' jay hawkins]
2,[mamie smith]
3,[oscar velazquez]
4,[mamie smith & her jazz hounds]
...,...
159063,"[dj combo, sander-7, tony t]"
159064,[alessia cara]
159065,[roger fly]
159066,[taylor swift]


The current danuniqueset is highly imbalanced given that most of the samples correspond to unique classes (over 48% percent). Furthermore, there are a high dimensionality having 32540 classes.
Widely speaking there are 2 Multilabel classification methods:

- __Algorithm adaptation methods__

Modify the algorithm to build decision trees or the entropy definition to consider various classes.

- __Problem transformation methods:__ Convert the problem into a combination of binary classification or multiclass classification problems.
    - __Simple:__ Ignore classes by frequency in each sample (keep the class with higher frequency)    
    - __Binary Relevance__: Divide the dataset into different datasets with each one containing the instances of one specific label. Train n_classes classifiers with these datasets and merge the prediction to get the final result. __Problem__: This will produce datasets with 1 sample, insufucient to train any supervised learning model.

    - __Label Powerset__: Treats every combination in the dataset as a single class (resulting with more classes in this case). __Problem__: This is more prior to imbalance problems, the number of classes in this approach increase with respect to multilabel classes (artist)

    - __Pruned set__: Solves imbalanced class distribution by pruning instances that have frequency less that specific threshold.


__Experiments__

- Train as it is (don't remove any label)
- Train using pruned set
- Train using pruned set plus some imbalanced handling tecniques (undersampling high frequency classes and oversampling low frequency)

__With all use:__

- Random Forest
- XGBoost
- Feed Forward Neural Network (MLP)

In [7]:
def fetch_logged_data(run_id):
    client = mlflow.tracking.MlflowClient()
    data = client.get_run(run_id).data
    tags = {k: v for k, v in data.tags.items() if not k.startswith("mlflow.")}
    artifacts = [f.path for f in client.list_artifacts(run_id, "model")]
    return data.params, data.metrics, tags, artifacts

In [8]:
mlflow.sklearn.autolog()

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X_data, target, test_size=0.2, random_state=42)

In [10]:
X_train.shape, X_test.shape

((127254, 13), (31814, 13))

In [11]:
forest = RandomForestClassifier(n_estimators=10)
multi_target_forest = MultiOutputClassifier(forest, n_jobs=3)

### Multilabel binarize the target

In [12]:
mlb = MultiLabelBinarizer()
target_space_train = mlb.fit_transform(y_train["artists"].to_list())
target_space_test = mlb.fit_transform(y_test["artists"].to_list())

In [None]:
with mlflow.start_run() as run:
    multi_target_forest.fit(X_train, target_space_train)

In [93]:
params, metrics, tags, artifacts = fetch_logged_data(run.info.run_id)

In [94]:
scoring = ['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted', 'roc_auc']

In [95]:
target_names = df_target.columns.tolist()

In [96]:
y_pred = multi_target_forest.predict(X_test)
report = classification_report(y_test, y_pred, target_names=target_names)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [97]:
print(report)

                          precision    recall  f1-score   support

        Francisco Canaro       0.93      0.60      0.73       439
Tadeusz Dolega Mostowicz       1.00      0.95      0.97       290
        Эрнест Хемингуэй       1.00      0.98      0.99       249
       Эрих Мария Ремарк       1.00      0.98      0.99       199
         Frédéric Chopin       0.88      0.42      0.57       198
   Johann Sebastian Bach       0.98      0.28      0.44       182
 Wolfgang Amadeus Mozart       0.85      0.06      0.11       185
    Ludwig van Beethoven       0.76      0.09      0.17       171
           Frank Sinatra       0.54      0.04      0.08       167
   Wiener Philharmoniker       0.96      0.37      0.54       139

               micro avg       0.96      0.54      0.69      2219
               macro avg       0.89      0.48      0.56      2219
            weighted avg       0.90      0.54      0.62      2219
             samples avg       0.03      0.03      0.03      2219



In [4]:
clf = mlflow.sklearn.load_model("mlruns/0/0f10aa9cdc2c476eab4796bbf5a50b62/artifacts/model/")

In [5]:
clf

MultiOutputClassifier(estimator=RandomForestClassifier(n_estimators=10),
                      n_jobs=-1)

In [8]:
data_dict = {"acousticness": 0.909,
 "danceability": 0.212,
 "duration_ms": 70453,
 "energy": 0.0473,
 "explicit": 0,
 "instrumentalness": 0.927,
 "key": 5,
 "liveness": 0.0958,
 "loudness": -27.82800000000001,
 "mode": 1,
 "popularity": 0,
 "speechiness": 0.04,
 "tempo": 110.388,
 "valence": 0.248,
 "year": 1952}

In [16]:
import numpy as np

In [23]:
sample = np.array([i for i in data_dict.values()]).reshape(1, -1)

In [24]:
sample

array([[ 9.09000e-01,  2.12000e-01,  7.04530e+04,  4.73000e-02,
         0.00000e+00,  9.27000e-01,  5.00000e+00,  9.58000e-02,
        -2.78280e+01,  1.00000e+00,  0.00000e+00,  4.00000e-02,
         1.10388e+02,  2.48000e-01,  1.95200e+03]])

In [25]:
preds = clf.predict(sample)

In [194]:
preds[0][0] = 1

In [196]:
preds[0]

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [199]:
y_test.columns

Index(['Francisco Canaro', 'Tadeusz Dolega Mostowicz', 'Эрнест Хемингуэй',
       'Эрих Мария Ремарк', 'Frédéric Chopin', 'Johann Sebastian Bach',
       'Wolfgang Amadeus Mozart', 'Ludwig van Beethoven', 'Frank Sinatra',
       'Wiener Philharmoniker'],
      dtype='object')

In [None]:
X_test