## Train a Model on Embeddings (Extracted Features)
In this feature-based approach, we are using the embeddings from the previous transformation step to train some models on a multilabel classification task.
These results will be considered the baseline for more advanced modelling techniques.

In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score,classification_report

pd.set_option('display.max_columns', None)

In [17]:
model_embedding_name="ModernBERT-base"#"all-mpnet-base-v2"#"bert-base-uncased"

In [18]:
df=pd.read_csv("out/3_elaborated_dataset_for_multilabel_training_"+model_embedding_name+"_.csv")
df

Unnamed: 0,embeddings,Alert User,Ambient Atmosphere,Ambient Luminance,Ambient Temperature,Control Hub,Energy Saving,Gardening,Other,Other Appliances,Outlet Control,Robot Control,Security
0,[ 1.40166485e+00 -1.04692149e+00 6.63752705e-...,0,0,1,0,0,0,0,0,0,0,0,0
1,[ 9.71651614e-01 -8.89602423e-01 7.05024824e-...,0,0,1,0,0,0,0,0,0,0,0,0
2,[ 1.04591990e+00 -5.18905222e-01 -7.82014132e-...,0,0,1,0,0,0,0,0,0,0,0,0
3,[ 7.71454036e-01 -5.15564382e-01 -9.71297175e-...,0,0,0,0,1,0,0,0,0,0,0,0
4,[ 8.28122139e-01 -5.97265363e-01 -1.32278651e-...,1,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2555,[ 7.11828172e-01 -7.65671611e-01 -3.60019267e-...,0,0,1,0,0,0,0,0,0,0,0,0
2556,[ 1.07509267e+00 -3.91025215e-01 -2.05628097e-...,1,0,0,0,0,0,0,0,0,0,0,0
2557,[ 8.02802205e-01 -5.68631768e-01 -3.58093351e-...,0,0,0,0,0,0,0,1,0,0,0,0
2558,[ 9.06479359e-01 -4.96750921e-01 -1.26053154e-...,0,1,0,0,0,0,0,0,0,0,0,0


In [19]:
labels=[col for col in df.columns if col != "embeddings"]
labels

['Alert User',
 'Ambient Atmosphere',
 'Ambient Luminance',
 'Ambient Temperature',
 'Control Hub',
 'Energy Saving',
 'Gardening',
 'Other',
 'Other Appliances',
 'Outlet Control',
 'Robot Control',
 'Security']

In [20]:
#convert the saved embeddings into manageable arrays
df["embeddings_array"]=df["embeddings"].apply(lambda x : np.fromstring(x.strip('[]'), sep=' '))

In [21]:
df

Unnamed: 0,embeddings,Alert User,Ambient Atmosphere,Ambient Luminance,Ambient Temperature,Control Hub,Energy Saving,Gardening,Other,Other Appliances,Outlet Control,Robot Control,Security,embeddings_array
0,[ 1.40166485e+00 -1.04692149e+00 6.63752705e-...,0,0,1,0,0,0,0,0,0,0,0,0,"[1.40166485, -1.04692149, 0.0663752705, -0.213..."
1,[ 9.71651614e-01 -8.89602423e-01 7.05024824e-...,0,0,1,0,0,0,0,0,0,0,0,0,"[0.971651614, -0.889602423, 0.0705024824, -0.1..."
2,[ 1.04591990e+00 -5.18905222e-01 -7.82014132e-...,0,0,1,0,0,0,0,0,0,0,0,0,"[1.0459199, -0.518905222, -0.0782014132, -0.09..."
3,[ 7.71454036e-01 -5.15564382e-01 -9.71297175e-...,0,0,0,0,1,0,0,0,0,0,0,0,"[0.771454036, -0.515564382, -0.0971297175, -0...."
4,[ 8.28122139e-01 -5.97265363e-01 -1.32278651e-...,1,0,0,0,0,0,0,0,0,0,0,0,"[0.828122139, -0.597265363, -0.132278651, -0.0..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2555,[ 7.11828172e-01 -7.65671611e-01 -3.60019267e-...,0,0,1,0,0,0,0,0,0,0,0,0,"[0.711828172, -0.765671611, -0.360019267, -0.2..."
2556,[ 1.07509267e+00 -3.91025215e-01 -2.05628097e-...,1,0,0,0,0,0,0,0,0,0,0,0,"[1.07509267, -0.391025215, -0.205628097, -0.06..."
2557,[ 8.02802205e-01 -5.68631768e-01 -3.58093351e-...,0,0,0,0,0,0,0,1,0,0,0,0,"[0.802802205, -0.568631768, -0.358093351, -0.5..."
2558,[ 9.06479359e-01 -4.96750921e-01 -1.26053154e-...,0,1,0,0,0,0,0,0,0,0,0,0,"[0.906479359, -0.496750921, -0.126053154, -0.2..."


In [22]:
df["embeddings_array"].iloc[0],type(df["embeddings_array"].iloc[0])

(array([ 1.40166485e+00, -1.04692149e+00,  6.63752705e-02, -2.13575810e-01,
         3.74687523e-01,  2.52173208e-02, -5.59648037e-01, -1.09014057e-01,
         3.66670907e-01, -2.88364559e-01, -9.91745830e-01,  2.33933836e-01,
        -5.15238881e-01, -9.68901634e-01,  3.50301445e-01, -1.82414521e-02,
         3.68957728e-01, -3.78105454e-02, -1.33815035e-01,  5.34784973e-01,
        -1.09357975e-01, -5.42500019e-01,  1.14618137e-01, -7.37599015e-01,
         8.69965777e-02, -2.91745454e-01,  2.89036572e-01, -3.02640777e-02,
        -5.08828163e-01,  1.06010628e+00, -2.83769697e-01, -1.91762757e+00,
         3.89050424e-01, -3.11419636e-01,  9.06866789e-01,  2.65267462e-01,
        -7.68693626e-01, -2.42944092e-01, -6.86814904e-01, -5.93240224e-02,
         2.47758016e-01,  4.49632049e-01, -7.91486651e-02, -1.56700760e-01,
         6.92760050e-02, -7.09432140e-02, -3.44487786e-01, -2.49652177e-01,
        -3.40492696e-01,  1.50368646e-01,  5.83714843e-01, -3.48946393e-01,
         7.2

## Create X, y for training

In [23]:
X = np.array(list(df["embeddings_array"]))
y= df[labels].values

In [24]:
X,y

(array([[ 1.40166485, -1.04692149,  0.06637527, ..., -0.6114791 ,
         -0.06263749, -0.86379087],
        [ 0.97165161, -0.88960242,  0.07050248, ..., -0.4613184 ,
          0.01163797, -0.65408331],
        [ 1.0459199 , -0.51890522, -0.07820141, ..., -0.28311902,
          0.1928068 , -0.63038075],
        ...,
        [ 0.8028022 , -0.56863177, -0.35809335, ..., -0.50351375,
         -0.09572634, -0.38107189],
        [ 0.90647936, -0.49675092, -0.12605315, ..., -0.36970887,
          0.00744445, -0.55369812],
        [ 0.73101306, -0.308505  , -0.12493306, ..., -0.52691317,
          0.18261479, -0.38305348]]),
 array([[0, 0, 1, ..., 0, 0, 0],
        [0, 0, 1, ..., 0, 0, 0],
        [0, 0, 1, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 1, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64))

In [25]:
# Split the data into train and test sets with stratified sampling. Validation set not taken into consideration at the moment.
# A cross validation methods can be also applied.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42,stratify=y)

## Train a Random Forest model


In [26]:
# Initialize RandomForestClassifier
rf = RandomForestClassifier(n_estimators=30, random_state=42)

# Use MultiOutputClassifier to handle multi-label classification
multi_rf = MultiOutputClassifier(rf, n_jobs=-1)

# Train the model
multi_rf.fit(X_train, y_train)

# Make predictions
y_pred_train = multi_rf.predict(X_train)
y_pred = multi_rf.predict(X_test)

# Evaluate the model
train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred)

train_f1_micro=f1_score(y_train, y_pred_train,average='micro')
test_f1_micro=f1_score(y_test, y_pred,average='micro')

train_f1_macro=f1_score(y_train, y_pred_train,average='macro')
test_f1_macro=f1_score(y_test, y_pred,average='macro')

train_f1_weighted=f1_score(y_train, y_pred_train,average='weighted')# weighted take into account class unbalance
test_f1_weighted=f1_score(y_test, y_pred,average='weighted')

print(f"Train Accuracy: {train_accuracy * 100:.2f}%")
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")


print(f"Train F1-micro: {train_f1_micro:.2f}")
print(f"Test F1-micro: {test_f1_micro :.2f}")

print(f"Train F1-macro: {train_f1_macro:.2f}")
print(f"Test F1-macro: {test_f1_macro :.2f}")

print(f"Train F1-weighted: {train_f1_weighted:.2f}")
print(f"Test F1-weighted: {test_f1_weighted :.2f}")

Train Accuracy: 97.32%
Test Accuracy: 11.85%
Train F1-micro: 0.99
Test F1-micro: 0.21
Train F1-macro: 0.97
Test F1-macro: 0.11
Train F1-weighted: 0.99
Test F1-weighted: 0.20


### The results show that the model is in overfittings (big gap between train and test performarces)
The model is too complex

In [27]:
print (classification_report(y_train,y_pred_train))

              precision    recall  f1-score   support

           0       1.00      0.97      0.99       187
           1       0.99      0.98      0.98       161
           2       1.00      0.99      1.00       430
           3       1.00      0.97      0.99       245
           4       1.00      0.95      0.97       139
           5       1.00      0.88      0.93        24
           6       1.00      0.79      0.88        19
           7       1.00      0.99      0.99       247
           8       1.00      0.97      0.99        71
           9       1.00      0.90      0.95        29
          10       1.00      0.95      0.98        65
          11       1.00      0.98      0.99       243

   micro avg       1.00      0.97      0.99      1860
   macro avg       1.00      0.94      0.97      1860
weighted avg       1.00      0.97      0.99      1860
 samples avg       0.98      0.97      0.98      1860



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [28]:
print (classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        80
           1       0.60      0.09      0.15        70
           2       0.75      0.28      0.41       184
           3       1.00      0.14      0.25       106
           4       1.00      0.02      0.03        60
           5       0.00      0.00      0.00        11
           6       0.00      0.00      0.00         8
           7       0.82      0.13      0.23       105
           8       1.00      0.03      0.06        30
           9       0.00      0.00      0.00        13
          10       0.00      0.00      0.00        28
          11       1.00      0.09      0.16       104

   micro avg       0.80      0.12      0.21       799
   macro avg       0.51      0.06      0.11       799
weighted avg       0.71      0.12      0.20       799
 samples avg       0.13      0.12      0.12       799



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Train a Logistic Regression model

In [29]:
from sklearn.linear_model import LogisticRegression
# Initialize LogisticRegression
logreg = LogisticRegression(solver='liblinear')

# Use MultiOutputClassifier to handle multi-label classification
multi_logreg = MultiOutputClassifier(logreg, n_jobs=-1)

# Train the model
multi_logreg.fit(X_train, y_train)

# Make predictions
y_pred_train = multi_logreg.predict(X_train)
y_pred = multi_logreg.predict(X_test)

# Evaluate the model
train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred)

train_f1_micro=f1_score(y_train, y_pred_train,average='micro')
test_f1_micro=f1_score(y_test, y_pred,average='micro')

train_f1_macro=f1_score(y_train, y_pred_train,average='macro')
test_f1_macro=f1_score(y_test, y_pred,average='macro')

train_f1_weighted=f1_score(y_train, y_pred_train,average='weighted')# weighted take into account class unbalance
test_f1_weighted=f1_score(y_test, y_pred,average='weighted')

print(f"Train Accuracy: {train_accuracy * 100:.2f}%")
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")


print(f"Train F1-micro: {train_f1_micro:.2f}")
print(f"Test F1-micro: {test_f1_micro :.2f}")

print(f"Train F1-macro: {train_f1_macro:.2f}")
print(f"Test F1-macro: {test_f1_macro :.2f}")

print(f"Train F1-weighted: {train_f1_weighted:.2f}")
print(f"Test F1-weighted: {test_f1_weighted :.2f}")

Train Accuracy: 79.02%
Test Accuracy: 47.14%
Train F1-micro: 0.88
Test F1-micro: 0.62
Train F1-macro: 0.88
Test F1-macro: 0.56
Train F1-weighted: 0.88
Test F1-weighted: 0.61


In [30]:
print (classification_report(y_train,y_pred_train))

              precision    recall  f1-score   support

           0       0.97      0.94      0.95       187
           1       0.94      0.81      0.87       161
           2       0.93      0.84      0.88       430
           3       0.98      0.87      0.92       245
           4       0.98      0.71      0.82       139
           5       1.00      0.83      0.91        24
           6       1.00      0.89      0.94        19
           7       0.96      0.66      0.78       247
           8       1.00      0.82      0.90        71
           9       1.00      0.69      0.82        29
          10       1.00      0.83      0.91        65
          11       0.97      0.82      0.89       243

   micro avg       0.96      0.81      0.88      1860
   macro avg       0.98      0.81      0.88      1860
weighted avg       0.96      0.81      0.88      1860
 samples avg       0.81      0.81      0.81      1860



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [31]:
print (classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.54      0.65        80
           1       0.66      0.44      0.53        70
           2       0.77      0.64      0.70       184
           3       0.83      0.59      0.69       106
           4       0.67      0.30      0.41        60
           5       0.60      0.27      0.38        11
           6       1.00      0.38      0.55         8
           7       0.59      0.36      0.45       105
           8       0.88      0.47      0.61        30
           9       1.00      0.23      0.38        13
          10       0.94      0.61      0.74        28
          11       0.79      0.62      0.70       104

   micro avg       0.76      0.52      0.62       799
   macro avg       0.80      0.45      0.56       799
weighted avg       0.76      0.52      0.61       799
 samples avg       0.51      0.52      0.51       799



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### The results show less better test performance, less overfitting