## Train a Model on Embeddings (Extracted Features)
In this feature-based approach, we are using the embeddings from the previous transformation step to train some models on a multilabel classification task.
These results will be considered the baseline for more advanced modelling techniques.

In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score,classification_report

pd.set_option('display.max_columns', None)

In [18]:
model_embedding_name="all-mpnet-base-v2-finetuned"#"ModernBERT-base"#"all-mpnet-base-v2"#"bert-base-uncased"

In [19]:
df=pd.read_csv("out/3_elaborated_dataset_for_multilabel_training_"+model_embedding_name+"_.csv")
df

Unnamed: 0,embeddings,Alert User,Ambient Atmosphere,Ambient Luminance,Ambient Temperature,Control Hub,Energy Saving,Gardening,Other,Other Appliances,Outlet Control,Robot Control,Security
0,[-3.81090045e-02 -7.89537840e-03 -1.92272980e-...,0,0,1,0,0,0,0,0,0,0,0,0
1,[-6.28273264e-02 8.64298567e-02 -1.31409541e-...,0,0,1,0,0,0,0,0,0,0,0,0
2,[-4.60534953e-02 3.22994590e-02 -2.23879218e-...,0,0,1,0,0,0,0,0,0,0,0,0
3,[-3.30852084e-02 2.96814479e-02 -2.41294652e-...,0,0,0,0,1,0,0,0,0,0,0,0
4,[-1.68051887e-02 1.30204540e-02 3.50181316e-...,1,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2555,[ 8.11339170e-03 -1.91914421e-02 1.64293814e-...,0,0,1,0,0,0,0,0,0,0,0,0
2556,[-2.73690857e-02 9.20130499e-03 -4.17254120e-...,1,0,0,0,0,0,0,0,0,0,0,0
2557,[ 6.07395768e-02 8.36325623e-03 -2.18872428e-...,0,0,0,0,0,0,0,1,0,0,0,0
2558,[ 2.58382261e-02 7.54028931e-02 2.00409396e-...,0,1,0,0,0,0,0,0,0,0,0,0


In [20]:
labels=[col for col in df.columns if col != "embeddings"]
labels

['Alert User',
 'Ambient Atmosphere',
 'Ambient Luminance',
 'Ambient Temperature',
 'Control Hub',
 'Energy Saving',
 'Gardening',
 'Other',
 'Other Appliances',
 'Outlet Control',
 'Robot Control',
 'Security']

In [21]:
#convert the saved embeddings into manageable arrays
df["embeddings_array"]=df["embeddings"].apply(lambda x : np.fromstring(x.strip('[]'), sep=' '))

In [22]:
df

Unnamed: 0,embeddings,Alert User,Ambient Atmosphere,Ambient Luminance,Ambient Temperature,Control Hub,Energy Saving,Gardening,Other,Other Appliances,Outlet Control,Robot Control,Security,embeddings_array
0,[-3.81090045e-02 -7.89537840e-03 -1.92272980e-...,0,0,1,0,0,0,0,0,0,0,0,0,"[-0.0381090045, -0.0078953784, -0.019227298, -..."
1,[-6.28273264e-02 8.64298567e-02 -1.31409541e-...,0,0,1,0,0,0,0,0,0,0,0,0,"[-0.0628273264, 0.0864298567, -0.0131409541, -..."
2,[-4.60534953e-02 3.22994590e-02 -2.23879218e-...,0,0,1,0,0,0,0,0,0,0,0,0,"[-0.0460534953, 0.032299459, -0.0223879218, -0..."
3,[-3.30852084e-02 2.96814479e-02 -2.41294652e-...,0,0,0,0,1,0,0,0,0,0,0,0,"[-0.0330852084, 0.0296814479, -0.0241294652, -..."
4,[-1.68051887e-02 1.30204540e-02 3.50181316e-...,1,0,0,0,0,0,0,0,0,0,0,0,"[-0.0168051887, 0.013020454, 0.00350181316, -0..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2555,[ 8.11339170e-03 -1.91914421e-02 1.64293814e-...,0,0,1,0,0,0,0,0,0,0,0,0,"[0.0081133917, -0.0191914421, 0.0164293814, 0...."
2556,[-2.73690857e-02 9.20130499e-03 -4.17254120e-...,1,0,0,0,0,0,0,0,0,0,0,0,"[-0.0273690857, 0.00920130499, -0.041725412, 0..."
2557,[ 6.07395768e-02 8.36325623e-03 -2.18872428e-...,0,0,0,0,0,0,0,1,0,0,0,0,"[0.0607395768, 0.00836325623, -0.0218872428, -..."
2558,[ 2.58382261e-02 7.54028931e-02 2.00409396e-...,0,1,0,0,0,0,0,0,0,0,0,0,"[0.0258382261, 0.0754028931, 0.00200409396, -0..."


In [23]:
df["embeddings_array"].iloc[0],type(df["embeddings_array"].iloc[0])

(array([-3.81090045e-02, -7.89537840e-03, -1.92272980e-02, -2.46934853e-02,
        -7.02785924e-02,  2.87226867e-02,  2.14655865e-02,  1.48777486e-04,
        -3.55027243e-02,  1.10283475e-02, -1.29162101e-02,  3.19684483e-02,
        -3.69567536e-02, -1.15828798e-03,  4.30162363e-02,  2.15776190e-02,
        -3.39004509e-02,  1.30145764e-02, -7.95737058e-02,  9.28139593e-03,
         7.26475986e-03, -5.40941693e-02,  2.26404928e-02, -8.83916691e-02,
        -2.59162057e-02,  3.59333530e-02,  1.95130631e-02, -3.36550102e-02,
        -9.88819730e-03,  2.60190405e-02, -3.49622369e-02, -2.13097893e-02,
        -3.18212584e-02, -6.72504529e-02,  5.07572804e-05,  1.33579392e-02,
         4.93039228e-02, -2.13951543e-02,  3.87396961e-02,  6.35862425e-02,
        -3.56026813e-02, -2.33595204e-02,  6.72092289e-02,  1.69952735e-02,
        -7.92835467e-03, -3.59339304e-02, -2.10603457e-02, -5.64836012e-03,
        -1.03149347e-01, -1.56775154e-02, -3.75967026e-02, -1.76340379e-02,
         3.5

## Create X, y for training

In [24]:
X = np.array(list(df["embeddings_array"]))
y= df[labels].values

In [25]:
X,y

(array([[-0.038109  , -0.00789538, -0.0192273 , ..., -0.00541885,
          0.01937505,  0.01477483],
        [-0.06282733,  0.08642986, -0.01314095, ...,  0.00692127,
          0.00085832,  0.00849081],
        [-0.0460535 ,  0.03229946, -0.02238792, ...,  0.0261548 ,
         -0.00467431,  0.00573931],
        ...,
        [ 0.06073958,  0.00836326, -0.02188724, ..., -0.02586745,
          0.00613012, -0.04101495],
        [ 0.02583823,  0.07540289,  0.00200409, ..., -0.01437893,
         -0.03769761,  0.00616436],
        [-0.01689353,  0.02439754,  0.00898682, ..., -0.00699768,
         -0.05302438, -0.02829444]]),
 array([[0, 0, 1, ..., 0, 0, 0],
        [0, 0, 1, ..., 0, 0, 0],
        [0, 0, 1, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 1, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64))

In [26]:
# Split the data into train and test sets with stratified sampling. Validation set not taken into consideration at the moment.
# A cross validation methods can be also applied.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42,stratify=y)

## Train a Random Forest model


In [27]:
# Initialize RandomForestClassifier
rf = RandomForestClassifier(n_estimators=30, random_state=42)

# Use MultiOutputClassifier to handle multi-label classification
multi_rf = MultiOutputClassifier(rf, n_jobs=-1)

# Train the model
multi_rf.fit(X_train, y_train)

# Make predictions
y_pred_train = multi_rf.predict(X_train)
y_pred = multi_rf.predict(X_test)

# Evaluate the model
train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred)

train_f1_micro=f1_score(y_train, y_pred_train,average='micro')
test_f1_micro=f1_score(y_test, y_pred,average='micro')

train_f1_macro=f1_score(y_train, y_pred_train,average='macro')
test_f1_macro=f1_score(y_test, y_pred,average='macro')

train_f1_weighted=f1_score(y_train, y_pred_train,average='weighted')# weighted take into account class unbalance
test_f1_weighted=f1_score(y_test, y_pred,average='weighted')

print(f"Train Accuracy: {train_accuracy * 100:.2f}%")
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")


print(f"Train F1-micro: {train_f1_micro:.2f}")
print(f"Test F1-micro: {test_f1_micro :.2f}")

print(f"Train F1-macro: {train_f1_macro:.2f}")
print(f"Test F1-macro: {test_f1_macro :.2f}")

print(f"Train F1-weighted: {train_f1_weighted:.2f}")
print(f"Test F1-weighted: {test_f1_weighted :.2f}")

Train Accuracy: 99.22%
Test Accuracy: 51.30%
Train F1-micro: 1.00
Test F1-micro: 0.67
Train F1-macro: 0.99
Test F1-macro: 0.55
Train F1-weighted: 1.00
Test F1-weighted: 0.65


### The results show that the model is in overfittings (big gap between train and test performances)
The model is too complex

In [28]:
print (classification_report(y_train,y_pred_train))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99       187
           1       0.99      0.99      0.99       161
           2       1.00      0.99      1.00       430
           3       1.00      1.00      1.00       245
           4       1.00      0.98      0.99       139
           5       1.00      0.96      0.98        24
           6       1.00      0.95      0.97        19
           7       1.00      1.00      1.00       247
           8       1.00      1.00      1.00        71
           9       1.00      1.00      1.00        29
          10       1.00      1.00      1.00        65
          11       1.00      1.00      1.00       243

   micro avg       1.00      0.99      1.00      1860
   macro avg       1.00      0.99      0.99      1860
weighted avg       1.00      0.99      1.00      1860
 samples avg       0.99      0.99      0.99      1860



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [29]:
print (classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.61      0.75        80
           1       0.91      0.46      0.61        70
           2       0.87      0.65      0.75       184
           3       0.96      0.65      0.78       106
           4       0.96      0.38      0.55        60
           5       1.00      0.18      0.31        11
           6       0.00      0.00      0.00         8
           7       1.00      0.21      0.35       105
           8       1.00      0.37      0.54        30
           9       1.00      0.23      0.38        13
          10       1.00      0.64      0.78        28
          11       0.94      0.64      0.77       104

   micro avg       0.93      0.52      0.67       799
   macro avg       0.89      0.42      0.55       799
weighted avg       0.93      0.52      0.65       799
 samples avg       0.53      0.52      0.53       799



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Train a Logistic Regression model

In [30]:
from sklearn.linear_model import LogisticRegression
# Initialize LogisticRegression
logreg = LogisticRegression(solver='liblinear')

# Use MultiOutputClassifier to handle multi-label classification
multi_logreg = MultiOutputClassifier(logreg, n_jobs=-1)

# Train the model
multi_logreg.fit(X_train, y_train)

# Make predictions
y_pred_train = multi_logreg.predict(X_train)
y_pred = multi_logreg.predict(X_test)

# Evaluate the model
train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred)

train_f1_micro=f1_score(y_train, y_pred_train,average='micro')
test_f1_micro=f1_score(y_test, y_pred,average='micro')

train_f1_macro=f1_score(y_train, y_pred_train,average='macro')
test_f1_macro=f1_score(y_test, y_pred,average='macro')

train_f1_weighted=f1_score(y_train, y_pred_train,average='weighted')# weighted take into account class unbalance
test_f1_weighted=f1_score(y_test, y_pred,average='weighted')

print(f"Train Accuracy: {train_accuracy * 100:.2f}%")
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")


print(f"Train F1-micro: {train_f1_micro:.2f}")
print(f"Test F1-micro: {test_f1_micro :.2f}")

print(f"Train F1-macro: {train_f1_macro:.2f}")
print(f"Test F1-macro: {test_f1_macro :.2f}")

print(f"Train F1-weighted: {train_f1_weighted:.2f}")
print(f"Test F1-weighted: {test_f1_weighted :.2f}")

Train Accuracy: 71.76%
Test Accuracy: 65.89%
Train F1-micro: 0.82
Test F1-micro: 0.77
Train F1-macro: 0.67
Test F1-macro: 0.64
Train F1-weighted: 0.81
Test F1-weighted: 0.76


In [31]:
print (classification_report(y_train,y_pred_train))

              precision    recall  f1-score   support

           0       0.94      0.86      0.90       187
           1       0.88      0.65      0.75       161
           2       0.91      0.81      0.86       430
           3       0.95      0.87      0.91       245
           4       0.96      0.64      0.77       139
           5       0.00      0.00      0.00        24
           6       0.00      0.00      0.00        19
           7       0.95      0.55      0.70       247
           8       1.00      0.72      0.84        71
           9       1.00      0.38      0.55        29
          10       1.00      0.86      0.93        65
          11       0.94      0.79      0.86       243

   micro avg       0.94      0.73      0.82      1860
   macro avg       0.79      0.59      0.67      1860
weighted avg       0.92      0.73      0.81      1860
 samples avg       0.74      0.73      0.73      1860



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [32]:
print (classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.75      0.83        80
           1       0.90      0.61      0.73        70
           2       0.88      0.78      0.82       184
           3       0.93      0.74      0.82       106
           4       0.95      0.70      0.81        60
           5       0.00      0.00      0.00        11
           6       0.00      0.00      0.00         8
           7       0.87      0.43      0.57       105
           8       1.00      0.63      0.78        30
           9       1.00      0.38      0.56        13
          10       1.00      0.82      0.90        28
          11       0.94      0.75      0.83       104

   micro avg       0.92      0.67      0.77       799
   macro avg       0.78      0.55      0.64       799
weighted avg       0.89      0.67      0.76       799
 samples avg       0.68      0.68      0.68       799



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### The results show less better test performance, less overfitting