## Train a Model on Embeddings (Extracted Features)
In this feature-based approach, we are using the embeddings from the previous transformation step to train some models on a multilabel classification task.
These results will be considered the baseline for more advanced modelling techniques.

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score,classification_report

pd.set_option('display.max_columns', None)

In [10]:
model_embedding_name="llama3.2.1b"#"all-mpnet-base-v2-finetuned"#"llama3.2.1b"#"all-mpnet-base-v2-finetuned"#"ModernBERT-base"#"all-mpnet-base-v2"#"bert-base-uncased"

In [11]:
df=pd.read_csv("out/3_elaborated_dataset_for_multilabel_training_"+model_embedding_name+"_.csv")
df

Unnamed: 0,embeddings,Alert User,Ambient Atmosphere,Ambient Luminance,Ambient Temperature,Control Hub,Energy Saving,Gardening,Other,Other Appliances,Outlet Control,Robot Control,Security
0,[5.8760797e-06 0.018332329 0.032247532 -0.0...,0,0,1,0,0,0,0,0,0,0,0,0
1,[-0.01548752 0.007120902 0.032546464 -0.005...,0,0,1,0,0,0,0,0,0,0,0,0
2,[-0.007748441 0.0021770685 0.036084786 -0.0...,0,0,1,0,0,0,0,0,0,0,0,0
3,[-0.0013106825 0.00499818 0.053116668 -0.00...,0,0,0,0,1,0,0,0,0,0,0,0
4,[-0.013440807 -0.019121878 0.019484065 -0.0...,1,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2555,[0.0040453877 0.028940763 0.03343941 0.0087...,0,0,1,0,0,0,0,0,0,0,0,0
2556,[-0.015449206 0.001333692 0.006457404 -0.02...,1,0,0,0,0,0,0,0,0,0,0,0
2557,[-0.01638672 -0.0019023608 0.05313262 -0.01...,0,0,0,0,0,0,0,1,0,0,0,0
2558,[-0.018889135 0.01379838 0.0349957 0.007016...,0,1,0,0,0,0,0,0,0,0,0,0


In [12]:
labels=[col for col in df.columns if col != "embeddings"]
labels

['Alert User',
 'Ambient Atmosphere',
 'Ambient Luminance',
 'Ambient Temperature',
 'Control Hub',
 'Energy Saving',
 'Gardening',
 'Other',
 'Other Appliances',
 'Outlet Control',
 'Robot Control',
 'Security']

In [13]:
df["embeddings"].iloc[0],type(df["embeddings"].iloc[0])

('[5.8760797e-06  0.018332329  0.032247532  -0.0110866055  0.0056896866  -0.038031887  -0.0052733608  0.010633435  0.004938283  0.0030828542  0.024305897  -0.00034804727  0.010212543  -0.009306585  -0.02243945  0.0034766113  0.0013601297  -0.006926455  -0.007860296  -0.02537369  -0.019756323  -0.018142285  0.020376485  0.000500108  -0.0052967947  -0.006420259  0.0315943  -0.034798063  0.023545695  -0.004330015  -0.012828634  0.015793506  -0.005533178  -0.0027263244  0.0046459553  -0.031276304  -0.011349223  0.008606491  -0.008529677  0.0026761044  -0.00177175  -0.016179532  -0.0016162902  -0.0059591294  -0.012968814  0.027853627  0.010361526  0.013106162  -0.014220956  0.0051352116  0.012267384  0.0026726997  0.014480509  0.013825935  -0.0067624403  -0.008854733  0.010245699  -0.020308683  0.0030219995  0.025489  0.02453635  0.022993717  4.29167e-05  -0.0074858237  0.064596586  -0.020746429  0.009347191  0.00039841706  -0.009177946  -0.0007617805  0.0075129936  -0.01062468  0.025060652

In [14]:
#convert the saved embeddings into manageable arrays
df["embeddings_array"]=df["embeddings"].apply(lambda x : np.fromstring(x.strip('[]'), sep=' '))



In [15]:
df

Unnamed: 0,embeddings,Alert User,Ambient Atmosphere,Ambient Luminance,Ambient Temperature,Control Hub,Energy Saving,Gardening,Other,Other Appliances,Outlet Control,Robot Control,Security,embeddings_array
0,[5.8760797e-06 0.018332329 0.032247532 -0.0...,0,0,1,0,0,0,0,0,0,0,0,0,"[5.8760797e-06, 0.018332329, 0.032247532, -0.0..."
1,[-0.01548752 0.007120902 0.032546464 -0.005...,0,0,1,0,0,0,0,0,0,0,0,0,"[-0.01548752, 0.007120902, 0.032546464, -0.005..."
2,[-0.007748441 0.0021770685 0.036084786 -0.0...,0,0,1,0,0,0,0,0,0,0,0,0,"[-0.007748441, 0.0021770685, 0.036084786, -0.0..."
3,[-0.0013106825 0.00499818 0.053116668 -0.00...,0,0,0,0,1,0,0,0,0,0,0,0,"[-0.0013106825, 0.00499818, 0.053116668, -0.00..."
4,[-0.013440807 -0.019121878 0.019484065 -0.0...,1,0,0,0,0,0,0,0,0,0,0,0,"[-0.013440807, -0.019121878, 0.019484065, -0.0..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2555,[0.0040453877 0.028940763 0.03343941 0.0087...,0,0,1,0,0,0,0,0,0,0,0,0,"[0.0040453877, 0.028940763, 0.03343941, 0.0087..."
2556,[-0.015449206 0.001333692 0.006457404 -0.02...,1,0,0,0,0,0,0,0,0,0,0,0,"[-0.015449206, 0.001333692, 0.006457404, -0.02..."
2557,[-0.01638672 -0.0019023608 0.05313262 -0.01...,0,0,0,0,0,0,0,1,0,0,0,0,"[-0.01638672, -0.0019023608, 0.05313262, -0.01..."
2558,[-0.018889135 0.01379838 0.0349957 0.007016...,0,1,0,0,0,0,0,0,0,0,0,0,"[-0.018889135, 0.01379838, 0.0349957, 0.007016..."


In [16]:
df["embeddings_array"].iloc[0],type(df["embeddings_array"].iloc[0])

(array([ 5.8760797e-06,  1.8332329e-02,  3.2247532e-02, ...,
         8.6899180e-03, -1.5169558e-02, -2.2120630e-03]),
 numpy.ndarray)

## Create X, y for training

In [17]:
X = np.array(list(df["embeddings_array"]))
y= df[labels].values

In [18]:
X,y

(array([[ 5.8760797e-06,  1.8332329e-02,  3.2247532e-02, ...,
          8.6899180e-03, -1.5169558e-02, -2.2120630e-03],
        [-1.5487520e-02,  7.1209020e-03,  3.2546464e-02, ...,
         -2.2918832e-02, -4.1488796e-03,  2.4867827e-02],
        [-7.7484410e-03,  2.1770685e-03,  3.6084786e-02, ...,
         -1.1790700e-02, -6.7398650e-03,  1.2477863e-02],
        ...,
        [-1.6386720e-02, -1.9023608e-03,  5.3132620e-02, ...,
         -2.0504981e-02, -3.2578380e-02, -3.3031139e-03],
        [-1.8889135e-02,  1.3798380e-02,  3.4995700e-02, ...,
          4.3293600e-03, -1.0879731e-02,  5.3803160e-03],
        [-1.6756753e-02, -2.3842216e-02,  5.7848380e-02, ...,
          1.2718450e-03, -1.1283917e-02,  2.0400228e-02]]),
 array([[0, 0, 1, ..., 0, 0, 0],
        [0, 0, 1, ..., 0, 0, 0],
        [0, 0, 1, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 1, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64))

In [19]:
# Split the data into train and test sets with stratified sampling. Validation set not taken into consideration at the moment.
# A cross validation methods can be also applied.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42,stratify=y)

## Train a Random Forest model


In [20]:
# Initialize RandomForestClassifier
rf = RandomForestClassifier(n_estimators=30, random_state=42)

# Use MultiOutputClassifier to handle multi-label classification
multi_rf = MultiOutputClassifier(rf, n_jobs=-1)

# Train the model
multi_rf.fit(X_train, y_train)

# Make predictions
y_pred_train = multi_rf.predict(X_train)
y_pred = multi_rf.predict(X_test)

# Evaluate the model
train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred)

train_f1_micro=f1_score(y_train, y_pred_train,average='micro')
test_f1_micro=f1_score(y_test, y_pred,average='micro')

train_f1_macro=f1_score(y_train, y_pred_train,average='macro')
test_f1_macro=f1_score(y_test, y_pred,average='macro')

train_f1_weighted=f1_score(y_train, y_pred_train,average='weighted')# weighted take into account class unbalance
test_f1_weighted=f1_score(y_test, y_pred,average='weighted')

print(f"Train Accuracy: {train_accuracy * 100:.2f}%")
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")


print(f"Train F1-micro: {train_f1_micro:.2f}")
print(f"Test F1-micro: {test_f1_micro :.2f}")

print(f"Train F1-macro: {train_f1_macro:.2f}")
print(f"Test F1-macro: {test_f1_macro :.2f}")

print(f"Train F1-weighted: {train_f1_weighted:.2f}")
print(f"Test F1-weighted: {test_f1_weighted :.2f}")

Train Accuracy: 97.60%
Test Accuracy: 12.50%
Train F1-micro: 0.99
Test F1-micro: 0.23
Train F1-macro: 0.98
Test F1-macro: 0.17
Train F1-weighted: 0.99
Test F1-weighted: 0.22


### The results show that the model is in overfittings (big gap between train and test performances)
The model is too complex

In [21]:
print (classification_report(y_train,y_pred_train))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99       187
           1       0.99      0.98      0.98       161
           2       1.00      0.99      0.99       430
           3       1.00      0.99      1.00       245
           4       1.00      0.93      0.96       139
           5       1.00      0.88      0.93        24
           6       1.00      0.95      0.97        19
           7       1.00      0.98      0.99       247
           8       1.00      0.97      0.99        71
           9       1.00      0.83      0.91        29
          10       1.00      0.98      0.99        65
          11       1.00      1.00      1.00       243

   micro avg       1.00      0.98      0.99      1860
   macro avg       1.00      0.95      0.98      1860
weighted avg       1.00      0.98      0.99      1860
 samples avg       0.98      0.98      0.98      1860



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [22]:
print (classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.01      0.02        80
           1       0.67      0.06      0.11        70
           2       0.81      0.26      0.39       184
           3       0.93      0.13      0.23       106
           4       1.00      0.05      0.10        60
           5       1.00      0.09      0.17        11
           6       0.00      0.00      0.00         8
           7       0.78      0.17      0.28       105
           8       1.00      0.07      0.12        30
           9       0.00      0.00      0.00        13
          10       1.00      0.25      0.40        28
          11       0.83      0.10      0.17       104

   micro avg       0.84      0.13      0.23       799
   macro avg       0.75      0.10      0.17       799
weighted avg       0.84      0.13      0.22       799
 samples avg       0.14      0.13      0.13       799



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Train a Logistic Regression model

In [23]:
from sklearn.linear_model import LogisticRegression
# Initialize LogisticRegression
logreg = LogisticRegression(solver='liblinear')

# Use MultiOutputClassifier to handle multi-label classification
multi_logreg = MultiOutputClassifier(logreg, n_jobs=-1)

# Train the model
multi_logreg.fit(X_train, y_train)

# Make predictions
y_pred_train = multi_logreg.predict(X_train)
y_pred = multi_logreg.predict(X_test)

# Evaluate the model
train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred)

train_f1_micro=f1_score(y_train, y_pred_train,average='micro')
test_f1_micro=f1_score(y_test, y_pred,average='micro')

train_f1_macro=f1_score(y_train, y_pred_train,average='macro')
test_f1_macro=f1_score(y_test, y_pred,average='macro')

train_f1_weighted=f1_score(y_train, y_pred_train,average='weighted')# weighted take into account class unbalance
test_f1_weighted=f1_score(y_test, y_pred,average='weighted')

print(f"Train Accuracy: {train_accuracy * 100:.2f}%")
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")


print(f"Train F1-micro: {train_f1_micro:.2f}")
print(f"Test F1-micro: {test_f1_micro :.2f}")

print(f"Train F1-macro: {train_f1_macro:.2f}")
print(f"Test F1-macro: {test_f1_macro :.2f}")

print(f"Train F1-weighted: {train_f1_weighted:.2f}")
print(f"Test F1-weighted: {test_f1_weighted :.2f}")

Train Accuracy: 18.53%
Test Accuracy: 14.71%
Train F1-micro: 0.31
Test F1-micro: 0.25
Train F1-macro: 0.18
Test F1-macro: 0.13
Train F1-weighted: 0.30
Test F1-weighted: 0.23


In [24]:
print (classification_report(y_train,y_pred_train))

              precision    recall  f1-score   support

           0       0.97      0.16      0.28       187
           1       1.00      0.10      0.18       161
           2       0.90      0.33      0.48       430
           3       0.96      0.31      0.47       245
           4       0.00      0.00      0.00       139
           5       0.00      0.00      0.00        24
           6       0.00      0.00      0.00        19
           7       1.00      0.16      0.28       247
           8       0.00      0.00      0.00        71
           9       0.00      0.00      0.00        29
          10       1.00      0.09      0.17        65
          11       0.98      0.17      0.29       243

   micro avg       0.94      0.19      0.31      1860
   macro avg       0.57      0.11      0.18      1860
weighted avg       0.81      0.19      0.30      1860
 samples avg       0.19      0.19      0.19      1860



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [25]:
print (classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.05      0.10        80
           1       0.67      0.03      0.05        70
           2       0.80      0.32      0.46       184
           3       0.94      0.16      0.27       106
           4       0.00      0.00      0.00        60
           5       0.00      0.00      0.00        11
           6       0.00      0.00      0.00         8
           7       1.00      0.11      0.21       105
           8       0.00      0.00      0.00        30
           9       0.00      0.00      0.00        13
          10       1.00      0.11      0.19        28
          11       0.91      0.19      0.32       104

   micro avg       0.86      0.15      0.25       799
   macro avg       0.53      0.08      0.13       799
weighted avg       0.75      0.15      0.23       799
 samples avg       0.15      0.15      0.15       799



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### The results show less better test performance, less overfitting.
