## Train a Model on Embeddings (Extracted Features)
In this feature-based approach, we are using the embeddings from the previous transformation step to train some models on a multilabel classification task.
These results will be considered the baseline for more advanced modelling techniques.

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score,classification_report

pd.set_option('display.max_columns', None)

In [3]:
df=pd.read_csv("out/3_elaborated_dataset_for_multilabel_training.csv")
df

Unnamed: 0,embeddings,Alert User,Ambient Atmosphere,Ambient Luminance,Ambient Temperature,Control Hub,Energy Saving,Gardening,Other,Other Appliances,Outlet Control,Robot Control,Security
0,[ 1.10090651e-01 1.83028907e-01 1.02849193e-...,0,0,1,0,0,0,0,0,0,0,0,0
1,[ 2.13754341e-01 1.57022059e-01 4.43846077e-...,0,0,1,0,0,0,0,0,0,0,0,0
2,[ 1.92298502e-01 -2.79919673e-02 4.65791404e-...,0,0,1,0,0,0,0,0,0,0,0,0
3,[ 1.73662826e-01 -1.66096225e-01 -7.57032558e-...,0,0,0,0,1,0,0,0,0,0,0,0
4,[-8.44048522e-03 -5.86027130e-02 3.59569043e-...,1,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2555,[ 2.20692277e-01 -1.48237303e-01 6.49801314e-...,0,0,1,0,0,0,0,0,0,0,0,0
2556,[ 3.33835036e-02 -3.38201582e-01 6.66526139e-...,1,0,0,0,0,0,0,0,0,0,0,0
2557,[-2.00998753e-01 -4.28132027e-01 -3.77161413e-...,0,0,0,0,0,0,0,1,0,0,0,0
2558,[-2.31185228e-01 -2.89236039e-01 2.85209566e-...,0,1,0,0,0,0,0,0,0,0,0,0


In [4]:
labels=[col for col in df.columns if col != "embeddings"]
labels

['Alert User',
 'Ambient Atmosphere',
 'Ambient Luminance',
 'Ambient Temperature',
 'Control Hub',
 'Energy Saving',
 'Gardening',
 'Other',
 'Other Appliances',
 'Outlet Control',
 'Robot Control',
 'Security']

In [5]:
#convert the saved embeddings into manageable arrays
df["embeddings_array"]=df["embeddings"].apply(lambda x : np.fromstring(x.strip('[]'), sep=' '))

In [6]:
df

Unnamed: 0,embeddings,Alert User,Ambient Atmosphere,Ambient Luminance,Ambient Temperature,Control Hub,Energy Saving,Gardening,Other,Other Appliances,Outlet Control,Robot Control,Security,embeddings_array
0,[ 1.10090651e-01 1.83028907e-01 1.02849193e-...,0,0,1,0,0,0,0,0,0,0,0,0,"[0.110090651, 0.183028907, 0.102849193, -0.014..."
1,[ 2.13754341e-01 1.57022059e-01 4.43846077e-...,0,0,1,0,0,0,0,0,0,0,0,0,"[0.213754341, 0.157022059, 0.443846077, -0.130..."
2,[ 1.92298502e-01 -2.79919673e-02 4.65791404e-...,0,0,1,0,0,0,0,0,0,0,0,0,"[0.192298502, -0.0279919673, 0.465791404, -0.2..."
3,[ 1.73662826e-01 -1.66096225e-01 -7.57032558e-...,0,0,0,0,1,0,0,0,0,0,0,0,"[0.173662826, -0.166096225, -0.0757032558, -0...."
4,[-8.44048522e-03 -5.86027130e-02 3.59569043e-...,1,0,0,0,0,0,0,0,0,0,0,0,"[-0.00844048522, -0.058602713, 0.359569043, -0..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2555,[ 2.20692277e-01 -1.48237303e-01 6.49801314e-...,0,0,1,0,0,0,0,0,0,0,0,0,"[0.220692277, -0.148237303, 0.649801314, 0.210..."
2556,[ 3.33835036e-02 -3.38201582e-01 6.66526139e-...,1,0,0,0,0,0,0,0,0,0,0,0,"[0.0333835036, -0.338201582, 0.666526139, 0.04..."
2557,[-2.00998753e-01 -4.28132027e-01 -3.77161413e-...,0,0,0,0,0,0,0,1,0,0,0,0,"[-0.200998753, -0.428132027, -0.377161413, 0.0..."
2558,[-2.31185228e-01 -2.89236039e-01 2.85209566e-...,0,1,0,0,0,0,0,0,0,0,0,0,"[-0.231185228, -0.289236039, 0.285209566, -0.3..."


In [7]:
df["embeddings_array"].iloc[0],type(df["embeddings_array"].iloc[0])

(array([ 1.10090651e-01,  1.83028907e-01,  1.02849193e-01, -1.44717395e-02,
         4.55853075e-01, -2.67275721e-01,  2.09992871e-01,  6.03019595e-01,
        -3.41860026e-01, -1.50129989e-01, -4.86329310e-02, -5.35513423e-02,
        -3.28059405e-01,  1.09006651e-01, -3.30405799e-03,  6.62896752e-01,
        -3.50006446e-02,  5.46921901e-02,  1.44659607e-02,  6.38281941e-01,
        -1.28065005e-01,  1.75272986e-01,  1.88850477e-01,  3.10716182e-01,
         3.62308681e-01, -6.46377057e-02, -1.05822787e-01,  2.20651790e-01,
        -6.62548468e-02, -4.65032727e-01,  3.03388864e-01, -1.15656480e-01,
         1.28715992e-01, -1.65760547e-01,  2.74653852e-01, -4.15095866e-01,
         5.57725728e-01,  8.08102265e-02, -2.02457428e-01,  3.12488347e-01,
        -7.56973147e-01, -6.05182290e-01,  5.42944372e-02, -1.70146674e-01,
        -1.65818170e-01, -4.99431074e-01,  3.59099925e-01, -3.92225116e-01,
        -9.41386074e-03,  1.16573289e-01, -1.54255152e-01, -1.66508541e-01,
        -1.4

## Create X, y for training

In [8]:
X = np.array(list(df["embeddings_array"]))
y= df[labels].values

In [9]:
X,y

(array([[ 0.11009065,  0.18302891,  0.10284919, ..., -0.28095666,
         -0.2009484 ,  0.07048095],
        [ 0.21375434,  0.15702206,  0.44384608, ..., -0.33353853,
         -0.0659714 , -0.19565643],
        [ 0.1922985 , -0.02799197,  0.4657914 , ..., -0.28203088,
         -0.15333086, -0.30591229],
        ...,
        [-0.20099875, -0.42813203, -0.37716141, ...,  0.05594643,
          0.18499643, -0.14358717],
        [-0.23118523, -0.28923604,  0.28520957, ...,  0.36215779,
          0.26588878, -0.23236091],
        [ 0.04144778, -0.22082672,  0.05451458, ..., -0.20584038,
         -0.11561005,  0.00484603]]),
 array([[0, 0, 1, ..., 0, 0, 0],
        [0, 0, 1, ..., 0, 0, 0],
        [0, 0, 1, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 1, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64))

In [10]:
# Split the data into train and test sets with stratified sampling. Validation set not taken into consideration at the moment.
# A cross validation methods can be also applied.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42,stratify=y)

## Train a Random Forest model


In [11]:
# Initialize RandomForestClassifier
rf = RandomForestClassifier(n_estimators=30, random_state=42)

# Use MultiOutputClassifier to handle multi-label classification
multi_rf = MultiOutputClassifier(rf, n_jobs=-1)

# Train the model
multi_rf.fit(X_train, y_train)

# Make predictions
y_pred_train = multi_rf.predict(X_train)
y_pred = multi_rf.predict(X_test)

# Evaluate the model
train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred)

train_f1_micro=f1_score(y_train, y_pred_train,average='micro')
test_f1_micro=f1_score(y_test, y_pred,average='micro')

train_f1_macro=f1_score(y_train, y_pred_train,average='macro')
test_f1_macro=f1_score(y_test, y_pred,average='macro')

train_f1_weighted=f1_score(y_train, y_pred_train,average='weighted')# weighted take into account class unbalance
test_f1_weighted=f1_score(y_test, y_pred,average='weighted')

print(f"Train Accuracy: {train_accuracy * 100:.2f}%")
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")


print(f"Train F1-micro: {train_f1_micro:.2f}")
print(f"Test F1-micro: {test_f1_micro :.2f}")

print(f"Train F1-macro: {train_f1_macro:.2f}")
print(f"Test F1-macro: {test_f1_macro :.2f}")

print(f"Train F1-weighted: {train_f1_weighted:.2f}")
print(f"Test F1-weighted: {test_f1_weighted :.2f}")

Train Accuracy: 97.66%
Test Accuracy: 18.88%
Train F1-micro: 0.99
Test F1-micro: 0.32
Train F1-macro: 0.98
Test F1-macro: 0.18
Train F1-weighted: 0.99
Test F1-weighted: 0.30


### The results show that the model is in overfittings (big gap between train and test performarces)
The model is too complex

In [12]:
print (classification_report(y_train,y_pred_train))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99       187
           1       0.99      0.97      0.98       161
           2       1.00      0.99      0.99       430
           3       1.00      0.98      0.99       245
           4       1.00      0.94      0.97       139
           5       1.00      0.92      0.96        24
           6       1.00      0.84      0.91        19
           7       1.00      0.98      0.99       247
           8       1.00      0.97      0.99        71
           9       1.00      0.93      0.96        29
          10       1.00      0.95      0.98        65
          11       1.00      0.99      1.00       243

   micro avg       1.00      0.98      0.99      1860
   macro avg       1.00      0.95      0.98      1860
weighted avg       1.00      0.98      0.99      1860
 samples avg       0.98      0.98      0.98      1860



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [13]:
print (classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.10      0.18        80
           1       0.67      0.09      0.15        70
           2       0.85      0.40      0.55       184
           3       0.86      0.18      0.30       106
           4       1.00      0.05      0.10        60
           5       0.00      0.00      0.00        11
           6       0.00      0.00      0.00         8
           7       0.81      0.25      0.38       105
           8       1.00      0.10      0.18        30
           9       0.00      0.00      0.00        13
          10       0.00      0.00      0.00        28
          11       1.00      0.16      0.28       104

   micro avg       0.86      0.20      0.32       799
   macro avg       0.60      0.11      0.18       799
weighted avg       0.82      0.20      0.30       799
 samples avg       0.20      0.19      0.20       799



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Train a Logistic Regression model

In [14]:
from sklearn.linear_model import LogisticRegression
# Initialize LogisticRegression
logreg = LogisticRegression(solver='liblinear')

# Use MultiOutputClassifier to handle multi-label classification
multi_logreg = MultiOutputClassifier(logreg, n_jobs=-1)

# Train the model
multi_logreg.fit(X_train, y_train)

# Make predictions
y_pred_train = multi_logreg.predict(X_train)
y_pred = multi_logreg.predict(X_test)

# Evaluate the model
train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred)

train_f1_micro=f1_score(y_train, y_pred_train,average='micro')
test_f1_micro=f1_score(y_test, y_pred,average='micro')

train_f1_macro=f1_score(y_train, y_pred_train,average='macro')
test_f1_macro=f1_score(y_test, y_pred,average='macro')

train_f1_weighted=f1_score(y_train, y_pred_train,average='weighted')# weighted take into account class unbalance
test_f1_weighted=f1_score(y_test, y_pred,average='weighted')

print(f"Train Accuracy: {train_accuracy * 100:.2f}%")
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")


print(f"Train F1-micro: {train_f1_micro:.2f}")
print(f"Test F1-micro: {test_f1_micro :.2f}")

print(f"Train F1-macro: {train_f1_macro:.2f}")
print(f"Test F1-macro: {test_f1_macro :.2f}")

print(f"Train F1-weighted: {train_f1_weighted:.2f}")
print(f"Test F1-weighted: {test_f1_weighted :.2f}")

Train Accuracy: 85.38%
Test Accuracy: 52.60%
Train F1-micro: 0.92
Test F1-micro: 0.66
Train F1-macro: 0.94
Test F1-macro: 0.65
Train F1-weighted: 0.91
Test F1-weighted: 0.66


In [15]:
print (classification_report(y_train,y_pred_train))

              precision    recall  f1-score   support

           0       0.98      0.91      0.94       187
           1       0.96      0.83      0.89       161
           2       0.94      0.86      0.90       430
           3       0.98      0.93      0.95       245
           4       0.99      0.81      0.89       139
           5       1.00      1.00      1.00        24
           6       1.00      1.00      1.00        19
           7       0.95      0.74      0.83       247
           8       1.00      0.94      0.97        71
           9       1.00      0.93      0.96        29
          10       1.00      0.97      0.98        65
          11       0.99      0.89      0.94       243

   micro avg       0.97      0.87      0.92      1860
   macro avg       0.98      0.90      0.94      1860
weighted avg       0.97      0.87      0.91      1860
 samples avg       0.87      0.87      0.86      1860



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [16]:
print (classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.62      0.71        80
           1       0.72      0.51      0.60        70
           2       0.76      0.63      0.69       184
           3       0.82      0.61      0.70       106
           4       0.76      0.53      0.63        60
           5       0.71      0.45      0.56        11
           6       1.00      0.50      0.67         8
           7       0.62      0.47      0.53       105
           8       0.86      0.63      0.73        30
           9       0.83      0.38      0.53        13
          10       0.94      0.54      0.68        28
          11       0.84      0.65      0.74       104

   micro avg       0.78      0.58      0.66       799
   macro avg       0.81      0.55      0.65       799
weighted avg       0.78      0.58      0.66       799
 samples avg       0.57      0.59      0.57       799



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### The results show less better test performance, less overfitting