<a href="https://colab.research.google.com/github/maralhi/Sensors_models/blob/main/03_Model_Comparison.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

5. Model training and model selection

In [None]:
from google.colab import drive
import pandas as pd

# Attach Google Drive to the Colab runtime so the CSV below can be read.
drive.mount(mountpoint='/content/drive')

# NOTE(review): the preview of this frame shows an 'Unnamed: 0' column, which
# suggests the CSV was written together with its index — confirm, and consider
# reading it with index_col=0 if that column is not needed.
file_path = '/content/drive/MyDrive/Projects_ML/Project_Sensors/df_cl.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

# Last expression of the cell: renders the first five rows.
df.head()


Mounted at /content/drive


Unnamed: 0,footfall,tempMode,AQ,USS,CS,VOC,RP,IP,Temperature,fail,is_outlier_footfall,is_outlier_CS,is_outlier_Temperature,footfall_tempMode_interaction
0,0,7,7,1,6,6,36,3,1,1,False,False,True,0
1,190,1,3,3,5,1,20,4,1,0,False,False,True,190
2,31,7,2,2,6,1,24,6,1,0,False,False,True,217
3,83,4,3,4,5,1,28,6,1,0,False,False,True,332
4,640,7,5,6,4,0,68,6,1,0,True,False,True,4480


In [None]:
# Predictors: two raw sensor columns plus the footfall x tempMode interaction column.
X = df.loc[:, ['VOC', 'AQ', 'footfall_tempMode_interaction']]

# Target column to be predicted.
y = df.loc[:, 'fail']

In [None]:
from sklearn.model_selection import train_test_split

# First cut: 60% of the rows go to training, the other 40% are held back.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

# Second cut: the held-back rows are halved into validation and test sets.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
)

# Report the shape of every partition, features first and then targets.
shape_report = [
    ("Training set - Features:", X_train.shape),
    ("Validation set - Features:", X_val.shape),
    ("Test set - Features:", X_test.shape),
    ("Training set - Target variable:", y_train.shape),
    ("Validation set - Target variable:", y_val.shape),
    ("Test set - Target variable:", y_test.shape),
]
for caption, shape in shape_report:
    print(caption, shape)

Training set - Features: (566, 3)
Validation set - Features: (189, 3)
Test set - Features: (189, 3)
Training set - Target variable: (566,)
Validation set - Target variable: (189,)
Test set - Target variable: (189,)


In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, so nothing about
# the validation or test rows influences the fitted scaler.
scaler = StandardScaler().fit(X_train)

# Apply the same fitted scaler to every partition.
X_train_scaled = scaler.transform(X_train)
X_val_scaled, X_test_scaled = (scaler.transform(part) for part in (X_val, X_test))


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb


# Seed passed to every estimator that accepts one, so that re-running the
# notebook reproduces the same comparison table.
RANDOM_STATE = 42

# Models that are fitted on the standardized features; every other model is
# fitted on the raw (unscaled) features.
SCALED_MODELS = ('SVM', 'KNN')

# Validation metrics per model name, filled in by the loop below.
model_metrics = {}

models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(random_state=RANDOM_STATE),
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
    # `use_label_encoder` is intentionally not passed: XGBoost reports it as an
    # unused parameter and ignores it.
    'XGBoost': xgb.XGBClassifier(eval_metric='logloss', random_state=RANDOM_STATE),
    # probability=True is required for predict_proba (and hence ROC AUC).
    'SVM': SVC(probability=True, random_state=RANDOM_STATE),
    'KNN': KNeighborsClassifier()
}

for name, model in models.items():
    # Pick the matching train/validation feature matrices once, instead of
    # duplicating the fit/predict code per branch.
    if name in SCALED_MODELS:
        train_features, val_features = X_train_scaled, X_val_scaled
    else:
        train_features, val_features = X_train, X_val

    model.fit(train_features, y_train)
    y_pred = model.predict(val_features)

    # Only estimators exposing predict_proba get a ROC AUC. Checking for the
    # method explicitly (rather than a bare `except:`) lets genuine errors
    # surface instead of silently turning into a missing score.
    if hasattr(model, 'predict_proba'):
        y_proba = model.predict_proba(val_features)[:, 1]
        roc_auc = roc_auc_score(y_val, y_proba)
    else:
        y_proba = None
        roc_auc = None

    model_metrics[name] = {
        'Accuracy': accuracy_score(y_val, y_pred),
        'Precision': precision_score(y_val, y_pred, zero_division=0),
        'Recall': recall_score(y_val, y_pred, zero_division=0),
        'F1-score': f1_score(y_val, y_pred, zero_division=0),
        'ROC AUC': roc_auc
    }


Parameters: { "use_label_encoder" } are not used.



In [None]:
import pandas as pd

# One row per model, one column per metric, best F1-score first.
metrics_df = (
    pd.DataFrame(model_metrics)
    .transpose()
    .sort_values(by='F1-score', ascending=False)
)
print(metrics_df)


                     Accuracy  Precision    Recall  F1-score   ROC AUC
XGBoost              0.873016   0.870588  0.850575  0.860465  0.904722
SVM                  0.857143   0.826087  0.873563  0.849162  0.873338
KNN                  0.857143   0.826087  0.873563  0.849162  0.882860
Logistic Regression  0.857143   0.833333  0.862069  0.847458  0.920667
Random Forest        0.846561   0.837209  0.827586  0.832370  0.896552
Decision Tree        0.851852   0.873418  0.793103  0.831325  0.839306
