In [4]:
import pandas as pd

df = pd.read_csv('Dataset/Train/X_train_with_trq_margin.csv')


In [5]:
df

Unnamed: 0,trq_measured,oat,mgt,pa,ias,np,ng,trq_target,faulty,trq_margin
0,0.690790,-0.165988,0.027987,-0.925711,0.810638,-1.045463,1.280369,0.800616,0,-3.057098
1,1.473188,0.313427,2.123784,-0.312016,0.913846,0.072721,1.263805,1.447255,0,-2.115131
2,-1.466291,0.942660,-1.019912,-0.421774,0.312093,-2.094875,1.316260,-1.703897,0,-6.465562
3,1.334258,-0.106061,1.710670,-0.184317,1.130757,-0.067370,1.294173,1.334488,0,-7.052871
4,-0.696288,1.435260,-0.177547,1.352120,-1.972702,0.526307,-0.842595,-0.648782,0,-9.499112
...,...,...,...,...,...,...,...,...,...,...
598831,-0.120858,-1.484379,-0.503519,-0.503565,0.646206,-1.531962,1.244479,-0.128699,1,-11.275479
598832,-0.427967,0.643025,0.186683,1.373404,-1.682088,0.717142,-0.577635,-0.392722,1,-0.648192
598833,-0.413343,0.643025,0.199278,1.371820,-1.524653,0.536297,-0.522419,-0.450824,1,10.239773
598834,-3.159493,-0.571895,0.756841,0.577379,0.870825,0.586006,0.064906,-2.678510,1,3.016958


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import cross_val_score

# Supponiamo che df sia il DataFrame contenente i dati, già preparati
y = df['faulty'].values
X = df.drop(columns=['trq_target', 'faulty'])

# Divisione in set di addestramento e test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Lista di classificatori da provare
classifiers = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}


In [7]:
X_train

Unnamed: 0,trq_measured,oat,mgt,pa,ias,np,ng,trq_margin
192115,0.983275,0.972623,1.191759,-1.035997,-1.687336,-0.373024,1.316260,0.080256
220666,1.897292,0.223537,1.448696,-0.496177,-0.875670,0.620352,0.399681,9.601690
93178,1.122206,-0.285841,0.267291,-0.206479,1.078278,0.615257,-0.105542,-4.153085
530359,-1.122620,0.283464,-1.004798,1.348074,-1.470425,0.630540,-1.292677,-4.950618
204582,0.515298,0.942660,0.365531,-0.086695,1.123760,0.653464,-0.218734,0.512488
...,...,...,...,...,...,...,...,...
110268,-0.318285,-0.285841,-1.009836,-0.078252,0.759909,0.587239,-1.309241,-1.570566
259178,-1.042187,-0.615439,-1.828506,-0.479819,-0.658759,-2.359775,1.225154,-4.354306
365838,1.868044,-3.641747,1.307632,-0.031816,1.305685,-0.620094,1.261044,-4.612870
131932,-0.822823,0.882733,-0.770532,-0.049757,-0.342139,0.599975,-1.251265,-6.588956


In [11]:
X_test

Unnamed: 0,trq_measured,oat,mgt,pa,ias,np,ng,trq_margin
32072,1.443940,-0.195951,0.803835,-0.850252,1.003059,-0.561511,1.307977,-3.564409
353171,0.522210,0.738200,1.531429,2.052640,-1.517738,0.520339,0.244462,7.698460
448003,0.829720,-1.184745,0.272329,-0.710416,-1.968971,-0.856976,1.291413,-5.617264
162527,-0.471840,0.103683,-1.176089,-1.067658,-1.094330,-1.903841,1.307977,-6.783396
299628,0.032697,0.223537,0.020430,0.395607,1.139503,0.671294,-0.572113,-5.455970
...,...,...,...,...,...,...,...,...
579892,1.136830,1.751673,1.937379,-0.399084,-1.173047,0.678935,0.573610,-8.198281
22247,0.119186,0.231972,-0.447943,-0.609955,-0.113475,0.579042,-0.806465,60.683974
142845,-0.464528,0.792842,-0.672291,-0.573219,-0.459340,0.610163,-1.281634,-14.897612
547565,-0.274412,-0.615439,-0.712595,0.747571,0.691687,0.681482,-1.016599,-13.725804


In [9]:
y_train

array([0, 0, 0, ..., 1, 0, 0])

In [10]:
y_test

array([0, 1, 1, ..., 0, 1, 0])

In [12]:
# Valutazione dei modelli
results = {}

for name, clf in classifiers.items():
    # Addestramento del classificatore
    clf.fit(X_train, y_train)
    
    # Predizione
    y_pred = clf.predict(X_test)
    
    # Calcolo delle metriche
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    
    # Salvo i risultati
    results[name] = {
        "accuracy": accuracy,
        "classification_report": report
    }
    
    print(f"Model: {name}")
    print(f"Accuracy: {accuracy}")
    print(f"Classification Report:\n{report}")
    print("="*50)

# Se vuoi confrontare anche la validazione incrociata
for name, clf in classifiers.items():
    cv_scores = cross_val_score(clf, X, y, cv=5)  # 5-fold cross-validation
    print(f"{name} - CV Accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model: Logistic Regression
Accuracy: 0.9136664217487142
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.92      0.91     59747
           1       0.92      0.91      0.91     60021

    accuracy                           0.91    119768
   macro avg       0.91      0.91      0.91    119768
weighted avg       0.91      0.91      0.91    119768

Model: Random Forest
Accuracy: 0.9989646650190368
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     59747
           1       1.00      1.00      1.00     60021

    accuracy                           1.00    119768
   macro avg       1.00      1.00      1.00    119768
weighted avg       1.00      1.00      1.00    119768

Model: K-Nearest Neighbors
Accuracy: 0.9651242401977156
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.96      0.96     59747
      

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Logistic Regression - CV Accuracy: 0.9089 ± 0.0049
Random Forest - CV Accuracy: 0.9991 ± 0.0001
K-Nearest Neighbors - CV Accuracy: 0.9676 ± 0.0007
Gradient Boosting - CV Accuracy: 0.9609 ± 0.0006
