In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve, auc, accuracy_score
import matplotlib.pyplot as plt


In [3]:
train_path = '../../feature_extracted_data/training_features_01.csv'
train_data = pd.read_csv(train_path)
train_data.shape

(51700, 560)

In [5]:
print(train_data.head())

   AN311_window_1_mean  AN311_window_1_std  AN311_window_1_min  \
0             3.392500            0.139142                 3.2   
1             3.372500            0.099132                 3.2   
2             3.356667            0.121610                 3.2   
3             3.380833            0.130573                 3.2   
4             3.355000            0.125731                 3.1   

   AN311_window_1_max  AN311_window_2_mean  AN311_window_2_std  \
0                 3.7             3.356667            0.121610   
1                 3.7             3.380833            0.130573   
2                 3.6             3.355000            0.125731   
3                 3.6             3.392500            0.152295   
4                 3.5             3.402500            0.145179   

   AN311_window_2_min  AN311_window_2_max  AN311_window_3_mean  \
0                 3.2                 3.6             3.355000   
1                 3.2                 3.6             3.392500   
2       

In [7]:
train_label_path = '../../extracted_data/train_labels_full_with_columns.csv'
train_labels = pd.read_csv(train_label_path)
train_labels.shape

(51700, 3)

In [9]:
train_labels.head()

Unnamed: 0,MM263,MM264,MM256
0,normal,normal,normal
1,normal,normal,normal
2,normal,normal,normal
3,normal,normal,normal
4,normal,normal,normal


In [11]:
test_data = pd.read_csv("../../feature_extracted_data/test_features_01.csv")
test_data.shape

(5076, 560)

In [13]:
test_data.head()

Unnamed: 0,AN311_window_1_mean,AN311_window_1_std,AN311_window_1_min,AN311_window_1_max,AN311_window_2_mean,AN311_window_2_std,AN311_window_2_min,AN311_window_2_max,AN311_window_3_mean,AN311_window_3_std,...,V_window_3_min,V_window_3_max,V_window_4_mean,V_window_4_std,V_window_4_min,V_window_4_max,V_window_5_mean,V_window_5_std,V_window_5_min,V_window_5_max
0,4.073333,0.124989,3.9,4.4,4.061667,0.105026,3.9,4.2,4.158333,0.104549,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3.345833,0.246271,2.9,3.9,3.205,0.133448,2.8,3.7,3.274167,0.198954,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.148333,0.100816,3.9,4.3,4.056667,0.098939,3.9,4.2,4.1,0.125167,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3.1775,0.350823,2.7,4.0,3.215833,0.31649,2.7,3.9,3.261667,0.192,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3.918333,0.172715,3.6,4.2,3.79,0.159896,3.6,4.2,3.873333,0.185173,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
test_labels = pd.read_csv("../../extracted_data/test_labels_full_with_columns.csv")
print(test_labels.shape)
print(test_labels.head())

(5076, 3)
    MM263   MM264    MM256
1  normal  normal   normal
2  normal  normal   normal
3  normal  normal   normal
4  normal  normal   normal


### Combine with original data & feature extracted

In [17]:
original_train_data = pd.read_csv("../../extracted_data/training_data_full_with_columns.csv")
original_train_data.shape

(51700, 16800)

In [20]:
original_test_data = pd.read_csv("../../extracted_data/test_data_full_with_columns.csv")
original_test_data.shape

(5076, 16800)

In [22]:
merged_train_df = pd.concat([original_train_data, train_data], axis=1)
merged_train_df.shape

(51700, 17360)

In [24]:
merged_test_df = pd.concat([original_test_data, test_data], axis=1)
merged_test_df.shape

(5076, 17360)

In [26]:
# Separate features and target
X_train = merged_train_df
y_train = train_labels['MM263']

X_test = merged_test_df
y_test = test_labels['MM263']

In [28]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(51700, 17360)
(51700,)
(5076, 17360)
(5076,)


In [50]:
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Define your X_train, y_train, X_test, y_test
# X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.3, random_state=42)

def xgBoostModel(X_train, y_train):
    # Initialize the label encoder
    le = LabelEncoder()

    # Fit label encoder and return encoded labels
    y_train_encoded = le.fit_transform(y_train)

    # Initialize the XGBoost classifier
    model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')

    # Train the model with the encoded labels
    model.fit(X_train, y_train_encoded)

    # Store the label encoder in the model object for future use (prediction phase)
    model.label_encoder_ = le

    return model


In [52]:
# Train the model
model = xgBoostModel(X_train, y_train)

# Assume y_test is available and needs to be encoded using the same encoder used for y_train
y_test_encoded = model.label_encoder_.transform(y_test)

# Predict on the test data
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:,1]  # Probabilities for the positive class


Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.9933018124507487
ROC AUC: 0.9021513405044683
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      5042
           1       0.00      0.00      0.00        34

    accuracy                           0.99      5076
   macro avg       0.50      0.50      0.50      5076
weighted avg       0.99      0.99      0.99      5076

Confusion Matrix:
[[5042    0]
 [  34    0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [54]:
# Compute ROC AUC
auc = roc_auc_score(y_test_encoded, y_pred_proba)
print(f"ROC AUC: {auc}")

ROC AUC: 0.9021513405044683


In [58]:
thresholds = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]

for thresh in thresholds:
    y_pred_thresholded = np.where(y_pred_proba >= thresh, 1, 0)
    # accuracy = accuracy_score(y_test_encoded, y_pred_thresholded)
    print(f"Threshold: {thresh}")
    # print(f"Accuracy: {accuracy}")
    print("Classification Report:")
    print(classification_report(y_test_encoded, y_pred_thresholded))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test_encoded, y_pred_thresholded))
    print("\n")


Threshold: 0.01
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5042
           1       0.53      0.29      0.38        34

    accuracy                           0.99      5076
   macro avg       0.76      0.65      0.69      5076
weighted avg       0.99      0.99      0.99      5076

Confusion Matrix:
[[5033    9]
 [  24   10]]


Threshold: 0.05
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      5042
           1       0.80      0.12      0.21        34

    accuracy                           0.99      5076
   macro avg       0.90      0.56      0.60      5076
weighted avg       0.99      0.99      0.99      5076

Confusion Matrix:
[[5041    1]
 [  30    4]]


Threshold: 0.1
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      5042
           1       0.67    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Model without cutter loader features

In [61]:
import pandas as pd

prefixes_to_remove = ['AMP1_IR', 'AMP2_IR', 'DMP3_IR', 'DMP4_IR', 'AMP5_IR', 'F_SIDE', 'V']

# Generate a list of columns to drop
columns_to_drop = [col for col in X_train.columns if any(col.startswith(prefix) for prefix in prefixes_to_remove)]

# Drop these columns from the DataFrame
X_train_new = X_train.drop(columns=columns_to_drop)

print(X_train_new.shape)
print(X_train_new.head())


(51700, 13020)
   AN311_value_1  AN311_value_2  AN311_value_3  AN311_value_4  AN311_value_5  \
0            3.2            3.2            3.2            3.2            3.2   
1            3.7            3.6            3.6            3.6            3.6   
2            3.5            3.5            3.5            3.5            3.5   
3            3.2            3.2            3.2            3.3            3.3   
4            3.5            3.5            3.5            3.5            3.5   

   AN311_value_6  AN311_value_7  AN311_value_8  AN311_value_9  AN311_value_10  \
0            3.2            3.2            3.2            3.2             3.2   
1            3.5            3.5            3.5            3.5             3.4   
2            3.5            3.5            3.4            3.4             3.4   
3            3.4            3.4            3.5            3.5             3.5   
4            3.5            3.5            3.4            3.4             3.5   

   ...  WM868_win

In [63]:
columns_to_drop = [col for col in X_test.columns if any(col.startswith(prefix) for prefix in prefixes_to_remove)]

# Drop these columns from the DataFrame
X_test_new = X_test.drop(columns=columns_to_drop)

print(X_test_new.shape)
print(X_test_new.head())

(5076, 13020)
   AN311_value_1  AN311_value_2  AN311_value_3  AN311_value_4  AN311_value_5  \
0            4.1            4.1            4.1            4.1            4.1   
1            3.6            3.5            3.5            3.5            3.5   
2            4.2            4.2            4.2            4.2            4.3   
3            3.0            3.1            3.1            3.2            3.2   
4            3.7            3.7            3.7            3.7            3.7   

   AN311_value_6  AN311_value_7  AN311_value_8  AN311_value_9  AN311_value_10  \
0            4.1            4.1            4.1            4.1             4.1   
1            3.5            3.5            3.2            3.2             3.0   
2            4.3            4.3            4.3            4.3             4.3   
3            3.2            3.2            3.2            3.2             3.1   
4            3.7            3.7            3.7            3.8             3.8   

   ...  WM868_wind

In [65]:
# Train the model
model = xgBoostModel(X_train_new, y_train)

# Assume y_test is available and needs to be encoded using the same encoder used for y_train
y_test_encoded = model.label_encoder_.transform(y_test)

# Predict on the test data
y_pred = model.predict(X_test_new)
y_pred_proba = model.predict_proba(X_test_new)[:,1]  # Probabilities for the positive class


Parameters: { "use_label_encoder" } are not used.



In [67]:
# Compute ROC AUC
auc = roc_auc_score(y_test_encoded, y_pred_proba)
print(f"ROC AUC: {auc}")

ROC AUC: 0.8947779825932753


In [69]:
thresholds = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]

for thresh in thresholds:
    y_pred_thresholded = np.where(y_pred_proba >= thresh, 1, 0)
    # accuracy = accuracy_score(y_test_encoded, y_pred_thresholded)
    print(f"Threshold: {thresh}")
    # print(f"Accuracy: {accuracy}")
    print("Classification Report:")
    print(classification_report(y_test_encoded, y_pred_thresholded))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test_encoded, y_pred_thresholded))
    print("\n")


Threshold: 0.01
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      5042
           1       0.23      0.15      0.18        34

    accuracy                           0.99      5076
   macro avg       0.61      0.57      0.59      5076
weighted avg       0.99      0.99      0.99      5076

Confusion Matrix:
[[5025   17]
 [  29    5]]


Threshold: 0.05
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      5042
           1       1.00      0.03      0.06        34

    accuracy                           0.99      5076
   macro avg       1.00      0.51      0.53      5076
weighted avg       0.99      0.99      0.99      5076

Confusion Matrix:
[[5042    0]
 [  33    1]]


Threshold: 0.1
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      5042
           1       1.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
