In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve, auc, accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
import matplotlib.pyplot as plt

# Load Data
zip_file_path = '../../extracted_data/feature_extracted_data/trainingData_features_01_1.zip'
train_data = pd.read_csv(zip_file_path, compression='zip')


In [3]:
train_data.head

<bound method NDFrame.head of        AN311_mean  AN311_std  AN311_min  AN311_max  AN311_top5_min  \
0        3.371500   0.131799        3.1        3.7            3.10   
1        3.367000   0.129400        3.0        3.7            3.02   
2        3.362667   0.130536        3.0        3.7            3.02   
3        3.374833   0.131719        3.0        3.7            3.02   
4        3.379500   0.129215        3.0        3.7            3.02   
...           ...        ...        ...        ...             ...   
51695    3.697667   0.098630        3.5        4.0            3.50   
51696    3.709000   0.102887        3.5        4.0            3.50   
51697    3.708500   0.100055        3.5        4.0            3.50   
51698    3.712833   0.103047        3.5        4.0            3.50   
51699    3.726833   0.105182        3.5        4.0            3.50   

       AN311_top5_max  AN311_min_std  AN311_max_std  AN311_top_freqs_max  \
0                3.70      -2.059949       2.492424  

In [39]:
train_labels = pd.read_csv('../../extracted_data/trainingDataFullLabels.csv')
# train_labels.columns = ['MM263', 'MM264', 'MM256']

In [41]:
train_labels.head

<bound method NDFrame.head of         MM263   MM264   MM256
0      normal  normal  normal
1      normal  normal  normal
2      normal  normal  normal
3      normal  normal  normal
4      normal  normal  normal
...       ...     ...     ...
51695  normal  normal  normal
51696  normal  normal  normal
51697  normal  normal  normal
51698  normal  normal  normal
51699  normal  normal  normal

[51700 rows x 3 columns]>

In [11]:
test_data = pd.read_csv("../../extracted_data/feature_extracted_data/testData_features_01_01.csv")

In [13]:
test_data.head()

Unnamed: 0,AN311_mean,AN311_std,AN311_min,AN311_max,AN311_top5_min,AN311_top5_max,AN311_min_std,AN311_max_std,AN311_top_freqs_max,AN422_mean,...,F_SIDE_top_freqs_max,V_mean,V_std,V_min,V_max,V_top5_min,V_top5_max,V_min_std,V_max_std,V_top_freqs_max
0,3.261,0.215667,2.7,3.9,2.7,3.84,-2.60123,2.962899,34.768204,1.7485,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4.086,0.120294,3.8,4.3,3.8,4.3,-2.377507,1.778974,19.688883,1.605833,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3.297167,0.299514,2.7,4.2,2.7,4.2,-1.993785,3.014328,43.573463,1.755333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3.894667,0.178992,3.5,4.3,3.52,4.28,-2.204937,2.26453,17.768761,1.516833,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3.378,0.226236,2.9,3.9,2.9,3.9,-2.112839,2.307326,24.79513,1.670667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
test_labels = pd.read_csv('../../Mining_DataSet/testLabels/testLabels.csv', header = None)
test_labels.columns = ['MM263', 'MM264', 'MM256']
test_labels.shape


(5076, 3)

In [17]:
test_labels.head()

Unnamed: 0,MM263,MM264,MM256
0,normal,normal,warning
1,normal,normal,normal
2,normal,normal,normal
3,normal,normal,normal
4,normal,normal,normal


In [43]:
# Separate features and target
X_train = train_data
y_train = train_labels['MM263']

X_test = test_data
y_test = test_labels['MM263']

In [45]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(51700, 252)
(51700,)
(5076, 252)
(5076,)


In [47]:
counts = train_labels['MM263'].value_counts()  # This gives you a count of each unique value in the column

# Calculate the probability of 'warning'
if 'warning' in counts:
    warning_probability = counts['warning'] / counts.sum()
else:
    warning_probability = 0  # In case there are no 'warning' labels

print("Probability of 'warning':", warning_probability)




In [49]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score, classification_report, confusion_matrix

def trainRF(n):
    rf_classifier = RandomForestClassifier(n_estimators=n, random_state=42, class_weight='balanced')
    rf_classifier.fit(X_train, y_train)
    return rf_classifier



In [51]:
rf_classifier = trainRF(100)

In [53]:
from sklearn.metrics import roc_auc_score

rf_probabilities = rf_classifier.predict_proba(X_test)
auc = roc_auc_score(y_test, rf_probabilities[:, 1])
print("AUC with Random Forest using Probabilities:", auc)

AUC with Random Forest using Probabilities: 0.49968208227360766


In [57]:
# Threshold derived from training data's class distribution
warning_threshold = warning_probability  # The variable computed earlier

# Assuming the 'warning' class is the second column in the probabilities
predicted_labels = np.where(rf_probabilities[:, 1] >= warning_threshold, 'warning', 'normal')

# Check the shape of predicted_labels to ensure it is one-dimensional
print(predicted_labels.shape)

# Classification report and confusion matrix for the new predictions
print(classification_report(y_test, predicted_labels))
print("Confusion Matrix:\n", confusion_matrix(y_test, predicted_labels))

(5076,)
              precision    recall  f1-score   support

      normal       0.99      0.67      0.80      5042

    accuracy                           0.67      5076
   macro avg       0.50      0.51      0.41      5076
weighted avg       0.99      0.67      0.80      5076

Confusion Matrix:
 [[3399 1643]
 [  22   12]]
