In [13]:
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

In [23]:
df = pd.read_csv('processed_data/MJD_TRAIN_PROCESSED.csv')

features = ['tdrift', 'tdrift50', 'tdrift10', 'rea', 'dcr', 'peakindex', 
            'peakvalue', 'tailslope', 'currentamp', 'lfpr', 'lq80', 
            'areagrowthrate', 'inflection point', 'risingedgeslope']
X = df[features]
y = df['truedcr']

#splitting into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21)

In [24]:
#create xgboost model and fit 
xgb_clf = xgb.XGBClassifier(objective='binary:logistic', 
                            n_estimators=100, 
                            learning_rate=0.1, 
                            max_depth=5, 
                            random_state=42)
xgb_clf.fit(X_train, y_train)

In [25]:
#first round of predictions on test split
y_pred = xgb_clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9830
Precision: 0.9836
Recall: 0.9994
F1 Score: 0.9914

Classification Report:
               precision    recall  f1-score   support

       False       0.82      0.14      0.23      6406
        True       0.98      1.00      0.99    331595

    accuracy                           0.98    338001
   macro avg       0.90      0.57      0.61    338001
weighted avg       0.98      0.98      0.98    338001



In [26]:
#testing the accuracy on our train data
df_test = pd.read_csv('processed_data/MJD_TRAIN_PROCESSED.csv')

X_test_dataset = df_test[features]  
y_test_dataset = df_test['truedcr']

y_pred_dataset = xgb_clf.predict(X_test_dataset)

accuracy_final = accuracy_score(y_test_dataset, y_pred_dataset)
precision_final = precision_score(y_test_dataset, y_pred_dataset)
recall_final = recall_score(y_test_dataset, y_pred_dataset)
f1_final = f1_score(y_test_dataset, y_pred_dataset)

print(f'Final Test Accuracy: {accuracy_final:.4f}')
print(f'Final Test Precision: {precision_final:.4f}')
print(f'Final Test Recall: {recall_final:.4f}')
print(f'Final Test F1 Score: {f1_final:.4f}')
print("\nFinal Test Classification Report:\n", classification_report(y_test_dataset, y_pred_dataset))

Final Test Accuracy: 0.9829
Final Test Precision: 0.9835
Final Test Recall: 0.9994
Final Test F1 Score: 0.9914

Final Test Classification Report:
               precision    recall  f1-score   support

       False       0.82      0.14      0.23     32267
        True       0.98      1.00      0.99   1657734

    accuracy                           0.98   1690001
   macro avg       0.90      0.57      0.61   1690001
weighted avg       0.98      0.98      0.98   1690001



In [16]:
y_pred_dataset

array([1, 1, 1, ..., 1, 1, 1])

In [27]:
d['truedcr']

KeyError: 'truedcr'