In [33]:
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, StratifiedKFold
from sklearn.pipeline import make_pipeline

In [37]:
df = pd.read_csv('MJD_TRAIN_PROCESSED.csv')

features = ['tdrift', 'tdrift50', 'tdrift10', 'rea', 'dcr', 'peakindex', 
            'peakvalue', 'tailslope', 'currentamp', 'lfpr', 'lq80', 
            'areagrowthrate', 'inflection point', 'risingedgeslope']
zeroes = df[df['truedcr'] == 0] 
"""for i in range(0, 15):
    df = pd.concat([df, zeroes])
    df = df.reset_index(drop = True)"""
df = df.dropna()
X = df[features]
y = df['truedcr']

#splitting into train and test
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21)

In [39]:
from imblearn.over_sampling import SMOTE
from collections import Counter

smote = SMOTE(sampling_strategy=0.3, random_state=42)  # 30% False to True ratio
X_resampled, y_resampled = smote.fit_resample(X, y)

print("Before:", Counter(y))  # Check original balance
print("After:", Counter(y_resampled))

Before: Counter({True: 1657733, False: 32232})
After: Counter({True: 1657733, False: 663093})


In [40]:
from imblearn.under_sampling import RandomUnderSampler

undersampler = RandomUnderSampler(sampling_strategy=0.3, random_state=42)  # Keep 30% False to True
X_resampled, y_resampled = undersampler.fit_resample(X, y)

print("Before:", Counter(y))
print("After:", Counter(y_resampled))

Before: Counter({True: 1657733, False: 32232})
After: Counter({True: 107440, False: 32232})


In [None]:
from imblearn.combine import SMOTETomek

smt = SMOTETomek(sampling_strategy=0.3, random_state=42)
X_resampled, y_resampled = smt.fit_resample(X, y)

print("Before:", Counter(y))
print("After:", Counter(y_resampled))

In [79]:
params ={
    'min_child_weight': [7],
    'gamma': [2],
    'subsample': [.8,],
    'colsample_bytree': [.9],
    'max_depth': [11],
    'n_estimators':[800,900,1000,1100],
    'learning_rate': [.08, .09, .1]
}

In [25]:
#create xgboost model and fit 
xgb_clf = xgb.XGBClassifier(objective='binary:logistic',
                            min_child_weight = 7,
                            gamma = 2,
                            subsample = .8,
                            n_estimators = 1000,
                            learning_rate = .09,
                            colsample_bytree = .9,
                            max_depth = 11,
                            random_state=42)
xgb_clf.fit(X, y)

In [81]:
folds = 5
param_comb = 7
skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 21)

random_search = RandomizedSearchCV(xgb_clf, param_distributions = params, n_iter = param_comb, scoring ='roc_auc',\
                                  n_jobs = 4, cv = skf.split(X,y), verbose=3, random_state = 21)
random_search.fit(X,y)

Fitting 5 folds for each of 7 candidates, totalling 35 fits


random_search.cv_results_

In [82]:
random_search.best_params_

{'subsample': 0.8,
 'n_estimators': 1000,
 'min_child_weight': 7,
 'max_depth': 11,
 'learning_rate': 0.09,
 'gamma': 2,
 'colsample_bytree': 0.9}

In [11]:
#first round of predictions on test split
y_pred = xgb_clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')
print("\nClassification Report:\n", classification_report(y_test, y_pred))

NameError: name 'X_test' is not defined

In [26]:
#testing the accuracy on our train data
df_test = pd.read_csv('MJD_TEST_PROCESSED.csv')

X_test_dataset = df_test[features]  
y_test_dataset = df_test['truedcr']

y_pred_dataset = xgb_clf.predict(X_test_dataset)

accuracy_final = accuracy_score(y_test_dataset, y_pred_dataset)
precision_final = precision_score(y_test_dataset, y_pred_dataset)
recall_final = recall_score(y_test_dataset, y_pred_dataset)
f1_final = f1_score(y_test_dataset, y_pred_dataset)

print(f'Final Test Accuracy: {accuracy_final:.4f}')
print(f'Final Test Precision: {precision_final:.4f}')
print(f'Final Test Recall: {recall_final:.4f}')
print(f'Final Test F1 Score: {f1_final:.4f}')
print("\nFinal Test Classification Report:\n", classification_report(y_test_dataset, y_pred_dataset))

Final Test Accuracy: 0.9795
Final Test Precision: 0.9861
Final Test Recall: 0.9930
Final Test F1 Score: 0.9896

Final Test Classification Report:
               precision    recall  f1-score   support

       False       0.44      0.28      0.34      7426
        True       0.99      0.99      0.99    382574

    accuracy                           0.98    390000
   macro avg       0.71      0.64      0.67    390000
weighted avg       0.98      0.98      0.98    390000



In [27]:
df_test['pred'] = y_pred_dataset

In [28]:
df_test[df_test['pred'] != df_test['truedcr']]['pred'].sum()

5336

In [7]:
np.sum(df_test['truedcr'] == 0)

7426

In [29]:
(7426 - 5336) / 7426

0.2814435766226771