## Modeling ##

In [2]:
# pip install faiss_imputer
# pip install imbalanced-learn

In [3]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import scipy.stats as stats
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report, 
    confusion_matrix,
    r2_score,
    mean_squared_error, 
    root_mean_squared_error,
    mean_absolute_error, 
    mean_absolute_percentage_error
)
from sklearn.inspection import PartialDependenceDisplay
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from scipy.spatial import KDTree
from faiss_imputer import FaissImputer

from tqdm.notebook import tqdm, trange
from ipywidgets import interactive
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    classification_report, accuracy_score, confusion_matrix, 
    f1_score, fbeta_score, 
    matthews_corrcoef, brier_score_loss
)
from sklearn.decomposition import PCA
import pickle
from xgboost import XGBClassifier

This notebook is dedicated to the feature selection and statistical modeling of our trucking data.

In [182]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler

In [6]:
df = pd.read_csv('../data/data_clean.csv', low_memory=False)

In [7]:
#df_sample = df.sample(frac=0.3, random_state=42)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1057049 entries, 0 to 1057048
Data columns (total 47 columns):
 #   Column                     Non-Null Count    Dtype  
---  ------                     --------------    -----  
 0   Unnamed: 0                 1057049 non-null  int64  
 1   RecordID                   1057049 non-null  int64  
 2   ESS_Id                     1057049 non-null  int64  
 3   EventTimeStamp             1057049 non-null  object 
 4   eventDescription           1006152 non-null  object 
 5   ecuSoftwareVersion         792713 non-null   object 
 6   ecuSerialNumber            751116 non-null   object 
 7   ecuModel                   1001026 non-null  object 
 8   ecuMake                    1001026 non-null  object 
 9   ecuSource                  1057049 non-null  int64  
 10  spn                        1057049 non-null  int64  
 11  fmi                        1057049 non-null  int64  
 12  active                     1057049 non-null  bool   
 13  activeTransi

In [9]:
df = df.drop('Unnamed: 0', axis=1)

In [10]:
object_columns_to_change_to_category = [        
    'eventDescription',  
    'ecuSoftwareVersion',          
    'ecuSerialNumber',         
    'ecuModel',           
    'ecuMake',                  
    'EquipmentID',                                 
    'LampStatus',              
    'ServiceDistance',
    'next_derate_timestamp',
    'time_until_derate'] 

In [11]:
for column in object_columns_to_change_to_category:
    df[column] = df[column].astype('category')

In [12]:
df['EventTimeStamp'] = pd.to_datetime(df['EventTimeStamp'])
df['LocationTimeStamp'] = pd.to_datetime(df['LocationTimeStamp'])

In [13]:
int_columns_to_categorical = ['RecordID',
'ESS_Id',                    
'ecuSource',                    
'spn',                       
'fmi',  
'active',       
'MCTNumber',                  
'FaultId']

In [14]:
for column in int_columns_to_categorical:
    df[column] = df[column].astype('category')

In [15]:
float64_cols = df.select_dtypes(include=['float64']).columns
df[float64_cols] = df[float64_cols].astype('float32')

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1057049 entries, 0 to 1057048
Data columns (total 46 columns):
 #   Column                     Non-Null Count    Dtype         
---  ------                     --------------    -----         
 0   RecordID                   1057049 non-null  category      
 1   ESS_Id                     1057049 non-null  category      
 2   EventTimeStamp             1057049 non-null  datetime64[ns]
 3   eventDescription           1006152 non-null  category      
 4   ecuSoftwareVersion         792713 non-null   category      
 5   ecuSerialNumber            751116 non-null   category      
 6   ecuModel                   1001026 non-null  category      
 7   ecuMake                    1001026 non-null  category      
 8   ecuSource                  1057049 non-null  category      
 9   spn                        1057049 non-null  category      
 10  fmi                        1057049 non-null  category      
 11  active                     1057049 no

Scaling and encoding features for modeling

In [18]:
test_date = '2019-01-01'
df_test = df.sort_values('EventTimeStamp').loc[df['EventTimeStamp'] > test_date]
df_train = df.sort_values('EventTimeStamp').loc[df['EventTimeStamp'] < test_date]

In [85]:
X_train = df_train.drop(['target',
            'LocationTimeStamp',
            'EventTimeStamp',
            'eventDescription',
            'ESS_Id',
            'ecuModel',
            'ecuMake',
            'ecuSource',
            'ecuSoftwareVersion',
            'ecuSerialNumber',
            'MCTNumber',
            'ServiceDistance',
            'Latitude',
            'Longitude',
            'RecordID',
            'next_derate_timestamp',
            'time_until_derate',
            'FaultId',
            'EngineLoad',
            'TurboBoostPressure',
            'CruiseControlSetSpeed',
            'DistanceLtd'], axis=1)

y_train = df_train['target'].values

X_test = df_test.drop(['target',
            'LocationTimeStamp',
            'EventTimeStamp',
            'eventDescription',
            'ESS_Id',
            'ecuModel',
            'ecuMake',
            'ecuSource',
            'ecuSoftwareVersion',
            'ecuSerialNumber',
            'MCTNumber',
            'ServiceDistance',
            'Latitude',
            'Longitude',
            'RecordID',
            'next_derate_timestamp',
            'time_until_derate',
            'FaultId',
            'EngineLoad',
            'TurboBoostPressure',
            'CruiseControlSetSpeed',
            'DistanceLtd'], axis=1)


y_test = df_test['target'].values

X_val = df_test.drop(['target',
            'LocationTimeStamp',
            'EventTimeStamp',
            'eventDescription',
            'ESS_Id',
            'ecuModel',
            'ecuMake',
            'ecuSource',
            'ecuSoftwareVersion',
            'ecuSerialNumber',
            'MCTNumber',
            'ServiceDistance',
            'Latitude',
            'Longitude',
            'RecordID',
            'next_derate_timestamp',
            'time_until_derate',
            'FaultId'], axis=1)

y_val = df_test['target'].values

In [20]:
# X_train = X_train.reset_index(drop=True)
# X_test = X_test.reset_index(drop=True)

In [21]:
# print(X_train.shape, X_test.shape)

In [22]:
categorical_features = ['spn', 'fmi', 'EquipmentID', 'LampStatus'] 
bool_features = ['CruiseControlActive', 'ParkingBrake', 'IgnStatus', 'active']
numeric_features = [x for x in X_train.columns if x not in categorical_features + bool_features] 
#pca = PCA(n_components=1, svd_solver="arpack")

numeric_pipe = Pipeline(
    steps=[
        ('scale', StandardScaler()),
        ('numeric_impute', SimpleImputer(strategy='most_frequent'))
    ]
)

categorical_pipe = Pipeline(
    steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=True)),
        ('categorical_impute', SimpleImputer(strategy='most_frequent')),
    ]
)

bool_pipe = Pipeline(
    steps=[
        ('bool_impute', SimpleImputer(strategy='most_frequent'))
    ]
)

ct = ColumnTransformer(
    transformers=[
        ('numeric', numeric_pipe, numeric_features),
        ('categorical', categorical_pipe, categorical_features),
        ('bool', bool_pipe, bool_features)
    ]
)

pipe = Pipeline(
    steps=[
        ('transformer', ct),
        #('pca', pca)
    ]
)

pipe.fit(X_train, y_train)
X_train_transformed = pipe.transform(X_train)
X_test_transformed = pipe.transform(X_test)


In [202]:
smote = SMOTE()
X_trained_smoted, y_trained_smoted = smote.fit_resample(X_train_transformed, y_train)

In [None]:
# oversampler = RandomOverSampler(random_state = 321)
# X_resampled, y_resampled = oversampler.fit_resample(X_train_transformed, y_train)

In [24]:
# print(X_train.shape, X_trained_balanced.shape, X_train_transformed.shape)

In [25]:
# filename = 'pipe_transformed.pkl'

# pickle_list = [pipe, x_train_transformed, x_test_transformed]

# with open(filename, 'wb') as file:
#     pickle.dump(pickle_list, file)

In [26]:
# with open(filename, 'rb') as file:
#     pipe, X_train_transformed, X_test_transformed = pickle.load(file)

In [155]:
# beta = 0.5

# thresholds = pd.DataFrame({'threshold': candidate_thresholds})
# thresholds['fbeta'] = thresholds['threshold'].apply(lambda x: fbeta_score(y_val, y_val_pred_proba > x, beta = beta))
# thresholds.sort_values('fbeta', ascending = False).head()

Unnamed: 0,threshold,fbeta
2,0.12,0.066858
3,0.13,0.065856
1,0.11,0.064488
6,0.16,0.061958
4,0.14,0.061179


In [188]:
# LGBMClassiferModel = LGBMClassifier()
# LGBMClassiferModel = LGBMClassiferModel.fit(X_train_transformed, y_train)
# LGBMClassifer_y_pred = LGBMClassiferModel.predict(X_test_transformed)
# print(f'Accuracy: {accuracy_score(y_test, LGBMClassifer_y_pred)}')
# print(f'MCC: {matthews_corrcoef(y_test, LGBMClassifer_y_pred)}')
# print(confusion_matrix(y_test, LGBMClassifer_y_pred))
# print(classification_report(y_test, LGBMClassifer_y_pred, zero_division=1))

[LightGBM] [Info] Number of positive: 2207, number of negative: 943351
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.050387 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4866
[LightGBM] [Info] Number of data points in the train set: 945558, number of used features: 1149
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.002334 -> initscore=-6.057804
[LightGBM] [Info] Start training from score -6.057804
Accuracy: 0.9955332717439076
MCC: 0.006033579835433306
[[110991    189]
 [   309      2]]
              precision    recall  f1-score   support

       False       1.00      1.00      1.00    111180
        True       0.01      0.01      0.01       311

    accuracy                           1.00    111491
   macro avg       0.50      0.50      0.50    111491
weighted avg       0.99      1.00      1.00    111491



In [190]:
# LGBMClassiferModel = LGBMClassifier()
# LGBMClassiferModel = LGBMClassiferModel.fit(X_resampled, y_resampled)
# LGBMClassifer_y_pred = LGBMClassiferModel.predict(X_test_transformed)
# print(f'Accuracy: {accuracy_score(y_test, LGBMClassifer_y_pred)}')
# print(f'MCC: {matthews_corrcoef(y_test, LGBMClassifer_y_pred)}')
# print(confusion_matrix(y_test, LGBMClassifer_y_pred))
# print(classification_report(y_test, LGBMClassifer_y_pred, zero_division=1))

[LightGBM] [Info] Number of positive: 943351, number of negative: 943351
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.040389 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4888
[LightGBM] [Info] Number of data points in the train set: 1886702, number of used features: 1161
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Accuracy: 0.9333130028432788
MCC: 0.14498179838652228
[[103820   7360]
 [    75    236]]
              precision    recall  f1-score   support

       False       1.00      0.93      0.97    111180
        True       0.03      0.76      0.06       311

    accuracy                           0.93    111491
   macro avg       0.52      0.85      0.51    111491
weighted avg       1.00      0.93      0.96    111491



In [204]:
LGBMClassiferModel = LGBMClassifier()
LGBMClassiferModel = LGBMClassiferModel.fit(X_trained_smoted, y_trained_smoted)
LGBMClassifer_y_pred = LGBMClassiferModel.predict(X_test_transformed)
print(f'Accuracy: {accuracy_score(y_test, LGBMClassifer_y_pred)}')
print(f'MCC: {matthews_corrcoef(y_test, LGBMClassifer_y_pred)}')
print(confusion_matrix(y_test, LGBMClassifer_y_pred))
print(classification_report(y_test, LGBMClassifer_y_pred, zero_division=1))

[LightGBM] [Info] Number of positive: 943351, number of negative: 943351
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 7.420423 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 56989
[LightGBM] [Info] Number of data points in the train set: 1886702, number of used features: 1161
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Accuracy: 0.969683651595196
MCC: 0.1864146783532086
[[107911   3269]
 [   111    200]]
              precision    recall  f1-score   support

       False       1.00      0.97      0.98    111180
        True       0.06      0.64      0.11       311

    accuracy                           0.97    111491
   macro avg       0.53      0.81      0.55    111491
weighted avg       1.00      0.97      0.98    111491



In [161]:
# LGBMClassiferModel = LGBMClassifier(class_weight='balanced')
# LGBMClassiferModel = LGBMClassiferModel.fit(X_train_transformed, y_train)
# LGBMClassifer_y_pred = LGBMClassiferModel.predict(X_test_transformed)
# print(f'Accuracy: {accuracy_score(y_test, LGBMClassifer_y_pred)}')
# print(f'MCC: {matthews_corrcoef(y_test, LGBMClassifer_y_pred)}')
# print(confusion_matrix(y_test, LGBMClassifer_y_pred))
# print(classification_report(y_test, LGBMClassifer_y_pred, zero_division=1))

[LightGBM] [Info] Number of positive: 2207, number of negative: 943351
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.039475 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4866
[LightGBM] [Info] Number of data points in the train set: 945558, number of used features: 1149
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Accuracy: 0.9359230789929232
MCC: 0.1614511006645619
[[104091   7089]
 [    55    256]]
              precision    recall  f1-score   support

       False       1.00      0.94      0.97    111180
        True       0.03      0.82      0.07       311

    accuracy                           0.94    111491
   macro avg       0.52      0.88      0.52    111491
weighted avg       1.00      0.94      0.96    111491



In [105]:
# y_val_pred_proba = LGBMClassiferModel.predict_proba(X_test_transformed)[:, 1]
# candidate_thresholds = np.arange(start = 0.1, stop = 0.925, step = 0.01)
# thresholds = pd.DataFrame({'threshold': candidate_thresholds})
# thresholds['f1'] = thresholds['threshold'].apply(lambda x: f1_score(y_test, y_val_pred_proba > x))
# thresholds.sort_values('f1', ascending = False).head()

Unnamed: 0,threshold,f1
3,0.13,0.019971
2,0.12,0.019663
1,0.11,0.019444
0,0.1,0.0191
8,0.18,0.012121


In [109]:
# threshold = 0.13
# y_pred_proba = LGBMClassiferModel.predict_proba(X_test_transformed)[:,1]

# y_pred = y_pred_proba > threshold
# print(f'Accuracy: {accuracy_score(y_test, LGBMClassifer_y_pred)}')
# print(f'MCC: {matthews_corrcoef(y_test, LGBMClassifer_y_pred)}')
# print(confusion_matrix(y_test, LGBMClassifer_y_pred))
# print(classification_report(y_test, LGBMClassifer_y_pred))

Accuracy: 0.9955332717439076
MCC: 0.006033579835433306
[[110991    189]
 [   309      2]]
              precision    recall  f1-score   support

       False       1.00      1.00      1.00    111180
        True       0.01      0.01      0.01       311

    accuracy                           1.00    111491
   macro avg       0.50      0.50      0.50    111491
weighted avg       0.99      1.00      1.00    111491



In [141]:
# threshold = 0.11
# y_pred_proba = LGBMClassiferModel.predict_proba(X_test_transformed)[:,1]

# y_pred = y_pred_proba > threshold
# print(f'Accuracy: {accuracy_score(y_test, LGBMClassifer_y_pred)}')
# print(f'MCC: {matthews_corrcoef(y_test, LGBMClassifer_y_pred)}')
# print(confusion_matrix(y_test, LGBMClassifer_y_pred))
# print(classification_report(y_test, LGBMClassifer_y_pred))

Accuracy: 0.9955332717439076
MCC: 0.006033579835433306
[[110991    189]
 [   309      2]]
              precision    recall  f1-score   support

       False       1.00      1.00      1.00    111180
        True       0.01      0.01      0.01       311

    accuracy                           1.00    111491
   macro avg       0.50      0.50      0.50    111491
weighted avg       0.99      1.00      1.00    111491



In [163]:
XGBClassiferModel = XGBClassifier()
XGBClassiferModel.fit(X_train_transformed, y_train)
XGBClassifer_y_pred = XGBClassiferModel.predict(X_test_transformed)
print(f'Accuracy: {accuracy_score(y_test, XGBClassifer_y_pred)}')
print(f'MCC: {matthews_corrcoef(y_test, XGBClassifer_y_pred)}')
print(confusion_matrix(y_test, XGBClassifer_y_pred))
print(classification_report(y_test, XGBClassifer_y_pred, zero_division=0))

Accuracy: 0.9967531011471777
MCC: 0.03512475401526756
[[111124     56]
 [   306      5]]
              precision    recall  f1-score   support

       False       1.00      1.00      1.00    111180
        True       0.08      0.02      0.03       311

    accuracy                           1.00    111491
   macro avg       0.54      0.51      0.51    111491
weighted avg       0.99      1.00      1.00    111491



In [192]:
XGBClassiferModel = XGBClassifier()
XGBClassiferModel.fit(X_resampled, y_resampled)
XGBClassifer_y_pred = XGBClassiferModel.predict(X_test_transformed)
print(f'Accuracy: {accuracy_score(y_test, XGBClassifer_y_pred)}')
print(f'MCC: {matthews_corrcoef(y_test, XGBClassifer_y_pred)}')
print(confusion_matrix(y_test, XGBClassifer_y_pred))
print(classification_report(y_test, XGBClassifer_y_pred, zero_division=0))

Accuracy: 0.9451704621897732
MCC: 0.1662054196994539
[[105135   6045]
 [    68    243]]
              precision    recall  f1-score   support

       False       1.00      0.95      0.97    111180
        True       0.04      0.78      0.07       311

    accuracy                           0.95    111491
   macro avg       0.52      0.86      0.52    111491
weighted avg       1.00      0.95      0.97    111491



In [206]:
XGBClassiferModel = XGBClassifier()
XGBClassiferModel.fit(X_trained_smoted, y_trained_smoted)
XGBClassifer_y_pred = XGBClassiferModel.predict(X_test_transformed)
print(f'Accuracy: {accuracy_score(y_test, XGBClassifer_y_pred)}')
print(f'MCC: {matthews_corrcoef(y_test, XGBClassifer_y_pred)}')
print(confusion_matrix(y_test, XGBClassifer_y_pred))
print(classification_report(y_test, XGBClassifer_y_pred, zero_division=0))

Accuracy: 0.9760249706254317
MCC: 0.20124094477470156
[[108627   2553]
 [   120    191]]
              precision    recall  f1-score   support

       False       1.00      0.98      0.99    111180
        True       0.07      0.61      0.13       311

    accuracy                           0.98    111491
   macro avg       0.53      0.80      0.56    111491
weighted avg       1.00      0.98      0.99    111491



In [174]:
# XGBClassiferModel = XGBClassifier(scale_pos_weight=427)
# XGBClassiferModel.fit(X_train_transformed, y_train)
# XGBClassifer_y_pred = XGBClassiferModel.predict(X_test_transformed)
# print(f'Accuracy: {accuracy_score(y_test, XGBClassifer_y_pred)}')
# print(f'MCC: {matthews_corrcoef(y_test, XGBClassifer_y_pred)}')
# print(confusion_matrix(y_test, XGBClassifer_y_pred))
# print(classification_report(y_test, XGBClassifer_y_pred, zero_division=0))

Accuracy: 0.9464979235992143
MCC: 0.16193793933851372
[[105292   5888]
 [    77    234]]
              precision    recall  f1-score   support

       False       1.00      0.95      0.97    111180
        True       0.04      0.75      0.07       311

    accuracy                           0.95    111491
   macro avg       0.52      0.85      0.52    111491
weighted avg       1.00      0.95      0.97    111491



In [111]:
# y_val_pred_proba = XGBClassiferModel.predict_proba(X_test_transformed)[:, 1]
# candidate_thresholds = np.arange(start = 0.1, stop = 0.925, step = 0.01)
# thresholds = pd.DataFrame({'threshold': candidate_thresholds})
# thresholds['f1'] = thresholds['threshold'].apply(lambda x: f1_score(y_test, y_val_pred_proba > x))
# thresholds.sort_values('f1', ascending = False).head()

Unnamed: 0,threshold,f1
0,0.1,0.087719
2,0.12,0.084416
3,0.13,0.083612
1,0.11,0.081633
4,0.14,0.08


In [147]:
# threshold = 0.12
# y_pred_proba = XGBClassiferModel.predict_proba(X_test_transformed)[:,1]

# y_pred = y_pred_proba > threshold
# print(f'Accuracy: {accuracy_score(y_test, XGBClassifer_y_pred)}')
# print(f'MCC: {matthews_corrcoef(y_test, XGBClassifer_y_pred)}')
# print(confusion_matrix(y_test, XGBClassifer_y_pred))
# print(classification_report(y_test, XGBClassifer_y_pred))

Accuracy: 0.9967531011471777
MCC: 0.03512475401526756
[[111124     56]
 [   306      5]]
              precision    recall  f1-score   support

       False       1.00      1.00      1.00    111180
        True       0.08      0.02      0.03       311

    accuracy                           1.00    111491
   macro avg       0.54      0.51      0.51    111491
weighted avg       0.99      1.00      1.00    111491



In [30]:
# RandomForestClassifierModel = RandomForestClassifier()
# RandomForestClassifierModel.fit(X_trained_balanced, y_trained_balanced)
# RandomForestClassifier_y_pred = RandomForestClassifierModel.predict(X_test_transformed)
# print(f'Accuracy: {accuracy_score(y_test, RandomForestClassifier_y_pred)}')
# print(f'MCC: {matthews_corrcoef(y_test, RandomForestClassifier_y_pred)}')
# print(confusion_matrix(y_test, RandomForestClassifier_y_pred))
# print(classification_report(y_test, RandomForestClassifier_y_pred, zero_division=0))

In [212]:
DecisionTreeClassifierModel = DecisionTreeClassifier()
DecisionTreeClassifierModel.fit(X_resampled, y_resampled)
DecisionTreeClassifierModel_y_pred = DecisionTreeClassifierModel.predict(X_test_transformed)
print(f'Accuracy: {accuracy_score(y_test, DecisionTreeClassifierModel_y_pred)}')
print(f'MCC: {matthews_corrcoef(y_test, DecisionTreeClassifierModel_y_pred)}')
print(confusion_matrix(y_test, DecisionTreeClassifierModel_y_pred))
print(classification_report(y_test, DecisionTreeClassifierModel_y_pred, zero_division=0))

Accuracy: 0.9824918603295333
MCC: 0.0645763737941879
[[109487   1693]
 [   259     52]]
              precision    recall  f1-score   support

       False       1.00      0.98      0.99    111180
        True       0.03      0.17      0.05       311

    accuracy                           0.98    111491
   macro avg       0.51      0.58      0.52    111491
weighted avg       0.99      0.98      0.99    111491



In [213]:
DecisionTreeClassifierModel = DecisionTreeClassifier()
DecisionTreeClassifierModel.fit(X_trained_smoted, y_trained_smoted)
DecisionTreeClassifierModel_y_pred = DecisionTreeClassifierModel.predict(X_test_transformed)
print(f'Accuracy: {accuracy_score(y_test, DecisionTreeClassifierModel_y_pred)}')
print(f'MCC: {matthews_corrcoef(y_test, DecisionTreeClassifierModel_y_pred)}')
print(confusion_matrix(y_test, DecisionTreeClassifierModel_y_pred))
print(classification_report(y_test, DecisionTreeClassifierModel_y_pred, zero_division=0))

Accuracy: 0.9822317496479537
MCC: 0.10914885593187094
[[109423   1757]
 [   224     87]]
              precision    recall  f1-score   support

       False       1.00      0.98      0.99    111180
        True       0.05      0.28      0.08       311

    accuracy                           0.98    111491
   macro avg       0.52      0.63      0.54    111491
weighted avg       1.00      0.98      0.99    111491



In [121]:
# y_val_pred_proba = DecisionTreeClassifierModel.predict_proba(X_test_transformed)[:, 1]
# candidate_thresholds = np.arange(start = 0.1, stop = 0.925, step = 0.01)
# thresholds = pd.DataFrame({'threshold': candidate_thresholds})
# thresholds['f1'] = thresholds['threshold'].apply(lambda x: f1_score(y_test, y_val_pred_proba > x))
# thresholds.sort_values('f1', ascending = False).head()

Unnamed: 0,threshold,f1
41,0.51,0.030043
62,0.72,0.030043
60,0.7,0.030043
59,0.69,0.030043
58,0.68,0.030043


In [149]:
# threshold = 0.12
# y_pred_proba = DecisionTreeClassifierModel.predict_proba(X_test_transformed)[:,1]

# y_pred = y_pred_proba > threshold
# print(f'Accuracy: {accuracy_score(y_test, DecisionTreeClassifierModel_y_pred)}')
# print(f'MCC: {matthews_corrcoef(y_test, DecisionTreeClassifierModel_y_pred)}')
# print(confusion_matrix(y_test, DecisionTreeClassifierModel_y_pred))
# print(classification_report(y_test, DecisionTreeClassifierModel_y_pred))

Accuracy: 0.995945861100896
MCC: 0.029975850832518688
[[111032    148]
 [   304      7]]
              precision    recall  f1-score   support

       False       1.00      1.00      1.00    111180
        True       0.05      0.02      0.03       311

    accuracy                           1.00    111491
   macro avg       0.52      0.51      0.51    111491
weighted avg       0.99      1.00      1.00    111491



In [32]:
LogisticRegressionModel = LogisticRegression(max_iter=100)
LogisticRegressionModel.fit(X_train_transformed, y_train)
LogisticRegressionModel_y_pred = LogisticRegressionModel.predict(X_test_transformed)
print(f'Accuracy: {accuracy_score(y_test, LogisticRegressionModel_y_pred)}')
print(f'MCC: {matthews_corrcoef(y_test, LogisticRegressionModel_y_pred)}')
print(confusion_matrix(y_test, LogisticRegressionModel_y_pred))
print(classification_report(y_test, LogisticRegressionModel_y_pred, zero_division=0))

Accuracy: 0.9972015678395566
MCC: -0.00015839771122191769
[[111179      1]
 [   311      0]]
              precision    recall  f1-score   support

       False       1.00      1.00      1.00    111180
        True       0.00      0.00      0.00       311

    accuracy                           1.00    111491
   macro avg       0.50      0.50      0.50    111491
weighted avg       0.99      1.00      1.00    111491



In [194]:
LogisticRegressionModel = LogisticRegression(max_iter=100)
LogisticRegressionModel.fit(X_resampled, y_resampled)
LogisticRegressionModel_y_pred = LogisticRegressionModel.predict(X_test_transformed)
print(f'Accuracy: {accuracy_score(y_test, LogisticRegressionModel_y_pred)}')
print(f'MCC: {matthews_corrcoef(y_test, LogisticRegressionModel_y_pred)}')
print(confusion_matrix(y_test, LogisticRegressionModel_y_pred))
print(classification_report(y_test, LogisticRegressionModel_y_pred, zero_division=0))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.9674502874671498
MCC: 0.19654345354763378
[[107643   3537]
 [    92    219]]
              precision    recall  f1-score   support

       False       1.00      0.97      0.98    111180
        True       0.06      0.70      0.11       311

    accuracy                           0.97    111491
   macro avg       0.53      0.84      0.55    111491
weighted avg       1.00      0.97      0.98    111491



In [208]:
LogisticRegressionModel = LogisticRegression(max_iter=100)
LogisticRegressionModel.fit(X_trained_smoted, y_trained_smoted)
LogisticRegressionModel_y_pred = LogisticRegressionModel.predict(X_test_transformed)
print(f'Accuracy: {accuracy_score(y_test, LogisticRegressionModel_y_pred)}')
print(f'MCC: {matthews_corrcoef(y_test, LogisticRegressionModel_y_pred)}')
print(confusion_matrix(y_test, LogisticRegressionModel_y_pred))
print(classification_report(y_test, LogisticRegressionModel_y_pred, zero_division=0))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.9730650904557319
MCC: 0.21168729076715714
[[108274   2906]
 [    97    214]]
              precision    recall  f1-score   support

       False       1.00      0.97      0.99    111180
        True       0.07      0.69      0.12       311

    accuracy                           0.97    111491
   macro avg       0.53      0.83      0.56    111491
weighted avg       1.00      0.97      0.98    111491



In [176]:
# LogisticRegressionModel = LogisticRegression(max_iter=100, class_weight='balanced')
# LogisticRegressionModel.fit(X_train_transformed, y_train)
# LogisticRegressionModel_y_pred = LogisticRegressionModel.predict(X_test_transformed)
# print(f'Accuracy: {accuracy_score(y_test, LogisticRegressionModel_y_pred)}')
# print(f'MCC: {matthews_corrcoef(y_test, LogisticRegressionModel_y_pred)}')
# print(confusion_matrix(y_test, LogisticRegressionModel_y_pred))
# print(classification_report(y_test, LogisticRegressionModel_y_pred, zero_division=0))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.9666968634239536
MCC: 0.1933604049613864
[[107560   3620]
 [    93    218]]
              precision    recall  f1-score   support

       False       1.00      0.97      0.98    111180
        True       0.06      0.70      0.11       311

    accuracy                           0.97    111491
   macro avg       0.53      0.83      0.54    111491
weighted avg       1.00      0.97      0.98    111491



In [125]:
# y_val_pred_proba = LogisticRegressionModel.predict_proba(X_test_transformed)[:, 1]
# candidate_thresholds = np.arange(start = 0.1, stop = 0.925, step = 0.01)
# thresholds = pd.DataFrame({'threshold': candidate_thresholds})
# thresholds['f1'] = thresholds['threshold'].apply(lambda x: f1_score(y_test, y_val_pred_proba > x))
# thresholds.sort_values('f1', ascending = False).head()

Unnamed: 0,threshold,f1
1,0.11,0.057252
2,0.12,0.056566
3,0.13,0.054167
0,0.1,0.053476
4,0.14,0.048035


In [151]:
# threshold = 0.12
# y_pred_proba = DecisionTreeClassifierModel.predict_proba(X_test_transformed)[:,1]

# y_pred = y_pred_proba > threshold
# print(f'Accuracy: {accuracy_score(y_test, LogisticRegressionModel_y_pred)}')
# print(f'MCC: {matthews_corrcoef(y_test, LogisticRegressionModel_y_pred)}')
# print(confusion_matrix(y_test, LogisticRegressionModel_y_pred))
# print(classification_report(y_test, LogisticRegressionModel_y_pred))

Accuracy: 0.9972015678395566
MCC: -0.00015839771122191769
[[111179      1]
 [   311      0]]
              precision    recall  f1-score   support

       False       1.00      1.00      1.00    111180
        True       0.00      0.00      0.00       311

    accuracy                           1.00    111491
   macro avg       0.50      0.50      0.50    111491
weighted avg       0.99      1.00      1.00    111491



In [33]:
KNeighborsClassifierModel = KNeighborsClassifier()
KNeighborsClassifierModel.fit(X_train_transformed, y_train)
KNeighborsClassifierModel_y_pred = LogisticRegressionModel.predict(X_test_transformed)
print(f'Accuracy: {accuracy_score(y_test, KNeighborsClassifierModel_y_pred)}')
print(f'MCC: {matthews_corrcoef(y_test, KNeighborsClassifierModel_y_pred)}')
print(confusion_matrix(y_test, KNeighborsClassifierModel_y_pred))
print(classification_report(y_test, KNeighborsClassifierModel_y_pred, zero_division=0))

Accuracy: 0.9972015678395566
MCC: -0.00015839771122191769
[[111179      1]
 [   311      0]]
              precision    recall  f1-score   support

       False       1.00      1.00      1.00    111180
        True       0.00      0.00      0.00       311

    accuracy                           1.00    111491
   macro avg       0.50      0.50      0.50    111491
weighted avg       0.99      1.00      1.00    111491



In [196]:
KNeighborsClassifierModel = KNeighborsClassifier()
KNeighborsClassifierModel.fit(X_resampled, y_resampled)
KNeighborsClassifierModel_y_pred = LogisticRegressionModel.predict(X_test_transformed)
print(f'Accuracy: {accuracy_score(y_test, KNeighborsClassifierModel_y_pred)}')
print(f'MCC: {matthews_corrcoef(y_test, KNeighborsClassifierModel_y_pred)}')
print(confusion_matrix(y_test, KNeighborsClassifierModel_y_pred))
print(classification_report(y_test, KNeighborsClassifierModel_y_pred, zero_division=0))

Accuracy: 0.9674502874671498
MCC: 0.19654345354763378
[[107643   3537]
 [    92    219]]
              precision    recall  f1-score   support

       False       1.00      0.97      0.98    111180
        True       0.06      0.70      0.11       311

    accuracy                           0.97    111491
   macro avg       0.53      0.84      0.55    111491
weighted avg       1.00      0.97      0.98    111491



In [209]:
KNeighborsClassifierModel = KNeighborsClassifier()
KNeighborsClassifierModel.fit(X_trained_smoted, y_trained_smoted)
KNeighborsClassifierModel_y_pred = LogisticRegressionModel.predict(X_test_transformed)
print(f'Accuracy: {accuracy_score(y_test, KNeighborsClassifierModel_y_pred)}')
print(f'MCC: {matthews_corrcoef(y_test, KNeighborsClassifierModel_y_pred)}')
print(confusion_matrix(y_test, KNeighborsClassifierModel_y_pred))
print(classification_report(y_test, KNeighborsClassifierModel_y_pred, zero_division=0))

Accuracy: 0.9730650904557319
MCC: 0.21168729076715714
[[108274   2906]
 [    97    214]]
              precision    recall  f1-score   support

       False       1.00      0.97      0.99    111180
        True       0.07      0.69      0.12       311

    accuracy                           0.97    111491
   macro avg       0.53      0.83      0.56    111491
weighted avg       1.00      0.97      0.98    111491



In [None]:
# KNeighborsClassifierModel = KNeighborsClassifier()
# KNeighborsClassifierModel.fit(X_train_transformed, y_train)
# KNeighborsClassifierModel_y_pred = LogisticRegressionModel.predict(X_test_transformed)
# print(f'Accuracy: {accuracy_score(y_test, KNeighborsClassifierModel_y_pred)}')
# print(f'MCC: {matthews_corrcoef(y_test, KNeighborsClassifierModel_y_pred)}')
# print(confusion_matrix(y_test, KNeighborsClassifierModel_y_pred))
# print(classification_report(y_test, KNeighborsClassifierModel_y_pred, zero_division=0))

In [None]:
# y_val_pred_proba = KNeighborsClassifierModel.predict_proba(X_test_transformed)[:, 1]
# candidate_thresholds = np.arange(start = 0.1, stop = 0.925, step = 0.01)
# thresholds = pd.DataFrame({'threshold': candidate_thresholds})
# thresholds['f1'] = thresholds['threshold'].apply(lambda x: f1_score(y_test, y_val_pred_proba > x))
# thresholds.sort_values('f1', ascending = False).head()

In [None]:
# threshold = 0.12
# y_pred_proba = KNeighborsClassifierModel.predict_proba(X_test_transformed)[:,1]

# y_pred = y_pred_proba > threshold
# print(f'Accuracy: {accuracy_score(y_test, KNeighborsClassifierModel_y_pred)}')
# print(f'MCC: {matthews_corrcoef(y_test, KNeighborsClassifierModel_y_pred)}')
# print(confusion_matrix(y_test, KNeighborsClassifierModel_y_pred))
# print(classification_report(y_test, KNeighborsClassifierModel_y_pred))

In [36]:
# - there rae 1763 positive samples and 738,171 negative samples. This suggests dataset is highly imbalanced, which may require adjustments like class weighting or resampling.
# - LightGBM uses histogram-based learning instead of splitting values directly. This means the dataset has 6531 unique bin partitions, which can improve training efficiency.
# - there are 739,934 rows being used for training. the model is leveraging 1270 features in total.
# - Since it's a binary classification task, LightGBM calculates an initial score (-6.037159). This score is derived from the prior probability (pavg=0.002383), meaning only 0.238% of samples are positive. The low prior probability affects LightGBM's initial weight adjustments to balance the learning process.

# Next steps:
# Given the severe class imbalance, I might: ✔ Use scale_pos_weight in LightGBM to balance classes ✔ Apply oversampling techniques (SMOTE, ADASYN) if necessary ✔ Tune the learning rate for better convergence