In [23]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, f1_score, make_scorer, accuracy_score, confusion_matrix, classification_report
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import lightgbm as lgb

In [3]:
directory = 'result/'
files = ['MJD_TRAIN_PROCESSED.csv', 'MJD_TEST_PROCESSED.csv', 'MJD_NPML_PROCESSED.csv']
train = pd.read_csv(directory + files[0]).dropna()
test = pd.read_csv(directory + files[1]).dropna()
npml = pd.read_csv(directory + files[2])
target = 'lowavse'

In [4]:
# Decision Tree Grid Search

to_drop = ['highavse', 'lowavse', 'truedcr', 'lq', 'id']

# Splitting features and target
X_train = train.drop(columns=to_drop)
y_train = train["lowavse"]
X_test = test.drop(columns=to_drop)
y_test = test['lowavse']

# Initialize and train the Decision Tree
dtree = DecisionTreeClassifier()

# Define the parameter grid
param_grid = {
    "criterion": ["gini", "entropy"],
    "max_depth": [None, 30],
    "min_samples_leaf": [1, 10],
    "max_leaf_nodes": [None, 30],
}

# Score metric
f1_false_scorer = make_scorer(f1_score, pos_label=False)

# Initialize GridSearchCV
grid_search = GridSearchCV(
    estimator=dtree,
    param_grid=param_grid,
    scoring=f1_false_scorer, 
    cv=5,                # 5-fold cross-validation
    verbose=1,           # Print progress
    n_jobs=-3            # Use except 2 cpus
)

# Fit the GridSearchCV
grid_search.fit(X_train, y_train)

# Get the best parameters and the best estimator
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Make predictions with the best model
y_pred = best_model.predict(X_test)

print("Best Parameters:", grid_search.best_params_)
print("Best F1-Score for 'False' Class:", grid_search.best_score_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


ValueError: 
All the 80 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
80 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.11/site-packages/sklearn/base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/site-packages/sklearn/tree/_classes.py", line 1009, in fit
    super()._fit(
  File "/opt/conda/lib/python3.11/site-packages/sklearn/tree/_classes.py", line 294, in _fit
    check_classification_targets(y)
  File "/opt/conda/lib/python3.11/site-packages/sklearn/utils/multiclass.py", line 221, in check_classification_targets
    raise ValueError(
ValueError: Unknown label type: unknown. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.


In [21]:
# Best Parameters: {'criterion': 'entropy', 'max_depth': 30, 'max_leaf_nodes': None, 'min_samples_leaf': 1}
# Best F1-Score for 'False' Class: 0.7373655289714032

to_drop = ['highavse', 'lowavse', 'truedcr', 'lq', 'id']

# Splitting features and target
X_train = train.drop(columns=to_drop)
y_train = train["lowavse"]
X_test = test.drop(columns=to_drop)
y_test = test['lowavse']

# Initialize and train the Decision Tree
dtree = DecisionTreeClassifier(criterion='entropy', max_depth=30, max_leaf_nodes=None, min_samples_leaf=1)
dtree.fit(X_train, y_train)
y_pred = dtree.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.43      0.44      0.43      3160
        True       1.00      1.00      1.00    386834

    accuracy                           0.99    389994
   macro avg       0.71      0.72      0.71    389994
weighted avg       0.99      0.99      0.99    389994



In [39]:
import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.metrics import f1_score, make_scorer
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

X_train_new, X_valid, y_train_new, y_valid = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

f1_false_scorer = make_scorer(f1_score, pos_label=False)

lgb_clf = lgb.LGBMClassifier(random_state=42)

param_test = {
    'num_leaves': sp_randint(6, 50), 
    'min_child_samples': sp_randint(100, 500), 
    'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
    'subsample': sp_uniform(loc=0.2, scale=0.8), 
    'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
    'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
    'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]
}

fit_params = {
    'eval_set': [(X_valid, y_valid)],
    'eval_metric': 'binary_logloss', 
    'callbacks': [lgb.early_stopping(stopping_rounds=10, verbose=False)]
}

random_search = RandomizedSearchCV(
    estimator=lgb_clf,
    param_distributions=param_test,
    n_iter=50, 
    scoring=f1_false_scorer,
    cv=3,
    n_jobs=1,
    verbose=100
)

random_search.fit(X_train_new, y_train_new, **fit_params)

print("Best parameters found:", random_search.best_params_)
print("Best F1 score for false (via RandomizedSearchCV):", random_search.best_score_)

best_model = random_search.best_estimator_
y_pred = best_model.predict(X_valid)
validation_f1 = f1_score(y_valid, y_pred, pos_label=False)
print("Validation F1 score for false:", validation_f1)


Fitting 3 folds for each of 50 candidates, totalling 150 fits
[CV 1/3; 1/50] START colsample_bytree=0.7774922158929176, min_child_samples=450, min_child_weight=1, num_leaves=36, reg_alpha=5, reg_lambda=0, subsample=0.6336369974414555
[LightGBM] [Info] Number of positive: 893814, number of negative: 7500
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017805 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3825
[LightGBM] [Info] Number of data points in the train set: 901314, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.991679 -> initscore=4.780595
[LightGBM] [Info] Start training from score 4.780595
[CV 1/3; 1/50] END colsample_bytree=0.7774922158929176, min_child_samples=450, min_child_weight=1, num_leaves=36, reg_alpha=5, reg_lambda=0, subsample=0.6336369974414555;, score=0.542 total time=   2.6s
[CV 2/3; 1/50] START colsample_bytree=0.7774922158929176, min_child_sa

In [43]:
# Best parameters found: 
# {'colsample_bytree': 0.5992088141516931, 'min_child_samples': 223, 'min_child_weight': 1, 'num_leaves': 46, 'reg_alpha': 0.1, 'reg_lambda': 1, 'subsample': 0.7441909435524472}
to_drop = ['highavse', 'lowavse', 'truedcr', 'lq', 'id']

# Splitting features and target
X_train = train.drop(columns=to_drop)
y_train = train["lowavse"]
X_test = test.drop(columns=to_drop)
y_test = test['lowavse']

# Create a LightGBM classifier using its scikit-learn API
lgbm_clf = lgb.LGBMClassifier(
    **{'colsample_bytree': 0.5992088141516931, 'min_child_samples': 223, 'min_child_weight': 1, 'num_leaves': 46, 'reg_alpha': 0.1, 'reg_lambda': 1, 'subsample': 0.7441909435524472}
)

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('lgbm', lgbm_clf)
])

# Train the model on the training set
pipeline.fit(X_train, y_train)

# Make predictions on the test set
y_pred = pipeline.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))

[LightGBM] [Info] Number of positive: 1676039, number of negative: 13926
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.033381 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3825
[LightGBM] [Info] Number of data points in the train set: 1689965, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.991760 -> initscore=4.790431
[LightGBM] [Info] Start training from score 4.790431
Accuracy: 0.9935640035487725
Confusion Matrix:
[[  1265   1895]
 [   615 386219]]
Classification Report:
              precision    recall  f1-score   support

       False       0.67      0.40      0.50      3160
        True       1.00      1.00      1.00    386834

    accuracy                           0.99    389994
   macro avg       0.83      0.70      0.75    389994
weighted avg       0.99      0.99      0.99    389994



In [46]:
from catboost import CatBoostClassifier, Pool, cv

best_model = CatBoostClassifier(
    depth=6,
    iterations=594,
    learning_rate=0.18303897637982314,
    thread_count=-1,
    verbose=50
)

best_model.fit(X_train, y_train)

best_model.predict(X_test)
print(classification_report(y_test, y_pred))

0:	learn: 0.3016114	total: 146ms	remaining: 1m 26s
50:	learn: 0.0218480	total: 3.02s	remaining: 32.1s
100:	learn: 0.0207342	total: 5.79s	remaining: 28.3s
150:	learn: 0.0207342	total: 8.39s	remaining: 24.6s
200:	learn: 0.0207342	total: 11s	remaining: 21.5s
250:	learn: 0.0207342	total: 13.7s	remaining: 18.8s
300:	learn: 0.0207342	total: 16.4s	remaining: 16s
350:	learn: 0.0207342	total: 18.9s	remaining: 13.1s
400:	learn: 0.0207342	total: 21.5s	remaining: 10.4s
450:	learn: 0.0207342	total: 24.1s	remaining: 7.64s
500:	learn: 0.0207342	total: 26.7s	remaining: 4.95s
550:	learn: 0.0207342	total: 29.2s	remaining: 2.28s
593:	learn: 0.0207342	total: 31.4s	remaining: 0us
              precision    recall  f1-score   support

       False       0.67      0.40      0.50      3160
        True       1.00      1.00      1.00    386834

    accuracy                           0.99    389994
   macro avg       0.83      0.70      0.75    389994
weighted avg       0.99      0.99      0.99    389994



In [78]:
import tensorflow as tf
from tensorflow.keras import backend as K
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

def compute_f1(precision, recall):
    epsilon = K.epsilon()
    return 2 * (precision * recall) / (precision + recall + epsilon)

def false_recall_m(y_true, y_pred):

    y_true_neg = 1 - y_true
    y_pred_neg = 1 - y_pred
    true_negatives = K.sum(K.round(K.clip(y_true_neg * y_pred_neg, 0, 1)))
    possible_negatives = K.sum(K.round(K.clip(y_true_neg, 0, 1)))
    return true_negatives / (possible_negatives + K.epsilon())

def false_precision_m(y_true, y_pred):
    y_true_neg = 1 - y_true
    y_pred_neg = 1 - y_pred
    true_negatives = K.sum(K.round(K.clip(y_true_neg * y_pred_neg, 0, 1)))
    predicted_negatives = K.sum(K.round(K.clip(y_pred_neg, 0, 1)))
    return true_negatives / (predicted_negatives + K.epsilon())

def false_f1_m(y_true, y_pred):
    neg_precision = false_precision_m(y_true, y_pred)
    neg_recall = false_recall_m(y_true, y_pred)
    return 1-compute_f1(neg_precision, neg_recall)

num_features = X_train_scaled.shape[1]

model = Sequential([
    Dense(64, activation='relu', input_shape=(num_features,)),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer=Adam(learning_rate=0.001),
              loss='binary_crossentropy',  
              metrics=[false_f1_m])           

model.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_22 (Dense)            (None, 64)                1024      
                                                                 
 dropout_15 (Dropout)        (None, 64)                0         
                                                                 
 dense_23 (Dense)            (None, 32)                2080      
                                                                 
 dropout_16 (Dropout)        (None, 32)                0         
                                                                 
 dense_24 (Dense)            (None, 1)                 33        
                                                                 
Total params: 3137 (12.25 KB)
Trainable params: 3137 (12.25 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [79]:
history = model.fit(X_train_scaled, y_train,
                    epochs=10,
                    batch_size=10000,
                    validation_split=0.2)

