In [None]:
import numpy as np
import pandas as pd

class TreeNode:
    """One node of a regression tree grown from per-row gradients and hessians.

    Constructing a node immediately grows the whole subtree below it: the
    constructor computes the node's own output value, searches the sampled
    columns for the best admissible split and, if one is found, recursively
    builds the left and right children.

    Parameters
    ----------
    data : 2-D array indexed as ``data[rows, col]``.
    grad, hess : 1-D arrays of gradients / hessians, indexed by row number.
    indices : integer array with the row numbers that belong to this node.
    col_sample : fraction of the columns considered when splitting this node
        (at least one column is always sampled).
    min_samples : minimum number of rows required on each side of a split.
    min_hess : minimum hessian sum required on each side of a split.
    max_depth : remaining depth budget; a node with ``max_depth <= 0`` is
        always a leaf.
    reg_lambda : constant added to hessian sums in the value and gain formulas.
    min_gain : penalty subtracted from the raw gain; a split is accepted only
        when the penalised gain is strictly positive.
    epsilon : stored and forwarded to children, not otherwise used here.
    """

    def __init__(self, data, grad, hess, indices, col_sample=0.8, min_samples=5, min_hess=1, max_depth=10, reg_lambda=1, min_gain=1, epsilon=0.1):
        self.data, self.grad, self.hess = data, grad, hess
        self.indices = indices
        self.max_depth = max_depth
        self.min_samples = min_samples
        self.reg_lambda = reg_lambda
        self.min_gain = min_gain
        self.min_hess = min_hess
        self.total_samples = len(indices)
        self.total_features = data.shape[1]
        self.col_sample = col_sample
        self.epsilon = epsilon
        # Always sample at least one column: rounding a small fraction down to
        # zero would leave the node with nothing to split on.
        n_cols = max(1, int(round(self.col_sample * self.total_features)))
        self.selected_cols = np.random.permutation(self.total_features)[:n_cols]
        self.value = self.compute_optimal_value(self.grad[self.indices], self.hess[self.indices])
        # Stays at -inf until an admissible split is found (see is_leaf).
        self.best_score = float('-inf')
        self.split_feature = None
        self.split_value = None
        self.left_child = None
        self.right_child = None
        self.split_node()

    def compute_optimal_value(self, grad, hess):
        """Return the node output ``-sum(grad) / (sum(hess) + reg_lambda)``."""
        return -np.sum(grad) / (np.sum(hess) + self.reg_lambda)

    def split_node(self):
        """Search for the best split and, if one is accepted, build both children."""
        if self.max_depth <= 0:
            # Depth budget exhausted: the node is a leaf whatever the search
            # would find, so skip the search entirely.
            return
        for feature in self.selected_cols:
            self.evaluate_split(feature)
        if self.is_leaf():
            return
        feature_data = self.split_feature_data()
        left_indices = np.nonzero(feature_data <= self.split_value)[0]
        right_indices = np.nonzero(feature_data > self.split_value)[0]
        # Hyper-parameters handed to both children; only the depth budget shrinks.
        child_params = (self.col_sample, self.min_samples, self.min_hess, self.max_depth - 1,
                        self.reg_lambda, self.min_gain, self.epsilon)
        self.left_child = TreeNode(self.data, self.grad, self.hess, self.indices[left_indices], *child_params)
        self.right_child = TreeNode(self.data, self.grad, self.hess, self.indices[right_indices], *child_params)

    def evaluate_split(self, feature_idx):
        """Try every distinct value of one column as a threshold.

        A candidate is rejected when either side has fewer than ``min_samples``
        rows, when either side's hessian sum is below ``min_hess``, or when its
        penalised gain is not strictly positive. The best surviving candidate
        (across all calls on this node) is recorded in ``best_score``,
        ``split_feature`` and ``split_value``.
        """
        feature_data = self.data[self.indices, feature_idx]
        # Slice the node-local statistics once instead of once per threshold.
        node_grad = self.grad[self.indices]
        node_hess = self.hess[self.indices]
        for value in np.unique(feature_data):
            left_mask = feature_data <= value
            right_mask = feature_data > value
            if np.count_nonzero(left_mask) < self.min_samples or np.count_nonzero(right_mask) < self.min_samples:
                continue
            hess_left = node_hess[left_mask].sum()
            hess_right = node_hess[right_mask].sum()
            if hess_left < self.min_hess or hess_right < self.min_hess:
                continue
            score = self._gain_from_sums(node_grad[left_mask].sum(), hess_left,
                                         node_grad[right_mask].sum(), hess_right)
            if score <= 0:
                # The split does not pay for its min_gain penalty.
                continue
            if score > self.best_score:
                self.best_score = score
                self.split_feature = feature_idx
                self.split_value = value

    def _gain_from_sums(self, grad_left, hess_left, grad_right, hess_right):
        """Penalised gain of a split, given the gradient/hessian sums of each side."""
        left_term = grad_left ** 2 / (hess_left + self.reg_lambda)
        right_term = grad_right ** 2 / (hess_right + self.reg_lambda)
        parent_term = (grad_left + grad_right) ** 2 / (hess_left + hess_right + self.reg_lambda)
        return 0.5 * (left_term + right_term - parent_term) - self.min_gain

    def calculate_gain(self, left_mask, right_mask):
        """Penalised gain of the split described by two boolean masks over this node's rows."""
        grad, hess = self.grad[self.indices], self.hess[self.indices]
        return self._gain_from_sums(grad[left_mask].sum(), hess[left_mask].sum(),
                                    grad[right_mask].sum(), hess[right_mask].sum())

    def is_leaf(self):
        """Return True when no split was accepted or the depth budget is used up."""
        return self.best_score == float('-inf') or self.max_depth <= 0

    def split_feature_data(self):
        """Return this node's rows of the column chosen for the split."""
        return self.data[self.indices, self.split_feature]

    def predict_sample(self, sample):
        """Route one row down the tree and return the value of the leaf it reaches."""
        if self.is_leaf():
            return self.value
        return self.left_child.predict_sample(sample) if sample[self.split_feature] <= self.split_value else self.right_child.predict_sample(sample)

    def predict(self, X):
        """Return an array with one leaf value per row of ``X``."""
        # np.asarray makes row iteration work for any 2-D array-like input.
        return np.array([self.predict_sample(sample) for sample in np.asarray(X)])


class GradientBoostingTree:
    """A single boosted tree: holds the root ``TreeNode`` and delegates to it."""

    def fit(self, data, grad, hess, col_sample=0.8, min_samples=5, min_hess=1, max_depth=10, reg_lambda=1, min_gain=1, epsilon=0.1):
        """Grow a tree over every row of ``data`` and return ``self``.

        All hyper-parameters are passed straight through to ``TreeNode``.
        """
        every_row = np.arange(len(data))
        self.tree = TreeNode(
            data,
            grad,
            hess,
            every_row,
            col_sample=col_sample,
            min_samples=min_samples,
            min_hess=min_hess,
            max_depth=max_depth,
            reg_lambda=reg_lambda,
            min_gain=min_gain,
            epsilon=epsilon,
        )
        return self

    def predict(self, X):
        """Return the root node's predictions for the rows of ``X``."""
        root = self.tree
        return root.predict(X)


class GradientBoostingClassifier:
    """Binary (labels 0/1) gradient-boosted classifier built from ``GradientBoostingTree``.

    Raw scores start at a constant and each boosting round adds
    ``lr * tree.predict(X)``; probabilities are the sigmoid of the raw score.

    Attributes set by ``fit``:
        trees      -- the fitted trees, in boosting order.
        lr         -- the learning rate used in training and in prediction.
        base_score -- raw training scores after the last boosting round.
        pred_mean  -- class-weighted mean training probability, used by
                      ``predict`` as the decision threshold.
    """

    # Constant raw score every sample starts from.
    _INITIAL_SCORE = 1.0

    def __init__(self):
        self.trees = []
        self.base_score = None
        self.pred_mean = None
        # Scale applied to every tree's output; overwritten by fit().
        self.lr = 1.0

    @staticmethod
    def sigmoid(x):
        """Logistic function ``1 / (1 + exp(-x))``."""
        return 1 / (1 + np.exp(-x))

    def compute_gradient(self, preds, labels):
        """Gradient with respect to the raw score: ``sigmoid(preds) - labels``."""
        preds = self.sigmoid(preds)
        return preds - labels

    def compute_hessian(self, preds):
        """Hessian with respect to the raw score: ``p * (1 - p)`` with ``p = sigmoid(preds)``."""
        preds = self.sigmoid(preds)
        return preds * (1 - preds)

    def fit(self, X, y, col_sample=0.8, min_hess=1, max_depth=5, min_samples=5, lr=0.4, rounds=5, reg_lambda=1.5, min_gain=1, epsilon=0.1):
        """Fit ``rounds`` boosted trees on ``X`` / ``y`` and return ``self``.

        Each sample is weighted by the relative frequency of the *other* class,
        so both classes contribute comparably to the gradients and hessians.
        The tree hyper-parameters are forwarded to ``GradientBoostingTree.fit``.
        """
        y = np.asarray(y)
        # Start from a clean ensemble so that refitting does not stack new
        # trees on top of the ones from a previous fit.
        self.trees = []
        # Remember the learning rate: prediction must scale tree outputs the
        # same way training did.
        self.lr = lr
        self.base_score = np.full(X.shape[0], self._INITIAL_SCORE, dtype=np.float64)

        # Compute class weights
        n_total = len(y)
        n_class_0 = np.sum(y == 0)
        n_class_1 = np.sum(y == 1)

        w_0 = n_class_1 / n_total if n_class_0 > 0 else 1
        w_1 = n_class_0 / n_total if n_class_1 > 0 else 1

        class_weights = np.where(y == 0, w_0, w_1)  # Assign weights per sample

        for _ in range(rounds):
            grad = self.compute_gradient(self.base_score, y) * class_weights
            hess = self.compute_hessian(self.base_score) * class_weights

            tree = GradientBoostingTree().fit(X, grad, hess, col_sample, min_samples, min_hess, max_depth, reg_lambda, min_gain, epsilon)
            self.base_score += lr * tree.predict(X)
            self.trees.append(tree)

        # Compute weighted mean for pred_mean
        predictions = self.predict_proba(X)
        self.pred_mean = np.average(predictions, weights=class_weights)
        return self

    def predict_proba(self, X):
        """Return the positive-class probability for every row of ``X`` (1-D array)."""
        raw_scores = np.full(X.shape[0], self._INITIAL_SCORE, dtype=np.float64)
        for tree in self.trees:
            # Same lr-scaled update as in fit(), so scores match training.
            raw_scores += self.lr * tree.predict(X)
        return self.sigmoid(raw_scores)

    def predict(self, X):
        """Return 0/1 labels: 1 where the probability exceeds ``pred_mean``."""
        predictions = self.predict_proba(X)
        return (predictions > self.pred_mean).astype(int)

import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_recall_curve, auc


# Read the training table (the path is a placeholder for the real dataset).
df = pd.read_csv("Train.csv")  # Replace with your actual dataset

# Column 0 holds the target; every remaining column is a feature.
X = df.iloc[:, 1:]
y = df.iloc[:, 0]

# Stratified 80/20 hold-out split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=5,
)

# Isolation Forest fitted on the training features only.
iso_forest = IsolationForest(
    n_estimators=310,
    contamination=0.08362674314960916,
    random_state=42,
)
iso_forest.fit(X_train)

# Add the anomaly score as an extra feature. ``assign`` returns a new frame,
# so the frames produced by the split are never modified in place
# (this avoids SettingWithCopyWarning).
X_train = X_train.assign(anomaly_score=iso_forest.decision_function(X_train))
X_test = X_test.assign(anomaly_score=iso_forest.decision_function(X_test))

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, f1_score
from scipy.stats import uniform, randint
import numpy as np

# Search space for the randomized hyper-parameter search.
# NOTE: scipy's ``uniform(loc, scale)`` samples from [loc, loc + scale]; the
# second argument is the WIDTH of the interval, not its upper bound.
# ``randint(low, high)`` samples integers from low up to high - 1.
param_dist = {
    'col_sample': uniform(0.6, 0.4),  # Values between 0.6 and 1.0
    'min_hess': randint(1, 10),
    'max_depth': randint(3, 10),
    'min_samples': randint(3, 10),
    'lr': uniform(0.2, 0.4),  # Values between 0.2 and 0.6
    'rounds': randint(3, 10),
    'reg_lambda': uniform(0.5, 2.0),  # Values between 0.5 and 2.5
    'min_gain': uniform(0.5, 1.0),  # Values between 0.5 and 1.5
}

# F1 on the estimator's predicted labels is the model-selection metric
# handed to RandomizedSearchCV.
scorer = make_scorer(score_func=f1_score)

# Convert GradientBoostingClassifier into a scikit-learn compatible estimator
from sklearn.base import BaseEstimator, ClassifierMixin

class GBTWrapper(BaseEstimator, ClassifierMixin):
    """scikit-learn style adapter around ``GradientBoostingClassifier``.

    The constructor only records hyper-parameters; ``fit`` builds a fresh
    ``GradientBoostingClassifier``, stores it in ``self.model`` and trains it
    with those hyper-parameters.
    """

    def __init__(self, col_sample=0.8, min_hess=1, max_depth=5, min_samples=5, lr=0.4, rounds=5, reg_lambda=1.5, min_gain=1, epsilon=0.1):
        self.col_sample = col_sample
        self.min_hess = min_hess
        self.max_depth = max_depth
        self.min_samples = min_samples
        self.lr = lr
        self.rounds = rounds
        self.reg_lambda = reg_lambda
        self.min_gain = min_gain
        self.epsilon = epsilon
        # Underlying model; created by fit().
        self.model = None

    def fit(self, X, y):
        """Train a new underlying model on ``X`` / ``y`` and return ``self``."""
        # The tree code indexes rows and columns positionally, so make sure it
        # receives plain arrays even when a DataFrame / Series is passed in.
        X = np.asarray(X)
        y = np.asarray(y)
        # Fitted classifiers are expected to expose the labels seen in training.
        self.classes_ = np.unique(y)
        self.model = GradientBoostingClassifier()
        self.model.fit(X, y, col_sample=self.col_sample, min_hess=self.min_hess, max_depth=self.max_depth,
                       min_samples=self.min_samples, lr=self.lr, rounds=self.rounds, reg_lambda=self.reg_lambda,
                       min_gain=self.min_gain, epsilon=self.epsilon)
        return self

    def predict(self, X):
        """Return the underlying model's 0/1 predictions for the rows of ``X``."""
        return self.model.predict(np.asarray(X))

# Randomized search: 5 sampled configurations, 2-fold cross-validation each,
# scored with ``scorer`` and run on all available cores.
random_search = RandomizedSearchCV(
    estimator=GBTWrapper(),
    param_distributions=param_dist,
    n_iter=5,
    scoring=scorer,
    n_jobs=-1,
    cv=2,
    verbose=1,
    random_state=42,
)

# The custom trees expect plain arrays, hence ``.values``.
random_search.fit(X_train.values, y_train.values)

# Report the winning configuration and its cross-validated score.
print("Best Parameters:", random_search.best_params_)
print("Best F1 Score:", random_search.best_score_)

Fitting 2 folds for each of 5 candidates, totalling 10 fits
Best Parameters: {'col_sample': np.float64(0.6464504127199977), 'lr': np.float64(0.5645269111408631), 'max_depth': 7, 'min_gain': np.float64(1.1757488779543146), 'min_hess': 2, 'min_samples': 6, 'reg_lambda': np.float64(2.8555043892121317), 'rounds': 8}
Best F1 Score: 0.3964211300404461


In [None]:
# Best estimator found by the search.
estimator = random_search.best_estimator_

In [None]:
# Display the underlying model's predicted probabilities for the hold-out rows.
estimator.model.predict_proba(X_test.values)

In [None]:
# Positive-class probabilities for the hold-out rows, as a flat 1-D vector.
y_pred_prob = np.ravel(estimator.model.predict_proba(X_test.values))

# Hard labels from a fixed probability cut-off (can be tuned further).
threshold = 0.9
is_positive = y_pred_prob >= threshold
y_pred = is_positive.astype(int)

# Threshold-free metric: area under the precision-recall curve.
precision, recall, _ = precision_recall_curve(y_test, y_pred_prob)
pr_auc = auc(recall, precision)

# Threshold-dependent metrics.
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Summary
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Precision-Recall AUC: {pr_auc:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.9688
F1 Score: 0.4848
Precision-Recall AUC: 0.4059

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.98      0.98      1060
           1       0.46      0.52      0.48        31

    accuracy                           0.97      1091
   macro avg       0.72      0.75      0.73      1091
weighted avg       0.97      0.97      0.97      1091

