In [None]:
import numpy as np
from collections import Counter

class Node:
    """One vertex of a binary decision tree.

    An internal node holds a split rule (``feature`` index and ``threshold``)
    together with its ``left`` and ``right`` children. A leaf holds only
    ``value``; ``value`` is keyword-only so it cannot be passed positionally
    by accident.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, *, value=None):
        # Leaf payload (class probabilities); stays None on internal nodes.
        self.value = value
        # Split rule: column index to test and the cut-off it is compared to.
        self.feature, self.threshold = feature, threshold
        # Child subtrees.
        self.left, self.right = left, right


class DecisionTree:
    """Binary decision tree classifier that splits on Gini impurity.

    Parameters
    ----------
    max_depth : int
        Maximum depth of the tree; nodes at this depth become leaves.
    min_samples_leaf : int
        Minimum number of samples each side of a split must keep.
    max_features : int, {'sqrt', 'log2'} or None
        Number of randomly chosen features examined at every node. ``None``
        (or any unrecognised string) means all features. The resolved value
        is clamped to ``[1, n_features]``.

    Notes
    -----
    Labels must be non-negative integers because impurity is computed with
    ``np.bincount``. Feature sub-sampling draws from the global NumPy RNG.
    """

    def __init__(self, max_depth=10, min_samples_leaf=5, max_features=None):
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.max_features = max_features
        self.root = None

    def _gini_impurity(self, y):
        """Return the Gini impurity of the label array ``y`` (0.0 if empty)."""
        if len(y) == 0:
            # An empty partition carries no impurity; avoids a 0/0 division.
            return 0.0
        counts = np.bincount(y)
        probabilities = counts / len(y)
        return 1 - np.sum(probabilities ** 2)

    def _resolve_max_features(self, n_features):
        """Translate ``self.max_features`` into a usable feature count.

        The result never exceeds ``n_features`` (sampling without replacement
        would raise otherwise) and is at least 1 whenever features exist
        (``int(log2(1))`` and similar would otherwise yield 0 and silently
        prevent any split).
        """
        if n_features < 1:
            return 0
        if isinstance(self.max_features, str):
            if self.max_features == 'sqrt':
                wanted = int(np.sqrt(n_features))
            elif self.max_features == 'log2':
                wanted = int(np.log2(n_features))
            else:
                wanted = n_features
        else:
            wanted = self.max_features if self.max_features else n_features
        return max(1, min(int(wanted), n_features))

    def _make_leaf(self, y):
        """Build a leaf whose value maps each class in ``y`` to its frequency."""
        counter = Counter(y)
        total = sum(counter.values())
        return Node(value={k: v / total for k, v in counter.items()})

    def _best_split(self, X, y, feature_indices):
        """Find the (feature, threshold) pair with the lowest weighted Gini.

        Every distinct value of each candidate feature is tried as a
        ``<=`` threshold. Returns ``(None, None)`` when no threshold leaves
        at least ``min_samples_leaf`` samples on both sides.
        """
        best_gini = float('inf')
        best_feature, best_threshold = None, None
        n_samples = len(y)
        # Never accept a split with an empty side, even if min_samples_leaf <= 0.
        min_leaf = max(1, self.min_samples_leaf)

        for feature in feature_indices:
            column = X[:, feature]
            for threshold in np.unique(column):
                left_mask = column <= threshold
                n_left = np.count_nonzero(left_mask)
                n_right = n_samples - n_left
                if n_left < min_leaf or n_right < min_leaf:
                    continue

                # Impurity of the children, weighted by their share of samples.
                gini_left = self._gini_impurity(y[left_mask])
                gini_right = self._gini_impurity(y[~left_mask])
                weighted_gini = (n_left * gini_left + n_right * gini_right) / n_samples

                if weighted_gini < best_gini:
                    best_gini = weighted_gini
                    best_feature = feature
                    best_threshold = threshold

        return best_feature, best_threshold

    def _build_tree(self, X, y, depth=0):
        """Recursively grow the subtree for samples ``X`` / labels ``y``."""
        # Stop on depth limit, too few samples to split, or a pure node.
        if (depth >= self.max_depth or
                len(y) < self.min_samples_leaf * 2 or
                len(np.unique(y)) == 1):
            return self._make_leaf(y)

        # Random subset of features considered at this node.
        n_features = X.shape[1]
        feature_indices = np.random.choice(
            n_features,
            size=self._resolve_max_features(n_features),
            replace=False
        )

        feature, threshold = self._best_split(X, y, feature_indices)
        if feature is None:
            # No admissible split among the sampled features.
            return self._make_leaf(y)

        left_mask = X[:, feature] <= threshold
        right_mask = ~left_mask

        left = self._build_tree(X[left_mask], y[left_mask], depth + 1)
        right = self._build_tree(X[right_mask], y[right_mask], depth + 1)

        return Node(feature=feature, threshold=threshold, left=left, right=right)

    def fit(self, X, y):
        """Train the tree on a 2-D feature matrix ``X`` and label vector ``y``.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D or ``X`` and ``y`` differ in length.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        if len(X) != len(y):
            raise ValueError("X and y must contain the same number of samples")
        self.root = self._build_tree(X, y)

    def _traverse_tree(self, x, node):
        """Walk from ``node`` down to a leaf for sample ``x``; return its value."""
        while node.value is None:
            node = node.left if x[node.feature] <= node.threshold else node.right
        return node.value

    def predict_proba(self, X):
        """Return an array holding one ``{class: probability}`` dict per row of ``X``."""
        return np.array([self._traverse_tree(x, self.root) for x in X])

    def predict(self, X):
        """Return the most probable class label for every row of ``X``."""
        probas = self.predict_proba(X)
        return np.array([max(p, key=p.get) for p in probas])

class RandomForest:
    """Random forest classifier: bagged ``DecisionTree`` learners, soft voting.

    Parameters
    ----------
    n_trees : int
        Number of trees to grow.
    max_depth, min_samples_leaf, max_features
        Forwarded unchanged to every ``DecisionTree``.
    bootstrap_features : bool
        If true, each tree is trained on a random selection of columns drawn
        *with replacement* (so a column may appear more than once). The
        number of columns drawn is derived from ``max_features``.

    Attributes set by ``fit``
    -------------------------
    trees : list
        Fitted trees. Each carries ``feature_importances_`` (impurity
        decrease per original column) and, when ``bootstrap_features`` is
        on, ``feature_indices`` (the original columns it was trained on).
    classes_ : list
        Sorted distinct labels seen during training.
    n_features_ : int
        Number of columns in the training matrix.

    Notes
    -----
    All randomness comes from the global NumPy RNG; call ``np.random.seed``
    beforehand for reproducible forests.
    """

    def __init__(self, n_trees=100, max_depth=10, min_samples_leaf=5,
                 max_features='sqrt', bootstrap_features=False):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.max_features = max_features
        self.bootstrap_features = bootstrap_features
        self.trees = []
        self.classes_ = None
        self.n_features_ = None

    @staticmethod
    def _gini_impurity(y):
        """Return the Gini impurity of the labels in ``y`` (0.0 if empty)."""
        n = len(y)
        if n == 0:
            return 0.0
        _, counts = np.unique(y, return_counts=True)
        proportions = counts / n
        return 1.0 - float(np.sum(proportions ** 2))

    def _resolve_max_features(self, n_features):
        """Translate ``self.max_features`` into a column count in ``[1, n_features]``."""
        if n_features < 1:
            return 0
        if isinstance(self.max_features, str):
            if self.max_features == 'sqrt':
                wanted = int(np.sqrt(n_features))
            elif self.max_features == 'log2':
                wanted = int(np.log2(n_features))
            else:
                wanted = n_features
        else:
            wanted = self.max_features if self.max_features else n_features
        # int(log2(1)) == 0 would otherwise hand the tree an empty matrix.
        return max(1, min(int(wanted), n_features))

    def _bootstrap_sample(self, X, y):
        """Draw ``len(X)`` rows with replacement and return ``(X_boot, y_boot)``."""
        n_samples = X.shape[0]
        indices = np.random.choice(n_samples, size=n_samples, replace=True)
        return X[indices], y[indices]

    def fit(self, X, y):
        """Train ``n_trees`` trees, each on its own bootstrap sample.

        Per-tree feature importances are computed here, while the training
        sample is still available, because tree nodes do not store sample
        counts or impurities.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D or ``X`` and ``y`` differ in length.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        if len(X) != len(y):
            raise ValueError("X and y must contain the same number of samples")

        n_features = X.shape[1]
        self.trees = []
        self.n_features_ = n_features
        self.classes_ = np.unique(y).tolist()

        for _ in range(self.n_trees):
            tree = DecisionTree(
                max_depth=self.max_depth,
                min_samples_leaf=self.min_samples_leaf,
                max_features=self.max_features
            )

            X_sample, y_sample = self._bootstrap_sample(X, y)

            if self.bootstrap_features:
                # Columns are drawn with replacement, so duplicates are possible.
                feature_indices = np.random.choice(
                    n_features,
                    size=self._resolve_max_features(n_features),
                    replace=True
                )
                X_sample = X_sample[:, feature_indices]
                tree.feature_indices = feature_indices
            else:
                feature_indices = np.arange(n_features)

            tree.fit(X_sample, y_sample)

            # Importance is measured in the tree's local column space, then
            # scattered back to the original columns; np.add.at accumulates
            # correctly when the same original column was drawn twice.
            local = self._compute_tree_importance(
                tree.root, X_sample.shape[1], X_sample, y_sample
            )
            per_column = np.zeros(n_features)
            np.add.at(per_column, feature_indices, local)
            tree.feature_importances_ = per_column / max(len(y_sample), 1)

            self.trees.append(tree)

    def predict_proba(self, X):
        """Predict class probabilities by averaging the trees (soft voting).

        Returns a list with one ``{class: probability}`` dict per row of
        ``X``. Every class seen during ``fit`` is present in every dict
        (with probability 0.0 if no tree voted for it), so callers can index
        by class label without a ``KeyError``.
        """
        X = np.asarray(X)
        tree_probas = []
        for tree in self.trees:
            if hasattr(tree, 'feature_indices'):
                # The tree was trained on a column subset; give it the same view.
                tree_probas.append(tree.predict_proba(X[:, tree.feature_indices]))
            else:
                tree_probas.append(tree.predict_proba(X))

        known_classes = getattr(self, 'classes_', None) or []

        avg_proba = []
        for i in range(len(X)):
            class_probs = {cls: 0.0 for cls in known_classes}
            for proba in tree_probas:
                for cls, p in proba[i].items():
                    class_probs[cls] = class_probs.get(cls, 0) + p
            total = sum(class_probs.values())
            if total > 0:
                class_probs = {k: v / total for k, v in class_probs.items()}
            avg_proba.append(class_probs)

        return avg_proba

    def predict(self, X):
        """Predict class labels: the class with the highest averaged probability."""
        probas = self.predict_proba(X)
        return np.array([max(p, key=p.get) for p in probas])

    def get_feature_importances(self, feature_names=None):
        """Return normalised impurity-based feature importances.

        Parameters
        ----------
        feature_names : sequence, optional
            One name per training column; defaults to ``0 .. n_features-1``.

        Returns
        -------
        dict or None
            ``{name: importance}`` with importances summing to 1 (all zeros
            if no tree made a split), or ``None`` if the forest is unfitted.

        Raises
        ------
        RuntimeError
            If the forest lacks the data recorded by ``fit``.
        ValueError
            If ``feature_names`` does not match the number of columns.
        """
        if not self.trees:
            return None

        n_features = getattr(self, 'n_features_', None)
        if n_features is None:
            raise RuntimeError("feature importances are recorded by fit(); refit the model")

        names = list(range(n_features)) if feature_names is None else list(feature_names)
        if len(names) != n_features:
            raise ValueError(
                f"expected {n_features} feature names, got {len(names)}"
            )

        totals = np.zeros(n_features)
        for tree in self.trees:
            totals += tree.feature_importances_

        grand_total = totals.sum()
        if grand_total > 0:
            totals = totals / grand_total

        return {name: float(score) for name, score in zip(names, totals)}

    def _compute_tree_importance(self, node, n_features, X=None, y=None):
        """Sum the sample-weighted Gini decrease per feature below ``node``.

        ``X`` / ``y`` are the samples that reach ``node``; they are routed
        through the split exactly as at prediction time. The result is
        indexed by the tree's own column numbering and is not normalised.

        Raises
        ------
        ValueError
            If ``node`` is an internal node and ``X`` or ``y`` is missing.
        """
        importances = np.zeros(n_features)

        if node.value is not None:  # Leaf node
            return importances

        if X is None or y is None:
            raise ValueError("X and y are required to score an internal node")

        goes_left = X[:, node.feature] <= node.threshold
        y_left, y_right = y[goes_left], y[~goes_left]

        # n * impurity(parent) - sum over children of n_child * impurity(child)
        importances[node.feature] += (
            len(y) * self._gini_impurity(y)
            - len(y_left) * self._gini_impurity(y_left)
            - len(y_right) * self._gini_impurity(y_right)
        )

        importances += self._compute_tree_importance(
            node.left, n_features, X[goes_left], y_left
        )
        importances += self._compute_tree_importance(
            node.right, n_features, X[~goes_left], y_right
        )
        return importances

    def summary(self):
        """Print the forest's hyper-parameters."""
        print("Random Forest Model Summary:")
        print(f"Number of Trees: {self.n_trees}")
        print(f"Max Depth: {self.max_depth}")
        print(f"Min Samples per Leaf: {self.min_samples_leaf}")
        print(f"Max Features: {self.max_features}")
        print(f"Bootstrap Features: {self.bootstrap_features}")



In [None]:
# Example usage with the bankruptcy dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

# Read the training table; 'Bankrupt?' is the target, every other column a feature.
data = pd.read_csv('Train.csv')
feature_names = [column for column in data.columns if column != 'Bankrupt?']
X = data[feature_names].values
y = data['Bankrupt?'].values

# Hold out 20% for testing, keeping the class ratio equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

# Standardise using statistics learned from the training part only.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [None]:
from imblearn.over_sampling import SMOTE
# Oversample only the minority class, and only on the training split, so the
# test split keeps its original class balance.
smote = SMOTE(
    random_state=42,
    sampling_strategy='minority',
)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [None]:
# Train the forest on the SMOTE-balanced training data.
# RandomForest draws its bootstrap samples and feature subsets from the global
# NumPy RNG, so seed it to make the fitted model (and every metric derived
# from it below) reproducible, in line with the random_state=42 used elsewhere.
np.random.seed(42)

rfnew = RandomForest(
    n_trees=100,
    max_depth=10,
    min_samples_leaf=5,
    max_features='sqrt',  # 'sqrt', 'log2', or an integer number of features
    bootstrap_features=True
)
rfnew.fit(X_resampled, y_resampled)

In [None]:
# Label the held-out samples and print per-class precision / recall / F1.
y_prednew = rfnew.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_prednew))

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import precision_score, recall_score, confusion_matrix

# Threshold sweep for the positive class (1).
# rfnew.predict_proba returns one {class_label: probability} dict per sample,
# e.g. [{0: 0.989, 1: 0.011}, {0: 0.988, 1: 0.012}, ...]; y_test holds the
# true labels (0 or 1).
y_proba = rfnew.predict_proba(X_test)

# Probability of class 1 per sample. A dict may lack the key when no tree
# voted for class 1, so fall back to 0.0 rather than raising KeyError.
y_proba_class1 = np.array([d.get(1, 0.0) for d in y_proba])

# Candidate decision thresholds and the metrics collected for each of them.
thresholds = np.linspace(0.1, 0.9, 10)
precisions = []
recalls = []
f1_scores = []
confusion_matrices = []

print("Testing different thresholds to improve recall for class 1:\n")

for thresh in thresholds:
    # Apply threshold to get predicted class (1 if >= threshold, else 0)
    y_pred = (y_proba_class1 >= thresh).astype(int)

    # zero_division=0: a threshold that predicts no positives scores 0
    # instead of emitting an undefined-metric warning.
    precision = precision_score(y_test, y_pred, pos_label=1, zero_division=0)
    recall = recall_score(y_test, y_pred, pos_label=1, zero_division=0)
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    cm = confusion_matrix(y_test, y_pred)

    # Store results
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)
    confusion_matrices.append(cm)

    # Print current threshold results
    print(f"Threshold: {thresh:.2f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")
    print("Confusion Matrix:")
    print(cm)
    print("-"*40)

plt.figure(figsize=(12, 5))

# Left panel: each metric as a function of the threshold.
plt.subplot(1, 2, 1)
plt.plot(thresholds, precisions, 'b-', label='Precision')
plt.plot(thresholds, recalls, 'r-', label='Recall')
plt.plot(thresholds, f1_scores, 'g-', label='F1-score')
plt.xlabel('Threshold')
plt.ylabel('Score')
plt.title('Metrics vs Threshold')
plt.legend()
plt.grid(True)

# Right panel: precision against recall, each point labelled with its threshold.
plt.subplot(1, 2, 2)
plt.plot(recalls, precisions, 'mo-')
for i, thresh in enumerate(thresholds):
    plt.annotate(f"{thresh:.2f}", (recalls[i], precisions[i]),
                 textcoords="offset points", xytext=(5,5), ha='center')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.grid(True)

plt.tight_layout()
plt.show()

# Pick the threshold with the best F1-score.
# NOTE: this selection uses the test set, so the reported metrics at the
# chosen threshold are optimistic; a separate validation split would avoid that.
optimal_idx = np.argmax(f1_scores)
optimal_threshold = thresholds[optimal_idx]
print(f"\nOptimal threshold: {optimal_threshold:.2f}")
print(f"At this threshold - Precision: {precisions[optimal_idx]:.4f}, Recall: {recalls[optimal_idx]:.4f}, F1: {f1_scores[optimal_idx]:.4f}")

In [None]:
# Classify as 1 when the class-1 probability reaches the fixed 0.63 cut-off.
y_pred_with_threshold = np.where(y_proba_class1 >= 0.63, 1, 0)
print(classification_report(y_true=y_test, y_pred=y_pred_with_threshold))

In [None]:
# Print the hyper-parameters the forest was configured with.
rfnew.summary()

In [None]:
import pickle

# Saving the model to a file
# NOTE: the file is a standard pickle despite its '.pt' extension; it must be
# read back with pickle.load, and the RandomForest class must be importable
# (or defined) in the session that loads it.
with open('random_forest_model.pt', 'wb') as file:
    pickle.dump(rfnew, file)



In [None]:
# Reload the pickled forest.
# NOTE: pickle.load can execute arbitrary code; only load files you created
# yourself or otherwise trust.
with open('random_forest_model.pt', 'rb') as file:
    loaded_model = pickle.load(file)

# Predicting with the loaded model
y_proba = loaded_model.predict_proba(X_test)

# Recompute the class-1 probabilities from the *loaded* model's output;
# reusing the array from the earlier cell would evaluate the in-memory model
# and say nothing about whether the save/load round trip worked.
y_proba_class1 = np.array([d.get(1, 0.0) for d in y_proba])

y_pred_with_threshold = (y_proba_class1 >= 0.63).astype(int)
print(classification_report(y_test, y_pred_with_threshold))




In [None]:
# Persist the scaler object with pickle so the identical feature scaling can
# be re-applied to new data before it is passed to the saved model.
# NOTE: the file is a pickle despite its '.pth' extension.
with open("scaler.pth", "wb") as f:
    pickle.dump(scaler, f)