<a href="https://colab.research.google.com/github/ozenyilmaz/.arffdatasimpleda/blob/main/pruning_trees.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install pulp numpy pandas scikit-learn


Collecting pulp
  Downloading PuLP-2.9.0-py3-none-any.whl.metadata (5.4 kB)
Downloading PuLP-2.9.0-py3-none-any.whl (17.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.7/17.7 MB[0m [31m55.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pulp
Successfully installed pulp-2.9.0


In [12]:
import numpy as np
import pandas as pd
import pulp as pl
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from scipy.io import arff
import io

# Load and preprocess the FICO dataset
def load_fico_dataset(file_path):
    # Read file
    with open(file_path, "r", encoding="utf-8") as file:
        file_lines = file.readlines()

    # Find where the data starts
    data_start_index = next((i for i, line in enumerate(file_lines) if not line.startswith("%")), None)
    arff_content = "\n".join(file_lines[data_start_index:])  # Remove metadata

    # Load ARFF into Pandas DataFrame
    data, meta = arff.loadarff(io.StringIO(arff_content))
    df = pd.DataFrame(data)

    # Convert categorical columns from byte strings to regular strings
    df = df.applymap(lambda x: x.decode("utf-8") if isinstance(x, bytes) else x)

    return df

# Function to preprocess dataset
def preprocess_data(df):
    # Convert target column to numeric (0 = Bad, 1 = Good)
    df["RiskPerformance"] = df["RiskPerformance"].map({"Bad": 0, "Good": 1})

    # Handle missing values (fill with median)
    df.fillna(df.median(), inplace=True)

    # Separate features and target
    X = df.drop(columns=["RiskPerformance"])
    y = df["RiskPerformance"]

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    return X_train, X_test, y_train, y_test

# Function to train Random Forest and extract decision paths
def train_random_forest(X_train, y_train, n_estimators=100, max_depth=5):
    clf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
    clf.fit(X_train, y_train)

    # Mocked decision paths, replace with actual path extraction if needed
    paths = np.arange(n_estimators)  # Fake path identifiers
    losses = np.random.rand(n_estimators)  # Simulated loss values
    frequencies = np.random.rand(n_estimators)  # Simulated path frequencies

    return clf, paths, losses, frequencies

# Function to optimize decision paths using PuLP
def optimize_decision_paths_pulp(losses, frequencies, max_leaves, lambd=0.5):
    L = len(losses)  # Number of extracted paths

    # Create a PuLP optimization problem (Maximization)
    model = pl.LpProblem("TreePruning", pl.LpMaximize)

    # Define binary decision variables z_j for each path
    z = [pl.LpVariable(f"z_{j}", cat="Binary") for j in range(L)]

    # Objective function: Maximize fidelity while minimizing loss
    model += pl.lpSum(lambd * frequencies[j] * z[j] for j in range(L)) - pl.lpSum((1 - lambd) * losses[j] * z[j] for j in range(L))

    # Constraint: Limit the number of selected paths (pruned model complexity)
    model += pl.lpSum(z[j] for j in range(L)) <= max_leaves

    # Solve the optimization problem
    model.solve(pl.PULP_CBC_CMD(msg=False))  # Use CBC solver (default open-source solver in PuLP)

    # Retrieve the selected paths
    selected_paths = [j for j in range(L) if pl.value(z[j]) > 0.5]

    return selected_paths, pl.value(model.objective)

# Main function to run the entire pipeline
def main():
    file_path = "/content/fico"  # Path to uploaded dataset

    # Load and preprocess dataset
    df = load_fico_dataset(file_path)
    X_train, X_test, y_train, y_test = preprocess_data(df)

    # Train Random Forest and extract paths
    clf, paths, losses, freque


In [5]:
from google.colab import files

uploaded = files.upload()  # Prompts file upload


Saving fico to fico (1)


In [6]:
import os

# List all files in the directory
for file in os.listdir("/content/"):
    print(file)  # Prints the file names


.config
fico
fico (1)
sample_data


In [7]:
file_path = "/content/fico"
print("File path:", file_path)


File path: /content/fico


In [14]:
from sklearn.preprocessing import LabelEncoder

def preprocess_data(df):
    # Convert target column to numeric (0 = Bad, 1 = Good)
    df["RiskPerformance"] = df["RiskPerformance"].map({"Bad": 0, "Good": 1})

    # Identify categorical columns
    categorical_cols = df.select_dtypes(include=["object"]).columns.tolist()
    print("📌 Categorical Columns Detected:", categorical_cols)

    # Convert categorical columns to numeric using Label Encoding
    label_encoders = {}
    for col in categorical_cols:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))  # Convert to string before encoding
        label_encoders[col] = le

    # Convert all remaining columns to numeric
    df = df.apply(pd.to_numeric, errors="coerce")

    # Handle missing values (fill with median)
    df.fillna(df.median(), inplace=True)

    # Separate features and target
    X = df.drop(columns=["RiskPerformance"])
    y = df["RiskPerformance"]

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    return X_train, X_test, y_train, y_test


In [20]:
import numpy as np
import pandas as pd
import pulp as pl
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from scipy.io import arff
import io

# 📌 Load the FICO dataset from file
def load_fico_dataset(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        file_lines = file.readlines()

    # Find where the data starts
    data_start_index = next((i for i, line in enumerate(file_lines) if not line.startswith("%")), None)
    arff_content = "\n".join(file_lines[data_start_index:])  # Remove metadata

    # Load ARFF into Pandas DataFrame
    data, meta = arff.loadarff(io.StringIO(arff_content))
    df = pd.DataFrame(data)

    # Convert categorical columns from byte strings to strings
    df = df.applymap(lambda x: x.decode("utf-8") if isinstance(x, bytes) else x)

    return df

# 📌 Preprocess dataset (Convert categorical values, handle missing data)
def preprocess_data(df):
    # Convert target column to numeric (0 = Bad, 1 = Good)
    df["RiskPerformance"] = df["RiskPerformance"].map({"Bad": 0, "Good": 1})

    # Identify categorical columns and encode them
    categorical_cols = df.select_dtypes(include=["object"]).columns.tolist()
    label_encoders = {}
    for col in categorical_cols:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))
        label_encoders[col] = le

    # Convert everything to numeric (fixes median calculation issues)
    df = df.apply(pd.to_numeric, errors="coerce")

    # Handle missing values (fill with median)
    df.fillna(df.median(), inplace=True)

    # Separate features and target
    X = df.drop(columns=["RiskPerformance"])
    y = df["RiskPerformance"]

    # Split into training and testing sets
    return train_test_split(X, y, test_size=0.2, random_state=42)

# 📌 Train Random Forest and extract decision paths (Mocked Paths)
def train_random_forest(X_train, y_train, n_estimators=100, max_depth=5):
    clf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
    clf.fit(X_train, y_train)

    # Extracted Paths (Mocked - Replace with actual path extraction if needed)
    paths = np.arange(n_estimators)  # Simulated path identifiers
    losses = np.random.rand(n_estimators)  # Simulated loss values
    frequencies = np.random.rand(n_estimators)  # Simulated path frequencies

    return clf, paths, losses, frequencies

# 📌 Solve MIP using PuLP instead of Gurobi
def optimize_decision_paths_pulp(losses, frequencies, max_leaves, lambd=0.5):
    L = len(losses)  # Number of paths

    # Create a PuLP optimization model
    model = pl.LpProblem("TreePruning", pl.LpMaximize)

    # Binary decision variables z_j for each path
    z = [pl.LpVariable(f"z_{j}", cat="Binary") for j in range(L)]

    # Objective function: Maximize fidelity while minimizing loss
    model += pl.lpSum(lambd * frequencies[j] * z[j] for j in range(L)) - pl.lpSum((1 - lambd) * losses[j] * z[j] for j in range(L))

    # Constraint: Limit the number of selected paths (pruned model complexity)
    model += pl.lpSum(z[j] for j in range(L)) <= max_leaves

    # Solve the optimization problem
    model.solve(pl.PULP_CBC_CMD(msg=False))  # Use CBC solver

    # Retrieve the selected paths
    selected_paths = [j for j in range(L) if pl.value(z[j]) > 0.5]

    return selected_paths, pl.value(model.objective)

# 📌 Main function to execute the full pipeline
# Global variable to store the dataset
X_train, X_test, y_train, y_test = None, None, None, None

def main():
    global X_train, X_test, y_train, y_test  # Ensure they are accessible globally

    file_path = "/content/fico"  # Path to uploaded dataset

    print("\n📌 Loading Dataset...")
    df = load_fico_dataset(file_path)
    print("✅ Dataset Loaded! Shape:", df.shape)

    print("\n📌 Preprocessing Data...")
    X_train, X_test, y_train, y_test = preprocess_data(df)  # Now stored globally
    print("✅ Preprocessing Done! Training Samples:", X_train.shape[0])

    print("\n📌 Training Random Forest...")
    clf, paths, losses, frequencies = train_random_forest(X_train, y_train)
    print("✅ Model Trained! Extracted Paths:", len(paths))

    print("\n📌 Optimizing Decision Paths using PuLP...")
    max_leaves = 10  # Number of paths to keep
    selected_paths, best_score = optimize_decision_paths_pulp(losses, frequencies, max_leaves)

    print("\n🔍 Selected Paths:", selected_paths)
    print("🏆 Optimized Score:", best_score if best_score is not None else "No Solution Found")

    # Extract and print decision rules
    feature_names = X_train.columns.tolist()  # Now X_train is defined
    selected_rules = extract_forest_rules(clf, feature_names, selected_paths)

    print("\n📌 Extracted Decision Rules from Selected Paths:")
    for i, rule in enumerate(selected_rules[:10], 1):  # Show first 10 rules
        print(f"Rule {i}: IF {rule} THEN RiskPerformance")

if __name__ == "__main__":
    main()




📌 Loading Dataset...


  df = df.applymap(lambda x: x.decode("utf-8") if isinstance(x, bytes) else x)


✅ Dataset Loaded! Shape: (9871, 24)

📌 Preprocessing Data...
✅ Preprocessing Done! Training Samples: 7896

📌 Training Random Forest...
✅ Model Trained! Extracted Paths: 100

📌 Optimizing Decision Paths using PuLP...

🔍 Selected Paths: [1, 13, 16, 18, 57, 72, 82, 85, 86, 90]
🏆 Optimized Score: 3.8048326206576646

📌 Extracted Decision Rules from Selected Paths:
Rule 1: IF PercentTradesNeverDelq <= 95.50 AND MSinceMostRecentInqexcl7days <= -7.50 AND NumTrades60Ever2DerogPubRec <= 3.50 AND NetFractionRevolvingBurden <= 72.00 AND NumTotalTrades <= 33.50 THEN RiskPerformance
Rule 2: IF PercentTradesNeverDelq <= 95.50 AND MSinceMostRecentInqexcl7days <= -7.50 AND NumTrades60Ever2DerogPubRec <= 3.50 AND NetFractionRevolvingBurden <= 72.00 AND NumTotalTrades > 33.50 THEN RiskPerformance
Rule 3: IF PercentTradesNeverDelq <= 95.50 AND MSinceMostRecentInqexcl7days <= -7.50 AND NumTrades60Ever2DerogPubRec <= 3.50 AND NetFractionRevolvingBurden > 72.00 THEN RiskPerformance
Rule 4: IF PercentTradesNe