In [1]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import pandas as pd

# Load the dataset
# NOTE(review): this is an absolute, machine-specific path; point it at your
# own copy of the CSV before running.
file_path = '/Users/macbook/Desktop/project2/BankruptcyData.csv'  # Update with your actual path
data = pd.read_csv(file_path)

# Separate the predictors from the target column
features = data.drop(columns=['Bankrupt?'])  # All columns except the target variable
target = data['Bankrupt?']  # Target variable

# Stratified 70/30 train/test split so both parts keep the original class ratio
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.3, random_state=42, stratify=target)

# Define the resampling strategy.
# For both samplers a float `sampling_strategy` is the desired ratio
# n_minority / n_majority AFTER that sampler has run (binary targets only).
# Both samplers are seeded so that a fresh kernel reproduces the same
# resampled training set and therefore the same metrics.
over = SMOTE(sampling_strategy=0.5, random_state=42)  # Synthesize minority samples until minority = 50% of majority
under = RandomUnderSampler(sampling_strategy=0.7, random_state=42)  # Drop majority samples until minority = 70% of majority

# Create a pipeline for sampling and training.
# This is the imbalanced-learn Pipeline: the samplers are applied while
# fitting only, so predict() sees the test data untouched.
pipeline = Pipeline(steps=[
    ('over', over),                 # Step 1: Oversample the minority class using SMOTE
    ('under', under),               # Step 2: Undersample the majority class using RandomUnderSampler
    ('model', RandomForestClassifier(n_estimators=100, random_state=42))  # Step 3: Train RandomForestClassifier
])

# Train the model using the pipeline (resampling touches the training data only)
pipeline.fit(X_train, y_train)

# Make predictions on the held-out test set
y_pred = pipeline.predict(X_test)

# Evaluate the model; with a rare positive class, accuracy alone is
# misleading, so the confusion matrix and per-class report are shown too.
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Display the results
print(f"Accuracy: {accuracy:.4f}")
print(f"Confusion Matrix:\n{conf_matrix}")
print(f"Classification Report:\n{class_report}")


Accuracy: 0.9526
Confusion Matrix:
[[1912   68]
 [  29   37]]
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.97      0.98      1980
           1       0.35      0.56      0.43        66

    accuracy                           0.95      2046
   macro avg       0.67      0.76      0.70      2046
weighted avg       0.96      0.95      0.96      2046



In [4]:
import os

# Step 1: Load the newly filtered dataset
# `~` is expanded to the current user's home directory.
filtered_file_path = os.path.expanduser('~/Documents/project2/Filtered_BankruptcyData.csv')
filtered_data = pd.read_csv(filtered_file_path)

# Step 2: Prepare the Data
# Assuming 'Bankrupt?' is the target column
# Split the data into features (X) and target (y)
X = filtered_data.drop(columns=['Bankrupt?'])  # Drop the target column to keep only features
y = filtered_data['Bankrupt?']  # Target column

# Step 3: Split the filtered data into training and testing sets
# (stratified 70/30 split so both parts keep the original class ratio)
X_train_filtered, X_test_filtered, y_train_filtered, y_test_filtered = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y)

# Step 4: Define the resampling strategy.
# For both samplers a float `sampling_strategy` is the desired ratio
# n_minority / n_majority AFTER that sampler has run (binary targets only).
# Both samplers are seeded so that a fresh kernel reproduces the same
# resampled training set and therefore the same metrics.
over = SMOTE(sampling_strategy=0.5, random_state=42)  # Synthesize minority samples until minority = 50% of majority
under = RandomUnderSampler(sampling_strategy=0.7, random_state=42)  # Drop majority samples until minority = 70% of majority

# Step 5: Create a pipeline for sampling and training on the filtered dataset.
# This is the imbalanced-learn Pipeline: the samplers are applied while
# fitting only, so predict() sees the test data untouched.
pipeline_filtered = Pipeline(steps=[
    ('over', over),                 # Oversample the minority class using SMOTE
    ('under', under),               # Then undersample the majority class using RandomUnderSampler
    ('model', RandomForestClassifier(n_estimators=100, random_state=42))  # Finally train RandomForestClassifier
])

# Step 6: Train the model using the pipeline (resampling touches the training data only)
pipeline_filtered.fit(X_train_filtered, y_train_filtered)

# Step 7: Make predictions on the held-out test set
y_pred_filtered = pipeline_filtered.predict(X_test_filtered)

# Step 8: Evaluate the model; with a rare positive class, accuracy alone is
# misleading, so the confusion matrix and per-class report are shown too.
accuracy_filtered = accuracy_score(y_test_filtered, y_pred_filtered)
conf_matrix_filtered = confusion_matrix(y_test_filtered, y_pred_filtered)
class_report_filtered = classification_report(y_test_filtered, y_pred_filtered)

# Display the results
print("Filtered Dataset - Random Forest with Over and Under Sampling")
print(f"Accuracy: {accuracy_filtered:.4f}")
print(f"Confusion Matrix:\n{conf_matrix_filtered}")
print(f"Classification Report:\n{class_report_filtered}")



Filtered Dataset - Random Forest with Over and Under Sampling
Accuracy: 0.9409
Confusion Matrix:
[[1882   98]
 [  23   43]]
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.95      0.97      1980
           1       0.30      0.65      0.42        66

    accuracy                           0.94      2046
   macro avg       0.65      0.80      0.69      2046
weighted avg       0.97      0.94      0.95      2046

