In [4]:
import pandas as pd
import numpy as np
import skfuzzy as fuzz
import skfuzzy.control as ctrl
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Define fuzzy logic system 1
def create_fuzzy_system_1():
    """Build the two-term ('low'/'high') risk controller.

    Inputs are 'income' (0..100000 in steps of 1000), 'debt_to_income'
    (0..100) and 'credit_score' (300..850); the output is 'risk' (0..10).
    Every variable gets triangular 'low' and 'high' membership functions.

    The five rules do not cover every term combination (for example
    high income / high debt-to-income / low credit score matches no rule),
    so some inputs activate nothing.

    Returns:
        A fresh ``ctrl.ControlSystemSimulation`` wrapping the rule base.
    """
    # Universes of discourse for the three antecedents and the consequent.
    income = ctrl.Antecedent(np.arange(0, 100001, 1000), 'income')
    debt_to_income = ctrl.Antecedent(np.arange(0, 101, 1), 'debt_to_income')
    credit_score = ctrl.Antecedent(np.arange(300, 851, 1), 'credit_score')
    risk = ctrl.Consequent(np.arange(0, 11, 1), 'risk')

    # Triangular membership functions as (variable, term, [left, peak, right]).
    term_specs = (
        (income, 'low', [0, 0, 50000]),
        (income, 'high', [30000, 100000, 100000]),
        (debt_to_income, 'low', [0, 0, 40]),
        (debt_to_income, 'high', [20, 100, 100]),
        (credit_score, 'low', [300, 300, 600]),
        (credit_score, 'high', [550, 850, 850]),
        (risk, 'low', [0, 0, 5]),
        (risk, 'high', [5, 10, 10]),
    )
    for variable, term, corners in term_specs:
        variable[term] = fuzz.trimf(variable.universe, corners)

    # Rule table: (income, debt_to_income, credit_score) -> risk.
    rule_table = (
        ('high', 'low', 'high', 'low'),
        ('low', 'high', 'low', 'high'),
        ('high', 'high', 'high', 'low'),
        ('low', 'low', 'high', 'low'),
        ('low', 'high', 'high', 'high'),
    )
    rules = [
        ctrl.Rule(
            income[inc] & debt_to_income[dti] & credit_score[score],
            risk[level],
        )
        for inc, dti, score, level in rule_table
    ]

    return ctrl.ControlSystemSimulation(ctrl.ControlSystem(rules))

# Define fuzzy logic system 2
def create_fuzzy_system_2():
    """Build the three-term ('low'/'middle'/'high') risk controller.

    Uses the same universes as the two-term system: 'income' (0..100000 in
    steps of 1000), 'debt_to_income' (0..100), 'credit_score' (300..850)
    and the output 'risk' (0..10). Each variable additionally gets a
    triangular 'middle' membership function, and the rule base has eight
    rules. The rules still cover only a subset of the possible term
    combinations, so some inputs activate nothing.

    Returns:
        A fresh ``ctrl.ControlSystemSimulation`` wrapping the rule base.
    """
    # Universes of discourse for the three antecedents and the consequent.
    income = ctrl.Antecedent(np.arange(0, 100001, 1000), 'income')
    debt_to_income = ctrl.Antecedent(np.arange(0, 101, 1), 'debt_to_income')
    credit_score = ctrl.Antecedent(np.arange(300, 851, 1), 'credit_score')
    risk = ctrl.Consequent(np.arange(0, 11, 1), 'risk')

    # Triangular membership functions as (variable, term, [left, peak, right]).
    term_specs = (
        (income, 'low', [0, 0, 50000]),
        (income, 'middle', [20000, 50000, 80000]),
        (income, 'high', [30000, 100000, 100000]),
        (debt_to_income, 'low', [0, 0, 40]),
        (debt_to_income, 'middle', [20, 50, 80]),
        (debt_to_income, 'high', [20, 100, 100]),
        (credit_score, 'low', [300, 300, 600]),
        (credit_score, 'middle', [500, 675, 800]),
        (credit_score, 'high', [550, 850, 850]),
        (risk, 'low', [0, 0, 5]),
        (risk, 'middle', [3, 5, 7]),
        (risk, 'high', [5, 10, 10]),
    )
    for variable, term, corners in term_specs:
        variable[term] = fuzz.trimf(variable.universe, corners)

    # Rule table: (income, debt_to_income, credit_score) -> risk.
    rule_table = (
        ('high', 'low', 'high', 'low'),
        ('low', 'high', 'low', 'high'),
        ('middle', 'middle', 'middle', 'middle'),
        ('high', 'high', 'high', 'low'),
        ('low', 'low', 'high', 'low'),
        ('middle', 'high', 'low', 'high'),
        ('low', 'middle', 'middle', 'middle'),
        ('high', 'middle', 'middle', 'low'),
    )
    rules = [
        ctrl.Rule(
            income[inc] & debt_to_income[dti] & credit_score[score],
            risk[level],
        )
        for inc, dti, score, level in rule_table
    ]

    return ctrl.ControlSystemSimulation(ctrl.ControlSystem(rules))

# Function to generate synthetic data with a default value if no rules are activated
def generate_synthetic_data(fuzzy_system, n_samples=100, default_risk_value=5.0):
    """Sample random applicants and label them with the fuzzy system's risk.

    Args:
        fuzzy_system: Object exposing ``input`` (item assignment), ``compute()``
            and ``output`` (with ``.get``), e.g. a
            ``ctrl.ControlSystemSimulation`` with 'income', 'debt_to_income'
            and 'credit_score' inputs and a 'risk' output.
        n_samples: Number of rows to generate.
        default_risk_value: Risk recorded when the system yields no 'risk'
            output for a sample.

    Returns:
        ``pandas.DataFrame`` with columns 'income', 'debt_to_income',
        'credit_score' and 'risk', one row per sample.
    """
    # A private, fixed-seed generator keeps the draws reproducible (same
    # stream as np.random.seed(42) + np.random.randint) without reseeding the
    # process-wide NumPy RNG as a side effect.
    rng = np.random.RandomState(42)

    samples = []
    for _ in range(n_samples):
        income_val = rng.randint(0, 100001)
        dti_val = rng.randint(0, 101)
        credit_score_val = rng.randint(300, 851)

        # Set inputs to the fuzzy system
        fuzzy_system.input['income'] = income_val
        fuzzy_system.input['debt_to_income'] = dti_val
        fuzzy_system.input['credit_score'] = credit_score_val

        try:
            fuzzy_system.compute()
        except (KeyError, ValueError):
            # scikit-fuzzy may signal "no rule fired" by raising from
            # compute(); some releases use ValueError ("Crisp output cannot
            # be calculated, likely because the system is too sparse").
            # Both are treated as "use the default" rather than aborting.
            print(f"No rule activated for income={income_val}, debt_to_income={dti_val}, credit_score={credit_score_val}. Using default risk value.")
            risk_val = default_risk_value
        else:
            # compute() can also succeed without publishing a 'risk' entry.
            risk_val = fuzzy_system.output.get('risk', default_risk_value)

        samples.append([income_val, dti_val, credit_score_val, risk_val])

    return pd.DataFrame(samples, columns=['income', 'debt_to_income', 'credit_score', 'risk'])

# Build one simulation per rule base, then label the same number of random
# samples with each of them, using an identical fallback risk value.
fuzzy_system_1, fuzzy_system_2 = create_fuzzy_system_1(), create_fuzzy_system_2()

sampling_options = {'n_samples': 100, 'default_risk_value': 5.0}
data1 = generate_synthetic_data(fuzzy_system_1, **sampling_options)
data2 = generate_synthetic_data(fuzzy_system_2, **sampling_options)

# Train and evaluate models on data from each fuzzy system
def train_and_evaluate(data):
    """Fit a Random Forest on one synthetic dataset and score it on a hold-out.

    Args:
        data: DataFrame with 'income', 'debt_to_income', 'credit_score'
            feature columns and a 'risk' target column.

    Returns:
        Tuple ``(mse, r2)`` computed on the 30% test split.
    """
    feature_columns = ['income', 'debt_to_income', 'credit_score']
    features, target = data[feature_columns], data['risk']

    # Fixed random_state values keep both the split and the forest repeatable.
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.3, random_state=42)

    model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
    predictions = model.predict(X_test)

    return mean_squared_error(y_test, predictions), r2_score(y_test, predictions)

# Fit and score one Random Forest per synthetic dataset (System 1 first).
(mse1, r2_1), (mse2, r2_2) = train_and_evaluate(data1), train_and_evaluate(data2)

# One row per fuzzy system, one column per metric.
evaluation_results = pd.DataFrame(
    [
        ["System 1", mse1, r2_1],
        ["System 2", mse2, r2_2],
    ],
    columns=["Fuzzy System", "Mean Squared Error (MSE)", "R-squared (R2)"],
)

# Print the metric table, then a preview of each system's synthetic samples.
print("\n### Evaluation Results ###")
print(evaluation_results)

for heading, frame in (
    ("\n### Synthetic Data from Fuzzy Logic System 1 ###", data1),
    ("\n### Synthetic Data from Fuzzy Logic System 2 ###", data2),
):
    print(heading)
    print(frame.head())



### Evaluation Results ###
  Fuzzy System  Mean Squared Error (MSE)  R-squared (R2)
0     System 1                  1.203119        0.783519
1     System 2                  0.531786        0.876308

### Synthetic Data from Fuzzy Logic System 1 ###
   income  debt_to_income  credit_score      risk
0   15795              92           570  7.622807
1   76820              82           514  5.000000
2   37194              74           387  7.804477
3   44131              23           430  7.546576
4   67221              52           643  2.136193

### Synthetic Data from Fuzzy Logic System 2 ###
   income  debt_to_income  credit_score      risk
0   15795              92           570  7.622807
1   76820              82           514  7.630028
2   37194              74           387  8.120495
3   44131              23           430  7.546576
4   67221              52           643  3.113306


# Analysis of Fuzzy Logic System Performance and Output

## Overview

This analysis compares two fuzzy logic systems, `System 1` and `System 2`, based on their performance metrics and synthetic data output. The following metrics were used to assess each system:

1. **Mean Squared Error (MSE)**: Measures the average squared difference between predicted and actual values. A lower MSE indicates that predictions are closer to the actual values.
2. **R-squared (R²)**: Indicates the proportion of variance in the target variable explained by the model. Values closer to 1 represent a better fit.

## Evaluation Results

### Mean Squared Error (MSE)
- **System 1 MSE**: `1.203119`
- **System 2 MSE**: `0.531786`

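`System 2` has a substantially lower MSE than `System 1` (about 0.53 versus 1.20). Because each Random Forest is trained and tested on data labelled by the corresponding fuzzy system, this means the risk surface produced by `System 2` is easier for the model to reproduce on held-out samples; on its own it does not show that `System 2` assigns more correct risk scores.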

### R-squared (R²)
- **System 1 R²**: `0.783519`
- **System 2 R²**: `0.876308`

R² values closer to 1 indicate a stronger fit to the data. The Random Forest trained on `System 2` output explains approximately 87.6% of the variance in that system's held-out risk scores, while the one trained on `System 1` output explains about 78.4%. The model therefore fits the `System 2` data better than the `System 1` data.

**Conclusion**: The Random Forest achieves a better MSE and R² on `System 2` data than on `System 1` data, indicating that the input-to-risk relationship defined by `System 2` is easier to learn from the 70 training samples.

## Sample Output: Synthetic Data from Each System

The following is a snapshot of the synthetic data generated by each system, showing `income`, `debt_to_income`, `credit_score`, and the calculated `risk` value:

- **System 1** produces more generalized risk values, as it lacks the `middle` membership functions.
- **System 2** includes `middle` membership functions, resulting in a more nuanced range of risk values due to additional rules and flexibility.

### Examples of Differences in `risk` Values

1. **For income=15795, debt_to_income=92, credit_score=570**:
   - Both systems yield the same `risk` value of `7.62`.

2. **For income=76820, debt_to_income=82, credit_score=514**:
   - `System 1` assigns a `risk` of `5.0` (the default value when no specific rules are activated).
   - `System 2` assigns a `risk` of `7.63`, indicating that it applies additional rules due to the `middle` membership functions.

3. **For income=67221, debt_to_income=52, credit_score=643**:
   - `System 1` assigns a `risk` of `2.14`, while `System 2` assigns a `risk` of `3.11`, demonstrating again that `System 2` produces a more refined risk calculation.

## Explanation of System Differences

`System 2` includes `middle` membership functions for each variable (`income`, `debt_to_income`, and `credit_score`). This addition allows `System 2` to cover more cases and calculate a wider range of `risk` values. By incorporating these intermediate membership functions, `System 2` falls back to the default risk value (`5.0`) less often than `System 1`, which improves the precision and flexibility of its risk predictions.

This flexibility in `System 2` is likely a major factor contributing to its lower MSE and higher R² scores, as it better captures the nuances of the synthetic data distribution.

## Role of the Random Forest Model

In this analysis, the **Random Forest model** acts as a surrogate learner: for each fuzzy logic system it is trained on 70% of the synthetic samples generated by that system and evaluated on the remaining 30% using MSE and R². The fuzzy system's own output is the target, so there are no independent "true" risk scores here; the metrics show how well each system's input-to-risk mapping can be learned from the samples, not how accurate the fuzzy system itself is.

By comparing the MSE and R² values for each system, the Random Forest model helps us determine which fuzzy system (`System 1` or `System 2`) produces risk scores that are more regular and predictable from the three inputs. Judging which system is more effective at assessing risk would additionally require labelled outcomes (for example, observed defaults) to compare against. This approach leverages the ensemble learning capabilities of Random Forest to provide a consistent benchmark for comparing how learnable the outputs of different fuzzy systems are.