In [14]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# file path
file_path = "financial_risk_assessment.csv"

# read csv file
df = pd.read_csv(file_path)

df.head()

Unnamed: 0,Age,Gender,Education Level,Marital Status,Income,Credit Score,Loan Amount,Loan Purpose,Employment Status,Years at Current Job,Payment History,Debt-to-Income Ratio,Assets Value,Number of Dependents,City,State,Country,Previous Defaults,Marital Status Change,Risk Rating
0,49,Male,PhD,Divorced,72799.0,688.0,45713.0,Business,Unemployed,19,Poor,0.154313,120228.0,0.0,Port Elizabeth,AS,Cyprus,2.0,2,Low
1,57,Female,Bachelor's,Widowed,,690.0,33835.0,Auto,Employed,6,Fair,0.14892,55849.0,0.0,North Catherine,OH,Turkmenistan,3.0,2,Medium
2,21,Non-binary,Master's,Single,55687.0,600.0,36623.0,Home,Employed,8,Fair,0.362398,180700.0,3.0,South Scott,OK,Luxembourg,3.0,2,Medium
3,59,Male,Bachelor's,Single,26508.0,622.0,26541.0,Personal,Unemployed,2,Excellent,0.454964,157319.0,3.0,Robinhaven,PR,Uganda,4.0,2,Medium
4,25,Non-binary,Bachelor's,Widowed,49427.0,766.0,36528.0,Personal,Unemployed,10,Fair,0.143242,287140.0,,New Heather,IL,Namibia,3.0,1,Low


# **Part 1: Implementing Decision Tree 🌳**

In [15]:
# Display general information about the dataset
print("🔎 Dataset Information:")
df.info()

# Show number of missing values in each column
print("\n❗ Missing Values per Column:")
print(df.isnull().sum())

# Identify categorical columns
categorical_cols = df.select_dtypes(include="object").columns
print("\n🧩 Categorical Columns:")
print(categorical_cols)

# Show missing values only in categorical columnso
print("\n📋 Missing Values in Categorical Columns:")
print(df[categorical_cols].isnull().sum())

🔎 Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Age                    15000 non-null  int64  
 1   Gender                 15000 non-null  object 
 2   Education Level        15000 non-null  object 
 3   Marital Status         15000 non-null  object 
 4   Income                 12750 non-null  float64
 5   Credit Score           12750 non-null  float64
 6   Loan Amount            12750 non-null  float64
 7   Loan Purpose           15000 non-null  object 
 8   Employment Status      15000 non-null  object 
 9   Years at Current Job   15000 non-null  int64  
 10  Payment History        15000 non-null  object 
 11  Debt-to-Income Ratio   15000 non-null  float64
 12  Assets Value           12750 non-null  float64
 13  Number of Dependents   12750 non-null  float64
 14  City                   15000 no

## **🤔 Handling Missing Values**

In this dataset, some numerical columns like *Income*, *Credit Score*, *Loan Amount*, *Assets Value*, *Number of Dependents*, and *Previous Defaults* had missing values. To fix this and make the data ready for training, we filled the missing values using the **median** of each column.

We chose the median instead of the average (mean) because the median is less affected by very high or low values (outliers), which are common in financial data. This method helps us keep the data more stable and prevents our decision tree model from learning wrong patterns due to missing values.

We also checked the categorical columns for missing values (such as *Gender*, *Education Level*, *Marital Status*, *Loan Purpose*, *Employment Status*, etc.), and confirmed that there were **no missing values** in these columns. Therefore, no additional imputation was needed for categorical data.

In [16]:
# List of numerical columns with missing values
numeric_missing_cols = [
    "Income", "Credit Score", "Loan Amount",
    "Assets Value", "Number of Dependents",
    "Previous Defaults"
]

# Fill missing values in numeric columns using median value
for col in numeric_missing_cols:
    median_val = df[col].median()
    df[col] = df[col].fillna(median_val)

# Verify that all missing values have been filled
print("\nMissing values after imputation:")
df.isnull().sum()


Missing values after imputation:


Age                      0
Gender                   0
Education Level          0
Marital Status           0
Income                   0
Credit Score             0
Loan Amount              0
Loan Purpose             0
Employment Status        0
Years at Current Job     0
Payment History          0
Debt-to-Income Ratio     0
Assets Value             0
Number of Dependents     0
City                     0
State                    0
Country                  0
Previous Defaults        0
Marital Status Change    0
Risk Rating              0
dtype: int64

### **🔠 Encoding Categorical Features**

To prepare the dataset for the ID3 decision tree algorithm, we manually encoded the categorical features into numerical values. This is important because the algorithm requires numerical comparisons to calculate entropy and information gain.

Each category in features like *Gender*, *Education Level*, *Marital Status*, *Loan Purpose*, *Employment Status*, and *Payment History* is mapped to a unique integer. Since we are not allowed to use external encoding libraries (e.g., `sklearn.preprocessing.LabelEncoder`), all encodings are done manually.

In [17]:
# Gender
gender_map = {"Male": 0, "Female": 1, "Non-binary": 2}

# Education Level
education_map = {"High School": 0, "Bachelor's": 1, "Master's": 2, "PhD": 3}

# Marital Status
marital_map = {"Single": 0, "Married": 1, "Divorced": 2, "Widowed": 3}

# Loan Purpose
loan_purpose_map = {"Home": 0, "Auto": 1, "Personal": 2, "Business": 3}

# Employment Status
employment_map = {"Unemployed": 0, "Employed": 1, "Self-employed": 2}

# Payment History
payment_map = {"Poor": 0, "Fair": 1, "Good": 2, "Excellent": 3}

# Risk Rating (Target Variable – optional for printing, not encoded yet)
risk_rating_map = {"Low": 0, "Medium": 1, "High": 2}

### **🧹 Dropping Certain Columns Before Splitting the Data**
Before splitting the dataset into training, validation, and test sets, we removed some columns from the feature set `X`:

- **`Risk Rating`**: This is the original target column in string format. Since we already encoded it as `Risk Rating Encoded`, it is no longer needed in the input features.
- **`Risk Rating Encoded`**: This is our label (`y`), and should not be included among the input features. Including it would lead to data leakage.
- **`City`, `State`, and `Country`**: These columns contain too many unique values, which can make the model too complex and prone to overfitting. Additionally, they may not provide meaningful splits for a decision tree algorithm like ID3.

By dropping these columns, we ensure that the input features are clean, relevant, and suitable for training the decision tree without leaking any label information or introducing noise.

In [18]:
from sklearn.model_selection import train_test_split

# Encode the target column on-the-fly
y = df["Risk Rating"].map(risk_rating_map)

# Drop non-feature columns
X = df.drop(columns=["Risk Rating", "City", "State", "Country"])


# 85% train+validation, 15% test
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42, stratify=y
)

# %15 validation için train_val'ı tekrar bölelim
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.1765, random_state=42, stratify=y_train_val
)
# (0.1765 ≈ 15 / 85, böylece %15 validation, %70 train elde edilir)

# Sonuçları kontrol et
print("Train set size:", X_train.shape)
print("Validation set size:", X_val.shape)
print("Test set size:", X_test.shape)

Train set size: (10499, 16)
Validation set size: (2251, 16)
Test set size: (2250, 16)


In [None]:
# Calculate entropy of label array y
def entropy(y):
    classes, counts = np.unique(y, return_counts=True)
    probabilities = counts / counts.sum()
    return -np.sum(probabilities * np.log2(probabilities + 1e-9))  # epsilon for safety

# Calculate information gain of a feature in the dataset
def information_gain(data, feature, target):
    total_entropy = entropy(data[target])
    values, counts = np.unique(data[feature], return_counts=True)
    weighted_entropy = 0
    
    for val, count in zip(values, counts):
        subset = data[data[feature] == val]
        subset_entropy = entropy(subset[target])
        weighted_entropy += (count / len(data)) * subset_entropy
        
    return total_entropy - weighted_entropy

In [None]:
# Find the best feature to split the data
def best_feature_to_split(data, features, target):
    """Find the best feature with highest information gain"""
    gains = [information_gain(data, feature, target) for feature in features]
    return features[np.argmax(gains)]

In [21]:
def id3(data, features, target, depth=0, max_depth=None):
    """Recursive ID3 algorithm"""
    
    # Eğer tüm etiketler aynıysa -> yaprak düğüm
    if len(np.unique(data[target])) == 1:
        return np.unique(data[target])[0]
    
    # Eğer feature kalmadıysa -> en sık görülen sınıf
    if len(features) == 0:
        return data[target].mode()[0]
    
    # Maksimum derinliğe ulaşıldıysa -> çoğunluk sınıfı
    if max_depth is not None and depth >= max_depth:
        return data[target].mode()[0]

    # En iyi feature'ı bul
    best_feature = best_feature_to_split(data, features, target)
    
    tree = {best_feature: {}}
    
    # Her unique değer için dal oluştur
    for value in np.unique(data[best_feature]):
        sub_data = data[data[best_feature] == value]
        if sub_data.empty:
            tree[best_feature][value] = data[target].mode()[0]
        else:
            # Recursive call
            remaining_features = [f for f in features if f != best_feature]
            subtree = id3(sub_data, remaining_features, target, depth+1, max_depth)
            tree[best_feature][value] = subtree
    
    return tree

In [22]:
def predict_single(tree, sample):
    """Predict the class label for a single sample using the decision tree"""
    if not isinstance(tree, dict):
        return tree  # reached a leaf node
    
    feature = next(iter(tree))
    feature_value = sample[feature]
    
    if feature_value in tree[feature]:
        return predict_single(tree[feature][feature_value], sample)
    else:
        # If unseen feature value during training → majority class fallback
        return -1  # or mode of training labels

In [23]:
def predict(tree, X):
    """Predict class labels for a dataset"""
    return X.apply(lambda row: predict_single(tree, row), axis=1)

In [24]:
def accuracy(y_true, y_pred):
    return np.mean(y_true == y_pred)

In [25]:
features = list(X_train.columns)
target = "Risk Rating Encoded"

# Combine X_train and y_train into a single DataFrame
train_data = X_train.copy()
train_data[target] = y_train

# Build the decision tree
tree = id3(train_data, features, target, max_depth=50)  # You can change max_depth

In [26]:
# Tahminleri yap
y_pred = predict(tree, X_test)

# -1 varsa onları çıkaralım (bilinmeyen dal değerleri olabilir)
valid_idx = y_pred != -1
test_acc = accuracy(y_test[valid_idx], y_pred[valid_idx])

print(f"🌟 Test Accuracy: {test_acc:.4f}")

🌟 Test Accuracy: nan
