In [55]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [56]:
# ------------------------- Helper functions -------------------------
def plurality_value(examples, target):
    counts = examples[target].value_counts()
    return counts.idxmax()

def entropy(examples, target):
    values, counts = np.unique(examples[target], return_counts=True)
    probs = counts / counts.sum()
    return -np.sum(probs * np.log2(probs + 1e-9))

def remainder(examples, attr, target, threshold=None):
    if threshold is None:
        # –ê–Ω–≥–∏–ª–∞–ª—Ç–∞–π “Ø–µ–¥
        values, counts = np.unique(examples[attr], return_counts=True)
        total = len(examples)
        rem = 0.0
        for v, c in zip(values, counts):
            subset = examples[examples[attr] == v]
            rem += (c / total) * entropy(subset, target)
        return rem
    else:
        # –¢–æ–æ–Ω —É—Ç–≥–∞—Ç–∞–π “Ø–µ–¥
        left = examples[examples[attr] <= threshold]
        right = examples[examples[attr] > threshold]
        total = len(examples)
        rem = (len(left)/total) * entropy(left, target) + (len(right)/total) * entropy(right, target)
        return rem

def information_gain(examples, attr, target):
    base_entropy = entropy(examples, target)
    if np.issubdtype(examples[attr].dtype, np.number):
        # Continuous numeric “Ø–µ–¥ —Ö–∞–º–≥–∏–π–Ω —Å–∞–π–Ω threshold —É—Ç–≥—ã–≥ –æ–ª–Ω–æ
        values = sorted(examples[attr].unique())
        best_gain, best_threshold = -1, None
        for i in range(len(values) - 1):
            threshold = (values[i] + values[i + 1]) / 2
            gain = base_entropy - remainder(examples, attr, target, threshold)
            if gain > best_gain:
                best_gain, best_threshold = gain, threshold
        return best_gain, best_threshold
    else:
        gain = base_entropy - remainder(examples, attr, target)
        return gain, None



In [57]:
# ------------------------- Model -------------------------
def decision_tree_learning(examples, attributes, target, parent_examples=None):
    if len(examples) == 0:
        return plurality_value(parent_examples, target)
    if len(np.unique(examples[target])) == 1:
        return np.unique(examples[target])[0]
    if len(attributes) == 0:
        return plurality_value(examples, target)
    
    gains, thresholds = {}, {}
    for attr in attributes:
        gain, threshold = information_gain(examples, attr, target)
        gains[attr] = gain
        thresholds[attr] = threshold

    best_attr = max(gains, key=gains.get)
    best_threshold = thresholds[best_attr]
    tree = {best_attr: {}}
    remaining_attrs = [a for a in attributes if a != best_attr]

    if best_threshold is not None:
        left = examples[examples[best_attr] <= best_threshold]
        right = examples[examples[best_attr] > best_threshold]
        tree[best_attr][f'<= {round(best_threshold, 2)}'] = decision_tree_learning(left, remaining_attrs, target, examples)
        tree[best_attr][f'> {round(best_threshold, 2)}'] = decision_tree_learning(right, remaining_attrs, target, examples)
    else:
        for v in np.unique(examples[best_attr]):
            exs = examples[examples[best_attr] == v]
            subtree = decision_tree_learning(exs, remaining_attrs, target, examples)
            tree[best_attr][v] = subtree
    return tree

def predict(tree, sample):
    if not isinstance(tree, dict):
        return tree

    attr = next(iter(tree))
    value = sample[attr]

    for condition, branch in tree[attr].items():
        if not isinstance(branch, dict):
            if isinstance(condition, str) and condition.startswith('<='):
                threshold = float(condition.split('<= ')[1])
                if value <= threshold:
                    return branch
            elif isinstance(condition, str) and condition.startswith('>'):
                threshold = float(condition.split('> ')[1])
                if value > threshold:
                    return branch
            elif str(value) == str(condition):
                return branch
        else:
            if isinstance(condition, str) and condition.startswith('<='):
                threshold = float(condition.split('<= ')[1])
                if value <= threshold:
                    return predict(branch, sample)
            elif isinstance(condition, str) and condition.startswith('>'):
                threshold = float(condition.split('> ')[1])
                if value > threshold:
                    return predict(branch, sample)
            elif str(value) == str(condition):
                return predict(branch, sample)

    return None


def print_tree(tree, indent=""):
    if not isinstance(tree, dict):
        print(indent + "‚Üí " + str(tree))
        return
    for attr, branches in tree.items():
        for value, subtree in branches.items():
            print(f"{indent}[{attr} = {value}]")
            print_tree(subtree, indent + "   ")


In [58]:
# ------------------------- Main -------------------------
df = pd.read_csv("./data/loan_train.csv")
df.columns = df.columns.str.strip() 

# –ê–Ω–≥–∏–ª–∞–ª—Ç–∞–π –±–æ–ª–æ–Ω —Ç–æ–æ–Ω —É—Ç–≥–∞—Ç–∞–π –±–∞–≥–∞–Ω—É—É–¥—ã–Ω –∞–Ω–≥–∏–ª–∞–ª
categorical = ['Gender', 'Married', 'Education', 'Self_Employed', 'Area', 'Status']
numeric_cols = ['Applicant_Income', 'Coapplicant_Income', 'Loan_Amount', 'Term', 'Credit_History', 'Dependents']

# Category column to string
for col in categorical:
    if col in df.columns:
        df[col] = df[col].astype(str).str.strip()

# Numeric column to float
for col in numeric_cols:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')
        # –£—Ç–≥–∞ –±–∞–π—Ö–≥“Ø–π –±–∞–π–≤–∞–ª 0 –±–æ–ª–≥–æ–Ω–æ  
        df[col] = df[col].fillna(0)                        


In [59]:
# Print cleaned data types
print("\nData types after cleaning:")
print(df.dtypes)


Data types after cleaning:
Gender                 object
Married                object
Dependents            float64
Education              object
Self_Employed          object
Applicant_Income        int64
Coapplicant_Income    float64
Loan_Amount             int64
Term                  float64
Credit_History        float64
Area                   object
Status                 object
dtype: object


In [60]:
# Print sample
print("\nUnique sample values:")
for c in df.columns:
    print(f"{c}: {df[c].unique()[:10]}") 


Unique sample values:
Gender: ['Male' 'Female' 'nan']
Married: ['No' 'Yes' 'nan']
Dependents: [0. 1. 2.]
Education: ['Graduate' 'Not Graduate']
Self_Employed: ['No' 'Yes' 'nan']
Applicant_Income: [ 584900  458300  300000  258300  600000  541700  233300  303600  400600
 1284100]
Coapplicant_Income: [      0.  150800.  235800.  419600.  151600.  250400.  152600. 1096800.
   70000.  184000.]
Loan_Amount: [15000000 12800000  6600000 12000000 14100000 26700000  9500000 15800000
 16800000 34900000]
Term: [360. 120. 240.   0. 180.  60. 300. 480.  36.  84.]
Credit_History: [1. 0.]
Area: ['Urban' 'Rural' 'Semiurban']
Status: ['Y' 'N']


In [69]:
dftrain, dftest = train_test_split(df, test_size=0.3, random_state=63)

# Train model
target = 'Status'
attributes = [c for c in df.columns if c != target]

tree = decision_tree_learning(dftrain, attributes, target)


In [70]:
# Evaluation
correct = 0
total = len(dftest)
predictions = []

total_yes = 0
total_no = 0
correct_yes = 0
correct_no = 0

for _, row in dftest.iterrows():
    sample = row.to_dict()
    predicted = predict(tree, sample)
    actual = sample[target]

    predictions.append((predicted, actual))
    
    if actual == 'Y':
        total_yes += 1
    else:
        total_no += 1

    if predicted == actual:
        correct += 1
        if actual == 'Y':
            correct_yes += 1
        else:
            correct_no += 1

accuracy = correct / total if total > 0 else 0
error_rate = (total - correct) / total if total > 0 else 0

print(f"\n‚úÖ Accuracy: {accuracy * 100:.2f}% ({correct}/{total})")
print(f"‚ùå Error rate: {error_rate * 100:.2f}% ({total - correct}/{total})")

# Confusion matrix
print("\n---------------- Confusion Matrix ----------------")
print(f"                    Classified as:")
print(f"                 |   N   |   Y   |")
print(f"Correct label N  |  {correct_no:3d}  |  {total_no - correct_no:3d}  |")
print(f"Correct label Y  |  {total_yes - correct_yes:3d}  |  {correct_yes:3d}  |")

precision = correct_yes / (correct_yes + (total_no - correct_no)) if (correct_yes + (total_no - correct_no)) > 0 else 0
recall = correct_yes / total_yes if total_yes > 0 else 0
f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

print(f"\nüîé False alarm rate: {(total_no - correct_no) * 100 / total_no if total_no > 0 else 0:.2f}%")
print(f"üö´ Missed detection rate: {(total_yes - correct_yes) * 100 / total_yes if total_yes > 0 else 0:.2f}%")
print(f"üéØ Precision: {precision * 100:.2f}%")
print(f"üéØ Recall: {recall * 100:.2f}%")
print(f"üéØ F1 score: {f1 * 100:.2f}%")



‚úÖ Accuracy: 68.65% (127/185)
‚ùå Error rate: 31.35% (58/185)

---------------- Confusion Matrix ----------------
                    Classified as:
                 |   N   |   Y   |
Correct label N  |   25  |   27  |
Correct label Y  |   31  |  102  |

üîé False alarm rate: 51.92%
üö´ Missed detection rate: 23.31%
üéØ Precision: 79.07%
üéØ Recall: 76.69%
üéØ F1 score: 77.86%


In [62]:

# Print the tree
print("\n------------------------- Loan Decision Tree -------------------------")
print_tree(tree)


------------------------- Loan Decision Tree -------------------------
[Credit_History = <= 0.5]
   [Coapplicant_Income = <= 396650.0]
      [Applicant_Income = <= 234600.0]
         ‚Üí N
      [Applicant_Income = > 234600.0]
         [Loan_Amount = <= 18650000.0]
            [Self_Employed = No]
               [Term = <= 90.0]
                  ‚Üí N
               [Term = > 90.0]
                  [Gender = Female]
                     [Dependents = <= 0.5]
                        [Education = Graduate]
                           [Married = No]
                              [Area = Semiurban]
                                 ‚Üí N
                              [Area = Urban]
                                 ‚Üí N
                           [Married = Yes]
                              [Area = Semiurban]
                                 ‚Üí Y
                              [Area = Urban]
                                 ‚Üí N
                        [Education = Not Graduate]
       