In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Read the raw loan records from disk.
df = pd.read_csv("base.csv")

# The 'LoanApproved' column is the prediction target; every other column is
# treated as a feature.
y = df['LoanApproved']
X = df.drop(columns=['LoanApproved'])

# Replace each object-dtype feature column with integer codes.
# A single LabelEncoder instance is re-fitted on every column, so once the
# loop finishes it only holds the classes of the last column it processed.
label_encoder = LabelEncoder()
categorical_columns = X.select_dtypes(include=['object']).columns
for col in categorical_columns:
    encoded_values = label_encoder.fit_transform(X[col])
    X[col] = encoded_values

# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Depth limit for the decision tree.
# The comment that previously sat on this step said "max depth = 5" while the
# code passed max_depth=1. The value the code actually used (1) is kept here so
# the fitted model and the reported accuracies do not change.
# NOTE(review): confirm which depth was intended and update MAX_DEPTH if needed.
MAX_DEPTH = 1

# Initialize and train the Decision Tree model with a fixed random_state so
# repeated runs build the same tree.
clf = DecisionTreeClassifier(random_state=42, max_depth=MAX_DEPTH)
clf.fit(X_train, y_train)

# Make predictions on both training and validation sets
y_pred_train = clf.predict(X_train)
y_pred_val = clf.predict(X_val)

# Calculate accuracy (fraction of exactly matching labels) for both sets
train_accuracy = accuracy_score(y_train, y_pred_train)
val_accuracy = accuracy_score(y_val, y_pred_val)

# Print the accuracies as percentages with two decimals
print(f"Training Accuracy: {train_accuracy * 100:.2f}%")
print(f"Validation Accuracy: {val_accuracy * 100:.2f}%")

# Build the export frames WITHOUT modifying X_train / X_val.
# Writing a new column straight into the frames returned by train_test_split
# can raise pandas' SettingWithCopyWarning, and it would leave a column derived
# from the target inside the feature matrices (re-fitting on X_train afterwards
# would then leak the label). DataFrame.assign returns a new frame instead.
#
# 'CorrectPrediction' is 1 where the model's prediction equals the true label
# and 0 otherwise; 'LoanApproved' is the true target, added back last so the
# column order is: features, CorrectPrediction, LoanApproved.
train_export = X_train.assign(
    CorrectPrediction=(y_train == y_pred_train).astype(int),
    LoanApproved=y_train,
)
val_export = X_val.assign(
    CorrectPrediction=(y_val == y_pred_val).astype(int),
    LoanApproved=y_val,
)

# Combine the training and validation rows (training rows first)
combined_df = pd.concat([train_export, val_export])

# Save the dataframe to a CSV file. index=False drops the row labels, so the
# file lists rows in train-then-validation order rather than source order.
combined_df.to_csv("final.csv", index=False)

Training Accuracy: 77.51%
Validation Accuracy: 78.30%
