In [2]:
# Import Required Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

# Load Data
# Read the training and test CSVs from the working directory.
train = pd.read_csv('train_ctrUa4K.csv')
test = pd.read_csv('test_lAUu6dG.csv')

# Keep the test identifiers: 'Loan_ID' is dropped during preprocessing but is
# needed again when the submission file is written.
test_loan_ids = test['Loan_ID']

# Tag every row with the frame it came from so the combined frame can be
# split back apart after preprocessing.
for frame, origin in ((train, 'train'), (test, 'test')):
    frame['source'] = origin

# Add a placeholder target column to the test frame so both frames expose the
# same columns before they are stacked.
test['Loan_Status'] = np.nan

# Stack train on top of test with a fresh 0..n-1 index.
data = pd.concat([train, test], ignore_index=True)


# Preprocessing
# Fill missing values.
# The filled column is assigned back instead of calling
# `data[col].fillna(..., inplace=True)`: that chained form operates on an
# intermediate object, raises a pandas FutureWarning, and no longer modifies
# `data` at all from pandas 3.0 onwards.
# NOTE(review): the mode/median below are computed over the train and test
# rows combined, so test-set statistics influence the imputed training values.
for col in ['Gender', 'Married', 'Dependents', 'Self_Employed', 'Credit_History', 'Loan_Amount_Term']:
    # mode() returns a Series; [0] picks its first entry.
    data[col] = data[col].fillna(data[col].mode()[0])

# LoanAmount is imputed with the median rather than the mode.
data['LoanAmount'] = data['LoanAmount'].fillna(data['LoanAmount'].median())

# Label encoding for categorical columns
label_cols = ['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area', 'Dependents']
le = LabelEncoder()
for col in label_cols:
    # fit_transform refits the shared encoder on every column, so after the
    # loop `le` only holds the classes of the last column processed.
    data[col] = le.fit_transform(data[col])

# Encode target: 'Y' -> 1, 'N' -> 0. Values without a mapping (the NaN
# placeholders on test rows) stay NaN.
data['Loan_Status'] = data['Loan_Status'].map({'Y': 1, 'N': 0})

# Derived features: combined household income and its ratio to the loan amount.
data['Total_Income'] = data['ApplicantIncome'] + data['CoapplicantIncome']
data['Income_to_Loan'] = data['Total_Income'] / data['LoanAmount']

# Drop unneeded columns: the identifier and the two raw income columns that
# were folded into Total_Income.
data.drop(['Loan_ID', 'ApplicantIncome', 'CoapplicantIncome'], axis=1, inplace=True)


# Split back into train and test
# Rows are selected by the 'source' tag; the tag itself is then discarded.
# The test frame also loses its placeholder 'Loan_Status' column.
train_data = data.loc[data['source'].eq('train')].drop(columns=['source'])
test_data = data.loc[data['source'].eq('test')].drop(columns=['source', 'Loan_Status'])

# Features are every remaining column except the target.
y = train_data['Loan_Status']
X = train_data.drop(columns='Loan_Status')

# -------------------------------
# Train-Test Split
# -------------------------------
# Hold out 25% of the labelled rows for validation; the fixed seed keeps the
# partition the same from run to run.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# -------------------------------
# Define and Train Models
# -------------------------------
# Candidate classifiers keyed by the display name used in the evaluation
# report. Insertion order is the order in which they are fitted and printed.
models = {
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    # probability=True enables predict_proba on the SVM at extra fitting cost.
    "SVM": SVC(kernel='linear', probability=True),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Gradient Boosting": GradientBoostingClassifier(),
    "AdaBoost": AdaBoostClassifier(),
    # `use_label_encoder` is intentionally not passed: current XGBoost no
    # longer consumes it and only emits a "Parameters ... are not used"
    # warning on every fit.
    "XGBoost": XGBClassifier(eval_metric='logloss')
}

# Evaluation
# Fit every candidate on the training split and report its scores on the
# held-out validation split, one line per model.
print("Model Evaluation:\n")
for name, model in models.items():
    # fit() returns the fitted estimator, so prediction can be chained.
    y_pred = model.fit(X_train, y_train).predict(X_val)

    # Score the same predictions with each metric, in report order.
    acc, prec, rec, f1 = (
        metric(y_val, y_pred)
        for metric in (accuracy_score, precision_score, recall_score, f1_score)
    )
    print(f"{name} - Accuracy: {acc:.4f}, Precision: {prec:.4f}, Recall: {rec:.4f}, F1: {f1:.4f}")

# Final Prediction (using best model)
# The model is picked by hand; refit it on every labelled row (training and
# validation splits together) before scoring the unlabelled test rows.
final_model = models["XGBoost"]  # Change if another model performs better
final_predictions = final_model.fit(X, y).predict(test_data)

# Map predictions back to Y/N
predicted_status = pd.Series(final_predictions).map({1: 'Y', 0: 'N'})

# Pair every saved test Loan_ID with its predicted label; the two Series are
# combined on their index.
final_output = pd.DataFrame(
    {
        'Loan_ID': test_loan_ids,
        'Loan_Status': predicted_status,
    }
)

# Save to CSV (without the index column)
final_output.to_csv('final_submission.csv', index=False)
print("\n✅ Submission file 'final_submission.csv' created successfully.")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['LoanAmount'].fillna(data['LoanAmount'].median(), inplace=True)


Model Evaluation:

Decision Tree - Accuracy: 0.7143, Precision: 0.7692, Recall: 0.8000, F1: 0.7843
Random Forest - Accuracy: 0.7922, Precision: 0.7833, Recall: 0.9400, F1: 0.8545
SVM - Accuracy: 0.7662, Precision: 0.7388, Recall: 0.9900, F1: 0.8462
KNN - Accuracy: 0.5519, Precision: 0.6183, Recall: 0.8100, F1: 0.7013
Gradient Boosting - Accuracy: 0.7727, Precision: 0.7686, Recall: 0.9300, F1: 0.8416
AdaBoost - Accuracy: 0.7597, Precision: 0.7480, Recall: 0.9500, F1: 0.8370
XGBoost - Accuracy: 0.7597, Precision: 0.7838, Recall: 0.8700, F1: 0.8246

✅ Submission file 'final_submission.csv' created successfully.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
