In [None]:
# Importing necessary libraries
import pandas as pd

# Loading the datasets
train_path = '/data/loan_train.csv'
test_path = '/data/loan_test.csv'

train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)

# Keep the first few rows of each dataset so their structure can be inspected
train_data_head = train_data.head()
test_data_head = test_data.head()

# DataFrame.info() prints its summary and returns None, so it is called only
# for the printed output; binding its result would just store None.
train_data.info()
test_data.info()

# Last expression of the cell: show both previews
(train_data_head, test_data_head)


In [None]:
# Count the null entries per column, once for the train frame and once for the test frame
train_missing, test_missing = (
    frame.isna().sum() for frame in (train_data, test_data)
)

# Cell output: the two per-column null counts
(train_missing, test_missing)


In [None]:
# Filling missing values in both train and test datasets
#
# Every fill value is computed from the training data only and then applied to
# both frames, so the test set is imputed with the same statistics the models
# are trained on.
#
# Columns are assigned back instead of calling `fillna(..., inplace=True)` on a
# column selection: that form acts on an intermediate object, triggers a
# chained-assignment warning in pandas 2.x, and does not update the frame when
# Copy-on-Write is enabled.

# Categorical features, Loan_Amount_Term and Credit_History -> most frequent training value
mode_filled_cols = [
    'Gender', 'Married', 'Dependents', 'Self_Employed',
    'Loan_Amount_Term', 'Credit_History',
]
fill_values = {col: train_data[col].mode()[0] for col in mode_filled_cols}

# LoanAmount -> training median
fill_values['LoanAmount'] = train_data['LoanAmount'].median()

for col, fill_value in fill_values.items():
    train_data[col] = train_data[col].fillna(fill_value)
    test_data[col] = test_data[col].fillna(fill_value)

# Confirming that missing values have been handled
train_missing_after = train_data.isnull().sum()
test_missing_after = test_data.isnull().sum()

train_missing_after, test_missing_after


In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Encoding categorical variables
#
# Each categorical feature gets its own LabelEncoder, fitted on the training
# column and reused to transform the matching test column. The fitted encoders
# are kept in `feature_encoders` so any column can be decoded later.
categorical_cols = ['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area']
feature_encoders = {}

for col in categorical_cols:
    encoder = LabelEncoder()
    train_data[col] = encoder.fit_transform(train_data[col])
    test_data[col] = encoder.transform(test_data[col])
    feature_encoders[col] = encoder

# Encoding target variable in training data.
# `label_encoder` stays fitted on Loan_Status: the prediction cell calls
# label_encoder.inverse_transform() to turn predictions back into labels.
label_encoder = LabelEncoder()
train_data['Loan_Status'] = label_encoder.fit_transform(train_data['Loan_Status'])

# Convert "Dependents" to numerical (handling "3+" as 3).
# The result is assigned back rather than using replace(..., inplace=True) on a
# column selection, which does not update the frame under pandas Copy-on-Write
# and would leave "3+" in place for astype(int) to fail on.
train_data['Dependents'] = train_data['Dependents'].replace('3+', '3').astype(int)
test_data['Dependents'] = test_data['Dependents'].replace('3+', '3').astype(int)

# Scaling numerical features: the scaler is fitted on the training data and the
# same fitted scaler is applied to the test data.
scaler = StandardScaler()
numerical_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']

train_data[numerical_cols] = scaler.fit_transform(train_data[numerical_cols])
test_data[numerical_cols] = scaler.transform(test_data[numerical_cols])

# Displaying first few rows of the processed training data to confirm changes
train_data.head()


In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Splitting data into features (X) and target (y)
X = train_data.drop(columns=['Loan_ID', 'Loan_Status'])
y = train_data['Loan_Status']

# Hold out 20% of the rows for validation. `stratify=y` keeps the class
# proportions of Loan_Status the same in the training and validation parts, so
# the validation metrics are not skewed by an unlucky split.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Defining models to test.
# Logistic regression gets a larger iteration budget than the default of 100:
# not every feature is standardised (only the columns in `numerical_cols` are),
# and the solver can stop before converging on such inputs.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42)
}

# Training and evaluating each model on the same train/validation split
for model_name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)

    # Make predictions on the validation set
    y_pred = model.predict(X_val)

    # Evaluate the model
    accuracy = accuracy_score(y_val, y_pred)
    report = classification_report(y_val, y_pred)
    matrix = confusion_matrix(y_val, y_pred)

    # Display results
    print(f"Model: {model_name}")
    print(f"Accuracy: {accuracy:.4f}")
    print("Classification Report:\n", report)
    print("Confusion Matrix:\n", matrix)
    print("-" * 40)


In [None]:
from sklearn.model_selection import GridSearchCV

# Search space for the Random Forest: 3 * 4 * 3 * 3 = 108 parameter combinations
param_grid = dict(
    n_estimators=[50, 100, 150],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Base estimator whose hyper-parameters are being tuned
rf = RandomForestClassifier(random_state=42)

# Grid search with 5-fold cross-validation, scored by accuracy, using all cores
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2,
)

# Run the search on the training split only
grid_search.fit(X_train, y_train)

# Winning parameter combination and its mean cross-validation score
best_params, best_score = grid_search.best_params_, grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Cross-Validation Score:", best_score)


In [None]:
# Train Random Forest with the best parameters on the entire training data
final_rf = RandomForestClassifier(**grid_search.best_params_, random_state=42)
final_rf.fit(X, y)

# Build the test feature matrix from exactly the columns the model was fitted
# on, in the same order. Selecting by `X.columns` (instead of only dropping
# Loan_ID) keeps prediction correct even if the test file orders its columns
# differently or carries extra ones.
X_test = test_data[X.columns]
y_test_pred = final_rf.predict(X_test)

# Prepare the output DataFrame.
# `label_encoder` was last fitted on Loan_Status, so inverse_transform maps the
# numeric predictions back to the original Loan_Status labels.
output = test_data[['Loan_ID']].copy()
output['Loan_Status'] = label_encoder.inverse_transform(y_test_pred)

# Save the predictions to a CSV file
output.to_csv('loan_approval_predictions.csv', index=False)
print("Predictions saved to 'loan_approval_predictions.csv'")
