In [16]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib

# Step 1: Load Data
# Download loanpred_test.csv
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1-03DcqzYIYtIAIt188ujy5Wr4GSdlgv5' -O loanpred_test.csv

# Load training and test data
df_train = pd.read_csv("/content/drive/MyDrive/EAFIT/SEGUNDO SEMESTRE/APRENDIZAJE AUTOMÁTICO/COMPETITION 1/train_file.csv")
df_test = pd.read_csv("/content/loanpred_test.csv")

# Step 2: Data Preprocessing
# Define features and target
X = df_train.drop(columns=["Loan_Status", "Loan_ID"])
y = df_train["Loan_Status"]

# Identify numerical and categorical columns
num_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History']
cat_cols = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area']

# Preprocessing pipelines for numerical and categorical data
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformers into a preprocessor
preprocessor = ColumnTransformer(transformers=[
    ('num', num_transformer, num_cols),
    ('cat', cat_transformer, cat_cols)
])

# Step 3: Define Models

# Random Forest
rf_clf = RandomForestClassifier(n_estimators=7, max_depth=7, min_samples_split=7, random_state=7)

# Bagging Classifier (using Decision Trees)
bagging_clf = BaggingClassifier(
    estimator=rf_clf,  # Updated from base_estimator to estimator
    n_estimators=50,
    random_state=42
)

# Gradient Boosting Classifier
gb_clf = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42
)

# Step 4: Build Pipelines for Each Model

# Random Forest Pipeline
rf_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", rf_clf)
])

# Bagging Pipeline
bagging_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", bagging_clf)
])

# Boosting Pipeline
boosting_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", gb_clf)
])

# Step 5: Train and Evaluate Models
models = {
    "Random Forest": rf_pipeline,
    "Bagging": bagging_pipeline,
    "Boosting": boosting_pipeline
}

# Train-test split for validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

for name, pipeline in models.items():
    print(f"\n🚀 Training {name}...")
    pipeline.fit(X_train, y_train)
    y_pred_val = pipeline.predict(X_val)
    print(f"{name} Validation Accuracy: {accuracy_score(y_val, y_pred_val):.4f}")
    print(f"{name} Classification Report:\n{classification_report(y_val, y_pred_val)}")

# Step 6: Generate Predictions on Test Data using the Best Model (e.g., Boosting)
best_pipeline = boosting_pipeline  # You can switch to bagging_pipeline or rf_pipeline

X_test = df_test.drop(columns=["Loan_ID"], errors="ignore")
predictions = best_pipeline.predict(X_test)

# Step 7: Prepare Submission File
submission = pd.DataFrame({
    "Loan_ID": df_test["Loan_ID"],
    "Loan_Status": predictions
})

# Map numerical predictions to labels if necessary
submission["Loan_Status"] = submission["Loan_Status"].map({1: "Y", 0: "N"})

# Save to CSV
submission.to_csv("loan_predictions.csv", index=False)
print("✅ Predictions saved to loan_predictions.csv")



--2025-02-20 02:44:01--  https://docs.google.com/uc?export=download&id=1-03DcqzYIYtIAIt188ujy5Wr4GSdlgv5
Resolving docs.google.com (docs.google.com)... 64.233.189.138, 64.233.189.113, 64.233.189.139, ...
Connecting to docs.google.com (docs.google.com)|64.233.189.138|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://drive.usercontent.google.com/download?id=1-03DcqzYIYtIAIt188ujy5Wr4GSdlgv5&export=download [following]
--2025-02-20 02:44:01--  https://drive.usercontent.google.com/download?id=1-03DcqzYIYtIAIt188ujy5Wr4GSdlgv5&export=download
Resolving drive.usercontent.google.com (drive.usercontent.google.com)... 173.194.174.132, 2404:6800:4008:c1b::84
Connecting to drive.usercontent.google.com (drive.usercontent.google.com)|173.194.174.132|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8358 (8.2K) [application/octet-stream]
Saving to: ‘loanpred_test.csv’


2025-02-20 02:44:03 (48.1 MB/s) - ‘loanpred_test.csv’ saved [8358/