In [2]:
import pandas as pd


train = pd.read_csv('train_ctrUa4K.csv')
train.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [3]:
train.isnull().sum()
train = train.dropna()
train.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [4]:
train['Gender']= train['Gender'].map({'Male':0, 'Female':1})
train['Married']= train['Married'].map({'No':0, 'Yes':1})
train['Loan_Status']= train['Loan_Status'].map({'N':0, 'Y':1})

In [5]:
X = train[['Gender', 'Married', 'ApplicantIncome', 'LoanAmount', 'Credit_History']]
y = train.Loan_Status
X.shape, y.shape

((480, 5), (480,))

In [6]:
from sklearn.model_selection import train_test_split
x_train, x_cv, y_train, y_cv = train_test_split(X,y, test_size = 0.2, random_state = 10)

In [7]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(max_depth=4, random_state = 10)
model.fit(x_train, y_train)

In [8]:
from sklearn.metrics import accuracy_score
pred_cv = model.predict(x_cv)
accuracy_score(y_cv,pred_cv)

0.8020833333333334

In [9]:
pred_train = model.predict(x_train)
accuracy_score(y_train,pred_train)

0.8203125

## Hyperparameter Tuning

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score
import numpy as np

# Define the hyperparameter grid
param_dist = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5],
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy']
}

# Initialize the model
rf = RandomForestClassifier(random_state=42)

# RandomizedSearchCV with 5-fold cross-validation
random_search = RandomizedSearchCV(
    estimator=rf, 
    param_distributions=param_dist, 
    n_iter=20,  # Number of random combinations to try
    cv=5, 
    scoring='accuracy', 
    random_state=42, 
    n_jobs=-1
)

# Fit the random search model
random_search.fit(x_train, y_train)

# Get the best hyperparameters
print("Best Hyperparameters:", random_search.best_params_)

# Evaluate on cross-validation set
best_model = random_search.best_estimator_
pred_cv = best_model.predict(x_cv)
print("Cross-validation Accuracy:", accuracy_score(y_cv, pred_cv))

# Evaluate on training set
pred_train = best_model.predict(x_train)
print("Training Accuracy:", accuracy_score(y_train, pred_train))


Best Hyperparameters: {'n_estimators': 300, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 3, 'criterion': 'gini', 'bootstrap': True}
Cross-validation Accuracy: 0.8020833333333334
Training Accuracy: 0.8098958333333334


## Saving The Best Model

In [11]:
# saving the model
import pickle
pickle_out = open("classifier.pkl", mode = "wb")
pickle.dump(best_model, pickle_out)
pickle_out.close()

Logistic Regression and SVM models

In [10]:
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pandas as pd  # Assuming X and y are pandas DataFrames or Series
import numpy as np #Assuming X and y are numpy arrays

# Assuming X and y are already defined (replace with your actual data)
# Example dummy data (replace with your actual data):
np.random.seed(10)
X = pd.DataFrame(np.random.rand(100, 5))
y = pd.Series(np.random.randint(0, 2, 100))


# Split the data
x_train, x_cv, y_train, y_cv = train_test_split(X, y, test_size=0.2, random_state=10)

# 1. Logistic Regression
logistic_model = LogisticRegression(random_state=10, max_iter=1000)  # Increase max_iter if needed
logistic_model.fit(x_train, y_train)

# 2. SVM (Support Vector Machine)
svm_model = SVC(random_state=10)
svm_model.fit(x_train, y_train)

# 3. Random Forest (already provided)
rf_model = RandomForestClassifier(max_depth=4, random_state=10)
rf_model.fit(x_train, y_train)

# Pickle the models
models = {
    'logistic_regression': logistic_model,
    'svm': svm_model,
    'random_forest': rf_model,
}

for model_name, model in models.items():
    with open(f'{model_name}.pkl', 'wb') as f:
        pickle.dump(model, f)

print("Models trained and pickled successfully.")

Models trained and pickled successfully.
