# In [None]:
# Loading Packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# %matplotlib inline
import warnings
warnings.filterwarnings('ignore')

# Additional imports for new changes
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import VotingClassifier, StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
import joblib  # For saving the model

# Load the training and test sets from disk
train, test = (
    pd.read_csv(csv_path) for csv_path in ('Train_file.csv', 'Test_file.csv')
)

# Keep untouched snapshots of both frames so the raw data is still available
# after the cleaning and feature engineering applied to train/test below
train_original, test_original = train.copy(), test.copy()

# Column names of the training set. This and the following bare expressions
# only display something in an interactive session / notebook cell; they have
# no effect when the file is run as a plain script.
train.columns

# Column names of the test set
test.columns

# Data type of every training column
train.dtypes

# (rows, columns) of the training and of the test set
train.shape, test.shape

# Absolute number of rows per Loan_Status class
train['Loan_Status'].value_counts()

# normalize=True returns proportions instead of absolute counts
train['Loan_Status'].value_counts(normalize=True)

# Bar chart of the absolute class counts (normalize is not used here)
train['Loan_Status'].value_counts().plot.bar()

# Independent variables (categorical): share of each category, 2x2 grid.
# Each entry is (subplot position, column, extra keyword arguments for plot.bar).
categorical_panels = (
    (221, 'Gender', {'figsize': (20, 10)}),
    (222, 'Married', {'figsize': (20, 10)}),
    (223, 'Self_Employed', {}),
    (224, 'Credit_History', {}),
)
plt.figure(1, figsize=(20, 15))
for position, column, extra_kwargs in categorical_panels:
    plt.subplot(position)
    shares = train[column].value_counts(normalize=True)
    shares.plot.bar(title=column, **extra_kwargs)
plt.show()

# Independent variables (ordinal): share of each category, 1x3 grid
ordinal_panels = (
    (131, 'Dependents', {'figsize': (24, 6)}),
    (132, 'Education', {}),
    (133, 'Property_Area', {}),
)
plt.figure(1)
for position, column, extra_kwargs in ordinal_panels:
    plt.subplot(position)
    shares = train[column].value_counts(normalize=True)
    shares.plot.bar(title=column, **extra_kwargs)
plt.show()

def _plot_distribution_and_box(values):
    """Draw the density plot (left) and box plot (right) of *values* on figure 1."""
    plt.figure(1)
    plt.subplot(121)
    sns.distplot(values)
    plt.subplot(122)
    values.plot.box(figsize=(16, 5))
    plt.show()


# Applicant income distribution
_plot_distribution_and_box(train['ApplicantIncome'])

# Applicant income split by education level; the automatic super-title that
# DataFrame.boxplot adds for `by=` is blanked out
train.boxplot(by='Education', column='ApplicantIncome')
plt.suptitle('')

# Co-applicant income distribution
_plot_distribution_and_box(train['CoapplicantIncome'])

# Distribution of LoanAmount variable
plt.figure(1)
plt.subplot(121)
# LoanAmount may still contain missing values here (it is only imputed further
# down), so the density plot is drawn from the NaN-free copy. Previously `df`
# was created but the plot still used the un-cleaned `train` column.
df = train.dropna()
sns.distplot(df['LoanAmount'])
plt.subplot(122)
train['LoanAmount'].plot.box(figsize=(16,5))
plt.show()

def _stacked_share_plot(counts, **plot_kwargs):
    """Plot every row of *counts* as a stacked bar of its row-wise proportions."""
    row_totals = counts.sum(1).astype(float)
    counts.div(row_totals, axis=0).plot(kind='bar', stacked=True, **plot_kwargs)


# Relation between the target variable and each categorical predictor
Gender = pd.crosstab(train['Gender'], train['Loan_Status'])
_stacked_share_plot(Gender, figsize=(4, 4))

# Remaining categorical predictors vs. the target variable
Married, Dependents, Education, Self_Employed = (
    pd.crosstab(train[name], train['Loan_Status'])
    for name in ('Married', 'Dependents', 'Education', 'Self_Employed')
)
for counts, plot_kwargs in (
    (Married, {'figsize': (4, 4)}),
    (Dependents, {}),
    (Education, {'figsize': (4, 4)}),
    (Self_Employed, {'figsize': (4, 4)}),
):
    _stacked_share_plot(counts, **plot_kwargs)
    plt.show()

# Credit history and property area vs. Loan_Status
Credit_History, Property_Area = (
    pd.crosstab(train[name], train['Loan_Status'])
    for name in ('Credit_History', 'Property_Area')
)
for counts, plot_kwargs in (
    (Credit_History, {'figsize': (4, 4)}),
    (Property_Area, {}),
):
    _stacked_share_plot(counts, **plot_kwargs)
    plt.show()

# Mean applicant income of approved vs. not-approved loans
train.groupby('Loan_Status')['ApplicantIncome'].mean().plot.bar()

# NOTE on every pd.cut below: the intervals are right-closed, so without
# include_lowest=True a value equal to the first edge (an amount of exactly 0)
# falls into no bin, becomes NaN and silently disappears from the crosstab.
# include_lowest=True makes the first interval contain its left edge.

# Applicant income, binned
bins = [0,2500,4000,6000,81000]
group = ['Low','Average', 'High','Very high']
train['Income_bin'] = pd.cut(train['ApplicantIncome'], bins, labels=group, include_lowest=True)
Income_bin = pd.crosstab(train['Income_bin'], train['Loan_Status'])
Income_bin.div(Income_bin.sum(1).astype(float), axis=0).plot(kind='bar', stacked= True)
plt.xlabel('ApplicantIncome')
P = plt.ylabel('Percentage')

# Co-applicant income, binned
bins = [0,1000,3000, 42000]
group = ['Low', 'Average', 'High']
train['Coapplicant_Income_bin'] = pd.cut(train['CoapplicantIncome'], bins, labels = group, include_lowest=True)
Coapplicant_Income_bin = pd.crosstab(train['Coapplicant_Income_bin'], train['Loan_Status'])
Coapplicant_Income_bin.div(Coapplicant_Income_bin.sum(1).astype(float), axis=0).plot(kind='bar', stacked=True)
plt.xlabel('CoapplicantIncome')
P = plt.ylabel('Percentage')

# Combined applicant + co-applicant income, binned
train['Total_Income'] = train['ApplicantIncome'] + train['CoapplicantIncome']
bins = [0,2500,4000,6000,81000]
group = ['Low', 'Average', 'High','Very High']
train['Total_Income_bin'] = pd.cut(train['Total_Income'], bins, labels=group, include_lowest=True)
Total_Income_bin = pd.crosstab(train['Total_Income_bin'], train['Loan_Status'])
Total_Income_bin.div(Total_Income_bin.sum(1).astype(float), axis=0).plot(kind='bar', stacked=True)
plt.xlabel('Total_Income')
P = plt.ylabel('Percentage')

# Loan amount, binned
bins = [0,100,200,700]
group = ['Low','Average','High']
train['LoanAmount_bin'] = pd.cut(train['LoanAmount'], bins, labels=group, include_lowest=True)
LoanAmount_bin = pd.crosstab(train['LoanAmount_bin'], train['Loan_Status'])
LoanAmount_bin.div(LoanAmount_bin.sum(1).astype(float), axis=0).plot(kind='bar', stacked=True)
plt.xlabel('LoanAmount')
P = plt.ylabel('Percentage')

# Drop the helper columns that were created only for this exploration
train=train.drop(['Income_bin','Coapplicant_Income_bin','LoanAmount_bin','Total_Income_bin','Total_Income'], axis=1)

# Correlation heatmap over the float64/int64 columns only
numeric_train = train.select_dtypes(include=['float64', 'int64'])
matrix = numeric_train.corr()
f, ax = plt.subplots(figsize=(9, 6))
sns.heatmap(matrix, cmap="BuPu", square=True, vmax=0.8)
plt.show()

# CHANGE 3: feature-vs-feature visualizations, coloured by Loan_Status
# Scatter plots of ApplicantIncome against CoapplicantIncome and LoanAmount
for y_column, plot_title in (
    ('CoapplicantIncome', 'Applicant Income vs Coapplicant Income'),
    ('LoanAmount', 'Applicant Income vs Loan Amount'),
):
    plt.figure(figsize=(8, 6))
    sns.scatterplot(data=train, x='ApplicantIncome', y=y_column, hue='Loan_Status')
    plt.title(plot_title)
    plt.show()

# Box plot of LoanAmount per Property_Area, split by Loan_Status
plt.figure(figsize=(10, 6))
sns.boxplot(data=train, x='Property_Area', y='LoanAmount', hue='Loan_Status')
plt.title('Loan Amount by Property Area')
plt.show()

# Checking and imputing missing values
train.isnull().sum()
train['Loan_Amount_Term'].value_counts()

# Fill values are learned from the training data only and reused for the test
# set, so no information from the test set leaks into preprocessing.
# Categorical / discrete columns use the mode, LoanAmount uses the median.
mode_imputed = ['Gender', 'Married', 'Dependents', 'Self_Employed',
                'Credit_History', 'Loan_Amount_Term']
fill_values = {column: train[column].mode()[0] for column in mode_imputed}
fill_values['LoanAmount'] = train['LoanAmount'].median()

# Assign the filled column back instead of calling
# `frame[column].fillna(..., inplace=True)`: the chained inplace form operates
# on a temporary Series and no longer updates the frame under pandas
# Copy-on-Write. The test set now also gets 'Married' imputed, which the
# previous code skipped.
for column, value in fill_values.items():
    train[column] = train[column].fillna(value)
    test[column] = test[column].fillna(value)

train.isnull().sum()

# CHANGE 2: Combine applicants with 1, 2, 3 or more dependents into a new feature
# Replace '3+' with 3 and convert the column to int. The result is assigned
# back rather than using `frame['Dependents'].replace(..., inplace=True)`,
# because the chained inplace form modifies a temporary Series and does not
# update the frame under pandas Copy-on-Write (the int cast would then fail
# on the left-over '3+').
train['Dependents'] = train['Dependents'].replace('3+', 3).astype(int)
test['Dependents'] = test['Dependents'].replace('3+', 3).astype(int)

# Has_Multiple_Dependents: 1 if the applicant has at least one dependent
# (Dependents > 0), else 0. The column name is kept as-is for downstream code.
train['Has_Multiple_Dependents'] = (train['Dependents'] > 0).astype(int)
test['Has_Multiple_Dependents'] = (test['Dependents'] > 0).astype(int)

# Plot the new feature vs Loan_Status as row-wise proportions
Has_Dep = pd.crosstab(train['Has_Multiple_Dependents'], train['Loan_Status'])
Has_Dep.div(Has_Dep.sum(1).astype(float), axis=0).plot(kind='bar', stacked=True, figsize=(4,4))
plt.show()

# Log-transform LoanAmount in both data sets; the histogram shows the effect
# of the transformation on the training data
for frame in (train, test):
    frame['LoanAmount_log'] = np.log(frame['LoanAmount'])
train['LoanAmount_log'].hist(bins=20)

# Total_Income = applicant income + co-applicant income
for frame in (train, test):
    frame['Total_Income'] = frame['ApplicantIncome'] + frame['CoapplicantIncome']

# Distribution of the raw Total_Income
sns.distplot(train['Total_Income'])

# Log-transform Total_Income as well and look at the resulting distribution
for frame in (train, test):
    frame['Total_Income_log'] = np.log(frame['Total_Income'])
sns.distplot(train['Total_Income_log'])

# CHANGE 4: EMI formula that accounts for interest
# A single fixed annual rate of 10% is assumed for every loan and converted
# to a monthly rate.
interest_rate_annual = 0.10  # 10% assumed
r = interest_rate_annual / 12  # monthly rate


def _monthly_installment(principal, periods):
    """Return the annuity payment P * r * (1 + r)^n / ((1 + r)^n - 1).

    *principal* is P, *periods* is n (number of monthly payments); the
    monthly rate is the module-level ``r``.
    """
    growth = (1 + r) ** periods
    return principal * r * growth / (growth - 1)


train['EMI_improved'] = _monthly_installment(train['LoanAmount'], train['Loan_Amount_Term'])
test['EMI_improved'] = _monthly_installment(test['LoanAmount'], test['Loan_Amount_Term'])

# Distribution of the new EMI feature
sns.distplot(train['EMI_improved'])
plt.title('Improved EMI Distribution')
plt.show()

# Balance income = total income minus the EMI, computed for both sets.
# NOTE(review): this subtracts EMI_improved directly, i.e. it assumes
# LoanAmount and the income columns are in the same unit and period -- confirm
# against the data dictionary (a scale factor is needed if they differ).
for frame in (train, test):
    frame['Balance_Income_improved'] = frame['Total_Income'] - frame['EMI_improved']
sns.distplot(train['Balance_Income_improved'])
plt.title('Improved Balance Income Distribution')
plt.show()

# Encode the target: 'N' -> 0, 'Y' -> 1.
# The result is assigned back (a chained `train['Loan_Status'].replace(...,
# inplace=True)` does not update the frame under pandas Copy-on-Write), and the
# explicit int cast guarantees a numeric dtype instead of relying on replace()
# to downcast an object column.
train['Loan_Status'] = train['Loan_Status'].replace({'N': 0, 'Y': 1}).astype(int)

# Loan_ID is an identifier, not a feature
train = train.drop('Loan_ID', axis=1)
test = test.drop('Loan_ID', axis=1)

# One-hot encode the features only; the target stays a single column
X = pd.get_dummies(train.drop('Loan_Status', axis=1))
y = train['Loan_Status']
test = pd.get_dummies(test)

# Give the test set exactly the training columns, in the same order;
# dummy columns that do not occur in the test set are filled with 0
test = test.reindex(columns=X.columns, fill_value=0)

# Drop the raw columns that the engineered features were derived from
drop_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']
X = X.drop(columns=drop_cols, errors='ignore')
test = test.drop(columns=drop_cols, errors='ignore')

# Hold-out split used by some models. The split is seeded so that results are
# reproducible and X_train / y_train always describe the same rows between runs.
X_train, X_cv, y_train, y_cv = train_test_split(X, y, test_size=0.3, random_state=1)

# Logistic Regression baseline on the hold-out split.
# Only the non-default settings are passed. The previous call spelled out
# every default and additionally `multi_class='ovr'`, which scikit-learn has
# deprecated; for this binary target with the liblinear solver the fitted
# model is the same.
model = LogisticRegression(solver='liblinear', random_state=1)
model.fit(X_train, y_train)
pred_cv = model.predict(X_cv)
accuracy_score(y_cv, pred_cv)

pred_test = model.predict(test)

# Build the submission file: predictions mapped back to the 'N' / 'Y' labels.
# The mapped column is assigned back because a chained
# `submission['Loan_Status'].replace(..., inplace=True)` does not update the
# frame under pandas Copy-on-Write.
submission = pd.read_csv('Sample_Submission.csv')
submission['Loan_Status'] = pred_test
submission['Loan_ID'] = test_original['Loan_ID']
submission['Loan_Status'] = submission['Loan_Status'].replace({0: 'N', 1: 'Y'})
# NOTE(review): to_csv also writes the row index as an unnamed first column;
# pass index=False if the consumer expects only Loan_ID and Loan_Status.
pd.DataFrame(submission, columns=['Loan_ID', 'Loan_Status']).to_csv('logistic.csv')

# Stratified 5-fold cross validation of logistic regression
kf = StratifiedKFold(n_splits = 5, random_state = 1, shuffle = True)
for i, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
    print('\n{} kfold {}'.format(i, kf.n_splits))
    xtr, xvl = X.iloc[train_index], X.iloc[test_index]
    ytr, yvl = y.iloc[train_index], y.iloc[test_index]
    model = LogisticRegression(random_state=1)
    model.fit(xtr, ytr)
    score = accuracy_score(yvl, model.predict(xvl))
    print('Accuracy Score', score)
    # Test-set labels and validation probabilities of the current fold; after
    # the loop these hold the values of the LAST fold only.
    pred_test = model.predict(test)
    pred = model.predict_proba(xvl)[:,1]

# ROC curve / AUC of the last fold's validation part
fpr, tpr, _ = metrics.roc_curve(yvl, pred)
auc = metrics.roc_auc_score(yvl, pred)
plt.figure(figsize=(12,8))
plt.plot(fpr, tpr, label='validation, auc='+ str(auc))
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc=4)
plt.show()

# Submission from the last fold's model. The mapped labels are assigned back
# because a chained `submission['Loan_Status'].replace(..., inplace=True)`
# does not update the frame under pandas Copy-on-Write.
submission['Loan_Status'] = pred_test
submission['Loan_ID'] = test_original['Loan_ID']
submission['Loan_Status'] = submission['Loan_Status'].replace({0: 'N', 1: 'Y'})
pd.DataFrame(submission, columns=['Loan_ID', 'Loan_Status']).to_csv('Logistic.csv')

# Logistic regression with the engineered features (they are already part of X)
kf = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)
for i, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
    print('\n{} of kfold {}'.format(i, kf.n_splits))
    # kf.split yields positional indices, so rows are selected with .iloc
    # (.loc is label-based and only works while the index is 0..n-1)
    xtr, xvl = X.iloc[train_index], X.iloc[test_index]
    ytr, yvl = y.iloc[train_index], y.iloc[test_index]
    model = LogisticRegression(random_state=1)
    model.fit(xtr, ytr)
    score = accuracy_score(yvl, model.predict(xvl))
    print('accuracy_score', score)
    # Values of the last fold remain available after the loop
    pred_test = model.predict(test)
    pred = model.predict_proba(xvl)[:,1]

# Submission from the last fold's model. The 0/1 predictions are mapped to
# 'N' / 'Y' like in every other submission file; previously this file was
# written with the raw 0/1 codes.
submission['Loan_Status'] = pred_test
submission['Loan_ID'] = test_original['Loan_ID']
submission['Loan_Status'] = submission['Loan_Status'].replace({0: 'N', 1: 'Y'})
pd.DataFrame(submission, columns=['Loan_ID', 'Loan_Status']).to_csv('Log2.csv')

# Decision Tree, stratified 5-fold cross validation
kf = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)
for i, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
    print('\n{} of kfold {}'.format(i, kf.n_splits))
    # kf.split yields positional indices, so rows are selected with .iloc
    # (.loc is label-based and only works while the index is 0..n-1)
    xtr, xvl = X.iloc[train_index], X.iloc[test_index]
    ytr, yvl = y.iloc[train_index], y.iloc[test_index]
    model = DecisionTreeClassifier(random_state=1)
    model.fit(xtr, ytr)
    score = accuracy_score(yvl, model.predict(xvl))
    print('accuracy score', score)
    # Test-set predictions of the current fold; the last fold's are kept
    pred_test = model.predict(test)

# Submission from the last fold's model. The mapped labels are assigned back
# because a chained `submission['Loan_Status'].replace(..., inplace=True)`
# does not update the frame under pandas Copy-on-Write.
submission['Loan_Status'] = pred_test
submission['Loan_ID'] = test_original['Loan_ID']
submission['Loan_Status'] = submission['Loan_Status'].replace({0: 'N', 1: 'Y'})
pd.DataFrame(submission, columns=['Loan_ID', 'Loan_Status']).to_csv('Decision Tree.csv')

# Random Forest, stratified 5-fold cross validation
kf = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)
for i, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
    print('\n{} of kfold {}'.format(i, kf.n_splits))
    # kf.split yields positional indices, so rows are selected with .iloc
    # (.loc is label-based and only works while the index is 0..n-1)
    xtr, xvl = X.iloc[train_index], X.iloc[test_index]
    ytr, yvl = y.iloc[train_index], y.iloc[test_index]
    model = RandomForestClassifier(random_state=1, max_depth=10)
    model.fit(xtr, ytr)
    score = accuracy_score(yvl, model.predict(xvl))
    print('accuracy_score', score)
    # Test-set predictions of the current fold; the last fold's are kept
    pred_test = model.predict(test)

# Grid search over tree depth and forest size
paramgrid = {'max_depth': list(range(1,20,2)), 'n_estimators': list(range(1,200,20))}
grid_search = GridSearchCV(RandomForestClassifier(random_state=1), paramgrid)
# Reuse the existing hold-out split instead of calling train_test_split again:
# unpacking a second split into y_train / y_cv re-binds the labels that belong
# to X_train / X_cv, and any later fit on (X_train, y_train) would then pair
# features and labels of different rows without raising an error.
x_train, x_cv = X_train, X_cv
grid_search.fit(x_train, y_train)
grid_search.best_estimator_

# Random Forest with fixed hyper-parameters (max_depth=5, n_estimators=41),
# stratified 5-fold cross validation
kf = StratifiedKFold(n_splits=5, random_state=1,shuffle=True)
for i, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
    print('\n{} of kfold {}'.format(i, kf.n_splits))
    # kf.split yields positional indices, so rows are selected with .iloc
    # (.loc is label-based and only works while the index is 0..n-1)
    xtr, xvl = X.iloc[train_index], X.iloc[test_index]
    ytr, yvl = y.iloc[train_index], y.iloc[test_index]
    model = RandomForestClassifier(random_state=1, max_depth = 5, n_estimators=41)
    model.fit(xtr, ytr)
    score = accuracy_score(yvl, model.predict(xvl))
    print('accuracy_score', score)
    # Test-set labels and class-1 probabilities; the last fold's are kept
    pred_test = model.predict(test)
    pred2 = model.predict_proba(test)[:,1]

# Submission from the last fold's model. The mapped labels are assigned back
# because a chained `submission['Loan_Status'].replace(..., inplace=True)`
# does not update the frame under pandas Copy-on-Write.
submission['Loan_Status'] = pred_test
submission['Loan_ID'] = test_original['Loan_ID']
submission['Loan_Status'] = submission['Loan_Status'].replace({0: 'N', 1: 'Y'})
pd.DataFrame(submission, columns=['Loan_ID', 'Loan_Status']).to_csv('Random Forest.csv')

# Feature importances of the last fold's forest
importances = pd.Series(model.feature_importances_, index=X.columns)
importances.plot(kind='barh', figsize=(12, 8))

# CHANGE 1: XGBoost with GridSearchCV
xgb_param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.2]
}

xgb_grid = GridSearchCV(
    XGBClassifier(random_state=1, eval_metric='logloss'),
    xgb_param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)

# x_train is the feature frame whose rows line up with y_train. Pairing
# y_train with a frame that came from a different train_test_split call would
# fit the search on mismatched features and labels without raising an error.
xgb_grid.fit(x_train, y_train)
print("Best XGBoost params:", xgb_grid.best_params_)

# Stratified 5-fold cross validation with the best parameters; the test-set
# labels of all folds are averaged and rounded, i.e. a majority vote.
kf = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)
xgb_preds = np.zeros(len(test))
for i, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
    print('\n{} of kfold {}'.format(i, kf.n_splits))
    # kf.split yields positional indices, so rows are selected with .iloc
    # (.loc is label-based and only works while the index is 0..n-1)
    xtr, xvl = X.iloc[train_index], X.iloc[test_index]
    ytr, yvl = y.iloc[train_index], y.iloc[test_index]
    model = XGBClassifier(**xgb_grid.best_params_, random_state=1)
    model.fit(xtr, ytr)
    pred_test = model.predict(xvl)
    score = accuracy_score(yvl, pred_test)
    print('accuracy_score', score)
    fold_pred = model.predict(test)
    # Divide by the actual number of folds rather than a hard-coded 5
    xgb_preds += fold_pred / kf.n_splits

pred3 = np.round(xgb_preds).astype(int)

# The mapped labels are assigned back because a chained
# `submission['Loan_Status'].replace(..., inplace=True)` does not update the
# frame under pandas Copy-on-Write.
submission['Loan_Status'] = pred3
submission['Loan_ID'] = test_original['Loan_ID']
submission['Loan_Status'] = submission['Loan_Status'].replace({0: 'N', 1: 'Y'})
pd.DataFrame(submission, columns = ['Loan_ID', 'Loan_Status']).to_csv('XGBoost_Tuned.csv')

# CHANGE 5: Ensemble modeling
# Base learners: logistic regression, random forest and the tuned XGBoost
log_reg = LogisticRegression(random_state=1)
rf = RandomForestClassifier(random_state=1, max_depth=5, n_estimators=41)
xgb = XGBClassifier(**xgb_grid.best_params_, random_state=1)

# Soft voting combines the predicted class probabilities of the base learners
ensemble = VotingClassifier(
    estimators=[('lr', log_reg), ('rf', rf), ('xgb', xgb)],
    voting='soft'
)

# Stratified 5-fold cross validation; the test-set labels of all folds are
# averaged and rounded, i.e. a majority vote across folds.
kf = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)
ensemble_preds = np.zeros(len(test))
for i, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
    print('\n{} of kfold for Ensemble {}'.format(i, kf.n_splits))
    # kf.split yields positional indices, so rows are selected with .iloc
    # (.loc is label-based and only works while the index is 0..n-1)
    xtr, xvl = X.iloc[train_index], X.iloc[test_index]
    ytr, yvl = y.iloc[train_index], y.iloc[test_index]
    # The same ensemble object is refitted on every fold
    model = ensemble
    model.fit(xtr, ytr)
    pred_test = model.predict(xvl)
    score = accuracy_score(yvl, pred_test)
    print('ensemble accuracy_score', score)
    fold_pred = model.predict(test)
    # Divide by the actual number of folds rather than a hard-coded 5
    ensemble_preds += fold_pred / kf.n_splits

final_ensemble_pred = np.round(ensemble_preds).astype(int)

# The mapped labels are assigned back because a chained
# `submission['Loan_Status'].replace(..., inplace=True)` does not update the
# frame under pandas Copy-on-Write.
submission['Loan_Status'] = final_ensemble_pred
submission['Loan_ID'] = test_original['Loan_ID']
submission['Loan_Status'] = submission['Loan_Status'].replace({0: 'N', 1: 'Y'})
pd.DataFrame(submission, columns=['Loan_ID', 'Loan_Status']).to_csv('Ensemble_Voting.csv')

# Stacking: the base learners feed a random-forest meta-learner
estimators = [('lr', log_reg), ('rf', rf), ('xgb', xgb)]
stacking = StackingClassifier(estimators=estimators, final_estimator=RandomForestClassifier(random_state=1))
# x_train is the feature frame whose rows line up with y_train. Pairing
# y_train with a frame that came from a different train_test_split call would
# train the saved model on mismatched features and labels without any error.
stacking.fit(x_train, y_train)
stack_pred = stacking.predict(test)

# The mapped labels are assigned back because a chained
# `submission['Loan_Status'].replace(..., inplace=True)` does not update the
# frame under pandas Copy-on-Write.
submission['Loan_Status'] = stack_pred
submission['Loan_Status'] = submission['Loan_Status'].replace({0: 'N', 1: 'Y'})
pd.DataFrame(submission, columns=['Loan_ID', 'Loan_Status']).to_csv('Stacking_Ensemble.csv')

# Save the final stacking model for deployment
joblib.dump(stacking, 'loan_prediction_model.pkl')
print("Model saved as 'loan_prediction_model.pkl'")