In [1]:
import warnings
# Silence all Python warnings for the rest of the session.
# NOTE(review): this also hides deprecation and data-conversion warnings raised
# by the libraries used below; consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [2]:
import psycopg2
from flask_sqlalchemy import SQLAlchemy

ModuleNotFoundError: No module named 'psycopg2'

In [None]:
import os
import sqlalchemy
from sqlalchemy import create_engine
from Config import db_password
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [None]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [None]:
# Load the cleaned dataset from a CSV file in the working directory and
# display it as the cell output.
clean_dataset = pd.read_csv('clean_dataset.csv')
clean_dataset

In [None]:
# Print the column names, dtypes and non-null counts of the loaded frame.
clean_dataset.info()

In [None]:
# Collect the column labels of the cleaned dataset into a plain Python list.
column_headers = list(clean_dataset.columns)
column_headers

In [None]:
# Build the feature table: everything except the label ('Approved') and the
# two columns excluded from modelling ('ZipCode', 'Income').
# `columns=` is used instead of a positional axis argument, which newer pandas
# releases no longer accept.
features = clean_dataset.drop(columns=['Approved', 'ZipCode', 'Income'])
features

In [None]:
# Keep the label column as a single-column DataFrame (not a Series) so it can
# be exported the same way as the feature table.
target = clean_dataset['Approved'].to_frame()
target

In [None]:
# Write both tables to CSV in the working directory; index=True keeps the row
# index as the first column of each file.
features.to_csv('features.csv', index=True)
target.to_csv('targets.csv', index=True)

In [None]:
# Build the PostgreSQL connection URL for the local Credit_Approval database.
# Format: "postgresql://[user]:[password]@[location]:[port]/[database]"
# The password is imported from the local Config module rather than hard-coded.
# NOTE(review): db_password is interpolated verbatim; if it can contain
# URL-reserved characters (e.g. '@', '/', ':') it would need escaping — confirm.
db_string = f'postgresql://postgres:{db_password}@127.0.0.1:5432/Credit_Approval'

In [None]:
# Create the SQLAlchemy engine used by every database call below.
engine = create_engine(db_string)
# Flask-SQLAlchemy handle; no later visible cell references `db`.
db = SQLAlchemy()

In [None]:
# Persist both tables to PostgreSQL.
# The row index is written as an explicit "index" column because the SQL JOIN
# and the pandas merge further down both match rows on that column; writing
# with index=False would leave no key to join on.
# if_exists='replace' lets the notebook be re-run from a fresh kernel without
# failing on tables created by a previous run.
target.to_sql(name='target', con=engine, index=True, index_label='index', if_exists='replace')
features.to_sql(name='features', con=engine, index=True, index_label='index', if_exists='replace')

In [None]:
# Query the joined feature/target rows straight from the database.
# A short-lived connection is opened explicitly (and closed by the `with`
# block) because calling execute() directly on the engine is not available in
# SQLAlchemy 2.x; raw SQL strings must also be wrapped in text().
with engine.connect() as connection:
    info = connection.execute(
        sqlalchemy.text("SELECT * from features JOIN target ON target.index = features.index")
    ).fetchall()
info

In [None]:
# Read the whole "target" table back from the database into a DataFrame.
df1 = pd.read_sql_table("target",engine)
df1

In [None]:
# Read the whole "features" table back from the database into a DataFrame.
df2 = pd.read_sql_table("features",engine)
df2

In [None]:
# Recombine labels and features row-by-row: inner join on the shared "index"
# column, with the target columns first.
clean = df1.merge(df2, on="index", how="inner")
clean

In [None]:
# Drop the join key and any leftover excluded columns to get the final
# modelling frame.
# 'ZipCode' and 'Income' are removed from `features` before it is written to
# the database, so they are normally absent here; errors='ignore' keeps the
# drop from raising KeyError in that case. `columns=` replaces the positional
# axis argument, which newer pandas releases no longer accept.
clean_df = clean.drop(columns=['index', 'ZipCode', 'Income'], errors='ignore')
clean_df

In [None]:
# Feature matrix: every column except the label, with categorical columns
# expanded into indicator (dummy) columns.
X = pd.get_dummies(clean_df.drop(columns="Approved"))

In [None]:
# Label frame: a single-column copy of 'Approved'.
# A separate name is used for the column list so that `target` (the DataFrame
# exported to CSV/SQL in earlier cells) is not rebound to a list, which would
# break those cells if they are re-run.
target_columns = ["Approved"]
y = clean_df.loc[:, target_columns].copy()

In [None]:
# Sanity-check the encoded feature matrix with summary statistics.
X.describe()

In [None]:
# Check the class balance of the target: number of rows per 'Approved' value.
y['Approved'].value_counts()

In [None]:
from sklearn.model_selection import train_test_split

# Split into train and test sets with a fixed seed, then print the class
# counts of each split (train first, test second).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
for split_labels in (y_train, y_test):
    print(Counter(split_labels['Approved']))

Random Forest Classifier

In [None]:
from imblearn.ensemble import BalancedRandomForestClassifier

# Train a balanced random forest with 100 trees and a fixed seed.
rf_model = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
# Pass the label as a 1-D column rather than the one-column DataFrame: the
# estimator expects y of shape (n_samples,) and otherwise has to convert a
# column vector (emitting a data-conversion warning).
rf_model.fit(X_train, y_train['Approved'])
# Class counts of the training labels the model was fitted on.
print(Counter(y_train['Approved']))

In [None]:
# Predict on the held-out test set and calculate the balanced accuracy score,
# which is shown as the cell output.
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = rf_model.predict(X_test)
balanced_accuracy_score(y_test,y_pred)

In [None]:
# Display the confusion matrix.
# The class order is taken from the fitted model and passed explicitly, and
# the row/column captions are generated from those same class values, so a
# caption can never be attached to the wrong class.
# NOTE(review): confusion_matrix orders classes by sorted label value; with a
# 0/1 encoding where 1 means approved, fixed captions listing "Approved" first
# would be swapped — confirm the encoding before hard-coding friendly names.
class_labels = list(rf_model.classes_)
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

# Create a DataFrame from the confusion matrix (rows = actual, columns = predicted).
cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {label}" for label in class_labels],
    columns=[f"Predicted {label}" for label in class_labels],
)

cm_df

In [None]:
# Build the imbalanced-learn classification report for the test predictions,
# then print it so the text table keeps its line breaks.
imbalanced_report = classification_report_imbalanced(y_test, y_pred)
print(imbalanced_report)

In [None]:
# Pair each importance score with its feature name, order the pairs from
# highest to lowest score, and print one "name:  percent" line per feature.
importances = sorted(
    zip(rf_model.feature_importances_, X.columns),
    reverse=True,
)
for score, feature_name in importances:
    print(f'{feature_name}:  {score*100:.1f}%')