In [None]:
# Import our dependencies
import os
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandas as pd
import tensorflow as tf
from imblearn.over_sampling import SMOTE
from pymongo import MongoClient
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from tabulate import tabulate

In [None]:
# Connect to MongoDB Atlas.
# The connection string embeds the database password, so it is read from the
# MONGODB_URI environment variable instead of being committed in the notebook.
mongodb_uri = os.environ.get("MONGODB_URI")
if not mongodb_uri:
    raise RuntimeError(
        "Set the MONGODB_URI environment variable to the MongoDB Atlas connection string."
    )
client = MongoClient(mongodb_uri)

# Access the 'remote' database
db = client['remote']

# Fetch data from the 'records' collection
data_cursor = db['records'].find()
data_list = list(data_cursor)
data_df = pd.DataFrame(data_list)  # Convert to DataFrame

# Fetch data from the 'credit' collection
record_cursor = db['credit'].find()
record_list = list(record_cursor)
record_df = pd.DataFrame(record_list)  # Convert to DataFrame


In [None]:
data_df.head()

In [None]:
record_df.head()

In [None]:
# Smallest MONTHS_BALANCE per ID in the 'credit' collection DataFrame.
# The aggregation is named by string ("min"): passing the Python builtin `min`
# to .agg() is deprecated in recent pandas releases.
begin_month = record_df.groupby("ID")["MONTHS_BALANCE"].agg("min").reset_index()
begin_month = begin_month.rename(columns={'MONTHS_BALANCE': 'begin_month'})

# Left-merge on 'ID' so every row of data_df is kept, with begin_month = NaN
# where the ID has no rows in record_df. pd.merge already returns a DataFrame,
# so no re-wrapping is needed.
merged_df = pd.merge(data_df, begin_month, how="left", on="ID")

# Show the first few rows of the merged DataFrame
print(merged_df.head())

In [None]:
print(merged_df.info())

In [None]:
# Per-ID minimum of MONTHS_BALANCE, exposed under the name 'begin_month'
begin_month = (
    record_df
    .groupby("ID", as_index=False)["MONTHS_BALANCE"]
    .agg("min")
    .rename(columns={"MONTHS_BALANCE": "begin_month"})
)

# Attach it to the main dataset; the left join keeps every row of data_df
new_data = data_df.merge(begin_month, how="left", on="ID")

In [None]:
# Flag monthly records whose STATUS is one of the codes 2-5 as past due.
# STATUS may be stored as strings or as mixed types (NOTE(review): confirm
# against the 'credit' collection). Coercing to numeric makes "2" and 2 compare
# alike; non-numeric codes become NaN and therefore never match.
status_code = pd.to_numeric(record_df['STATUS'], errors='coerce')
record_df['past_due'] = 'No'  # Default to 'No'
record_df.loc[status_code.isin([2, 3, 4, 5]), 'past_due'] = 'Yes'

# One row per ID: 'Yes' if any of that ID's records is past due
risk_factor = record_df.groupby('ID', as_index=False)['past_due'].agg(
    lambda x: 'Yes' if 'Yes' in x.values else 'No'
)

# Debug: ensure risk_factor contains 'past_due'
print("risk_factor columns:", risk_factor.columns)
print(risk_factor.head())

# Inner merge: rows of new_data whose ID has no record in risk_factor are dropped
new_data = pd.merge(new_data, risk_factor, how='inner', on='ID')

# Debug: ensure 'past_due' exists in new_data
print("new_data columns:", new_data.columns)
print(new_data.head())

# Map 'Yes' and 'No' to numeric targets (1 = past due, 0 = not past due).
# 'past_due' is guaranteed to exist here because the merge above brings it in.
new_data['target'] = new_data['past_due'].map({'Yes': 1, 'No': 0})


In [None]:
print(risk_factor['past_due'].value_counts())
risk_factor['past_due'].value_counts(normalize=True)

In [None]:
print(record_df['past_due'].value_counts())

In [None]:
# Create a binary target variable (1 for high-risk customers, 0 otherwise)
print(new_data['target'].value_counts())

In [None]:
# Rebuild the per-ID past-due flag ('Yes' as soon as one record is 'Yes')
risk_factor = (
    record_df
    .groupby('ID', as_index=False)['past_due']
    .agg(lambda flags: 'Yes' if 'Yes' in flags.values else 'No')
)

# The merge key should have the same dtype on both sides
for frame in (new_data, risk_factor):
    print(frame['ID'].dtype)

# Count rows of new_data that ended up without a 'past_due' value
na_count = new_data['past_due'].isna().sum()
print(f"Number of missing values in 'past_due': {na_count}")

In [None]:
yes_count = (risk_factor['past_due'] == 'Yes').sum()
no_count = (risk_factor['past_due'] == 'No').sum()

print(f"Yes: {yes_count}")
print(f"No: {no_count}")

In [None]:
# Source column name -> shorter display name used by the rest of the notebook
column_labels = {
    "ID": "ID",
    "CODE_GENDER": "GENDER",
    "FLAG_OWN_CAR": "OWN CAR",
    "FLAG_OWN_REALTY": "OWN REALTY",
    "CNT_CHILDREN": "CHILDREN",
    "AMT_INCOME_TOTAL": "INCOME TOTAL",
    "NAME_INCOME_TYPE": "INCOME TYPE",
    "NAME_EDUCATION_TYPE": "EDUCATION TYPE",
    "NAME_FAMILY_STATUS": "FAMILY STATUS",
    "NAME_HOUSING_TYPE": "HOUSING TYPE",
    "DAYS_BIRTH": "DAYS SINCE BIRTH",
    "DAYS_EMPLOYED": "DAYS EMPLOYED",
    "FLAG_MOBIL": "MOBIL",
    "FLAG_WORK_PHONE": "WORK PHONE",
    "FLAG_PHONE": "PHONE",
    "FLAG_EMAIL": "EMAIL",
    "OCCUPATION_TYPE": "TYPE",
    "CNT_FAM_MEMBERS": "FAM MEMBERS",
}
new_data = new_data.rename(columns=column_labels)

In [None]:
print(new_data.columns)

In [None]:
# Treat the literal string 'NULL' as missing, then drop every row that has at
# least one missing value. (A bare `new_data.dropna()` returns a new frame and
# has no effect unless its result is assigned, so it is not called separately.)
new_data = new_data.mask(new_data == 'NULL').dropna()

In [None]:
new_data.info()

In [None]:
# Columns that are not scored for Information Value
namelist = ['MOBIL', 'begin_month', 'past_due', 'target', 'ID']

# One row per remaining column, with 'IV' left empty (None) until it is computed
iv_variables = [column for column in new_data.columns if column not in namelist]
ivtable = pd.DataFrame({'variable': iv_variables, 'IV': None})


The function calculates the Information Value (IV) of a feature relative to a target variable. Information Value is a measure used in predictive modeling to evaluate the predictive power of a feature. It is commonly used in credit scoring and binary classification problems.

Function Workflow:
Input Parameters:

df: The DataFrame containing the feature and target variable.<br>
feature: The column name of the feature for which IV is being calculated.<br>
target: The column name of the target variable (binary: 0 or 1).<br>
pr: A boolean flag to print intermediate data and the IV score.

In [None]:
# Updated Function
def calc_iv(df, feature, target, pr=False):
    """Compute the Information Value (IV) of ``feature`` against a binary ``target``.

    Missing feature values are grouped into their own bin labelled ``"NULL"``.
    The caller's DataFrame is never modified: filling and category extension
    are applied to a local copy of the column only.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both ``feature`` and ``target``.
    feature : str
        Name of the column to score. Each distinct value is treated as one bin.
    target : str
        Name of the binary column; 0 is counted as "good", 1 as "bad".
    pr : bool, default False
        If True, additionally print the per-bin table and the IV.

    Returns
    -------
    tuple
        ``(iv, data)`` where ``iv`` is the sum of the per-bin IV contributions
        and ``data`` is the per-bin table with columns Variable, Value, All,
        Good, Bad, Share, Bad Rate, Distribution Good, Distribution Bad, WoE, IV.

    Notes
    -----
    The IV summary line and the feature's value counts are always printed,
    regardless of ``pr``.
    """
    # Local view of the column; every operation below returns a new Series,
    # so df[feature] itself is left untouched.
    values = df[feature]
    if values.dtype.name == "category" and "NULL" not in values.cat.categories:
        # A categorical can only be filled with a value that is a known category
        values = values.cat.add_categories("NULL")
    values = values.fillna("NULL")

    # Loop-invariant masks
    is_good = df[target] == 0
    is_bad = df[target] == 1

    rows = []
    for val in values.unique():
        in_bin = values == val
        rows.append([
            feature,
            val,
            int(in_bin.sum()),
            int((in_bin & is_good).sum()),
            int((in_bin & is_bad).sum()),
        ])

    data = pd.DataFrame(rows, columns=['Variable', 'Value', 'All', 'Good', 'Bad'])

    # Shares and distributions
    total_good = data['Good'].sum()
    total_bad = data['Bad'].sum()
    epsilon = 1e-10  # Small constant to prevent division by zero

    data['Share'] = data['All'] / data['All'].sum()
    data['Bad Rate'] = data['Bad'] / (data['All'] + epsilon)
    data['Distribution Good'] = data['Good'] / (total_good + epsilon)
    data['Distribution Bad'] = data['Bad'] / (total_bad + epsilon)

    # Weight of Evidence. Both numerator and denominator are strictly positive
    # thanks to epsilon, so the logarithm is always finite and no inf clean-up
    # is required afterwards.
    data['WoE'] = np.log(
        (data['Distribution Good'] + epsilon) / (data['Distribution Bad'] + epsilon)
    )

    # Per-bin Information Value contribution
    data['IV'] = (data['Distribution Good'] - data['Distribution Bad']) * data['WoE']

    data = data.sort_values(by=['Variable', 'Value'], ascending=[True, True]).reset_index(drop=True)

    iv = data['IV'].sum()

    if pr:
        print(data)
        print('IV = ', iv)

    print('This variable\'s IV is:', iv)
    print(values.value_counts())

    return iv, data


In [None]:
yes_count = (risk_factor['past_due'] == 'Yes').sum()
no_count = (risk_factor['past_due'] == 'No').sum()

print(f"Yes: {yes_count}")
print(f"No: {no_count}")

In [None]:
print(new_data['target'].value_counts())


In [None]:
print(new_data.info())
print(new_data.head())

In [None]:
iv, data = calc_iv(df=new_data, feature='GENDER', target='target', pr=True)


In [None]:
new_data['GENDER'] = new_data['GENDER'].replace(['F','M'],[0,1])
print(new_data['GENDER'].value_counts())
iv, data = calc_iv(new_data,'GENDER','target')
ivtable.loc[ivtable['variable']=='GENDER','IV']=iv
data.head()

In [None]:
def convert_dummy(df, feature, rank=0):
    """Replace ``feature`` with dummy columns, leaving out one reference level.

    The level that is left out is the ``rank``-th most frequent value of the
    column (``rank=0`` is the mode), so the remaining dummies are expressed
    relative to it.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a new frame is returned.
    feature : str
        Name of the column to encode.
    rank : int, default 0
        Position, in descending frequency order, of the level to drop.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``feature``, joined with the ``<feature>_<level>``
        dummy columns of every level except the reference one.
    """
    dummies = pd.get_dummies(df[feature], prefix=feature)
    reference_level = df[feature].value_counts().index[rank]
    reference_column = feature + '_' + str(reference_level)
    dummies = dummies.drop(columns=[reference_column])
    # Non-mutating drop: the caller's frame keeps its original column
    return df.drop(columns=[feature]).join(dummies)

In [None]:
def get_category(df, col, binsnum, labels, qcut = False):
    """Return ``df`` with an extra ``gp_<col>`` column holding the binned ``col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is joined with the new column, not modified.
    col : str
        Name of the column to bin.
    binsnum : int or sequence
        Passed as ``q`` to ``pd.qcut`` or as ``bins`` to ``pd.cut``.
    labels : sequence
        Labels for the resulting bins.
    qcut : bool, default False
        True for quantile-based bins, False for equal-width bins.

    Returns
    -------
    pandas.DataFrame
        ``df`` plus the ``gp_<col>`` column, stored with object dtype.
    """
    group_col = 'gp' + '_' + col
    binner = pd.qcut if qcut else pd.cut  # quantile cut vs equal-length cut
    binned = binner(df[col], binsnum, labels=labels).rename(group_col)

    result = df.join(binned)
    result[group_col] = result[group_col].astype(object)
    return result

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Print a confusion matrix and draw it as an annotated image.

    Parameters
    ----------
    cm : numpy.ndarray
        2-D confusion matrix.
    classes : sequence
        Tick labels, used for both axes.
    normalize : bool, default False
        If True, each row is divided by its sum before printing and plotting.
    title : str
        Title of the figure.
    cmap : matplotlib colormap
        Colormap passed to ``plt.imshow``.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells above half of the maximum get white text so the number stays readable
    thresh = cm.max() / 2.
    # np.ndindex walks every (row, column) pair of the matrix; it replaces
    # itertools.product, which this notebook never imports.
    for i, j in np.ndindex(*cm.shape):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
new_data['OWN CAR'] = new_data['OWN CAR'].replace(['N','Y'],[0,1])
print(new_data['OWN CAR'].value_counts())
iv, data=calc_iv(new_data,'OWN CAR','target')
ivtable.loc[ivtable['variable']=='OWN CAR','IV']=iv
data.head()

In [None]:
new_data['OWN REALTY'] = new_data['OWN REALTY'].replace(['N','Y'],[0,1])
print(new_data['OWN REALTY'].value_counts())
iv, data=calc_iv(new_data,'OWN REALTY','target')
ivtable.loc[ivtable['variable']=='OWN REALTY','IV']=iv
data.head()

In [None]:
new_data['PHONE']=new_data['PHONE'].astype(str)
print(new_data['PHONE'].value_counts(normalize=True,sort=False))
new_data.drop(new_data[new_data['PHONE'] == 'nan' ].index, inplace=True)
iv, data=calc_iv(new_data,'PHONE','target')
ivtable.loc[ivtable['variable']=='PHONE','IV']=iv
data.head()

In [None]:
print(new_data['EMAIL'].value_counts(normalize=True,sort=False))
new_data['EMAIL']=new_data['EMAIL'].astype(str)
iv, data=calc_iv(new_data,'EMAIL','target')
ivtable.loc[ivtable['variable']=='EMAIL','IV']=iv
data.head()

In [None]:
new_data['WORK PHONE']=new_data['WORK PHONE'].astype(str)
iv, data = calc_iv(new_data,'WORK PHONE','target')
new_data.drop(new_data[new_data['WORK PHONE'] == 'nan' ].index, inplace=True)
ivtable.loc[ivtable['variable']=='WORK PHONE','IV']=iv
data.head()

In [None]:
new_data.loc[new_data['CHILDREN'] >= 2,'CHILDREN']='2More'
print(new_data['CHILDREN'].value_counts(sort=False))

In [None]:
iv, data=calc_iv(new_data,'CHILDREN','target')
ivtable.loc[ivtable['variable']=='CHILDREN','IV']=iv
data.head()

In [None]:
new_data = convert_dummy(new_data,'CHILDREN')

In [None]:
# Express income in units of 10,000. The column is cast to float (not object)
# so that the binned value counts, the histogram and later quantile cuts all
# operate on a genuinely numeric column.
new_data['INCOME TOTAL'] = new_data['INCOME TOTAL'].astype(float) / 10000
print(new_data['INCOME TOTAL'].value_counts(bins=10,sort=False))
new_data['INCOME TOTAL'].plot(kind='hist',bins=50,density=True)

In [None]:
# Create categories using pd.qcut
new_data['gp_inc'] = pd.qcut(
    new_data['INCOME TOTAL'], 
    q=3,  # Number of quantiles
    labels=["low", "medium", "high"]
)

# Calculate IV
iv, data = calc_iv(new_data, 'gp_inc', 'target')

# Update IV table
ivtable.loc[ivtable['variable'] == 'INCOME TOTAL', 'IV'] = iv

# Print results
print(data.head())

In [None]:
# Make sure 'INCOME TOTAL' is numeric. It has already been divided by 10,000 in
# the earlier income cell, so it must not be divided again here: a second
# division would silently shrink the feature by another factor of 10,000.
new_data['INCOME TOTAL'] = new_data['INCOME TOTAL'].astype(float)

# Print distribution
print(new_data['INCOME TOTAL'].value_counts(bins=10, sort=False))

# Plot histogram
new_data['INCOME TOTAL'].plot(kind='hist', bins=50, density=True)

# Create categories using pd.qcut
new_data['gp_inc'] = pd.qcut(
    new_data['INCOME TOTAL'], 
    q=3,  # Quantiles
    labels=["low", "medium", "high"]
)

# Add 'NULL' as a category to avoid errors when filling NaNs
new_data['gp_inc'] = new_data['gp_inc'].cat.add_categories("NULL").fillna("NULL")

# Calculate IV
iv, data = calc_iv(new_data, 'gp_inc', 'target')

# Update IV table
ivtable.loc[ivtable['variable'] == 'INCOME TOTAL', 'IV'] = iv

# Print results
print(data.head())

In [None]:
new_data = convert_dummy(new_data,'gp_inc')

In [None]:
# Calculate Age
new_data['Age'] = -(new_data['DAYS SINCE BIRTH']) // 365

# Check distribution of Age
print(new_data['Age'].value_counts(bins=10, normalize=True, sort=False))
new_data['Age'].plot(kind='hist', bins=20, density=True)

# Step 1: Create age categories
new_data['gp_Age'] = pd.qcut(
    new_data['Age'],
    q=5,  # Number of quantiles
    labels=["lowest", "low", "medium", "high", "highest"]
)

# Step 2: Calculate IV for 'gp_Age'
iv, data = calc_iv(new_data, 'gp_Age', 'target')
print("IV Data for gp_Age:")
print(data[['Value', 'Good', 'Bad', 'WoE', 'IV']])

# Step 3: Update the IV table for 'DAYS SINCE BIRTH'
if 'DAYS SINCE BIRTH' not in ivtable['variable'].values:
    # Append a new row for DAYS SINCE BIRTH if it does not exist
    new_row = pd.DataFrame({'variable': ['DAYS SINCE BIRTH'], 'IV': [0]})
    ivtable = pd.concat([ivtable, new_row], ignore_index=True)

# Update the IV value for DAYS SINCE BIRTH
ivtable.loc[ivtable['variable'] == 'DAYS SINCE BIRTH', 'IV'] = iv

# Step 4: Convert 'gp_Age' to dummy variables
new_data = convert_dummy(new_data, 'gp_Age')

# Step 5: Sort and display the IV table
ivtable = ivtable.sort_values(by='IV', ascending=False)
print(ivtable)


Family Size

In [None]:
# Step 1: Create work experience categories for 'DAYS_EMPLOYED'
new_data['worktm'] = -(new_data['DAYS EMPLOYED']) // 365
new_data.loc[new_data['worktm'] < 0, 'worktm'] = np.nan
new_data['worktm'] = new_data['worktm'].fillna(new_data['worktm'].mean())
new_data = get_category(new_data, 'worktm', 5, ["lowest", "low", "medium", "high", "highest"])

# Step 2: Calculate IV for gp_worktm
iv, data = calc_iv(new_data, 'gp_worktm', 'target')
print("IV Data for gp_worktm:")
print(data[['Value', 'Good', 'Bad', 'WoE', 'IV']])

# Step 3: Update the IV table for DAYS_EMPLOYED
if 'DAYS EMPLOYED' not in ivtable['variable'].values:
    new_row = pd.DataFrame({'variable': ['DAYS EMPLOYED'], 'IV': [0]})
    ivtable = pd.concat([ivtable, new_row], ignore_index=True)
ivtable.loc[ivtable['variable'] == 'DAYS EMPLOYED', 'IV'] = iv

# Step 4: Convert gp_worktm to dummy variables
new_data = convert_dummy(new_data, 'gp_worktm')

# Step 5: Sort and display IV table
ivtable = ivtable.sort_values(by='IV', ascending=False)
print(ivtable)


In [None]:
new_data['FAM MEMBERS'].value_counts(sort=False)

In [None]:
# Replace NaN or inf values in 'FAM MEMBERS' with 0 and store the column as int
new_data['FAM MEMBERS'] = new_data['FAM MEMBERS'].fillna(0).replace([np.inf, -np.inf], 0).astype(int)

# Grouped copy of 'FAM MEMBERS': families with 3 or more members collapse into
# the single label '3more'. The condition is evaluated on the integer column
# rather than on the object-dtype copy, so it never compares a string label
# with a number.
family_size = new_data['FAM MEMBERS']
new_data['FAM MEMBERS GP'] = family_size.astype(object)
new_data.loc[family_size >= 3, 'FAM MEMBERS GP'] = '3more'

# Calculate IV for 'FAM MEMBERS GP'
iv, data = calc_iv(new_data, 'FAM MEMBERS GP', 'target')

# Update IV table for 'FAM MEMBERS'
ivtable.loc[ivtable['variable'] == 'FAM MEMBERS', 'IV'] = iv

# Display the first few rows of the IV DataFrame
print(data.head())

In [None]:
new_data = convert_dummy(new_data,'FAM MEMBERS GP')

In [None]:
print(new_data.columns)


Income Type

In [None]:
# Display value counts for 'INCOME TYPE' without sorting
income_type_counts = new_data['INCOME TYPE'].value_counts(sort=False)
print("Value counts (unsorted):")
print(income_type_counts)

# Display normalized value counts for 'INCOME TYPE' without sorting
income_type_normalized_counts = new_data['INCOME TYPE'].value_counts(normalize=True, sort=False)
print("Normalized value counts (unsorted):")
print(income_type_normalized_counts)

# Consolidate categories for 'INCOME TYPE'
new_data['INCOME TYPE'] = new_data['INCOME TYPE'].replace({'Pensioner': 'State servant', 'Student': 'State servant'})

# Calculate IV for 'INCOME TYPE'
iv, data = calc_iv(new_data, 'INCOME TYPE', 'target')

# Update the IV table
ivtable.loc[ivtable['variable'] == 'INCOME TYPE', 'IV'] = iv

# Display the first few rows of the resulting data
print("First few rows of data after IV calculation:")
print(data.head())


In [None]:
new_data = convert_dummy(new_data,'INCOME TYPE')

In [None]:
# Broader occupation group -> the original 'TYPE' values it absorbs
occupation_groups = {
    'Laborwk': [
        'Cleaning staff', 'Cooking staff', 'Drivers', 'Laborers',
        'Low-skill Laborers', 'Security staff', 'Waiters/barmen staff',
    ],
    'officewk': [
        'Accountants', 'Core staff', 'HR staff', 'Medicine staff',
        'Private service staff', 'Realty agents', 'Sales staff', 'Secretaries',
    ],
    'hightecwk': [
        'Managers', 'High skill tech staff', 'IT staff',
    ],
}

# Relabel group by group, in the order listed above
for group_label, occupations in occupation_groups.items():
    new_data.loc[new_data['TYPE'].isin(occupations), 'TYPE'] = group_label

# Print the value counts for the TYPE column
print(new_data['TYPE'].value_counts())

# Score the grouped column and record its IV
iv, data = calc_iv(new_data, 'TYPE', 'target')
ivtable.loc[ivtable['variable'] == 'TYPE', 'IV'] = iv

# Display the first few rows of the IV DataFrame
print(data.head())


In [None]:
new_data = convert_dummy(new_data,'TYPE')

House Type

In [None]:
iv, data=calc_iv(new_data,'HOUSING TYPE','target')
ivtable.loc[ivtable['variable']=='HOUSING TYPE','IV']=iv
data.head()

Education

In [None]:
# Consolidate categories for 'EDUCATION TYPE'
new_data.loc[new_data['EDUCATION TYPE'] == 'Academic degree', 'EDUCATION TYPE'] = 'Higher education'

# Calculate IV for 'EDUCATION TYPE'
iv, data = calc_iv(new_data, 'EDUCATION TYPE', 'target')

# Update the IV table
ivtable.loc[ivtable['variable'] == 'EDUCATION TYPE', 'IV'] = iv

# Display the first few rows of the resulting data
print("First few rows of data after IV calculation:")
print(data.head())


In [None]:
new_data = convert_dummy(new_data,'EDUCATION TYPE')

Marriage Condition

In [None]:
# Display normalized value counts for 'FAMILY STATUS' without sorting
family_status_normalized_counts = new_data['FAMILY STATUS'].value_counts(normalize=True, sort=False)

# Print the results
print("Normalized value counts (unsorted):")
print(family_status_normalized_counts)


In [None]:
iv, data=calc_iv(new_data,'FAMILY STATUS','target')
ivtable.loc[ivtable['variable']=='FAMILY STATUS','IV']=iv
data.head()

In [None]:
new_data = convert_dummy(new_data,'FAMILY STATUS')

In [None]:
new_data.head()

In [None]:
new_data.columns

In [None]:
ivtable['IV'] = ivtable['IV'].fillna(0)


In [None]:
# Rename variables in ivtable.
# `iv` must not be written to the table here: at this point it still holds the
# IV of the last variable scored ('FAMILY STATUS'), so assigning it to
# 'DAYS SINCE BIRTH' would overwrite the value stored by the age cell.
# NOTE(review): 'DAYS EMPLOYEED' looks like a misspelling, and no row is named
# 'inc', so the second rename matches nothing - confirm the intended labels.
ivtable.loc[ivtable['variable'] == 'DAYS EMPLOYED', 'variable'] = 'DAYS EMPLOYEED'
ivtable.loc[ivtable['variable'] == 'inc', 'variable'] = 'incgp'

# Sort ivtable by IV
ivtable = ivtable.sort_values(by='IV', ascending=False)

# Verify ivtable
print(ivtable)


In [None]:
from sklearn.linear_model import LogisticRegression
import seaborn as sns
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.utils.multiclass import unique_labels

# Target: make sure it is an integer
Y = new_data['target'].astype('int')

# Features: drop the identifiers, the target and the columns excluded from this model
X = new_data.drop(columns=['_id', 'ID','target','CHILDREN_1', 'CHILDREN_2More','DAYS SINCE BIRTH', 'DAYS EMPLOYED'])

# Keep numeric AND boolean columns. pd.get_dummies can emit bool columns, which
# select_dtypes(include=['number']) would silently discard, removing every
# dummy variable from the model. Everything is cast to float for the estimator.
# NOTE(review): 'begin_month' is numeric and therefore still part of X - confirm
# it is meant to be a predictor.
X = X.select_dtypes(include=['number', 'bool']).astype(float)

# Split FIRST, on the original data, so the test set contains only real,
# untouched observations.
X_train, X_test, y_train, y_test = train_test_split(X, Y,
                                                    stratify=Y,
                                                    test_size=0.3,
                                                    random_state=10086)

# Apply SMOTE to the training split only. Resampling before the split would
# place synthetic points, interpolated from training rows, into the test set
# and inflate every reported metric.
sm = SMOTE(random_state=42)
X_balance, Y_balance = sm.fit_resample(X_train, y_train)

# Convert X_balance back to a DataFrame
X_balance = pd.DataFrame(X_balance, columns=X.columns)
X_train, y_train = X_balance, Y_balance

# Train a Logistic Regression Model
model = LogisticRegression(C=0.8, random_state=0, solver='lbfgs', max_iter=1000)
model.fit(X_train, y_train)

# Make predictions on the (unbalanced, real) test set
y_predict = model.predict(X_test)

# Evaluate the model
print('Accuracy Score is {:.5}'.format(accuracy_score(y_test, y_predict)))
print("Confusion Matrix:\n", pd.DataFrame(confusion_matrix(y_test, y_predict)))

# Plot the Confusion Matrix
sns.set_style('white') 

# NOTE: this definition replaces any earlier plot_confusion_matrix in the notebook.
def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion Matrix', cmap='coolwarm'):
    """
    Plot a confusion matrix as an annotated seaborn heatmap.

    cm: 2-D confusion matrix array.
    classes: tick labels used for both axes.
    normalize: if True, divide each row by its sum before plotting.
    title: figure title.
    cmap: colormap name passed to seaborn.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    
    sns.heatmap(cm, annot=True, fmt=".2f" if normalize else "d", cmap=cmap,
                xticklabels=classes, yticklabels=classes)
    plt.title(title)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()

cm = confusion_matrix(y_test, y_predict)
class_names = ['0', '1']  # Replace with your actual class labels if necessary

plot_confusion_matrix(cm, class_names, normalize=True, 
                      title='Normalized Confusion Matrix: Logistic Regression')

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "iframe_connected"  
selected_columns = ['GENDER', 'OWN CAR', 'OWN REALTY', 'INCOME TOTAL',
         'FAM MEMBERS', 'CHILDREN_1', 'CHILDREN_2More', 'gp_inc_medium', 'gp_inc_high',
       'gp_inc_NULL', 'FAM MEMBERS GP_1',
       'FAM MEMBERS GP_3more', 'INCOME TYPE_Commercial associate',
        'INCOME TYPE_State servant',
        'TYPE_Laborwk', 'TYPE_hightecwk',
       'TYPE_officewk', 
       'EDUCATION TYPE_Higher education', 'EDUCATION TYPE_Incomplete higher',
       'EDUCATION TYPE_Lower secondary','DAYS SINCE BIRTH','FAMILY STATUS_Civil marriage','FAMILY STATUS_Separated' ,'FAMILY STATUS_Single / not married',
                    'FAMILY STATUS_Widow' ,'DAYS EMPLOYED','Age']
X = new_data[selected_columns].fillna(0)  
Y = new_data['target']

# Standardise the features so that no single column dominates the components
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Project onto the first three principal components
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X_scaled)

pca_df = pd.DataFrame(X_pca, columns=['PCA1', 'PCA2', 'PCA3'])
# Assign the target by position, not by index label: pca_df has a fresh
# 0..n-1 index while Y keeps new_data's index, so a label-aligned assignment
# would attach targets to the wrong rows (or NaN) once rows have been dropped.
pca_df['target'] = Y.to_numpy()

explained_variance = pca.explained_variance_ratio_
print(f"Explained Variance Ratio: {explained_variance}")
print(f"Total Explained Variance: {np.sum(explained_variance)}")

fig = px.scatter_3d(
    pca_df,
    x='PCA1',
    y='PCA2',
    z='PCA3',
    color='target',
    title="PCA 3D Visualization"
)
fig.update_traces(marker=dict(opacity=0.5))  

fig.show()

# Running Neural Networks with only values high on the IV chart

In [None]:
# Manually specify the selected columns
selected_columns = [
    'GENDER',
    'OWN REALTY',
    'gp_Age_low',
    'gp_Age_medium',
    'gp_Age_high',
    'gp_Age_highest',
    'gp_worktm_high',
    'gp_worktm_highest',
    'gp_worktm_low',
    'gp_worktm_medium',
    'FAMILY STATUS_Civil marriage',
    'FAMILY STATUS_Separated',
    'FAMILY STATUS_Single / not married',
    'FAMILY STATUS_Widow'
]

# we select the high iv columns
Y = new_data['target']
X = new_data[selected_columns]

Y = Y.astype('int')

# Split first, on the original data, so the test set holds only real observations
X_train, X_test, y_train, y_test = train_test_split(X, Y,
                                                    stratify=Y, test_size=0.3,
                                                    random_state = 10086)

# Balance the training split only. Resampling before the split would leak
# synthetic neighbours of training rows into the test set and inflate the
# reported accuracy. A fixed random_state makes the resampling reproducible.
X_balance, Y_balance = SMOTE(random_state=42).fit_resample(X_train, y_train)
X_balance = pd.DataFrame(X_balance, columns = X.columns)
X_train, y_train = X_balance, Y_balance

# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler on the training data only
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = X_train_scaled.shape[1]  # One input per feature column
hidden_nodes_layer1 = 80
hidden_nodes_layer2 = 30
hidden_nodes_layer3 = 10

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation='relu'))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation='relu'))

# Third hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation='sigmoid'))

# Output layer: a single sigmoid unit for the binary target
nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# Check the structure of the model
nn.summary()

# Compile the model
nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=15)

# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
# Predict probabilities
y_pred_prob = nn.predict(X_test_scaled)

# Convert probabilities to class predictions (e.g., threshold of 0.5)
y_pred = (y_pred_prob > 0.5).astype(int)

# Print the classification report
print(classification_report(y_test, y_pred))

Performance Interpretation for High-Risk vs. No-Risk
Current Accuracy (65.31%):
Suggests that ~35% of samples are misclassified, but it doesn't reveal if high-risk cases are disproportionately affected.
If high-risk cases are a minority, the model may overfit to predicting the majority (no-risk), inflating accuracy but underperforming where it matters.
Loss (0.6021):
Indicates moderate uncertainty in predictions; may suggest overlap between high- and no-risk feature distributions.

# All columns except target and objects

In [None]:
# Select all columns with object data type
object_columns = new_data.select_dtypes(include=['object']).columns

# Print the list of object columns
print("Object columns in the DataFrame:")
print(object_columns.tolist())

In [None]:
# Select all non-object columns, excluding the 'target' and the 'ID' column.
# 'ID' is an identifier, not a property of the customer, so it must not be
# offered to the model as a feature.
# NOTE(review): 'begin_month' is still included here although it is excluded
# from the IV analysis - confirm it is meant to be a predictor.
selected_columns = (
    new_data.select_dtypes(exclude=['object'])
    .drop(columns=['target', 'ID'], errors='ignore')
    .columns.tolist()
)

# View the resulting list of columns
print(selected_columns)

# Use every selected column as a feature
Y = new_data['target']
X = new_data[selected_columns]

Y = Y.astype('int')

# Split first, on the original data, so the test set holds only real observations
X_train, X_test, y_train, y_test = train_test_split(X, Y,
                                                    stratify=Y, test_size=0.3,
                                                    random_state = 10086)

# Balance the training split only. Resampling before the split would leak
# synthetic neighbours of training rows into the test set and inflate the
# reported accuracy. A fixed random_state makes the resampling reproducible.
X_balance, Y_balance = SMOTE(random_state=42).fit_resample(X_train, y_train)
X_balance = pd.DataFrame(X_balance, columns = X.columns)
X_train, y_train = X_balance, Y_balance

# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler on the training data only
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = X_train_scaled.shape[1]  # One input per feature column
hidden_nodes_layer1 = 80
hidden_nodes_layer2 = 30
hidden_nodes_layer3 = 10

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation='relu'))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation='relu'))

# Third hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation='sigmoid'))

# Output layer: a single sigmoid unit for the binary target
nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# Check the structure of the model
nn.summary()

# Compile the model
nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=15)

# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
# Predict probabilities
y_pred_prob = nn.predict(X_test_scaled)

# Convert probabilities to class predictions (e.g., threshold of 0.5)
y_pred = (y_pred_prob > 0.5).astype(int)

# Print the classification report
print(classification_report(y_test, y_pred))

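Why Such a Large Improvement?
The improvement is most plausibly explained by the inclusion of more features, which give the model richer information to make decisions. Caveat: a jump of this size should first be checked for leakage (confirm that SMOTE is fitted on the training split only, and that identifier-like or outcome-derived columns such as `ID` and `begin_month` are not among the features) before the gain is attributed to the features themselves: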

Feature Completeness:
By excluding only object columns and the target, the model now has access to more relevant numerical and categorical data (potentially after encoding), which contributes to its ability to capture patterns in the data.
Complex Relationships:
More features allow the model to learn complex interactions between variables that were missing when only 5 high-IV features were used.
Diminished Feature Selection Bias:
Relying only on high-IV features may exclude some important interactions or complementary features. Including all numeric columns mitigates this bias.

Accuracy: 97.72%
What is it?
Accuracy measures the proportion of correct predictions (both high-risk and no-risk) relative to the total predictions.
Interpretation:
An accuracy of 97.72% indicates that the model predicts the correct class for most of the samples.
This is a substantial improvement over the previous accuracy of 65.31%, highlighting that the additional features provide more comprehensive information for the model to learn from.