In [None]:
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.linear_model import LinearRegression
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

In [None]:
import pandas as pd
# Load the previously saved pipe-delimited temp file so the data is not downloaded again.
df_cleaned = pd.read_csv('temp.csv', delimiter='|')

In [None]:
# Column groups used by the frontend-facing model.
# Categorical columns are deliberately left un-encoded here so that it stays
# easy for the user to enter raw values.

# Date-valued columns.
date_columns = [
    'activation_date',
    'date_of_birth',
    'approvedon_date',
    'expected_disbursedon_date',
    'disbursedon_date',
    'expected_maturedon_date',
    'maturedon_date',
    'transaction_date',
    'submitted_on_date',
    'validatedon_date',
    'created_date',
]

# Numeric and enum-coded columns.
# Every entry needs its own trailing comma: two adjacent string literals are
# silently concatenated by Python into a single (non-existent) column name.
numerical_columns = [
    'gender_cv_id',
    'legal_form_enum',
    'interest_period_frequency_enum',
    'interest_method_enum',
    'interest_calculated_in_period_enum',
    'transaction_type_enum',
    'principal_amount',
    'nominal_interest_rate_per_period',
    'annual_nominal_interest_rate',
    'amount',
    'term_frequency',
    'number_of_repayments',
    'principal_disbursed_derived',
    'total_expected_repayment_derived',
    'total_expected_costofloan_derived',
    'principal_portion_derived',
    'outstanding_loan_balance_derived',
    'principal_repaid_derived',
    'principal_outstanding_derived',
    'interest_charged_derived',
    'interest_repaid_derived',
    'interest_outstanding_derived',
    'total_repayment_derived',
    'total_costofloan_derived',
    'total_outstanding_derived',
]

# Remaining columns that are not encoded.
other_columns_not_encoded = [
    'office_joining_date',
    'has_email_address',
    'status_enum',
    'has_mobile_no',
    'validatedon_userid',
    'loan_transaction_strategy_id',
    'is_reversed',
    'submittedon_date_client',
    'submittedon_date_loan',
    'principal_amount_proposed',
    'manually_adjusted_or_reversed',
]

In [None]:
# Sanity check: (rows, columns) of the frame as loaded, before any columns are dropped.
print(df_cleaned.shape)

In [None]:
# These columns are redundant and can be dropped.
# NOTE: df_cleaned is rebound here, so re-running this cell without reloading
# the data fails (the columns to drop are already gone).
df_cleaned = df_cleaned.drop(other_columns_not_encoded, axis=1)

# Parse every date column; values that cannot be parsed become NaT.
parsed_dates = {
    name: pd.to_datetime(df_cleaned[name], errors='coerce')
    for name in date_columns
}
df_cleaned = df_cleaned.assign(**parsed_dates)

# The earliest date across all date columns is the reference point.
min_date = df_cleaned[date_columns].min().min()
reference_date = pd.to_datetime(min_date)

# Express each date as whole days since the reference date.
# Missing dates are filled with the reference date first, so they end up as 0.
day_offsets = {
    name: (df_cleaned[name].fillna(reference_date) - reference_date).dt.days
    for name in date_columns
}
df_cleaned = df_cleaned.assign(**day_offsets)

# Work on an independent copy from here on.
df_encoded = df_cleaned.copy()

In [None]:
# Percentage of missing entries per column, keeping only columns that have any.
missing_values = df_encoded.isna().mean().mul(100).loc[lambda pct: pct > 0]

print(f"Number of columns with missing values: {len(missing_values)}")
print(missing_values.index)

# Drop every column that contains at least one missing value.
df_encoded = df_encoded.drop(missing_values.index, axis=1)
print(df_encoded.shape)

In [None]:
# Use the correlation matrix to see which features relate most strongly to the target.
# This cell only ranks and prints the correlations; it does not drop anything.
target_variable = 'interest_repaid_derived'
correlation_matrix = df_encoded.corr()
target_correlation = correlation_matrix[target_variable].sort_values(ascending=False)

# The 10 highest-ranked features, a blank separator, then the 10 lowest-ranked features.
print(target_correlation.head(10))
print("\n")
print(target_correlation.tail(10))

In [None]:
# Because the target is interest_repaid_derived, the following columns will not be
# available at prediction time. They are also highly correlated with the target
# (which is understandable), so they are listed here for removal.
# Every name shares the "_derived" suffix, so only the stems are spelled out.
highly_corr_future = [
    f"{stem}_derived"
    for stem in (
        'principal_disbursed',
        'total_expected_repayment',
        'total_expected_costofloan',
        'outstanding_loan_balance',
        'principal_repaid',
        'principal_outstanding',
        'interest_charged',
        'interest_outstanding',
        'total_repayment',
        'total_costofloan',
        'total_outstanding',
    )
]

In [None]:
# Remove the columns listed in highly_corr_future, then report the new (rows, columns).
df_encoded = df_encoded.drop(highly_corr_future, axis=1)
print(df_encoded.shape)

In [None]:
# Show the columns that are still present in df_encoded at this point.
print(df_encoded.columns)

In [None]:
# Earlier one-hot column list, kept commented out for reference (not used):
# categorical_columns = ['gender_cv_id_16', 'gender_cv_id_17', 'gender_cv_id_750143','legal_form_enum_1', 'legal_form_enum_2', 'has_email_address_0','has_email_address_1', 'interest_period_frequency_enum_2','interest_period_frequency_enum_3', 'interest_method_enum_0','interest_method_enum_1', 'interest_calculated_in_period_enum_0','interest_calculated_in_period_enum_1','transaction_type_enum_1', 'transaction_type_enum_2']
# Rebinds numerical_columns (previously a hand-written list) to the pandas Index
# of all columns currently in df_encoded, which still includes the target column.
numerical_columns = df_encoded.columns

In [None]:
# Target: the interest repaid; features: every other remaining column.
y = df_encoded['interest_repaid_derived']
X = df_encoded.drop(columns=['interest_repaid_derived'])

In [None]:
# Per-column means of the features. The mean is currently what is used to fill in
# a value when the user doesn't input the data.
print(X.mean())

In [None]:

# The target is grouped into bins to make the prediction more general:
# nearby values (e.g. 3 percent vs 3.1 percent) are not meaningfully different.
num_quantiles = 16

# Quantile-based binning: integer bin labels plus the bin edges.
# Duplicate edges are dropped, so there may be fewer bins than num_quantiles.
y_binned, bin_edges = pd.qcut(
    y,
    num_quantiles,
    labels=False,
    retbins=True,
    duplicates='drop',
)

# Keep only the rows that were assigned a bin, in both the features and the labels.
mask = y_binned.notna()
X_filtered = X.loc[mask]
y_binned_filtered = y_binned.loc[mask]

# Distribution of rows across the bins.
print(y_binned.value_counts())

In [None]:
# Pair each bin edge with the next one to get the (lower, upper) range of every bin.
bin_ranges = list(zip(bin_edges[:-1], bin_edges[1:]))
for bin_number, bin_range in enumerate(bin_ranges):
    print(f"Bin {bin_number}: {bin_range}")

In [None]:
# Split the data into train and test sets: 20% is held out for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_filtered,
    y_binned_filtered,
    test_size=0.2,
    random_state=42,
)

In [None]:
# The dataset is large, so the models are fitted on a random 10% sample of the training rows.
X_random = X_train.sample(frac=0.1, random_state=42)
# Select the labels that belong to the sampled rows by index label.
y_random = y_train.loc[X_random.index]

In [None]:
# Print the first row of X_test (note: the test set, not X_random).
print(X_test.iloc[0])

In [None]:
# The same first test row as a plain array reshaped to a single row (shape (1, n_columns)).
print(X_test.iloc[0].values.reshape(1, -1))

LogisticRegression

In [None]:
# Logistic regression classifier: at most 100 solver iterations, fixed seed.
# NOTE(review): the features are passed in unscaled; check the output for a
# convergence warning and consider scaling or a higher max_iter - TODO confirm.
logistic_model = LogisticRegression(random_state=42, max_iter=100)

# Train on the sampled training rows.
logistic_model.fit(X_random, y_random)

# Predict bins for the rows the model was fitted on and for the held-out test set.
y_pred_train = logistic_model.predict(X_random)
y_pred_test = logistic_model.predict(X_test)

# Accuracy on each set; "training" accuracy is measured on the sample used for fitting.
train_accuracy, test_accuracy = (
    accuracy_score(actual, predicted)
    for actual, predicted in ((y_random, y_pred_train), (y_test, y_pred_test))
)

print(f"Training Accuracy: {train_accuracy:.2f}")
print(f"Test Accuracy: {test_accuracy:.2f}")

In [None]:
# Save the fitted logistic regression model to disk.
# pickle is already imported in the notebook's first (imports) cell, so it is
# not re-imported here.
with open('logistic_model.pkl', 'wb') as model_file:
    pickle.dump(logistic_model, model_file)


Decision Tree

In [None]:
# Decision tree classifier with a fixed seed.
decision_tree_model = DecisionTreeClassifier(random_state=42)

# Train on the sampled training rows.
decision_tree_model.fit(X_random, y_random)

# Predict bins for the rows the model was fitted on and for the held-out test set.
y_pred_train = decision_tree_model.predict(X_random)
y_pred_test = decision_tree_model.predict(X_test)

# Accuracy on each set; "training" accuracy is measured on the sample used for fitting.
train_accuracy = accuracy_score(y_true=y_random, y_pred=y_pred_train)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_test)

for label, accuracy in (("Training", train_accuracy), ("Test", test_accuracy)):
    print(f"{label} Accuracy: {accuracy:.2f}")


In [None]:
# Save the fitted decision tree model to disk.
with open('decision_tree_model_low.pkl', mode='wb') as model_file:
    pickle.dump(decision_tree_model, model_file)

In [None]:
# Acts as a section label: this is a bare expression, so the cell only echoes the XGBClassifier class.
XGBClassifier

In [None]:
# XGBoost classifier with a fixed seed.
xgb_model = XGBClassifier(random_state=42)

# Train on the sampled training rows.
xgb_model.fit(X_random, y_random)

# Predict bins for the rows the model was fitted on and for the held-out test set.
y_pred_train = xgb_model.predict(X_random)
y_pred_test = xgb_model.predict(X_test)

# Accuracy on each set; "training" accuracy is measured on the sample used for fitting.
train_accuracy, test_accuracy = (
    accuracy_score(actual, predicted)
    for actual, predicted in ((y_random, y_pred_train), (y_test, y_pred_test))
)

print(f"Training Accuracy: {train_accuracy:.2f}")
print(f"Test Accuracy: {test_accuracy:.2f}")


In [None]:
# Save the fitted XGBoost model to disk.
with open('xgb_model.pkl', mode='wb') as model_file:
    pickle.dump(xgb_model, model_file)