In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

df = pd.read_csv('dataset.csv', sep=";")

## Cleaning the dataset

In [3]:
na_columns = df.isna().sum().sort_values(ascending=False)
na_columns

worst_status_active_inv                69515
account_worst_status_12_24m            66761
account_worst_status_6_12m             60350
account_incoming_debt_vs_paid_0_24m    59315
account_worst_status_3_6m              57702
account_status                         54373
account_worst_status_0_3m              54373
avg_payment_span_0_3m                  49305
avg_payment_span_0_12m                 23836
num_active_div_by_paid_inv_0_12m       22939
num_arch_written_off_12_24m            18078
num_arch_written_off_0_12m             18078
account_days_in_rem_12_24m             11836
account_days_in_term_12_24m            11836
account_days_in_dc_12_24m              11836
default                                10000
sum_paid_inv_0_12m                         0
sum_capital_paid_account_12_24m            0
sum_capital_paid_account_0_12m             0
recovery_debt                              0
status_max_archived_0_24_months            0
status_max_archived_0_6_months             0
status_3rd

### List of columns i want to drop

In [4]:
na_columns = na_columns[na_columns > 49000].keys().tolist()
na_columns

['worst_status_active_inv',
 'account_worst_status_12_24m',
 'account_worst_status_6_12m',
 'account_incoming_debt_vs_paid_0_24m',
 'account_worst_status_3_6m',
 'account_status',
 'account_worst_status_0_3m',
 'avg_payment_span_0_3m']

In [5]:
df.drop(columns=['worst_status_active_inv',
 'account_worst_status_12_24m',
 'account_worst_status_6_12m',
 'account_incoming_debt_vs_paid_0_24m',
 'account_worst_status_3_6m',
 'account_status',
 'account_worst_status_0_3m',
 'avg_payment_span_0_3m'], inplace=True)

### Drop the NA values in the column default because this is my target variable y

In [6]:
df=df.dropna(subset=['default'])

In [7]:
df.shape

(89976, 35)

### Define X and y

In [8]:
y = df['default']
X = df.drop(columns=['default'])

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [23]:
pd.set_option('display.max_columns', None)
X_test[0:1]


Unnamed: 0,uuid,account_amount_added_12_24m,account_days_in_dc_12_24m,account_days_in_rem_12_24m,account_days_in_term_12_24m,age,avg_payment_span_0_12m,merchant_category,merchant_group,has_paid,max_paid_inv_0_12m,max_paid_inv_0_24m,name_in_email,num_active_div_by_paid_inv_0_12m,num_active_inv,num_arch_dc_0_12m,num_arch_dc_12_24m,num_arch_ok_0_12m,num_arch_ok_12_24m,num_arch_rem_0_12m,num_arch_written_off_0_12m,num_arch_written_off_12_24m,num_unpaid_bills,status_last_archived_0_24m,status_2nd_last_archived_0_24m,status_3rd_last_archived_0_24m,status_max_archived_0_6_months,status_max_archived_0_12_months,status_max_archived_0_24_months,recovery_debt,sum_capital_paid_account_0_12m,sum_capital_paid_account_12_24m,sum_paid_inv_0_12m,time_hours
62034,19689e3e-b3a1-4339-987b-ac1b76d4aee2,0,0.0,0.0,0.0,19,15.666667,Diversified entertainment,Entertainment,True,11270.0,11270.0,F,0.666667,2,0,0,3,1,0,0.0,0.0,3,1,1,1,1,1,1,0,4567,0,24287,21.466944


### categorical and numerical columns

In [56]:
cat_cols=X.select_dtypes(include=['object']).columns
num_cols=X.select_dtypes(include=['number']).columns

### Let's preprocess the data

In [57]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

In [58]:
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

num_pipe

In [59]:
cat_pipe = Pipeline([
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])
cat_pipe

In [60]:
from sklearn.compose import ColumnTransformer

preproc_pipe = ColumnTransformer([
    ('num', num_pipe, num_cols),
    ('cat', cat_pipe, cat_cols)
], remainder='drop')

preproc_pipe

In [61]:
from sklearn.linear_model import LogisticRegression

final_pipe = Pipeline([
    ('preprocessor', preproc_pipe),
    ('model', LogisticRegression())
])
final_pipe

In [62]:
final_pipe.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [63]:
final_pipe.score(X_test, y_test)

0.9855518097284481

In [64]:
X_train.columns

Index(['uuid', 'account_amount_added_12_24m', 'account_days_in_dc_12_24m',
       'account_days_in_rem_12_24m', 'account_days_in_term_12_24m', 'age',
       'avg_payment_span_0_12m', 'merchant_category', 'merchant_group',
       'has_paid', 'max_paid_inv_0_12m', 'max_paid_inv_0_24m', 'name_in_email',
       'num_active_div_by_paid_inv_0_12m', 'num_active_inv',
       'num_arch_dc_0_12m', 'num_arch_dc_12_24m', 'num_arch_ok_0_12m',
       'num_arch_ok_12_24m', 'num_arch_rem_0_12m',
       'num_arch_written_off_0_12m', 'num_arch_written_off_12_24m',
       'num_unpaid_bills', 'status_last_archived_0_24m',
       'status_2nd_last_archived_0_24m', 'status_3rd_last_archived_0_24m',
       'status_max_archived_0_6_months', 'status_max_archived_0_12_months',
       'status_max_archived_0_24_months', 'recovery_debt',
       'sum_capital_paid_account_0_12m', 'sum_capital_paid_account_12_24m',
       'sum_paid_inv_0_12m', 'time_hours'],
      dtype='object')

In [65]:
final_pipe.predict_proba(X_test)[0]

array([0.98769294, 0.01230706])

# Saving the model

In [66]:
import pickle
with open('model.pkl', 'wb') as f:
    pickle.dump(final_pipe, f)

# loading the model

In [70]:
new_pipe= pickle.load(open('model.pkl', 'rb'))

# Make a predict with loaded model

In [71]:
new_pipe.predict_proba(X_test)[0]

array([0.98769294, 0.01230706])