In [108]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import make_union
from sklearn.compose import make_column_transformer
from imblearn.over_sampling import SMOTE, SMOTENC
pd.set_option('display.max_columns', None)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score

In [2]:
data = pd.read_csv("dataset.csv", sep=";", error_bad_lines=False)



  data = pd.read_csv("dataset.csv", sep=";", error_bad_lines=False)


In [3]:
data.shape

(99976, 43)

In [4]:
df = data.dropna()

In [5]:
df.shape


(9111, 43)

## Defining data to predict

In [6]:
pred_df = data[data["default"].isna()]

In [7]:
pred_df.shape

(10000, 43)

### Cleaning data

In [8]:
df["default"].value_counts()

0.0    9020
1.0      91
Name: default, dtype: int64

In [9]:
(df==0).sum().sort_values(ascending=False)/len(df)

num_arch_written_off_12_24m            1.000000
num_arch_written_off_0_12m             0.999671
account_days_in_dc_12_24m              0.995610
recovery_debt                          0.993524
default                                0.990012
account_days_in_term_12_24m            0.971683
num_arch_dc_0_12m                      0.909340
num_arch_dc_12_24m                     0.907914
account_days_in_rem_12_24m             0.717265
num_arch_rem_0_12m                     0.485457
account_incoming_debt_vs_paid_0_24m    0.286467
sum_capital_paid_account_12_24m        0.185380
sum_capital_paid_account_0_12m         0.180880
account_amount_added_12_24m            0.081111
num_arch_ok_12_24m                     0.073318
status_3rd_last_archived_0_24m         0.026122
num_arch_ok_0_12m                      0.018549
status_max_archived_0_6_months         0.015805
avg_payment_span_0_3m                  0.010317
status_2nd_last_archived_0_24m         0.009439
status_max_archived_0_12_months        0

In [10]:
X = df.drop(columns=["default", "num_arch_written_off_12_24m", "num_arch_written_off_0_12m",
                     "account_days_in_dc_12_24m", "recovery_debt", "account_days_in_term_12_24m",
                     "num_arch_dc_0_12m", "num_arch_dc_12_24m", "account_days_in_rem_12_24m",
                     "num_arch_rem_0_12m"])
y = df["default"]

### split X and y

In [12]:
X.drop(columns="name_in_email", inplace=True)

In [13]:
X.drop(columns="uuid", inplace=True)

In [14]:
X.drop(columns="merchant_category", inplace=True)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

### splitting num and cat features

In [25]:
num_features = X_train.select_dtypes(include="number").columns.tolist()
cat_features = X_train.select_dtypes(exclude="number").columns.tolist()


pandas.core.frame.DataFrame

### Pre-processing pipelines

In [18]:
from sklearn import set_config; set_config(display='diagram')

In [43]:
num_transformer = make_pipeline(RobustScaler())
cat_transformer = make_pipeline(OneHotEncoder(handle_unknown='ignore'))

preprocessor = ColumnTransformer([
    ('num_transformer', num_transformer, num_features),
    ('cat_transformer', cat_transformer, cat_features)
])
pipeline = make_pipeline(preprocessor, LogisticRegression())
pipeline

In [44]:
pipeline.fit(X_train, y_train)

In [45]:
train_auc = roc_auc_score(y_train, pipeline.predict_proba(X_train)[:, 1])
test_auc = roc_auc_score(y_test, pipeline.predict_proba(X_test)[:, 1])

print(f"Train AUC: {train_auc}")
print(f"Test AUC: {test_auc}")

Train AUC: 0.942627663015851
Test AUC: 0.9086192584394023


In [110]:
import pickle

# Save the model to a file
with open('model_pipeline.pkl', 'wb') as file:
    pickle.dump(pipeline, file)

In [73]:
new_data = data[data["default"].isna()]

In [74]:
new_data = new_data.drop(columns=["default", "num_arch_written_off_12_24m", "num_arch_written_off_0_12m",
                     "account_days_in_dc_12_24m", "recovery_debt", "account_days_in_term_12_24m",
                     "num_arch_dc_0_12m", "num_arch_dc_12_24m", "account_days_in_rem_12_24m",
                     "num_arch_rem_0_12m",  "name_in_email"])

In [75]:
new_data = new_data.drop(columns="merchant_category")

In [76]:
new_data = new_data.dropna()

In [105]:
new_data.columns

Index(['uuid', 'account_amount_added_12_24m',
       'account_incoming_debt_vs_paid_0_24m', 'account_status',
       'account_worst_status_0_3m', 'account_worst_status_12_24m',
       'account_worst_status_3_6m', 'account_worst_status_6_12m', 'age',
       'avg_payment_span_0_12m', 'avg_payment_span_0_3m', 'merchant_group',
       'has_paid', 'max_paid_inv_0_12m', 'max_paid_inv_0_24m',
       'num_active_div_by_paid_inv_0_12m', 'num_active_inv',
       'num_arch_ok_0_12m', 'num_arch_ok_12_24m', 'num_unpaid_bills',
       'status_last_archived_0_24m', 'status_2nd_last_archived_0_24m',
       'status_3rd_last_archived_0_24m', 'status_max_archived_0_6_months',
       'status_max_archived_0_12_months', 'status_max_archived_0_24_months',
       'sum_capital_paid_account_0_12m', 'sum_capital_paid_account_12_24m',
       'sum_paid_inv_0_12m', 'time_hours', 'worst_status_active_inv'],
      dtype='object')

In [78]:
new_data_pred = new_data.drop(columns="uuid")

In [83]:
predictions = pipeline.predict_proba(new_data_pred)

In [84]:
predictions

array([[9.99900902e-01, 9.90975824e-05],
       [9.89982214e-01, 1.00177857e-02],
       [9.90379146e-01, 9.62085377e-03],
       ...,
       [9.96699342e-01, 3.30065842e-03],
       [9.99929184e-01, 7.08156773e-05],
       [9.88115927e-01, 1.18840732e-02]])

In [85]:
results_df = pd.DataFrame({'uuid': new_data["uuid"],
                          'pd': predictions[:,1]})

In [100]:
results_df_1 = pd.DataFrame(results_df[results_df['pd']>0.5])

In [104]:
results_df_1

Unnamed: 0,uuid,pd
90032,88efa4ef-b563-4712-9834-ed261b1197d6,0.500027
90771,03871b77-6646-432e-9744-c4d07ef8915c,0.70256
93863,3f0ffbcc-c8cb-4591-a67a-6f91a99d0d65,0.785952
