In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

df = pd.read_csv('dataset.csv', sep=";")

## Clean the dataset

In [13]:
na_columns = df.isna().sum().sort_values(ascending=False)

In [17]:
na_columns = na_columns[na_columns > 49000].keys().tolist()
na_columns

['worst_status_active_inv',
 'account_worst_status_12_24m',
 'account_worst_status_6_12m',
 'account_incoming_debt_vs_paid_0_24m',
 'account_worst_status_3_6m',
 'account_status',
 'account_worst_status_0_3m',
 'avg_payment_span_0_3m']

In [18]:
df.drop(columns=['worst_status_active_inv',
 'account_worst_status_12_24m',
 'account_worst_status_6_12m',
 'account_incoming_debt_vs_paid_0_24m',
 'account_worst_status_3_6m',
 'account_status',
 'account_worst_status_0_3m',
 'avg_payment_span_0_3m'], inplace=True)

In [19]:
df=df.dropna()

In [20]:
df.shape

(60210, 35)

### Define X and y

In [21]:
y = df['default']
X = df.drop(columns=['default'])

In [34]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

### categorical and numerical columns

In [22]:
cat_cols=X.select_dtypes(include=['object']).columns
num_cols=X.select_dtypes(include=['number']).columns

### Let's preprocess the data

In [25]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

In [28]:
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

num_pipe

In [31]:
cat_pipe = Pipeline([
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])
cat_pipe

In [32]:
from sklearn.compose import ColumnTransformer

preproc_pipe = ColumnTransformer([
    ('num', num_pipe, num_cols),
    ('cat', cat_pipe, cat_cols)
], remainder='drop')

preproc_pipe

In [33]:
from sklearn.linear_model import LogisticRegression

final_pipe = Pipeline([
    ('preprocessor', preproc_pipe),
    ('model', LogisticRegression())
])
final_pipe

In [35]:
final_pipe.fit(X_train, y_train)

In [36]:
final_pipe.score(X_test, y_test)

0.9885954714056359

In [38]:
final_pipe.predict_proba(X_test)[0]

array([0.81102979, 0.18897021])

In [None]:
import pickle