In [73]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction import DictVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression

from sklearn import set_config
set_config(display='diagram')

%matplotlib inline

In [2]:
df = pd.read_csv('AER_credit_card_data.csv')
df.head()

Unnamed: 0,card,reports,age,income,share,expenditure,owner,selfemp,dependents,months,majorcards,active
0,yes,0,37.66667,4.52,0.03327,124.9833,yes,no,3,54,1,12
1,yes,0,33.25,2.42,0.005217,9.854167,no,no,3,34,1,13
2,yes,0,33.66667,4.5,0.004156,15.0,yes,no,4,58,1,5
3,yes,0,30.5,2.54,0.065214,137.8692,no,no,0,25,1,7
4,yes,0,32.16667,9.7867,0.067051,546.5033,yes,no,2,64,1,5


In [3]:
# Convert target value 'card' to 1s and 0s
display('Before', df.card.value_counts())
mapping = {'yes': 1, 'no': 0}
df.card.replace(mapping, inplace=True)
display('After', df.card.value_counts())

'Before'

yes    1023
no      296
Name: card, dtype: int64

'After'

1    1023
0     296
Name: card, dtype: int64

#### EDA

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1319 entries, 0 to 1318
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   card         1319 non-null   int64  
 1   reports      1319 non-null   int64  
 2   age          1319 non-null   float64
 3   income       1319 non-null   float64
 4   share        1319 non-null   float64
 5   expenditure  1319 non-null   float64
 6   owner        1319 non-null   object 
 7   selfemp      1319 non-null   object 
 8   dependents   1319 non-null   int64  
 9   months       1319 non-null   int64  
 10  majorcards   1319 non-null   int64  
 11  active       1319 non-null   int64  
dtypes: float64(4), int64(6), object(2)
memory usage: 123.8+ KB


In [5]:
df.describe()

Unnamed: 0,card,reports,age,income,share,expenditure,dependents,months,majorcards,active
count,1319.0,1319.0,1319.0,1319.0,1319.0,1319.0,1319.0,1319.0,1319.0,1319.0
mean,0.775588,0.456406,33.213103,3.365376,0.068732,185.057071,0.993935,55.267627,0.817286,6.996967
std,0.417353,1.345267,10.142783,1.693902,0.094656,272.218917,1.247745,66.271746,0.386579,6.305812
min,0.0,0.0,0.166667,0.21,0.000109,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,25.41667,2.24375,0.002316,4.583333,0.0,12.0,1.0,2.0
50%,1.0,0.0,31.25,2.9,0.038827,101.2983,1.0,30.0,1.0,6.0
75%,1.0,0.0,39.41667,4.0,0.093617,249.0358,2.0,72.0,1.0,11.0
max,1.0,14.0,83.5,13.5,0.90632,3099.505,6.0,540.0,1.0,46.0


In [6]:
df.isnull().sum()

card           0
reports        0
age            0
income         0
share          0
expenditure    0
owner          0
selfemp        0
dependents     0
months         0
majorcards     0
active         0
dtype: int64

In [7]:
df.nunique()

card              2
reports          13
age             418
income          431
share          1162
expenditure     981
owner             2
selfemp           2
dependents        7
months          193
majorcards        2
active           35
dtype: int64

In [49]:
# Splitting dataset 60%-20%-20%

target = df.card
data = df.drop(columns='card')

dfTrainFull, dfTest, yTrainFull, yTest  = train_test_split(data, target, test_size=0.2, random_state=1)
dfTrain, dfVal, yTrain, yVal  = train_test_split(dfTrainFull, yTrainFull, test_size=0.25, random_state=1)
len(dfTrain) ,len(dfVal), len(dfTest), len(yTrain), len(yVal), len(yTest)

(791, 264, 264, 791, 264, 264)

In [59]:
numericalCols = dfTrain.select_dtypes(exclude=object).columns
categoricalCols = dfTrain.select_dtypes(include=object).columns
allCols = list(dfTrain.columns)
print(f'Numerical columns: {list(numericalCols)}\n')
print(f'Categorical columns: {list(categoricalCols)}')

Numerical columns: ['reports', 'age', 'income', 'share', 'expenditure', 'dependents', 'months', 'majorcards', 'active']

Categorical columns: ['owner', 'selfemp']


#### Variable with hights correlation

In [51]:
res = {}

for col in numericalCols:
    auc = roc_auc_score(yTrain, dfTrain[col])
    res[col] = auc
    if auc < 0.5:
        dfTrain[col] = -dfTrain[col]
    print(auc)

0.28333701393106236
0.4759979020592945
0.5908049467233478
0.989183643423692
0.991042345276873
0.46722427722262094
0.470578221903237
0.5343859842838476
0.6043173411362006


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfTrain[col] = -dfTrain[col]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfTrain[col] = -dfTrain[col]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfTrain[col] = -dfTrain[col]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value ins

In [54]:
res.get('reports'), res.get('dependents'), res.get('active'), res.get('share')

(0.28333701393106236,
 0.46722427722262094,
 0.6043173411362006,
 0.989183643423692)

The one with the highest correlation is the feature 'share'

#### Training the model

In [83]:
preprocessDV = DictVectorizer(sparse=False)
preprocessDV

In [84]:
preprocessor = ColumnTransformer([('DictVectorizer', preprocessDV, allCols)])
preprocessor

In [85]:
model = make_pipeline(preprocessor, LogisticRegression(solver='liblinear', C=1.0, max_iter=1000))
model