In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import balanced_accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier
from pathlib import Path

c:\Users\admin\anaconda3\lib\site-packages\numpy\.libs\libopenblas.EL2C6PLE4ZYW3ECEVIV3OXXGRN2NRFM2.gfortran-win_amd64.dll
c:\Users\admin\anaconda3\lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll


In [2]:
p = Path.cwd() / 'Backdata'

In [3]:
df = pd.read_csv(p / 'bank.csv', sep=';')
df = df[['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y']]
df

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,261,1,999,0,nonexistent,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,149,1,999,0,nonexistent,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,226,1,999,0,nonexistent,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,151,1,999,0,nonexistent,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,307,1,999,0,nonexistent,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,retired,married,professional.course,no,yes,no,cellular,nov,fri,334,1,999,0,nonexistent,yes
41184,46,blue-collar,married,professional.course,no,no,no,cellular,nov,fri,383,1,999,0,nonexistent,no
41185,56,retired,married,university.degree,no,yes,no,cellular,nov,fri,189,2,999,0,nonexistent,no
41186,44,technician,married,professional.course,no,no,no,cellular,nov,fri,442,1,999,0,nonexistent,yes


In [4]:
y = df['y'].map({'no':0, 'yes':1})
X = df.drop('y', axis=1)
X.shape, y.shape

((41188, 15), (41188,))

In [5]:
df.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y'],
      dtype='object')

In [6]:
num_features = ['age', 'campaign', 'pdays', 'previous']
cat_features = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']

In [9]:
preprocessor = ColumnTransformer([('numerical', 'passthrough', num_features),
                                  ('categorical', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), cat_features)])

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=42)

In [11]:
preprocessor.fit_transform(X_train)
preprocessor

0,1,2
,transformers,"[('numerical', ...), ('categorical', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [12]:
ohe_categories = preprocessor.named_transformers_['categorical'].categories_
ohe_categories

[array(['admin.', 'blue-collar', 'entrepreneur', 'housemaid', 'management',
        'retired', 'self-employed', 'services', 'student', 'technician',
        'unemployed', 'unknown'], dtype=object),
 array(['divorced', 'married', 'single', 'unknown'], dtype=object),
 array(['basic.4y', 'basic.6y', 'basic.9y', 'high.school', 'illiterate',
        'professional.course', 'university.degree', 'unknown'],
       dtype=object),
 array(['no', 'unknown', 'yes'], dtype=object),
 array(['no', 'unknown', 'yes'], dtype=object),
 array(['no', 'unknown', 'yes'], dtype=object),
 array(['cellular', 'telephone'], dtype=object),
 array(['apr', 'aug', 'dec', 'jul', 'jun', 'mar', 'may', 'nov', 'oct',
        'sep'], dtype=object),
 array(['failure', 'nonexistent', 'success'], dtype=object)]

In [13]:
new_ohe_features = [f'{col}_{val}' for col, vals in zip(cat_features, ohe_categories) for val in vals]
new_ohe_features

['job_admin.',
 'job_blue-collar',
 'job_entrepreneur',
 'job_housemaid',
 'job_management',
 'job_retired',
 'job_self-employed',
 'job_services',
 'job_student',
 'job_technician',
 'job_unemployed',
 'job_unknown',
 'marital_divorced',
 'marital_married',
 'marital_single',
 'marital_unknown',
 'education_basic.4y',
 'education_basic.6y',
 'education_basic.9y',
 'education_high.school',
 'education_illiterate',
 'education_professional.course',
 'education_university.degree',
 'education_unknown',
 'default_no',
 'default_unknown',
 'default_yes',
 'housing_no',
 'housing_unknown',
 'housing_yes',
 'loan_no',
 'loan_unknown',
 'loan_yes',
 'contact_cellular',
 'contact_telephone',
 'month_apr',
 'month_aug',
 'month_dec',
 'month_jul',
 'month_jun',
 'month_mar',
 'month_may',
 'month_nov',
 'month_oct',
 'month_sep',
 'poutcome_failure',
 'poutcome_nonexistent',
 'poutcome_success']

In [14]:
all_features = num_features + new_ohe_features
all_features

['age',
 'campaign',
 'pdays',
 'previous',
 'job_admin.',
 'job_blue-collar',
 'job_entrepreneur',
 'job_housemaid',
 'job_management',
 'job_retired',
 'job_self-employed',
 'job_services',
 'job_student',
 'job_technician',
 'job_unemployed',
 'job_unknown',
 'marital_divorced',
 'marital_married',
 'marital_single',
 'marital_unknown',
 'education_basic.4y',
 'education_basic.6y',
 'education_basic.9y',
 'education_high.school',
 'education_illiterate',
 'education_professional.course',
 'education_university.degree',
 'education_unknown',
 'default_no',
 'default_unknown',
 'default_yes',
 'housing_no',
 'housing_unknown',
 'housing_yes',
 'loan_no',
 'loan_unknown',
 'loan_yes',
 'contact_cellular',
 'contact_telephone',
 'month_apr',
 'month_aug',
 'month_dec',
 'month_jul',
 'month_jun',
 'month_mar',
 'month_may',
 'month_nov',
 'month_oct',
 'month_sep',
 'poutcome_failure',
 'poutcome_nonexistent',
 'poutcome_success']

In [15]:
X_train = pd.DataFrame(preprocessor.transform(X_train), columns=all_features)
X_test = pd.DataFrame(preprocessor.transform(X_test), columns=all_features)

In [16]:
lr_model = LogisticRegression(solver='liblinear', class_weight='balanced', random_state=42)
dt_model = DecisionTreeClassifier(max_depth=10, min_samples_split=0.05, class_weight='balanced')
rf_model = RandomForestClassifier(max_depth=5, min_samples_split=0.01, n_jobs=-1, max_features=0.7, class_weight='balanced')
xgb_model = XGBClassifier(max_depth=3, min_samples_leaf=0.05, verbosity=0, class_weight='balanced')
models = [lr_model, dt_model, rf_model, xgb_model]

In [17]:
for model in models:
    model.fit(X_train, y_train)

In [18]:
y_pred_lr = lr_model.predict(X_test)
y_pred_dt = dt_model.predict(X_test)
y_pred_rf = rf_model.predict(X_test)
y_pred_xgb = xgb_model.predict(X_test)

In [19]:
print(balanced_accuracy_score(y_test, y_pred_lr))
print(balanced_accuracy_score(y_test, y_pred_dt))
print(balanced_accuracy_score(y_test, y_pred_rf))
print(balanced_accuracy_score(y_test, y_pred_xgb))

0.7029409799204365
0.7111616900168247
0.6729232510967499
0.5905820046543075


In [20]:
lr_model.coef_

array([[ 0.00453509, -0.06833395, -0.00176782,  0.17977909, -0.01953132,
        -0.17135492, -0.08123806, -0.12451101, -0.10175439,  0.62601621,
        -0.14338662, -0.19073744,  0.57823038, -0.13177161,  0.10877631,
         0.20906248, -0.03270567,  0.01332526,  0.19576035,  0.38142007,
        -0.12718598,  0.04848842, -0.17550549, -0.08315685,  0.9265999 ,
        -0.15085902,  0.03226914,  0.0871499 ,  0.57521147,  0.11558593,
        -0.13299739,  0.21331074,  0.19448467,  0.15000461,  0.16739932,
         0.19448467,  0.19591603,  0.71962627, -0.16182625, -0.02460317,
        -0.78546079,  1.09856393, -0.67596258, -0.15059014,  1.17063956,
        -0.75451394, -0.87252237,  0.92479582,  0.62745368, -0.04645888,
         0.16583389,  0.43842501]])