In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline 

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import auc,roc_auc_score

### Dataset preparation
```
wget https://archive.ics.uci.edu/static/public/222/bank+marketing.zip
unzip bank+marketing.zip 
unzip bank.zip
```

In [5]:
!wget https://archive.ics.uci.edu/static/public/222/bank+marketing.zip

--2024-10-22 18:07:09--  https://archive.ics.uci.edu/static/public/222/bank+marketing.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘bank+marketing.zip’

bank+marketing.zip      [       <=>          ] 999.85K   576KB/s    in 1.7s    

2024-10-22 18:07:14 (576 KB/s) - ‘bank+marketing.zip’ saved [1023843]



In [9]:
!unzip bank+marketing.zip

Archive:  bank+marketing.zip
 extracting: bank.zip                
 extracting: bank-additional.zip     


In [10]:
!unzip bank.zip

Archive:  bank.zip
  inflating: bank-full.csv           
  inflating: bank-names.txt          
  inflating: bank.csv                


In [11]:
!ls

04-Evaluation-metrics  bank-additional.zip  bank.csv
Untitled.ipynb	       bank-full.csv	    bank.zip
bank+marketing.zip     bank-names.txt	    homework.md


In [3]:
df = pd.read_csv('bank-full.csv',sep=';')

In [4]:
cols = ['age','job','marital','education','balance','housing','contact','day','month','duration','campaign','pdays','previous','poutcome','y']

In [5]:
df = df[cols]
df.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 15 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   balance    45211 non-null  int64 
 5   housing    45211 non-null  object
 6   contact    45211 non-null  object
 7   day        45211 non-null  int64 
 8   month      45211 non-null  object
 9   duration   45211 non-null  int64 
 10  campaign   45211 non-null  int64 
 11  pdays      45211 non-null  int64 
 12  previous   45211 non-null  int64 
 13  poutcome   45211 non-null  object
 14  y          45211 non-null  object
dtypes: int64(7), object(8)
memory usage: 5.2+ MB


In [7]:
df.y.value_counts()

y
no     39922
yes     5289
Name: count, dtype: int64

In [8]:
df.y = (df.y == 'yes').astype(int)

In [9]:
# Splitting the dataset
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

In [10]:
df_train.shape, df_val.shape, df_test.shape

((27126, 15), (9042, 15), (9043, 15))

In [11]:
y_train = df_train.y.values
y_val = df_val.y.values
t_test = df_test.y.values

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
fd_test = df_test.reset_index(drop=True)

df_train.drop(columns='y',inplace=True)
df_val.drop(columns='y',inplace=True)
df_test.drop(columns='y',inplace=True)

In [12]:
df_train.columns

Index(['age', 'job', 'marital', 'education', 'balance', 'housing', 'contact',
       'day', 'month', 'duration', 'campaign', 'pdays', 'previous',
       'poutcome'],
      dtype='object')

In [14]:
categorical_features = list(df_train.dtypes[df.dtypes == 'object'].index)
categorical_features

['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']

In [15]:
numerical_features = list(df_train.dtypes[df.dtypes == 'int'].index)
numerical_features

['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

In [16]:
def train_model(df,y,c=1.0):
    dicts = df[categorical_features+numerical_features].to_dict(orient='records')

    dv = DictVectorizer(sparse=False)
    x_train = dv.fit_transform(dicts)

    model = LogisticRegression(C=c,max_iter=1000)
    model.fit(X_train,y_train)

    return dv,model

In [17]:
def predict(df,dv,model):
    dicts = df[categorical_features+numerical_features].to_dict(orient='records')

    X = dv.transform(dicts)
    y_pred = model.predict_proba(X)[:,1]
    return y_pred

## Question 1: ROC AUC feature importance
ROC AUC could also be used to evaluate feature importance of numerical variables.

Let's do that

- For each numerical variable, use it as score (aka prediction) and compute the AUC with the y variable as ground truth.
- Use the training dataset for that

If your AUC is `< 0.5`, invert this variable by putting "-" in front

`(e.g. -df_train['engine_hp'])`

AUC can go below `0.5` if the variable is negatively correlated with the target variable. You can change the direction of the correlation by negating this variable - then negative correlation becomes positive.

Which numerical variable (among the following 4) has the highest AUC?

- balance
- day
- duration
- previous

In [22]:
scores = []
for numerical in numerical_features:
    dicts = df_train[[numerical]].to_dict(orient='records')
    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(dicts)
    
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train,y_train)

    X_val = dv.transform(df_val[[numerical]].to_dict(orient='records'))
    y_pred = model.predict_proba(X_val)[:,1]

    score = roc_auc_score(y_val,y_pred)

    scores.append((score,numerical))

    print(f'score: {score}, numerical: {numerical}')

score: 0.5019686989252637, numerical: age
score: 0.5995068851725284, numerical: balance
score: 0.5352198426324892, numerical: day
score: 0.7965344730967409, numerical: duration
score: 0.5762433459535511, numerical: campaign
score: 0.596249681284474, numerical: pdays
score: 0.607255799705406, numerical: previous


In [25]:
max(scores, key=lambda x: x[0])

(np.float64(0.7965344730967409), 'duration')