In [175]:
import pandas as pd
import decimal
from decimal import Decimal
from sklearn.metrics import mutual_info_score, accuracy_score
# Set the precision (number of decimal places)
decimal.getcontext().prec = 100

In [176]:
cols = [
    "age",
    "job",
    "marital",
    "education",
    "balance",
    "housing",
    "contact",
    "day",
    "month",
    "duration",
    "campaign",
    "pdays",
    "previous",
    "poutcome",
    "y"
]

In [177]:
df = pd.read_csv("./bank/bank-full.csv", delimiter=';')[cols]

In [178]:
df.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no


In [179]:
df.dtypes

age           int64
job          object
marital      object
education    object
balance       int64
housing      object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

In [180]:
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
numeric_cols = df.select_dtypes(include=['int64']).columns.tolist()

categorical_cols.remove('y')

In [181]:
display(categorical_cols)
display(numeric_cols)

['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']

['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

In [182]:
df.isnull().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

# Question 1:

In [183]:
df['education'].value_counts()

education
secondary    23202
tertiary     13301
primary       6851
unknown       1857
Name: count, dtype: int64

# Question 2:

In [184]:
df[numeric_cols].corr()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288
balance,0.097783,1.0,0.004503,0.02156,-0.014578,0.003435,0.016674
day,-0.00912,0.004503,1.0,-0.030206,0.16249,-0.093044,-0.05171
duration,-0.004648,0.02156,-0.030206,1.0,-0.08457,-0.001565,0.001203
campaign,0.00476,-0.014578,0.16249,-0.08457,1.0,-0.088628,-0.032855
pdays,-0.023758,0.003435,-0.093044,-0.001565,-0.088628,1.0,0.45482
previous,0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,1.0


# Target encoding

In [185]:
df.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no


In [186]:
df.y = (df.y == 'yes').astype('int')

# Split the data

In [187]:
from sklearn.model_selection import train_test_split

In [188]:
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [189]:
df_train, df_val = train_test_split(df_train_full, test_size=0.33, random_state=42)

In [190]:
y_train = df_train.y.values
y_val = df_val.y.values

In [191]:
del df_train['y']
del df_val['y']

# Question 3: Mutual Information

In [192]:
from sklearn.metrics import mutual_info_score

In [193]:
mi = df_train[categorical_cols].apply(lambda col: mutual_info_score(col, y_train))
round(mi,2).sort_values(ascending=False)

month        0.03
poutcome     0.03
job          0.01
housing      0.01
contact      0.01
marital      0.00
education    0.00
dtype: float64

# One hot encoding

In [194]:
from sklearn.feature_extraction import DictVectorizer

In [195]:
dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical_cols + numeric_cols].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical_cols + numeric_cols].to_dict(orient='records')
X_val = dv.transform(val_dict)

# Train model:

In [196]:
from sklearn.linear_model import LogisticRegression

In [197]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

In [198]:
model.fit(X_train, y_train)

In [199]:
model.predict_proba(X_val)

array([[0.98685297, 0.01314703],
       [0.99094181, 0.00905819],
       [0.84835925, 0.15164075],
       ...,
       [0.99224499, 0.00775501],
       [0.97160001, 0.02839999],
       [0.90432025, 0.09567975]])

In [200]:
# 1 is just get the second value, which means get the proability of positive
y_pred = model.predict_proba(X_val)[:, 1] 


In [201]:
df_pred = pd.DataFrame()
df_pred['probability'] = y_pred
df_pred['prediction'] = y_pred.astype(int)
df_pred['actual'] = y_val

In [202]:
df_pred['correct'] = df_pred.prediction == df_pred.actual
origin_accuracy = round(df_pred.correct.mean(),2)

In [203]:
display(origin_accuracy)

0.88

# Question 5

In [207]:
accuracies = pd.DataFrame(columns=['eliminated_feature', 'accuracy', 'difference'])
for feature in categorical_cols+numeric_cols:
    subset = categorical_cols+numeric_cols
    subset.remove(feature)
    dv = DictVectorizer(sparse=False)
    train_dict = df_train[subset].to_dict(orient='records')
    X_train = dv.fit_transform(train_dict)

    model = LogisticRegression(solver='liblinear', max_iter=1000, C=10, random_state=42)
    model.fit(X_train, y_train)
    
    val_dict = df_val[subset].to_dict(orient='records')
    X_val = dv.transform(val_dict)
    
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    
    accuracies.loc[len(accuracies)] = [feature, accuracy, abs(origin_accuracy - accuracy)]

In [213]:
display(accuracies.sort_values(by='difference', ascending=False))

Unnamed: 0,eliminated_feature,accuracy,difference
10,duration,0.891588,-0.011588
6,poutcome,0.894186,-0.014186
5,month,0.900218,-0.020218
4,contact,0.900469,-0.020469
11,campaign,0.900469,-0.020469
0,job,0.901139,-0.021139
9,day,0.901223,-0.021223
7,age,0.901391,-0.021391
2,education,0.901475,-0.021475
3,housing,0.901475,-0.021475


The least useful features is:

- 'age': -0.001,
- 'balance': 0.0,
- 'previous': 0.005
