In [1]:
import pandas as pd
import decimal
from decimal import Decimal
from sklearn.metrics import mutual_info_score, accuracy_score
# Set the precision (number of decimal places)
decimal.getcontext().prec = 100

In [2]:
cols = [
    "age",
    "job",
    "marital",
    "education",
    "balance",
    "housing",
    "contact",
    "day",
    "month",
    "duration",
    "campaign",
    "pdays",
    "previous",
    "poutcome",
    "y"
]

In [3]:
df = pd.read_csv("./bank/bank-full.csv", delimiter=';')[cols]

In [4]:
df.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no


In [5]:
df.dtypes

age           int64
job          object
marital      object
education    object
balance       int64
housing      object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

In [6]:
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
numeric_cols = df.select_dtypes(include=['int64']).columns.tolist()

categorical_cols.remove('y')

In [7]:
display(categorical_cols)
display(numeric_cols)

['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']

['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

In [8]:
df.isnull().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

# Question 1:

In [9]:
df['education'].value_counts()

education
secondary    23202
tertiary     13301
primary       6851
unknown       1857
Name: count, dtype: int64

# Question 2:

In [10]:
df[numeric_cols].corr()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288
balance,0.097783,1.0,0.004503,0.02156,-0.014578,0.003435,0.016674
day,-0.00912,0.004503,1.0,-0.030206,0.16249,-0.093044,-0.05171
duration,-0.004648,0.02156,-0.030206,1.0,-0.08457,-0.001565,0.001203
campaign,0.00476,-0.014578,0.16249,-0.08457,1.0,-0.088628,-0.032855
pdays,-0.023758,0.003435,-0.093044,-0.001565,-0.088628,1.0,0.45482
previous,0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,1.0


# Target encoding

In [11]:
df.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no


In [12]:
df.y = (df.y == 'yes').astype('int')

# Split the data

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [15]:
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=42)

In [16]:
y_train = df_train.y.values
y_val = df_val.y.values

In [17]:
del df_train['y']
del df_val['y']

# Question 3: Mutual Information

In [18]:
from sklearn.metrics import mutual_info_score

In [19]:
mi = df_train[categorical_cols].apply(lambda col: mutual_info_score(col, y_train))
round(mi,2).sort_values(ascending=False)

month        0.03
poutcome     0.03
job          0.01
housing      0.01
contact      0.01
marital      0.00
education    0.00
dtype: float64

# One hot encoding

In [20]:
from sklearn.feature_extraction import DictVectorizer

In [21]:
dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical_cols + numeric_cols].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical_cols + numeric_cols].to_dict(orient='records')
X_val = dv.transform(val_dict)

# Question 4: Train model

In [22]:
from sklearn.linear_model import LogisticRegression

In [23]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

In [24]:
model.fit(X_train, y_train)

In [25]:
model.predict_proba(X_val)

array([[0.98684793, 0.01315207],
       [0.98960536, 0.01039464],
       [0.85316489, 0.14683511],
       ...,
       [0.94580514, 0.05419486],
       [0.99000965, 0.00999035],
       [0.7130273 , 0.2869727 ]])

In [26]:
# 1 is just get the second value, which means get the proability of positive
y_pred = model.predict_proba(X_val)[:, 1] 


In [27]:
df_pred = pd.DataFrame()
df_pred['probability'] = y_pred
df_pred['prediction'] = y_pred.astype(int)
df_pred['actual'] = y_val

In [28]:
df_pred['correct'] = df_pred.prediction == df_pred.actual
origin_accuracy = round(df_pred.correct.mean(),2)

In [29]:
display(origin_accuracy)

0.88

# Question 5

In [30]:
accuracies = pd.DataFrame(columns=['eliminated_feature', 'accuracy', 'difference'])
for feature in categorical_cols+numeric_cols:
    subset = categorical_cols+numeric_cols
    subset.remove(feature)
    dv = DictVectorizer(sparse=False)
    train_dict = df_train[subset].to_dict(orient='records')
    X_train = dv.fit_transform(train_dict)

    model = LogisticRegression(solver='liblinear', max_iter=1000, C=10, random_state=42)
    model.fit(X_train, y_train)
    
    val_dict = df_val[subset].to_dict(orient='records')
    X_val = dv.transform(val_dict)
    
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    
    accuracies.loc[len(accuracies)] = [feature, accuracy, abs(origin_accuracy - accuracy)]

In [31]:
display(accuracies.sort_values(by='difference', ascending=False))

Unnamed: 0,eliminated_feature,accuracy,difference
0,job,0.90157,0.02157
12,pdays,0.901349,0.021349
9,day,0.901128,0.021128
3,housing,0.901017,0.021017
7,age,0.901017,0.021017
2,education,0.900796,0.020796
4,contact,0.900796,0.020796
8,balance,0.900796,0.020796
1,marital,0.900686,0.020686
13,previous,0.900575,0.020575


# Question 6:

In [32]:
accuracies = pd.DataFrame(columns=['C', 'accuracy'])
for C in [0.01, 0.1, 1, 10, 100]:
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    accuracies.loc[len(accuracies)] = [C, round(accuracy,3)]


In [33]:
display(accuracies.sort_values(by="accuracy"))

Unnamed: 0,C,accuracy
0,0.01,0.899
1,0.1,0.901
2,1.0,0.901
3,10.0,0.901
4,100.0,0.901
