### BASICS

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [334]:
df = pd.read_csv('bank-full.csv',sep=';')
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [335]:
df = df.drop(columns=['default','loan'])
df.head(2)

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no


In [20]:
df.isnull().sum().any()

False

In [97]:
df.education.mode()

0    secondary
Name: education, dtype: object

### MC

In [336]:
numerical = ['age','balance','day','duration','campaign','pdays','previous']
categorical = list(set(df.columns.values) - set(numerical))
print(sorted(numerical), "\n", sorted(categorical))

['age', 'balance', 'campaign', 'day', 'duration', 'pdays', 'previous'] 
 ['contact', 'education', 'housing', 'job', 'marital', 'month', 'poutcome', 'y']


In [220]:
for col in numerical:
    #drop returns a dataframe without the specified column
    #the original dataframe is not being modified
    corr = df[numerical].drop(col,axis=1).corrwith(df[col])
    idx = corr.idxmax()
    max = corr.max()
    print(f"{col} {idx} {np.round(max,3)}")


age balance 0.098
balance age 0.098
day campaign 0.162
duration balance 0.022
campaign day 0.162
pdays previous 0.455
previous pdays 0.455


### MI

In [7]:
from sklearn.metrics import mutual_info_score

In [221]:
def mi_function (data_series):
    return mutual_info_score(data_series,df.y)

In [222]:
df_mi = df[categorical].drop('y',axis=1).apply(mi_function).round(2)
df_mi = df_mi.sort_values(ascending=False).to_frame(name='MI SCORE')


display(df_mi.head())

Unnamed: 0,MI SCORE
poutcome,0.03
month,0.02
housing,0.01
job,0.01
contact,0.01


### LOGISTIC REGRESSION

In [337]:
#target encoding as per instructions
df.y = (df.y == 'yes').astype(int)

In [100]:
from sklearn.model_selection import train_test_split

In [338]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [339]:
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [227]:
df_train.size/df.size

0.5999867288934109

In [340]:
y_train = df_train.y.values
y_val = df_val.y.values
del df_train['y']
del df_val['y']

In [229]:
from sklearn.feature_extraction import DictVectorizer

In [341]:
categorical.pop(categorical.index('y'))

'y'

In [342]:
train_dict = df_train[categorical + numerical].to_dict(orient='records')
train_dict[0:2]

[{'marital': 'single',
  'education': 'tertiary',
  'poutcome': 'unknown',
  'housing': 'yes',
  'job': 'technician',
  'contact': 'cellular',
  'month': 'aug',
  'age': 32,
  'balance': 1100,
  'day': 11,
  'duration': 67,
  'campaign': 1,
  'pdays': -1,
  'previous': 0},
 {'marital': 'married',
  'education': 'secondary',
  'poutcome': 'unknown',
  'housing': 'yes',
  'job': 'entrepreneur',
  'contact': 'cellular',
  'month': 'nov',
  'age': 38,
  'balance': 0,
  'day': 17,
  'duration': 258,
  'campaign': 1,
  'pdays': -1,
  'previous': 0}]

In [343]:
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)

In [344]:
X_train = dv.transform(train_dict)

In [345]:
X_train.shape, y_train.shape

((27126, 47), (27126,))

In [346]:
dv.get_feature_names_out()

array(['age', 'balance', 'campaign', 'contact=cellular',
       'contact=telephone', 'contact=unknown', 'day', 'duration',
       'education=primary', 'education=secondary', 'education=tertiary',
       'education=unknown', 'housing=no', 'housing=yes', 'job=admin.',
       'job=blue-collar', 'job=entrepreneur', 'job=housemaid',
       'job=management', 'job=retired', 'job=self-employed',
       'job=services', 'job=student', 'job=technician', 'job=unemployed',
       'job=unknown', 'marital=divorced', 'marital=married',
       'marital=single', 'month=apr', 'month=aug', 'month=dec',
       'month=feb', 'month=jan', 'month=jul', 'month=jun', 'month=mar',
       'month=may', 'month=nov', 'month=oct', 'month=sep', 'pdays',
       'poutcome=failure', 'poutcome=other', 'poutcome=success',
       'poutcome=unknown', 'previous'], dtype=object)

In [236]:
from sklearn.linear_model import LogisticRegression

In [347]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [348]:
val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [349]:
y_pred = model.predict_proba(X_val)[:, 1]
y_pred

array([0.01315207, 0.01039464, 0.14683511, ..., 0.05419486, 0.00999035,
       0.2869727 ])

In [350]:
churn = y_pred > 0.5

In [351]:
full_precision = (y_val == churn).mean()
full_precision

0.9011280690112807

### FEATURE ELIMINATION

In [300]:
total_features = categorical + numerical
total_features.append('y')

In [364]:
results = []
for f in total_features:
    if f != 'y':
        #reload data
        df = pd.read_csv('bank-full.csv',sep=';')
        df = df.drop(columns=['default','loan'])

        train_feat = total_features.copy()
        train_feat.remove(f)

        df.y = (df.y == 'yes').astype(int)

        df_full_train, df_test = train_test_split(df[train_feat], test_size=0.2, random_state=42)
        df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)
        y_train = df_train.y.values
        y_val = df_val.y.values
        del df_train['y']
        del df_val['y']

        train_feat.pop(train_feat.index('y'))
        
        train_dict = df_train[train_feat].to_dict(orient='records')
        dv.fit(train_dict)
        X_train = dv.transform(train_dict)
        model.fit(X_train, y_train)

        val_dict = df_val[train_feat].to_dict(orient='records')
        X_val = dv.transform(val_dict)

        y_pred = model.predict_proba(X_val)[:, 1]
        churn = y_pred > 0.6

        precision = (y_val == churn).mean()
        difference = float(full_precision - precision)

        results.append(
            {
                'Feature': f,
                'Precision': precision,
                'Difference': difference
            })

results_df = pd.DataFrame(data = results)

In [365]:
results_df.sort_values(by='Difference')

Unnamed: 0,Feature,Precision,Difference
0,marital,0.89969,0.001438
13,previous,0.899359,0.00177
9,day,0.899248,0.00188
12,pdays,0.899248,0.00188
8,balance,0.899137,0.001991
7,age,0.899027,0.002101
1,education,0.898916,0.002212
5,contact,0.898916,0.002212
6,month,0.898584,0.002544
3,housing,0.898253,0.002875


### REGULARIZED

In [379]:
total_features = categorical + numerical
total_features.append('y')
df = pd.read_csv('bank-full.csv',sep=';')
df = df.drop(columns=['default','loan'])
numerical = ['age','balance','day','duration','campaign','pdays','previous']
categorical = list(set(df.columns.values) - set(numerical))

In [380]:
df.y = (df.y == 'yes').astype(int)
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

y_train = df_train.y.values
y_val = df_val.y.values
del df_train['y']
del df_val['y']

categorical.pop(categorical.index('y'))
train_dict = df_train[categorical + numerical].to_dict(orient='records')

dv = DictVectorizer(sparse=False)
dv.fit(train_dict)
X_train = dv.transform(train_dict)

c_factors = [0.01, 0.1, 1, 10, 100]
results = []

for factor in c_factors:
    model = LogisticRegression(solver='liblinear', C=factor, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    val_dict = df_val[categorical + numerical].to_dict(orient='records')
    X_val = dv.transform(val_dict)
    y_pred = model.predict_proba(X_val)[:, 1]
    churn = y_pred > 0.5
    full_precision = (y_val == churn).mean()
    
    results.append({
        'C-value': int(factor),
        'Accuracy': full_precision
    })
    
results_df = pd.DataFrame(data = results)
results_df = results_df.sort_values(by='Accuracy',ascending=False)

In [384]:
results_df

Unnamed: 0,C-value,Accuracy
3,10.0,0.90157
1,0.1,0.90146
2,1.0,0.901128
4,100.0,0.900796
0,0.01,0.898363
