##### Homework #3
##### Nazmul Rabbi
##### 10/11/2024

In [1]:
# import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_selection import mutual_info_classif
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

In [2]:
# read the data
df = pd.read_csv('bank-full.csv', delimiter=";")
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [3]:
# keep required columns
features = ['age', 'job', 'marital', 'education', 'balance', 'housing', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y']
df = df[features]
df.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no


In [4]:
# check for missing values
df.isnull().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [5]:
print('Q1: Most frequent observation for the education column is', df.education.mode()[0])

Q1: Most frequent observation for the education column is secondary


In [6]:
# find the correlation matrix
numerical_features = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
correlation_matrix = df[numerical_features].corr()
correlation_matrix

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288
balance,0.097783,1.0,0.004503,0.02156,-0.014578,0.003435,0.016674
day,-0.00912,0.004503,1.0,-0.030206,0.16249,-0.093044,-0.05171
duration,-0.004648,0.02156,-0.030206,1.0,-0.08457,-0.001565,0.001203
campaign,0.00476,-0.014578,0.16249,-0.08457,1.0,-0.088628,-0.032855
pdays,-0.023758,0.003435,-0.093044,-0.001565,-0.088628,1.0,0.45482
previous,0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,1.0


In [7]:
print("Q2: The two features that have the biggest correlation are pdays and previous")

Q2: The two features that have the biggest correlation are pdays and previous


In [8]:
# convert the target variable to binary
df.y = LabelEncoder().fit_transform(df.y)
df.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,0
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,0
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,0
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,0
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,0


In [9]:
# split the data
train_data, temp_data = train_test_split(df, test_size=0.4, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

# remove the target variable
y_train = train_data.y
X_train = train_data.drop(columns=['y'])
y_val = val_data.y
X_val = val_data.drop(columns=['y'])

In [10]:
categorical_features = ['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']
X_train_encoded  = X_train.copy()

label_encoders = {}

for col in categorical_features:
    le = LabelEncoder()
    X_train_encoded[col] = le.fit_transform(X_train[col])
    label_encoders[col] = le

mutual_info_score = mutual_info_classif(X_train_encoded[categorical_features], y_train)
mutual_info_dicts = {col: round(score, 2) for col, score in zip(categorical_features, mutual_info_score)}
mutual_info_dicts

{'job': np.float64(0.01),
 'marital': np.float64(0.01),
 'education': np.float64(0.0),
 'housing': np.float64(0.01),
 'contact': np.float64(0.01),
 'month': np.float64(0.02),
 'poutcome': np.float64(0.03)}

In [11]:
print("Q3: The variable with the biggest mutual information score is", list(mutual_info_dicts.keys())[-1])

Q3: The variable with the biggest mutual information score is poutcome


In [12]:
# set up the preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_features),
        ('num', 'passthrough', numerical_features)
    ]
)  

# set up the model pipeline
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42))
    ]
)

# fit the model
model.fit(X_train, y_train)

# predict the validation data
y_val_pred = model.predict(X_val)

# calculate the accuracy
accuracy = round(accuracy_score(y_val, y_val_pred), 2)

In [13]:
print("Q4: The accuracy score is", accuracy)

Q4: The accuracy score is 0.9


In [14]:
# check the accuracy difference when dropping a feature
def calculate_accuracy_difference_with_pipeline(feature_to_drop, X_train, y_train, X_val, y_val):
    categorical_features_dropped = [feat for feat in categorical_features if feat != feature_to_drop]
    numerical_features_dropped = [feat for feat in numerical_features if feat != feature_to_drop]

    preprocessor_dropped = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(), categorical_features_dropped),
            ('num', 'passthrough', numerical_features_dropped)
        ]
    )

    model_dropped = Pipeline(steps=[
        ('preprocessor', preprocessor_dropped),
        ('classifier', LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42))
    ])

    model_dropped.fit(X_train, y_train)
    y_val_pred = model_dropped.predict(X_val)
    return accuracy_score(y_val, y_val_pred)

In [15]:
# calculate the accuracy difference for each feature
original_accuracy = accuracy
accuracy_diff = {}
for feature in X_train.columns:
    accuracy_diff[feature] = original_accuracy - calculate_accuracy_difference_with_pipeline(feature, X_train, y_train, X_val, y_val)

accuracy_diff

{'age': -0.0006856890068568378,
 'job': -0.00013271400132708333,
 'marital': -0.0012386640123865922,
 'education': -0.0009068790090687173,
 'balance': -0.000796284007962833,
 'housing': -0.0022340190223402168,
 'contact': 0.0005308560053085554,
 'day': -0.0013492590134925875,
 'month': -0.0009068790090687173,
 'duration': 0.00517584605175847,
 'campaign': -0.0005750940057509535,
 'pdays': -0.0005750940057509535,
 'previous': -0.0005750940057509535,
 'poutcome': 0.01015262110152626}

In [24]:
original_accuracy = accuracy

# Only consider the features provided in the question
features_to_check = ['age', 'balance', 'marital', 'previous']

accuracy_diff = {}
for feature in features_to_check:
    # Update the categorical and numerical features after dropping
    categorical_features_dropped = [feat for feat in categorical_features if feat != feature]
    numerical_features_dropped = [feat for feat in numerical_features if feat != feature]

    # Create a new preprocessor with the updated features
    preprocessor_dropped = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(), categorical_features_dropped),
            ('num', 'passthrough', numerical_features_dropped)
        ]
    )

    # Create a new pipeline with the updated preprocessor
    model_dropped = Pipeline(steps=[
        ('preprocessor', preprocessor_dropped),
        ('classifier', LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42))
    ])

    # Fit the model without the specific feature and calculate accuracy
    model_dropped.fit(X_train, y_train)
    y_val_pred = model_dropped.predict(X_val)
    accuracy_diff[feature] = original_accuracy - round(accuracy_score(y_val, y_val_pred), 3)

In [25]:
print(f"Q5: The feature with the smallest difference is: {min(accuracy_diff, key=accuracy_diff.get)}")

Q5: The feature with the smallest difference is: age


In [18]:
# find the best C value
C_values = [0.01, 0.1, 1, 10, 100]
accuracy_per_C = {}

for C in C_values:
    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42))
    ])
    model.fit(X_train, y_train)
    accuracy = accuracy_score(y_val, model.predict(X_val))
    accuracy_per_C[C] = round(accuracy, 3)

best_C = min([k for k, v in accuracy_per_C.items() if v == max(accuracy_per_C.values())])

In [19]:
print(f"Q6: The value of C that leads to the best accuracy on the validation set is {best_C}")

Q6: The value of C that leads to the best accuracy on the validation set is 0.1
