# Machine Learning for Classification

In [28]:
# For data manipulation and analysis
import pandas as pd
import numpy as np

# For data visualization
import seaborn as sns
import matplotlib.pyplot as plt

# For machine learning
from sklearn.metrics import mutual_info_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [29]:
# Load the raw dataset: the file is ';'-delimited and is decoded as UTF-8
df_full = pd.read_csv(
    'bank-full.csv',
    sep=';',
    encoding='utf-8',
)

# Preview the first 10 rows
df_full.head(10)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
5,35,management,married,tertiary,no,231,yes,no,unknown,5,may,139,1,-1,0,unknown,no
6,28,management,single,tertiary,no,447,yes,yes,unknown,5,may,217,1,-1,0,unknown,no
7,42,entrepreneur,divorced,tertiary,yes,2,yes,no,unknown,5,may,380,1,-1,0,unknown,no
8,58,retired,married,primary,no,121,yes,no,unknown,5,may,50,1,-1,0,unknown,no
9,43,technician,single,secondary,no,593,yes,no,unknown,5,may,55,1,-1,0,unknown,no


In [30]:
# Columns kept for the analysis; 'y' is included here and converted/split off in later cells
selected_features = [
    'age', 'job', 'marital', 'education', 'balance',
    'housing', 'contact', 'day', 'month', 'duration',
    'campaign', 'pdays', 'previous', 'poutcome', 'y',
]

# Work on an independent copy so that df_full itself is never modified
df = df_full.loc[:, selected_features].copy()

# Number of missing entries per column
missing_values = df.isna().sum()

print(missing_values)

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64


In [31]:
# Most frequent value of 'education'; NaN entries are excluded from the count
education_mode = df['education'].mode(dropna=True)[0]

# Report the most common education level
print("Most frequent education level:", education_mode)

Most frequent education level: secondary


In [32]:
# Absolute frequency of every education level (NaN would be listed as its own row)
education_counts = df['education'].value_counts(dropna=False)
print("Counts of each education level:\n", education_counts)

# Same distribution expressed as a fraction of all rows
education_percentage = df['education'].value_counts(dropna=False, normalize=True)
print("\nPercentage of each education level:\n", education_percentage)

Counts of each education level:
 education
secondary    23202
tertiary     13301
primary       6851
unknown       1857
Name: count, dtype: int64

Percentage of each education level:
 education
secondary    0.513194
tertiary     0.294198
primary      0.151534
unknown      0.041074
Name: proportion, dtype: float64


In [33]:
# Names of all columns whose dtype is numeric
numeric_frame = df.select_dtypes(include=['number'])
numerical_features = numeric_frame.columns.tolist()

print("Numerical features:", numerical_features)

Numerical features: ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']


In [34]:
# Numeric feature columns used for the correlation analysis and, in later cells,
# for model training. The target 'y' is excluded explicitly: it becomes numeric
# once it is encoded further down, and re-running this cell afterwards would
# otherwise add it to the feature list.
numerical = [col for col in df.select_dtypes(include=['number']).columns if col != 'y']

# Pairwise Pearson correlations computed in a single vectorized call
# (replaces the manual nested loop over column pairs)
correlation_matrix = df[numerical].corr()

# Kept for backward compatibility: {column: [correlation with each numerical column]}
correlations = correlation_matrix.to_dict(orient='list')

# Display the correlation matrix
print(correlation_matrix)

               age   balance       day  duration  campaign     pdays  previous
age       1.000000  0.097783 -0.009120 -0.004648  0.004760 -0.023758  0.001288
balance   0.097783  1.000000  0.004503  0.021560 -0.014578  0.003435  0.016674
day      -0.009120  0.004503  1.000000 -0.030206  0.162490 -0.093044 -0.051710
duration -0.004648  0.021560 -0.030206  1.000000 -0.084570 -0.001565  0.001203
campaign  0.004760 -0.014578  0.162490 -0.084570  1.000000 -0.088628 -0.032855
pdays    -0.023758  0.003435 -0.093044 -0.001565 -0.088628  1.000000  0.454820
previous  0.001288  0.016674 -0.051710  0.001203 -0.032855  0.454820  1.000000


In [35]:
# Encode the target as 1 ('yes') / 0 ('no').
# The mapping is applied to the untouched df_full['y'] rather than to df['y'] so the
# cell is idempotent: mapping an already-encoded 0/1 column again would turn every
# value into NaN.
df['y'] = df_full['y'].map({'yes': 1, 'no': 0})

In [36]:
# Inspect the encoded target column
df['y']

0        0
1        0
2        0
3        0
4        0
        ..
45206    1
45207    1
45208    1
45209    0
45210    0
Name: y, Length: 45211, dtype: int64

In [37]:
# Keep a handle on the target column of the full (unsplit) frame
target_column = df.loc[:, 'y']

In [38]:
# Hold out 20% of the rows as the test set
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

# Split the remaining rows again: 25% of them become the validation set
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

# Pull the target 'y' out of every split ...
y_full_train, y_test, y_train, y_val = (
    part['y'] for part in (df_full_train, df_test, df_train, df_val)
)

# ... and remove it from the feature frames
df_full_train, df_test, df_train, df_val = (
    part.drop(columns='y') for part in (df_full_train, df_test, df_train, df_val)
)

In [39]:
# Names of the object-dtype columns in the training split
object_frame = df_train.select_dtypes(include=['object'])
categorical_features = object_frame.columns.tolist()

print("Categorical features:", categorical_features)

Categorical features: ['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']


In [40]:
# Object-dtype columns of the training split
categorical = df_train.select_dtypes(include=['object']).columns.tolist()

# Replace NaN with the placeholder 'missing' before scoring
df_train_filled = df_train[categorical].fillna('missing')


def _mi_with_target(column):
    """Mutual information between one feature column and y_train."""
    return mutual_info_score(column, y_train)


# One score per categorical column
mi = df_train_filled.apply(_mi_with_target)

# Highest score first
mi_sorted = mi.sort_values(ascending=False)

print("Mutual Information Scores for Categorical Features (sorted):\n", mi_sorted)

Mutual Information Scores for Categorical Features (sorted):
 poutcome     0.029533
month        0.025090
contact      0.013356
housing      0.010343
job          0.007316
education    0.002697
marital      0.002050
dtype: float64


In [41]:
# One dict per training row, restricted to the categorical and numerical columns
dicts = df_train[categorical + numerical].to_dict(orient='records')

# Fit the vectorizer on the training records, then encode them as a dense matrix
dv = DictVectorizer(sparse=False).fit(dicts)
X_train = dv.transform(dicts)

# Logistic regression with the liblinear solver, trained on the encoded matrix
model = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
)
model.fit(X_train, y_train)

In [42]:
# Convert validation data to dictionary format
dicts = df_val[categorical + numerical].to_dict(orient='records')

# Encode the validation records with the vectorizer that was fitted on the
# training data. Use transform(), not fit_transform(): refitting on validation
# data would relearn the feature mapping, and the resulting columns are not
# guaranteed to line up with the ones the model was trained on.
X_val = dv.transform(dicts)

# Predict on validation data
y_pred = model.predict(X_val)

# Validation accuracy, rounded to 2 decimal places
round((y_val == y_pred).mean(), 2)

np.float64(0.9)

In [43]:
# Feature-elimination study: compare the validation accuracy of the model trained
# on all features against models trained with one feature left out at a time.
all_features = categorical + numerical

# --- Baseline: all features -------------------------------------------------
dicts = df_train[all_features].to_dict(orient='records')
X_train = dv.fit_transform(dicts)
model.fit(X_train, y_train)

# Validation data is only transformed; the vectorizer keeps the feature mapping
# learned from the training data so the columns match what the model was fit on.
dicts = df_val[all_features].to_dict(orient='records')
X_val = dv.transform(dicts)

y_pred = model.predict(X_val)
original_accuracy = (y_val == y_pred).mean()
print('Original:', original_accuracy)

# --- Leave-one-feature-out ---------------------------------------------------
# A fresh vectorizer and a fresh model (same hyperparameters as `model`) are used
# for every reduced feature set, so the global `dv` / `model` keep describing the
# full-feature baseline after this cell has run.
for f in all_features:
    # Remove the feature 'f' from the feature set
    smaller_features = [feature for feature in all_features if feature != f]

    # Fit the encoder on the reduced training data
    dv_small = DictVectorizer(sparse=False)
    X_train_small = dv_small.fit_transform(
        df_train[smaller_features].to_dict(orient='records')
    )

    # Fit the model without the selected feature
    model_small = LogisticRegression(**model.get_params())
    model_small.fit(X_train_small, y_train)

    # Encode the reduced validation data with the training-fitted encoder
    X_val_small = dv_small.transform(
        df_val[smaller_features].to_dict(orient='records')
    )

    # Positive difference: accuracy dropped when the feature was removed
    y_pred_small = model_small.predict(X_val_small)
    print(f'Without {f} difference:', original_accuracy - (y_val == y_pred_small).mean())

Original: 0.9009068790090687
Without job difference: -0.0002211900022119906
Without marital difference: 0.0
Without education difference: 0.0
Without housing difference: -0.0002211900022119906
Without contact difference: 0.00044238000442375913
Without month difference: 0.0011059500110593978
Without poutcome difference: 0.007520460075204571
Without age difference: -0.00044238000442387015
Without balance difference: -0.0001105950011059953
Without day difference: -0.00044238000442387015
Without duration difference: 0.011170095111700862
Without campaign difference: 0.0006635700066356387
Without pdays difference: 0.0
Without previous difference: 0.0


In [44]:
# The encoded matrices do not depend on C, so build them once outside the loop.

# Convert training data to dictionary format and fit the vectorizer on it
dicts = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(dicts)

# Encode validation data with the training-fitted vectorizer (transform only, so
# the columns match the ones the model is trained on)
dicts = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(dicts)

# Iterate over different values of regularization parameter C
for C in [0.01, 0.1, 1, 10, 100]:
    # Train logistic regression model with specified C value
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    # Predict on validation data
    y_pred = model.predict(X_val)

    # Calculate validation accuracy, rounded to 3 decimal places
    accuracy = round((y_val == y_pred).mean(), 3)
    print(f"{C}:", accuracy)

0.01: 0.898
0.1: 0.901
1: 0.901
10: 0.901
100: 0.901
