In [148]:
# 1. Import libraries and load data
import pandas as pd

# Load the bank marketing data; the file uses ';' as its field separator.
df = pd.read_csv(
    filepath_or_buffer='bank-additional-full.csv',
    sep=';',
)

In [149]:
# Tally how often the literal string 'unknown' appears in each column and
# keep only the columns where it occurs at least once.
unknown_counts = (
    df.eq('unknown')
    .sum()
    .loc[lambda counts: counts > 0]
)
print("Columns with 'unknown' values:")
print(unknown_counts)



Columns with 'unknown' values:
job           330
marital        80
education    1731
default      8597
housing       990
loan          990
dtype: int64


Best practice:

Treat "unknown" as its own category for job, marital, education, default, housing, and loan.
Do not drop these rows or try to impute with the mode or another value unless you have a strong reason.

In [150]:
# Flag whether the client has been contacted before.
# 'pdays' = number of days since the client was last contacted; the value 999
# is the sentinel meaning the client has not been contacted before.
# Resulting column 'prior_contact': 0 = no prior contact, 1 = prior contact.
NO_PRIOR_CONTACT_PDAYS = 999

# Vectorized comparison (no per-row Python lambda); the explicit int64 cast
# keeps the flag a plain 0/1 integer column.
df['prior_contact'] = df['pdays'].ne(NO_PRIOR_CONTACT_PDAYS).astype('int64')

print("New 'prior_contact' column added to df, indicating if the client has been contacted before.")


New 'prior_contact' column added to df, indicating if the client has been contacted before.


In [151]:
# Upper bound applied to 'campaign'; larger values are treated as outliers.
CAMPAIGN_CAP = 6

# Descriptive statistics for the raw 'campaign' column.
campaign_stats = df['campaign'].describe()
print("Descriptive statistics for 'campaign':")
print(campaign_stats)

# Build the takeaway sentence from the computed quartiles and maximum so it
# always agrees with the statistics printed just above.
print(
    f"we see most are between {campaign_stats['25%']:.0f} and "
    f"{campaign_stats['75%']:.0f}, with a few outliers up to "
    f"{campaign_stats['max']:.0f}."
)

# Cap 'campaign' at CAMPAIGN_CAP; values at or below the cap are unchanged.
df['campaign_capped'] = df['campaign'].clip(upper=CAMPAIGN_CAP)

# Descriptive statistics for the capped column, for comparison.
print("\nDescriptive statistics for 'campaign_capped':")
print(df['campaign_capped'].describe())

Descriptive statistics for 'campaign':
count    41188.000000
mean         2.567593
std          2.770014
min          1.000000
25%          1.000000
50%          2.000000
75%          3.000000
max         56.000000
Name: campaign, dtype: float64
we see most are between 1 and 3, with a few outliers up to 50.

Descriptive statistics for 'campaign_capped':
count    41188.000000
mean         2.275274
std          1.550510
min          1.000000
25%          1.000000
50%          2.000000
75%          3.000000
max          6.000000
Name: campaign_capped, dtype: float64


In [152]:
# Outlier handling: the capped copy 'campaign_capped' replaces the original
# 'campaign' column, so the original is dropped here.
# The membership check keeps this cell safe to re-run (a second drop would
# otherwise raise KeyError) and mirrors how the 'duration' column is dropped.
if 'campaign' in df.columns:
    print("\nRemoving the original 'campaign' column.")
    # Rebind instead of mutating in place so the step reads as old -> new.
    df = df.drop(columns=['campaign'])
    print("Original 'campaign' column removed.")
else:
    print("'campaign' column not found in df.")
print("New 'campaign_capped' column added with values capped at 6.")



Removing the original 'campaign' column.
Original 'campaign' column removed.
New 'campaign_capped' column added with values capped at 6.


In [153]:
# Remove the 'duration' column when it is present; otherwise only report that
# it is missing, so the cell can be run more than once without error.
# Author's rationale: the column is not useful for the analysis and can skew
# results.
# NOTE(review): presumably this is because the call duration is only known
# once the contact has finished -- confirm against the dataset description.
if 'duration' not in df.columns:
    print("'duration' column not found in df.")
else:
    df.drop(columns=['duration'], inplace=True)
    print('dropped duration column from df.')

dropped duration column from df.


In [154]:
# Flag whether the client has been contacted before.
# NOTE: this cell repeats the 'prior_contact' computation from an earlier
# cell; recomputing it from 'pdays' yields the same column.
# 'pdays' = number of days since the client was last contacted; the value 999
# is the sentinel meaning the client has not been contacted before.
# Resulting column 'prior_contact': 0 = no prior contact, 1 = prior contact.
NO_PRIOR_CONTACT_PDAYS = 999

# Vectorized comparison (no per-row Python lambda); the explicit int64 cast
# keeps the flag a plain 0/1 integer column.
df['prior_contact'] = df['pdays'].ne(NO_PRIOR_CONTACT_PDAYS).astype('int64')

print("New 'prior_contact' column added to df, indicating if the client has been contacted before.")


New 'prior_contact' column added to df, indicating if the client has been contacted before.


In [155]:
# Drop 'pdays'; the derived 'prior_contact' flag is kept in its place.
# The membership check keeps this cell safe to re-run (a second drop would
# otherwise raise KeyError) and mirrors how the 'duration' column is dropped.
if 'pdays' in df.columns:
    # Rebind instead of mutating in place so the step reads as old -> new.
    df = df.drop(columns=['pdays'])
    print("Dropped 'pdays' column from df, keeping 'prior_contact' column to indicate prior contact status.")
else:
    print("'pdays' column not found in df.")

Dropped 'pdays' column from df, keeping 'prior_contact' column to indicate prior contact status.


In [156]:
# Put the target variable 'y' into binary format: 'yes' -> 1, 'no' -> 0.
target_map = {'yes': 1, 'no': 0}
y_raw = df['y']

if y_raw.isin(list(target_map)).all():
    # Every value is a known text label, so the mapping cannot produce NaN
    # and the integer cast is safe.
    df['y'] = y_raw.map(target_map).astype(int)
elif not y_raw.isin(list(target_map.values())).all():
    # Neither purely 'yes'/'no' nor already 0/1: fail with a message that
    # names the offending values instead of an opaque NaN-to-int cast error.
    raise ValueError(
        "Target column 'y' must contain only 'yes'/'no' "
        f"(or already-encoded 0/1) values; found: {y_raw.unique().tolist()}"
    )
# Otherwise 'y' is already 0/1 (e.g. this cell was re-run); leave it as is.

## One Hot Encoding Notes

One-hot encoding is a data preprocessing step used to convert categorical variables (like "job", "education", "month", etc.) into a format that machine learning algorithms can understand.

Most machine learning models (especially logistic regression, decision trees, etc.) can't work directly with text labels — they need numbers.



## Why Use It in My Code?
One-hot encoding creates a new column for each unique value in a categorical column and assigns a 1 or 0 to indicate presence.

This avoids introducing false ordinal relationships (like if you encoded them as 0, 1, 2), which would wrongly suggest that "technician" > "admin."

I'm building a model to predict whether someone will subscribe to a term deposit (a classification task).

Many of the features are categorical:

['job', 'marital', 'education', 'default', 'housing', 'loan',
 'contact', 'month', 'day_of_week', 'poutcome']

 
To make these usable in my logistic regression or other ML models, I must convert them to numbers — and one-hot encoding is the standard, safe way to do it.


In [157]:
# Categorical features that get expanded into indicator (dummy) columns.
categorical_cols = [
    'job', 'marital', 'education', 'default', 'housing',
    'loan', 'contact', 'month', 'day_of_week', 'poutcome',
]

# Keep a snapshot of the frame as it looks before encoding so the column sets
# can be compared afterwards.
df_beforeEncoding = df.copy()

# Expand each categorical feature into indicator columns. drop_first=True
# omits the first level of every feature. The result is a new frame; df
# itself is left unchanged.
df_afterEncoding = pd.get_dummies(
    data=df,
    columns=categorical_cols,
    drop_first=True,
)

# Report which features were encoded.
print(
    "All categorical variables in df have been converted to dummy variables.",
    "Converted categorical columns to dummy variables:",
    categorical_cols,
    sep="\n",
)
print()

# Columns present after encoding but not before are the new indicator columns.
new_columns = set(df_afterEncoding.columns).difference(df_beforeEncoding.columns)
print("New columns created by one-hot encoding:")
print(sorted(new_columns))  # sorted for readability

All categorical variables in df have been converted to dummy variables.
Converted categorical columns to dummy variables:
['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']

New columns created by one-hot encoding:
['contact_telephone', 'day_of_week_mon', 'day_of_week_thu', 'day_of_week_tue', 'day_of_week_wed', 'default_unknown', 'default_yes', 'education_basic.6y', 'education_basic.9y', 'education_high.school', 'education_illiterate', 'education_professional.course', 'education_university.degree', 'education_unknown', 'housing_unknown', 'housing_yes', 'job_blue-collar', 'job_entrepreneur', 'job_housemaid', 'job_management', 'job_retired', 'job_self-employed', 'job_services', 'job_student', 'job_technician', 'job_unemployed', 'job_unknown', 'loan_unknown', 'loan_yes', 'marital_married', 'marital_single', 'marital_unknown', 'month_aug', 'month_dec', 'month_jul', 'month_jun', 'month_mar', 'month_may', 'month_nov', 'month_oct', 'mo

## 📊 DataFrames Overview

### 1. `df_beforeEncoding`
This is the **original** DataFrame, with some basic cleaning applied. It includes the following **categorical columns** that require encoding:

```python
['job', 'marital', 'education', 'default', 'housing', 'loan', 
 'contact', 'month', 'day_of_week', 'poutcome']
```



## 📊 DataFrames Overview

### 2. `df_afterEncoding`
This is the **updated** DataFrame, with one-hot encoding applied. It includes the following **indicator (dummy) columns** created by the encoding:

```python
['contact_telephone', 'day_of_week_mon', 'day_of_week_thu', 'day_of_week_tue', 'day_of_week_wed', 'default_unknown', 'default_yes', 'education_basic.6y', 'education_basic.9y', 'education_high.school', 'education_illiterate', 'education_professional.course', 'education_university.degree', 'education_unknown', 'housing_unknown', 'housing_yes', 'job_blue-collar', 'job_entrepreneur', 'job_housemaid', 'job_management', 'job_retired', 'job_self-employed', 'job_services', 'job_student', 'job_technician', 'job_unemployed', 'job_unknown', 'loan_unknown', 'loan_yes', 'marital_married', 'marital_single', 'marital_unknown', 'month_aug', 'month_dec', 'month_jul', 'month_jun', 'month_mar', 'month_may', 'month_nov', 'month_oct', 'month_sep', 'poutcome_nonexistent', 'poutcome_success']
