In [87]:
# 1. Import libraries and load data
import pandas as pd

# Input location and delimiter are named so they are easy to find and change.
# The file is read with ';' as the field separator rather than the default ','.
DATA_FILE = 'bank-additional-full.csv'
FIELD_SEPARATOR = ';'

df = pd.read_csv(DATA_FILE, sep=FIELD_SEPARATOR)

In [88]:
# Unspecified entries are stored as the literal string 'unknown'.
# Swap them for pd.NA so that pandas' missing-value tooling (isna, etc.) sees them.
df.replace(to_replace='unknown', value=pd.NA, inplace=True)

# Count missing entries per column, then keep only the columns that have any.
na_counts = df.isna().sum()
unknown_values = na_counts.loc[na_counts.gt(0)]

print("Columns with NA values after replacement:")
print(unknown_values)


Columns with NA values after replacement:
job           330
marital        80
education    1731
default      8597
housing       990
loan          990
dtype: int64


In [89]:
# Summarise the raw 'campaign' column before deciding how to treat its upper tail.
print("Descriptive statistics for 'campaign':")
print(df['campaign'].describe())



# Report the observed maximum instead of a hard-coded figure, so this message
# cannot disagree with the statistics printed just above it.
print(f"we see most are between 1 and 3, with a few outliers up to {df['campaign'].max()}.")

# Upper bound for 'campaign': any value above it is replaced by the bound itself,
# values at or below it are left unchanged.
CAMPAIGN_CAP = 6
df['campaign_capped'] = df['campaign'].clip(upper=CAMPAIGN_CAP)

# Descriptive statistics for 'campaign_capped', to confirm the effect of the cap
print("\nDescriptive statistics for 'campaign_capped':")
print(df['campaign_capped'].describe())

Descriptive statistics for 'campaign':
count    41188.000000
mean         2.567593
std          2.770014
min          1.000000
25%          1.000000
50%          2.000000
75%          3.000000
max         56.000000
Name: campaign, dtype: float64
we see most are between 1 and 3, with a few outliers up to 50.

Descriptive statistics for 'campaign_capped':
count    41188.000000
mean         2.275274
std          1.550510
min          1.000000
25%          1.000000
50%          2.000000
75%          3.000000
max          6.000000
Name: campaign_capped, dtype: float64


In [90]:
# Outlier handling: the capped copy ('campaign_capped') replaces the original
# 'campaign' column, so the original is removed here.
# The existence check keeps this cell safe to re-run: dropping a column that is
# already gone would otherwise raise KeyError.
if 'campaign' in df.columns:
    print("\nRemoving the original 'campaign' column.")
    df.drop(columns=['campaign'], inplace=True)
    print("Original 'campaign' column removed.")
else:
    print("'campaign' column not found in df.")
print("New 'campaign_capped' column added with values capped at 6.")



Removing the original 'campaign' column.
Original 'campaign' column removed.
New 'campaign_capped' column added with values capped at 6.


In [91]:
# Remove the 'duration' column when present; otherwise just report that it is absent.
# Rationale (per the original author): the value is only available after the contact
# with the client has taken place, so it is not useful for analysis and can skew results.
duration_present = 'duration' in df.columns
if not duration_present:
    print("'duration' column not found in df.")
else:
    df.drop(columns=['duration'], inplace=True)
    print('dropped duration column from df.')

dropped duration column from df.


In [92]:
# Add a binary flag indicating whether the client has been contacted before.
# pdays = number of days since the client was last contacted; the value 999 is a
# sentinel meaning the client has not been contacted before.
# prior_contact is 0 for that sentinel and 1 for every other value.
# The existence check keeps this cell safe to re-run after 'pdays' has been dropped.
if 'pdays' in df.columns:
    # Vectorised comparison instead of a per-row Python lambda; the explicit
    # int64 cast yields the same 0/1 integer column the row-wise version produced.
    df['prior_contact'] = df['pdays'].ne(999).astype('int64')

    print("New 'prior_contact' column added to df, indicating if the client has been contacted before.")
else:
    print("'pdays' column not found in df; 'prior_contact' was not recomputed.")


New 'prior_contact' column added to df, indicating if the client has been contacted before.


In [93]:
# Drop the 'pdays' column; the 'prior_contact' column is kept.
# errors='ignore' makes the drop a no-op when 'pdays' is already gone, so
# re-running this cell no longer raises KeyError.
df.drop(columns=['pdays'], inplace=True, errors='ignore')

# Show the remaining columns so the result of the drop can be checked by eye.
print("Columns after removing pdays:")
print(df.columns)
print('pdays column removed, prior_contact column created')

Columns after removing pdays:
Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'previous', 'poutcome',
       'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m',
       'nr.employed', 'y', 'campaign_capped', 'prior_contact'],
      dtype='object')
pdays column removed, prior_contact column created
