In [60]:
# 1. Import libraries and load data
import pandas as pd

# Read the semicolon-separated CSV into the working DataFrame.
df = pd.read_csv(filepath_or_buffer='bank-additional-full.csv', sep=';')

In [61]:
# Unspecified values are stored as the literal string 'unknown'; swap them for
# pd.NA so that pandas' missing-value tooling (isna, dropna, ...) sees them.
df.replace(to_replace='unknown', value=pd.NA, inplace=True)

# Count missing entries per column and keep only the columns that have any.
unknown_values = df.isna().sum().loc[lambda counts: counts > 0]
print("Columns with unknown values:", unknown_values, sep="\n")


Columns with unknown values:
job           330
marital        80
education    1731
default      8597
housing       990
loan          990
dtype: int64


In [62]:
# 2. Handle outliers with the IQR method.
# The helper below only *detects* outliers in a single column: it returns the
# outlying rows together with the lower/upper IQR bounds. The actual row
# filtering is done by the code that calls it further down.
def remove_outliers_iqr(df, column, factor=1.5):
    """Detect outliers in one column using the interquartile-range (IQR) rule.

    Despite its name, this function does not drop any rows and does not modify
    ``df``; it returns the outlying rows plus the bounds so the caller can
    decide how to filter.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to inspect.
    column : str
        Name of the column to inspect.
    factor : float, default 1.5
        Multiplier applied to the IQR to obtain the fences. The default of 1.5
        reproduces the original hard-coded behaviour.

    Returns
    -------
    outliers : pandas.DataFrame
        Rows of ``df`` whose ``column`` value lies strictly below
        ``lower_bound`` or strictly above ``upper_bound``.
    lower_bound : float
        ``Q1 - factor * IQR``.
    upper_bound : float
        ``Q3 + factor * IQR``.

    Raises
    ------
    KeyError
        If ``column`` is not present in ``df``.
    """
    values = df[column]
    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - factor * IQR
    upper_bound = Q3 + factor * IQR
    # Values exactly on a fence are NOT outliers (strict comparisons).
    outliers = df[(values < lower_bound) | (values > upper_bound)]
    return outliers, lower_bound, upper_bound


# Detect outliers in 'campaign' and 'duration'.
# Bounds are computed once per column, always on the unfiltered frame, and kept
# in column-specific variables so the two results cannot overwrite each other.
outliers_campaign, low_camp, up_camp = remove_outliers_iqr(df, 'campaign')
outliers_duration, low_dur, up_dur = remove_outliers_iqr(df, 'duration')

print(f'Number of outliers in campaign: {len(outliers_campaign)}')
print(f'Number of outliers in duration: {len(outliers_duration)}')

print()
print()

# Boolean masks of the rows that lie inside each column's IQR fences
# (inclusive on both ends, i.e. lower <= value <= upper).
within_campaign = df['campaign'].between(low_camp, up_camp)
within_duration = df['duration'].between(low_dur, up_dur)

# Rows that would remain if only that single column were filtered.
print(f'Number after removing campaign outliers: {within_campaign.sum()}')
print(f'Number after removing duration outliers: {within_duration.sum()}')

# Keep only rows that are inliers for BOTH columns. Previously the duration
# filter was applied to `df` again, which silently discarded the campaign filter.
df_no_outliers = df[within_campaign & within_duration]



Number of outliers in campaign: 2406
Number of outliers in duration: 2963


Number after removing campaign outliers: 38782
Number after removing duration outliers: 38225


In [63]:
# Add a binary flag telling whether the client has been contacted before.
# pdays = number of days since the client was last contacted; the sentinel
# value 999 means the client has not been contacted before.
# prior_contact: 0 = no prior contact (pdays == 999), 1 = contacted before.
# Vectorised comparison instead of a row-wise apply/lambda.
df['prior_contact'] = df['pdays'].ne(999).astype('int64')



In [64]:
# Show every column currently present in df.
print("Columns in df:", df.columns, sep="\n")

Columns in df:
Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y', 'prior_contact'],
      dtype='object')


In [65]:
# Preview the first rows of pdays next to the derived prior_contact flag.
preview_columns = ['pdays', 'prior_contact']
print(df.loc[:, preview_columns].head())

   pdays  prior_contact
0    999              0
1    999              0
2    999              0
3    999              0
4    999              0


In [66]:
# Drop the 'pdays' column; the derived 'prior_contact' column is kept.
# errors='ignore' makes this cell safe to re-run: without it, a second run
# raises KeyError because 'pdays' has already been removed.
df.drop(columns=['pdays'], inplace=True, errors='ignore')
print("Columns after removing pdays:")
print(df.columns)
print('pdays column removed, prior_contact column created')

Columns after removing pdays:
Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'previous',
       'poutcome', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx',
       'euribor3m', 'nr.employed', 'y', 'prior_contact'],
      dtype='object')
pdays column removed, prior_contact column created
