In [9]:
# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Load the dataset (the file is semicolon-delimited)
file_path = '/content/bank-additional.csv'
data = pd.read_csv(file_path, sep=';')

# ---------------------------------
# Step 1: Data Cleaning
# ---------------------------------
# Count nulls once and reuse the result for both the report and the check
missing_counts = data.isnull().sum()
print("Missing values per column:\n", missing_counts)
if missing_counts.sum() > 0:
    print("Handling missing values...")
    # `fillna(method='ffill')` is deprecated in recent pandas; `ffill()` is the
    # supported spelling. Forward fill cannot fill NaNs in the leading rows
    # (there is nothing before them), so a backward fill covers those.
    data = data.ffill().bfill()

# Report the inferred dtypes so inconsistent columns can be spotted by eye
print("\nData Types:\n", data.dtypes)

# ---------------------------------
# Step 2: Transformation
# ---------------------------------
# Separate categorical and numerical features.
# The target 'y' is an object column here, so it is removed from the
# categorical list and is never part of the numerical list.
categorical_columns = data.select_dtypes(include=['object']).columns.drop('y')  # Exclude the target column 'y'
numerical_columns = data.select_dtypes(include=['int64', 'float64']).columns

# One-hot encode categorical features; drop='first' removes one level per
# feature so the dummy columns are not perfectly collinear.
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded_features = encoder.fit_transform(data[categorical_columns])
encoded_columns = encoder.get_feature_names_out(categorical_columns)
# Reuse data's index: pd.concat(axis=1) aligns on index labels, so a default
# RangeIndex here would misalign rows whenever `data` has any other index.
encoded_df = pd.DataFrame(encoded_features, columns=encoded_columns, index=data.index)

# Combine numerical and encoded categorical features
processed_features = pd.concat([data[numerical_columns], encoded_df], axis=1)

# Encode target variable 'y' as binary (1 for 'yes', 0 for anything else),
# using a vectorised comparison instead of a per-row Python lambda.
processed_features['y'] = (data['y'] == 'yes').astype('int64')

# ---------------------------------
# Step 3: Handle Class Imbalance
# ---------------------------------
# Report how many samples fall into each class
class_distribution = processed_features['y'].value_counts()
print("\nClass Distribution:\n", class_distribution)

# Calculate 'balanced' class weights for the target variable
class_labels = np.unique(processed_features['y'])
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=class_labels,
    y=processed_features['y']
)
# Key the mapping by the actual class labels rather than by position:
# positions only coincide with labels when the labels are exactly 0..k-1.
# The weights come back in the same order as `classes`, so zip pairs them up.
class_weight_dict = dict(zip(class_labels.tolist(), class_weights))
print("\nClass Weights:\n", class_weight_dict)

# ---------------------------------
# Step 4: Feature Scaling
# ---------------------------------
# Pull out the numerical columns and standardize them with StandardScaler.
# NOTE(review): the scaler is fitted on every row of the frame; if a
# train/test split happens later, confirm this is intended.
numerical_data = processed_features[numerical_columns]
scaler = StandardScaler()
scaler.fit(numerical_data)
scaled_numerical_data = scaler.transform(numerical_data)

# Write the standardized values back over the original numerical columns
processed_features[numerical_columns] = scaled_numerical_data

# ---------------------------------
# Step 5: Save Preprocessed Data
# ---------------------------------
# Write the fully processed frame (features plus 'y') to CSV, omitting the
# row index so the file contains only the data columns.
processed_features.to_csv(
    path_or_buf='preprocessed_bank_data.csv',
    index=False,
)



Missing values per column:
 age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64

Data Types:
 age                 int64
job                object
marital            object
education          object
default            object
housing            object
loan               object
contact            object
month              object
day_of_week        object
duration            int64
campaign            int64
pdays               int64
previous            int64
poutcome           object
emp.var.rate      float64
cons.price.idx    float64
cons.conf.idx     float64
euribor3m         float64
nr.employed       float64
y    