In [2]:
# Import modules
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

In [10]:
# Read the CSV file into a Pandas DataFrame
df = pd.read_csv(
    Path('bank.csv')
)

# Review the DataFrame
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


In [11]:
# Split the features and target data
y = df['y']
X = df.drop(columns='y')

y

0       no
1       no
2       no
3       no
4       no
        ..
4516    no
4517    no
4518    no
4519    no
4520    no
Name: y, Length: 4521, dtype: object

In [12]:
# Encode the features dataset's categorical variables using get_dummies
X = pd.get_dummies(X)

# Review the features DataFrame
X.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,job_admin.,job_blue-collar,job_entrepreneur,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,30,1787,19,79,1,-1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
1,33,4789,11,220,1,339,4,0,0,0,...,0,0,1,0,0,0,1,0,0,0
2,35,1350,16,185,1,330,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,30,1476,3,199,4,-1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
4,59,0,5,226,1,-1,0,0,1,0,...,0,0,1,0,0,0,0,0,0,1


In [13]:
# Split data into training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [14]:
# Review the distinct values from y
y_train.value_counts()

no     3012
yes     378
Name: y, dtype: int64

In [15]:
# Instantiate a StandardScaler instance
scaler = StandardScaler()

# Fit the training data to the standard scaler
X_scaler = scaler.fit(X_train)

# Transform the training data using the scaler
X_train_scaled = X_scaler.transform(X_train)

# Transform the testing data using the scaler
X_test_scaled = X_scaler.transform(X_test)

In [16]:
# Import the RandomForestClassifier from sklearn
from sklearn.ensemble import RandomForestClassifier

# Instantiate a RandomForestClassifier instance
model = RandomForestClassifier()

# Fit the traning data to the model
model.fit(X_train_scaled, y_train)

RandomForestClassifier()

In [17]:
# Predict labels for original scaled testing features
y_pred = model.predict(X_test_scaled)

In [18]:
# Import RandomUnderSampler from imblearn
from imblearn.under_sampling import RandomUnderSampler

# Instantiate a RandomUnderSampler instance
rus = RandomUnderSampler(random_state=1)

In [19]:
# Fit the training data to the random undersampler model
X_undersampled, y_undersampled = rus.fit_resample(X_train_scaled, y_train)

In [20]:
# Count distinct values for the resampled target data
y_undersampled.value_counts()

no     378
yes    378
Name: y, dtype: int64

In [21]:
# Instantiate a new RandomForestClassier model
model_undersampled = RandomForestClassifier()

# Fit the undersampled data the new model
model_undersampled.fit(X_undersampled, y_undersampled)

RandomForestClassifier()

In [22]:
# Predict labels for oversampled testing features
y_pred_undersampled = model_undersampled.predict(X_test_scaled)

In [25]:
# Print classification reports
print(f"Classifiction Report - Original Data (no-3012 vs yes-378 sample size)")
print(classification_report(y_test, y_pred))
print("---------")
print(f"Classifiction Report - Undersampled Data (378 sample size both)")
print(classification_report(y_test, y_pred_undersampled))

Classifiction Report - Original Data (no-3012 vs yes-378 sample size)
              precision    recall  f1-score   support

          no       0.89      0.98      0.93       988
         yes       0.53      0.19      0.28       143

    accuracy                           0.88      1131
   macro avg       0.71      0.58      0.61      1131
weighted avg       0.85      0.88      0.85      1131

---------
Classifiction Report - Undersampled Data (378 sample size both)
              precision    recall  f1-score   support

          no       0.96      0.80      0.87       988
         yes       0.36      0.78      0.50       143

    accuracy                           0.80      1131
   macro avg       0.66      0.79      0.68      1131
weighted avg       0.89      0.80      0.83      1131

