# Import Libraries 

In [1]:
import pandas as pd 
import numpy as np
from scipy.stats.mstats import winsorize
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import mstats
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Data Preprocessing 

## Load Data

In [2]:
Bank = pd.read_csv('Bank.csv')
Bank_Add = pd.read_csv('Bank_Add.csv')

## Handle outliers

### Age 

In [3]:
Label_Age = ['-18', '18-22', '23-27', '28-32', '33-37', '38-42', '43-47',
             '48-52', '53-57', '58-62', '63-67', '68-72', '73-78', '+78']

bins_age = [0, 18, 22, 27, 32, 37, 42, 47, 52, 57, 62, 67, 72, 78, float('inf')]

In [4]:
Bank['age'] = pd.cut(Bank['age'], bins=bins_age, labels=Label_Age,  right=False)

In [5]:
Bank_Add['age'] = pd.cut(Bank_Add['age'], bins=bins_age, labels=Label_Age,  right=False)

### Balance 

In [6]:
Bank['balance'] = Bank['balance'].apply(lambda x: max(0, x))

In [7]:
Bank['balance'] = np.log1p(Bank['balance'])


### Duration

In [8]:
Bank['duration'] = winsorize(Bank['duration'], limits=[0.05, 0.05])
Bank['duration'] = np.log1p(Bank['duration'])

In [9]:
Bank_Add['duration'] = winsorize(Bank_Add['duration'], limits=[0.05, 0.05])
Bank_Add['duration'] = np.log1p(Bank_Add['duration'])

### Campaign

In [10]:
Bank['campaign'] = winsorize(Bank['campaign'], limits=[0.05, 0.05])
Bank['campaign'] = np.log1p(Bank['campaign'])

In [11]:
Bank_Add['campaign'] = winsorize(Bank_Add['campaign'], limits=[0.05, 0.05])
Bank_Add['Campaign'] = np.log1p(Bank_Add['campaign'])

### Pdays

In [12]:
Bank['pdays'] = mstats.winsorize(Bank['pdays'], limits=[0.05, 0.05])
small_constant = 1e-6
Bank['pdays'] = np.log1p(Bank['pdays'] + small_constant)

In [13]:
Bank_Add['pdays'] = winsorize(Bank_Add['pdays'], limits=[0.05, 0.05])
Bank_Add['pdays'] = np.log1p(Bank_Add['pdays'])

### Previous

In [14]:
Bank['previous'] = winsorize(Bank['previous'], limits=[0.05, 0.05])
Bank['previous'] = np.log1p(Bank['previous'])

In [15]:
Bank_Add['previous'] = winsorize(Bank_Add['previous'], limits=[0.05, 0.05])
Bank_Add['previous'] = np.log1p(Bank_Add['previous'])

## Categorical Variable Encoding

In [16]:
label_encoder = LabelEncoder()

### Age

In [17]:
Bank['job'] = label_encoder.fit_transform(Bank['job'])

In [18]:
Bank_Add['job'] = label_encoder.fit_transform(Bank_Add['job'])

### Marital

In [19]:
Bank['marital'] = label_encoder.fit_transform(Bank['marital'])

In [20]:
Bank_Add['marital'] = label_encoder.fit_transform(Bank_Add['marital'])

### Education

In [21]:
Bank['education'] = label_encoder.fit_transform(Bank['education'])

In [22]:
Bank_Add['education'] = label_encoder.fit_transform(Bank_Add['education'])

### Default

In [23]:
Bank['default'] = label_encoder.fit_transform(Bank['default'])

In [24]:
Bank_Add['default'] = label_encoder.fit_transform(Bank_Add['default'])

### Housing

In [25]:
Bank['housing'] = label_encoder.fit_transform(Bank['housing'])

In [26]:
Bank_Add['housing'] = label_encoder.fit_transform(Bank_Add['housing'])

### Loan

In [27]:
Bank['loan'] = label_encoder.fit_transform(Bank['loan'])

In [28]:
Bank_Add['loan'] = label_encoder.fit_transform(Bank_Add['loan'])

### Contact

In [29]:
Bank['contact'] = label_encoder.fit_transform(Bank['contact'])

In [30]:
Bank_Add['contact'] = label_encoder.fit_transform(Bank_Add['contact'])

### housing

In [31]:
Bank['housing'] = label_encoder.fit_transform(Bank['housing'])

In [32]:
Bank_Add['housing'] = label_encoder.fit_transform(Bank_Add['housing'])

### Month

In [33]:
Bank['month'] = label_encoder.fit_transform(Bank['month'])

In [34]:
Bank_Add['month'] = label_encoder.fit_transform(Bank_Add['month'])

### Poutcome

In [35]:
Bank['poutcome'] = label_encoder.fit_transform(Bank['poutcome'])

In [36]:
Bank_Add['poutcome'] = label_encoder.fit_transform(Bank_Add['poutcome'])

### Day Of Week

In [37]:
Bank_Add['day_of_week'] = label_encoder.fit_transform(Bank_Add['day_of_week'])

### Age 

In [38]:
Bank['age'] = label_encoder.fit_transform(Bank['age'])


In [39]:
Bank_Add['age'] = label_encoder.fit_transform(Bank_Add['age'])

### Y

In [40]:
Bank['y'] = label_encoder.fit_transform(Bank['y'])

In [41]:
Bank_Add['y'] = label_encoder.fit_transform(Bank_Add['y'])

In [42]:
Bank.isnull().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

## Dimensionality Reduction with PCA

In [43]:
X = Bank.drop('y', axis=1)
X_A = Bank_Add.drop('y', axis=1)
y = Bank['y']
y_a = Bank_Add['y']

In [44]:
scaler = StandardScaler()

In [45]:
X_scaled = scaler.fit_transform(X)
X_A_scaled = scaler.fit_transform(X_A)


In [46]:
num_components = 10
pca = PCA(n_components=num_components)

In [48]:
X_pca = pca.fit_transform(X_scaled)
X_A_pca = pca.fit_transform(X_A_scaled)

In [None]:
explained_variance = pca.explained_variance_ratio_
cumulative_variance = explained_variance.cumsum()

In [None]:
plt.plot(range(1, num_components + 1), cumulative_variance, marker='o', linestyle='--')
plt.title('Cumulative Explained Variance')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Variance')
plt.show()

In [None]:
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='viridis', label='Primary Dataset')
plt.scatter(X_A_pca[:, 0], X_A_pca[:, 1], c=y_a, cmap='viridis', marker='x', label='Additional Dataset')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()
plt.show()

## Data Splitting: Train-Test Partitioning

In [49]:
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

In [50]:
X_A_train, X_A_test, y_A_train, y_A_test = train_test_split(X_A_pca, y_a, test_size=0.2, random_state=42)

# Modeling

### Random Forest Classifier

In [51]:
model = RandomForestClassifier()
model.fit(X_train, y_train)

In [52]:
model.fit(X_A_train, y_A_train)

In [53]:
y_pred = model.predict(X_test)

In [54]:
y_A_pred = model.predict(X_A_test)


In [55]:
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:\n", report)

Accuracy: 0.86

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.97      0.92      8774
           1       0.20      0.06      0.09      1173

    accuracy                           0.86      9947
   macro avg       0.54      0.51      0.51      9947
weighted avg       0.80      0.86      0.83      9947



In [56]:
accuracy = accuracy_score(y_A_test, y_A_pred)
report = classification_report(y_A_test, y_A_pred)

print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:\n", report)

Accuracy: 0.91

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.98      0.95      7940
           1       0.76      0.45      0.56      1122

    accuracy                           0.91      9062
   macro avg       0.84      0.71      0.76      9062
weighted avg       0.91      0.91      0.90      9062



### Decision Tree Classifier

In [57]:
clf = DecisionTreeClassifier(random_state=42)

In [58]:
clf.fit(X_train, y_train)

In [59]:
clf.fit(X_A_train, y_A_train)

In [60]:
y_pred = clf.predict(X_test)

In [61]:
y_A_pred = clf.predict(X_A_test)

In [62]:
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:\n", report)

Accuracy: 0.76

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.82      0.86      8774
           1       0.18      0.29      0.22      1173

    accuracy                           0.76      9947
   macro avg       0.54      0.56      0.54      9947
weighted avg       0.81      0.76      0.78      9947



In [63]:
accuracy = accuracy_score(y_A_test, y_A_pred)
report = classification_report(y_A_test, y_A_pred)

print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:\n", report)

Accuracy: 0.89

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.94      0.94      7940
           1       0.56      0.56      0.56      1122

    accuracy                           0.89      9062
   macro avg       0.75      0.75      0.75      9062
weighted avg       0.89      0.89      0.89      9062



### LogisticRegression

In [64]:
logreg = LogisticRegression(random_state=42)

In [65]:
logreg.fit(X_train, y_train)
logreg.fit(X_A_train, y_A_train)

In [66]:
y_pred = logreg.predict(X_test)
y_A_pred = logreg.predict(X_A_test)

In [67]:
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:\n", report)

Accuracy: 0.87

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.97      0.93      8774
           1       0.29      0.10      0.15      1173

    accuracy                           0.87      9947
   macro avg       0.59      0.53      0.54      9947
weighted avg       0.82      0.87      0.84      9947



In [68]:
accuracy = accuracy_score(y_A_test, y_A_pred)
report = classification_report(y_A_test, y_A_pred)

print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:\n", report)

Accuracy: 0.90

Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.97      0.94      7940
           1       0.66      0.35      0.45      1122

    accuracy                           0.90      9062
   macro avg       0.79      0.66      0.70      9062
weighted avg       0.88      0.90      0.88      9062



### SVM

In [69]:
svm_classifier = SVC(kernel='linear', random_state=42)

In [70]:
svm_classifier.fit(X_train, y_train)
svm_classifier.fit(X_A_train, y_A_train)

In [71]:
y_pred = svm_classifier.predict(X_test)
y_A_pred = svm_classifier.predict(X_A_test)

In [72]:
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:\n", report)

Accuracy: 0.88

Classification Report:
               precision    recall  f1-score   support

           0       0.88      1.00      0.94      8774
           1       0.38      0.01      0.01      1173

    accuracy                           0.88      9947
   macro avg       0.63      0.50      0.47      9947
weighted avg       0.82      0.88      0.83      9947



In [73]:
accuracy = accuracy_score(y_A_test, y_A_pred)
report = classification_report(y_A_test, y_A_pred)

print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:\n", report)

Accuracy: 0.88

Classification Report:
               precision    recall  f1-score   support

           0       0.88      1.00      0.93      7940
           1       0.73      0.01      0.02      1122

    accuracy                           0.88      9062
   macro avg       0.81      0.50      0.48      9062
weighted avg       0.86      0.88      0.82      9062



In [74]:
param_grid = {'C': [0.1, 1, 10, 100], 'kernel': ['linear', 'rbf', 'poly'], 'gamma': ['scale', 'auto']}

In [107]:
svm_classifier1 = SVC(C=100, kernel='poly',gamma= 'auto', random_state=42)

In [108]:
svm_classifier1.fit(X_train, y_train)
svm_classifier1.fit(X_A_train, y_A_train)

In [104]:
y_pred = svm_classifier1.predict(X_test)
y_A_pred = svm_classifier1.predict(X_A_test)

In [105]:
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:\n", report)

Accuracy: 0.87

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.98      0.93      8774
           1       0.28      0.05      0.09      1173

    accuracy                           0.87      9947
   macro avg       0.58      0.52      0.51      9947
weighted avg       0.81      0.87      0.83      9947



In [106]:
accuracy = accuracy_score(y_A_test, y_A_pred)
report = classification_report(y_A_test, y_A_pred)

print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:\n", report)

Accuracy: 0.89

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.99      0.94      7940
           1       0.71      0.23      0.35      1122

    accuracy                           0.89      9062
   macro avg       0.81      0.61      0.65      9062
weighted avg       0.88      0.89      0.87      9062

