In [2]:
**Data Preparation & Understanding**

○      Summarize dataset and attributes.

○      Handle missing values, perform preprocessing/feature engineering as needed.

○      Provide exploratory analysis with tables/visualizations.


<class 'SyntaxError'>: invalid character '○' (U+25CB) (3141712825.py, line 3)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# bank.csv must be available next to the notebook (Jupyter: Files → Upload).
df = pd.read_csv("bank.csv")

# Preview the first rows, then list column dtypes and non-null counts.
display(df.head())
df.info()

In [None]:
# 1) Print only the number of rows.
# 2) Show the first 10 rows.
#
# A notebook cell echoes only its final bare expression, so the row count is
# printed explicitly; left as a bare `df.shape[0]` it would never be shown.
print(df.shape[0])
df.head(10)

In [None]:
# Select by position (iloc) and by label (loc).
# The first selection is wrapped in display() because a cell only echoes its
# last expression; without it the iloc result is computed and thrown away.
display(df.iloc[0:5, 0:4])                    # positions 0–4 (end-exclusive slice), columns 0–3
df.loc[0:4, ["age", "balance", "duration"]]   # labels 0–4 (end-inclusive slice), named columns


In [None]:
# Engineered feature: element-wise product of two existing numeric columns.
df.loc[:, "age_balance_product"] = df["age"].mul(df["balance"])

# Show the inputs next to the derived column.
preview_cols = ["age", "balance", "age_balance_product"]
df[preview_cols].head()

In [None]:
# Using .loc, create a new feature from existing numerical columns:
# balance_per_age = balance / age
df.loc[:, "balance_per_age"] = df["balance"] / df["age"]

# A zero divisor produces ±inf; turn those into NaN so they can be imputed.
df["balance_per_age"] = df["balance_per_age"].replace([np.inf, -np.inf], np.nan)

# Assign the filled Series back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form raises a FutureWarning on
# pandas 2.x and leaves df untouched once Copy-on-Write is enabled.
df["balance_per_age"] = df["balance_per_age"].fillna(df["balance_per_age"].median())

# Display the first 5 rows of the inputs and the new feature.
df[["age", "balance", "balance_per_age"]].head()

In [None]:
import pandas as pd
import numpy as np

bank = pd.read_csv('bank.csv')
print('Poutcome:', bank.poutcome)
bank.info()

# Check missing values per column. The counts are printed explicitly because a
# bare expression in the middle of a cell is never displayed.
print(bank.isnull().sum())

# Fill numeric columns with their mean (a no-op when nothing is missing).
# Re-binding instead of inplace=True keeps the cell safe to re-run.
bank = bank.fillna(bank.mean(numeric_only=True))
print('Total missing after fill:', bank.isnull().sum().sum())

In [None]:
from sklearn.preprocessing import StandardScaler
import numpy as np # Ensure numpy is imported for select_dtypes

# Feature frame: every column except the target.
X = df.drop(columns=['deposit'])

# StandardScaler only handles numbers, so restrict X to its numeric columns.
numerical_cols = X.select_dtypes(include=np.number).columns
X_numeric = X.loc[:, numerical_cols]

# Standardize each numeric column (zero mean, unit variance).
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_numeric)
print('Scaled shape:', X_scaled.shape)

In [None]:
# Define X (features) and y (target). Every column of df except 'deposit' is
# kept as a feature, including the engineered ones.
X = df.drop(columns=["deposit"])
y = df["deposit"]

# Identify numerical columns for the imputation below
numerical_cols = X.select_dtypes(include=np.number).columns

# Treat ±inf as missing FIRST, so that the medians used for imputation are
# computed from finite values only; then fill every gap with its column median.
# (Replacing inf directly with a median taken while inf is still present lets
# the infinities influence the fill value.)
numeric_part = X[numerical_cols].replace([np.inf, -np.inf], np.nan)
X[numerical_cols] = numeric_part.fillna(numeric_part.median())

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Load the dataset from scratch so this cell does not depend on earlier state.
df = pd.read_csv("bank.csv")

# Feature engineering: age_balance_product = age * balance
df.loc[:, "age_balance_product"] = df["age"] * df["balance"]

# Feature engineering: balance_per_age = balance / age.
# ±inf (zero divisor) becomes NaN and is then filled with the column median.
df.loc[:, "balance_per_age"] = df["balance"] / df["age"]
df['balance_per_age'] = df['balance_per_age'].replace([np.inf, -np.inf], np.nan)
df['balance_per_age'] = df['balance_per_age'].fillna(df['balance_per_age'].median())

# Features are every column except the target 'deposit'.
X = df.drop(columns=["deposit"])
y = df["deposit"]

# Identify numerical columns for the imputation below
numerical_cols = X.select_dtypes(include=np.number).columns

# Treat ±inf as missing first so the imputation medians are computed from
# finite values only, then fill every gap with its column median.
numeric_part = X[numerical_cols].replace([np.inf, -np.inf], np.nan)
X[numerical_cols] = numeric_part.fillna(numeric_part.median())

# 70/30 split, stratified on the target so both parts keep the class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)

print("Train shapes:", X_train.shape, y_train.shape)
print("Test  shapes:", X_test.shape, y_test.shape)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Column groups, taken from the full feature frame X.
numerical_cols = X.select_dtypes(include=np.number).columns
categorical_cols = X.select_dtypes(include='object').columns

# Preprocessing: standardize numeric columns, one-hot encode categorical ones.
# handle_unknown='ignore' lets transform() accept categories unseen during fit.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ]
)

# Fit the preprocessing on the training split only; the test split is
# transformed with the statistics learned from train.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Train the decision tree and predict the held-out rows.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train_processed, y_train)
y_pred_dt = dt.predict(X_test_processed)

# Accuracy plus support-weighted precision / recall / F1.
acc_dt = accuracy_score(y_test, y_pred_dt)
prec_dt, rec_dt, f1_dt = (
    scorer(y_test, y_pred_dt, average="weighted")
    for scorer in (precision_score, recall_score, f1_score)
)

(acc_dt, prec_dt, rec_dt, f1_dt)

In [None]:

# Add SVM, KNN, RF metrics here following the Decision Tree pattern

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

def _fit_and_score(model):
    """Fit `model` on the processed training split and score it on the test split.

    Returns a 5-tuple: (test predictions, accuracy, weighted precision,
    weighted recall, weighted F1). Reads X_train_processed, X_test_processed,
    y_train and y_test from the notebook namespace.
    """
    model.fit(X_train_processed, y_train)
    predictions = model.predict(X_test_processed)
    accuracy = accuracy_score(y_test, predictions)
    weighted = [
        scorer(y_test, predictions, average="weighted")
        for scorer in (precision_score, recall_score, f1_score)
    ]
    return (predictions, accuracy, *weighted)


# SVM classifier (RBF kernel)
svm = SVC(kernel="rbf", random_state=42)
y_pred_svm, acc_svm, prec_svm, rec_svm, f1_svm = _fit_and_score(svm)

# KNN classifier (5 neighbours)
knn = KNeighborsClassifier(n_neighbors=5)
y_pred_knn, acc_knn, prec_knn, rec_knn, f1_knn = _fit_and_score(knn)

# Random forest classifier (200 trees)
rf = RandomForestClassifier(n_estimators=200, random_state=42)
y_pred_rf, acc_rf, prec_rf, rec_rf, f1_rf = _fit_and_score(rf)

print("SVM Metrics:", (acc_svm, prec_svm, rec_svm, f1_svm))
print("KNN Metrics:", (acc_knn, prec_knn, rec_knn, f1_knn))
print("Random Forest Metrics:", (acc_rf, prec_rf, rec_rf, f1_rf))

In [None]:
import pandas as pd

# Comparison table: one row per model, one column per metric.
metric_columns = ["Accuracy", "Precision(w)", "Recall(w)", "F1(w)"]
scores_by_model = {
    "Decision Tree": (acc_dt, prec_dt, rec_dt, f1_dt),
    "SVM": (acc_svm, prec_svm, rec_svm, f1_svm),
    "KNN": (acc_knn, prec_knn, rec_knn, f1_knn),
    "Random Forest": (acc_rf, prec_rf, rec_rf, f1_rf),
}
results = pd.DataFrame(
    [
        {"Model": model_name, **dict(zip(metric_columns, scores))}
        for model_name, scores in scores_by_model.items()
    ]
)
results

In [None]:
# Rank the models from best to worst weighted F1.
# This only displays a sorted view; `results` itself keeps its original order.
results.sort_values(by="F1(w)", ascending=False)

In [None]:
import matplotlib.pyplot as plt

# Bar chart of weighted F1, one bar per model, on a fixed 0–1 scale.
ax = results.plot.bar(x="Model", y="F1(w)", legend=False)
ax.set_title("Model Comparison by F1 (weighted)")
ax.set_ylabel("F1 (weighted)")
ax.set_xlabel("Model")
ax.set_ylim(0, 1)
plt.show()

In [None]:
**Classification (Supervised Learning)**

○      Apply at least three classifiers (e.g., Decision Tree, KNN, SVM, Random Forest, Logistic Regression).

○      Use train/test split and evaluate with Accuracy, Precision, Recall, and F1 (weighted).

○      Present the results in a comparison table and plots.


In [None]:
# Import Libraries
# import the necessary packages for data, preprocessing, and ML
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Load data: read the CSV into a DataFrame.
df = pd.read_csv("bank.csv")

# Plain-text preview of the first five rows.
print(df.head(n=5))

In [None]:
# Handle missing values.
# Per-column count of missing entries, largest first.
missing_per_column = df.isna().sum()
print(missing_per_column.sort_values(ascending=False))

# Impute each numeric column with its own mean; other columns are left as-is.
column_means = df.mean(numeric_only=True)
df = df.fillna(column_means)

In [None]:
# Separate features and target: 'deposit' is y, every other column goes to X.
y = df["deposit"]
X = df.drop(columns=["deposit"])

In [None]:
# Preprocess and Scale Data
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Column groups: int64/float64 columns are scaled, object columns are one-hot encoded.
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

# One ColumnTransformer step per column group.
num_step = ('num', StandardScaler(), numerical_cols)
cat_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
preprocessor = ColumnTransformer(transformers=[num_step, cat_step])

# Learn the preprocessing from X and apply it in one pass.
X_scaled = preprocessor.fit_transform(X)

print("Data preprocessed and scaled successfully.")

In [None]:
**Clustering (Unsupervised Learning)**

○      Apply at least one clustering algorithm (e.g., K-means, hierarchical, DBSCAN).

○      Evaluate results and provide visualizations.

○      Discuss whether clusters provide meaningful patterns.


In [None]:
import pandas as pd
import numpy as np

# NOTE: despite its name, `mobile` holds the bank.csv data; the name is kept
# because the clustering cells below refer to it.
mobile = pd.read_csv('bank.csv')
print('Shape:', mobile.shape)
mobile.info()

# Check missing values per column. Printed explicitly: a bare expression in the
# middle of a cell is never displayed.
print(mobile.isnull().sum())

# Fill numeric columns with their mean (a no-op when nothing is missing).
# Re-binding instead of inplace=True keeps the cell safe to re-run.
mobile = mobile.fillna(mobile.mean(numeric_only=True))
print('Total missing after fill:', mobile.isnull().sum().sum())

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Clustering is unsupervised, so the target column 'deposit' is left out.
X_mobile = mobile.drop(columns=['deposit'])

# Column groups within the clustering features.
numerical_cols_mobile = X_mobile.select_dtypes(include=np.number).columns
categorical_cols_mobile = X_mobile.select_dtypes(include='object').columns

# Standardize numeric columns and one-hot encode categorical ones.
preprocessor_mobile = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols_mobile),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols_mobile),
    ]
)

# Learn the preprocessing from X_mobile and apply it in one pass.
X_scaled_for_clustering = preprocessor_mobile.fit_transform(X_mobile)
print('Scaled shape for clustering:', X_scaled_for_clustering.shape)

In [None]:
# Scale data: standardize only the numeric columns of X.
# NOTE(review): this rebinds X_scaled, which the "Preprocess and Scale Data"
# cell had set to the scaled + one-hot encoded matrix; from here on X_scaled
# holds the numeric columns only.
numerical_cols = X.select_dtypes(include=np.number).columns
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X.loc[:, numerical_cols])

In [None]:
# Apply K-Means with k=3; random_state and n_init are pinned for reproducibility.
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)

# Fit on the preprocessed features and store each row's cluster label.
clusters = kmeans.fit_predict(X_scaled_for_clustering)
mobile["cluster_kmeans"] = clusters

# Number of rows assigned to each cluster.
cluster_sizes = mobile["cluster_kmeans"].value_counts()
print(cluster_sizes)

In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Elbow method: fit K-Means for k = 1..19 and record the inertia (SSE) of each fit.
k_values = range(1, 20)
SSE = [
    KMeans(n_clusters=k, init='k-means++', random_state=42, n_init=10)
    .fit(X_scaled_for_clustering)
    .inertia_
    for k in k_values
]

plt.plot(k_values, SSE, marker='o')
plt.title('Elbow Method (SSE vs k)')
plt.xlabel('k')
plt.ylabel('SSE')
plt.show()

# Pick a reasonable k where the curve bends. (Is the elbow actually identifiable
# with this SSE-based method?)

In [None]:
from sklearn.decomposition import PCA

# Project the preprocessed features onto their first two principal components.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled_for_clustering)

# Scatter the 2-D projection, coloured by K-Means cluster label.
fig, ax = plt.subplots()
ax.scatter(X_pca[:, 0], X_pca[:, 1], c=mobile['cluster_kmeans'])
ax.set_title('K-Means (PCA 2D)')
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
plt.show()

In [None]:
# Plot the K-Means clusters in the space of two raw features.
feature_x, feature_y = 'age', 'balance'

fig, ax = plt.subplots()
ax.scatter(mobile[feature_x], mobile[feature_y], c=mobile['cluster_kmeans'])
ax.set_xlabel(feature_x)
ax.set_ylabel(feature_y)
ax.set_title('K-Means by chosen features')
plt.show()

In [None]:
# Same cluster view for a second pair of raw features.
feature_x, feature_y = 'duration', 'campaign'

fig, ax = plt.subplots()
ax.scatter(mobile[feature_x], mobile[feature_y], c=mobile['cluster_kmeans'])
ax.set_xlabel(feature_x)
ax.set_ylabel(feature_y)
ax.set_title('K-Means by chosen features')
plt.show()

In [None]:
import scipy.cluster.hierarchy as sch

import matplotlib.pyplot as plt
# Ward linkage over the preprocessed features, drawn as a dendrogram.
linkage_matrix = sch.linkage(X_scaled_for_clustering, method='ward')

plt.figure(figsize=(8, 5))
dendrogram = sch.dendrogram(linkage_matrix)
plt.title('Dendrogram')
plt.xlabel('Sample index')
plt.ylabel('Distance')
plt.show()

# Number of clusters read off the dendrogram.
k_dend = 4

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Agglomerative (Ward, Euclidean) clustering with a fixed number of clusters.
k_hier = 4
hc = AgglomerativeClustering(n_clusters=k_hier, metric='euclidean', linkage='ward')
y_hc = hc.fit_predict(X_scaled_for_clustering)
mobile['cluster_hier'] = y_hc

# Visualize in PCA space; X_pca comes from the earlier PCA cell.
pc1, pc2 = X_pca[:, 0], X_pca[:, 1]
plt.scatter(pc1, pc2, c=y_hc)
plt.title('Hierarchical (PCA 2D)')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.show()

In [None]:
# Visualize clusters: reduce the preprocessed clustering features to 2-D with PCA.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled_for_clustering)

# Small markers and a categorical colormap, one colour per K-Means label.
fig, ax = plt.subplots(figsize=(5, 5))
ax.scatter(X_pca[:, 0], X_pca[:, 1], c=mobile["cluster_kmeans"], cmap='tab10', s=8)
ax.set_title("K-Means Clusters (k=3)")
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
plt.show()

In [None]:
# Train a global classifier: one decision tree on a single 70/30 split.
# NOTE(review): X_scaled is whatever the most recent cell bound to that name;
# after the "Scale Data" cell it contains the numeric columns only.
# NOTE(review): this split is not stratified, while the per-cluster models
# below use stratify — confirm the comparison is intended this way.
split_global = train_test_split(X_scaled, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_global

model_global = DecisionTreeClassifier(random_state=42)
model_global.fit(X_train, y_train)
y_pred_global = model_global.predict(X_test)

overall_accuracy = accuracy_score(y_test, y_pred_global)
print("Overall Accuracy:", overall_accuracy)  # original hint: expect around 0.8

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Train one decision tree per K-Means cluster and record its test accuracy.
# cluster_results maps cluster label -> accuracy (NaN when the cluster is skipped).
cluster_results = {}
for c in sorted(mobile["cluster_kmeans"].unique()):
    # .copy() so later operations never act on a view of `mobile`
    subset = mobile[mobile["cluster_kmeans"] == c].copy()

    # Features: everything except the target and the cluster-label columns.
    # errors="ignore" because 'cluster_hier' only exists if the hierarchical
    # clustering cell has been run.
    X_c = subset.drop(columns=["deposit", "cluster_kmeans", "cluster_hier"], errors="ignore")
    y_c = subset["deposit"]

    # A stratified split needs at least two classes and at least two rows of
    # every class; otherwise train_test_split(stratify=...) raises ValueError.
    class_counts = y_c.value_counts()
    if len(class_counts) < 2 or class_counts.min() < 2:
        print(f"Cluster {c}: Not enough samples or classes to train a classifier. Skipping.")
        cluster_results[c] = np.nan  # placeholder so the cluster still appears in the results
        continue

    # Column groups for this subset
    numerical_cols_c = X_c.select_dtypes(include=np.number).columns
    categorical_cols_c = X_c.select_dtypes(include='object').columns

    # Per-cluster preprocessing: scale numeric, one-hot encode categorical;
    # any column in neither group is passed through unchanged.
    preprocessor_c = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_cols_c),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols_c)
        ],
        remainder='passthrough'
    )

    # Split first, then fit the preprocessing on the training part only.
    X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
        X_c, y_c, test_size=0.3, random_state=42, stratify=y_c
    )
    X_train_processed_c = preprocessor_c.fit_transform(X_train_c)
    X_test_processed_c = preprocessor_c.transform(X_test_c)

    model = DecisionTreeClassifier(random_state=42)
    model.fit(X_train_processed_c, y_train_c)
    y_pred_c = model.predict(X_test_processed_c)
    acc = accuracy_score(y_test_c, y_pred_c)
    cluster_results[c] = acc
    print(f"Cluster {c}: Accuracy = {acc:.3f}")

In [None]:
# Visualize cluster-wise accuracy: one bar per cluster label.
cluster_ids = list(cluster_results.keys())
cluster_accuracies = list(cluster_results.values())

plt.bar(cluster_ids, cluster_accuracies, color='lightgreen')
plt.title("Classification Accuracy per Cluster (k=3)")
plt.xlabel("Cluster")
plt.ylabel("Accuracy")
plt.show()

In [None]:
**Optional (Encouraged) Extensions**

○      Outlier Detection (e.g., z-score, IQR, Isolation Forest).

○      Association Rule Mining (e.g., Apriori, FP-Growth).

○      Show how these methods could complement classification/clustering.


In [None]:
**Outlier Detection**

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Load a fresh copy of the data for the outlier-detection section.
df = pd.read_csv('bank.csv')

# Numeric columns examined for outliers. 'duration' and 'pdays' are left out
# on purpose: they may carry specific meanings that a generic outlier check
# would misread without more context.
numeric_cols = ['age', 'balance', 'campaign', 'previous']

# Basic checks: preview the frame and count missing values in the chosen columns.
display(df.head())
missing_in_selection = df[numeric_cols].isnull().sum()
print(missing_in_selection)

# Fill missing numeric values with column means so the detectors receive complete data.
column_means = df.mean(numeric_only=True)
df = df.fillna(column_means)

# Standardize the selected columns; the LOF and K-Means cells use scaled_data.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df[numeric_cols])

print("✅ Data loaded, cleaned (NaN filled), and standardized.")

In [None]:
# Absolute z-score of every selected column (pandas' default sample std).
numeric_block = df[numeric_cols]
z_scores = ((numeric_block - numeric_block.mean()) / numeric_block.std()).abs()

# Cut-off, in standard deviations.
threshold = 2.5

# A row is an outlier when ANY of its features exceeds the threshold.
df['Outlier_Z'] = z_scores.gt(threshold).any(axis=1)

print(f"[Z-Score] Detected {df['Outlier_Z'].sum()} outliers (threshold={threshold}).")

In [None]:
from sklearn.neighbors import LocalOutlierFactor

# Parameters to experiment with.
n_neighbors   = 40           # try 10, 20, 40
contamination = 0.10         # try 0.03, 0.05, 0.10
metric        = 'minkowski'
p_value       = 1            # Minkowski order: 1 = Manhattan, 2 = Euclidean

lof_params = dict(
    n_neighbors=n_neighbors,
    contamination=contamination,
    metric=metric,
    p=p_value,
)
lof = LocalOutlierFactor(**lof_params)

# Fit on the standardized data; fit_predict yields 1 for inliers, -1 for outliers.
y_pred_lof = lof.fit_predict(scaled_data)

# Persist the LOF flags as a boolean column.
df['Outlier_LOF'] = y_pred_lof == -1

print(f"[LOF] Detected {df['Outlier_LOF'].sum()} outliers "
      f"(n_neighbors={n_neighbors}, contamination={contamination}, p={p_value}).")

In [None]:
from sklearn.cluster import KMeans
from numpy.linalg import norm
import matplotlib.pyplot as plt

# Number of clusters and the distance percentile above which a row is flagged.
n_clusters = 5
threshold_percentile = 95

kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
labels = kmeans.fit_predict(scaled_data)

# Euclidean distance from every row to every centroid (rows x centroids),
# then keep each row's distance to its nearest centroid.
centroid_offsets = scaled_data[:, np.newaxis] - kmeans.cluster_centers_
distances = norm(centroid_offsets, axis=2).min(axis=1)

# Rows farther from their nearest centroid than the chosen percentile are outliers.
dist_thresh = np.percentile(distances, threshold_percentile)
df['Outlier_KMeans'] = distances > dist_thresh

print(f"[K-Means] Detected {df['Outlier_KMeans'].sum()} outliers "
      f"(k={n_clusters}, cutoff={threshold_percentile}th).")

In [None]:
method_cols = ['Outlier_Z', 'Outlier_LOF', 'Outlier_KMeans']
flags = df[method_cols]

# Number of rows flagged by each method.
counts = flags.sum().rename("Detected Outliers")
print("Per‑method counts:\n", counts.to_frame())

# Rows flagged by two methods at once, and by all three.
z_flag, lof_flag, km_flag = (df[name] for name in method_cols)
overlap_Z_LOF  = (z_flag & lof_flag).sum()
overlap_Z_KM   = (z_flag & km_flag).sum()
overlap_LOF_KM = (lof_flag & km_flag).sum()
overlap_all3   = (z_flag & lof_flag & km_flag).sum()

print("\nPairwise overlaps:")
print(f"Z ∩ LOF      : {overlap_Z_LOF}")
print(f"Z ∩ K-Means  : {overlap_Z_KM}")
print(f"LOF ∩ K-Means: {overlap_LOF_KM}")
print(f"All three    : {overlap_all3}")

# Consensus vote: number of methods (0..3) that flag each row; keep rows
# on which at least two methods agree.
df['Outlier_Vote'] = flags.sum(axis=1)
df['Consensus_2plus'] = df['Outlier_Vote'].ge(2)
print(f"Consensus (≥2 methods agree): {df['Consensus_2plus'].sum()} rows")

In [None]:
**Association Rule Mining**

In [None]:
#  Load the dataset using pandas
#  use pd.read_csv('filename.csv')
import pandas as pd

# Fresh copy of the raw data for the association-rule section.
df = pd.read_csv('bank.csv')
df.head()

In [None]:
import numpy as np
# Start from a copy, so the original columns (including 'deposit') are kept.
cat_df = df.copy()

# Bin every numeric column at its median: values <= median become 'low', the
# rest 'high'. Columns are chosen with select_dtypes(include="number") rather
# than `dtype != 'object'`, which would also let bool / category / string
# dtypes through and then fail in .median().
for col in df.select_dtypes(include="number").columns:
    if col == 'deposit':
        # the target is never binned
        continue
    median_val = df[col].median()
    cat_df[col + '_cat'] = (df[col] <= median_val).map({True: 'low', False: 'high'})

display(cat_df.head())  # check the new *_cat columns

In [None]:
# Reduce cat_df to the binned '*_cat' columns plus the target.
binned_cols = [name for name in cat_df.columns if name.endswith('_cat')]
cat_cols = binned_cols + ['deposit']
cat_df = cat_df.loc[:, cat_cols]
display(cat_df)

In [None]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

# Load bank.csv and keep the categorical columns used for Apriori.
# Note: this rebinds both `df` and `cat_df`.
df = pd.read_csv('bank.csv')
categorical_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome', 'deposit']

# Every value as a string, so items can be written as 'column=value'.
cat_df = df[categorical_cols].astype(str)

# One transaction per row: ['job=admin.', 'marital=married', ...]
transactions = [
    [col + '=' + str(value) for col, value in zip(cat_df.columns, row)]
    for row in cat_df.itertuples(index=False, name=None)
]

# One-hot encode the transactions for Apriori.
te = TransactionEncoder()
encoded = te.fit_transform(transactions)
onehot_df = pd.DataFrame(encoded, columns=te.columns_)

# Frequent itemsets with support >= 0.2.
frequent_itemsets = apriori(onehot_df, min_support=0.2, use_colnames=True)

# Rules with confidence >= 0.7, restricted to those whose consequent contains 'deposit=yes'.
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)
predicts_yes = rules['consequents'].apply(lambda items: 'deposit=yes' in items)
rules_target = rules[predicts_yes]

# Three most confident rules.
show_top_3 = rules_target.sort_values(by='confidence', ascending=False).head(3)
print("\nTop 3 Association Rules Predicting 'deposit=yes':")
print(show_top_3)

In [None]:
#  Generate Association Rules

# Use the Apriori algorithm to find relationships (rules) between feature
# combinations and the target variable 'deposit'.

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# TransactionEncoder comes from mlxtend.preprocessing; apriori and
# association_rules come from mlxtend.frequent_patterns.
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

# Turn each row of cat_df into a "transaction": a list of 'column=value' items,
# e.g. ["age_cat=high", "balance_cat=low", ...].
transactions = [
    [col + "=" + str(value) for col, value in zip(cat_df.columns, row)]
    for row in cat_df.itertuples(index=False, name=None)
]

# Encode the transactions into a one-hot (boolean) DataFrame.
te = TransactionEncoder()
te_data = te.fit(transactions).transform(transactions)
onehot_df = pd.DataFrame(te_data, columns=te.columns_)

# Run Apriori. Lower min_support (0.05 → 0.03) if no rules come out.
# max_len=3 caps every itemset at three items in total.
frequent_itemsets = apriori(
    onehot_df,
    min_support=0.05,
    use_colnames=True,
    max_len=3
)

# Generate rules from the frequent itemsets, thresholded on confidence.
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.4)

# Keep only rules that predict the target. The target of bank.csv is 'deposit';
# filtering on 'price_range' (a column this dataset does not have) matched
# nothing, so the cell always ended in "No rules found".
# .astype(bool) keeps the mask boolean even when `rules` is empty.
predicts_target = rules["consequents"].apply(
    lambda items: any(str(item).startswith("deposit=") for item in items)
).astype(bool)
rules_target = rules[predicts_target]

# Keep rules with at most 2 items on the left side (antecedents).
few_antecedents = (rules_target["antecedents"].apply(len) <= 2).astype(bool)
simple_rules_df = rules_target[few_antecedents]

# Sort and show the top 3 rules.
if len(simple_rules_df) == 0:
    print("⚠️ No rules found. Try lowering min_support or min_threshold.")
else:
    top3 = simple_rules_df.sort_values("confidence", ascending=False).head(3)
    print("Top 3 rules by confidence:")
    display(top3[["antecedents", "consequents", "support", "confidence", "lift"]])


In [None]:
# Add rule-based features.
#
# cat_df is rebuilt here as "median-binned numeric columns + 'deposit'",
# because the Apriori cell above rebinds cat_df to the original categorical
# columns. `df` must be the raw bank.csv frame.
temp_cat_df = df.copy()

# Bin every numeric column at its median ('low' if value <= median else 'high').
# select_dtypes(include="number") is used instead of `dtype != 'object'`, which
# would also let bool / category / string dtypes through and fail in .median().
for col in df.select_dtypes(include="number").columns:
    if col == 'deposit':
        # the target is never binned
        continue
    median_val = df[col].median()
    temp_cat_df[col + '_cat'] = (df[col] <= median_val).map({True: 'low', False: 'high'})

# Keep only the *_cat columns and the target.
cat_cols_filtered = [c for c in temp_cat_df.columns if c.endswith('_cat')] + ['deposit']
cat_df = temp_cat_df[cat_cols_filtered].copy()

enhanced_df = cat_df.copy()

# Illustrative rule flags (1 = the rule's antecedent holds for the row, else 0).
# A rule feature must be computed from ANTECEDENT columns only: using the
# target 'deposit' in a feature leaks the label into any model trained on it.

# Rule 1: High age AND High balance
enhanced_df['Rule1'] = (
    (cat_df['age_cat'] == 'high') & (cat_df['balance_cat'] == 'high')
).astype('int64')

# Rule 2: High duration. The former "AND deposit == 'yes'" condition was
# removed because it encoded the target itself into the feature.
enhanced_df['Rule2'] = (cat_df['duration_cat'] == 'high').astype('int64')

# Rule 3: Low age AND Low campaign
enhanced_df['Rule3'] = (
    (cat_df['age_cat'] == 'low') & (cat_df['campaign_cat'] == 'low')
).astype('int64')

In [None]:
# Independent (deep) copy of the enhanced frame, previewed.
ench_df = enhanced_df.copy(deep=True)
ench_df.head()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
import pandas as pd
import numpy as np

# Compare two random forests on the same task (predict 'deposit'):
#   baseline -> raw bank.csv columns, scaled / one-hot encoded
#   enhanced -> median-binned '*_cat' columns plus the 'Rule*' flags
# NOTE(review): any Rule* column derived from 'deposit' would leak the target
# into the enhanced features — verify the rule definitions before trusting
# the enhanced score.

# Baseline inputs
X_base = df.drop(columns=['deposit'])
y = df['deposit']

numerical_cols_base = X_base.select_dtypes(include=np.number).columns
categorical_cols_base = X_base.select_dtypes(include='object').columns

# Baseline preprocessing: scale numeric columns, one-hot encode categorical ones.
preprocessor_base = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols_base),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols_base)
    ])

# Enhanced inputs: binned and rule columns only, never the target.
categorical_cols_enhanced = [
    col for col in enhanced_df.columns
    if (col.endswith('_cat') or col.startswith('Rule')) and col != 'deposit'
]
encoder_enhanced = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Reset both indexes so features and target stay aligned row by row.
X_enhanced = enhanced_df[categorical_cols_enhanced].reset_index(drop=True)
y_processed = enhanced_df['deposit'].reset_index(drop=True)

# Split BEFORE fitting any transformer, so the scaler / encoders learn their
# statistics and categories from the training rows only (no test leakage).
X_train_base_raw, X_test_base_raw, y_train_base, y_test_base = train_test_split(
    X_base, y, test_size=0.2, random_state=42
)
X_train_base = preprocessor_base.fit_transform(X_train_base_raw)
X_test_base = preprocessor_base.transform(X_test_base_raw)

# Train baseline Random Forest
rf1 = RandomForestClassifier(random_state=42)
rf1.fit(X_train_base, y_train_base)
pred_base = rf1.predict(X_test_base)

# Same test_size / random_state for the enhanced split.
X_train_enh_raw, X_test_enh_raw, y_train_enhanced, y_test_enhanced = train_test_split(
    X_enhanced, y_processed, test_size=0.2, random_state=42
)
X_train_enhanced = pd.DataFrame(
    encoder_enhanced.fit_transform(X_train_enh_raw),
    columns=encoder_enhanced.get_feature_names_out(categorical_cols_enhanced),
    index=X_train_enh_raw.index,
)
X_test_enhanced = pd.DataFrame(
    encoder_enhanced.transform(X_test_enh_raw),
    columns=encoder_enhanced.get_feature_names_out(categorical_cols_enhanced),
    index=X_test_enh_raw.index,
)

# Train enhanced Random Forest
rf2 = RandomForestClassifier(random_state=42)
rf2.fit(X_train_enhanced, y_train_enhanced)
pred_enhanced = rf2.predict(X_test_enhanced)

# Weighted F1: per-class F1 averaged by class support, matching the metric
# used in the classification section.
f1_base = f1_score(y_test_base, pred_base, average='weighted')
f1_enhanced = f1_score(y_test_enhanced, pred_enhanced, average='weighted')

# Print results
print(f'Baseline F1-score: {f1_base:.4f}')
print(f'Enhanced F1-score: {f1_enhanced:.4f}')