In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from pyTsetlinMachine.tm import MultiClassTsetlinMachine


# Load the dataset
data = pd.read_csv("bank-additional-full.csv", sep=';')

# Display the first few rows of the data to confirm the structure
print("First few rows of the dataset:")
print(data.head())

# Encode categorical variables
data = pd.get_dummies(data, drop_first=True)

# Extract features and target variable
X = data.drop(columns=['y_yes'])
y = data['y_yes']

# Normalize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)




First few rows of the dataset:
   age        job  marital    education  default housing loan    contact  \
0   56  housemaid  married     basic.4y       no      no   no  telephone   
1   57   services  married  high.school  unknown      no   no  telephone   
2   37   services  married  high.school       no     yes   no  telephone   
3   40     admin.  married     basic.6y       no      no   no  telephone   
4   56   services  married  high.school       no      no  yes  telephone   

  month day_of_week  ...  campaign  pdays  previous     poutcome emp.var.rate  \
0   may         mon  ...         1    999         0  nonexistent          1.1   
1   may         mon  ...         1    999         0  nonexistent          1.1   
2   may         mon  ...         1    999         0  nonexistent          1.1   
3   may         mon  ...         1    999         0  nonexistent          1.1   
4   may         mon  ...         1    999         0  nonexistent          1.1   

   cons.price.idx  cons.c

In [10]:
# Genetic Algorithm for feature selection (placeholder function)
def genetic_algorithm_feature_selection(X, y):
    # Simple feature selection by ranking based on correlation with target
    correlations = np.abs([np.corrcoef(X[:, i], y)[0, 1] for i in range(X.shape[1])])
    sorted_indices = np.argsort(correlations)[::-1]
    selected_features = sorted_indices[:int(0.75 * len(sorted_indices))]  # Select top 75% features
    return selected_features

# Apply Genetic Algorithm for feature selection
selected_features = genetic_algorithm_feature_selection(X_train, y_train)
X_train_selected = X_train[:, selected_features]
X_test_selected = X_test[:, selected_features]

# Binarize the features for Tsetlin Machine
X_train_binarized = np.where(X_train_selected > np.median(X_train_selected, axis=0), 1, 0)
X_test_binarized = np.where(X_test_selected > np.median(X_test_selected, axis=0), 1, 0)

# Hyperparameter tuning
param_grid = {
    'clauses': [50, 100, 200],
    'T': [10, 15, 20],
    's': [3.0, 3.9, 4.5]
}


def grid_search_tsetlin(X_train, y_train, param_grid):
    best_params = None
    best_score = 0
    for clauses in param_grid['clauses']:
        for T in param_grid['T']:
            for s in param_grid['s']:
                tm = MultiClassTsetlinMachine(clauses, T, s)
                tm.fit(X_train, y_train, epochs=10)
                y_pred = tm.predict(X_train)
                score = accuracy_score(y_train, y_pred)
                if score > best_score:
                    best_score = score
                    best_params = {'clauses': clauses, 'T': T, 's': s}
    return best_params

best_params = grid_search_tsetlin(X_train_binarized, y_train, param_grid)
print("Best parameters found: ", best_params)

Best parameters found:  {'clauses': 200, 'T': 10, 's': 4.5}


In [11]:


# Initialize and train the Tsetlin Machine with best parameters
tm = MultiClassTsetlinMachine(best_params['clauses'], best_params['T'], best_params['s'])
tm.fit(X_train_binarized, y_train, epochs=100)

# Evaluate the model
y_pred = tm.predict(X_test_binarized)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"TM with Genetic Algorithm - Accuracy: {accuracy}")
print(f"TM with Genetic Algorithm - Precision: {precision}")
print(f"TM with Genetic Algorithm - Recall: {recall}")
print(f"TM with Genetic Algorithm - F1 Score: {f1}")


TM with Genetic Algorithm - Accuracy: 0.9017965525613013
TM with Genetic Algorithm - Precision: 0.7058823529411765
TM with Genetic Algorithm - Recall: 0.23101604278074866
TM with Genetic Algorithm - F1 Score: 0.34810636583400484


In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier

# Train and evaluate Random Forest with selected features
rf = RandomForestClassifier()
rf.fit(X_train_selected, y_train)
y_pred_rf = rf.predict(X_test_selected)

# Train and evaluate Logistic Regression with selected features
lr = LogisticRegression()
lr.fit(X_train_selected, y_train)
y_pred_lr = lr.predict(X_test_selected)

# Train and evaluate SVM with selected features
svm = SVC()
svm.fit(X_train_selected, y_train)
y_pred_svm = svm.predict(X_test_selected)

# Train and evaluate Gradient Boosting with selected features
gb = GradientBoostingClassifier()
gb.fit(X_train_selected, y_train)
y_pred_gb = gb.predict(X_test_selected)

# Evaluation Metrics
models = {'Random Forest': y_pred_rf, 'Logistic Regression': y_pred_lr, 'SVM': y_pred_svm, 'Gradient Boosting': y_pred_gb}

for model_name, y_pred in models.items():
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    print(f"{model_name} - Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 Score: {f1}")


Random Forest - Accuracy: 0.911750424860403, Precision: 0.6444444444444445, Recall: 0.49625668449197863, F1 Score: 0.5607250755287009
Logistic Regression - Accuracy: 0.9112648701141054, Precision: 0.6711409395973155, Recall: 0.42780748663101603, F1 Score: 0.5225342913128674
SVM - Accuracy: 0.9081087642631707, Precision: 0.6731517509727627, Recall: 0.3700534759358289, F1 Score: 0.4775707384403037
Gradient Boosting - Accuracy: 0.920004855547463, Precision: 0.6885245901639344, Recall: 0.5390374331550802, F1 Score: 0.6046790641871626
