# iPRules

# Statics

In [None]:
from sklearn.datasets import load_iris
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.utils import Bunch
from sklearn.datasets._base import load_csv_data
from sklearn import metrics
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer


## Load Dataset

In [None]:
# Load Dataset
# Mushrooms: binary target, `target_true` is encoded as 1 and `target_false` as 0
filename = 'mushrooms'
target_true = 'p'
target_false = 'e'
test_size = 0.33

data_file_name = f'../../data/{filename}.csv'
pandas_dataset = pd.read_csv(data_file_name)

# Clean the column names once, before deriving any name from them, so that
# feature_names and target_value_name always match the DataFrame's columns.
cleaned_columns = [name.replace('%', '').replace(' ', '') for name in pandas_dataset.columns]
# The target is taken to be the last column; only that column is renamed
# 'class' -> 'target_value' (feature names containing 'class' are left alone).
cleaned_columns[-1] = cleaned_columns[-1].replace('class', 'target_value')
pandas_dataset.columns = cleaned_columns

feature_names = pandas_dataset.columns[:-1]
target_value_name = pandas_dataset.columns[-1]

# Encode the target as 0/1 and fail early on labels outside the mapping:
# Series.map turns them into NaN, which would only surface later during fitting.
encoded_target = pandas_dataset[target_value_name].map({target_false: 0, target_true: 1})
if encoded_target.isna().any():
    raise ValueError(
        f"Column '{target_value_name}' contains values other than "
        f"'{target_false}' and '{target_true}'"
    )
pandas_dataset[target_value_name] = encoded_target

pandas_dataset.head()

# One Hot Encoding

In [None]:
# Fit a dense one-hot encoder on the feature columns only
enc = OneHotEncoder(sparse_output=False)
feature_frame = pandas_dataset.loc[:, feature_names]
encoded_array = enc.fit_transform(feature_frame)
encoded_feature_names = enc.get_feature_names_out()

# Indicator columns come first, followed by what remains of the original
# frame once the raw feature columns are removed (the target column).
df_encoded = pd.DataFrame(encoded_array, columns=encoded_feature_names)
encoded_pandas_dataset = pd.concat(
    [df_encoded, pandas_dataset.drop(columns=feature_names)],
    axis=1,
)
encoded_pandas_dataset.head()

In [None]:
#print(pandas_dataset.shape)

# One-hot encode the feature columns and pass the remaining columns through unchanged
#ct = make_column_transformer(
#    (OneHotEncoder(), feature_names),
#    n_jobs=3,
#    remainder='passthrough',
#    sparse_threshold=0)

#pandas_dataset_encoded = ct.fit_transform(pandas_dataset).T

#print(pandas_dataset_encoded[0])

#column_names = (ct.named_transformers_["onehotencoder"].get_feature_names_out().tolist()
#               + [target_value_name])

#encoded_dataset = pd.DataFrame(pandas_dataset_encoded, column_names)

#encoded_dataset.head()

In [None]:
# Separate the one-hot encoded inputs from the target column
X = encoded_pandas_dataset[encoded_feature_names]
y = encoded_pandas_dataset[target_value_name]

# Package the NumPy views of both in a scikit-learn Bunch, together with
# the target column name and the encoded feature names.
bunch_fields = {
    'data': X.to_numpy(),
    'target': y.to_numpy(),
    'target_names': target_value_name,
    'feature_names': X.columns,
}
encoded_dataset = Bunch(**bunch_fields)


# Understand Dataset

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Draw every column of the encoded frame in its own subplot
encoded_pandas_dataset.plot(subplots=True)

# Tighten the layout of the figure that was just created, then render it
current_figure = plt.gcf()
current_figure.tight_layout()
plt.show()

# Divide dataset

In [None]:
# Hold out `test_size` of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    encoded_dataset.data,
    encoded_dataset.target,
    test_size=test_size,
    random_state=1,
)

# Report the shape of the full feature matrix and of each split
print('Sizes (without target):')
for split_label, split_array in (
    ('Original', encoded_dataset.data),
    ('Train', X_train),
    ('Test', X_test),
):
    print(f'{split_label} size {split_array.shape}')

## Define Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score, make_scorer, recall_score, accuracy_score

# Candidate configurations are ranked by plain accuracy (higher is better)
custom_scorer = make_scorer(accuracy_score, greater_is_better=True)
param_grid = {
        'n_estimators': [50, 100, 150, 200, 250, 300],  # number of trees in the forest
        'min_samples_leaf': [3],  # minimum number of samples required at a leaf node
        'min_samples_split': [6],  # minimum number of samples required to split an internal node
        'criterion': ['entropy', 'gini'],  # split-quality measure: entropy or Gini impurity
        }
clf = GridSearchCV(
        # Seed the forest so that the selected model, and everything derived
        # from it further down, is reproducible between runs; the value
        # matches the random_state already used for the train/test split.
        RandomForestClassifier(random_state=1),
        param_grid,  # dict of parameters to search over
        cv=10,  # number of folds in the K-fold cross-validation
        scoring=custom_scorer)

clf.fit(X_train, y_train)

# Best configuration found by the search
ensemble = clf.best_estimator_

## Fit

In [None]:
# Train the random forest classifier on the training split (redundant: GridSearchCV already refits the best estimator)
#ensemble.fit(X_train, y_train)

# Make predictions for the test set
#y_pred_test = ensemble.predict(X_test)

## iPRules

In [None]:
from iPRules.iPRules import iPRules

# Constructor arguments gathered in one place: the tuned forest, the encoded
# feature names, the target column name and the two numeric settings.
iprules_settings = dict(
    base_ensemble=ensemble,
    feature_names=encoded_dataset.feature_names,
    target_value_name=target_value_name,
    chi_square_probability=0.95,
    scale_feature_coefficient=0.85,
)
tree = iPRules(**iprules_settings)

# fit receives the full encoded DataFrame together with the training split
tree.fit(encoded_pandas_dataset, X_train, y_train)

# Print Model

In [None]:
# Show the fitted iPRules model through its string representation
print(tree)

# Predict

In [None]:
# Predict the held-out rows with the fitted iPRules model
tree_preds = tree.predict(X_test)
# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy is
# symmetric, so the value is unchanged, but the correct order keeps this line
# safe if the metric is later swapped for precision or recall.
print('The accuracy of the Tree model is :\t', metrics.accuracy_score(y_test, tree_preds))

# SHAP explainer

In [None]:
import shap

# Create Tree Explainer object that can calculate shap values
explainer = shap.TreeExplainer(ensemble)

# Evaluate SHAP values on the held-out rows
shap_values = explainer.shap_values(X_test)
# The feature matrix handed to the plot must be the same rows the SHAP values
# were computed for, i.e. X_test rather than X_train.
shap.summary_plot(shap_values, X_test, feature_names=encoded_dataset.feature_names)


In [None]:

# shap_values were computed on X_test, so the feature matrix used to colour
# the points must be X_test as well (same rows, same order).
# NOTE(review): depending on the SHAP release, a classifier explanation is
# either a list with one array per class or a single
# (samples, features, classes) array, while the dot plot needs one 2-D matrix.
# Class 1 is selected here; confirm that this is the class of interest.
if isinstance(shap_values, list):
    positive_class_shap = shap_values[1]
elif getattr(shap_values, 'ndim', 2) == 3:
    positive_class_shap = shap_values[:, :, 1]
else:
    positive_class_shap = shap_values

shap.summary_plot(positive_class_shap, X_test, feature_names=encoded_dataset.feature_names, plot_type="dot")