# SRules

# Statics

In [1]:
import dask.dataframe as dd
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn import metrics


## Load Dataset

In [2]:

from SRules.Tests.Utils.DatasetUtils import read_dataset

filename = "divorce"
X, y, dataset, target_value_name, pandas_dataset = read_dataset(filename, f'../data/{filename}.csv')

pandas_dataset.head()

Unnamed: 0,Atr1_0,Atr1_1,Atr1_2,Atr1_3,Atr1_4,Atr2_0,Atr2_1,Atr2_2,Atr2_3,Atr2_4,...,Atr53_1,Atr53_2,Atr53_3,Atr53_4,Atr54_0,Atr54_1,Atr54_2,Atr54_3,Atr54_4,Class
0,False,False,True,False,False,False,False,True,False,False,...,False,True,False,False,False,True,False,False,False,True
1,False,False,False,False,True,False,False,False,False,True,...,False,True,False,False,False,False,True,False,False,True
2,False,False,True,False,False,False,False,True,False,False,...,False,True,False,False,False,False,True,False,False,True
3,False,False,False,True,False,False,False,True,False,False,...,False,True,False,False,False,False,True,False,False,True
4,False,False,True,False,False,False,False,True,False,False,...,True,False,False,False,True,False,False,False,False,True


In [3]:
#Define dataset
X_train, X_test, y_train, y_test = train_test_split(dataset.data, dataset.target, test_size=0.1, random_state=1)
encoded_train_pandas_dataset = pd.DataFrame(data= np.c_[X_train, y_train], columns= list(dataset['feature_names']) + [target_value_name])
encoded_test_pandas_dataset = pd.DataFrame(data= np.c_[X_test, y_test], columns= list(dataset['feature_names']) + [target_value_name])
print('Sizes (without target):')
print(f'Original size {dataset.data.shape}')
print(f'Train size {X_train.shape}')
print(f'Test size {X_test.shape}')
print(f'encoded_train_pandas_dataset size {encoded_train_pandas_dataset.shape}')
print(f'encoded_test_pandas_dataset size {encoded_test_pandas_dataset.shape}')

Sizes (without target):
Original size (170, 270)
Train size (153, 270)
Test size (17, 270)
encoded_train_pandas_dataset size (153, 271)
encoded_test_pandas_dataset size (17, 271)


## Define Random Forest

In [4]:
from sklearn.ensemble import RandomForestClassifier

# Define scorer
ensemble = RandomForestClassifier()
ensemble.fit(X_train, y_train)

## SRules

In [5]:
from SRules.SRules import SRules

import time
start_time = time.time()
rules = SRules(
                feature_names=dataset.feature_names,
                target_value_name = dataset.target_names,
                display_features = False,
                display_logs = False,
                recursive=True,
                p_significance = 0.99,
                minImp = 0.01,
                min_accuracy_coefficient = 0.95,
                minInsNode = 5
            )
# Fit model
rules.fit(
    method=ensemble,
    X_train=X_train,
    y_train=y_train,
    original_dataset=encoded_train_pandas_dataset,
            use_shap=False,
            use_lime=False)

elapsed_time = time.time() - start_time
print(f"Elapsed TOTAL TIME: {elapsed_time:.3f} seconds")

Elapsed TOTAL TIME: 13.607 seconds


# Print Model

In [6]:
print(rules)

> ++++++++++++++++++++++++++++
> SRules --  Number of Rules: 3
> SRules --  Number of Minimal Rules: 3
> ++++++++++++++++++++++++++++
> ------MINIMAL RULES--------
 ** Accuracy: 1.0
 ** Atr5_0 == True  &  Atr20_0 == True --> False
> ------------------------------
 ** Accuracy: 0.9864864864864865
 ** Atr5_0 == False  &  Atr20_0 == False --> True
> ------------------------------
 ** Accuracy: 0.958904109589041
 ** Atr5_0 == True --> False
> ------------------------------
> ------COMPLETE RULES--------
 ** Accuracy: 1.0
 ** Atr5_0 == True  &  Atr20_0 == True --> False
> ------------------------------
 ** Accuracy: 0.9864864864864865
 ** Atr5_0 == False  &  Atr20_0 == False --> True
> ------------------------------
 ** Accuracy: 0.958904109589041
 ** Atr5_0 == True --> False
> ------------------------------


# Predict

In [7]:
# ENSEMBLE
y_pred_test_ensemble = ensemble.predict(X_test)

# RULES
y_pred_test_rules = rules.predict(X_test, sorting_method="target_accuracy")

In [8]:
# CATEGORIZABLES
np_array_rules = np.array(y_pred_test_rules)
#not_filter_indices = np.where(np.logical_and(np_array_rules != 0, np_array_rules!=1))[0]
filter_indices = np.where(np_array_rules != None)[0]


np_filterred_y_test = np.array(y_test)[filter_indices]
np_filterred_y_pred_test_ensemble = np.array(y_pred_test_ensemble)[filter_indices]
np_filterred_y_pred_test_rules = np.array(y_pred_test_rules)[filter_indices]
# CHANGE FORMAT IN ORDER TO NOT HAVE PROBLEMS
np_filterred_y_pred_test_rules = np_filterred_y_pred_test_rules.astype('int64')


print(f'DATASET TEST: {len(y_test)}')
print(f'DATASET TEST categorizable: {len(np_filterred_y_test)}')
print('Cobertura:',str("{:.2f}".format(100*(len(np_filterred_y_pred_test_rules)/len(y_test))))+'%')

ensemble_accuracy = metrics.accuracy_score(np_filterred_y_test, np_filterred_y_pred_test_ensemble)
print('RF accuracy:',str("{:.2f}".format(100*ensemble_accuracy))+'%')
ensemble_accuracy = metrics.f1_score(np_filterred_y_test, np_filterred_y_pred_test_ensemble)
print('RF F1-score:',str("{:.2f}".format(100*ensemble_accuracy))+'%')
rules_accuracy = metrics.accuracy_score(np_filterred_y_test, np_filterred_y_pred_test_rules)
print('Rules Accuracy:',str("{:.2f}".format(100*rules_accuracy))+'%')
rules_F1 = metrics.f1_score(np_filterred_y_test, np_filterred_y_pred_test_rules)
print('Rules F1-score:',str("{:.2f}".format(100*rules_F1))+'%')
rules_roc_auc = metrics.roc_auc_score(np_filterred_y_test, np_filterred_y_pred_test_rules)
print('Rules roc_auc_score:',str("{:.2f}".format(100*rules_roc_auc))+'%')


DATASET TEST: 17
DATASET TEST categorizable: 16
Cobertura: 94.12%
RF accuracy: 56.25%
RF F1-score: 0.00%
Rules Accuracy: 100.00%
Rules F1-score: 100.00%
Rules roc_auc_score: 100.00%
