In [1]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.multiclass import unique_labels
from sklearn.utils.validation import check_array, check_is_fitted, check_X_y

In [2]:
import warnings
# Suppress every FutureWarning for the rest of the kernel session.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_val_predict
from sklearn.metrics import accuracy_score, confusion_matrix

In [13]:
class FCAClassifier(BaseEstimator, ClassifierMixin):
    """Binary classifier that votes by feature overlap with stored training rows.

    ``fit`` memorises the training rows labelled ``1`` ("positive") and ``0``
    ("negative"). For every object to classify, ``predict`` counts how many
    stored rows of each group agree with it on strictly more than
    ``int(n_features * threshold)`` feature values, divides each count by the
    size of its group, and predicts ``1`` only when the positive share is
    strictly larger than the negative share (ties go to ``0``).

    Only the label values ``0`` and ``1`` take part in the vote; rows with any
    other label are stored in ``X_`` / ``y_`` but otherwise ignored.

    Parameters
    ----------
    threshold : float, default=0.5
        Fraction of the feature count that the number of matching features
        must strictly exceed (after truncation to an integer) for a training
        row to count as a match.

    Attributes
    ----------
    classes_ : ndarray
        Unique labels seen during ``fit``.
    X_, y_ : ndarray
        Validated training data and labels.
    X_pos_, X_neg_ : ndarray
        Training rows whose label is ``1`` and ``0`` respectively.
    """

    def __init__(self, threshold=0.5):
        self.threshold = threshold

    def fit(self, X, y):
        """Store the training rows, split into positive and negative groups.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)

        Returns
        -------
        self : FCAClassifier

        Raises
        ------
        ValueError
            If ``y`` contains neither label ``0`` nor label ``1`` (or if
            ``check_X_y`` rejects the input).
        """
        # Check that X and y have correct shape
        X, y = check_X_y(X, y)
        # Store the classes seen during fit
        self.classes_ = unique_labels(y)

        self.X_ = X
        self.y_ = y
        self.X_pos_ = X[y == 1]
        self.X_neg_ = X[y == 0]

        # With no 0/1 labels at all there is nothing to vote with; fail here
        # with a clear message instead of failing later inside predict().
        if len(self.X_pos_) == 0 and len(self.X_neg_) == 0:
            raise ValueError(
                "FCAClassifier expects labels 0 and 1; neither was found in y."
            )
        # Return the classifier
        return self

    @staticmethod
    def _support(obj, group, min_matches):
        """Share of rows in ``group`` that match ``obj`` on more than ``min_matches`` features."""
        # An empty group cannot support anything; this also avoids dividing by zero.
        if len(group) == 0:
            return 0.0
        matches_per_row = np.count_nonzero(group == obj, axis=1)
        return np.count_nonzero(matches_per_row > min_matches) / len(group)

    def predict(self, X):
        """Predict ``0``/``1`` for every row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        y_pred : ndarray of shape (n_samples,)
            ``1`` where the positive support is strictly greater than the
            negative support, ``0`` otherwise.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If called before ``fit``.
        ValueError
            If ``X`` does not have the same number of features as the
            training data.
        """
        check_is_fitted(self, ["X_", "X_pos_", "X_neg_"])

        # Input validation
        X = check_array(X)
        n_features = self.X_.shape[1]
        if X.shape[1] != n_features:
            raise ValueError(
                "X has {} features, but FCAClassifier was fitted with {} features."
                .format(X.shape[1], n_features)
            )

        # A training row matches when it agrees on strictly more than this
        # many features; the truncation via int() is intentional and kept
        # so that tuned thresholds keep their meaning.
        min_matches = int(n_features * self.threshold)

        y_pred = np.empty(X.shape[0], dtype=int)
        for i, obj in enumerate(X):
            pos = self._support(obj, self.X_pos_, min_matches)
            neg = self._support(obj, self.X_neg_, min_matches)
            y_pred[i] = 1 if pos > neg else 0

        return y_pred

    def score(self, X, y, sample_weight=None):
        """Return the (optionally weighted) accuracy of ``predict(X)`` against ``y``."""
        y_pred = self.predict(X)
        return accuracy_score(y, y_pred, sample_weight=sample_weight)

In [14]:
# Load the raw dataset from heart.csv (path is relative to the working directory).
df = pd.read_csv('heart.csv')

In [15]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [16]:
def binarize_column(column, bins=5):
    """Discretise ``column`` into equal-width intervals using ``pd.cut``.

    The number of intervals is ``bins``, capped at the number of distinct
    values found in ``column`` so that low-cardinality columns are not split
    into more intervals than they have values.

    Parameters
    ----------
    column : array-like accepted by ``pd.unique`` and ``pd.cut``
    bins : int, default=5
        Upper limit on the number of intervals.

    Returns
    -------
    The result of ``pd.cut(column, n_intervals)``.
    """
    distinct_count = len(pd.unique(column))
    n_intervals = bins if bins < distinct_count else distinct_count
    return pd.cut(column, n_intervals)

In [17]:
def load_dataset(df):
    """Split ``df`` into a one-hot encoded feature frame and a label array.

    The ``'target'`` column becomes the label array. Every other column is
    passed through ``binarize_column`` and the resulting interval categories
    are expanded into indicator columns with ``pd.get_dummies``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'target'`` column.

    Returns
    -------
    (bin_X, y) : tuple
        ``bin_X`` is the indicator-encoded feature frame, ``y`` a NumPy array
        holding the ``'target'`` values.
    """
    features = df.drop(columns=['target'])
    labels = np.array(df['target'])
    binned_features = features.transform(binarize_column)
    return pd.get_dummies(binned_features), labels

In [18]:
# Build the indicator-encoded feature frame X and the label array y from df.
X, y = load_dataset(df)

In [19]:
# Preview the encoded features: one indicator column per (feature, interval) pair.
X.head()

Unnamed: 0,"age_(28.952, 38.6]","age_(38.6, 48.2]","age_(48.2, 57.8]","age_(57.8, 67.4]","age_(67.4, 77.0]","sex_(-0.001, 0.5]","sex_(0.5, 1.0]","cp_(-0.003, 0.75]","cp_(0.75, 1.5]","cp_(1.5, 2.25]",...,"slope_(1.333, 2.0]","ca_(-0.004, 0.8]","ca_(0.8, 1.6]","ca_(1.6, 2.4]","ca_(2.4, 3.2]","ca_(3.2, 4.0]","thal_(-0.003, 0.75]","thal_(0.75, 1.5]","thal_(1.5, 2.25]","thal_(2.25, 3.0]"
0,0,0,0,1,0,0,1,0,0,0,...,0,1,0,0,0,0,0,1,0,0
1,1,0,0,0,0,0,1,0,0,1,...,0,1,0,0,0,0,0,0,1,0
2,0,1,0,0,0,1,0,0,1,0,...,1,1,0,0,0,0,0,0,1,0
3,0,0,1,0,0,0,1,0,1,0,...,1,1,0,0,0,0,0,0,1,0
4,0,0,1,0,0,1,0,1,0,0,...,1,1,0,0,0,0,0,0,1,0


In [20]:
# Baseline: default threshold (0.5), mean of the 10-fold cross-validation scores.
# cross_val_score falls back to FCAClassifier.score, i.e. accuracy.
fca_classifier = FCAClassifier()
np.mean(cross_val_score(fca_classifier, X, y, cv=10))

0.5187393400074155

In [21]:
# Tune `threshold` over ten evenly spaced values from 0.1 to 1.0.
# No `cv` or `scoring` is passed, so GridSearchCV uses its library defaults.
parameters = {'threshold': np.linspace(0.1, 1, 10)}
grid_search = GridSearchCV(fca_classifier, parameters)
grid_search.fit(X, y)
grid_search.best_estimator_

FCAClassifier(threshold=0.7000000000000001)

In [109]:
# Mean cross-validated score of the best threshold found above.
grid_search.best_score_

0.834983498349835

In [125]:
# Scorer names that calculate_metrics passes as `scoring=` to cross_val_score.
metrics = ["accuracy", "precision", "recall", "f1"]

In [136]:
def calculate_metrics(clf, metrics, X, y, cv=10):
    """Collect cross-validated scores and confusion-matrix counts for ``clf``.

    Parameters
    ----------
    clf : estimator
        Passed unchanged to ``cross_val_score`` and ``cross_val_predict``.
    metrics : iterable
        Values forwarded one at a time as ``scoring=`` to ``cross_val_score``;
        each also becomes a key of the returned dict.
    X, y : array-like
        Features and labels.
    cv : int or CV splitter, default=10
        Cross-validation strategy used for every call.

    Returns
    -------
    dict
        One entry per metric holding the mean of its fold scores, followed by
        ``"true positive"``, ``"false positive"``, ``"true negative"`` and
        ``"false negative"`` taken from the confusion matrix of the
        cross-validated predictions.
    """
    # Each metric triggers its own cross-validation run.
    scores = {
        metric: np.mean(cross_val_score(clf, X, y, cv=cv, scoring=metric))
        for metric in metrics
    }

    predictions = cross_val_predict(clf, X, y, cv=cv)
    # The 4-way unpacking relies on the confusion matrix being 2x2.
    tn, fp, fn, tp = confusion_matrix(y, predictions).ravel()

    scores.update({
        "true positive": tp,
        "false positive": fp,
        "true negative": tn,
        "false negative": fn,
    })
    return scores
    

In [123]:
# Fresh classifier configured with the best threshold found by the grid search.
fca_classifier_tuned = FCAClassifier(grid_search.best_params_['threshold'])

In [130]:
# Cross-validated metrics for the tuned FCA classifier (cv defaults to 10).
# NOTE(review): the stored output of this cell is not the dict that the current
# calculate_metrics returns, and its execution count (130) precedes the cell that
# defines the function (136) - re-run with Restart & Run All to refresh it.
calculate_metrics(fca_classifier_tuned, metrics, X, y)

accuracy 0.8339896180941787
precision 0.811921961767163
recall 0.9143382352941177
f1 0.8583838299318176
102 36 14 151


In [120]:
# Comparison model: k-nearest neighbours with k=3.
neigh = KNeighborsClassifier(n_neighbors=3)

In [137]:
# Same metrics and the same encoded data as for the FCA classifier.
calculate_metrics(neigh, metrics, X, y)

{'accuracy': 0.7924063774564332,
 'precision': 0.7713059107215454,
 'recall': 0.8849264705882351,
 'f1': 0.8223421570364294,
 'true positive': 146,
 'false positive': 44,
 'true negative': 94,
 'false negative': 19}

In [115]:
# Comparison model: logistic regression with a fixed random_state.
lr = LogisticRegression(random_state=42)

In [116]:
# NOTE(review): the stored output of this cell lacks the confusion-matrix entries
# that the current calculate_metrics returns, and its execution count (116)
# precedes the cell that defines the function (136) - re-run to refresh it.
calculate_metrics(lr, metrics, X, y)

accuracy 0.8403188728216536
precision 0.8514884638065752
recall 0.8720588235294118
f1 0.8586715111264447
