# Experiments

## Dataset 1 - Breast Cancer

In [52]:
from sklearn.datasets import load_breast_cancer
# return_X_y=True requests the (features, target) pair, unpacked here into X and Y.
X, Y = load_breast_cancer(return_X_y=True)

## Preparing data

In [53]:
import numpy as np
from collinearity import SelectNonCollinear
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler


def remove_missing_rows(X, Y):
    """Drop every observation whose feature row contains at least one NaN.

    Parameters
    ----------
    X : numpy.ndarray, 2-D
        Numeric feature matrix with one observation per row.
    Y : numpy.ndarray
        Targets aligned with the rows of ``X``.

    Returns
    -------
    tuple
        ``(X_kept, Y_kept)``: both inputs indexed with the same boolean row
        mask, so they stay aligned.
    """
    row_has_nan = np.isnan(X).any(axis=1)
    keep_rows = ~row_has_nan
    return X[keep_rows], Y[keep_rows]

def encode_categorical(X):
    """One-hot encode the string-valued columns of ``X`` and keep all others.

    A column is treated as categorical when its value in the first row is a
    ``str``. Categorical columns are one-hot encoded with ``OneHotEncoder``;
    the remaining columns are preserved and cast to ``float`` so the result is
    a single numeric matrix.

    Parameters
    ----------
    X : numpy.ndarray, 2-D
        Feature matrix with one observation per row.

    Returns
    -------
    numpy.ndarray
        ``X`` itself when it has no rows or no string columns. Otherwise a
        float array holding the non-string columns first, followed by the
        one-hot encoded columns.
    """
    if len(X) == 0:
        # There is no first row to inspect, so there is nothing to encode.
        return X

    mask = np.array([isinstance(x, str) for x in X[0]])
    if not mask.any():
        return X

    enc = OneHotEncoder(handle_unknown='ignore')
    enc.fit(X[:, mask])
    encoded = enc.transform(X[:, mask]).toarray()

    # Keep the non-categorical features instead of discarding them; with no
    # such columns this is an (n_rows, 0) array and hstack returns `encoded`.
    numeric = X[:, ~mask].astype(float)
    return np.hstack([numeric, encoded])

def remove_colinear(X, correlation_threshold=0.9):
    """Reduce ``X`` to a subset of features selected by ``SelectNonCollinear``.

    The selector comes from https://github.com/gianlucamalato/collinearity,
    where its procedure is described as:

    1. Take the pair of features with the lowest absolute value of the
       linear correlation coefficient.
    2. If it is lower than the threshold, consider these features.
    3. Keep adding features as long as the correlation matrix shows no
       off-diagonal element whose absolute value exceeds the threshold.

    Parameters
    ----------
    X : array-like, 2-D
        Feature matrix with one observation per row.
    correlation_threshold : float, default 0.9
        Threshold forwarded to ``SelectNonCollinear``.

    Returns
    -------
    tuple
        ``(X_reduced, columns_selection)``: the transformed matrix and the
        value returned by the selector's ``get_support()``.
    """
    collinearity_filter = SelectNonCollinear(correlation_threshold=correlation_threshold)
    collinearity_filter.fit(X, y=None)
    return collinearity_filter.transform(X), collinearity_filter.get_support()

def scale_features(X):
    """Return ``X`` transformed by a ``StandardScaler`` fitted on ``X`` itself.

    Parameters
    ----------
    X : array-like, 2-D
        Feature matrix with one observation per row.

    Returns
    -------
    The output of ``StandardScaler.transform`` applied to ``X``.
    """
    return StandardScaler().fit(X).transform(X)

Before preparation:

In [54]:
# Report the size of the feature matrix before any preparation step is applied.
print(f"In the raw data, there are {X.shape[0]} observations and {X.shape[1]} features.")

In the raw data, there are 569 observations and 30 features.


Drop rows with missing values, one-hot encode categorical variables, drop features whose absolute pairwise correlation exceeds 0.9, and scale the data.

In [55]:
# Preparation pipeline: drop incomplete rows, encode categorical columns,
# remove highly correlated features, then standardise what remains.
X, Y = remove_missing_rows(X, Y)
X = encode_categorical(X)
X, columns_selection = remove_colinear(X, correlation_threshold=0.9)
# scale_features was defined and scaling is announced for this step, but the
# call was missing, so the models were being trained on unscaled features.
# NOTE(review): the scaler is fitted on all rows, before the train/test split.
X = scale_features(X)

After preparation:

In [60]:
# Removed count = number of entries in columns_selection minus the sum of its entries
# (presumably a boolean keep-mask from get_support(), so the sum counts kept columns).
print(f"In the prepared data, there are {X.shape[0]} observations and {X.shape[1]} features (removed {len(columns_selection) - np.array(columns_selection).sum()} features).")

In the prepared data, there are 569 observations and 21 features (removed 9 features).


### Train/Test Split

In [61]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the observations as the test set; random_state is fixed so the
# split is the same on every run.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

## Models and metrics

In [63]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from LogisticRegressionIRLS import LR
from sklearn.neighbors import KNeighborsClassifier
from functools import partial

# Display name -> estimator class. The values are classes, not instances.
models = {
    'LDA': LinearDiscriminantAnalysis,
    'QDA': QuadraticDiscriminantAnalysis,
    'Logistic Regression': LogisticRegression,
    'Logistic Regression IRLS': LR,
    'KNN': KNeighborsClassifier,
}

In [65]:
# precision, recall, accuracy, f1

from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

# Metric name -> scoring function, each called as score(y_true, y_pred).
metrics = dict(
    precision=precision_score,
    recall=recall_score,
    accuracy=accuracy_score,
    f1=f1_score,
)

## Experiments

In [66]:
def train_model(model, X_train, Y_train, X_test, Y_test, metrics):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, Y)`` and ``predict(X)``.
    X_train, Y_train
        Training features and targets passed to ``model.fit``.
    X_test, Y_test
        Test features passed to ``model.predict`` and the reference targets
        handed to each metric.
    metrics : dict
        Maps a metric name to a callable invoked as ``score(Y_test, Y_pred)``.

    Returns
    -------
    dict
        ``{metric name: score}`` with one entry per item of ``metrics``.
    """
    model.fit(X_train, Y_train)
    predictions = model.predict(X_test)

    scores = {}
    for name, score_fn in metrics.items():
        scores[name] = score_fn(Y_test, predictions)
    return scores

# Instantiate each model class with its default arguments, fit it on the training
# split, and collect the test-set scores as {model name: {metric name: score}}.
# NOTE(review): the recorded run of this cell ended with
# "AttributeError: 'numpy.ndarray' object has no attribute 'values'". Presumably one
# of the models (possibly the project-local LR) expects pandas inputs; confirm which
# one and adapt its inputs before relying on these results.
results = {model: train_model(models[model](), X_train, Y_train, X_test, Y_test, metrics) for model in models}

AttributeError: 'numpy.ndarray' object has no attribute 'values'