In [1]:
# Import the necessary packages
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix

In [None]:
def read_in_data(datapath: str) -> pd.DataFrame:
    """Load a csv file into a pandas dataframe
    Parameters
    ----------
    datapath:
        location of the csv file (attributes as columns,
        observations as rows); handed unchanged to pandas.read_csv
    Return
    ------
    pandas dataframe parsed with the default pandas.read_csv options
    """
    # No parsing options are overridden, so pandas defaults apply
    # (comma separator, first row used as the header).
    return pd.read_csv(datapath)

In [None]:
def select_columns(data_in: pd.DataFrame, selected_columns: list) -> pd.DataFrame:
    """Keep only the requested columns of a table
    Parameters
    ----------
    data_in:
        pandas table with attributes as columns and
        observations as rows
    selected_columns:
        column names (list of strings) to retain; the result
        follows the order given here
    Return
    ------
    pandas table restricted to the selected columns, all rows kept
    """
    # Plain label-list indexing: pandas raises KeyError for unknown names.
    return data_in[selected_columns]

In [None]:
def visualize_data(data_in: pd.DataFrame, y_ID: str) -> None:
    """Draw a pairwise density grid of a dataset
    Parameters
    ----------
    data_in:
        pandas table with attributes as columns and
        observations as rows
    y_ID:
        name of the outcome column; passed to the grid as its
        hue variable
    """
    density = sns.kdeplot  # same plotter for diagonal and off-diagonal cells
    grid = sns.PairGrid(data_in, hue=y_ID)
    grid.map_diag(density)
    grid.map_offdiag(density)
    grid.add_legend()

In [None]:
def create_x_y_arrays(data_in: pd.DataFrame, y_ID: str) -> tuple:
    """Separate the outcome column from the predictors
    Parameters
    ----------
    data_in:
        pandas table with attributes as columns and
        observations as rows; it is not modified
    y_ID:
        name of the column used as outcome
    Return
    ------
    X:
        pandas.DataFrame holding every column except y_ID,
        observations as rows
    y:
        the y_ID column of data_in (a pandas.Series when y_ID
        is a single, unique column label), observations as rows
    """
    # drop() returns a new table, so data_in keeps its outcome column.
    predictors = data_in.drop(columns=y_ID)
    outcome = data_in[y_ID]
    return predictors, outcome

In [None]:
def plot_evaluation_result(confusion_matrix: np.ndarray) -> None:
    """Plot a binary confusion matrix as an annotated heatmap
    Parameters
    ----------
    confusion_matrix:
        2x2 array (or array-like, e.g. nested lists) of counts for
        2 classes (Positive, Negative). Cells are labelled in
        row-major order: [0, 0] "True Neg", [0, 1] "False Pos",
        [1, 0] "False Neg", [1, 1] "True Pos", each with its share
        of the grand total as a percentage.
    Raises
    ------
    ValueError
        if the input is not of shape (2, 2)
    """
    # Note: the parameter name hides the sklearn function of the same
    # name (imported at the top of the file) inside this body.
    matrix = np.asarray(confusion_matrix)
    if matrix.shape != (2, 2):
        raise ValueError(
            f"expected a 2x2 confusion matrix, got shape {matrix.shape}"
        )

    total = matrix.sum()
    # An all-zero matrix would otherwise divide 0/0 and label every
    # cell "nan%"; report 0.00% for each cell instead.
    shares = matrix / total if total else np.zeros(matrix.shape)

    group_names = ["True Neg", "False Pos", "False Neg", "True Pos"]
    labels = np.asarray(
        [f"{name}\n{share:.2%}"
         for name, share in zip(group_names, shares.ravel())]
    ).reshape(2, 2)
    # fmt='' because the annotations are already formatted strings.
    sns.heatmap(matrix, annot=labels, fmt='', cmap='Blues')