In [1]:
import numpy as np
import itertools as it
from sklearn.model_selection import KFold

from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier


# Supervised Learning 

Input: `Features_train.csv` and `Features_test.csv`, each with one header row and the output variable in the last column  

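Output: `results.csv`, an n x 2 csv matrix with 1st column = the predicted label and 2nd column = the actual label; when the model provides `predict_proba`, a 3rd column holds the second column of its predicted probabilities 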

The driver function, `TuneAndReport`, is defined below.


## Model Tuning Code with Cross-Validation

In [None]:
def tuneHyperparameters(classifier, training_set_X, training_set_Y, hyperparameters, number_of_folds, type_of_score):
    """Grid-search hyperparameters with k-fold cross validation.

    Every combination of the supplied options is set on ``classifier`` and
    scored with ``kFoldCrossValidation`` (all combinations share the same
    KFold splitter); the combination with the highest score is returned.
    Ties are resolved in favour of the combination tried first.

    Parameters:
    ----------
    -classifier: base classifier; its parameters are overwritten in place and
                 it is left configured with the last combination tried
    -training_set_X: X data
    -training_set_Y: Y data
    -hyperparameters: dictionary of parameter name to list of options
    -number_of_folds: number of k-fold folds
    -type_of_score: performance metric used to compare hyperparameters, options are "recall", "precision", "f1", or "accuracy"

    Returns:
    -------
    Dictionary of best parameter values, for example {'C': 2, 'penalty': 'l2'}

    Raises:
    ------
    ValueError if type_of_score is not one of the four supported metrics, or
    if any hyperparameter has an empty list of options

    Example:
    --------
    parameters = {'C': [3,2,5,6], 'penalty': ['l2']}
    tuneHyperparameters(LogisticRegression(), X, Y, parameters, 5, 'accuracy')
    """
    # Any other value makes kFoldCrossValidation return a list of four scores,
    # which would be compared lexicographically instead of by a single metric.
    supported_scores = ("recall", "precision", "f1", "accuracy")
    if type_of_score not in supported_scores:
        raise ValueError(
            "type_of_score must be one of %s, got %r" % (supported_scores, type_of_score)
        )

    # Sorting the names fixes the order in which values appear in each combo.
    allNames = sorted(hyperparameters)
    parameter_combos = list(it.product(*(hyperparameters[name] for name in allNames)))
    if not parameter_combos:
        raise ValueError("every hyperparameter needs at least one option to try")

    kf = KFold(n_splits=number_of_folds)

    best_score = None
    best_params = None
    for combo in parameter_combos:
        candidate = dict(zip(allNames, combo))
        # Set the whole combination in one call so the estimator is never
        # scored in a half-updated state.
        classifier.set_params(**candidate)
        score = kFoldCrossValidation(classifier, training_set_X, training_set_Y, number_of_folds, type_of_score, kf)
        # Strict '>' keeps the first combination among equal scores.
        if best_score is None or score > best_score:
            best_score, best_params = score, candidate

    return best_params
  

In [None]:
def _take_rows(data, index):
    """Return the rows of ``data`` at the integer positions in ``index``."""
    if hasattr(data, "iloc"):
        # pandas objects: plain [] would look the positions up as labels.
        return data.iloc[index]
    if hasattr(data, "shape"):
        # numpy arrays and other array types that support integer-array indexing.
        return data[index]
    # Plain sequences (lists, tuples) cannot be indexed with an index array.
    return np.asarray(data)[index]


def kFoldCrossValidation(classifier, training_set_X, training_set_Y, number_of_folds, type_of_score = 'all', kf = None):
    """Function that returns cross-validated model performance score

    The classifier is refit on the training part of every fold and scored on
    the held-out part; per-fold scores are averaged. Precision, recall and f1
    are computed with average="binary".

    Parameters:
    ----------
    -classifier: supervised learning model (refit in place on every fold)
    -training_set_X: the X data
    -training_set_Y: the Y data
    -number_of_folds: number of cross validation folds (only used when kf is None)
    -type_of_score: performance metric to return, options are "recall", "precision", "f1", "accuracy", or "all"
    -kf: splitter providing split(X) -> (train_index, test_index) pairs; a KFold with number_of_folds splits is created when omitted

    Return:
    ------
    Float for a single metric; for "all" (or any unrecognised value) the list
    [mean_precision, mean_recall, mean_f1, mean_accuracy]

    Example:
    --------
    kFoldCrossValidation(LogisticRegression(penalty='l2', tol=0.0001, C=1.0) , X, Y, 5, 'recall')
    """
    # Identity check: '== None' would go through the splitter's __eq__.
    if kf is None:
        kf = KFold(n_splits=number_of_folds)

    precision = []
    recall = []
    f1_score = []
    accuracy = []

    for train_index, test_index in kf.split(training_set_X):
        X_train = _take_rows(training_set_X, train_index)
        X_validation = _take_rows(training_set_X, test_index)
        Y_train = _take_rows(training_set_Y, train_index)
        Y_validation = _take_rows(training_set_Y, test_index)

        classifier.fit(X_train, Y_train)
        y_pred = classifier.predict(X_validation)

        # The fourth return value (support) is not used.
        prec, rec, f1, _ = precision_recall_fscore_support(Y_validation, y_pred, average="binary")

        precision.append(prec)
        recall.append(rec)
        f1_score.append(f1)
        accuracy.append(accuracy_score(Y_validation, y_pred))

    mean_precision = np.mean(precision)
    mean_recall = np.mean(recall)
    mean_f1 = np.mean(f1_score)
    mean_accuracy = np.mean(accuracy)

    if type_of_score == "accuracy":
        return mean_accuracy
    elif type_of_score == "precision":
        return mean_precision
    elif type_of_score == "f1":
        return mean_f1
    elif type_of_score == "recall":
        return mean_recall
    # "all" and any unrecognised value fall through to the full report.
    return [mean_precision, mean_recall, mean_f1, mean_accuracy]


## Model Output on Test Set

In [None]:
def test_model(training_set_X, test_set_X, training_set_Y, test_set_Y, model, hyperparameters):
    """Function that finds test set predictions 
    
    Parameters:
    ----------
    -model: base supervised learning model
    -hyperparameters: dictionary of parameter name to options, like {'C': 1.0, 'penalty': 'l2'}
    
    Return:
    ------
    Saves a csv called "results.csv"- first column is predicted, second is actual
    
    Example:
    -------
    test_model(trainX, testX, trainY, testY, LogisticRegression(), {'C': 1.0, 'penalty': 'l2'})
    """
    
    allNames = sorted(hyperparameters)
    for p in allNames:
        model.set_params(**{p: hyperparameters[p]})
    
    model.fit(training_set_X, training_set_Y)

    y_pred = model.predict(test_set_X)
    try:
        probabilities = model.predict_proba(test_set_X)
        results = [y_pred, test_set_Y, probabilities[:,1]]
    except:
        results = [y_pred, test_set_Y]
        
    numpy_results = np.transpose(np.matrix(results))
    np.savetxt("results.csv", numpy_results, delimiter=",")
    return y_pred


## Driver Function For Picking Best Model and Reporting Results

In [None]:
def TuneAndReport(modeltype, hyperparameters, kfold, metric, train_path='Features_train.csv', test_path='Features_test.csv'):
    """Function that uses k-fold cross validation for hyperparameter tuning then
    writes results on test set to a csv file

    Parameters:
    ----------
    -modeltype: name of the base model; one of "LogisticRegression", "RandomForest",
                "knn", "MLP", "SVM", "AdaBoost", "GradientBoosting", "LinearSVM", "DecisionTree"
    -hyperparameters: dictionary of parameter name to list of options, like {'C': [0.1, 1.0], 'penalty': ['l2']}
    -kfold: how many cross validation folds to use
    -metric: metric to evaluate tuning with; options "recall", "precision", "f1", "accuracy"
    -train_path: csv with one header row, features first and the label in the last column
    -test_path: csv with the same layout as train_path

    Output:
    ------
    Returns the dictionary of best hyperparameter values.
    Saves results.csv file (written by test_model)

    Raises:
    ------
    ValueError if modeltype is not one of the supported names
    """
    model_classes = {
        "LogisticRegression": LogisticRegression,
        "RandomForest": RandomForestClassifier,
        "knn": KNeighborsClassifier,
        "MLP": MLPClassifier,
        "SVM": SVC,
        "AdaBoost": AdaBoostClassifier,
        "GradientBoosting": GradientBoostingClassifier,
        "LinearSVM": LinearSVC,
        "DecisionTree": DecisionTreeClassifier,
    }
    # Fail early and clearly, before any file is read, instead of hitting an
    # unbound `model` variable further down.
    if modeltype not in model_classes:
        raise ValueError(
            "Unknown modeltype %r; expected one of %s" % (modeltype, sorted(model_classes))
        )
    model = model_classes[modeltype]()

    # skiprows=1 drops the header row; ndmin=2 keeps a single-row file 2-D so
    # the column slicing below still works.
    data_train = np.loadtxt(train_path, delimiter=",", skiprows=1, ndmin=2)
    data_test = np.loadtxt(test_path, delimiter=",", skiprows=1, ndmin=2)

    # The last column is the label, everything before it is a feature.
    training_set_X, training_set_Y = data_train[:, :-1], data_train[:, -1]
    test_set_X, test_set_Y = data_test[:, :-1], data_test[:, -1]

    best_params = tuneHyperparameters(model, training_set_X, training_set_Y, hyperparameters, kfold, metric)

    test_model(training_set_X, test_set_X, training_set_Y, test_set_Y, model, best_params)

    return best_params

## Logistic Regression Example - Tune, Then Create the Results Matrix (results.csv)

# Tune model and pick best hyperparameters.
# 'C' and 'penalty' are LogisticRegression parameters; passing them to
# "AdaBoost" fails in set_params because AdaBoostClassifier has no such parameters.
# The liblinear solver is pinned because it accepts both the 'l1' and 'l2' penalties.
parameters = {'C': [0.001,0.01,0.1,0.5,1], 'penalty': ['l1','l2'], 'solver': ['liblinear']}
model = "LogisticRegression"
TuneAndReport(model, parameters, 5, 'recall')
