In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
%matplotlib inline
# fixed seed for NumPy's global random number generator, so random draws repeat between runs
seed = 0
np.random.seed(seed)

#### FYI: The classification problem below is very easy to solve, but because it has so few features it is a convenient way to showcase how a Tree Feature Generator can be used. A more difficult problem would make a nicer demonstration!

In [3]:
#skin dataset - 3 features and 1 target
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00229/Skin_NonSkin.txt'
#read the tab-separated file with explicit column names, then recode the target labels 1 -> 0 and 2 -> 1
skin_data_raw = (
    pd.read_csv(url, sep='\t', names=['x1','x2','x3','target'])
    .assign(target=lambda frame: frame['target'].map({1:0,2:1}))
)

In [4]:
#preview the first five rows (head() defaults to n=5)
skin_data_raw.head()

Unnamed: 0,x1,x2,x3,target
0,74,85,123,0
1,73,84,122,0
2,72,83,121,0
3,70,81,119,0
4,70,81,119,0


In [5]:
#sanity check: list the distinct label values left after the recoding
pd.unique(skin_data_raw['target'])

array([0, 1])

In [6]:
#materialise the frame as one array, then slice it: first three columns are features, fourth is the label
skin_values = skin_data_raw.values
X, y = skin_values[:, :3], skin_values[:, 3]

In [7]:
from sklearn.model_selection import train_test_split
#X_test,y_test used for final model evaluation
#reuse the notebook-level `seed` (0, set in the first cell) rather than repeating the literal,
#so a single value controls every source of randomness
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.33,random_state=seed)

In [8]:
from sklearn.metrics import roc_curve, accuracy_score, precision_score, recall_score, f1_score, auc, roc_auc_score

In [9]:
#a function which summarizes results of a classifier
#inputs are the true y labels, the predicted labels and the predicted probabilities
def summarize_performance(y_true,y_pred,y_pred_proba):
    """Print accuracy, precision, recall, F1 and ROC AUC for a binary classifier.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Hard class predictions; used for accuracy, precision, recall and F1.
    y_pred_proba : array-like or None
        Scores used for the ROC AUC. A 2-D array is indexed the same way as in
        ``generate_tp_fp_auc`` (column 1 is taken as the positive-class score);
        a 1-D array is used as-is. If ``None``, the AUC falls back to the hard
        labels in ``y_pred``, which is what this function computed previously.

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    acc_score = accuracy_score(y_true,y_pred)
    prec_score = precision_score(y_true,y_pred)
    rec_score = recall_score(y_true,y_pred)
    f1 = f1_score(y_true,y_pred)

    # ROC AUC must be computed from continuous scores: hard 0/1 labels collapse
    # the ROC curve to a single operating point and misreport the ranking quality.
    if y_pred_proba is None:
        scores = y_pred
    else:
        scores = np.asarray(y_pred_proba)
        if scores.ndim == 2:
            # column 1 is the positive-class score; tolerate a single-column array
            scores = scores[:, 1] if scores.shape[1] > 1 else scores[:, 0]
    # named roc_auc so the local does not shadow the imported `auc` function
    roc_auc = roc_auc_score(y_true,scores)

    print('accuracy: %0.6f'%(acc_score))
    print('precision: %0.6f'%(prec_score))
    print('recall: %0.6f'%(rec_score))
    print('f1: %0.6f'%(f1))
    print('auc: %0.6f'%(roc_auc))

In [10]:
#helper which produces the ROC curve points and the area under that curve for a particular model
#inputs are the true y labels and the array of predicted values generated by the model
def generate_tp_fp_auc(y_true,y_pred_proba):
    """Return ``(fpr, tpr, roc_auc)`` for one model's predictions.

    ``y_pred_proba`` is indexed as a 2-D array and its column 1 is used as the
    score (presumably the positive-class column of ``predict_proba`` output;
    verify against the caller).
    """
    positive_scores = y_pred_proba[:,1]
    #false/true positive rates over the thresholds chosen by roc_curve (thresholds themselves are discarded)
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_true, positive_scores)
    #area under the ROC curve, integrated from the points above
    return false_pos_rate, true_pos_rate, auc(false_pos_rate, true_pos_rate)

#### Building RF model