In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
%matplotlib inline
seed = 0
np.random.seed(seed)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


#### FYI: The classification problem below is very easy to solve, but because it has so few features it is a useful showcase for how one can use a Tree Feature Generator. A more difficult problem would be nicer!

In [4]:
#skin dataset - 3 feature columns (x1, x2, x3) followed by 1 target column
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00229/Skin_NonSkin.txt'
#the file is tab-separated and column names are supplied here;
#the raw target labels 1/2 are recoded to 0/1 in the same chained expression
skin_data_raw = (
    pd.read_csv(url, sep='\t', names=['x1', 'x2', 'x3', 'target'])
      .assign(target=lambda frame: frame['target'].map({1: 0, 2: 1}))
)

In [5]:
skin_data_raw.head(5)

Unnamed: 0,x1,x2,x3,target
0,74,85,123,0
1,73,84,122,0
2,72,83,121,0
3,70,81,119,0
4,70,81,119,0


In [6]:
skin_data_raw['target'].unique()

array([0, 1])

In [7]:
X = skin_data_raw.values[:,:3]
y = skin_data_raw.values[:,3]

In [None]:
from sklearn.model_selection import train_test_split
#X_test,y_test used for final model evaluation
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.33,random_state=0)

In [None]:
from sklearn.metrics import roc_curve, accuracy_score, precision_score, recall_score, f1_score, auc, roc_auc_score

In [None]:
#a function which summarizes results of a classifier
#inputs are the true y labels, the predicted labels and the predicted probabilities
def summarize_performance(y_true,y_pred,y_pred_proba):
    """Print accuracy, precision, recall, F1 and ROC AUC for a binary classifier.

    Parameters
    ----------
    y_true : array-like
        True binary labels.
    y_pred : array-like
        Predicted hard labels; used for accuracy, precision, recall and F1.
    y_pred_proba : array-like
        Predicted scores used for the AUC. Either a 2-D array whose column 1
        holds the positive-class score (the same convention
        generate_tp_fp_auc uses) or a 1-D array of positive-class scores.

    Returns
    -------
    None
        The metrics are printed, one per line.
    """
    # AUC is a ranking metric and must be computed from continuous scores.
    # Feeding it hard labels collapses the ROC curve to a single operating
    # point and makes the printed value disagree with the plotted curves.
    positive_scores = np.asarray(y_pred_proba)
    if positive_scores.ndim == 2:
        positive_scores = positive_scores[:, 1]

    acc_score = accuracy_score(y_true,y_pred)
    prec_score = precision_score(y_true,y_pred)
    rec_score = recall_score(y_true,y_pred)
    f1 = f1_score(y_true,y_pred)
    # named auc_value so the imported sklearn `auc` function is not shadowed
    auc_value = roc_auc_score(y_true,positive_scores)

    print('accuracy: %0.6f'%(acc_score))
    print('precision: %0.6f'%(prec_score))
    print('recall: %0.6f'%(rec_score))
    print('f1: %0.6f'%(f1))
    print('auc: %0.6f'%(auc_value))

In [None]:
#a helper which builds the ROC curve points (false/true positive rates) and the AUC score for a particular model
#inputs are the true y labels and the 2-D array of predicted values generated by the model
def generate_tp_fp_auc(y_true,y_pred_proba):
    """Return (fpr, tpr, roc_auc) computed from column 1 of y_pred_proba.

    y_true is passed to roc_curve unchanged; y_pred_proba must be indexable
    as [:, 1], and that column is used as the score.
    """
    #column 1 is the score handed to roc_curve
    positive_scores = y_pred_proba[:,1]
    #false/true positive rates over all thresholds (thresholds themselves are not needed)
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_true, positive_scores)
    #area under the ROC curve from the curve points
    area_under_curve = auc(false_pos_rate, true_pos_rate)
    return false_pos_rate, true_pos_rate, area_under_curve

#### Building RF model

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
#limiting the performance of the RF by setting a max depth of 5 so that the ROC results at the end are legible
#otherwise the models perform evenly (the problem is just really easy to solve)
#max_depth of 5 results in shallower trees => lower variance across trees
rfv = RandomForestClassifier(n_estimators=10,max_depth=5)

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
#cross_val score for the Random Forest model (using F1)
f1_scores = cross_val_score(rfv, X_train, y_train,cv=10,scoring='f1')
print('Mean 10CV F1 Score: %0.5f' %(f1_scores.mean()))
print('Std 10CV F1 Score: %0.5f' %(f1_scores.std()))

In [None]:
#training the RF on the train portion of the data
rfv.fit(X_train,y_train)

In [None]:
#generating a model prediction using the train/test set
y_pred_rfv_train = rfv.predict(X_train)
y_pred_rfv_train_proba = rfv.predict_proba(X_train)
y_pred_rfv_test = rfv.predict(X_test)
y_pred_rfv_test_proba = rfv.predict_proba(X_test)

In [None]:
#train data performance
summarize_performance(y_train,y_pred_rfv_train,y_pred_rfv_train_proba)

In [None]:
#test data performance
summarize_performance(y_test,y_pred_rfv_test,y_pred_rfv_test_proba)

In [None]:
#calculating true and false positives for the RF for an ROC visualization
fpr_rfv, tpr_rfv, auc_rfv = generate_tp_fp_auc(y_test,y_pred_rfv_test_proba)

#### Building a Vanilla (untuned) Gradient Boosted model

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
#limiting the performance of the GBT by setting n_estimators=10 so that the ROC results at the end
#are legible; otherwise the models perform evenly (the problem is just really easy to solve)
#the trees in boosting are shallower by default, so limiting the number of trees results in weaker
#ensemble performance (compared to a RF)
grdbv = GradientBoostingClassifier(n_estimators = 10)

In [None]:
#cross_val score for the Gradient Boosting Model (using F1)
f1_scores = cross_val_score(grdbv, X_train, y_train,cv=10,scoring='f1')
print('Mean 10CV F1 Score: %0.5f' %(f1_scores.mean()))
print('Std 10CV F1 Score: %0.5f' %(f1_scores.std()))

In [None]:
#training the GB model on the train portion of the data
grdbv.fit(X_train,y_train)

In [None]:
#generating a model prediction using the train/test set
y_pred_grdbv_train = grdbv.predict(X_train)
y_pred_grdbv_train_proba = grdbv.predict_proba(X_train)
y_pred_grdbv_test = grdbv.predict(X_test)
y_pred_grdbv_test_proba = grdbv.predict_proba(X_test)

In [None]:
#train data performance
summarize_performance(y_train,y_pred_grdbv_train,y_pred_grdbv_train_proba)

In [None]:
#test data performance
summarize_performance(y_test,y_pred_grdbv_test,y_pred_grdbv_test_proba)

In [None]:
#calculating true and false positives for the GB model for an ROC visualization
fpr_grdbv, tpr_grdbv, auc_grdbv = generate_tp_fp_auc(y_test,y_pred_grdbv_test_proba)

#### Building a vanilla Linear Model using the data

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
#logistic regression by itself
lm = LogisticRegression()

In [None]:
#cross_val score for the LM Model (using F1)
f1_scores = cross_val_score(lm, X_train, y_train,cv=10,scoring='f1')
print('Mean 10CV F1 Score: %0.5f' %(f1_scores.mean()))
print('Std 10CV F1 Score: %0.5f' %(f1_scores.std()))

In [None]:
#fitting the linear model to all of the X_train data
lm.fit(X_train,y_train)

In [None]:
#generating a model prediction using the train/test set
y_pred_lm_train = lm.predict(X_train)
y_pred_lm_train_proba = lm.predict_proba(X_train)
y_pred_lm_test = lm.predict(X_test)
y_pred_lm_test_proba = lm.predict_proba(X_test)

In [None]:
#train data performance
summarize_performance(y_train,y_pred_lm_train,y_pred_lm_train_proba)

In [None]:
#test data performance
summarize_performance(y_test,y_pred_lm_test,y_pred_lm_test_proba)

In [None]:
#calculating true and false positives for the LM model for an ROC visualization
fpr_lm, tpr_lm, auc_lm = generate_tp_fp_auc(y_test,y_pred_lm_test_proba)

#### Building a vanilla version of the Tree Feature Generator Model

In [None]:
from tree_feature_transformation import TreeTransformClf 

In [None]:
tftrc = TreeTransformClf()

In [None]:
#cross_val score for the tree feature transformer model (using F1)
f1_scores = cross_val_score(tftrc, X_train, y_train,cv=10,scoring='f1')
print('Mean 10CV F1 Score: %0.5f' %(f1_scores.mean()))
print('Std 10CV F1 Score: %0.5f' %(f1_scores.std()))

In [None]:
#training the feature transformer model on the train portion of the data
tftrc.fit(X_train,y_train)

In [None]:
#generating a model prediction using the train/test set
y_pred_tftrc_train = tftrc.predict(X_train)
y_pred_tftrc_train_proba = tftrc.predict_proba(X_train)
y_pred_tftrc_test = tftrc.predict(X_test)
y_pred_tftrc_test_proba = tftrc.predict_proba(X_test)

In [None]:
#train data performance
summarize_performance(y_train,y_pred_tftrc_train,y_pred_tftrc_train_proba)

In [None]:
#test data performance
summarize_performance(y_test,y_pred_tftrc_test,y_pred_tftrc_test_proba)

In [None]:
#calculating true and false positive rates for the tree feature transformer model for an ROC visualization
fpr_tftrc, tpr_tftrc, auc_tftrc = generate_tp_fp_auc(y_test,y_pred_tftrc_test_proba)

#### Visualizing AUC across the various models

In [None]:
#one entry per model, in plotting order: (false positive rates, true positive rates, legend label)
roc_series = [
    (fpr_rfv, tpr_rfv, 'RF'),
    (fpr_tftrc, tpr_tftrc, 'RF + LR'),
    (fpr_grdbv, tpr_grdbv, 'GBT'),
    (fpr_lm, tpr_lm, 'LR'),
]

#one entry per panel: (subplot position, title, (xlim, ylim) or None for the default view)
roc_panels = [
    (1, 'ROC Curves', None),
    (2, 'ROC Curves (Zoomed-in)', ((-0.01, 0.2), (0.85, 1.01))),
]

plt.figure(figsize=(16,5))

for panel_position, panel_title, panel_limits in roc_panels:
    plt.subplot(1, 2, panel_position)
    #the zoomed panel fixes its axis ranges before anything is drawn
    if panel_limits is not None:
        x_range, y_range = panel_limits
        plt.xlim(*x_range)
        plt.ylim(*y_range)
    #diagonal reference line from (0,0) to (1,1)
    plt.plot([0,1],[0,1],'k--')
    for series_fpr, series_tpr, series_label in roc_series:
        plt.plot(series_fpr, series_tpr, label=series_label)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(panel_title)
    plt.legend(loc='lower right')

plt.show()

#### Example of how the Tree Feature Generator Model can be tuned through Grid Search

In [None]:
#the parameter grid used with this estimator searches over both 'l1' and 'l2' penalties for the
#meta classifier; the 'liblinear' solver supports both, whereas the default solver in
#scikit-learn >= 0.22 ('lbfgs') rejects 'l1' and would make those grid candidates fail to fit
clf = TreeTransformClf(tree_clf=RandomForestClassifier(),
                       meta_clf=LogisticRegression(solver='liblinear'),
                       blend_split=0.5,
                       random_state=0)

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
param_grid = {"tree_clf__max_depth":[3,5,None],
              "tree_clf__max_features":[1,2,3],
              "tree_clf__random_state" : [0],
              "tree_clf__n_jobs" : [-1],
              "meta_clf__penalty" : ['l1','l2'],
              "meta_clf__C" : [10,1,0.1,0.001]}

In [None]:
# run grid search
grid_search = GridSearchCV(clf, param_grid=param_grid)

In [None]:
#tune on the training split only: X_test/y_test are reserved for the final model evaluation,
#so they must not influence hyper-parameter selection
grid_search.fit(X_train,y_train)

In [None]:
grid_search.best_params_

In [None]:
grid_search.best_score_

#### Pipeline and GridSearch integration example

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

In [None]:
#the meta classifier uses the 'liblinear' solver because the grid searched with this pipeline
#tries both 'l1' and 'l2' penalties; the default solver in scikit-learn >= 0.22 ('lbfgs')
#does not support 'l1' and would make those grid candidates fail to fit
pipe = Pipeline([('pca',PCA()),
                 ('clf',TreeTransformClf(tree_clf=RandomForestClassifier(),
                                         meta_clf=LogisticRegression(solver='liblinear')))])

In [None]:
param_grid = {"pca__n_components" : [2,3],
              "clf__tree_clf__max_depth":[3,5,None],
              "clf__tree_clf__max_features":[1,2],
              "clf__tree_clf__random_state" : [0],
              "clf__tree_clf__n_jobs" : [-1],
              "clf__meta_clf__penalty" : ['l1','l2'],
              "clf__meta_clf__C" : [10,1,0.1,0.001]}

In [None]:
grid_search =  GridSearchCV(pipe, param_grid=param_grid)

In [None]:
#tune the pipeline on the training split only: X_test/y_test are reserved for the final
#model evaluation, so they must not influence hyper-parameter selection
grid_search.fit(X_train,y_train)

In [None]:
grid_search.best_params_

In [None]:
grid_search.best_score_

#### Examples of using tree models other than RF for feature generation

In [None]:
from sklearn.ensemble import RandomTreesEmbedding, GradientBoostingClassifier, ExtraTreesClassifier

In [None]:
clf = TreeTransformClf(tree_clf=GradientBoostingClassifier(),
                       meta_clf=LogisticRegression(),
                       blend_split=0.5,
                       random_state=0)

In [None]:
#mean CV F1 score with GB
cross_val_score(clf, X_train, y_train, cv=10,scoring='f1').mean()

In [None]:
clf = TreeTransformClf(tree_clf=ExtraTreesClassifier(),
                       meta_clf=LogisticRegression(),
                       blend_split=0.5,
                       random_state=0)

In [None]:
#mean CV F1 score with Extra Trees
cross_val_score(clf, X_train, y_train, cv=10,scoring='f1').mean()

In [None]:
clf = TreeTransformClf(tree_clf=RandomTreesEmbedding(),
                       meta_clf=LogisticRegression(),
                       blend_split=0.5,
                       random_state=0)

In [None]:
#mean CV F1 score with Tree Embedding
cross_val_score(clf, X_train, y_train, cv=10,scoring='f1').mean()