# Python Data Science Template

In [44]:
import re
import bs4
import time
import pprint
import plyfile
import html5lib
import multiprocessing

import numpy as np
import pandas as pd

from scipy import misc
import scipy.io.wavfile as wavfile

import scipy
from sklearn import tree
from sklearn.svm import SVC
from sklearn import manifold
from tempfile import mkdtemp
from sklearn import linear_model
from sklearn import preprocessing
import sklearn.metrics as metrics
from pandas.plotting import scatter_matrix
from scipy.stats import randint as sp_randint
from sklearn.metrics.scorer import make_scorer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.datasets import load_iris
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Binarizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.cluster import AgglomerativeClustering, KMeans
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import recall_score, accuracy_score, confusion_matrix, roc_curve, roc_auc_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score, cross_validate, KFold, ShuffleSplit

import matplotlib
import matplotlib.pyplot as plt
from matplotlib import cm as cm
from mpl_toolkits.mplot3d import Axes3D
from pandas.plotting import parallel_coordinates, andrews_curves

%matplotlib inline
matplotlib.style.use('ggplot')

In [45]:
def summary(df):
    print(df.shape)
    print(df.dtypes)
    print(df.describe())

# Scikit-Learn Pipeline

In [46]:
#missing some more cleaning steps in pipeline
ml_pipe = make_pipeline(Binarizer(), MultinomialNB())
ml_pipe.steps
ml_pipe.named_steps

{'binarizer': Binarizer(copy=True, threshold=0.0),
 'multinomialnb': MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)}

### Example pipeline with optional step

```python
>>> from sklearn.linear_model import LogisticRegression
>>> param_grid = dict(reduce_dim=[None, PCA(5), PCA(10)],
...                   clf=[SVC(), LogisticRegression()],
...                   clf__C=[0.1, 10, 100])
>>> grid_search = GridSearchCV(pipe, param_grid=param_grid)
```

# Nested Cross Validation

In [47]:
print(__doc__)

Automatically created module for IPython interactive environment


In [48]:
# Number of random trials
NUM_TRIALS = 30

In [49]:
# Load the dataset
iris = load_iris()
X_iris = iris.data
y_iris = iris.target

In [50]:
# Set up possible values of parameters to optimize over
p_grid = {"C": [1, 10, 100],
          "gamma": [.01, .1]}

## Comparing Against Dummy Classifier Baseline

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X_iris, y_iris, random_state=0)
clf = DummyClassifier(strategy='most_frequent',random_state=0)
clf.fit(X_train, y_train)
DummyClassifier(constant=None, random_state=0, strategy='stratified')
clf.score(X_test, y_test)  

0.23684210526315788

## CV & SKLearn Pipeline

In [52]:
cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=0)
cachedir = mkdtemp()
clf = make_pipeline(preprocessing.StandardScaler(), SVC(C=1), memory=cachedir)
cross_val_score(clf, iris.data, iris.target, cv=cv)

array([ 0.97777778,  0.93333333,  0.95555556])

## Nested Cross Validation Example
### Nested cross validation example using GridSearch
The alternative to cross_val_score is cross_validate.

In [53]:
# We will use a Support Vector Classifier with "rbf" kernel
svm = SVC(kernel="rbf")

In [54]:
# The scorers can be either be one of the predefined metric strings or a scorer
# callable, like the one returned by make_scorer
scoring = {'Accuracy': make_scorer(accuracy_score, greater_is_better=True),'AUC': 'roc_auc'}

In [24]:
# Arrays to store scores
non_nested_scores = np.zeros(NUM_TRIALS)
nested_scores = np.zeros(NUM_TRIALS)

In [55]:
# Loop for each trial
for i in range(NUM_TRIALS):

    # Choose cross-validation techniques for the inner and outer loops,
    # independently of the dataset.
    # E.g "LabelKFold", "LeaveOneOut", "LeaveOneLabelOut", etc.
    inner_cv = KFold(n_splits=4, shuffle=True, random_state=i)
    outer_cv = KFold(n_splits=4, shuffle=True, random_state=i)

    # Non_nested parameter search and scoring
    clf = GridSearchCV(estimator=svm, param_grid=p_grid, scoring=scoring, cv=inner_cv, n_jobs=-1, error_score=0, refit='AUC')
    clf.fit(X_iris, y_iris)
    non_nested_scores[i] = clf.best_score_

    # Nested CV with parameter optimization
    nested_score = cross_validate(clf, X=X_iris, y=y_iris, scoring=scoring, cv=outer_cv, refit='AUC')
    nested_scores[i] = nested_score.mean()

score_difference = non_nested_scores - nested_scores
print("Average difference of {0:6f} with std. dev. of {1:6f}."
      .format(score_difference.mean(), score_difference.std()))

KeyboardInterrupt: 

In [None]:
# Plot scores on each trial for nested and non-nested CV
plt.figure()
plt.subplot(211)
non_nested_scores_line, = plt.plot(non_nested_scores, color='r')
nested_line, = plt.plot(nested_scores, color='b')
plt.ylabel("score", fontsize="14")
plt.legend([non_nested_scores_line, nested_line],
           ["Non-Nested CV", "Nested CV"],
           bbox_to_anchor=(0, .4, .5, 0))
plt.title("Non-Nested and Nested Cross Validation on Iris Dataset",
          x=.5, y=1.1, fontsize="15")

# Plot bar chart of the difference.
plt.subplot(212)
difference_plot = plt.bar(range(NUM_TRIALS), score_difference)
plt.xlabel("Individual Trial #")
plt.legend([difference_plot],
           ["Non-Nested CV - Nested CV Score"],
           bbox_to_anchor=(0, 1, .8, 0))
plt.ylabel("score difference", fontsize="14")

plt.show()

In [None]:
def plot_results(results):
    plt.figure(figsize=(13, 13))
    plt.title("GridSearchCV evaluating using multiple scorers simultaneously",
              fontsize=16)

    plt.xlabel("min_samples_split")
    plt.ylabel("Score")
    plt.grid()

    ax = plt.axes()
    ax.set_xlim(0, 402)
    ax.set_ylim(0.73, 1)

    # Get the regular numpy array from the MaskedArray
    X_axis = np.array(results['param_min_samples_split'].data, dtype=float)

    for scorer, color in zip(sorted(scoring), ['g', 'k']):
        for sample, style in (('train', '--'), ('test', '-')):
            sample_score_mean = results['mean_%s_%s' % (sample, scorer)]
            sample_score_std = results['std_%s_%s' % (sample, scorer)]
            ax.fill_between(X_axis, sample_score_mean - sample_score_std,
                            sample_score_mean + sample_score_std,
                            alpha=0.1 if sample == 'test' else 0, color=color)
            ax.plot(X_axis, sample_score_mean, style, color=color,
                    alpha=1 if sample == 'test' else 0.7,
                    label="%s (%s)" % (scorer, sample))

        best_index = np.nonzero(results['rank_test_%s' % scorer] == 1)[0][0]
        best_score = results['mean_test_%s' % scorer][best_index]

        # Plot a dotted vertical line at the best score for that scorer marked by x
        ax.plot([X_axis[best_index], ] * 2, [0, best_score],
                linestyle='-.', color=color, marker='x', markeredgewidth=3, ms=8)

        # Annotate the best score for that scorer
        ax.annotate("%0.2f" % best_score,
                    (X_axis[best_index], best_score + 0.005))

    plt.legend(loc="best")
    plt.grid('off')
    plt.show()

### Randomized Search
Will deliver faster results than GridSearchCV.

In [56]:
svm = SVC(kernel="rbf")

In [57]:
# Load the dataset
iris = load_iris()
X_iris = iris.data
y_iris = iris.target

In [58]:
p_dist = {'C': scipy.stats.expon(scale=100), 'gamma': scipy.stats.expon(scale=.1),
  'kernel': ['rbf'], 'class_weight':['balanced', None]}

In [59]:
clf = random_search = RandomizedSearchCV(svm, param_distributions=p_dist,
                                   n_iter=NUM_TRIALS)

In [60]:
clf.fit(X_iris, y_iris)

RandomizedSearchCV(cv=None, error_score='raise',
          estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
          fit_params=None, iid=True, n_iter=30, n_jobs=1,
          param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1a19cf6940>, 'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1a19deeb38>, 'kernel': ['rbf'], 'class_weight': ['balanced', None]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [61]:
confusion_matrix(clf.predict(X_iris), y_iris)

array([[50,  0,  0],
       [ 0, 48,  0],
       [ 0,  2, 50]])

# Confusion Matrix

In [38]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [64]:
y_true = [2, 0, 2, 2, 0, 1]
y_pred = [0, 0, 2, 2, 0, 2]
c_m = confusion_matrix(y_true, y_pred)
c_m

array([[2, 0, 0],
       [0, 0, 1],
       [1, 0, 2]])

In [41]:
#TODO plot_confusion_matrix(c_m, )

# Classification Report

# ROC