In [1]:
import pickle
from sklearn.model_selection import StratifiedShuffleSplit

In [2]:
from feature_format import featureFormat, targetFeatureSplit

In [3]:
PERF_FORMAT_STRING = "\
\tAccuracy: {:>0.{display_precision}f}\tPrecision: {:>0.{display_precision}f}\t\
Recall: {:>0.{display_precision}f}\tF1: {:>0.{display_precision}f}\tF2: {:>0.{display_precision}f}"
RESULTS_FORMAT_STRING = "\tTotal predictions: {:4d}\tTrue positives: {:4d}\tFalse positives: {:4d}\
\tFalse negatives: {:4d}\tTrue negatives: {:4d}"

In [4]:
CLF_PICKLE_FILENAME = "my_classifier.pkl"
DATASET_PICKLE_FILENAME = "my_dataset.pkl"
FEATURE_LIST_FILENAME = "my_feature_list.pkl"

In [5]:
def dump_classifier_and_data(clf, dataset, feature_list):
    with open(CLF_PICKLE_FILENAME, "w") as clf_outfile:
        pickle.dump(clf, clf_outfile)
    with open(DATASET_PICKLE_FILENAME, "w") as dataset_outfile:
        pickle.dump(dataset, dataset_outfile)
    with open(FEATURE_LIST_FILENAME, "w") as featurelist_outfile:
        pickle.dump(feature_list, featurelist_outfile)

def load_classifier_and_data():
    with open(CLF_PICKLE_FILENAME, "r") as clf_infile:
        clf = pickle.load(clf_infile)
    with open(DATASET_PICKLE_FILENAME, "r") as dataset_infile:
        dataset = pickle.load(dataset_infile)
    with open(FEATURE_LIST_FILENAME, "r") as featurelist_infile:
        feature_list = pickle.load(featurelist_infile)
    return clf, dataset, feature_list

In [6]:
clf, dataset, feature_list = load_classifier_and_data()

In [7]:
data = featureFormat(dataset, feature_list, sort_keys=True)

In [8]:
labels, features = targetFeatureSplit(data)

In [9]:
cv = StratifiedShuffleSplit(n_splits=1000, random_state=42)

In [10]:
cv

StratifiedShuffleSplit(n_splits=1000, random_state=42, test_size=0.1,
            train_size=None)

In [11]:
import numpy as np
features = np.array(features)

In [12]:
labels=np.array(labels)

In [14]:
for train_idx, test_idx in cv.split(features, labels):
    print("Train:", train_idx, "Test:", test_idx)
    features_train, features_test = features[train_idx], features[test_idx]
    labels_train, labels_test = labels[train_idx], labels[test_idx]

('Train:', array([122,  30,   4,  85,  34,   1,  55, 116,  54,   2,  71,  17, 140,
        14, 136,  28,  66, 102,  44,  41,  37,  61,  45, 128,  36,  90,
       129,  91,  94, 107, 131,  62,   9, 132,  33, 119,   8, 121, 101,
        80,   6,  12,  58,  74,  67,  46,  25, 113, 139, 103,  83,  11,
        57,  69,  65, 124,  49,  98, 133,  82,  81, 108,  53,  78, 100,
        73, 134,  32,  92,  59,  47,   7, 141,  63,  19, 126,  42,  79,
        96,  76,  21,  38, 110,  56, 125,  97,  39,  48, 114, 104,  18,
       135,  72,  20, 115,  29, 138,  40,   3,  86,  51,  27,  52,  10,
       112,  15,   0, 120, 137,  89,  22,  75, 111,  31,  88,  50,   5,
        93,  64,  68,  23, 127, 106,  13,  35, 109,  77,  43]), 'Test:', array([142, 117,  24,  84,  60,  87, 130, 105,  95, 123, 118,  70,  99,
        16,  26]))
('Train:', array([ 31,  86,  96,  25, 133, 124,  89,  64, 128,  56, 119,  58, 121,
       111,  62,  19, 102,  16,  52,  27,  13,  72,  36,  38,  17,  83,
       134, 110, 116, 

In [25]:
len(features_train)

128

In [27]:
len(features_test)

15

In [28]:
len(labels_train)

128

In [29]:
len(labels_test)

15

In [15]:
clf.fit(features_train, labels_train)

Pipeline(steps=[('minmaxscaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('pca', PCA(copy=True, iterated_power='auto', n_components=12, random_state=42,
  svd_solver='auto', tol=0.0, whiten=False)), ('decisiontreeclassifier', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,...plit=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=42, splitter='best'))])

In [16]:
predictions = clf.predict(features_test)

In [17]:
predictions

array([ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        1.,  0.])

In [18]:
labels_test

array([ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        1.,  0.])

In [24]:
len(labels_test)

15

In [19]:
true_negatives = 0
false_negatives = 0
true_positives = 0
false_positives = 0

In [20]:
for prediction, truth in zip(predictions, labels_test):
    if prediction == 0 and truth == 0:
        true_negatives += 1
    elif prediction == 0 and truth == 1:
        false_negatives += 1
    elif prediction == 1 and truth == 0:
        false_positives += 1
    elif prediction == 1 and truth == 1:
        true_positives += 1
    else:
        print "Warning: Found a predicted label not == 0 or 1."
        print "All predictions should take value 0 or 1."
        print "Evaluating performance for processed predictions:"
        break

In [21]:
try:
    total_predictions = true_negatives + false_negatives + false_positives + true_positives
    accuracy = 1.0*(true_positives + true_negatives)/total_predictions
    precision = 1.0*true_positives/(true_positives+false_positives)
    recall = 1.0*true_positives/(true_positives+false_negatives)
    f1 = 2.0 * true_positives/(2*true_positives + false_positives+false_negatives)
    f2 = (1+2.0*2.0) * precision*recall/(4*precision + recall)
    print clf
    print PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision = 5)
    print RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives, false_negatives, true_negatives)
    print ""
except:
    print "Got a divide by zero when trying out:", clf
    print "Precision or recall may be undefined due to a lack of true positive predicitons."

Pipeline(steps=[('minmaxscaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('pca', PCA(copy=True, iterated_power='auto', n_components=12, random_state=42,
  svd_solver='auto', tol=0.0, whiten=False)), ('decisiontreeclassifier', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,...plit=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=42, splitter='best'))])
	Accuracy: 0.86667	Precision: 0.50000	Recall: 0.50000	F1: 0.50000	F2: 0.50000
	Total predictions:   15	True positives:    1	False positives:    1	False negatives:    1	True negatives:   12



In [22]:
from tester import test_classifier



In [23]:
test_classifier(clf, dataset, feature_list)

Pipeline(steps=[('minmaxscaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('pca', PCA(copy=True, iterated_power='auto', n_components=12, random_state=42,
  svd_solver='auto', tol=0.0, whiten=False)), ('decisiontreeclassifier', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,...plit=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=42, splitter='best'))])
	Accuracy: 0.86667	Precision: 0.50000	Recall: 0.50000	F1: 0.50000	F2: 0.50000
	Total predictions:   15	True positives:    1	False positives:    1	False negatives:    1	True negatives:   12

