Permalink
Browse files

Using coo_matrix to accelerate confusion_matrix calculation

  • Loading branch information...
weilinear authored and amueller committed Sep 17, 2012
1 parent 7c8521c commit 9d3fa6e73a086040c03513aa55a521af7ef6e7df
Showing with 24 additions and 4 deletions.
  1. +10 −4 sklearn/metrics/cluster/supervised.py
  2. +14 −0 sklearn/metrics/metrics.py
@@ -12,6 +12,7 @@
from scipy.misc import comb
from scipy.special import gammaln
from scipy.sparse import coo_matrix
+from ..metrics import confusion_matrix
import numpy as np
@@ -71,6 +72,8 @@ def contingency_matrix(labels_true, labels_pred, eps=None):
clusters, cluster_idx = np.unique(labels_pred, return_inverse=True)
n_classes = classes.shape[0]
n_clusters = clusters.shape[0]
+ # Build the contingency matrix with coo_matrix: duplicate (row, col) entries
+ # are summed on conversion, which makes it a fast way to build a 2D histogram.
contingency = np.asarray(coo_matrix((np.ones(class_idx.shape[0]),
(class_idx, cluster_idx)),
shape=(n_classes, n_clusters),
@@ -533,10 +536,13 @@ def mutual_info_score(labels_true, labels_pred, contingency=None):
pj = np.sum(contingency, axis=0)
outer = np.outer(pi, pj)
nnz = contingency != 0.0
- mi = ((contingency[nnz] / contingency_sum) *
- (np.log(contingency[nnz]) - log(contingency_sum))
- + (contingency[nnz] / contingency_sum) *
- (-np.log(outer[nnz]) + log(pi.sum()) + log(pj.sum())))
+ # non-zero contingency entries; the log is taken before they are normalized
+ contingency_nm = contingency[nnz]
+ log_contingency_nm = np.log(contingency_nm)
+ contingency_nm /= contingency_sum
+ log_outer = -np.log(outer[nnz]) + log(pi.sum()) + log(pj.sum())
+ mi = (contingency_nm * (log_contingency_nm - log(contingency_sum))
+ + contingency_nm * log_outer)
return mi.sum()
View
@@ -15,6 +15,7 @@
import numpy as np
from ..utils import check_arrays
+from scipy.sparse import coo_matrix
def unique_labels(*lists_of_labels):
@@ -63,6 +64,19 @@ def confusion_matrix(y_true, y_pred, labels=None):
n_labels = labels.size
label_to_ind = dict((y, x) for x, y in enumerate(labels))
+ # convert y_true, y_pred into label indices
+ y_pred = np.array([label_to_ind[x] for x in y_pred])
+ y_true = np.array([label_to_ind[x] for x in y_true])
+
+ # intersect y_pred, y_true with labels
+ y_pred = y_pred[y_pred < n_labels]
+ y_true = y_true[y_true < n_labels]
+
+ CM = np.asarray(coo_matrix((np.ones(y_true.shape[0]),
+ (y_true, y_pred)),
+ shape=(n_labels, n_labels),
+ dtype=np.int).todense())
+ return CM
if n_labels >= 15:
CM = np.zeros((n_labels, n_labels), dtype=np.long)

0 comments on commit 9d3fa6e

Please sign in to comment.