
Commit

init commit.
playerkk committed Dec 4, 2014
1 parent fd84e5c commit 4173a17
Showing 294 changed files with 32,055 additions and 0 deletions.
42 changes: 42 additions & 0 deletions boosting/test_boosted_dt_mc.m
@@ -0,0 +1,42 @@
function confidences = test_boosted_dt_mc(classifier, features)
% confidences = test_boosted_dt_mc(classifier, features)
%
% Returns a log likelihood ratio for each class in the classifier
%
% Input:
% classifier: boosted decision tree classifier
% features: classifier features (ndata, nvariables)
% Output:
% confidences(ndata, nclasses):
% P(class=k|features) \propto 1./(1+exp(-confidences(k)))

npred = classifier.wcs(1).dt.npred;
if size(features, 2)~=npred
error('Incorrect number of attributes')
end

wcs = classifier.wcs;
nclasses = size(wcs, 2);

ntrees = size(wcs, 1);

confidences = zeros(size(features, 1), nclasses);
for c = 1:nclasses
for t = 1:ntrees
if ~isempty(wcs(t,c).dt)
% run each example down the tree to get its terminal node index
% (an alternative path using tree_getParameters/treevalc is disabled here)
[class_indices, nodes, classes] = treeval(wcs(t, c).dt, features);
confidences(:, c) = confidences(:, c) + wcs(t, c).confidences(nodes);
end
end
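% add the per-class prior term h0(c)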
confidences(:, c) = confidences(:, c) + classifier.h0(c);
end


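As a usage note, here is a minimal sketch of calling the function above (the `classifier` struct is assumed to come from train_boosted_dt_mc later in this commit, and `test_features` is an illustrative feature matrix; neither appears in the original file):

% illustrative only: classifier and test_features are assumed inputs
conf = test_boosted_dt_mc(classifier, test_features);  % (ndata x nclasses) log-odds
prob = 1 ./ (1 + exp(-conf));                          % per-class confidence in (0, 1)
[~, pred] = max(conf, [], 2);                          % most confident class per example
pred_names = classifier.names(pred);                   % class names stored by the trainer
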
12 changes: 12 additions & 0 deletions boosting/test_boosted_kde_2c.m
@@ -0,0 +1,12 @@
function p = test_boosted_kde_2c(density, x, data)
% Evaluates the likelihood of points under a tabulated kernel density estimate
% density - the density values f(x) on the grid
% x - the points at which f is defined (assumed to be equally spaced)
% data - the data points to be evaluated
% p(i) - the value of f at the grid point in x closest to data(i)

n = length(x);
wx = x(2)-x(1);
indices = round((data - x(1))/wx) + 1;
indices = min(max(indices, 1), n);
p = density(indices);
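
A minimal sketch of the lookup above, using an assumed Gaussian density tabulated on an evenly spaced grid (none of these values come from the original code):

x = linspace(-3, 3, 101);               % evenly spaced grid where the density is defined
density = exp(-x.^2 / 2) / sqrt(2*pi);  % tabulated density values on that grid
data = [-2.7 0.05 4.2];                 % query points; out-of-range values clamp to the ends
p = test_boosted_kde_2c(density, x, data);
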
91 changes: 91 additions & 0 deletions boosting/train_boosted_dt_2c.m
@@ -0,0 +1,91 @@
function classifier = train_boosted_dt_2c(features, cat_features, ...
labels, num_iterations, nodespertree, stopval, w)
% classifier = train_boosted_dt_2c(features, cat_features, ...
% labels, num_iterations, nodespertree, stopval, w)
%
% Trains a two-class classifier based on boosted decision trees. Boosting is
% done by the logistic regression version of Adaboost (Adaboost.L - Collins,
% Schapire, Singer 2002). At each iteration, a decision tree is created, with
% confidences equal to 1/2*ln(P+/P-), according to the weighted distribution.
% Weights are assigned as
%   w(i) = 1 / (1+exp(y(i)*sum{t in iterations}[ht(xi)]))
% Input:
%   features(ndata, nfeatures)
%   cat_features - indices of discrete-valued (categorical) features (may be [])
%   labels - {-1, 1}
%   num_iterations - the number of trees to create
%   nodespertree - target number of terminal nodes per tree
%   stopval - stop early if the mean confidence improves by less than this
%             over the last ten trees (default 0)
%   w - optional initial example weights (default uniform)


num_data = length(labels);

cl = [-1 1];
y = labels;

if ~exist('stopval', 'var') || isempty(stopval)
stopval = 0;
end

if ~exist('w', 'var') || isempty(w)
w = ones(num_data, 1);
end
w = w/sum(w);

classifier.h0 = 0;

% for i = 1:2
% indices = find(y==cl(i));
% count = numel(indices);
% if cl(i)==1
% classifier.h0 = log(count / (num_data-count));
% end
% w(indices) = 1 / count/2;
% end

data_confidences = zeros(num_data, 1);
aveconf = [];

for t = 1:num_iterations
% learn decision tree based on weighted distribution
dt = treefitw(features, y, w, 1/num_data/2, 'catidx', cat_features, 'method', 'classification', 'maxnodes', nodespertree*4);
[tmp, level] = min(abs(dt.ntermnodes-nodespertree));
dt = treeprune(dt, 'level', level-1);
% assign partition confidences
temp_c = dt.classname;
pi = (strcmp(temp_c{1},'1')) + (2*strcmp(temp_c{2},'1'));
ni = (strcmp(temp_c{1},'-1')) + (2*strcmp(temp_c{2},'-1'));
classprob = dt.classprob;
confidences = 1/2*(log(classprob(:, pi)) - log(classprob(:, ni)));

% assign weights
[class_indices, nodes, classes] = treeval(dt, features);
data_confidences = data_confidences + confidences(nodes);
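% Adaboost.L weight update: examples with a small or negative margin y*F(x) get more weight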
w = 1 ./ (1+exp(y.*data_confidences));
w = w / sum(w);

pconf = mean(1./(1+exp(-data_confidences(y==1))));
nconf = mean(1./(1+exp(-data_confidences(y==-1))));

disp(['t: ', num2str(t), ' c: ' num2str(mean(1 ./ (1+exp(-y.*data_confidences)))) ' e: ' ...
num2str(mean(y.*data_confidences < 0)) ' c_p: ' num2str(pconf) ' c_n: ' num2str(nconf)]);

classifier.wcs(t,1).dt = dt;
classifier.wcs(t,1).confidences = confidences;
%pause(0.1);

aveconf(t) = mean(1 ./ (1+exp(-y.*data_confidences)));
if t>10 && (aveconf(t)-aveconf(t-10) < stopval)
disp(num2str(aveconf))
disp(['Stopping after ' num2str(t) ' trees'])
break;
end

end

disp(['mean conf = ' num2str(mean(1 ./ (1+exp(-y.*data_confidences))))]);
disp(['training error: ' num2str(mean(y.*data_confidences < 0))]);







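A minimal training sketch for the two-class booster (random data; the choices of 20 trees and 8 nodes per tree are illustrative, not values prescribed by this file). It assumes the helper routines referenced above (treefitw, treeprune, treeval) are available from the rest of this commit or the Statistics Toolbox:

X = randn(200, 5);                            % ndata x nfeatures, synthetic
y = 2*(X(:,1) + 0.25*randn(200,1) > 0) - 1;   % labels in {-1, 1}
clf2 = train_boosted_dt_2c(X, [], y, 20, 8);
conf2 = test_boosted_dt_mc(clf2, X);          % one column of log-odds for the +1 class
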
155 changes: 155 additions & 0 deletions boosting/train_boosted_dt_mc.m
@@ -0,0 +1,155 @@
function classifier = train_boosted_dt_mc(features, cat_features, labels, ...
num_iterations, num_nodes, stopval, init_weights, varargin)
%
%classifier = train_boosted_dt_mc(features, cat_features, labels, ...
% num_iterations, num_nodes, stopval, init_weights, varargin)
%
% Trains a multi-class classifier based on boosted decision trees. Boosting is
% done by the logistic regression version of Adaboost (Adaboost.L - Collins,
% Schapire, Singer 2002). At each iteration, a decision tree is created for
% each class, with confidences equal to 1/2*ln(P+/P-) for that class,
% according to the weighted distribution. Final classification is based on
% the largest-confidence label (possibly incorporating a prior as
% h0(c) = 1/2*ln(Pc/(1-Pc))). Weights are assigned as
%   w(i,j) = 1 / (1+exp(sum{t in iterations}[yij*ht(xi, j)]))
% Input:
%   features(ndata, nfeatures)
%   cat_features - indices of discrete-valued (categorical) features (may be [])
%   labels - class labels (cell array of strings or anything grp2idx accepts)
%   num_iterations - the number of boosting rounds (trees per class)
%   num_nodes - target number of terminal nodes per tree
%   stopval - stop a class early if the mean confidence improves by less than
%             this over the last ten trees (default 0)
%   init_weights - optional initial example weights (default uniform)
%   varargin{1} - optional cell array of class names; examples whose label is
%                 not in this list are discarded

if length(varargin) == 1 % class names supplied
gn = varargin{1};
gid = zeros(size(labels));
for c = 1:length(gn)
ind = find(strcmp(labels, gn{c}));
gid(ind) = c;
if ~isempty(init_weights)
disp([gn{c} ': ' num2str(sum(init_weights(ind)))]);
else
disp([gn{c} ': ' num2str(length(ind))]);
end
end
ind = find(gid==0);
gid(ind) = [];
labels(ind) = [];
features(ind, :) = [];
else
[gid, gn] = grp2idx(labels);
end

if ~exist('stopval', 'var') || isempty(stopval)
stopval = 0;
end
if ~exist('init_weights', 'var')
init_weights = [];
end

classifier.names = gn;

num_classes = length(gn);
num_data = length(gid);

if isempty(init_weights)
init_weights = ones(num_data, 1)/num_data;
else
init_weights = init_weights / sum(init_weights);
end

% if no examples from a class are present, create one dummy example for
% that class with very small weight
for c = 1:numel(gn)
if ~any(gid==c)
disp(['warning: no examples from class ' gn{c}])
gid(end+1) = c;
features(end+1, :) = zeros(1, size(features, 2));
num_data = num_data + 1;
init_weights(end+1) = min(init_weights)/2;
end
end

all_conf = zeros(num_data, num_classes);
for c = 1:num_classes

disp(['class: ' num2str(gn{c})]);
y = (gid == c)*2-1;
cl = [-1 1];
nc = 2;
w = zeros(num_data, 1);
cw = zeros(num_classes, 1);
for i = 1:2
indices = find(y==cl(i));
%count = sum(init_weights(indices));
%w(indices) = init_weights(indices) / count / 2;
w(indices) = init_weights(indices);

if cl(i)==1
%classifier.h0(c) = log(count / (1-count));
classifier.h0(c) = 0;
end

end

data_confidences = zeros(num_data, 1);
aveconf = [];

for t = 1:num_iterations
% learn decision tree based on weighted distribution
dt = treefitw(features, y, w, 1/num_data/2, 'catidx', cat_features, 'method', 'classification', 'maxnodes', num_nodes*4);
[tmp, level] = min(abs(dt.ntermnodes-num_nodes));
dt = treeprune(dt, 'level', level-1);

% assign partition confidences
temp_c = dt.classname;
pi = (strcmp(temp_c{1},'1')) + (2*strcmp(temp_c{2},'1'));
ni = (strcmp(temp_c{1},'-1')) + (2*strcmp(temp_c{2},'-1'));

classprob = dt.classprob;
confidences = 1/2*(log(classprob(:, pi)) - log(classprob(:, ni)));

% assign weights
[class_indices, nodes, classes] = treeval(dt, features);
data_confidences = data_confidences + confidences(nodes);

w = 1 ./ (1+exp(y.*data_confidences));
w = w / sum(w);

%disp(['c: ' num2str(mean(1 ./ (1+exp(-y.*data_confidences)))) ' e: ' num2str(mean(y.*data_confidences < 0)) ' w: ' num2str(max(w))]);

classifier.wcs(t, c).dt = dt;
classifier.wcs(t, c).confidences = confidences;


aveconf(t) = mean(1 ./ (1+exp(-y.*data_confidences)));
if t>10 && (aveconf(t)-aveconf(t-10) < stopval)
disp(num2str(aveconf))
disp(['Stopping after ' num2str(t) ' trees'])
break;
end

end

finalconf = 1 ./ (1+exp(-y.*data_confidences));
finalerr = (y.*data_confidences < 0);
disp(['confidence:: mean: ' num2str(mean(finalconf)) ...
' pos: ' num2str(mean(finalconf(y==1))) ...
' neg: ' num2str(mean(finalconf(y~=1)))]);
disp(['training error:: mean: ' num2str(mean(finalerr)) ...
' pos: ' num2str(mean(finalerr(y==1))) ...
' neg: ' num2str(mean(finalerr(y~=1)))]);
all_conf(:, c) = data_confidences+classifier.h0(c);

end

% compute and display training error
[tmp, assigned_label] = max(all_conf, [], 2);
conf_matrix = zeros(num_classes, num_classes);
for c = 1:num_classes
indices = find(gid==c);
for c2 = 1:num_classes
conf_matrix(c, c2) = mean(assigned_label(indices)==c2);
end
disp([gn{c} ' error: ' num2str(mean(assigned_label(indices)~=c))]);
end
disp('Confusion Matrix: ');
disp(num2str(conf_matrix));
disp(['total error: ' num2str(mean(assigned_label~=gid))]);



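Putting the pieces together, a hedged end-to-end sketch of the multi-class trainer and the scorer from the first file (the synthetic features, the hypothetical class names, and the settings of 50 rounds / 8 nodes are placeholders, not values taken from this commit):

features = randn(300, 10);                  % ndata x nfeatures, synthetic
names = {'grass'; 'sky'; 'road'};           % hypothetical class names
labels = names(randi(3, 300, 1));           % 300x1 cell array of label strings
classifier = train_boosted_dt_mc(features, [], labels, 50, 8, 0, [], names);
conf = test_boosted_dt_mc(classifier, features);  % (ndata x nclasses) log-odds
[~, pred] = max(conf, [], 2);                      % index into classifier.names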