
Commit

init commit.
playerkk committed Dec 4, 2014
1 parent fd84e5c commit 4173a17
Showing 294 changed files with 32,055 additions and 0 deletions.
42 changes: 42 additions & 0 deletions boosting/test_boosted_dt_mc.m
@@ -0,0 +1,42 @@
function confidences = test_boosted_dt_mc(classifier, features)
% confidences = test_boosted_dt_mc(classifier, features)
%
% Returns a log likelihood ratio for each class in the classifier
%
% Input:
% classifier: boosted decision tree classifier
% features: classifier features (ndata, nvariables)
% Output:
% confidences(ndata, nclasses):
% P(class=k|features) \propto 1./(1+exp(-confidences(k)))

npred = classifier.wcs(1).dt.npred;
if size(features, 2)~=npred
error('Incorrect number of attributes')
end

wcs = classifier.wcs;
nclasses = size(wcs, 2);

ntrees = size(wcs, 1);

confidences = zeros(size(features, 1), nclasses);
for c = 1:nclasses
for t = 1:ntrees
if ~isempty(wcs(t,c).dt)
% run each example down the tree to get its terminal node index
% (an alternative path using tree_getParameters/treevalc is disabled here)
[class_indices, nodes, classes] = treeval(wcs(t, c).dt, features);
confidences(:, c) = confidences(:, c) + wcs(t, c).confidences(nodes);
end
end
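% add the per-class prior term h0(c)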
confidences(:, c) = confidences(:, c) + classifier.h0(c);
end


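As a usage note, here is a minimal sketch of calling the function above (the `classifier` struct is assumed to come from train_boosted_dt_mc later in this commit, and `test_features` is an illustrative feature matrix; neither appears in the original file):

% illustrative only: classifier and test_features are assumed inputs
conf = test_boosted_dt_mc(classifier, test_features);  % (ndata x nclasses) log-odds
prob = 1 ./ (1 + exp(-conf));                          % per-class confidence in (0, 1)
[~, pred] = max(conf, [], 2);                          % most confident class per example
pred_names = classifier.names(pred);                   % class names stored by the trainer
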
12 changes: 12 additions & 0 deletions boosting/test_boosted_kde_2c.m
@@ -0,0 +1,12 @@
function p = test_boosted_kde_2c(density, x, data)
% Evaluates the likelihood of points under a tabulated kernel density estimate
% density - the density values f(x) on the grid
% x - the points at which f is defined (assumed to be equally spaced)
% data - the data points to be evaluated
% p(i) - the value of f at the grid point in x closest to data(i)

n = length(x);
wx = x(2)-x(1);
indices = round((data - x(1))/wx) + 1;
indices = min(max(indices, 1), n);
p = density(indices);
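
A minimal sketch of the lookup above, using an assumed Gaussian density tabulated on an evenly spaced grid (none of these values come from the original code):

x = linspace(-3, 3, 101);               % evenly spaced grid where the density is defined
density = exp(-x.^2 / 2) / sqrt(2*pi);  % tabulated density values on that grid
data = [-2.7 0.05 4.2];                 % query points; out-of-range values clamp to the ends
p = test_boosted_kde_2c(density, x, data);
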
91 changes: 91 additions & 0 deletions boosting/train_boosted_dt_2c.m
@@ -0,0 +1,91 @@
function classifier = train_boosted_dt_2c(features, cat_features, ...
labels, num_iterations, nodespertree, stopval, w)
% classifier = train_boosted_dt_2c(features, cat_features, ...
% labels, num_iterations, nodespertree, stopval, w)
%
% Trains a two-class classifier based on boosted decision trees. Boosting is
% done by the logistic regression version of Adaboost (Adaboost.L - Collins,
% Schapire, Singer 2002). At each iteration, a decision tree is created, with
% confidences equal to 1/2*ln(P+/P-), according to the weighted distribution.
% Weights are assigned as
%   w(i) = 1 / (1+exp(y(i)*sum{t in iterations}[ht(xi)]))
% Input:
%   features(ndata, nfeatures)
%   cat_features - indices of discrete-valued (categorical) features (may be [])
%   labels - {-1, 1}
%   num_iterations - the number of trees to create
%   nodespertree - target number of terminal nodes per tree
%   stopval - stop early if the mean confidence improves by less than this
%             over the last ten trees (default 0)
%   w - optional initial example weights (default uniform)


num_data = length(labels);

cl = [-1 1];
y = labels;

if ~exist('stopval', 'var') || isempty(stopval)
stopval = 0;
end

if ~exist('w', 'var') || isempty(w)
w = ones(num_data, 1);
end
w = w/sum(w);

classifier.h0 = 0;

% for i = 1:2
% indices = find(y==cl(i));
% count = numel(indices);
% if cl(i)==1
% classifier.h0 = log(count / (num_data-count));
% end
% w(indices) = 1 / count/2;
% end

data_confidences = zeros(num_data, 1);
aveconf = [];

for t = 1:num_iterations
% learn decision tree based on weighted distribution
dt = treefitw(features, y, w, 1/num_data/2, 'catidx', cat_features, 'method', 'classification', 'maxnodes', nodespertree*4);
[tmp, level] = min(abs(dt.ntermnodes-nodespertree));
dt = treeprune(dt, 'level', level-1);
% assign partition confidences
temp_c = dt.classname;
pi = (strcmp(temp_c{1},'1')) + (2*strcmp(temp_c{2},'1'));
ni = (strcmp(temp_c{1},'-1')) + (2*strcmp(temp_c{2},'-1'));
classprob = dt.classprob;
confidences = 1/2*(log(classprob(:, pi)) - log(classprob(:, ni)));

% assign weights
[class_indices, nodes, classes] = treeval(dt, features);
data_confidences = data_confidences + confidences(nodes);
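% Adaboost.L weight update: examples with a small or negative margin y*F(x) get more weight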
w = 1 ./ (1+exp(y.*data_confidences));
w = w / sum(w);

pconf = mean(1./(1+exp(-data_confidences(y==1))));
nconf = mean(1./(1+exp(-data_confidences(y==-1))));

disp(['t: ', num2str(t), ' c: ' num2str(mean(1 ./ (1+exp(-y.*data_confidences)))) ' e: ' ...
num2str(mean(y.*data_confidences < 0)) ' c_p: ' num2str(pconf) ' c_n: ' num2str(nconf)]);

classifier.wcs(t,1).dt = dt;
classifier.wcs(t,1).confidences = confidences;
%pause(0.1);

aveconf(t) = mean(1 ./ (1+exp(-y.*data_confidences)));
if t>10 && (aveconf(t)-aveconf(t-10) < stopval)
disp(num2str(aveconf))
disp(['Stopping after ' num2str(t) ' trees'])
break;
end

end

disp(['mean conf = ' num2str(mean(1 ./ (1+exp(-y.*data_confidences))))]);
disp(['training error: ' num2str(mean(y.*data_confidences < 0))]);







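A minimal training sketch for the two-class booster (random data; the choices of 20 trees and 8 nodes per tree are illustrative, not values prescribed by this file). It assumes the helper routines referenced above (treefitw, treeprune, treeval) are available from the rest of this commit or the Statistics Toolbox:

X = randn(200, 5);                            % ndata x nfeatures, synthetic
y = 2*(X(:,1) + 0.25*randn(200,1) > 0) - 1;   % labels in {-1, 1}
clf2 = train_boosted_dt_2c(X, [], y, 20, 8);
conf2 = test_boosted_dt_mc(clf2, X);          % one column of log-odds for the +1 class
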
155 changes: 155 additions & 0 deletions boosting/train_boosted_dt_mc.m
@@ -0,0 +1,155 @@
function classifier = train_boosted_dt_mc(features, cat_features, labels, ...
num_iterations, num_nodes, stopval, init_weights, varargin)
%
%classifier = train_boosted_dt_mc(features, cat_features, labels, ...
% num_iterations, num_nodes, stopval, init_weights, varargin)
%
% Trains a multi-class classifier based on boosted decision trees. Boosting is
% done by the logistic regression version of Adaboost (Adaboost.L - Collins,
% Schapire, Singer 2002). At each iteration, a decision tree is created for
% each class, with confidences equal to 1/2*ln(P+/P-) for that class,
% according to the weighted distribution. Final classification is based on
% the largest-confidence label (possibly incorporating a prior as
% h0(c) = 1/2*ln(Pc/(1-Pc))). Weights are assigned as
%   w(i,j) = 1 / (1+exp(sum{t in iterations}[yij*ht(xi, j)]))
% Input:
%   features(ndata, nfeatures)
%   cat_features - indices of discrete-valued (categorical) features (may be [])
%   labels - class labels (cell array of strings or anything grp2idx accepts)
%   num_iterations - the number of boosting rounds (trees per class)
%   num_nodes - target number of terminal nodes per tree
%   stopval - stop a class early if the mean confidence improves by less than
%             this over the last ten trees (default 0)
%   init_weights - optional initial example weights (default uniform)
%   varargin{1} - optional cell array of class names; examples whose label is
%                 not in this list are discarded

if length(varargin) == 1 % class names supplied
gn = varargin{1};
gid = zeros(size(labels));
for c = 1:length(gn)
ind = find(strcmp(labels, gn{c}));
gid(ind) = c;
if ~isempty(init_weights)
disp([gn{c} ': ' num2str(sum(init_weights(ind)))]);
else
disp([gn{c} ': ' num2str(length(ind))]);
end
end
ind = find(gid==0);
gid(ind) = [];
labels(ind) = [];
features(ind, :) = [];
else
[gid, gn] = grp2idx(labels);
end

if ~exist('stopval', 'var') || isempty(stopval)
stopval = 0;
end
if ~exist('init_weights', 'var')
init_weights = [];
end

classifier.names = gn;

num_classes = length(gn);
num_data = length(gid);

if isempty(init_weights)
init_weights = ones(num_data, 1)/num_data;
else
init_weights = init_weights / sum(init_weights);
end

% if no examples from a class are present, create one dummy example for
% that class with very small weight
for c = 1:numel(gn)
if ~any(gid==c)
disp(['warning: no examples from class ' gn{c}])
gid(end+1) = c;
features(end+1, :) = zeros(1, size(features, 2));
num_data = num_data + 1;
init_weights(end+1) = min(init_weights)/2;
end
end

all_conf = zeros(num_data, num_classes);
for c = 1:num_classes

disp(['class: ' num2str(gn{c})]);
y = (gid == c)*2-1;
cl = [-1 1];
nc = 2;
w = zeros(num_data, 1);
cw = zeros(num_classes, 1);
for i = 1:2
indices = find(y==cl(i));
%count = sum(init_weights(indices));
%w(indices) = init_weights(indices) / count / 2;
w(indices) = init_weights(indices);

if cl(i)==1
%classifier.h0(c) = log(count / (1-count));
classifier.h0(c) = 0;
end

end

data_confidences = zeros(num_data, 1);
aveconf = [];

for t = 1:num_iterations
% learn decision tree based on weighted distribution
dt = treefitw(features, y, w, 1/num_data/2, 'catidx', cat_features, 'method', 'classification', 'maxnodes', num_nodes*4);
[tmp, level] = min(abs(dt.ntermnodes-num_nodes));
dt = treeprune(dt, 'level', level-1);

% assign partition confidences
temp_c = dt.classname;
pi = (strcmp(temp_c{1},'1')) + (2*strcmp(temp_c{2},'1'));
ni = (strcmp(temp_c{1},'-1')) + (2*strcmp(temp_c{2},'-1'));

classprob = dt.classprob;
confidences = 1/2*(log(classprob(:, pi)) - log(classprob(:, ni)));

% assign weights
[class_indices, nodes, classes] = treeval(dt, features);
data_confidences = data_confidences + confidences(nodes);

w = 1 ./ (1+exp(y.*data_confidences));
w = w / sum(w);

%disp(['c: ' num2str(mean(1 ./ (1+exp(-y.*data_confidences)))) ' e: ' num2str(mean(y.*data_confidences < 0)) ' w: ' num2str(max(w))]);

classifier.wcs(t, c).dt = dt;
classifier.wcs(t, c).confidences = confidences;


aveconf(t) = mean(1 ./ (1+exp(-y.*data_confidences)));
if t>10 && (aveconf(t)-aveconf(t-10) < stopval)
disp(num2str(aveconf))
disp(['Stopping after ' num2str(t) ' trees'])
break;
end

end

finalconf = 1 ./ (1+exp(-y.*data_confidences));
finalerr = (y.*data_confidences < 0);
disp(['confidence:: mean: ' num2str(mean(finalconf)) ...
' pos: ' num2str(mean(finalconf(y==1))) ...
' neg: ' num2str(mean(finalconf(y~=1)))]);
disp(['training error:: mean: ' num2str(mean(finalerr)) ...
' pos: ' num2str(mean(finalerr(y==1))) ...
' neg: ' num2str(mean(finalerr(y~=1)))]);
all_conf(:, c) = data_confidences+classifier.h0(c);

end

% compute and display training error
[tmp, assigned_label] = max(all_conf, [], 2);
conf_matrix = zeros(num_classes, num_classes);
for c = 1:num_classes
indices = find(gid==c);
for c2 = 1:num_classes
conf_matrix(c, c2) = mean(assigned_label(indices)==c2);
end
disp([gn{c} ' error: ' num2str(mean(assigned_label(indices)~=c))]);
end
disp('Confusion Matrix: ');
disp(num2str(conf_matrix));
disp(['total error: ' num2str(mean(assigned_label~=gid))]);



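Putting the pieces together, a hedged end-to-end sketch of the multi-class trainer and the scorer from the first file (the synthetic features, the hypothetical class names, and the settings of 50 rounds / 8 nodes are placeholders, not values taken from this commit):

features = randn(300, 10);                  % ndata x nfeatures, synthetic
names = {'grass'; 'sky'; 'road'};           % hypothetical class names
labels = names(randi(3, 300, 1));           % 300x1 cell array of label strings
classifier = train_boosted_dt_mc(features, [], labels, 50, 8, 0, [], names);
conf = test_boosted_dt_mc(classifier, features);  % (ndata x nclasses) log-odds
[~, pred] = max(conf, [], 2);                      % index into classifier.names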