Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP

E2006 tfidf regression evaluation (PA, AROW)

Makoto YUI edited this page · 9 revisions

Analytics

Clone this wiki locally

http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression.html#E2006-tfidf


[Preparation]

-- Session setup: reload the Hivemall jar and run the function-definition script.
-- Remove any previously registered copy of the jar from the session first,
-- so that the add below picks up the current file.
delete jar /home/myui/tmp/hivemall.jar;
add jar /home/myui/tmp/hivemall.jar;

-- Presumably registers the Hivemall functions used below
-- (addBias, pa1a_regress, pa2a_regress, arow_regress, arowe_regress) - confirm against the script.
source /home/myui/tmp/define-all.hive;

[PA1a]

Training

-- Train the PA1a regression model from e2006tfidf_train_x3 and store, for each
-- feature, the average of all weights that pa1a_regress emitted for it.
-- Pin the reducer count for the group-by while the model table is built.
set mapred.reduce.tasks=64;
-- The if-exists form keeps the script re-runnable on a fresh database, even
-- when hive.exec.drop.ignorenonexistent is set to false.
drop table if exists e2006tfidf_pa1a_model;
create table e2006tfidf_pa1a_model as
select 
 feature,
 -- plain average of the emitted weights (voted_avg is meant for classification only)
 cast(avg(weight) as float) as weight
from 
 (select 
     -- addBias presumably appends the bias feature to each vector - confirm in define-all.hive
     pa1a_regress(addBias(features),target) as (feature,weight)
  from 
     e2006tfidf_train_x3
 ) t 
group by feature;
-- Restore -1 so that Hive chooses the number of reducers again.
set mapred.reduce.tasks=-1;

Caution: Do not use voted_avg() for regression. voted_avg() is for classification.

prediction

-- PA1a prediction per test row: the sum, over the exploded features of the row,
-- of (model weight * feature value).
create or replace view e2006tfidf_pa1a_predict as
select
  te.rowid,
  sum(mdl.weight * te.value) as predicted
from e2006tfidf_test_exploded te
LEFT OUTER JOIN e2006tfidf_pa1a_model mdl
  ON (te.feature = mdl.feature)  -- features missing from the model give NULL products, which sum() skips
group by te.rowid;

evaluation

-- Pair the actual target of every test row with its PA1a prediction.
-- The if-exists form keeps the script re-runnable on a fresh database, even
-- when hive.exec.drop.ignorenonexistent is set to false.
drop table if exists e2006tfidf_pa1a_submit;
create table e2006tfidf_pa1a_submit as
select 
  t.target as actual, 
  pd.predicted as predicted
from 
  e2006tfidf_test t JOIN e2006tfidf_pa1a_predict pd 
    on (t.rowid = pd.rowid);

-- Sanity check of the two means. The avg(actual) value is the one to use for
-- the mean_actual variable in the R2 computation.
select avg(actual), avg(predicted) from e2006tfidf_pa1a_submit;

-3.8200363760415414 -3.8869923258589476

-- Mean of the actual targets, copied by hand from the avg(actual) query output.
set hivevar:mean_actual=-3.8200363760415414;

-- Error metrics of the PA1a predictions over the submit table.
select
  -- root mean squared error
  sqrt(sum(pow(predicted - actual, 2.0)) / count(1)) as RMSE,
  -- mean squared error
  sum(pow(predicted - actual, 2.0)) / count(1) as MSE,
  -- mean absolute error
  sum(abs(predicted - actual)) / count(1) as MAE,
  -- coefficient of determination: 1 - (residual sum of squares / total sum of squares)
  1 - sum(pow(actual - predicted, 2.0)) / sum(pow(actual - ${mean_actual}, 2.0)) as R2
from e2006tfidf_pa1a_submit;

0.3797959864675519 0.14424499133686086 0.23846059576113587 0.5010367946980386


[PA2a]

Training

-- Train the PA2a regression model from e2006tfidf_train_x3 and store, for each
-- feature, the average of all weights that pa2a_regress emitted for it.
-- Pin the reducer count for the group-by while the model table is built.
set mapred.reduce.tasks=64;
-- The if-exists form keeps the script re-runnable on a fresh database, even
-- when hive.exec.drop.ignorenonexistent is set to false.
drop table if exists e2006tfidf_pa2a_model;
create table e2006tfidf_pa2a_model as
select 
 feature,
 -- plain average of the emitted weights (voted_avg is meant for classification only)
 cast(avg(weight) as float) as weight
from 
 (select 
     -- addBias presumably appends the bias feature to each vector - confirm in define-all.hive
     pa2a_regress(addBias(features),target) as (feature,weight)
  from 
     e2006tfidf_train_x3
 ) t 
group by feature;
-- Restore -1 so that Hive chooses the number of reducers again.
set mapred.reduce.tasks=-1;

prediction

-- PA2a prediction per test row: the sum, over the exploded features of the row,
-- of (model weight * feature value).
create or replace view e2006tfidf_pa2a_predict as
select
  te.rowid,
  sum(mdl.weight * te.value) as predicted
from e2006tfidf_test_exploded te
LEFT OUTER JOIN e2006tfidf_pa2a_model mdl
  ON (te.feature = mdl.feature)  -- features missing from the model give NULL products, which sum() skips
group by te.rowid;

evaluation

-- Pair the actual target of every test row with its PA2a prediction.
-- The if-exists form keeps the script re-runnable on a fresh database, even
-- when hive.exec.drop.ignorenonexistent is set to false.
drop table if exists e2006tfidf_pa2a_submit;
create table e2006tfidf_pa2a_submit as
select 
  t.target as actual, 
  pd.predicted as predicted
from 
  e2006tfidf_test t JOIN e2006tfidf_pa2a_predict pd 
    on (t.rowid = pd.rowid);

-- Sanity check of the two means. The avg(actual) value is the one to use for
-- the mean_actual variable in the R2 computation.
select avg(actual), avg(predicted) from e2006tfidf_pa2a_submit;

-3.8200363760415414 -3.9124877451612488

-- Mean of the actual targets, copied by hand from the avg(actual) query output.
set hivevar:mean_actual=-3.8200363760415414;

-- Error metrics of the PA2a predictions over the submit table.
select
  -- root mean squared error
  sqrt(sum(pow(predicted - actual, 2.0)) / count(1)) as RMSE,
  -- mean squared error
  sum(pow(predicted - actual, 2.0)) / count(1) as MSE,
  -- mean absolute error
  sum(abs(predicted - actual)) / count(1) as MAE,
  -- coefficient of determination: 1 - (residual sum of squares / total sum of squares)
  1 - sum(pow(actual - predicted, 2.0)) / sum(pow(actual - ${mean_actual}, 2.0)) as R2
from e2006tfidf_pa2a_submit;

0.38538660838804495 0.14852283792484033 0.2466732002711477 0.48623913673053565


[AROW]

Training

-- Train the AROW regression model from e2006tfidf_train_x3 and store, for each
-- feature, the average of all weights that arow_regress emitted for it.
-- The covar column emitted by arow_regress is not carried into the model table.
-- Pin the reducer count for the group-by while the model table is built.
set mapred.reduce.tasks=64;
-- The if-exists form keeps the script re-runnable on a fresh database, even
-- when hive.exec.drop.ignorenonexistent is set to false.
drop table if exists e2006tfidf_arow_model;
create table e2006tfidf_arow_model as
select 
 feature,
 -- plain average of the emitted weights (voted_avg is meant for classification only)
 cast(avg(weight) as float) as weight
from 
 (select 
     -- Pick the output signature that matches the installed Hivemall version.
     -- arow_regress(addBias(features),target) as (feature,weight)    -- [hivemall v0.1]
     arow_regress(addBias(features),target) as (feature,weight,covar) -- [hivemall v0.2 or later]
  from 
     e2006tfidf_train_x3
 ) t 
group by feature;
-- Restore -1 so that Hive chooses the number of reducers again.
set mapred.reduce.tasks=-1;

prediction

-- AROW prediction per test row: the sum, over the exploded features of the row,
-- of (model weight * feature value).
create or replace view e2006tfidf_arow_predict as
select
  te.rowid,
  sum(mdl.weight * te.value) as predicted
from e2006tfidf_test_exploded te
LEFT OUTER JOIN e2006tfidf_arow_model mdl
  ON (te.feature = mdl.feature)  -- features missing from the model give NULL products, which sum() skips
group by te.rowid;

evaluation

-- Pair the actual target of every test row with its AROW prediction.
-- The if-exists form keeps the script re-runnable on a fresh database, even
-- when hive.exec.drop.ignorenonexistent is set to false.
drop table if exists e2006tfidf_arow_submit;
create table e2006tfidf_arow_submit as
select 
  t.target as actual, 
  pd.predicted as predicted
from 
  e2006tfidf_test t JOIN e2006tfidf_arow_predict pd 
    on (t.rowid = pd.rowid);

-- Sanity check of the two means. The avg(actual) value is the one to use for
-- the mean_actual variable in the R2 computation.
select avg(actual), avg(predicted) from e2006tfidf_arow_submit;

-3.8200363760415414 -3.8692518911517433

-- Mean of the actual targets, copied by hand from the avg(actual) query output.
set hivevar:mean_actual=-3.8200363760415414;

-- Error metrics of the AROW predictions over the submit table.
select
  -- root mean squared error
  sqrt(sum(pow(predicted - actual, 2.0)) / count(1)) as RMSE,
  -- mean squared error
  sum(pow(predicted - actual, 2.0)) / count(1) as MSE,
  -- mean absolute error
  sum(abs(predicted - actual)) / count(1) as MAE,
  -- coefficient of determination: 1 - (residual sum of squares / total sum of squares)
  1 - sum(pow(actual - predicted, 2.0)) / sum(pow(actual - ${mean_actual}, 2.0)) as R2
from e2006tfidf_arow_submit;

0.37862513029019407 0.14335698928726642 0.2368787001269389 0.5041085155590119


[AROWe]

AROWe is a modified version of AROW that uses hinge loss (epsilon = 0.1).

Training

-- Train the AROWe regression model from e2006tfidf_train_x3 and store, for each
-- feature, the average of all weights that arowe_regress emitted for it.
-- The covar column emitted by arowe_regress is not carried into the model table.
-- Pin the reducer count for the group-by while the model table is built.
set mapred.reduce.tasks=64;
-- The if-exists form keeps the script re-runnable on a fresh database, even
-- when hive.exec.drop.ignorenonexistent is set to false.
drop table if exists e2006tfidf_arowe_model;
create table e2006tfidf_arowe_model as
select 
 feature,
 -- plain average of the emitted weights (voted_avg is meant for classification only)
 cast(avg(weight) as float) as weight
from 
 (select 
     -- Pick the output signature that matches the installed Hivemall version.
     -- arowe_regress(addBias(features),target) as (feature,weight)    -- [hivemall v0.1]
     arowe_regress(addBias(features),target) as (feature,weight,covar) -- [hivemall v0.2 or later]
  from 
     e2006tfidf_train_x3
 ) t 
group by feature;
-- Restore -1 so that Hive chooses the number of reducers again.
set mapred.reduce.tasks=-1;

prediction

-- AROWe prediction per test row: the sum, over the exploded features of the row,
-- of (model weight * feature value).
create or replace view e2006tfidf_arowe_predict as
select
  te.rowid,
  sum(mdl.weight * te.value) as predicted
from e2006tfidf_test_exploded te
LEFT OUTER JOIN e2006tfidf_arowe_model mdl
  ON (te.feature = mdl.feature)  -- features missing from the model give NULL products, which sum() skips
group by te.rowid;

evaluation

-- Pair the actual target of every test row with its AROWe prediction.
-- The if-exists form keeps the script re-runnable on a fresh database, even
-- when hive.exec.drop.ignorenonexistent is set to false.
drop table if exists e2006tfidf_arowe_submit;
create table e2006tfidf_arowe_submit as
select 
  t.target as actual, 
  pd.predicted as predicted
from 
  e2006tfidf_test t JOIN e2006tfidf_arowe_predict pd 
    on (t.rowid = pd.rowid);

-- Sanity check of the two means. The avg(actual) value is the one to use for
-- the mean_actual variable in the R2 computation.
select avg(actual), avg(predicted) from e2006tfidf_arowe_submit;

-3.8200363760415414 -3.86494905688414

-- Mean of the actual targets, copied by hand from the avg(actual) query output.
set hivevar:mean_actual=-3.8200363760415414;

-- Error metrics of the AROWe predictions over the submit table.
select
  -- root mean squared error
  sqrt(sum(pow(predicted - actual, 2.0)) / count(1)) as RMSE,
  -- mean squared error
  sum(pow(predicted - actual, 2.0)) / count(1) as MSE,
  -- mean absolute error
  sum(abs(predicted - actual)) / count(1) as MAE,
  -- coefficient of determination: 1 - (residual sum of squares / total sum of squares)
  1 - sum(pow(actual - predicted, 2.0)) / sum(pow(actual - ${mean_actual}, 2.0)) as R2
from e2006tfidf_arowe_submit;

0.37789148212861856 0.14280197226536404 0.2357339155291536 0.5060283955470721

Something went wrong with that request. Please try again.