In [1]:
# Load the feature table and bring the winner/looser/match/tournament
# column groups to the front, in that order.
df <- here("data", "featurefull.parquet") %>%
    arrow::read_parquet() %>%
    relocate(
        starts_with("winner"),
        starts_with("looser"),
        starts_with("match"),
        starts_with("tournament")
    )

# Cross-validation folds: 5 folds, a single repeat, no stratification.
# TODO: raise `repeats` to 5.
set.seed(1)
df.folds <- vfold_cv(df, v = 5, repeats = 1)
glimpse(df.folds)

relocate: columns reordered (winner.name, winner.id, winner.rank, winner.bornAt, winner.height, …)



Rows: 25
Columns: 3
$ splits <list> [<vfold_split[14340 x 3586 x 17926 x 69]>], [<vfold_split[1434…
$ id     <chr> "Repeat1", "Repeat1", "Repeat1", "Repeat1", "Repeat1", "Repeat2…
$ id2    <chr> "Fold1", "Fold2", "Fold3", "Fold4", "Fold5", "Fold1", "Fold2", …


In [7]:
library(formulaic)
library(tictoc)

# Predictor columns, in the order they appear on the right-hand side of the
# model formula.
vars <- c(
    "winner.lastTournamentDelayW", "looser.lastTournamentDelayW",
    "match.domHands", "match.tournamentRound", "match.hadTieBreaker",
    "tournament.dateDistance", "match.diffRank",
    "winner.bornAt", "looser.bornAt",
    "winner.heightF", "looser.heightF",
    "winner.winrate", "looser.winrate",
    "winner.njogos", "looser.njogos",
    "winner.nTournament", "looser.nTournament",
    "match.backHands",
    "tournament.durationF", "tournament.quarterL2", "tournament.prize"
)

# Metrics computed for every resampled fit.
metrics <- metric_set(accuracy, f_meas, roc_auc, pr_auc)

# Build `match.setsCount ~ <vars>` against the columns of `df` and show it.
form <- create.formula(
    input.names = vars,
    outcome.name = "match.setsCount",
    dat = df
)
form[["formula"]]

match.setsCount ~ winner.lastTournamentDelayW + looser.lastTournamentDelayW + 
    match.domHands + match.tournamentRound + match.hadTieBreaker + 
    tournament.dateDistance + match.diffRank + winner.bornAt + 
    looser.bornAt + winner.heightF + looser.heightF + winner.winrate + 
    looser.winrate + winner.njogos + looser.njogos + winner.nTournament + 
    looser.nTournament + match.backHands + tournament.durationF + 
    tournament.quarterL2 + tournament.prize
<environment: 0x000001f41b491578>

In [15]:
# Logistic regression (glm engine) evaluated on the resamples in `df.folds`.
# `wf` holds only the formula, so each model spec is attached at fit time.
wf <- workflow() %>% add_formula(form$formula)
lg <- logistic_reg() %>%
    set_engine("glm") %>%
    set_mode("classification")

# Time the resampling run; save_pred = TRUE keeps the predictions alongside
# the metrics.
tic()
lg.fitted <- fit_resamples(
    wf %>% add_model(lg),
    df.folds,
    metrics = metrics,
    control = control_resamples(save_pred = TRUE)
)
# Spell out TRUE: `T` is an ordinary variable and can be reassigned.
toc(log = TRUE)

66.94 sec elapsed


In [16]:
# Mean of each resampled metric for the logistic regression, as a
# two-column table (metrica, valor).
lg.fitted %>%
    collect_metrics() %>%
    select(metrica = .metric, valor = mean)

select: renamed 2 variables (metrica, valor) and dropped 4 variables



metrica,valor
<chr>,<dbl>
accuracy,0.6810784
f_meas,0.7888075
pr_auc,0.7654898
roc_auc,0.645415


In [17]:
# Radial-basis SVM (kernlab engine) evaluated on the resamples in `df.folds`,
# reusing the formula-only workflow `wf`.
svmR <- svm_rbf() %>%
    set_engine("kernlab") %>%
    set_mode("classification")

# Fix the RNG state right before resampling so reruns of this cell give the
# same metrics: no kernel sigma is specified, and kernlab's automatic
# estimate is computed from a random sample of the data.
set.seed(1)

# Time the resampling run; save_pred = TRUE keeps the predictions alongside
# the metrics.
tic()
svmR.fitted <- fit_resamples(
    wf %>% add_model(svmR),
    df.folds,
    metrics = metrics,
    control = control_resamples(save_pred = TRUE)
)
# Spell out TRUE: `T` is an ordinary variable and can be reassigned.
toc(log = TRUE)

: 

: 

In [None]:
# Mean of each resampled metric for the radial SVM, as a two-column table
# (metrica, valor).
svmR.fitted %>%
    collect_metrics() %>%
    select(metrica = .metric, valor = mean)