# OnenessTwoness model training
This notebook provides the code for training the OnenessTwoness classifier described in the manuscript, and gives examples of how to produce predictions.

# Load data

Read features and genotype annotations

In [1]:
## Genotype annotation table (curated by Zi-Ning)
hrd_tbl <- readRDS(file = "./data/hrd-supp-table_ZC.rds")

## Per-sample feature table (new features added for the revision)
all.features.dt <- readRDS(file = "./data/revision.features.dt.rds")

# Define training and test sets

Training is done on pan-cancer non-HMF tumors; testing is done on pan-cancer HMF tumors

In [2]:
## Copy the genotype annotations onto the feature table, looking each one up
## in hrd_tbl by sample.
all.features.dt[, `:=`(
  fmut_bi = hrd_tbl[sample, fmut_bi],
  dataset = hrd_tbl[sample, dataset],
  in_bopp = hrd_tbl[sample, in_bopp],
  HRDetect = hrd_tbl[sample, HRDetect]
)]

## Both sets keep only WT / BRCA1 / BRCA2 samples:
## non-HMF samples form the training set, HMF samples form the test set.
all.features.dt[, `:=`(
  train.set = (fmut_bi %in% c("WT", "BRCA1", "BRCA2")) & (dataset != "HMF"),
  test.set = (fmut_bi %in% c("WT", "BRCA1", "BRCA2")) & (dataset == "HMF")
)]

# Train models for BRCA1 versus BRCA2

## Define features
Use all of the available features as predictors.

In [13]:
## Names of the feature columns used as model predictors
big_features_ot <- c(
  "del.mh.prop",
  "SNV3", "SNV8",
  "RS3", "RS5",
  "hrd", "tib",
  "qrdel", "qrdup", "ihdel",
  "DUP_1kb_100kb"
)

In [14]:
## Predictor matrix: training-set rows, selected feature columns, as a plain data.frame
b12.train.1v2.x <- as.data.frame(
  all.features.dt[(train.set), big_features_ot, with = FALSE]
)

## Response: three-level factor over the training-set rows, with "WT" relabelled as "OTHER"
train.1v2.y <- all.features.dt[(train.set), {
  y.label <- as.character(fmut_bi)
  y.label[y.label == "WT"] <- "OTHER"
  factor(y.label, levels = c("OTHER", "BRCA1", "BRCA2"))
}]

In [15]:
## THIS IS OUR TRAINED MODEL THAT IS SAVED WITH THIS REPOSITORY

## The seed is fixed before fitting so the 1000-tree forest is reproducible.
set.seed(10)
rf.1v2.b12 <- randomForest::randomForest(
  x = b12.train.1v2.x,
  y = train.1v2.y,
  ntree = 1000,
  importance = TRUE
)

# Making predictions

In [17]:
## Feature matrix for prediction: every row of the feature table,
## restricted to the predictor columns listed in big_features_ot
all.samples.x <- all.features.dt[, .SD, .SDcols = big_features_ot]

In [22]:
## Preview the first six rows: a table passed to predict() should have these columns
head(all.samples.x, n = 6L)

del.mh.prop,SNV3,SNV8,RS3,RS5,hrd,tib,qrdel,qrdup,ihdel,DUP_1kb_100kb
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
0.1255061,0.0,2678.358,0,27.72856,12,1,1,0,0,6
0.1454545,897.2618,1457.471,0,0.0,2,0,1,0,0,2
0.1886792,1073.9493,1566.11,0,0.0,4,0,0,1,1,2
0.173913,646.6614,0.0,0,0.0,1,0,0,0,0,0
0.1472684,0.0,3245.374,0,0.0,1,0,1,4,0,1
0.1372549,0.0,2238.701,0,0.0,2,1,0,0,0,1


In [27]:
## Load the saved model from the repository
mod <- readRDS("../models/stash.retrained.model.rds")

## Class-probability matrix (one row per sample, one column per class).
## Computed once and reused for both columns below, instead of running the
## model over all samples twice.
mod.probs <- predict(mod, all.samples.x, type = "prob")

## Per-sample predictions alongside genotype and train/test membership:
##   b1 = predicted probability of class "BRCA1"
##   b2 = predicted probability of class "BRCA2"
predictions.dt <- all.features.dt[, .(sample,
                                      fmut_bi,
                                      train.set,
                                      test.set,
                                      b1 = mod.probs[, "BRCA1"],
                                      b2 = mod.probs[, "BRCA2"])]

## HR-status of samples with BRCA1 + BRCA2 > 0.5

In [34]:
## Test-set cross-tabulation: rows are (b1 + b2 > 0.5), columns are (fmut_bi is not "WT")
predictions.dt[test.set == TRUE, table((b1 + b2) > 0.5, fmut_bi != "WT")]

       
        FALSE TRUE
  FALSE  1826   23
  TRUE     12  109