# evaluomeR autoRSKC

In [1]:
# Packages attached for the rest of the notebook.
# sparcl is also called with an explicit namespace further down
# (sparcl::KMeansSparseCluster.permute).
library("evaluomeR")
library("sparcl")
library("plotly")
library("stringr")
library("reshape2")
library("viridis")
library("scales")

# Raise the penalty on scientific notation so printed numbers prefer
# fixed notation.
options(scipen=10)

Loading required package: SummarizedExperiment
Loading required package: GenomicRanges
Loading required package: stats4
Loading required package: BiocGenerics
Loading required package: parallel

Attaching package: 'BiocGenerics'

The following objects are masked from 'package:parallel':

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB

The following objects are masked from 'package:stats':

    IQR, mad, sd, var, xtabs

The following objects are masked from 'package:base':

    anyDuplicated, append, as.data.frame, basename, cbind, colnames,
    dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
    grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
    order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
    rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
    union, unique, unsplit, which, which.max, which.min

# Table of contents
* [Datasets](#datasets)
* [Analysis across repositories](#analysis_across_repositories)
    * [AgroPortal: All metrics analysis (RSKC)](#agroportal_analysis)
        * [AgroPortal: Optimal K (RSKC)](#agroportal_optimal_k)
    * [OBO Foundry: All metrics analysis (RSKC)](#obofoundry_analysis)
        * [OBO Foundry: Optimal K (RSKC)](#obofoundry_optimal_k)
    * [Plotting: Stability of all metrics across repositories](#plot_stability_all)
* [Metrics relevancy across repositories](#metrics_relevancy)
    * [AgroPortal metrics relevancy](#metrics_relevancy_agroportal)
    * [OBO Foundry metrics relevancy](#metrics_relevancy_obofoundry)
    * [Plotting metrics relevancy](#plotting_metrics_relevancy)
* [Comparison of trimmed vs non-trimmed](#trimmed_vs_non_trimeed)
* [References](#references)

# Datasets <a class="anchor" id="datasets"></a>

In [44]:
# Input tables live in usecases/usecase2/data, resolved relative to the
# current working directory.
seed <- 13606
scaled <- TRUE
k.range <- c(2, 6)

# Fix the RNG state before any stochastic step that follows.
set.seed(seed)

# Read both repositories' tables, keeping text columns as character
# vectors rather than factors.
agro_portal <- read.csv(
  file.path(getwd(), "../../usecases/usecase2/data/agro.csv"),
  header = TRUE,
  stringsAsFactors = FALSE
)
obo_foundry <- read.csv(
  file.path(getwd(), "../../usecases/usecase2/data/obo-119.csv"),
  header = TRUE,
  stringsAsFactors = FALSE
)

In [3]:
# Normalise the metric columns of both tables to [0, 1] when `scaled` is set.
#
# Column 1 is left untouched; the rest of the notebook also drops it
# (raw_data[-1]) before clustering.
#
# Each column is rescaled on its own range. Rescaling the whole matrix at
# once (rescale(as.matrix(...))) uses a single global min/max, so one
# large-valued metric compresses every other column towards 0.
#
# seq_along(df)[-1] is used instead of 2:length(df) so that a one-column
# table yields an empty selection rather than c(2, 1).
#
# Previous z-score variant, kept for reference:
#     agro_portal[c(2:length(agro_portal))] = scale(agro_portal[c(2:length(agro_portal))])
#     obo_foundry[c(2:length(obo_foundry))] = scale(obo_foundry[c(2:length(obo_foundry))])
if (scaled) {
  agro_metric_cols <- seq_along(agro_portal)[-1]
  agro_portal[agro_metric_cols] <- lapply(
    agro_portal[agro_metric_cols],
    scales::rescale,
    to = c(0, 1)
  )

  obo_metric_cols <- seq_along(obo_foundry)[-1]
  obo_foundry[obo_metric_cols] <- lapply(
    obo_foundry[obo_metric_cols],
    scales::rescale,
    to = c(0, 1)
  )
}

In [45]:
# Work on the AgroPortal table; `data` is the same table without its
# first column.
raw_data <- agro_portal
data <- raw_data[-1]

# Candidate L1 bounds: 30 evenly spaced values from 2 up to the square
# root of the number of remaining columns.
wbounds <- seq(from = 2, to = sqrt(ncol(data)), length.out = 30)

# Permutation-based search over the candidate bounds with K = 5 clusters
# and 5 permutations; the selected bound is read from the `bestw` element.
km.perm <- sparcl::KMeansSparseCluster.permute(
  data,
  K = 5,
  wbounds = wbounds,
  nperms = 5,
  silent = TRUE
)
L1_boundry <- km.perm[["bestw"]]

In [46]:
# Run qualityRange on the full table over every k in k.range, treating all
# metrics together (all_metrics = TRUE) with the "rskc" clustering backend,
# a fixed L1 of 2 and alpha = 0.
qualRange <- qualityRange(
  data = raw_data,
  k.range = k.range,
  all_metrics = TRUE,
  cbi = "rskc",
  L1 = 2,
  alpha = 0,
  seed = seed
)

# Convert the result to its standardized table form and display it.
qualStd <- standardizeQualityData(qualRange)
qualStd


Data loaded.
Number of rows: 78
Number of columns: 20


Processing all metrics, 'merge', in dataframe (19)
	Calculation of k = 2
	Calculation of k = 3
	Calculation of k = 4
	Calculation of k = 5
	Calculation of k = 6


Unnamed: 0,k_2,k_3,k_4,k_5,k_6
all_metrics,0.9765144,0.9652177,0.9491769,0.9192709,0.09758856


In [62]:
# Compute metrics relevancy on the table without its first column
# (alpha = 0.03, k = 3) and show which cases were trimmed.
test <- getMetricsRelevancy(raw_data[-1], alpha = 0.03, k = 3, seed = seed)
test$trimmed_cases

# Drop the trimmed rows by selecting the rows to keep.
# Negative indexing (raw_data[-idx, ]) is avoided on purpose: when no case
# is trimmed, idx is empty and a zero-length index selects zero rows,
# which would silently produce an empty table instead of the full one.
kept_rows <- setdiff(seq_len(nrow(raw_data)), test$trimmed_cases)
test_data <- raw_data[kept_rows, ]

[1] "No L1 provided. Computing best L1 boundry with 'sparcl::KMeansSparseCluster.permute'"
[1] "Alpha set as: 0.03"
[1] "L1 set as: 2"


In [64]:
# Display the table that remains after removing the trimmed cases.
test_data

Unnamed: 0,Description,ANOnto,AROnto,CBOOnto,CBOOnto2,CROnto,DITOnto,INROnto,LCOMOnto,NACOnto,NOCOnto,NOMOnto,POnto,PROnto,RFCOnto,RROnto,TMOnto,TMOnto2,WMCOnto,WMCOnto2
1,ADO,0.000000000,3.9503849,0.9991446,0.9991446,0.995722840,3,0.9991446,1.999142,1.000000,292.000000,2.9632164,0.9957228,0.7478411,3.9623610,0.252158895,0.00000000,0.000000,1.999142,1.000000
2,AEO,0.929824561,0.5438596,0.9824561,0.9824561,0.000000000,5,0.9824561,2.357143,1.000000,3.733333,0.9824561,0.5789474,0.5000000,1.9649123,0.500000000,0.00000000,0.000000,2.357143,1.000000
3,AFO,0.750000000,0.0000000,0.8750000,0.8750000,3998.875000000,3,0.8750000,1.333333,1.000000,3.500000,2275.7500000,0.2500000,0.9996157,2276.6250000,0.000384341,0.00000000,0.000000,1.333333,1.000000
4,AGRO,0.990740741,3.1018519,1.0694444,1.0694444,0.363425926,16,1.0694444,7.695971,1.052174,2.287129,1.2037037,1.0555556,0.5295316,2.2731481,0.470468432,0.06264501,2.148148,9.134783,1.186957
5,AGRORDF,1.236263736,0.0000000,1.0659341,1.0659341,0.000000000,6,1.0329670,2.467532,1.077465,4.700000,0.5879121,0.8571429,0.3627119,1.6538462,0.637288136,0.07182320,2.000000,2.676056,1.084507
6,ANAEETHES,0.000000000,0.0000000,0.6666667,0.6666667,1107.666667000,2,0.6666667,1.000000,1.000000,2.000000,0.0000000,0.0000000,0.0000000,0.6666667,1.000000000,0.00000000,0.000000,1.000000,1.000000
7,ASCOPAIN-T,0.000000000,0.0000000,0.5000000,0.5000000,0.500000000,2,0.5000000,1.000000,1.000000,1.000000,0.0000000,0.0000000,0.0000000,0.5000000,1.000000000,0.00000000,0.000000,1.000000,1.000000
8,ATOL,0.998092513,0.0000000,1.0371960,1.0371960,1.999046257,14,1.0371960,7.634698,1.031485,3.419811,0.0000000,1.0381497,0.0000000,1.0371960,1.000000000,0.03387405,2.154930,8.554415,1.120465
9,BCO,1.312101911,3.5286624,1.0318471,1.0318471,0.178343949,11,1.0318471,5.087591,1.027778,3.306122,2.7197452,1.0382166,0.7249576,3.7515924,0.275042445,0.05769231,2.000000,6.453704,1.268519
10,BFO,0.972222222,0.0000000,0.9722222,0.9722222,0.000000000,7,0.9722222,4.904762,1.000000,2.333333,0.0000000,0.9444444,0.0000000,0.9722222,1.000000000,0.00000000,0.000000,4.904762,1.000000


In [63]:
# Repeat the qualityRange run on the trimmed table (test_data), using the
# same k range, seed, "rskc" backend, L1 = 2 and alpha = 0 as for the
# full table. This overwrites qualRange and qualStd.
qualRange <- qualityRange(
  data = test_data,
  k.range = k.range,
  all_metrics = TRUE,
  cbi = "rskc",
  L1 = 2,
  alpha = 0,
  seed = seed
)

# Convert the result to its standardized table form and display it.
qualStd <- standardizeQualityData(qualRange)
qualStd


Data loaded.
Number of rows: 76
Number of columns: 20


Processing all metrics, 'merge', in dataframe (19)
	Calculation of k = 2
	Calculation of k = 3
	Calculation of k = 4
	Calculation of k = 5
	Calculation of k = 6


Unnamed: 0,k_2,k_3,k_4,k_5,k_6
all_metrics,0.9741553,0.9434622,0.8874426,0.4668025,0.1791566
