In [1]:
// Switch the notebook kernel into silent mode for the setup cells that follow
// (presumably this suppresses the echo of every evaluated value -- confirm against the kernel docs).
kernel.silent(true)

In [2]:
import coursierapi.MavenRepository
// Register the JitPack Maven repository with the interpreter's dependency resolver, in addition
// to the repositories already configured (the `$ivy` import of rdfrules presumably resolves from it).
interp.repositories() ++= Seq(MavenRepository.of("https://jitpack.io"))

In [3]:
import $ivy.`com.github.propi:rdfrules:1.5.0`
import collection._
import org.apache.jena.riot.Lang

import com.github.propi.rdfrules.data._
import com.github.propi.rdfrules.algorithm.amie._
import com.github.propi.rdfrules.algorithm.dbscan._
import com.github.propi.rdfrules.utils._
import com.github.propi.rdfrules.index._
import com.github.propi.rdfrules.rule._
import com.github.propi.rdfrules.ruleset._
//kernel.silent(false)

In [4]:
import reflect.io._, Path._
import reflect.io._
import Path._

In [5]:
// Shorthand that wraps a plain string into a TripleItem.Uri.
val uri = (iri: String) => TripleItem.Uri(iri)

// URI of the Data Cube `dataSet` property; used below to recognise the triples that link
// an observation to its data set.
val qbDataSet = "http://purl.org/linked-data/cube#dataSet"

In [6]:
// Paths of all Turtle (*.ttl) files found directly in the salaries data directory.
// `files` yields a one-shot Iterator; materialising it into a List keeps `graphs` reusable,
// so a cell that traverses it can be re-run without silently seeing an empty collection.
val graphs = "../data/salaries".toDirectory.files.map(_.path).filter(g => g matches """.*\.ttl""").toList

In [7]:
// Maps each cube name (file name without directory and without the ".ttl" extension)
// to the Graph loaded from that file. If two files yield the same name, the later one wins.
var salariesCubesMap: Map[String, Graph] = graphs.map { path =>
    // stripSuffix removes only the trailing extension; the former unanchored regex
    // replacement would also have removed ".ttl" occurring in the middle of a file name.
    val name = path.stripSuffix(".ttl").replaceAll("^.*/", "")
    name -> Graph(path)
}.toMap

In [8]:
// Numbers of intervals tried for the equifrequency discretization.
val intervalCounts = Array(2, 3, 5, 7)
// Relative supports (in percent of the observation count) tried for the equisize discretization.
val supportPercents = Array(1, 2, 5, 10, 20)
// Builds an equisize discretization task from an absolute support.
val equiSize = (support: Int) => DiscretizationTask.Equisize(support)
// Builds an equifrequency discretization task from the requested number of intervals.
val equiFrequent = (intervalCount: Int) => DiscretizationTask.Equifrequency(intervalCount)

In [9]:
import eu.easyminer.discretization.impl.Interval
import eu.easyminer.discretization.impl.IntervalBound._
// Rounds `value` to `scale` decimal places using HALF_UP rounding and returns it as a Double.
val rounded = (value: Double, scale: Integer) => {
    val halfUp = BigDecimal.RoundingMode.HALF_UP
    val scaled = BigDecimal(value).setScale(scale, halfUp)
    scaled.toDouble
}
// Renders an interval as "<min__max)" with both bounds rounded to two decimal places.
// The bracket characters are fixed; they do not reflect the bounds' actual inclusiveness.
val intervalToString = (i: Interval) => {
    val lower = rounded(i.minValue.value, 2)
    val upper = rounded(i.maxValue.value, 2)
    s"<${lower}__${upper})"
}

In [10]:
// Predicate URI of the average-salary measure.
val averageSalary = TripleItem.Uri("http://data.czso.cz/ontology/prumernaMzda")
// Predicate URI of the median-salary measure.
val medianSalary = TripleItem.Uri("http://data.czso.cz/ontology/medianMzdy")
// All measure predicates whose values get discretized.
val measureUris = Array(averageSalary,medianSalary)

In [11]:
/**
 * Discretizes the values of `measure` in `graph` with an equisize task.
 *
 * `relativeSupport` is a percentage of the number of observations, where an observation is
 * counted as one triple carrying the `qbDataSet` predicate. The percentage is converted to an
 * absolute support (rounded up) before the task is built. Only triples whose predicate equals
 * `measure` are kept in the result.
 */
val discretizeEquisize = (graph: Graph, relativeSupport: Int, measure: TripleItem.Uri) => {
    val datasetLinks = graph.filter(triple => triple.predicate.hasSameUriAs(qbDataSet))
    val supportFraction = relativeSupport.toFloat / 100
    val absoluteSupport = (datasetLinks.size * supportFraction).ceil.toInt
    val task = equiSize(absoluteSupport)
    val measureTriples = graph.filter(triple => triple.predicate == measure)
    measureTriples.discretize(task)(q => q.triple.predicate == measure)
}

/**
 * Discretizes the values of `measure` in `graph` with an equifrequency task that asks for
 * `intervalsCount` intervals. Only triples whose predicate equals `measure` are kept.
 */
val discretizeEquifrequent = (graph: Graph, intervalsCount: Int, measure: TripleItem.Uri) => {
    val task = equiFrequent(intervalsCount)
    val measureTriples = graph.filter(triple => triple.predicate == measure)
    measureTriples.discretize(task)(q => q.triple.predicate == measure)
}

/**
 * Computes the intervals an equisize discretization of `measure` in `graph` produces,
 * without returning the discretized triples themselves.
 *
 * `relativeSupport` is a percentage of the number of observations (triples carrying the
 * `qbDataSet` predicate); it is converted to an absolute support, rounded up.
 */
val discretizeEquisizeAndGetIntervals = (graph: Graph, relativeSupport: Int, measure: TripleItem.Uri) => {
    val datasetLinks = graph.filter(triple => triple.predicate.hasSameUriAs(qbDataSet))
    val supportFraction = relativeSupport.toFloat / 100
    val absoluteSupport = (datasetLinks.size * supportFraction).ceil.toInt
    val task = equiSize(absoluteSupport)
    val measureTriples = graph.filter(triple => triple.predicate == measure)
    measureTriples.discretizeAndGetIntervals(task)(q => q.triple.predicate == measure)
}

/**
 * Computes the intervals an equifrequency discretization of `measure` in `graph` produces
 * when `intervalsCount` intervals are requested.
 */
val discretizeEquifrequentAndGetIntervals = (graph: Graph, intervalsCount: Int, measure: TripleItem.Uri) => {
    val task = equiFrequent(intervalsCount)
    val measureTriples = graph.filter(triple => triple.predicate == measure)
    measureTriples.discretizeAndGetIntervals(task)(q => q.triple.predicate == measure)
}

In [12]:
// Second space-separated token of an interval's text form, i.e. its lower bound.
// Throws if the text has fewer than two tokens.
val lowerBound = (interval: String) => {
    val tokens = interval.split(" ")
    tokens(1)
}
// Fourth space-separated token of an interval's text form, i.e. its upper bound.
// Throws if the text has fewer than four tokens.
val upperBound = (interval: String) => {
    val tokens = interval.split(" ")
    tokens(3)
}
// Text form of the interval's minimum bound value.
val lowerBoundInterval = (interval: Interval) => {
    val bound = interval.minValue
    bound.value.toString
}
// Text form of the interval's maximum bound value.
val upperBoundInterval = (interval: Interval) => {
    val bound = interval.maxValue
    bound.value.toString
}

/**
 * Tells whether the interval text `i1` denotes the same bounds as the interval `i2`.
 *
 * `i1` is split on single spaces; its second and fourth tokens are taken as the lower and
 * upper bound and compared, as strings, with the bound values of `i2`. Text that does not
 * have at least four tokens cannot be an interval and yields `false` instead of throwing
 * an index-out-of-bounds error.
 */
val equalIntervals = (i1: String, i2: Interval) => {
    // Split once and reuse the tokens for both bounds.
    val tokens = i1.split(" ")
    tokens.length > 3 &&
        tokens(1) == lowerBoundInterval(i2) &&
        tokens(3) == upperBoundInterval(i2)
}

In [13]:
/**
 * Replaces the interval object of `t` by a readable URI when it matches one of `intervals`.
 *
 * The object's text form is compared with every interval through `equalIntervals`. On a match
 * the object becomes `TripleItem.Uri("<label>_<suffix>_<position>/<total>")`, where the label
 * is `intervalToString` of the matching interval, the position is its 1-based index in
 * `intervals`, and the total is `intervals.size`. A triple whose object matches no interval
 * is returned unchanged.
 */
val renameInterval = (t: Triple, intervals: IndexedSeq[Interval], suffix: String) => {
    // Render the object once rather than once per candidate interval.
    val objectText = t.`object`.toString
    // One pass yields both the match and its position; if several intervals match, the last
    // one is used. `intervals` is expected to hold distinct intervals.
    val position = intervals.lastIndexWhere(interval => equalIntervals(objectText, interval))
    if (position < 0) t
    else {
        val label = intervalToString(intervals(position))
        t.copy(`object` = TripleItem.Uri(s"${label}_${suffix}_${position + 1}/${intervals.size}"))
    }
}

/**
 * Builds the dataset for one cube: its non-measure triples plus every discretized variant
 * of each measure.
 *
 * Non-measure triples are copied as they are, except that the object of every `qbDataSet`
 * triple is replaced by `uri(name)`. For each measure in `measureUris`, the measure triples
 * are added once per entry of `intervalCounts` (equifrequency, suffix "ef<count>") and once
 * per entry of `supportPercents` (equisize, suffix "es<percent>"), with interval objects
 * renamed through `renameInterval`.
 */
val discretize = (graph: Graph, name: String) => {
    val nonMeasureTriples = graph
        .filter(t => !measureUris.contains(t.predicate))
        .map(t => if (t.predicate.hasSameUriAs(qbDataSet)) t.copy(`object` = uri(name)) else t)
    val base = Dataset() + nonMeasureTriples

    measureUris.foldLeft(base) { (perMeasure, measure) =>
        // Equifrequency variants first, in the order of intervalCounts.
        val withEquifrequent = intervalCounts.foldLeft(perMeasure) { (acc, intervalCount) =>
            val intervals = discretizeEquifrequentAndGetIntervals(graph, intervalCount, measure)
            val measures = discretizeEquifrequent(graph, intervalCount, measure)
            acc + measures.map(t => renameInterval(t, intervals, "ef" + intervalCount))
        }
        // Then the equisize variants, in the order of supportPercents.
        supportPercents.foldLeft(withEquifrequent) { (acc, percentage) =>
            val intervals = discretizeEquisizeAndGetIntervals(graph, percentage, measure)
            val measures = discretizeEquisize(graph, percentage, measure)
            acc + measures.map(t => renameInterval(t, intervals, "es" + percentage))
        }
    }
}

In [14]:
// Discretize every loaded cube under its own name and merge the results into one dataset.
var dataset = salariesCubesMap
    .map { case (cubeName, cubeGraph) => discretize(cubeGraph, cubeName) }
    .foldLeft(Dataset()) { (merged, part) => merged + part }

In [15]:
// Persist the merged dataset: once as a cache file (read back with Dataset.fromCache below)
// and once as an export to a .ttl file. Both target directories presumably must already exist -- confirm.
dataset.cache("../cache/salariesCube.cache")
dataset.export("../exports/salariesCube.ttl")

In [16]:
// Sanity check: reload the dataset from the cache file and print its size followed by the
// size of the in-memory dataset; the two numbers are expected to be equal.
var cachedDataset = Dataset.fromCache("../cache/salariesCube.cache")
Seq(cachedDataset, dataset).foreach(ds => println(ds.size))

2898
2898
