In [None]:
// Almond kernel switch; presumably turns off the automatic echo of evaluated
// values in the cells that follow — TODO confirm against the Almond API.
kernel.silent(true)

# Dependencies

In [None]:
import coursierapi.MavenRepository
// Append the JitPack repository to the interpreter's resolver list
// (presumably required to resolve the `com.github.propi` artifact — confirm).
interp.repositories() ++= Seq(MavenRepository.of("https://jitpack.io"))

In [None]:
import $ivy.`com.github.propi:rdfrules:1.5.0`
//import $ivy.`com.github.propi.rdfrules::core:1.0.0`
import collection._
import org.apache.jena.riot.Lang
import scala.util.control.Breaks._
import scala.collection.immutable.ListMap

In [None]:
import $ivy.`org.plotly-scala::plotly-almond:0.8.2`
import plotly._, plotly.element._, plotly.layout._, plotly.Almond._
// Initialise plotly for this notebook with the offline flag set.
init(offline=true)
// Swap the REPL pretty-printer for a copy whose defaultHeight is 3
// (presumably caps how many lines of each echoed value are shown — confirm).
repl.pprinter() = repl.pprinter().copy(defaultHeight = 3)

In [None]:
import com.github.propi.rdfrules.data._
import com.github.propi.rdfrules.algorithm.amie._
import com.github.propi.rdfrules.algorithm.dbscan._
import com.github.propi.rdfrules.utils._
import com.github.propi.rdfrules.index._
import com.github.propi.rdfrules.rule._
import com.github.propi.rdfrules.ruleset._

# Data Sets

In [None]:
// Full IRIs of the predicates referenced by name in the rest of the notebook.
val rdfsLabel: String = "http://www.w3.org/2000/01/rdf-schema#label"
val rdfsComment: String = "http://www.w3.org/2000/01/rdf-schema#comment"
val alternateName: String = "http://schema.org/alternateName"
val image: String = "http://schema.org/image"
val rdfType: String = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
// Rounds `value` to `scale` decimal places with HALF_UP rounding and returns a Double.
// NaN and infinite inputs are returned unchanged: BigDecimal cannot represent them and
// would otherwise fail with NumberFormatException (e.g. for a 0/0 ratio).
val rounded = (value: Double, scale: Integer) => {
    if (java.lang.Double.isNaN(value) || java.lang.Double.isInfinite(value)) {
        value
    } else {
        BigDecimal(value).setScale(scale, BigDecimal.RoundingMode.HALF_UP).toDouble
    }
}

# Yago

In [None]:
// Load the three cached YAGO extracts. The file names suggest hop distances 0-2
// from the seed entities — TODO confirm how the caches were produced.
// Each bare `.size` evaluates the size of the extract that was just loaded.
val yagoHop0 = Dataset.fromCache("../cache/yago-hop0.cache")
yagoHop0.size
val yagoHop1 = Dataset.fromCache("../cache/yago-hop1.cache")
yagoHop1.size
val yagoHop2 = Dataset.fromCache("../cache/yago-hop2.cache")
yagoHop2.size

In [None]:
// Combine the three YAGO extracts with Dataset's `+` and evaluate the combined size.
val yagoDataset = yagoHop0 + yagoHop1 + yagoHop2
yagoDataset.size

In [None]:
// Keep only quads whose predicate is none of rdfs:label, rdfs:comment,
// schema:alternateName, rdf:type or schema:image.
val yagoDatasetFiltered = yagoDataset.filter { q =>
    val predicate = q.triple.predicate
    !predicate.hasSameUriAs(rdfsLabel) &&
    !predicate.hasSameUriAs(rdfsComment) &&
    !predicate.hasSameUriAs(alternateName) &&
    !predicate.hasSameUriAs(rdfType) &&
    !predicate.hasSameUriAs(image)
}

// Evaluate each size once; they are reused for the ratio and for the summary string.
val yagoFilteredSize = yagoDatasetFiltered.size
val yagoTotalSize = yagoDataset.size

// Share of quads that survived the filter; 0.0 for an empty input so that 0/0 never reaches `rounded`.
val ratio: Double = if (yagoTotalSize == 0) 0.0 else yagoFilteredSize.toDouble / yagoTotalSize.toDouble

// Round after scaling to percent: rounding first and multiplying by 100 afterwards
// reintroduces floating-point noise such as 28.999999999999996.
s"$yagoFilteredSize / $yagoTotalSize = ${rounded(ratio * 100, 0)}%"

In [None]:
// Write the filtered YAGO dataset to a cache file so later sessions can skip the filtering step.
yagoDatasetFiltered.cache("../cache/jaur-yago/yago.cache")

In [None]:
// Alternative entry point: reload the filtered YAGO dataset from the cache file
// instead of recomputing it (this redefines `yagoDatasetFiltered`).
val yagoDatasetFiltered = Dataset.fromCache("../cache/jaur-yago/yago.cache")

# Merge

In [None]:
// Load the four cached slices: regions/districts, each as totals and broken down by sex
// (naming taken from the cache file names).
val regionTotalSlice = Dataset.fromCache("../cache/jaur/jaur-regions-total.cache")
val regionBySexSlice = Dataset.fromCache("../cache/jaur/jaur-regions-bysex.cache")
val districtTotalSlice = Dataset.fromCache("../cache/jaur/jaur-districts-total.cache")
val districtBySexSlice = Dataset.fromCache("../cache/jaur/jaur-districts-bysex.cache")

In [None]:
// Load the Turtle file with the YAGO <-> CZSO links (per the file name; contents not visible here).
val refAreaLinking = Dataset("../data/linking/yagoCZSOLinking.ttl")

In [None]:
// Build one mining dataset per slice: slice + filtered YAGO + linking triples.
// The operand order is kept as-is because the effect of reordering Dataset `+` is not visible here.
val regionTotalDataset = regionTotalSlice + yagoDatasetFiltered + refAreaLinking
val regionBySexDataset = regionBySexSlice + yagoDatasetFiltered + refAreaLinking
val districtTotalDataset = districtTotalSlice + yagoDatasetFiltered + refAreaLinking
val districtBySexDataset = districtBySexSlice + yagoDatasetFiltered + refAreaLinking

In [None]:
// Index each merged dataset and write the index to its own cache file.
val regionTotalIndex = regionTotalDataset.index().cache("../cache/jaur-yago/regionTotalIndex.cache")
val regionBySexIndex = regionBySexDataset.index().cache("../cache/jaur-yago/regionBySexIndex.cache")
val districtTotalIndex = districtTotalDataset.index().cache("../cache/jaur-yago/districtTotalIndex.cache")
val districtBySexIndex = districtBySexDataset.index().cache("../cache/jaur-yago/districtBySexIndex.cache")

In [None]:
// Alternative entry point: reload the four indexes from their cache files
// (this redefines the `*Index` values).
// NOTE(review): the meaning of the second argument (`false`) is not visible here —
// check the Index.fromCache signature.
val regionTotalIndex = Index.fromCache("../cache/jaur-yago/regionTotalIndex.cache",false)
val regionBySexIndex = Index.fromCache("../cache/jaur-yago/regionBySexIndex.cache",false)
val districtTotalIndex = Index.fromCache("../cache/jaur-yago/districtTotalIndex.cache",false)
val districtBySexIndex = Index.fromCache("../cache/jaur-yago/districtBySexIndex.cache",false)

# Mining

In [None]:
// Shorthand for wrapping a string as a TripleItem.Uri.
val uri = (value: String) => TripleItem.Uri(value)
// Data Cube predicate that links an observation to its dataset.
val qbDataSet = uri("http://purl.org/linked-data/cube#dataSet")
// CZSO ontology namespace. NOTE(review): not referenced anywhere in the visible cells.
val czsoUri = "http://data.czso.cz/ontology/"
// CZSO predicate naming the reference area of an observation.
val refArea = uri("http://data.czso.cz/ontology/refArea")
// Rule constraint built from the `Object` constants position; judging by the names it
// restricts constants to the object position — confirm against the RDFRules docs.
val constantsAtObject = RuleConstraint.ConstantsAtPosition.ConstantsPosition.Object
val constantsOnlyAtObject = RuleConstraint.ConstantsAtPosition(constantsAtObject)

In [None]:
// Identifiers of the four slices as URIs, plus OneOf groups for the region and district pairs.
// NOTE(review): the OneOf groups are not referenced in the visible cells.
val regionTotalSliceUri = uri("jaur-regions-total")
val regionBySexSliceUri = uri("jaur-regions-bysex")
val oneOfRegionCubes = OneOf(regionTotalSliceUri,regionBySexSliceUri)
val districtTotalSliceUri = uri("jaur-districts-total")
val districtBySexSliceUri = uri("jaur-districts-bysex")
val oneOfDistrictCubes = OneOf(districtTotalSliceUri,districtBySexSliceUri)

## Regions Total

In [None]:
// Rule pattern for the regions-total slice: a YAGO atom about 'b', a CZSO refArea atom
// linking 'a' to 'b', and a CZSO qb:dataSet atom fixing the slice, followed after `=>:` by
// an atom on 'a' whose predicate is one of the measures (presumably the rule head — confirm).
// NOTE(review): `oneOfAllMeasures` is not defined in any visible cell.
val regionTotalPattern = (
    AtomPattern(subject = 'b', graph = uri("yago")) &:
    AtomPattern(subject = 'a', predicate = refArea, `object` = 'b', graph = uri("czso")) &:
    AtomPattern(subject = 'a', predicate = qbDataSet, `object` = regionTotalSliceUri, graph = uri("czso"))
    =>: 
    AtomPattern(subject = 'a', predicate = oneOfAllMeasures, graph = uri("czso"))
)

In [None]:
// Derives a minimum-support threshold from a slice: the number of qb:dataSet triples
// divided (integer division) by the number of distinct refArea objects, plus one.
// Fails with IllegalArgumentException when the dataset has no refArea triples, instead of
// the bare "/ by zero" ArithmeticException the division would raise.
val minSupport = (d: Dataset) => {
    val observationCount = d.filter(quad => quad.triple.predicate == qbDataSet).size
    val areaCount = d.filter(quad => quad.triple.predicate == refArea)
        .triples
        .map(triple => triple.`object`)
        .toSet
        .size
    require(areaCount > 0, "minSupport: dataset contains no refArea triples, cannot derive a support threshold")
    (observationCount / areaCount) + 1
}

In [None]:
// AMIE mining task for the regions-total slice: support threshold derived from the slice,
// rules of length at most 6, the object-constants constraint, and the regions-total pattern.
// NOTE(review): this is the only task that sets MinHeadSize(0) and Timeout(1); the Timeout
// unit is not visible here (RDFRules presumably uses minutes — confirm).
val regionTotalTask = Amie()
    .addThreshold(Threshold.MinSupport(minSupport(regionTotalSlice)))
    .addThreshold(Threshold.MaxRuleLength(6))
    .addThreshold(Threshold.MinHeadSize(0))
    .addThreshold(Threshold.Timeout(1))
    .addConstraint(constantsOnlyAtObject)
    .addPattern(regionTotalPattern)

In [None]:
// Mine the regions-total index and report the rule count and elapsed whole seconds.
// The size is read before the duration is printed, so counting the rules is included in the time.
val startTimeMillis = System.currentTimeMillis()
val regionTotalTaskRuleset = regionTotalIndex.mine(regionTotalTask)
println(s"rules: ${regionTotalTaskRuleset.size}")
println(s"duration: ${(System.currentTimeMillis() - startTimeMillis) / 1000}s")

In [None]:
// Export the raw mined regions-total ruleset to a text file.
regionTotalTaskRuleset.export("../rulesets/jaur-yago/regionTotal.txt")

In [None]:
// Keeps only rules whose body has exactly one refArea atom and no atom whose predicate
// is contained in `measures`.
// NOTE(review): `measures` is not defined in any visible cell.
val filterRuleset = (r: Ruleset) => r.filterResolved { rule =>
    val refAreaAtoms = rule.body.count(atom => atom.predicate == refArea)
    refAreaAtoms == 1 && !rule.body.exists(atom => measures.contains(atom.predicate))
}


// Apply the body filter, cache the result, export it and report how many rules remain.
val regionTotalTaskRulesetFiltered = filterRuleset(regionTotalTaskRuleset).cache
regionTotalTaskRulesetFiltered.export("../rulesets/jaur-yago/regionTotalFiltered.txt")
println(s"rules: ${regionTotalTaskRulesetFiltered.size}")

In [None]:
// Plots a single histogram of `seq`, drawn in the given colour at 60% opacity.
val plotHistogram = (seq: Seq[Double], color: String) => {
    val barStyle = Marker(color = Color.StringColor(color), opacity = 0.6)
    val traces = Seq(plotly.Histogram(seq, marker = barStyle))
    plot(traces)
}

// Collects the measure stored under key `m` from every resolved rule of `r`.
// Throws (Option.get) if any rule lacks that measure.
val measureSequence = (r: Ruleset, m: TypedKeyMap.Key[Measure]) => {
    val values = r.resolvedRules.map(rule => rule.measures.get(m).get)
    values.toSeq
}

// Histogram of support values for the filtered regions-total rules.
val supportSeq = measureSequence(regionTotalTaskRulesetFiltered, Measure.Support)
    .map(_.asInstanceOf[Measure.Support].value.toDouble)
plotHistogram(supportSeq, "green")

In [None]:
// Computes PCA confidence with the given minimum and sorts the result by
// PCA confidence, then support.
val computeConfidence = (r: Ruleset, minConf: Double) => {
    val withPcaConfidence = r.computePcaConfidence(minConf)
    withPcaConfidence.sortBy(Measure.PcaConfidence, Measure.Support)
}

// Compute PCA confidence (no lower bound) and keep only rules whose PCA confidence
// is strictly below 1; cache, export and plot the distribution.
val regionTotalTaskRulesetConfComputed = computeConfidence(regionTotalTaskRulesetFiltered, 0.0)
    .filter(_.measures.get(Measure.PcaConfidence).get.value < 1)
    .cache

regionTotalTaskRulesetConfComputed.export("../rulesets/jaur-yago/regionTotalConfComputed.txt")

val confSeq = measureSequence(regionTotalTaskRulesetConfComputed, Measure.PcaConfidence)
    .map(_.asInstanceOf[Measure.PcaConfidence].value)
plotHistogram(confSeq, "green")

In [None]:
// Computes lift for the ruleset and sorts the result by lift, then support.
// NOTE(review): `minLift` is passed straight to Ruleset.computeLift; RDFRules may interpret
// that argument as a minimum confidence rather than a minimum lift — confirm.
val computeLift = (r: Ruleset, minLift: Double) => {
    val withLift = r.computeLift(minLift)
    withLift.sortBy(Measure.Lift, Measure.Support)
}

// Add lift on top of the PCA-confidence ruleset, cache and export it, then plot the lift distribution.
val regionTotalTaskRulesetLiftComputed = computeLift(regionTotalTaskRulesetConfComputed, 0.0).cache
regionTotalTaskRulesetLiftComputed.export("../rulesets/jaur-yago/regionTotalLiftComputed.txt")

val liftSeq = measureSequence(regionTotalTaskRulesetLiftComputed, Measure.Lift)
    .map(_.asInstanceOf[Measure.Lift].value)
plotHistogram(liftSeq, "green")

In [None]:
// Keep rules with lift >= 1.0 and (standard) confidence >= 0.5.
// NOTE(review): the district pipelines threshold Measure.PcaConfidence instead of
// Measure.Confidence — confirm the difference is intended.
val regionTotalTaskRulesetMinLift = regionTotalTaskRulesetLiftComputed
    .filter { rule =>
        rule.measures.get(Measure.Lift).get.value >= 1.0 &&
        rule.measures.get(Measure.Confidence).get.value >= 0.5
    }
    .cache

println(s"rules: ${regionTotalTaskRulesetMinLift.size}")
regionTotalTaskRulesetMinLift.export("../rulesets/jaur-yago/regionTotalMinLift.txt")

In [None]:
// True when the rule has `length` atoms in total, i.e. `length - 1` body atoms plus the head.
val filterByLength = (r: ResolvedRule, length: Int) => {
    val expectedBodyAtoms = length - 1
    r.body.size == expectedBodyAtoms
}

// Counts the rules of each length in [minLength, maxLength] and returns the counts as a
// ListMap ordered by descending count (ties keep ascending length order).
val groupRulesByLength = (r: Ruleset, minLength: Int, maxLength: Int) => {
    val countsByLength = (minLength to maxLength).foldLeft(ListMap.empty[Int, Int]) { (acc, ruleLength) =>
        val matching = r.filterResolved(rule => filterByLength(rule, ruleLength)).size
        acc + (ruleLength -> matching)
    }
    ListMap(countsByLength.toSeq.sortWith(_._2 > _._2): _*)
}

// Draws a horizontal bar chart from `map`: keys on the y axis, values as bar lengths,
// each bar annotated with its value. Bars use the given colour at 60% opacity.
val plotHorizontalBar = (map: ListMap[Int, Int], color: String) => {
    val entries = map.toSeq
    val categories = entries.map(_._1)
    val counts = entries.map(_._2)
    val barStyle = Marker(color = Color.StringColor(color), opacity = 0.6)
    val bars = Seq(Bar(counts, categories, orientation = Orientation.Horizontal, marker = barStyle))
    val countLabels = entries.map { case (category, count) =>
        Annotation(
          x = count, y = category, text = count.toString,
          xanchor = Anchor.Center, yanchor = Anchor.Bottom, showarrow = false
        )
    }
    plot(bars, Layout(annotations = countLabels))
}

// Bar chart of rule counts for rule lengths 4 to 6 in the thresholded regions-total ruleset.
plotHorizontalBar(groupRulesByLength(regionTotalTaskRulesetMinLift, 4, 6), "green")

In [None]:
// Clusters the ruleset with DbScan using atom-based rule similarity and caches the result.
// The implicit SimilarityCounting is declared inside the argument block so DbScan picks it up;
// the block is left in this exact shape because DbScan's type parameter is presumably
// inferred from the argument position — confirm before restructuring.
val makeClusters = (r: Ruleset, minNeighbours: Int, minSimilarity: Double) => r.makeClusters {
    implicit val ruleSimilarityCounting: SimilarityCounting[Rule.Simple] = SimilarityCounting.AtomsSimilarityCounting
    DbScan(minNeighbours = minNeighbours, minSimilarity = minSimilarity)
}.cache

// Cluster the thresholded regions-total rules (minNeighbours = 3, minSimilarity = 0.85) and export them.
val regionTotalTaskRulesetClustered = makeClusters(regionTotalTaskRulesetMinLift, 3, 0.85)
regionTotalTaskRulesetClustered.export("../rulesets/jaur-yago/regionTotalClustered.txt")

In [None]:
// True when the rule carries the cluster measure with the given cluster number.
// Uses Option.contains, so a rule without any cluster measure yields false instead of
// throwing NoSuchElementException from Option.get.
val filterByCluster = (r: ResolvedRule, cluster: Int) =>
    r.measures.get(Measure.Cluster).contains(Measure.Cluster(cluster))

// Counts the rules in cluster 0, 1, 2, ... and stops at the first cluster number with no rules.
// Returns the counts as a ListMap ordered by descending count (ties keep ascending cluster order).
val groupRulesByCluster = (r: Ruleset) => {
    val sizesByCluster = Iterator.from(0)
        .map(cluster => cluster -> r.filterResolved(rule => filterByCluster(rule, cluster)).size)
        .takeWhile { case (_, clusterSize) => clusterSize > 0 }
        .foldLeft(ListMap.empty[Int, Int]) { (acc, entry) => acc + entry }
    ListMap(sizesByCluster.toSeq.sortWith(_._2 > _._2): _*)
}

// Bar chart of the cluster sizes of the clustered regions-total ruleset.
plotHorizontalBar(groupRulesByCluster(regionTotalTaskRulesetClustered), "green")

In [None]:
// Returns the cluster numbers present in `r`, probing 0, 1, 2, ... until the first number
// with no rules. The array is in descending order (highest cluster number first).
val getClusters = (r: Ruleset) => {
    val ascending = Iterator.from(0)
        .takeWhile(cluster => r.filterResolved(rule => filterByCluster(rule, cluster)).size > 0)
        .toArray
    ascending.reverse
}

// Builds a ruleset containing the first rule of every cluster found by getClusters,
// starting from an empty slice of `r` and appending one rule per cluster.
val pickOneForCluster = (r: Ruleset) => {
    val emptyRuleset = r.slice(0, 0)
    getClusters(r).foldLeft(emptyRuleset) { (picked, cluster) =>
        val clusterRules = r.filterResolved(rule => filterByCluster(rule, cluster))
        picked + clusterRules.take(1)
    }
}

In [None]:
// Pick one representative rule per cluster for regions-total and export the selection.
val regionTotalTaskRulesetOneForCluster = pickOneForCluster(regionTotalTaskRulesetClustered)
regionTotalTaskRulesetOneForCluster.export("../rulesets/jaur-yago/regionTotalTaskRulesetOneForCluster.txt")

## Regions By Sex

In [None]:
// Rule pattern for the regions-by-sex slice; same shape as the regions-total pattern but
// fixed to the by-sex slice and to the by-sex measures in the atom after `=>:`.
// NOTE(review): `oneOfBySexMeasures` is not defined in any visible cell.
val regionBySexPattern = (
    AtomPattern(subject = 'b', graph = uri("yago")) &:
    AtomPattern(subject = 'a', predicate = refArea, `object` = 'b', graph = uri("czso")) &:
    AtomPattern(subject = 'a', predicate = qbDataSet, `object` = regionBySexSliceUri, graph = uri("czso"))
    =>: 
    AtomPattern(subject = 'a',predicate = oneOfBySexMeasures, graph = uri("czso"))
)

In [None]:
// AMIE mining task for the regions-by-sex slice: support threshold derived from the slice,
// rules of length at most 6, MinHeadSize(1), the object-constants constraint and the by-sex pattern.
// Unlike the regions-total task, no Timeout threshold is set here.
val regionBySexTask = Amie()
    .addThreshold(Threshold.MinSupport(minSupport(regionBySexSlice)))
    .addThreshold(Threshold.MaxRuleLength(6))
    .addThreshold(Threshold.MinHeadSize(1))
    .addConstraint(constantsOnlyAtObject)
    .addPattern(regionBySexPattern)

In [None]:
// Mine the regions-by-sex index and report the rule count and elapsed whole seconds.
// The size is read before the duration is printed, so counting the rules is included in the time.
val startTimeMillis = System.currentTimeMillis()
val regionBySexTaskRuleset = regionBySexIndex.mine(regionBySexTask)
println(s"rules: ${regionBySexTaskRuleset.size}")
println(s"duration: ${(System.currentTimeMillis() - startTimeMillis) / 1000}s")

In [None]:
// Export the raw mined regions-by-sex ruleset to a text file.
regionBySexTaskRuleset.export("../rulesets/jaur-yago/RegionBySex.txt")

In [None]:
// Apply the body filter, cache the result, export it and report how many rules remain.
val regionBySexTaskRulesetFiltered = filterRuleset(regionBySexTaskRuleset).cache
regionBySexTaskRulesetFiltered.export("../rulesets/jaur-yago/RegionBySexFiltered.txt")
println(s"rules: ${regionBySexTaskRulesetFiltered.size}")

In [None]:
// Histogram of support values for the filtered regions-by-sex rules.
val supportSeq = measureSequence(regionBySexTaskRulesetFiltered, Measure.Support)
    .map(_.asInstanceOf[Measure.Support].value.toDouble)
plotHistogram(supportSeq, "grey")

In [None]:
// Compute PCA confidence (no lower bound) for the filtered regions-by-sex rules,
// cache and export the result, then plot the confidence distribution.
val regionBySexTaskRulesetConfComputed = computeConfidence(regionBySexTaskRulesetFiltered, 0.0).cache
regionBySexTaskRulesetConfComputed.export("../rulesets/jaur-yago/RegionBySexConfComputed.txt")

val confSeq = measureSequence(regionBySexTaskRulesetConfComputed, Measure.PcaConfidence)
    .map(_.asInstanceOf[Measure.PcaConfidence].value)
plotHistogram(confSeq, "grey")

In [None]:
// Add lift on top of the PCA-confidence ruleset, as the regions-total and both district
// pipelines do. Previously this step started from the merely filtered ruleset, so the
// PCA-confidence measure computed in the preceding cell was not carried into the lift
// ruleset or any export derived from it.
val regionBySexTaskRulesetLiftComputed = computeLift(regionBySexTaskRulesetConfComputed, 0.0).cache
regionBySexTaskRulesetLiftComputed.export("../rulesets/jaur-yago/RegionBySexLiftComputed.txt")

// Plot the lift distribution.
val liftSeq = measureSequence(regionBySexTaskRulesetLiftComputed, Measure.Lift)
    .map(_.asInstanceOf[Measure.Lift].value)
plotHistogram(liftSeq, "grey")

In [None]:
// Keep rules with lift >= 1.0 and (standard) confidence >= 0.5.
// NOTE(review): the district pipelines threshold Measure.PcaConfidence instead of
// Measure.Confidence — confirm the difference is intended.
val regionBySexTaskRulesetMinLift = regionBySexTaskRulesetLiftComputed
    .filter { rule =>
        rule.measures.get(Measure.Lift).get.value >= 1.0 &&
        rule.measures.get(Measure.Confidence).get.value >= 0.5
    }
    //.pruned(false, false)
    .cache

regionBySexTaskRulesetMinLift.export("../rulesets/jaur-yago/regionBySexMinLift.txt")
println(s"rules: ${regionBySexTaskRulesetMinLift.size}")

In [None]:
// Bar chart of rule counts for rule lengths 4 to 6 in the thresholded regions-by-sex ruleset.
plotHorizontalBar(groupRulesByLength(regionBySexTaskRulesetMinLift, 4, 6), "grey")

In [None]:
// Cluster the thresholded regions-by-sex rules (minNeighbours = 3, minSimilarity = 0.85) and export them.
val regionBySexTaskRulesetClustered = makeClusters(regionBySexTaskRulesetMinLift, 3, 0.85)
regionBySexTaskRulesetClustered.export("../rulesets/jaur-yago/regionBySexTaskRulesetClustered.txt")

In [None]:
// Pick one representative rule per cluster for regions-by-sex and export the selection.
val regionBySexTaskRulesetOneForCluster = pickOneForCluster(regionBySexTaskRulesetClustered)
regionBySexTaskRulesetOneForCluster.export("../rulesets/jaur-yago/regionBySexTaskRulesetOneForCluster.txt")

## Districts Total

In [None]:
// Rule pattern for the districts-total slice: a YAGO atom about 'b', a CZSO refArea atom
// linking 'a' to 'b', and a CZSO qb:dataSet atom fixing the slice, followed after `=>:` by
// an atom on 'a' whose predicate is one of the measures (presumably the rule head — confirm).
// NOTE(review): `oneOfAllMeasures` is not defined in any visible cell.
val districtTotalPattern = (
    AtomPattern(subject = 'b', graph = uri("yago")) &:
    AtomPattern(subject = 'a', predicate = refArea, `object` = 'b', graph = uri("czso")) &:
    AtomPattern(subject = 'a', predicate = qbDataSet, `object` = districtTotalSliceUri, graph = uri("czso"))
    =>: 
    AtomPattern(subject = 'a', predicate = oneOfAllMeasures, graph = uri("czso"))
)

In [None]:
// AMIE mining task for the districts-total slice. The support threshold is three times the
// value derived from the slice; rules are limited to length 6 and to the object-constants constraint.
// No MinHeadSize or Timeout threshold is set here.
val districtTotalTask = Amie()
    .addThreshold(Threshold.MinSupport(minSupport(districtTotalSlice)*3))
    .addThreshold(Threshold.MaxRuleLength(6))
    .addConstraint(constantsOnlyAtObject)
    .addPattern(districtTotalPattern)

In [None]:
// Mine the districts-total index and report the rule count and elapsed whole seconds.
// The size is read before the duration is printed, so counting the rules is included in the time.
val startTimeMillis = System.currentTimeMillis()
val districtTotalTaskRuleset = districtTotalIndex.mine(districtTotalTask)
println(s"rules: ${districtTotalTaskRuleset.size}")
println(s"duration: ${(System.currentTimeMillis() - startTimeMillis) / 1000}s")

In [None]:
// Export the raw mined districts-total ruleset to a text file.
districtTotalTaskRuleset.export("../rulesets/jaur-yago/districtTotal.txt")

In [None]:
// Apply the body filter, cache the result, export it and report how many rules remain.
val districtTotalTaskRulesetFiltered = filterRuleset(districtTotalTaskRuleset).cache
districtTotalTaskRulesetFiltered.export("../rulesets/jaur-yago/districtTotalFiltered.txt")
println(s"rules: ${districtTotalTaskRulesetFiltered.size}")

In [None]:
// Histogram of support values for the filtered districts-total rules.
val supportSeq = measureSequence(districtTotalTaskRulesetFiltered, Measure.Support)
    .map(_.asInstanceOf[Measure.Support].value.toDouble)
plotHistogram(supportSeq, "red")

In [None]:
// Compute PCA confidence (no lower bound) for the filtered districts-total rules, cache and export.
val districtTotalTaskRulesetConfComputed = computeConfidence(districtTotalTaskRulesetFiltered, 0.0).cache
districtTotalTaskRulesetConfComputed.export("../rulesets/jaur-yago/districtTotalConfComputed.txt")

// Only values <= 1 are plotted; the ruleset itself is not filtered.
val confSeq = measureSequence(districtTotalTaskRulesetConfComputed, Measure.PcaConfidence)
    .map(_.asInstanceOf[Measure.PcaConfidence].value)
    .filter(_ <= 1)
plotHistogram(confSeq, "red")

In [None]:
// Add lift on top of the PCA-confidence ruleset, cache and export it, then plot the lift distribution.
val districtTotalTaskRulesetLiftComputed = computeLift(districtTotalTaskRulesetConfComputed, 0.0).cache
districtTotalTaskRulesetLiftComputed.export("../rulesets/jaur-yago/districtTotalLiftComputed.txt")

val liftSeq = measureSequence(districtTotalTaskRulesetLiftComputed, Measure.Lift)
    .map(_.asInstanceOf[Measure.Lift].value)
plotHistogram(liftSeq, "red")

In [None]:
// Keep rules with lift strictly above 1.0 and PCA confidence strictly above 0.6.
val districtTotalTaskRulesetMinLift = districtTotalTaskRulesetLiftComputed
    .filter { rule =>
        rule.measures.get(Measure.Lift).get.value > 1.0 &&
        rule.measures.get(Measure.PcaConfidence).get.value > 0.6
    }
    .cache

districtTotalTaskRulesetMinLift.export("../rulesets/jaur-yago/districtTotalMinLift.txt")
println(s"rules: ${districtTotalTaskRulesetMinLift.size}")

In [None]:
// Cluster the thresholded districts-total rules (minNeighbours = 3, minSimilarity = 0.85),
// export them and plot the cluster sizes.
val districtTotalTaskRulesetClustered = makeClusters(districtTotalTaskRulesetMinLift, 3, 0.85)
districtTotalTaskRulesetClustered.export("../rulesets/jaur-yago/districtTotalClustered.txt")

plotHorizontalBar(groupRulesByCluster(districtTotalTaskRulesetClustered), "red")

## Districts By Sex

In [None]:
// Rule pattern for the districts-by-sex slice; same shape as the other patterns but fixed
// to the districts-by-sex slice and to the by-sex measures in the atom after `=>:`.
// NOTE(review): `oneOfBySexMeasures` is not defined in any visible cell.
val districtBySexPattern = (
    AtomPattern(subject = 'b', graph = uri("yago")) &:
    AtomPattern(subject = 'a', predicate = refArea, `object` = 'b', graph = uri("czso")) &:
    AtomPattern(subject = 'a', predicate = qbDataSet, `object` = districtBySexSliceUri, graph = uri("czso"))
    =>: 
    AtomPattern(subject = 'a', predicate = oneOfBySexMeasures, graph = uri("czso"))
)

In [None]:
// AMIE mining task for the districts-by-sex slice. The support threshold is three times the
// value derived from the slice; rules are limited to length 6 and to the object-constants constraint.
// No MinHeadSize or Timeout threshold is set here.
val districtBySexTask = Amie()
    .addThreshold(Threshold.MinSupport(minSupport(districtBySexSlice)*3))
    .addThreshold(Threshold.MaxRuleLength(6))
    .addConstraint(constantsOnlyAtObject)
    .addPattern(districtBySexPattern)

In [None]:
// Mine the districts-by-sex index and report the rule count and elapsed whole seconds.
// The size is read before the duration is printed, so counting the rules is included in the time.
val startTimeMillis = System.currentTimeMillis()
val districtBySexTaskRuleset = districtBySexIndex.mine(districtBySexTask)
println(s"rules: ${districtBySexTaskRuleset.size}")
println(s"duration: ${(System.currentTimeMillis() - startTimeMillis) / 1000}s")

In [None]:
// Export the raw mined districts-by-sex ruleset to a text file.
districtBySexTaskRuleset.export("../rulesets/jaur-yago/districtBySex.txt")

In [None]:
// Apply the body filter, cache the result, export it and report how many rules remain.
val districtBySexTaskRulesetFiltered = filterRuleset(districtBySexTaskRuleset).cache
districtBySexTaskRulesetFiltered.export("../rulesets/jaur-yago/districtBySexFiltered.txt")
println(s"rules: ${districtBySexTaskRulesetFiltered.size}")

In [None]:
// Histogram of support values for the filtered districts-by-sex rules.
// NOTE(review): "cls" does not look like a colour name (the other sections use
// "green", "grey" and "red") — confirm which colour was intended.
val supportSeq = measureSequence(districtBySexTaskRulesetFiltered, Measure.Support)
    .map(_.asInstanceOf[Measure.Support].value.toDouble)
plotHistogram(supportSeq, "cls")

In [None]:
// Compute PCA confidence (no lower bound) for the filtered districts-by-sex rules, cache and export.
val districtBySexTaskRulesetConfComputed = computeConfidence(districtBySexTaskRulesetFiltered, 0.0).cache

districtBySexTaskRulesetConfComputed.export("../rulesets/jaur-yago/districtBySexConfComputed.txt")

// Only values <= 1 are plotted; the ruleset itself is not filtered.
val confSeq = measureSequence(districtBySexTaskRulesetConfComputed, Measure.PcaConfidence)
    .map(_.asInstanceOf[Measure.PcaConfidence].value)
    .filter(_ <= 1)
plotHistogram(confSeq, "cls")

In [None]:
// Add lift on top of the PCA-confidence ruleset, cache and export it, then plot the lift distribution.
val districtBySexTaskRulesetLiftComputed = computeLift(districtBySexTaskRulesetConfComputed, 0.0).cache
districtBySexTaskRulesetLiftComputed.export("../rulesets/jaur-yago/districtBySexLiftComputed.txt")

val liftSeq = measureSequence(districtBySexTaskRulesetLiftComputed, Measure.Lift)
    .map(_.asInstanceOf[Measure.Lift].value)
plotHistogram(liftSeq, "cls")

In [None]:
// Keep rules with lift >= 1.0 and PCA confidence >= 0.55.
val districtBySexTaskRulesetMinLift = districtBySexTaskRulesetLiftComputed
    .filter { rule =>
        rule.measures.get(Measure.Lift).get.value >= 1.0 &&
        rule.measures.get(Measure.PcaConfidence).get.value >= 0.55
    }
    .cache

districtBySexTaskRulesetMinLift.export("../rulesets/jaur-yago/districtBySexMinLift.txt")
println(s"rules: ${districtBySexTaskRulesetMinLift.size}")

In [None]:
// Cluster the thresholded districts-by-sex rules (minNeighbours = 3, minSimilarity = 0.85).
// `makeClusters` already ends with `.cache`, so no further `.cache` is applied here,
// matching the other three clustering cells.
val districtBySexTaskRulesetClustered = makeClusters(districtBySexTaskRulesetMinLift, 3, 0.85)
districtBySexTaskRulesetClustered.export("../rulesets/jaur-yago/districtBySexClustered.txt")

// Plot the cluster sizes.
// NOTE(review): "cls" does not look like a colour name — confirm which colour was intended.
plotHorizontalBar(groupRulesByCluster(districtBySexTaskRulesetClustered), "cls")