Compute RSVD
============

Here we read the preprocessed data and compute the rSVD.

In [None]:
import com.criteo.rsvd._
import scala.util.Random
import org.apache.spark.mllib.linalg.distributed.MatrixEntry

  

>     import com.criteo.rsvd._
>     import scala.util.Random
>     import org.apache.spark.mllib.linalg.distributed.MatrixEntry

  

### Set up RSVD config with JSON file

In [None]:
// Persist the RSVD settings as JSON, one (key, value) record per setting
val config_map = Map(
  "embeddingDim" -> 100,
  "oversample" -> 30,
  "powerIter" -> 1,
  "seed" -> 0,
  "blockSize" -> 50000,
  "partitionWidthInBlocks" -> 35,
  "partitionHeightInBlocks" -> 10
)
val config_spark_save = config_map.toList.toDF("key", "value")
config_spark_save.write
  .mode("overwrite")
  .format("json")
  .save("/projects/group21/rsvd_config.json")

  

>     config_map: scala.collection.immutable.Map[String,Int] = Map(seed -> 0, oversample -> 30, blockSize -> 50000, partitionWidthInBlocks -> 35, partitionHeightInBlocks -> 10, powerIter -> 1, embeddingDim -> 100)
>     config_spark_save: org.apache.spark.sql.DataFrame = [key: string, value: int]

In [None]:
// load config from json (assuming only integer values)
// The columns are selected by name first, so the positional reads below
// (0 = key, 1 = value) do not depend on the column order chosen by JSON
// schema inference.
val config_spark = spark.read
  .json("/projects/group21/rsvd_config.json")
  .select("key", "value")
  .rdd
  .map(r => (r(0).toString -> r(1).toString.toInt))
  .collect
  .toMap

  

>     config_spark: scala.collection.immutable.Map[String,Int] = Map(seed -> 0, oversample -> 30, blockSize -> 50000, partitionWidthInBlocks -> 35, partitionHeightInBlocks -> 10, powerIter -> 1, embeddingDim -> 100)

In [None]:
// Build the RSVD configuration from the settings loaded into config_spark;
// both left and right singular vectors are requested.
val config = {
  // Look up one integer setting by name (throws if the key is missing)
  val setting: String => Int = config_spark

  RSVDConfig(
    embeddingDim = setting("embeddingDim"),
    oversample = setting("oversample"),
    powerIter = setting("powerIter"),
    seed = setting("seed"),
    blockSize = setting("blockSize"),
    partitionWidthInBlocks = setting("partitionWidthInBlocks"),
    partitionHeightInBlocks = setting("partitionHeightInBlocks"),
    computeLeftSingularVectors = true,
    computeRightSingularVectors = true
  )
}

  

>     config: com.criteo.rsvd.RSVDConfig = RSVDConfig(100,30,1,0,50000,35,10,true,true)

  

### Create pipeline for computing RSVD from a dataframe of edges

In [None]:
/**
  * Builds the signed edge-by-node incidence matrix of an edge list and runs the
  * randomized SVD on it.
  *
  * Every input row contributes two entries to matrix row `id - 1`: the value -1 in
  * column `src - 1` and the value +1 in column `dst - 1`.
  *
  * @param groupedCanonicalEdges edge list with integer columns `src`, `dst` and `id`;
  *                              any other columns are ignored
  * @param config                RSVD settings; its block size and partition dimensions
  *                              are also used to lay out the block matrix
  * @return the result of `RSVD.run` on the incidence matrix
  */
def computeRSVD (groupedCanonicalEdges : org.apache.spark.sql.DataFrame, config : RSVDConfig): RsvdResults = {
  // Pick the three needed columns by name: the Row pattern below is positional and
  // would otherwise fail with a MatchError on extra columns (or silently mix up
  // src/dst/id if the column order differed).
  val edges = groupedCanonicalEdges.select("src", "dst", "id")

  // One matrix row per edge, one matrix column per distinct node.
  val matHeight = edges.count()
  val matWidth = edges.select("src").union(edges.select("dst")).distinct().count()

  // NOTE(review): the "- 1" shifts assume that id, src and dst are 1-based and
  // contiguous, so that they fit inside matHeight x matWidth; confirm against the
  // notebook that produces the edge table.
  val incidenceMatrixEntries = edges.rdd.flatMap {
    case Row(src: Int, dst: Int, id: Int) =>
      List(MatrixEntry(id - 1, src - 1, -1), MatrixEntry(id - 1, dst - 1, 1))
  }

  // Create block matrix and compute RSVD
  val matrixToDecompose = BlockMatrix.fromMatrixEntries(incidenceMatrixEntries, matHeight = matHeight, matWidth = matWidth, config.blockSize, config.partitionHeightInBlocks, config.partitionWidthInBlocks)
  RSVD.run(matrixToDecompose, config, sc)
}

  

>     computeRSVD: (groupedCanonicalEdges: org.apache.spark.sql.DataFrame, config: com.criteo.rsvd.RSVDConfig)com.criteo.rsvd.RsvdResults

  

### Code for ethereum table

In [None]:
// Ethereum edge list without its "flow" column, plus the path prefix under
// which the decomposition results are stored.
val groupedCanonicalEdges = spark.read
  .parquet("/projects/group21/test_ethereum_canonical_edges")
  .drop("flow")
val rsvd_results_path: String = "/projects/group21/test_ethereum_"

// Run the decomposition and unpack its three components
val RsvdResults(leftSingularVectors, singularValues, rightSingularVectors) =
  computeRSVD(groupedCanonicalEdges, config)

  

>     groupedCanonicalEdges: org.apache.spark.sql.DataFrame = [src: int, dst: int ... 1 more field]
>     rsvd_results_path: String = /projects/group21/test_ethereum_
>     leftSingularVectors: Option[com.criteo.rsvd.SkinnyBlockMatrix] = Some(SkinnyBlockMatrix(MapPartitionsRDD[331] at mapValues at SkinnyBlockMatrix.scala:149,50000,2152835,100,10))
>     singularValues: breeze.linalg.DenseVector[Double] = DenseVector(458.4034722863035, 345.1960245414585, 297.3895643496496, 271.6473930317087, 223.76310129031822, 206.0976997756398, 189.19632466447192, 137.7967230861944, 137.56623843203414, 135.5429090023961, 132.08931171241719, 128.72107390801992, 120.93923408578108, 120.62389811722176, 119.54471077668268, 118.21384174672237, 112.19577063002367, 111.39418637351407, 106.97086678678659, 106.56415959171053, 102.47323443175753, 100.63351038159684, 99.9528847938504, 96.35068331317134, 94.63883930018112, 93.63420288135465, 91.14614276423288, 85.6957564959455, 85.3307056057203, 82.78064180877004, 79.46368530163008, 78.301723084172, 77.52345859456192, 75.89415601567978, 75.17444923371285, 74.60719378218757, 72.76955624490422, 72.46205544360612, 72.1194574793734, 71.36743768743942, 69.68190130698409, 69.34880256578182, 69.23852630919076, 68.3496999327091, 68.08810547606788, 67.58052315764036, 67.00671105014125, 66.73755332897335, 66.50556789291338, 65.61826598949622, 65.26026410908358, 64.08933544145536, 63.75670758508847, 63.28412409499844, 61.8793675172377, 61.72183276227415, 60.674029927660726, 60.11054425362686, 60.04368500362045, 59.484031098783596, 58.70193268015032, 58.3267455505323, 57.332722900924786, 56.71833133469237, 55.988778733088814, 55.473263959063694, 54.952717142978095, 54.58134293797009, 54.2242297902958, 53.44845065373051, 53.2733517395476, 53.0151929459794, 52.830274310533866, 52.53875013967804, 52.04259394726295, 51.959976901583154, 51.46518325104534, 51.27521575244191, 50.96925279272197, 50.850190518253065, 50.556574587753424, 50.27755187580497, 49.7901913198631, 49.46663543730333, 49.313416136040296, 49.14614394421691, 48.81614269456599, 48.38266680345277, 48.21467328287131, 48.110092141642596, 47.88127863589444, 47.61474033120138, 47.422728862338815, 47.20378964600103, 46.97852199223743, 46.83620532681896, 46.53323067801465, 46.02425975867164, 45.681123495339364, 45.61205652605013)
>     rightSingularVectors: Option[com.criteo.rsvd.SkinnyBlockMatrix] = Some(SkinnyBlockMatrix(MapPartitionsRDD[332] at mapValues at SkinnyBlockMatrix.scala:149,50000,1520925,100,10))

In [None]:
// Reload the Ethereum edge table with all of its columns
val groupedCanonicalEdges = spark.read.parquet("/projects/group21/test_ethereum_canonical_edges")

  

>     groupedCanonicalEdges: org.apache.spark.sql.DataFrame = [src: int, dst: int ... 2 more fields]

In [None]:
// Print the schema (column names, types, nullability) of groupedCanonicalEdges
groupedCanonicalEdges.printSchema()

  

>     root
>      |-- src: integer (nullable = true)
>      |-- dst: integer (nullable = true)
>      |-- flow: double (nullable = true)
>      |-- id: integer (nullable = true)

In [None]:
// Convert the decomposition into DataFrames: (index, values) rows for the singular
// vectors and a single-column frame for the singular values.
// NOTE(review): .get throws if a side was not computed; presumably both are present
// because config requests left and right singular vectors.
val leftIndexed = leftSingularVectors.get.toIndexedEmbeddings
  .map { case (index, embedding) => (index, embedding.toArray) }
  .toDF("index", "values")
val singularDF = sc.parallelize(singularValues.toArray).toDF()
val rightIndexed = rightSingularVectors.get.toIndexedEmbeddings
  .map { case (index, embedding) => (index, embedding.toArray) }
  .toDF("index", "values")

// Store each part as parquet under the shared path prefix, replacing earlier results
leftIndexed.write.mode("overwrite").parquet(rsvd_results_path + "LeftSingularVectors")
singularDF.write.mode("overwrite").parquet(rsvd_results_path + "SingularValues")
rightIndexed.write.mode("overwrite").parquet(rsvd_results_path + "RightSingularVectors")

  

>     leftIndexed: org.apache.spark.sql.DataFrame = [index: bigint, values: array<double>]
>     singularDF: org.apache.spark.sql.DataFrame = [value: double]
>     rightIndexed: org.apache.spark.sql.DataFrame = [index: bigint, values: array<double>]

  

### Code for random graph

(saved in notebook `03_generate_graphs`)

In [None]:
// Uniform random graph edge list and the path prefix for its results
val groupedCanonicalEdges = spark.read.parquet("/projects/group21/uniform_random_graph")
val rsvd_results_path: String = "/projects/group21/uniform_random_graph_"

// Run the decomposition and unpack its three components
val RsvdResults(leftSingularVectors, singularValues, rightSingularVectors) =
  computeRSVD(groupedCanonicalEdges, config)

  

>     groupedCanonicalEdges: org.apache.spark.sql.DataFrame = [src: int, dst: int ... 1 more field]
>     rsvd_results_path: String = /projects/group21/uniform_random_graph_
>     leftSingularVectors: Option[com.criteo.rsvd.SkinnyBlockMatrix] = Some(SkinnyBlockMatrix(MapPartitionsRDD[478] at mapValues at SkinnyBlockMatrix.scala:149,50000,2152831,100,10))
>     singularValues: breeze.linalg.DenseVector[Double] = DenseVector(2.4048991653662957, 2.4046356787782766, 2.4041855583808713, 2.403812563120061, 2.4035821872876793, 2.403478107244044, 2.403073553596892, 2.4028460017010653, 2.4027350399472045, 2.402552930809589, 2.4022459225109087, 2.4020007255252636, 2.4017887619265803, 2.401568908997331, 2.401335917407542, 2.4012086040740783, 2.401107142204493, 2.4008575020720895, 2.400421096528722, 2.400395043008221, 2.4002688439219693, 2.4001641454746814, 2.3999454333820847, 2.399561744688428, 2.399327141891845, 2.3990948536499457, 2.3990196799709262, 2.398972318847929, 2.3987831427613053, 2.3985281219169017, 2.398353976443457, 2.3982522801884603, 2.398070775862362, 2.397922813691155, 2.3976833385174694, 2.397513005270522, 2.397391305992767, 2.397346415535947, 2.3970452817762773, 2.3969294224468882, 2.396776688761879, 2.3967073456521337, 2.3965615563251594, 2.3963211087373275, 2.396138536549947, 2.396080742731518, 2.3957875807186593, 2.395696609471369, 2.3955033854481713, 2.395473231161125, 2.395275137964756, 2.395139375923309, 2.3949884712318417, 2.3948238467331247, 2.394774934099713, 2.394553180128688, 2.3944349608766826, 2.3943973088092196, 2.3940997105908797, 2.3940031834310798, 2.3938079918520243, 2.393555880411589, 2.3933755072556573, 2.393313836078364, 2.3931148484103777, 2.393036289848337, 2.3927683502588897, 2.3926032976194547, 2.392477136503132, 2.392326212162523, 2.3920565551901793, 2.3920089292709656, 2.391765328144042, 2.391714114711596, 2.3915908255363734, 2.391369805703702, 2.391247870365735, 2.3911015122605677, 2.3909410320722775, 2.3908066193534543, 2.3905945894094978, 2.390347919715368, 2.390247043795248, 2.3901286997724784, 2.389954622314303, 2.3898789872652024, 2.3896958043266285, 2.389606145680447, 2.389410977113493, 2.389325758949565, 2.389170135749956, 2.3889975499077694, 2.3888818177392603, 2.388660698180604, 2.388571767139018, 2.3884517597285773, 2.3881242402686906, 2.387908597091317, 
2.3876588767363627, 2.3876314437146515)
>     rightSingularVectors: Option[com.criteo.rsvd.SkinnyBlockMatrix] = Some(SkinnyBlockMatrix(MapPartitionsRDD[479] at mapValues at SkinnyBlockMatrix.scala:149,50000,1511245,100,10))

In [None]:
// Convert the decomposition into DataFrames: (index, values) rows for the singular
// vectors and a single-column frame for the singular values.
// NOTE(review): .get throws if a side was not computed; presumably both are present
// because config requests left and right singular vectors.
val leftIndexed = leftSingularVectors.get.toIndexedEmbeddings
  .map { case (index, embedding) => (index, embedding.toArray) }
  .toDF("index", "values")
val singularDF = sc.parallelize(singularValues.toArray).toDF()
val rightIndexed = rightSingularVectors.get.toIndexedEmbeddings
  .map { case (index, embedding) => (index, embedding.toArray) }
  .toDF("index", "values")

// Store each part as parquet under the shared path prefix, replacing earlier results
leftIndexed.write.mode("overwrite").parquet(rsvd_results_path + "LeftSingularVectors")
singularDF.write.mode("overwrite").parquet(rsvd_results_path + "SingularValues")
rightIndexed.write.mode("overwrite").parquet(rsvd_results_path + "RightSingularVectors")

  

>     leftIndexed: org.apache.spark.sql.DataFrame = [index: bigint, values: array<double>]
>     singularDF: org.apache.spark.sql.DataFrame = [value: double]
>     rightIndexed: org.apache.spark.sql.DataFrame = [index: bigint, values: array<double>]

  

### Code for RMAT random graph

In [None]:
// RMAT random graph edge list and the path prefix for its results
val groupedCanonicalEdges = spark.read.parquet("/projects/group21/rmat_random_graph")
val rsvd_results_path: String = "/projects/group21/rmat_random_graph_"

// Run the decomposition and unpack its three components
val RsvdResults(leftSingularVectors, singularValues, rightSingularVectors) =
  computeRSVD(groupedCanonicalEdges, config)

In [None]:
// Convert the decomposition into DataFrames: (index, values) rows for the singular
// vectors and a single-column frame for the singular values.
// NOTE(review): .get throws if a side was not computed; presumably both are present
// because config requests left and right singular vectors.
val leftIndexed = leftSingularVectors.get.toIndexedEmbeddings
  .map { case (index, embedding) => (index, embedding.toArray) }
  .toDF("index", "values")
val singularDF = sc.parallelize(singularValues.toArray).toDF()
val rightIndexed = rightSingularVectors.get.toIndexedEmbeddings
  .map { case (index, embedding) => (index, embedding.toArray) }
  .toDF("index", "values")

// Store each part as parquet under the shared path prefix, replacing earlier results
leftIndexed.write.mode("overwrite").parquet(rsvd_results_path + "LeftSingularVectors")
singularDF.write.mode("overwrite").parquet(rsvd_results_path + "SingularValues")
rightIndexed.write.mode("overwrite").parquet(rsvd_results_path + "RightSingularVectors")