From 4678de4985d0ccaec2f6bfc4901e0b3f0623ab7f Mon Sep 17 00:00:00 2001 From: geosarr <83412621+geosarr@users.noreply.github.com> Date: Mon, 1 Apr 2024 22:57:19 +0200 Subject: [PATCH] Page rank algorithm support (#623) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Added PageRank algorithm. * Added page rank doc test. * Added UnitMeasure trait to generalize Page rank. * Some code refactoring. * Added benchmark and deleted iteration prints. * Moved page rank test into the tests folder. * Added doc and quickcheck to match contribution guide requirements. * Added time and space complexity. * Correction of doc typo. * Added parallel computation of page rank along with test and bench. * Improved performance on edge iterations. Dropped useless trait constraint. * Corrected NaN values in parallel page ranks. Improved performance by iterating once over out edges. * Added tolerance to further speed up parallel page rank. * Added empty vec return instead of panicking on empty graph. Doc for UnitMeasure. * Apply suggestions from code review Return empty vec from parallel_page_rank too. Add link to doc. 
--------- Co-authored-by: Agustín Borgna --- benches/page_rank.rs | 36 ++++++++ src/algo/mod.rs | 43 ++++++++++ src/algo/page_rank.rs | 185 ++++++++++++++++++++++++++++++++++++++++++ tests/page_rank.rs | 83 +++++++++++++++++++ tests/quickcheck.rs | 20 ++++- 5 files changed, 366 insertions(+), 1 deletion(-) create mode 100644 benches/page_rank.rs create mode 100644 src/algo/page_rank.rs create mode 100644 tests/page_rank.rs diff --git a/benches/page_rank.rs b/benches/page_rank.rs new file mode 100644 index 000000000..a7a6bbaf7 --- /dev/null +++ b/benches/page_rank.rs @@ -0,0 +1,36 @@ +#![feature(test)] +extern crate petgraph; +extern crate test; + +use test::Bencher; + +use petgraph::algo::page_rank; + +#[allow(dead_code)] +mod common; + +use common::directed_fan; + +#[cfg(feature = "rayon")] +use petgraph::algo::page_rank::parallel_page_rank; +#[cfg(feature = "rayon")] +use rayon::prelude::*; + +#[bench] +fn page_rank_bench(bench: &mut Bencher) { + static NODE_COUNT: usize = 500; + let g = directed_fan(NODE_COUNT); + bench.iter(|| { + let _ranks = page_rank(&g, 0.6_f64, 10); + }); +} + +#[bench] +#[cfg(feature = "rayon")] +fn par_page_rank_bench(bench: &mut Bencher) { + static NODE_COUNT: usize = 2_000; + let g = directed_fan(NODE_COUNT); + bench.iter(|| { + let _ranks = parallel_page_rank(&g, 0.6_f64, 100, None); + }); +} diff --git a/src/algo/mod.rs b/src/algo/mod.rs index 67e36245f..7d52ec4c2 100644 --- a/src/algo/mod.rs +++ b/src/algo/mod.rs @@ -13,6 +13,7 @@ pub mod floyd_warshall; pub mod isomorphism; pub mod k_shortest_path; pub mod matching; +pub mod page_rank; pub mod simple_paths; pub mod tred; @@ -44,6 +45,7 @@ pub use isomorphism::{ }; pub use k_shortest_path::k_shortest_path; pub use matching::{greedy_matching, maximum_matching, Matching}; +pub use page_rank::page_rank; pub use simple_paths::all_simple_paths; /// \[Generic\] Return the number of connected components of the graph. @@ -901,3 +903,44 @@ macro_rules! 
impl_bounded_measure_float( ); impl_bounded_measure_float!(f32, f64); + +/// A floating-point measure that can be computed from `usize` +/// and with a default measure of proximity. +pub trait UnitMeasure: + Measure + + std::ops::Sub + + std::ops::Mul + + std::ops::Div + + std::iter::Sum +{ + fn zero() -> Self; + fn one() -> Self; + fn from_usize(nb: usize) -> Self; + fn default_tol() -> Self; +} + +macro_rules! impl_unit_measure( + ( $( $t:ident ),* )=> { + $( + impl UnitMeasure for $t { + fn zero() -> Self { + 0 as $t + } + fn one() -> Self { + 1 as $t + } + + fn from_usize(nb: usize) -> Self { + nb as $t + } + + fn default_tol() -> Self { + 1e-6 as $t + } + + } + + )* + } +); +impl_unit_measure!(f32, f64); diff --git a/src/algo/page_rank.rs b/src/algo/page_rank.rs new file mode 100644 index 000000000..1f9954ef2 --- /dev/null +++ b/src/algo/page_rank.rs @@ -0,0 +1,185 @@ +use crate::visit::{EdgeRef, IntoEdges, NodeCount, NodeIndexable}; + +#[cfg(feature = "rayon")] +use rayon::prelude::*; + +use super::UnitMeasure; +/// \[Generic\] Page Rank algorithm. +/// +/// Computes the ranks of every node in a graph using the [Page Rank algorithm][pr]. +/// +/// Returns a `Vec` container mapping each node index to its rank. +/// +/// # Panics +/// The damping factor should be a number of type `f32` or `f64` between 0 and 1 (0 and 1 included). Otherwise, it panics. +/// +/// # Complexity +/// Time complexity is **O(N|V|²|E|)**. +/// Space complexity is **O(|V| + |E|)** +/// where **N** is the number of iterations, **|V|** the number of vertices (i.e nodes) and **|E|** the number of edges. +/// +/// [pr]: https://en.wikipedia.org/wiki/PageRank +/// +/// # Example +/// ```rust +/// use petgraph::Graph; +/// use petgraph::algo::page_rank; +/// let mut g: Graph<(), usize> = Graph::new(); +/// assert_eq!(page_rank(&g, 0.5_f64, 1), vec![]); // empty graphs have no node ranks. 
+/// let a = g.add_node(()); +/// let b = g.add_node(()); +/// let c = g.add_node(()); +/// let d = g.add_node(()); +/// let e = g.add_node(()); +/// g.extend_with_edges(&[(0, 1), (0, 3), (1, 2), (1, 3)]); +/// // With the following dot representation. +/// //digraph { +/// // 0 [ label = "()" ] +/// // 1 [ label = "()" ] +/// // 2 [ label = "()" ] +/// // 3 [ label = "()" ] +/// // 4 [ label = "()" ] +/// // 0 -> 1 [ label = "0.0" ] +/// // 0 -> 3 [ label = "0.0" ] +/// // 1 -> 2 [ label = "0.0" ] +/// // 1 -> 3 [ label = "0.0" ] +/// //} +/// let damping_factor = 0.7_f32; +/// let number_iterations = 10; +/// let output_ranks = page_rank(&g, damping_factor, number_iterations); +/// let expected_ranks = vec![0.14685437, 0.20267677, 0.22389607, 0.27971846, 0.14685437]; +/// assert_eq!(expected_ranks, output_ranks); +/// ``` +pub fn page_rank(graph: G, damping_factor: D, nb_iter: usize) -> Vec +where + G: NodeCount + IntoEdges + NodeIndexable, + D: UnitMeasure + Copy, +{ + let node_count = graph.node_count(); + if node_count == 0 { + return vec![]; + } + assert!( + D::zero() <= damping_factor && damping_factor <= D::one(), + "Damping factor should be between 0 et 1." 
+ ); + let nb = D::from_usize(node_count); + let mut ranks = vec![D::one() / nb; node_count]; + let nodeix = |i| graph.from_index(i); + let out_degrees: Vec = (0..node_count) + .map(|i| graph.edges(nodeix(i)).map(|_| D::one()).sum::()) + .collect(); + + for _ in 0..nb_iter { + let pi = (0..node_count) + .enumerate() + .map(|(v, _)| { + ranks + .iter() + .enumerate() + .map(|(w, r)| { + let mut w_out_edges = graph.edges(nodeix(w)); + if let Some(_) = w_out_edges.find(|e| e.target() == nodeix(v)) { + damping_factor * *r / out_degrees[w] + } else if out_degrees[w] == D::zero() { + damping_factor * *r / nb // stochastic matrix condition + } else { + (D::one() - damping_factor) * *r / nb // random jumps + } + }) + .sum::() + }) + .collect::>(); + let sum = pi.iter().map(|score| *score).sum::(); + ranks = pi.iter().map(|r| *r / sum).collect::>(); + } + ranks +} + +#[allow(dead_code)] +fn out_edges_info(graph: G, index_w: usize, index_v: usize) -> (D, bool) +where + G: NodeCount + IntoEdges + NodeIndexable + std::marker::Sync, + D: UnitMeasure + Copy + std::marker::Send + std::marker::Sync, +{ + let node_w = graph.from_index(index_w); + let node_v = graph.from_index(index_v); + let mut out_edges = graph.edges(node_w); + let mut out_edge = out_edges.next(); + let mut out_degree = D::zero(); + let mut flag_points_to = false; + while let Some(edge) = out_edge { + out_degree = out_degree + D::one(); + if edge.target() == node_v { + flag_points_to = true; + } + out_edge = out_edges.next(); + } + (out_degree, flag_points_to) +} +/// \[Generic\] Parallel Page Rank algorithm. +/// +/// See [`page_rank`]. 
+#[cfg(feature = "rayon")] +pub fn parallel_page_rank( + graph: G, + damping_factor: D, + nb_iter: usize, + tol: Option, +) -> Vec +where + G: NodeCount + IntoEdges + NodeIndexable + std::marker::Sync, + D: UnitMeasure + Copy + std::marker::Send + std::marker::Sync, +{ + let node_count = graph.node_count(); + if node_count == 0 { + return vec![]; + } + assert!( + D::zero() <= damping_factor && damping_factor <= D::one(), + "Damping factor should be between 0 et 1." + ); + let mut tolerance = D::default_tol(); + if let Some(_tol) = tol { + tolerance = _tol; + } + let nb = D::from_usize(node_count); + let mut ranks: Vec = (0..node_count) + .into_par_iter() + .map(|i| D::one() / nb) + .collect(); + for _ in 0..nb_iter { + let pi = (0..node_count) + .into_par_iter() + .map(|v| { + ranks + .iter() + .enumerate() + .map(|(w, r)| { + let (out_deg, w_points_to_v) = out_edges_info(graph, w, v); + if w_points_to_v { + damping_factor * *r / out_deg + } else if out_deg == D::zero() { + damping_factor * *r / nb // stochastic matrix condition + } else { + (D::one() - damping_factor) * *r / nb // random jumps + } + }) + .sum::() + }) + .collect::>(); + let sum = pi.par_iter().map(|score| *score).sum::(); + let new_ranks = pi.par_iter().map(|r| *r / sum).collect::>(); + let squared_norm_2 = new_ranks + .par_iter() + .zip(&ranks) + .map(|(new, old)| (*new - *old) * (*new - *old)) + .sum::(); + if squared_norm_2 <= tolerance { + return ranks; + } else { + ranks = new_ranks; + } + } + ranks +} diff --git a/tests/page_rank.rs b/tests/page_rank.rs new file mode 100644 index 000000000..224415eb0 --- /dev/null +++ b/tests/page_rank.rs @@ -0,0 +1,83 @@ +use petgraph::{algo::page_rank, Graph}; + +#[cfg(feature = "rayon")] +use petgraph::algo::page_rank::parallel_page_rank; + +fn graph_example() -> Graph { + // Taken and adapted from https://github.com/neo4j-labs/graph?tab=readme-ov-file#how-to-run-algorithms + let mut graph = Graph::<_, f32>::new(); + graph.add_node("A".to_owned()); + 
graph.add_node("B".to_owned()); + graph.add_node("C".to_owned()); + graph.add_node("D".to_owned()); + graph.add_node("E".to_owned()); + graph.add_node("F".to_owned()); + graph.add_node("G".to_owned()); + graph.add_node("H".to_owned()); + graph.add_node("I".to_owned()); + graph.add_node("J".to_owned()); + graph.add_node("K".to_owned()); + graph.add_node("L".to_owned()); + graph.add_node("M".to_owned()); + graph.extend_with_edges(&[ + (1, 2), // B->C + (2, 1), // C->B + (4, 0), // E->A + (4, 1), // E->B + (5, 4), // F->E + (5, 1), // F->B + (5, 6), // F->G + (6, 1), // G->B + (6, 5), // G->F + (7, 1), // H->B + (7, 5), // H->F + (8, 1), // I->B + (8, 5), // I->F + (9, 1), // J->B + (9, 5), // J->F + (10, 1), // K->B + (10, 5), // K->F + (11, 5), // L->F + (12, 5), // M->F + ]); + graph +} + +fn expected_ranks() -> Vec { + vec![ + 0.029228685, + 0.38176042, + 0.3410649, + 0.014170233, + 0.035662483, + 0.077429585, + 0.035662483, + 0.014170233, + 0.014170233, + 0.014170233, + 0.014170233, + 0.014170233, + 0.014170233, + ] +} + +#[test] +fn test_page_rank() { + let graph = graph_example(); + let output_ranks = page_rank(&graph, 0.85_f32, 100); + assert_eq!(expected_ranks(), output_ranks); +} + +#[test] +#[cfg(feature = "rayon")] + +fn test_par_page_rank() { + let graph = graph_example(); + let output_ranks = parallel_page_rank(&graph, 0.85_f32, 100, Some(1e-12)); + assert!(!expected_ranks() + .iter() + .zip(output_ranks) + .any(|(expected, computed)| ((expected - computed).abs() > 1e-6) + || computed.is_nan() + || expected.is_nan())); +} diff --git a/tests/quickcheck.rs b/tests/quickcheck.rs index aad74335b..1f7a1af2c 100644 --- a/tests/quickcheck.rs +++ b/tests/quickcheck.rs @@ -26,7 +26,7 @@ use petgraph::algo::{ bellman_ford, condensation, dijkstra, find_negative_cycle, floyd_warshall, greedy_feedback_arc_set, greedy_matching, is_cyclic_directed, is_cyclic_undirected, is_isomorphic, is_isomorphic_matching, k_shortest_path, kosaraju_scc, maximum_matching, - 
min_spanning_tree, tarjan_scc, toposort, Matching, + min_spanning_tree, page_rank, tarjan_scc, toposort, Matching, }; use petgraph::data::FromElements; use petgraph::dot::{Config, Dot}; @@ -1312,3 +1312,21 @@ quickcheck! { true } } + +quickcheck! { + // The ranks are probabilities, + // as such they are positive and they should sum up to 1. + fn test_page_rank_proba(gr: Graph<(), f32>) -> bool { + if gr.node_count() == 0 { + return true; + } + let tol = 1e-10; + let ranks: Vec = page_rank(&gr, 0.85_f64, 5); + let at_least_one_neg_rank = ranks.iter().any(|rank| *rank < 0.); + let not_sumup_to_one = (ranks.iter().sum::() - 1.).abs() > tol; + if at_least_one_neg_rank | not_sumup_to_one{ + return false; + } + true + } +}