# 1. Librerías & Set Up

In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=494cfa5376d796cf61726721db28fba3a89b959bead45f05e122d84350020f71
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [2]:
from pyspark.sql import SparkSession
from itertools import product

In [3]:
#import neo4j [TO-EDIT]
#from neo4j import GraphDatabase
#import pandas as pd

In [4]:
spark = SparkSession.builder \
    .getOrCreate()

sc = spark.sparkContext
sc # Elemento que ejecuta toda instrucción.

# 2. Neo4j Graph

In [5]:
# TODO

In [6]:
graph = [(1,11,2),(1,11,3),(2,11,3),(3,11,2),(3,11,4),(4,11,1),(4,11,2),(4,11,3),(4,12,5),(5,12,1),(5,12,2),(5,12,6)]

# 3. MapReduce Algorithm for Triangles

In [7]:
rdd_graph = sc.parallelize(graph)
rdd_graph.collect()

[(1, 11, 2),
 (1, 11, 3),
 (2, 11, 3),
 (3, 11, 2),
 (3, 11, 4),
 (4, 11, 1),
 (4, 11, 2),
 (4, 11, 3),
 (4, 12, 5),
 (5, 12, 1),
 (5, 12, 2),
 (5, 12, 6)]

In [8]:
def hash(number):
  """
  Returns number mod 2. The ouput will be 0 or 1.
  """
  return number % 2

In [13]:
def get_keys(edge, b_dim, b_set, pattern_dim):
  """
  output: retorna las llaves correspondientes para un vertice.

  Idea general: buscamos el par hash_n1,hash_n2 dentro de las posibles
  combinaciones dentro del espacio de imagenes de la funcion de hash.
  Dentro del for, obtenemos un string con la codificacion de las llaves y luego
  verificamos si es una llave candidata para el vertice entregado:

  1.  sequence_in_reducer: Si la secuencia 'b1b2' esta en la llave del reducer
      codificada como hash(n1)hash(n2) ?, donde ? = 0 o 1, entonces se considerará
      el par reducer_key : edge.
  2.  edge_case: El otro caso, es para cuando tenemos por ejemplo x = n2 y z = n2
      para patrones de 3 vertices.
  """

  hash_n1 = hash(edge[0]) # valor de hash para el nodo 1 = b1
  hash_n2 = hash(edge[2]) # valor de hash para el nodo 2 = b2
  values = [] # posible keys
  sequence = '{}{}'.format(hash_n1, hash_n2)

  for i in range(0, b_dim ** pattern_dim):
    reducer = ''.join(str(num) for num in b_set[i])
    sequence_in_reducer = sequence in reducer
    edge_case = reducer[0] == sequence[1] and reducer[pattern_dim - 1] == sequence[0]
    if sequence_in_reducer or edge_case:
        reducer_key = tuple(int(digit) for digit in reducer)
        values.append((reducer_key, edge))

  return values


def map_phase(rdd, b_dim, b_set, pattern_dim):
  """
  input:
    - rdd: RDD del grafo de dimension 'dim'
    - b_dim: Cantidad de elementos de las imagenes de la funcion de hash.
    - b_set: Imagenes de la funcion de hash.
    - pattern_dim: cantidad de nodos del patron de grafo.
  ouput: Mapeo de cada arista con respecto a las llaves
  """

  mapped_keys = rdd.flatMap(lambda edge: get_keys(edge, b_dim, b_set, pattern_dim))
  reducers = mapped_keys.groupByKey().mapValues(list)
  return reducers

def reduce_phase(reducers, pattern_dim):
  """
  input: RDD del grafo y cantidad de nodos del patron de grafo.
  """




B = 2 # Dimension de elementos del conjunto de imagenes de la funcion de hash: |{0,1}|
L = 3 # Dimension del patron de grafo (triangulo para este caso)
BSET = list(range(B))
REDUCERS = list(product(BSET, repeat=L))

# Fase de Map: Obtenemos las llaves de cada reducer y el conjunto de aristas mapeados a estas llaves.
#
reducers = map_phase(rdd_graph, B, REDUCERS, L)
# Fase Reduce: Obtenemos todos los posibles patrones de L nodos.



In [26]:
reducers.collect()

[((0, 0, 1),
  [(1, 11, 2),
   (2, 11, 3),
   (3, 11, 2),
   (3, 11, 4),
   (4, 11, 1),
   (4, 11, 2),
   (4, 11, 3),
   (4, 12, 5),
   (5, 12, 2),
   (5, 12, 6)]),
 ((0, 1, 0),
  [(1, 11, 2),
   (2, 11, 3),
   (3, 11, 2),
   (3, 11, 4),
   (4, 11, 1),
   (4, 11, 2),
   (4, 11, 3),
   (4, 12, 5),
   (5, 12, 2),
   (5, 12, 6)]),
 ((1, 0, 0),
  [(1, 11, 2),
   (2, 11, 3),
   (3, 11, 2),
   (3, 11, 4),
   (4, 11, 1),
   (4, 11, 2),
   (4, 11, 3),
   (4, 12, 5),
   (5, 12, 2),
   (5, 12, 6)]),
 ((1, 1, 1), [(1, 11, 3), (5, 12, 1)]),
 ((0, 1, 1),
  [(1, 11, 2),
   (1, 11, 3),
   (2, 11, 3),
   (3, 11, 2),
   (3, 11, 4),
   (4, 11, 1),
   (4, 11, 3),
   (4, 12, 5),
   (5, 12, 1),
   (5, 12, 2),
   (5, 12, 6)]),
 ((1, 0, 1),
  [(1, 11, 2),
   (1, 11, 3),
   (2, 11, 3),
   (3, 11, 2),
   (3, 11, 4),
   (4, 11, 1),
   (4, 11, 3),
   (4, 12, 5),
   (5, 12, 1),
   (5, 12, 2),
   (5, 12, 6)]),
 ((1, 1, 0),
  [(1, 11, 2),
   (1, 11, 3),
   (2, 11, 3),
   (3, 11, 2),
   (3, 11, 4),
   (4, 11, 1),
  

In [215]:
def find_patterns(edges, pattern_dim):
  neighbors = []
  for i in range(len(edges)):
    curr_edge = edges[i]
    neighbors.append([])
    for j in range(len(edges)):
      next_edge = edges[j]
      if next_edge == curr_edge:
        continue
      if curr_edge[2] == next_edge[0]:
        neighbors[i].append(next_edge)
  cycles = find_cycles(edges, neighbors, pattern_dim)

  return cycles


def find_cycles(edges, neighbors, pattern_dim):
    cycles = []
    for edge in edges: # Por cada vértice, verificamos si existe un ciclo.
        visited = [False for i in range(len(edges))] # Restauramos la lista de nodos visitados
        init_edge = edge
        stack = [(init_edge, [init_edge])]
        while len(stack):
            # Extraemos una arista del stack
            curr_edge, path = stack.pop()
            id_curr_edge = edges.index(curr_edge)
            # Marcamos la arista como visitada
            visited[id_curr_edge] = True

            # En el caso que esta arista tenga un camino de largo pattern_dim,
            # entonces el posible que presente un ciclo de 'pattern_dim' aristas.
            if len(path) == pattern_dim:
                last_node = path[pattern_dim - 1][2] # primer nodo del camino
                first_node = path[0][0] # ultimo nodo del camino
                if last_node == first_node: # si es el mismo nodo, entonces hay ciclo
                  # n1, n2, n3
                  nodes = tuple(elem[0] for elem in path)
                  cycles.append(nodes)
                continue # seguimos iterando

            for neighbor_edge in neighbors[edges.index(curr_edge)]:
                if not visited[edges.index(neighbor_edge)]:
                    stack.append((neighbor_edge, path + [neighbor_edge]))

    return cycles


pattern_dim = 4

reducers_edges = reducers.map(lambda v: v[1]).filter(lambda x: len(x) >= pattern_dim)

patterns = reducers_edges.map(lambda edges: find_patterns(edges, pattern_dim))
all_patterns = patterns
all_patterns.flatMap(list).distinct().collect()

#reducers.map(lambda message: find_patterns(message, pattern_dim)).collect()

[(1, 2, 3, 4),
 (2, 3, 4, 5),
 (2, 3, 4, 3),
 (2, 3, 4, 1),
 (3, 2, 3, 4),
 (3, 4, 5, 2),
 (4, 1, 2, 3),
 (4, 5, 2, 3),
 (5, 2, 3, 4),
 (1, 3, 4, 5),
 (3, 4, 5, 1),
 (5, 1, 3, 4)]

In [None]:
def dfs(edges):
  patterns = []
  visited = [False for i in range(len(edges))]
  stack = []

  init_edge = edges[0] # inicializamos el stack con la primera arista
  stack.append((init_edge, [init_edge])) # arista y camino recorrido hasta el nodo actual

  while len(stack) > 0:
    curr_edge, edge_path = stack.pop() # Extraemos una arista del stack
    curr_edge_idx = edges.index(curr_edge)

    if (not visited[curr_edge_idx]):
      visited[curr_edge_idx] = True

    # ...





# 4. MapReduce Algorithm for Squares
