# <center> Import Job: Process Marvel data in Spark and load it into Neo4j </center>

In [1]:
# Imports

import timeit
from pyspark.sql import DataFrame
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql import types as T
from pyspark.sql import Window
from pyspark import SparkConf, SparkContext, StorageLevel
from pyspark.sql import SparkSession
import os

# imports  - to connect to neo4j DB
from py2neo import Graph,Node,Relationship

# Draw the neo4j data
##  from scripts.vis import draw

#### Start the timer

In [2]:
# Baseline reading for the elapsed-time reports printed by later cells.
# timeit.default_timer() is a performance counter, so the printed number is
# only meaningful as a reference point for differences, not as a clock time.
time_start = timeit.default_timer()
print(f"Start Time: {time_start}")

Start Time: 21579.948899


In [3]:
# Hand the Neo4j Spark connector jar to the JVM that PySpark launches.
# This must be set before the SparkSession is created in the following cell.
os.environ["PYSPARK_SUBMIT_ARGS"] = (
    "--jars /home/jovyan/jars/neo4j-connector-apache-spark_2.12-4.1.2_for_spark_3.jar"
    " pyspark-shell"
)

In [4]:
# Obtain the Spark session (an already-active one is reused by getOrCreate).
spark = (
    SparkSession.builder
    .appName('Marvel Graph Test')
    .getOrCreate()
)

# Report how long session start-up took, measured from the start-timer cell.
print(f"Get Spark Time: {timeit.default_timer() - time_start}")
print("spark version: " + str(spark.version))


22/07/18 21:11:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/07/18 21:11:28 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


Get Spark Time: 2.2029655690021173
spark version: 3.1.2


In [5]:
# Build an edge list (id, appeared_with) from Marvel-Graph.txt.
# Every line is split on single spaces; the first token is taken as the hero
# id and each token of the line becomes one `appeared_with` row for that id.
marvel_graph_df = (
    spark.read.text(r"./data/Marvel-Graph.txt")
    # Tokenise the trimmed line.
    .select(F.split(F.trim(F.col("value")), " ").alias("value_list"))
    # Keep the first token as `id` next to the full token list.
    .select(F.col("value_list").getItem(0).alias("id"), F.col("value_list"))
    # One row per (id, token) pair.
    .select(F.col("id"), F.explode(F.col("value_list")).alias("appeared_with"))
    # Both columns are text at this point; convert them to integers.
    .select(
        F.col("id").cast(T.IntegerType()),
        F.col("appeared_with").cast(T.IntegerType()),
    )
    # The exploded list still contains the hero's own id; drop those pairs.
    .filter(F.col("id") != F.col("appeared_with"))
)

In [6]:
# Preview the first few (id, appeared_with) pairs.
marvel_graph_df.show(n=5)

+----+-------------+
|  id|appeared_with|
+----+-------------+
|5988|          748|
|5988|         1722|
|5988|         3752|
|5988|         4655|
|5988|         5743|
+----+-------------+
only showing top 5 rows



##### <b> Import Marvel hero names </b>

In [7]:
# Column layout applied to Marvel-Names.txt: an integer id followed by a name.
schema_names = T.StructType([
    T.StructField("id", T.IntegerType(), nullable=False),
    T.StructField("name", T.StringType(), nullable=False),
])

# The names file has no header row and uses a single space as the separator.
marvel_names_df = (
    spark.read
    .options(header="false", sep=" ")
    .schema(schema_names)
    .csv(r"./data/Marvel-Names.txt")
)

In [8]:
# Preview the first few (id, name) rows.
marvel_names_df.show(n=5)

+---+--------------------+
| id|                name|
+---+--------------------+
|  1|24-HOUR MAN/EMMANUEL|
|  2|3-D MAN/CHARLES CHAN|
|  3|    4-D MAN/MERCURIO|
|  4|             8-BALL/|
|  5|                   A|
+---+--------------------+
only showing top 5 rows



In [9]:
# Sanity check: number of edge rows in which each hero occurs as `appeared_with`.
(
    marvel_graph_df
    .groupBy("appeared_with")
    .count()
    .show(n=5)
)

+-------------+-----+
|appeared_with|count|
+-------------+-----+
|         1645|   19|
|         4935|  432|
|         1959|  482|
|         2122|   50|
|         2659|   67|
+-------------+-----+
only showing top 5 rows



In [10]:
# Rank heroes by how many edge rows list them in `appeared_with`.
#
# Result columns: id, name, appearances, popularity_index (1 = most frequent).
#
# Why `id` is part of the window ordering: row_number() assigns arbitrary
# positions to rows that tie on `count`. hero_popularity is not cached, so it
# is re-evaluated for every action (the Neo4j node write and the later join),
# and without a tie-breaker the same hero could receive a different
# popularity_index in each evaluation. Ordering by `id` as a secondary key
# makes the ranking repeatable (assuming ids are unique in the names file).
#
# No DataFrame-level orderBy is applied before the window: the window sorts the
# rows itself, and consumers that need a display order sort explicitly.
#
# The window has no partitionBy, so Spark moves all rows to one partition; the
# "No Partition Defined for Window operation" warning in the output comes from
# this.
hero_popularity = (
    marvel_graph_df
    .groupby(F.col("appeared_with"))
    .count()
    # Attach names; heroes without an entry in the names file are dropped.
    .join(marvel_names_df, F.col("appeared_with") == F.col("id"), "inner")
    .drop(F.col("appeared_with"))
    .select(
        F.col("id"),
        F.col("name"),
        F.col("count").alias("appearances"),
        F.row_number()
        .over(Window.orderBy(F.col("count").desc(), F.col("id").asc()))
        .alias("popularity_index"),
    )
)

In [11]:
# Show the top of the ranking, smallest popularity_index first.
hero_popularity.orderBy("popularity_index").show(n=5)

22/07/18 21:11:31 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+----+--------------------+-----------+----------------+
|  id|                name|appearances|popularity_index|
+----+--------------------+-----------+----------------+
| 859|     CAPTAIN AMERICA|       1933|               1|
|5306|SPIDER-MAN/PETER PAR|       1741|               2|
|2664|IRON MAN/TONY STARK |       1528|               3|
|5716|THING/BENJAMIN J. GR|       1426|               4|
|6306|    WOLVERINE/LOGAN |       1394|               5|
+----+--------------------+-----------+----------------+
only showing top 5 rows



##### Write nodes and edges into Neo4j database


In [12]:
# Write one :Hero node per row of hero_popularity, keyed on `id`.
#
# Connection settings are read from the environment so that credentials do not
# have to be stored in the notebook. The fallbacks are the values this notebook
# used originally, so it still runs unchanged when the variables are not set.
neo4j_url = os.environ.get("NEO4J_URL", "bolt://neo4j:7687")
neo4j_user = os.environ.get("NEO4J_USERNAME", "neo4j")
neo4j_password = os.environ.get("NEO4J_PASSWORD", "test")

(
    hero_popularity.write
    .format("org.neo4j.spark.DataSource")
    .option("url", neo4j_url)
    .option("authentication.type", "basic")
    .option("authentication.basic.username", neo4j_user)
    .option("authentication.basic.password", neo4j_password)
    .option("labels", ":Hero")
    # `node.keys` names the column used to match existing nodes in Overwrite mode.
    .option("node.keys", "id")
    .mode("Overwrite")
    .save()
)

22/07/18 21:11:33 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
                                                                                

In [13]:
# Edges whose source hero has popularity_index below 2, i.e. the top-ranked
# hero only (row_number starts at 1). Only the two edge columns are kept.
x = (
    marvel_graph_df
    .join(
        hero_popularity,
        on=marvel_graph_df["id"] == hero_popularity["id"],
        how="inner",
    )
    .filter(F.col("popularity_index") < 2)
    .select(marvel_graph_df["id"], F.col("appeared_with"))
)

In [14]:
# Preview a few of the selected edges.
x.show(n=6)

22/07/18 21:11:51 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+---+-------------+
| id|appeared_with|
+---+-------------+
|859|         1369|
|859|         3198|
|859|          478|
|859|         1365|
|859|          472|
|859|         4397|
+---+-------------+
only showing top 6 rows



In [15]:
# Write one KNOWS relationship per row of `x`, from the :Hero matched by `id`
# to the :Hero matched by `appeared_with`.
#
# Connection settings are read from the environment so that credentials do not
# have to be stored in the notebook. The fallbacks are the values this notebook
# used originally, so it still runs unchanged when the variables are not set.
neo4j_url = os.environ.get("NEO4J_URL", "bolt://neo4j:7687")
neo4j_user = os.environ.get("NEO4J_USERNAME", "neo4j")
neo4j_password = os.environ.get("NEO4J_PASSWORD", "test")

(
    x.write
    .format("org.neo4j.spark.DataSource")
    .option("url", neo4j_url)
    .option("authentication.type", "basic")
    .option("authentication.basic.username", neo4j_user)
    .option("authentication.basic.password", neo4j_password)
    .option("relationship", "KNOWS")
    .option("relationship.save.strategy", "keys")
    # Source node: DataFrame column `id` maps to node property `id`.
    .option("relationship.source.labels", ":Hero")
    .option("relationship.source.node.keys", "id:id")
    .option("relationship.source.save.mode", "Overwrite")
    # Target node: DataFrame column `appeared_with` maps to node property `id`.
    .option("relationship.target.labels", ":Hero")
    .option("relationship.target.node.keys", "appeared_with:id")
    .option("relationship.target.save.mode", "Overwrite")
    .mode("Overwrite")
    .save()
)

22/07/18 21:11:52 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
                                                                                

In [16]:
# Stop the timer and report the time elapsed since the start-timer cell.
time_stop = timeit.default_timer()
print("Run Time: " + str(time_stop - time_start) + "s")


Run Time: 37.78631800199946s
