In [3]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Get the active Spark session, or start a new one named "Superheros".
spark = (
    SparkSession.builder
    .appName("Superheros")
    .getOrCreate()
)

23/12/17 09:30:30 WARN Utils: Your hostname, perezs-zeenbook resolves to a loopback address: 127.0.1.1; using 10.120.151.100 instead (on interface wlo1)
23/12/17 09:30:30 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/17 09:30:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Load the graph file as plain text: each input line becomes one row
# in a single string column named "value".
graph_path = "../data/marvel/Marvel+Graph"
graph = spark.read.text(graph_path)

graph.show(5)

+--------------------+
|               value|
+--------------------+
|5988 748 1722 375...|
|5989 4080 4264 44...|
|5982 217 595 1194...|
|5983 1165 3836 43...|
|5980 2731 3712 15...|
+--------------------+
only showing top 5 rows



In [5]:
# Get hero_id and num_of_friends of each row.
# The split pattern is a regular expression; matching runs of whitespace means
# doubled spaces or tabs do not produce empty tokens that would otherwise be
# counted as friends. Leading/trailing whitespace is trimmed first.
ids_per_line = F.split(F.trim(graph["value"]), "\\s+")

friends_per_line = graph.select(
    ids_per_line[0].alias("hero_id"),  # first id on the line
    (F.size(ids_per_line) - 1).alias("num_of_friends"),  # number of ids per line - 1
)

# The same hero_id can start more than one line, so add up the per-line counts.
# Note: an id listed on several of a hero's lines is counted once per line.
num_of_friends = friends_per_line.groupBy("hero_id").agg(
    F.sum("num_of_friends").alias("num_of_friends")
)

num_of_friends.show(5)

+-------+--------------+
|hero_id|num_of_friends|
+-------+--------------+
|    691|             6|
|   1159|            11|
|   3959|           142|
|   1572|            35|
|   2294|            14|
+-------+--------------+
only showing top 5 rows



In [6]:
# Descriptive names for the two columns of the names file.
column_names = ["hero_id", "hero_name"]

# Read the space-separated names file and rename its columns in one step.
heros_info = spark.read.csv("../data/marvel/Marvel+Names", sep=" ").toDF(*column_names)

heros_info.show(5)

+-------+--------------------+
|hero_id|           hero_name|
+-------+--------------------+
|      1|24-HOUR MAN/EMMANUEL|
|      2|3-D MAN/CHARLES CHAN|
|      3|    4-D MAN/MERCURIO|
|      4|             8-BALL/|
|      5|                   A|
+-------+--------------------+
only showing top 5 rows



In [7]:
# Add hero names to num_of_friends.
# Dropping "hero_name" first is a no-op on the first run, but it lets this cell
# be re-run in the same kernel: without it, a second join would add another
# "hero_name" column and the select below would fail on the ambiguous name.
num_of_friends = (
    num_of_friends.drop("hero_name")
    .join(heros_info, "hero_id", "left")  # left join keeps heros without a name entry
    .select("hero_id", "hero_name", "num_of_friends")  # change column order
)

# show heros with the most friends
num_of_friends.sort("num_of_friends", ascending=False).show(5)

+-------+--------------------+--------------+
|hero_id|           hero_name|num_of_friends|
+-------+--------------------+--------------+
|    859|     CAPTAIN AMERICA|          1933|
|   5306|SPIDER-MAN/PETER PAR|          1741|
|   2664|IRON MAN/TONY STARK |          1528|
|   5716|THING/BENJAMIN J. GR|          1426|
|   6306|    WOLVERINE/LOGAN |          1394|
+-------+--------------------+--------------+
only showing top 5 rows



In [8]:
# Order by friend count, smallest first, and display the first five heros.
fewest_first = num_of_friends.orderBy(F.col("num_of_friends").asc())
fewest_first.show(5)

+-------+--------------------+--------------+
|hero_id|           hero_name|num_of_friends|
+-------+--------------------+--------------+
|   3490|MARVEL BOY II/MARTIN|             0|
|   1089|       CLUMSY FOULUP|             0|
|    467|        BERSERKER II|             0|
|    577|              BLARE/|             0|
|   3489|MARVEL BOY/MARTIN BU|             0|
+-------+--------------------+--------------+
only showing top 5 rows



In [9]:
# Show all the heros with no friends.
# show() prints only the first 20 rows by default, so pass the actual number of
# matching rows to make sure every one of them is listed (this runs one extra
# count job before the display).
heros_without_friends = num_of_friends.filter(F.col("num_of_friends") == 0)
heros_without_friends.show(heros_without_friends.count())

+-------+--------------------+--------------+
|hero_id|           hero_name|num_of_friends|
+-------+--------------------+--------------+
|    467|        BERSERKER II|             0|
|    577|              BLARE/|             0|
|   3490|MARVEL BOY II/MARTIN|             0|
|   3489|MARVEL BOY/MARTIN BU|             0|
|   2139|      GIURESCU, RADU|             0|
|   1089|       CLUMSY FOULUP|             0|
|   1841|              FENRIS|             0|
|   4517|              RANDAK|             0|
|   5028|           SHARKSKIN|             0|
|    835|     CALLAHAN, DANNY|             0|
|   1408|         DEATHCHARGE|             0|
|   4784|                RUNE|             0|
|   4945|         SEA LEOPARD|             0|
|   4602|         RED WOLF II|             0|
|   6411|              ZANTOR|             0|
|   3014|JOHNSON, LYNDON BAIN|             0|
|   3298|          LUNATIK II|             0|
|   2911|                KULL|             0|
|   2117|GERVASE, LADY ALYSSA|    

In [10]:
# Stop the Spark session now that the analysis is finished.
spark.stop()