<a href="https://colab.research.google.com/github/momo54/large_scale_data_management/blob/main/GraphFramesPageRank.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RDF and GraphFrames

GraphFrames is an add-on package for graph processing in Spark. It is an alternative to GraphX that, unlike GraphX, can also be used from Python.


To launch PySpark with GraphFrames from a terminal (adjust the package version to match your Spark and Scala versions):

```
pyspark --packages graphframes:graphframes:0.8.1-spark3.0-s_2.12
```

Valid configurations are listed at:
```
https://spark-packages.org/package/graphframes/graphframes
```


In [1]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
!pip install graphframes

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [14]:
!wget -nc https://repos.spark-packages.org/graphframes/graphframes/0.8.2-spark3.2-s_2.12/graphframes-0.8.2-spark3.2-s_2.12.jar
!cp graphframes-0.8.2-spark3.2-s_2.12.jar /usr/local/lib/python3.6/dist-packages/jars

File ‘graphframes-0.8.2-spark3.2-s_2.12.jar’ already there; not retrieving.



In [13]:
from pyspark.sql import SparkSession
from graphframes import GraphFrame

# Create the session with the GraphFrames package already requested.
# getOrCreate() hands back a running session when there is one, so settings
# such as spark.jars.packages only take effect in the cell that actually
# creates the session; on a top-to-bottom run that is this cell.
spark = (
    SparkSession.builder
    .appName("Basics")
    .config("spark.jars.packages", "graphframes:graphframes:0.8.2-spark3.2-s_2.12")
    .getOrCreate()
)


In [5]:
# Ask for a local session using all available cores, with the GraphFrames
# package on the classpath.
# NOTE(review): getOrCreate() reuses an already-running session if one exists,
# in which case the package setting below may not be applied — confirm.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.jars.packages", "graphframes:graphframes:0.8.2-spark3.2-s_2.12")
    .getOrCreate()
)

In [12]:
# checking that everything works...
# Smoke test: build a three-vertex graph and run a few GraphFrames queries.

# Explicit import instead of `from graphframes import *`, so it is clear where
# GraphFrame comes from and the notebook namespace is not polluted.
from graphframes import GraphFrame

# Create a Vertex DataFrame with unique ID column "id"
v = spark.createDataFrame(
    [
        ("a", "Alice", 34),
        ("b", "Bob", 36),
        ("c", "Charlie", 30),
    ],
    ["id", "name", "age"],
)

# Create an Edge DataFrame with "src" and "dst" columns
e = spark.createDataFrame(
    [
        ("a", "b", "friend"),
        ("b", "c", "follow"),
        ("c", "b", "follow"),
    ],
    ["src", "dst", "relationship"],
)

# Create a GraphFrame
g = GraphFrame(v, e)

# Query: Get in-degree of each vertex.
g.inDegrees.show()

# Query: Count the number of "follow" connections in the graph.
print(g.edges.filter("relationship = 'follow'").count())

# Run PageRank algorithm (reset probability 0.01, 20 iterations), and show results.
results = g.pageRank(resetProbability=0.01, maxIter=20)
results.vertices.select("id", "pagerank").show()


+---+--------+
| id|inDegree|
+---+--------+
|  b|       2|
|  c|       1|
+---+--------+

2
+---+------------------+
| id|          pagerank|
+---+------------------+
|  c|1.8994109890559092|
|  b|1.0905890109440908|
|  a|              0.01|
+---+------------------+



In [19]:
# Print the column names, types and nullability of the edge DataFrame `e`.
e.printSchema()

root
 |-- src: string (nullable = true)
 |-- dst: string (nullable = true)
 |-- relationship: string (nullable = true)



In [15]:
# using a more realistic RDF graph (only triples)
# -nc skips the download when the file already exists; -q silences wget's output.
# NOTE(review): the following cell loads "multi.txt0.txt", not this file —
# confirm which dataset is intended and where multi.txt0.txt comes from.

!wget -nc -q https://raw.githubusercontent.com/momo54/large_scale_data_management/main/small_page_links.nt

In [105]:
from pyspark.sql.types import StructType, StringType

# Each input line is read as three space-separated, nullable string columns:
# subject ("src"), predicate ("relationship") and object ("dst").
schema = (
    StructType()
    .add("src", StringType(), True)
    .add("relationship", StringType(), True)
    .add("dst", StringType(), True)
)

# reading Triples
# managing quads requires Reification :-/
# NOTE(review): "multi.txt0.txt" is not downloaded by any visible cell — confirm
# it is present in the working directory before running.
# More files (e.g. "catalog.txt0.txt") can be appended to the list given to load().
edges = (
    spark.read.format("csv")
    .options(delimiter=" ")
    .schema(schema)
    .load(["multi.txt0.txt"])
)

# Generating vertices from edges: every distinct value that appears as a source
# or as a destination. union() matches columns by position, so the combined
# column is still called "src" until it is renamed to "id".
vertices = (
    edges.select("src")
    .union(edges.select("dst"))
    .distinct()
    .withColumnRenamed("src", "id")
)

# The former mid-cell edges.take(1) / vertices.take(1) calls were dropped: their
# results were discarded, so they only triggered extra Spark jobs.
graph = GraphFrame(vertices, edges)

# Query: Get in-degree of each vertex.
graph.inDegrees.show()

vertices.show(10)

# List the distinct predicates, then the edges labelled "includes".
edges.select("relationship").distinct().show(200)
edges.filter("relationship='includes'").show(200)

  "DataFrame.sql_ctx is an internal property, and will be removed "


+-----------------+--------+
|               id|inDegree|
+-----------------+--------+
|        date_4143|       1|
|     integer_2859|       1|
|      string_1891|       1|
|OtherProduct_1219|       1|
|      string_1643|       1|
|     integer_3390|       1|
|     integer_2419|       1|
|       Gender_728|       1|
|      Country_677|       1|
|        date_4033|       1|
|      string_1471|       1|
|        date_3716|       1|
|      string_2240|       1|
|     integer_3114|       1|
|     integer_2728|       1|
|     integer_3010|       1|
|        date_4096|       2|
|      string_1824|       1|
|     integer_2451|       1|
|      Country_671|       1|
+-----------------+--------+
only showing top 20 rows

+-------------+
|           id|
+-------------+
|   Offer_1264|
|  Review_1362|
|    User_1391|
|Purchase_1342|
|  Review_1364|
|   Offer_1244|
|  Review_1374|
|   Offer_1238|
|   Offer_1326|
|Purchase_1336|
+-------------+
only showing top 10 rows

+----------------+
|    rela

In [106]:
# Restrict the graph to its "includes" edges, then drop every vertex that is
# left without an edge, and display the resulting (src, edge, dst) triplets.
subgraph = (
    graph
    .filterEdges("relationship='includes'")
    .dropIsolatedVertices()
)
subgraph.triplets.show(truncate=200)

+------------+-------------------------------------------------+---------------------------+
|         src|                                             edge|                        dst|
+------------+-------------------------------------------------+---------------------------+
|{Offer_1302}|{Offer_1302, includes, ClassicalMusicConcert_730}|{ClassicalMusicConcert_730}|
|{Offer_1295}|           {Offer_1295, includes, MusicAlbum_731}|           {MusicAlbum_731}|
|{Offer_1297}|           {Offer_1297, includes, MusicAlbum_738}|           {MusicAlbum_738}|
|{Offer_1248}|           {Offer_1248, includes, MusicAlbum_746}|           {MusicAlbum_746}|
|{Offer_1307}|           {Offer_1307, includes, MusicAlbum_750}|           {MusicAlbum_750}|
|{Offer_1285}|           {Offer_1285, includes, MusicAlbum_753}|           {MusicAlbum_753}|
|{Offer_1286}|           {Offer_1286, includes, MusicAlbum_753}|           {MusicAlbum_753}|
|{Offer_1251}|           {Offer_1251, includes, MusicAlbum_754}|      

In [107]:
# Motif query: match every edge (s)-[p]->(o), then keep only the edges whose
# relationship is "offers".
offers = (
    graph
    .find("(s)-[p]->(o)")
    .filter("p.relationship='offers'")
)
offers.show(n=200, truncate=200)

+---------------+-----------------------------------+------------+
|              s|                                  p|           o|
+---------------+-----------------------------------+------------+
|{Retailer_1231}|{Retailer_1231, offers, Offer_1242}|{Offer_1242}|
|{Retailer_1231}|{Retailer_1231, offers, Offer_1242}|{Offer_1242}|
|{Retailer_1231}|{Retailer_1231, offers, Offer_1255}|{Offer_1255}|
|{Retailer_1231}|{Retailer_1231, offers, Offer_1260}|{Offer_1260}|
|{Retailer_1231}|{Retailer_1231, offers, Offer_1268}|{Offer_1268}|
|{Retailer_1231}|{Retailer_1231, offers, Offer_1272}|{Offer_1272}|
|{Retailer_1231}|{Retailer_1231, offers, Offer_1278}|{Offer_1278}|
|{Retailer_1231}|{Retailer_1231, offers, Offer_1282}|{Offer_1282}|
|{Retailer_1231}|{Retailer_1231, offers, Offer_1282}|{Offer_1282}|
|{Retailer_1231}|{Retailer_1231, offers, Offer_1284}|{Offer_1284}|
|{Retailer_1231}|{Retailer_1231, offers, Offer_1296}|{Offer_1296}|
|{Retailer_1231}|{Retailer_1231, offers, Offer_1299}|{Offer_12

In [108]:
# Motif query: match every edge (s)-[p]->(o), then keep only the edges whose
# relationship is "includes".
includes = (
    graph
    .find("(s)-[p]->(o)")
    .filter("p.relationship='includes'")
)
includes.show(n=200, truncate=200)

+------------+-------------------------------------------------+---------------------------+
|           s|                                                p|                          o|
+------------+-------------------------------------------------+---------------------------+
|{Offer_1302}|{Offer_1302, includes, ClassicalMusicConcert_730}|{ClassicalMusicConcert_730}|
|{Offer_1295}|           {Offer_1295, includes, MusicAlbum_731}|           {MusicAlbum_731}|
|{Offer_1297}|           {Offer_1297, includes, MusicAlbum_738}|           {MusicAlbum_738}|
|{Offer_1248}|           {Offer_1248, includes, MusicAlbum_746}|           {MusicAlbum_746}|
|{Offer_1307}|           {Offer_1307, includes, MusicAlbum_750}|           {MusicAlbum_750}|
|{Offer_1285}|           {Offer_1285, includes, MusicAlbum_753}|           {MusicAlbum_753}|
|{Offer_1286}|           {Offer_1286, includes, MusicAlbum_753}|           {MusicAlbum_753}|
|{Offer_1251}|           {Offer_1251, includes, MusicAlbum_754}|      

In [109]:
# Two-hop motif: a -[offers]-> b -[includes]-> c. The bracketed names are only
# motif variables; the two filters are what constrain the edge labels.
chain4 = (
    graph
    .find("(a)-[offers]->(b);(b)-[includes]->(c)")
    .filter("offers.relationship='offers'")
    .filter("includes.relationship='includes'")
)
chain4.show(n=200)

+---------------+--------------------+------------+--------------------+-------------------+
|              a|              offers|           b|            includes|                  c|
+---------------+--------------------+------------+--------------------+-------------------+
|{Retailer_1231}|{Retailer_1231, o...|{Offer_1242}|{Offer_1242, incl...|{OtherProduct_1183}|
|{Retailer_1231}|{Retailer_1231, o...|{Offer_1242}|{Offer_1242, incl...|{OtherProduct_1183}|
|{Retailer_1231}|{Retailer_1231, o...|{Offer_1260}|{Offer_1260, incl...|   {MusicAlbum_759}|
|{Retailer_1231}|{Retailer_1231, o...|{Offer_1272}|{Offer_1272, incl...|{OtherProduct_1133}|
|{Retailer_1231}|{Retailer_1231, o...|{Offer_1304}|{Offer_1304, incl...|   {MusicAlbum_809}|
|{Retailer_1231}|{Retailer_1231, o...|{Offer_1305}|{Offer_1305, incl...|{OtherProduct_1187}|
|{Retailer_1231}|{Retailer_1231, o...|{Offer_1305}|{Offer_1305, incl...|{OtherProduct_1187}|
|{Retailer_1231}|{Retailer_1231, o...|{Offer_1309}|{Offer_1309, incl..