In [1]:
# Compute degreeRatio and PageRank for hosts/domains from CommonCrawl dataset
# PJ 29/8/2017
#
# Launch this notebook with the following command to enable graphFrames:
#  %pyspark --packages graphframes:graphframes:0.5.0-spark2.1-s_2.11
# and add the following two lines to your ~/.bashrc:
#  export PYSPARK_DRIVER_PYTHON=jupyter
#  export PYSPARK_DRIVER_PYTHON_OPTS='notebook'
#
# Pre-requisite to running this notebook - construct a graph using cc-pyspark example code as follows:
#  git clone https://github.com/commoncrawl/cc-pyspark.git
#  cd cc-pyspark
#  spark-submit ./wat_extract_links.py --num_output_partitions 1 --log_level WARN ./input/test_wat.txt links
#  spark-submit ./hostlinks_to_graph.py spark-warehouse/links graph

In [2]:
from graphframes import *
from pyspark.sql.functions import *
import matplotlib.pyplot as plt

In [3]:
# Load the graph produced by the cc-pyspark jobs listed at the top of this notebook:
# one parquet dataset holding the vertices and one holding the edges.
vertices_path = "../cc-pyspark/spark-warehouse/graph_vertices"
edges_path = "../cc-pyspark/spark-warehouse/graph_edges"
df_v = spark.read.parquet(vertices_path)
df_e = spark.read.parquet(edges_path)
# Uncomment to peek at the vertices:
#df_v.show()

In [None]:
# Save links as CSV (e.g. for import into Gephi). Disabled by default.
# NOTE: every row must end with a newline, otherwise all edges are written onto a
# single line; `with` closes the file even if the write fails part-way.
# collect() pulls every link onto the driver, so only use this on small graphs.
#df = sqlContext.read.parquet("../cc-pyspark/spark-warehouse/links")
#str(df)
#with open('web-graph-edges.csv','w') as f:
#    for row in df.collect(): f.write(row.s + "," + row.t + "\n")

In [4]:
# GraphFrame requires the edge endpoints to be called "src" and "dst",
# so project the "s" and "t" columns under those names.
source_column = df_e["s"].alias("src")
target_column = df_e["t"].alias("dst")
df_e2 = df_e.select(source_column, target_column)

In [5]:
# Build the GraphFrame from the vertex DataFrame and the renamed (src/dst) edge DataFrame
g = GraphFrame(v=df_v, e=df_e2)

In [6]:
# The five vertices with the most incoming links
top_in_degrees = g.inDegrees.sort(desc("inDegree"))
top_in_degrees.show(5)

+------+--------+
|    id|inDegree|
+------+--------+
| 77159|   15769|
|144325|   14505|
| 87626|    7341|
|159004|    6510|
| 87654|    5325|
+------+--------+
only showing top 5 rows



In [7]:
# In-degree histogram: for each in-degree value, how many nodes have it (most common first)
nodes_per_in_degree = g.inDegrees.groupBy("inDegree").count()
ind = nodes_per_in_degree.sort(desc("count"))
ind.show(5)
# Last expression of the cell: the number of distinct in-degree values
ind.count()

+--------+------+
|inDegree| count|
+--------+------+
|       1|199751|
|       2| 16031|
|       3|  5043|
|       4|  2395|
|       5|  1222|
+--------+------+
only showing top 5 rows



234

In [8]:
# Display scatter plot of in degrees (disabled)
# TODO: this crashed the kernel. Likely cause (to be confirmed): `ind` is a Spark DataFrame, so
# `ind.inDegree` is a Column expression rather than data, and `ind.count` is the bound
# DataFrame.count method - not the "count" column. matplotlib needs local values, so collect
# the (small) summary to pandas first and index the columns by name:
#ind_pd = ind.toPandas()
#plt.scatter(ind_pd["inDegree"], ind_pd["count"], s=1.0)
#plt.show()

In [33]:
# Compute degree ratio for each node (inDeg/outDeg).
#
# inDegrees / outDegrees only contain nodes that have at least one in-link / out-link,
# hence the outer join. Joining on the column *name* "id" makes Spark emit a single,
# coalesced id column, so a node that appears on only one side keeps its id. (Joining on
# `inDeg.id == outDeg.id` and then dropping `outDeg.id` left id = null for every node
# that has out-links but no in-links.)
#
# A node missing from either side has a null degree there, so its degreeRatio is null.
inDeg = g.inDegrees
outDeg = g.outDegrees
degreeRatio = (
    inDeg.join(outDeg, on="id", how="outer")
         .selectExpr("id", "double(inDegree)/double(outDegree) as degreeRatio")
         .cache()
)
degreeRatio.orderBy("degreeRatio", ascending=False).show(5)

+------+-----------+
|    id|degreeRatio|
+------+-----------+
| 94974|      879.0|
| 87626|      734.1|
| 81213|      502.0|
|117487|      466.0|
|159004|      434.0|
+------+-----------+
only showing top 5 rows



In [34]:
# Join back to the vertices dataframe to create a hubs dataframe (id, degreeRatio, name).
#
# Joining on the column name "id" yields one coalesced id column, so vertices that have
# no degreeRatio row keep their id in the outer join (dropping `df_v.id` after an
# expression join would null it out). The parentheses replace the trailing-backslash
# continuation, which previously ran into a comment-only line.
hubs = (
    degreeRatio.join(df_v, on="id", how="outer")
    # Optional: keep only strong hubs
    # .filter("degreeRatio > 50")
)
hubs.orderBy("degreeRatio", ascending=False).show(10)

+------+------------------+--------------------+
|    id|       degreeRatio|                name|
+------+------------------+--------------------+
| 94974|             879.0|       com.instagram|
| 87626|             734.1|     com.google.plus|
| 81213|             502.0|com.feedburner.feeds|
|117487|             466.0|          com.paypal|
|159004|             434.0|         com.youtube|
|134829|             209.5|     com.stumbleupon|
| 87566|             162.3|          com.google|
| 19624|101.33333333333333|    com.apple.itunes|
|119759|              98.8|       com.pinterest|
| 87625|              81.0|     com.google.play|
+------+------------------+--------------------+
only showing top 10 rows



In [35]:
# Run PageRank over the GraphFrame (20 iterations, reset probability 0.01),
# keep just the id and score of each vertex, and list the five highest-ranked vertices
pr = g.pageRank(maxIter=20, resetProbability=0.01)
pr_v = pr.vertices.select("id", "pagerank")
top_ranked = pr_v.sort(desc("pagerank"))
top_ranked.show(5)

+------+------------------+
|    id|          pagerank|
+------+------------------+
| 77159|1660.5479246392313|
|144325|1481.6075968768785|
| 87626| 750.6779587146389|
| 87654| 642.9487697576785|
|159004|  605.087697326464|
+------+------------------+
only showing top 5 rows



In [36]:
# Extend the hubs DataFrame with each vertex's PageRank score.
#
# Joining on the column name "id" yields one coalesced id column, so a row present on
# only one side of the outer join keeps its id, and no ambiguous `id` reference is needed
# (both inputs ultimately derive from the same vertices DataFrame). The parentheses
# replace the trailing-backslash continuation, which previously ran into a comment-only line.
hubs2 = (
    hubs.join(pr_v, on="id", how="outer")
    # Optional: keep only well-ranked vertices
    # .filter("pagerank > 2")
)
hubs2.orderBy("pagerank", ascending=False).show(25)

+------+------------------+--------------------+------------------+
|    id|       degreeRatio|                name|          pagerank|
+------+------------------+--------------------+------------------+
| 77159|              null|        com.facebook|1660.5479246392313|
|144325|              null|         com.twitter|1481.6075968768785|
| 87626|             734.1|     com.google.plus| 750.6779587146389|
| 87654|              null|com.googleapis.fonts| 642.9487697576785|
|159004|             434.0|         com.youtube|  605.087697326464|
| 94974|             879.0|       com.instagram|357.85294188397035|
| 87566|             162.3|          com.google| 335.5888404982098|
|119759|              98.8|       com.pinterest|317.21378081830125|
|219462|              null|            org.gmpg|  227.710666601869|
|102302|              null|        com.linkedin| 213.5927093247866|
| 87612|              null|     com.google.maps|159.77992983987357|
| 25510|              null|         com.blogger|

In [2]:
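# Note that many sites have a 'null' degree ratio. CommonCrawl has no out-going links for these sites
# (e.g. because their 'robots.txt' rules prevent them from being crawled), so they do not appear in
# outDegrees at all: after the outer join their outDegree is null (not 0) and the ratio is therefore null.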

In [None]:
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt
# Draw a small sample of the host graph with networkx.
#
# Only a bounded number of edges is pulled onto the driver: the full graph has hundreds
# of thousands of nodes, which is far too many for toPandas() + spring_layout().
# limit() returns an arbitrary subset of edges, so the picture is illustrative only.
MAX_PLOT_EDGES = 2000
edges = df_e2.limit(MAX_PLOT_EDGES).toPandas()

# networkx 2.0 renamed from_pandas_dataframe to from_pandas_edgelist; support both.
edgelist_to_graph = getattr(nx, "from_pandas_edgelist", None) or nx.from_pandas_dataframe
# Links are directed (src -> dst), so build a DiGraph; arrows are only drawn for directed graphs.
nxg = edgelist_to_graph(edges, 'src', 'dst', create_using=nx.DiGraph())
pos = nx.spring_layout(nxg)

# Colour each node by its total degree. (`values`, `red_edges` and `black_edges` were
# never defined in this notebook, so the original drawing calls raised NameError.)
# Python builtins such as max/sum are shadowed by `from pyspark.sql.functions import *`,
# so none are used here.
values = [nxg.degree(node) for node in nxg.nodes()]
nx.draw_networkx_nodes(nxg, pos, cmap=plt.get_cmap('jet'), node_color=values, node_size=10)
nx.draw_networkx_edges(nxg, pos, arrows=True)
plt.show()