# Graph Building & Analysis

In [11]:
import pandas as pd 
import json
import matplotlib.pyplot as plt 

#Graph network imports
from graphframes import *
from pyspark import *
from pyspark.sql import *
import numpy as np
from pyspark.ml.linalg import *
from pyspark.ml.linalg import *
from pyspark.sql.types import * 
from pyspark.sql.functions import *
from functools import reduce
from pyspark.sql.functions import col, lit, when

from pyspark.sql.functions import udf #user defined function
from pyspark.sql.types import * #Import types == IntegerType, StringType etc.

import nltk

### Read data

In [12]:
spark_df = spark.read.json('big_data_project/unzipped_files/')

In [13]:
spark_df.printSchema()

root
 |-- authors: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- ids: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- name: string (nullable = true)
 |-- doi: string (nullable = true)
 |-- doiUrl: string (nullable = true)
 |-- entities: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- fieldsOfStudy: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- id: string (nullable = true)
 |-- inCitations: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- journalName: string (nullable = true)
 |-- journalPages: string (nullable = true)
 |-- journalVolume: string (nullable = true)
 |-- outCitations: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- paperAbstract: string (nullable = true)
 |-- pdfUrls: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- pmid: string (nullable = t

In [4]:
# mini_df = spark_df.sample(fraction=0.1, seed=3)

In [14]:
spark_df.select('id', 'inCitations', 'outCitations').show()

+--------------------+--------------------+--------------------+
|                  id|         inCitations|        outCitations|
+--------------------+--------------------+--------------------+
|b9c27ac1bb8b3ec9d...|                  []|                  []|
|91cac7b12800a6238...|                  []|                  []|
|412b10a92babf3509...|                  []|                  []|
|0de1cecd2ed49812f...|[8ecaab2a03953fa5...|[cf5cdba6424524ee...|
|1d9fee40f59bf00eb...|                  []|                  []|
|5e33c40c33d9e4753...|                  []|                  []|
|d477d10c48912335d...|                  []|                  []|
|b87f8f3daaa92d3bc...|                  []|                  []|
|598567aa0229a6b31...|[ecec08f5ca15e07d...|                  []|
|b790f30e811445a68...|[2da3df885d1f032d...|                  []|
|69a00b492e0ba6e25...|                  []|                  []|
|6b75f9994d131b2ee...|[ac89cd3e34dbc788...|                  []|
|9eda969296439bf4c...|   

In [24]:
from pyspark.sql.functions import col, count, when
spark_df.select([count(when(col(c).isNull(), c)).alias(c) for c in spark_df.columns]).toPandas().T

Unnamed: 0,0
authors,0
doi,0
doiUrl,0
entities,0
fieldsOfStudy,0
id,0
inCitations,0
journalName,0
journalPages,0
journalVolume,0


### Creating Vertices

In [15]:
vertices = spark_df.select('id', 'title', 'year', 'fieldsOfStudy', 'paperAbstract')

### Creating Edges

In [16]:
from pyspark.sql.functions import explode
from pyspark.sql.functions import lit

#Getting the inCitations id -> (cited) -> article id 
edges = spark_df.select(explode('inCitations').alias('src'), spark_df.id.alias('dst')).withColumn('type', lit('cited'))

#Getting the article id -> (cited) -> outCitations id
edges2 = spark_df.select(spark_df.id.alias('src'), explode('outCitations').alias('dst')).withColumn('type', lit('cited'))

#Union of these two 
edges_total = edges.union(edges2)

In [17]:
edges_total.show(5, truncate=False)

+----------------------------------------+----------------------------------------+-----+
|src                                     |dst                                     |type |
+----------------------------------------+----------------------------------------+-----+
|8ecaab2a03953fa5cf08ab5db2bb49c16d90527d|0de1cecd2ed49812f3a55e4b78edf9ffd292618c|cited|
|8ea68d48d5595730f082f625a2ad759f934411cf|0de1cecd2ed49812f3a55e4b78edf9ffd292618c|cited|
|766a2b4d54b541c737f6a6ec81ad487b56dc83ed|0de1cecd2ed49812f3a55e4b78edf9ffd292618c|cited|
|8479962401362c07f2d6cbff871a11cbd7a4913f|0de1cecd2ed49812f3a55e4b78edf9ffd292618c|cited|
|a0ab0cf6c8fe0b1e50407576ba5b80f0f3f58476|0de1cecd2ed49812f3a55e4b78edf9ffd292618c|cited|
+----------------------------------------+----------------------------------------+-----+
only showing top 5 rows



In [18]:
g = GraphFrame(vertices, edges_total)
## Take a look at the DataFrames
# g.vertices.show()
# g.edges.show()
## Check the number of edges of each vertex
g.degrees.sort("degree", ascending=False).show()

+--------------------+------+
|                  id|degree|
+--------------------+------+
|83730969c0686b1d1...| 38079|
|abd1c342495432171...| 32441|
|13d4c2f76a7c1a4d0...| 20769|
|a512385be058b1e2e...| 17759|
|a411f6a0e6473137a...| 17398|
|87f40e6f3022adbc1...| 12909|
|73679f1ba00de9e73...| 11435|
|e6dd0e6cf076b1207...| 11408|
|d2860a370a0386c57...| 10815|
|6a17ebeeb80cd696b...|  9429|
|6da2a9ffa23a59523...|  9200|
|10d6778bc45aebcd5...|  9121|
|490020c0d4fa1eb85...|  9047|
|fc448a7db5a2fac24...|  8676|
|54205667c1f65a320...|  8613|
|a42d6065d0b1a31c5...|  8255|
|cc90910b6e31fe44c...|  8237|
|6889e5a22598521d8...|  7679|
|0e68beebb4c7ccbd9...|  7507|
|9f649b234f5ebf207...|  7350|
+--------------------+------+
only showing top 20 rows



#### indegree = The number of edges directed into a vertex in a directed graph.

In [19]:
g.inDegrees.filter("inDegree >= 10").sort("inDegree", ascending=False).show()

+--------------------+--------+
|                  id|inDegree|
+--------------------+--------+
|83730969c0686b1d1...|   38079|
|abd1c342495432171...|   32420|
|13d4c2f76a7c1a4d0...|   20763|
|a512385be058b1e2e...|   17634|
|a411f6a0e6473137a...|   17370|
|87f40e6f3022adbc1...|   12898|
|73679f1ba00de9e73...|   11428|
|e6dd0e6cf076b1207...|   11276|
|d2860a370a0386c57...|   10815|
|6a17ebeeb80cd696b...|    9429|
|6da2a9ffa23a59523...|    9197|
|10d6778bc45aebcd5...|    9120|
|490020c0d4fa1eb85...|    9028|
|fc448a7db5a2fac24...|    8676|
|54205667c1f65a320...|    8611|
|cc90910b6e31fe44c...|    8231|
|a42d6065d0b1a31c5...|    8182|
|6889e5a22598521d8...|    7656|
|0e68beebb4c7ccbd9...|    7486|
|9f649b234f5ebf207...|    7350|
+--------------------+--------+
only showing top 20 rows



In [20]:
top_indegree = g.inDegrees.filter("inDegree >= 10").sort("inDegree", ascending=False).select('id').take(1)[0][0]

In [21]:
top_node = g.vertices.filter('id == "{}"'.format(top_indegree)).toPandas()
top_node

Unnamed: 0,id,title,year,fieldsOfStudy,paperAbstract
0,83730969c0686b1d185bcca39f9b5743fa53ebc1,R: A language and environment for statistical ...,2014,[Computer Science],Copyright (©) 1999–2012 R Foundation for Stati...


#### Creating checkpoint directory in HDFS

In [22]:
sc.setCheckpointDir('graphframes_cps')

## Looking at PageRank scores of nodes

In [25]:
g.pageRank?

In [23]:
pr = g.pageRank(resetProbability=0.15, tol=0.2,)

Py4JJavaError: An error occurred while calling o184.run.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 14 in stage 18.0 failed 4 times, most recent failure: Lost task 14.3 in stage 18.0 (TID 3298, hd05.rcc.local, executor 303): java.lang.ClassNotFoundException: org.graphframes.GraphFrame$$anonfun$5
	at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
	at java.lang.Class.forName0(Native Method)
	at java.lang.Class.forName(Class.java:348)
	at org.apache.spark.serializer.JavaDeserializationStream$$anon$1.resolveClass(JavaSerializer.scala:67)
	at java.io.ObjectInputStream.readNonProxyDesc(ObjectInputStream.java:1868)
	at java.io.ObjectInputStream.readClassDesc(ObjectInputStream.java:1751)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2042)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1573)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2287)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2211)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2069)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1573)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2287)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2211)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2069)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1573)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2287)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2211)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2069)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1573)
	at java.io.ObjectInputStream.readObject(ObjectInputStream.java:431)
	at org.apache.spark.serializer.JavaDeserializationStream.readObject(JavaSerializer.scala:75)
	at org.apache.spark.serializer.JavaSerializerInstance.deserialize(JavaSerializer.scala:114)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:88)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
	at org.apache.spark.scheduler.Task.run(Task.scala:121)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$11.apply(Executor.scala:407)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1408)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:413)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1890)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1878)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1877)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1877)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:929)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:929)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:929)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2111)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2049)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:740)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2081)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2178)
	at org.apache.spark.rdd.RDD$$anonfun$reduce$1.apply(RDD.scala:1035)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.reduce(RDD.scala:1017)
	at org.apache.spark.graphx.impl.VertexRDDImpl.count(VertexRDDImpl.scala:90)
	at org.apache.spark.graphx.Pregel$.apply(Pregel.scala:140)
	at org.apache.spark.graphx.lib.PageRank$.runUntilConvergenceWithOptions(PageRank.scala:355)
	at org.graphframes.lib.PageRank$.runUntilConvergence(PageRank.scala:152)
	at org.graphframes.lib.PageRank.run(PageRank.scala:102)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ClassNotFoundException: org.graphframes.GraphFrame$$anonfun$5
	at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
	at java.lang.Class.forName0(Native Method)
	at java.lang.Class.forName(Class.java:348)
	at org.apache.spark.serializer.JavaDeserializationStream$$anon$1.resolveClass(JavaSerializer.scala:67)
	at java.io.ObjectInputStream.readNonProxyDesc(ObjectInputStream.java:1868)
	at java.io.ObjectInputStream.readClassDesc(ObjectInputStream.java:1751)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2042)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1573)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2287)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2211)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2069)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1573)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2287)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2211)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2069)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1573)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2287)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2211)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2069)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1573)
	at java.io.ObjectInputStream.readObject(ObjectInputStream.java:431)
	at org.apache.spark.serializer.JavaDeserializationStream.readObject(JavaSerializer.scala:75)
	at org.apache.spark.serializer.JavaSerializerInstance.deserialize(JavaSerializer.scala:114)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:88)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
	at org.apache.spark.scheduler.Task.run(Task.scala:121)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$11.apply(Executor.scala:407)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1408)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:413)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


In [None]:
display(results.vertices)
## look at the pagerank score for every vertex
results.vertices.show()
## look at the weight of every edge
results.edges.show()

### Matrix 