# Graph Building & Analysis

In [1]:
import pandas as pd 
import json
import matplotlib.pyplot as plt 

#Graph network imports
from graphframes import *
from pyspark import *
from pyspark.sql import *
import numpy as np
from pyspark.ml.linalg import *
from pyspark.ml.linalg import *
from pyspark.sql.types import * 
from pyspark.sql.functions import *
from functools import reduce
from pyspark.sql.functions import col, lit, when

from pyspark.sql.functions import udf #user defined function
from pyspark.sql.types import * #Import types == IntegerType, StringType etc.

import nltk

### Read data

In [2]:
spark_df = spark.read.parquet('big_data_project/encodedData/')

In [3]:
spark_df.printSchema()

root
 |-- fieldsOfStudy: string (nullable = true)
 |-- id: string (nullable = true)
 |-- inCitations: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- journalName: string (nullable = true)
 |-- outCitations: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- paperAbstract: string (nullable = true)
 |-- sources: string (nullable = true)
 |-- title: string (nullable = true)
 |-- venue: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- inCitations_count: integer (nullable = true)
 |-- outCitations_count: integer (nullable = true)
 |-- abstract_wcount: integer (nullable = true)
 |-- title_wcount: integer (nullable = true)
 |-- abstract_tfidf: vector (nullable = true)
 |-- title_tfidf: vector (nullable = true)
 |-- title_wordVectors: vector (nullable = true)
 |-- journal_name: string (nullable = true)
 |-- SJR: float (nullable = true)
 |-- author_ids: array (nullable = true)
 |    |-- element: array (containsNull = true)

In [4]:
#spark_df = spark_df.sample(fraction=0.1, seed=3)

In [5]:
spark_df.select('id', 'inCitations', 'outCitations').show()

+--------------------+--------------------+--------------------+
|                  id|         inCitations|        outCitations|
+--------------------+--------------------+--------------------+
|44fdd24c628c1c9a3...|                  []|                  []|
|364c9f2447af7a9d7...|                  []|                  []|
|bce22d50b71a488f8...|                  []|                  []|
|725082372e368340c...|                  []|                  []|
|8519fa1b1d7feafc0...|                  []|                  []|
|fd811c10681e57317...|[71254b0089881891...|                  []|
|874fa6458b74adb18...|[19cabf48d4c305f2...|                  []|
|3ee134975b8166725...|[13bbf248f4da3fd9...|[e88006b448d419f5...|
|1d3216b16d0778e34...|                  []|                  []|
|50606afeb9ab4d0de...|[5a82f05a306697f8...|[e9ace3ca5032b062...|
|8edac1b3a7e994087...|[0532ab999b9af0ac...|[4d98aca81c463553...|
|c217e8aee87c2e2f9...|                  []|                  []|
|0f5c41eb1786f2053...|[99

In [6]:
# from pyspark.sql.functions import col, count, when
# spark_df.select([count(when(col(c).isNull(), c)).alias(c) for c in spark_df.columns]).toPandas().T

### Creating Vertices

In [7]:
vertices = spark_df#.select('id', 'title', 'year', 'fieldsOfStudy', 'paperAbstract')

### Creating Edges

In [8]:
from pyspark.sql.functions import explode
from pyspark.sql.functions import lit

#Getting the inCitations id -> (cited) -> article id 
edges = spark_df.select(explode('inCitations').alias('src'), spark_df.id.alias('dst')).withColumn('type', lit('cited'))

#Getting the article id -> (cited) -> outCitations id
edges2 = spark_df.select(spark_df.id.alias('src'), explode('outCitations').alias('dst')).withColumn('type', lit('cited'))

#Union of these two 
edges_total = edges.union(edges2)

In [10]:
edges_total.show(5, truncate=False)

+----------------------------------------+----------------------------------------+-----+
|src                                     |dst                                     |type |
+----------------------------------------+----------------------------------------+-----+
|71254b0089881891dee33916e9582a6b1c7f6d83|fd811c10681e57317ae042036b424f6ca2e4b087|cited|
|82bc9f0ba6a586e6e8bd976c8f16c4ee9461af9b|fd811c10681e57317ae042036b424f6ca2e4b087|cited|
|68fe4a464eefac315bcacf933e8d5c093cf06d5a|fd811c10681e57317ae042036b424f6ca2e4b087|cited|
|49e9959fe6c1a7b33a079ab77f467f0b0c5f4dd3|fd811c10681e57317ae042036b424f6ca2e4b087|cited|
|19cabf48d4c305f2e399acee297e5d4ab5a784ea|874fa6458b74adb187e0e06e583108863a01af9d|cited|
+----------------------------------------+----------------------------------------+-----+
only showing top 5 rows



In [11]:
g = GraphFrame(vertices, edges_total)
## Take a look at the DataFrames
# g.vertices.show()
# g.edges.show()
## Check the number of edges of each vertex
g.degrees.sort("degree", ascending=False).show()

+--------------------+------+
|                  id|degree|
+--------------------+------+
|83730969c0686b1d1...| 40020|
|abd1c342495432171...| 34265|
|273dfbcb68080251f...| 29275|
|13d4c2f76a7c1a4d0...| 21976|
|b4f847bc5c6aacc6e...| 18953|
|a512385be058b1e2e...| 18792|
|a411f6a0e6473137a...| 18297|
|b83217d6bb8419de3...| 18033|
|2d6f573c36c5e2153...| 16823|
|083e00c042d2e43e6...| 15676|
|87f40e6f3022adbc1...| 13666|
|73679f1ba00de9e73...| 12075|
|e6dd0e6cf076b1207...| 11920|
|83271813204b63f9d...| 11217|
|3524cdf7cf8344e7e...| 11075|
|b9544a1bf4b02c664...| 11041|
|a6f1dfcc44277d4cf...| 10520|
|6fca260e9a3c37e24...| 10071|
|6a17ebeeb80cd696b...|  9955|
|f444aecb9a6cc1219...|  9799|
+--------------------+------+
only showing top 20 rows



#### indegree = The number of edges directed into a vertex in a directed graph.

In [12]:
g.inDegrees.filter("inDegree >= 10").sort("inDegree", ascending=False).show()

+--------------------+--------+
|                  id|inDegree|
+--------------------+--------+
|83730969c0686b1d1...|   40020|
|abd1c342495432171...|   34244|
|273dfbcb68080251f...|   29251|
|13d4c2f76a7c1a4d0...|   21970|
|b4f847bc5c6aacc6e...|   18953|
|a512385be058b1e2e...|   18663|
|a411f6a0e6473137a...|   18269|
|b83217d6bb8419de3...|   18032|
|2d6f573c36c5e2153...|   16815|
|083e00c042d2e43e6...|   15676|
|87f40e6f3022adbc1...|   13654|
|73679f1ba00de9e73...|   12068|
|e6dd0e6cf076b1207...|   11785|
|83271813204b63f9d...|   11217|
|3524cdf7cf8344e7e...|   11071|
|b9544a1bf4b02c664...|   11021|
|a6f1dfcc44277d4cf...|   10498|
|6fca260e9a3c37e24...|   10071|
|6a17ebeeb80cd696b...|    9955|
|a06547951c97b2a32...|    9788|
+--------------------+--------+
only showing top 20 rows



## Let's look at the top 10 papers by inDegrees (i.e. number of times cited)

In [13]:
#Get the top 10 ids by inDegree
top_10 = g.inDegrees.filter("inDegree >= 10").sort("inDegree", ascending=False).select('id').take(10)
#Isolate the ids
top_10_ids = [i.id for i in top_10]
#Filter and get the df in pandas
top_10_pdf = spark_df.where(spark_df.id.isin(top_10_ids)).toPandas()

In [14]:
pd.set_option('display.max_colwidth', 100)
top_10_pdf[['title', 'fieldsOfStudy', 'year', 'paperAbstract', 'author_names']]

Unnamed: 0,title,fieldsOfStudy,year,paperAbstract,author_names
0,r a language and environment for statistical computing,Computer Science,2014,copyright r foundation for statistical computing permission is granted to make and distribute ...,[R Core Team]
1,imagenet classification with deep convolutional neural networks,Computer Science,2012,we trained a large deep convolutional neural network to classify the million highresolution ima...,"[Alex Krizhevsky, Ilya Sutskever, Geoffrey E. Hinton]"
2,statistical methods for assessing agreement between two methods of clinical measurement,Medicine,1986,in clinical measurement comparison of a new measurement technique with an established one is oft...,"[J Martin Bland, Douglas G. Altman]"
3,evaluating structural equation models with unobservable variables and measurement error,Business,1981,the statistical tests used in the analysis of structural equation models with unobservable varia...,"[Claes Fornell, David F. Larcker]"
4,gene ontology tool for the unification of biology,"Biology, Medicine",2000,genomic sequencing has made it clear that a large fraction of the genes specifying the core biol...,"[Michael Ashburner, Catherine A. Ball, Judith A. Blake, David Botstein, Heather Butler, J. Micha..."
5,basics of qualitative research techniques and procedures for developing grounded theory,Psychology,2010,in the third edition of the classic text basics of qualitative research techniques and procedure...,[Brad Wuetherick]
6,mega molecular evolutionary genetics analysis mega software version,"Medicine, Biology",2007,we announce the release of the fourth version of mega software which expands on the existing fac...,"[Koichiro Tamura, Joel Dudley, Masatoshi Nei, Sudhir Kumar]"
7,global cancer statistics,Medicine,1999,the global burden of cancer continues to increase largely because of the aging and growth of the...,"[Ahmedin Jemal, Freddie Bray, Melissa M. Center, Jacques Ferlay, Elizabeth Ward, David Forman]"
8,random forests,"Mathematics, Computer Science",2001,random forests are a combination of tree predictors such that each tree depends on the values of...,[Leo Breiman]
9,libsvm a library for support vector machines,Computer Science,2005,libsvm is a library for support vector machines svms we have been actively developing this packa...,"[Chih-Chung Chang, Chih-Jen Lin]"


In [15]:
# g.vertices.filter('id == "{}"'.format(top_indegree)).toPandas()
# top_indegree = g.inDegrees.filter("inDegree >= 10").sort("inDegree", ascending=False).select('id').take(1)[0][0]
# top_node = g.vertices.filter('id == "{}"'.format(top_indegree)).toPandas()
# top_node

### Motifs finding

In [16]:
# Search for pairs of vertices with edges in both directions between them.
motifs = g.find("(a)-[e]->(b); (b)-[e2]->(a)")
motifs.show()

+--------------------+--------------------+--------------------+--------------------+
|                   a|                   e|                   b|                  e2|
+--------------------+--------------------+--------------------+--------------------+
|[Engineering, d37...|[d375edc5062c7995...|[Engineering, 061...|[061db65af85b7d12...|
|[Engineering, d37...|[d375edc5062c7995...|[Engineering, 061...|[061db65af85b7d12...|
|[Engineering, d37...|[d375edc5062c7995...|[Engineering, 061...|[061db65af85b7d12...|
|[Engineering, d37...|[d375edc5062c7995...|[Engineering, 061...|[061db65af85b7d12...|
|[Medicine, Biolog...|[e79a3ffbe5c943cf...|[Medicine, Biolog...|[06651807972ddbaf...|
|[Medicine, Biolog...|[e79a3ffbe5c943cf...|[Medicine, Biolog...|[06651807972ddbaf...|
|[Medicine, Biolog...|[e79a3ffbe5c943cf...|[Medicine, Biolog...|[06651807972ddbaf...|
|[Medicine, Biolog...|[e79a3ffbe5c943cf...|[Medicine, Biolog...|[06651807972ddbaf...|
|[Engineering, 476...|[4760e18f126d21e5...|[Engineerin

In [103]:
motifs.count()

16240

In [104]:
loop_motif = motifs.toPandas()

In [None]:
def RDD_to_dictionaries(row):
    df_pd_items = row.asDict()
    return df_pd_items

# Convert the RDD rows into dictionaries
a = loop_motif.loc[:,'a'].apply(RDD_to_dictionaries)
b = loop_motif.loc[:,'b'].apply(RDD_to_dictionaries)

#Create dataframe of dictionaries
motif_pdf =pd.DataFrame({'a':a, 'b':b})

In [122]:
#Explode and select relevant columns
a_df = pd.DataFrame(motif_pdf['a'].tolist())[['title', 'fieldsOfStudy', 'year', 'author_names', 'inCitations_count']]
b_df = pd.DataFrame(motif_pdf['b'].tolist())[['title', 'fieldsOfStudy', 'year', 'author_names', 'inCitations_count']]
a_df = a_df.add_suffix('_a')
b_df = b_df.add_suffix('_b')

In [124]:
b_df.columns

Index(['title_b', 'fieldsOfStudy_b', 'year_b', 'author_names_b',
       'inCitations_count_b'],
      dtype='object')

In [125]:
#Concat and drop duplicates 
bi_references = pd.concat([a_df, b_df], axis = 1)\
.drop_duplicates(subset = ['title_a', 'fieldsOfStudy_a', 'year_a','title_b', 'fieldsOfStudy_b', 'year_b']) # keys=['A', 'B']


In [127]:
bi_references.sort_values(by=['inCitations_count_a', 'inCitations_count_b'], ascending = False)

Unnamed: 0,title_a,fieldsOfStudy_a,year_a,author_names_a,inCitations_count_a,title_b,fieldsOfStudy_b,year_b,author_names_b,inCitations_count_b
276,principles of fluorescence spectroscopy,Chemistry,1983,[Joseph R. Lakowicz],7745,mechanisms of tryptophan fluorescence shifts in proteins,"Medicine, Chemistry",2001,"[Jean-Marc Vivian, Patrik R. Callis]",328
8484,multiplex genome engineering using crisprcas systems,"Biology, Medicine",2013,"[Lê Chí Công, Fei Ann Ran, David Cox, Shuailiang Lin, Robert P. J. Barretto, Naomi Habib, Patric...",4660,rnaguided human genome engineering via cas,"Medicine, Biology",2013,"[Prashant R. Mali, Luhan Yang, Kevin M Esvelt, John Aach, Marc Guell, James E. DiCarlo, Julie E....",2937
7648,geodesic active contours,Computer Science,1995,"[Vicent Caselles, Ron Kimmel, Guillermo Sapiro]",3991,gradient flows and geometric active contour models,Computer Science,1995,"[Satyanad Kichenassamy, Arun Kumar, Peter J. Olver, Allen R. Tannenbaum, Anthony J. Yezzi]",536
9724,rnaguided human genome engineering via cas,"Medicine, Biology",2013,"[Prashant R. Mali, Luhan Yang, Kevin M Esvelt, John Aach, Marc Guell, James E. DiCarlo, Julie E....",2937,multiplex genome engineering using crisprcas systems,"Biology, Medicine",2013,"[Lê Chí Công, Fei Ann Ran, David Cox, Shuailiang Lin, Robert P. J. Barretto, Naomi Habib, Patric...",4660
2856,longterm recurrent convolutional networks for visual recognition and description,"Computer Science, Medicine",2014,"[Jeff Donahue, Lisa Anne Hendricks, Marcus Rohrbach, Subhashini Venugopalan, Sergio Guadarrama, ...",1710,translating videos to natural language using deep recurrent neural networks,Computer Science,2014,"[Subhashini Venugopalan, Huijuan Xu, Jeff Donahue, Marcus Rohrbach, Raymond J. Mooney, Kate Saenko]",480
13560,longterm recurrent convolutional networks for visual recognition and description,"Computer Science, Medicine",2014,"[Jeff Donahue, Lisa Anne Hendricks, Marcus Rohrbach, Subhashini Venugopalan, Sergio Guadarrama, ...",1710,exploring nearest neighbor approaches for image captioning,Computer Science,2015,"[Jacob Devlin, Saurabh Gupta, Ross B. Girshick, Margaret Mitchell, C. Lawrence Zitnick]",104
5772,impotence and its medical and psychosocial correlates results of the massachusetts male aging study,Medicine,2002,[Jonathan C Levy],1603,measurement of erectile dysfunction in populationbased studies the use of a single question self...,Medicine,2000,"[C Derby, A. B. Araujo, C B Johannes, HA Feldman, JB McKinlay]",64
5060,modes of resistance to antiangiogenic therapy,"Medicine, Biology",2008,"[Gabriele Bergers, Douglas Hanahan]",1469,the role of myeloid cells in the promotion of tumour angiogenesis,"Medicine, Biology",2008,"[Craig Murdoch, Munitta Muthana, Seth B Coffelt, Claire E. Lewis]",697
6840,ecological momentary assessment,"Psychology, Medicine",2008,"[Saul Shiffman, Arthur A. Stone, Michael R. Hufford]",1316,paper and plastic in daily diary research comment on green rafaeli bolger shrout and reis,"Psychology, Medicine",2006,"[Howard Tennen, Glenn Affleck, James C. Coyne, Randy J. Larsen, Anita Delongis]",53
1872,the bethesda system terminology for reporting results of cervical cytology,Medicine,2002,"[Diane Solomon, Diane Davis Davey, Robert J. Kurman, Ann Moriarty, Dennis O'connor, Marianne U. ...",1307,consensus guidelines for the management of women with cervical cytological abnormalities,Medicine,2002,"[Thomas C. Wright, John Thomas Cox, L Stewart Massad, Leo B. Twiggs, Edward John Wilkinson]",415


In [128]:
#bi_references.to_csv('bireference.csv')

## Fields of study graph

In [129]:
spark.conf.set("spark.sql.crossJoin.enabled", "true")

In [130]:
edsges_total_FOS = edges_total.join(spark_df.select('id', col('fieldsOfStudy').alias('fieldsOfStudy_src')), edges_total.src == spark_df.id, how = 'outer').join(spark_df.select('id', col('fieldsOfStudy').alias('fieldsOfStudy_dst')), edges_total.dst == spark_df.id, how = 'outer')

In [131]:
edsges_total_FOS.show(5)

Py4JJavaError: An error occurred while calling o190.showString.
: java.lang.OutOfMemoryError: Not enough memory to build and broadcast the table to all worker nodes. As a workaround, you can either disable broadcast by setting spark.sql.autoBroadcastJoinThreshold to -1 or increase the spark driver memory by setting spark.driver.memory to a higher value
	at org.apache.spark.sql.execution.exchange.BroadcastExchangeExec$$anonfun$relationFuture$1$$anonfun$apply$1.apply(BroadcastExchangeExec.scala:122)
	at org.apache.spark.sql.execution.exchange.BroadcastExchangeExec$$anonfun$relationFuture$1$$anonfun$apply$1.apply(BroadcastExchangeExec.scala:76)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withExecutionId$1.apply(SQLExecution.scala:101)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withExecutionId(SQLExecution.scala:98)
	at org.apache.spark.sql.execution.exchange.BroadcastExchangeExec$$anonfun$relationFuture$1.apply(BroadcastExchangeExec.scala:75)
	at org.apache.spark.sql.execution.exchange.BroadcastExchangeExec$$anonfun$relationFuture$1.apply(BroadcastExchangeExec.scala:75)
	at scala.concurrent.impl.Future$PromiseCompletingRunnable.liftedTree1$1(Future.scala:24)
	at scala.concurrent.impl.Future$PromiseCompletingRunnable.run(Future.scala:24)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)


#### Creating checkpoint directory in HDFS

In [57]:
sc.setCheckpointDir('graphframes_cps')

In [None]:
result = g.connectedComponents()
result.select("id", "component").orderBy("component").show()

In [None]:
result.save()

### Communities 

In [31]:
communities = g.labelPropagation(maxIter=5)
communities.persist().show(10)

Py4JJavaError: An error occurred while calling o314.run.
: org.apache.spark.SparkException: Job aborted due to stage failure: 
Aborting TaskSet 61.0 because task 2 (partition 2)
cannot run anywhere due to node and executor blacklist.
Most recent failure:
Lost task 2.1 in stage 61.0 (TID 13235, hd02.rcc.local, executor 37): java.lang.ClassCastException: cannot assign instance of scala.collection.immutable.List$SerializationProxy to field org.apache.spark.rdd.RDD.org$apache$spark$rdd$RDD$$dependencies_ of type scala.collection.Seq in instance of org.apache.spark.rdd.MapPartitionsRDD
	at java.io.ObjectStreamClass$FieldReflector.setObjFieldValues(ObjectStreamClass.java:2287)
	at java.io.ObjectStreamClass.setObjFieldValues(ObjectStreamClass.java:1417)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2293)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2211)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2069)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1573)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2287)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2211)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2069)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1573)
	at java.io.ObjectInputStream.readObject(ObjectInputStream.java:431)
	at scala.collection.immutable.List$SerializationProxy.readObject(List.scala:490)
	at sun.reflect.GeneratedMethodAccessor2.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at java.io.ObjectStreamClass.invokeReadObject(ObjectStreamClass.java:1170)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2178)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2069)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1573)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2287)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2211)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2069)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1573)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2287)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2211)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2069)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1573)
	at java.io.ObjectInputStream.readObject(ObjectInputStream.java:431)
	at org.apache.spark.serializer.JavaDeserializationStream.readObject(JavaSerializer.scala:75)
	at org.apache.spark.serializer.JavaSerializerInstance.deserialize(JavaSerializer.scala:114)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:88)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
	at org.apache.spark.scheduler.Task.run(Task.scala:121)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$11.apply(Executor.scala:407)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1408)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:413)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)


Blacklisting behavior can be configured via spark.blacklist.*.

	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1890)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1878)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1877)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1877)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:929)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:929)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:929)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2111)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2049)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:740)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2081)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2178)
	at org.apache.spark.rdd.RDD$$anonfun$reduce$1.apply(RDD.scala:1035)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.reduce(RDD.scala:1017)
	at org.apache.spark.graphx.impl.VertexRDDImpl.count(VertexRDDImpl.scala:90)
	at org.apache.spark.graphx.Pregel$.apply(Pregel.scala:140)
	at org.apache.spark.graphx.lib.LabelPropagation$.run(LabelPropagation.scala:64)
	at org.graphframes.lib.LabelPropagation$.org$graphframes$lib$LabelPropagation$$run(LabelPropagation.scala:62)
	at org.graphframes.lib.LabelPropagation.run(LabelPropagation.scala:53)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


## TRIANGLE COUNT

In [15]:
#Computes the number of triangles passing through each vertex.
g.triangleCount().orderBy("count", ascending=False).show()

Py4JJavaError: An error occurred while calling o145.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 11 in stage 26.0 failed 4 times, most recent failure: Lost task 11.3 in stage 26.0 (TID 6719, hd02.rcc.local, executor 9): ExecutorLostFailure (executor 9 exited caused by one of the running tasks) Reason: Container marked as failed: container_e67_1577383759214_3503_01_000040 on host: hd02.rcc.local. Exit status: 143. Diagnostics: [2020-03-03 21:57:08.427]Container killed on request. Exit code is 143
[2020-03-03 21:57:08.427]Container exited with a non-zero exit code 143. 
[2020-03-03 21:57:08.427]Killed by external signal

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1890)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1878)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1877)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1877)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:929)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:929)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:929)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2111)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2049)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:740)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2081)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2178)
	at org.apache.spark.rdd.RDD$$anonfun$reduce$1.apply(RDD.scala:1035)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.reduce(RDD.scala:1017)
	at org.apache.spark.rdd.RDD$$anonfun$takeOrdered$1.apply(RDD.scala:1439)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.takeOrdered(RDD.scala:1426)
	at org.apache.spark.sql.execution.TakeOrderedAndProjectExec.executeCollect(limit.scala:136)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3383)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2544)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2544)
	at org.apache.spark.sql.Dataset$$anonfun$53.apply(Dataset.scala:3364)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3363)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2544)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2758)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:254)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:291)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


## Looking at PageRank scores of nodes

In [25]:
g.pageRank?

In [23]:
pr = g.pageRank(resetProbability=0.15, tol=0.2,)

Py4JJavaError: An error occurred while calling o184.run.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 14 in stage 18.0 failed 4 times, most recent failure: Lost task 14.3 in stage 18.0 (TID 3298, hd05.rcc.local, executor 303): java.lang.ClassNotFoundException: org.graphframes.GraphFrame$$anonfun$5
	at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
	at java.lang.Class.forName0(Native Method)
	at java.lang.Class.forName(Class.java:348)
	at org.apache.spark.serializer.JavaDeserializationStream$$anon$1.resolveClass(JavaSerializer.scala:67)
	at java.io.ObjectInputStream.readNonProxyDesc(ObjectInputStream.java:1868)
	at java.io.ObjectInputStream.readClassDesc(ObjectInputStream.java:1751)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2042)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1573)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2287)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2211)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2069)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1573)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2287)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2211)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2069)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1573)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2287)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2211)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2069)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1573)
	at java.io.ObjectInputStream.readObject(ObjectInputStream.java:431)
	at org.apache.spark.serializer.JavaDeserializationStream.readObject(JavaSerializer.scala:75)
	at org.apache.spark.serializer.JavaSerializerInstance.deserialize(JavaSerializer.scala:114)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:88)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
	at org.apache.spark.scheduler.Task.run(Task.scala:121)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$11.apply(Executor.scala:407)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1408)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:413)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1890)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1878)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1877)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1877)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:929)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:929)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:929)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2111)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2049)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:740)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2081)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2178)
	at org.apache.spark.rdd.RDD$$anonfun$reduce$1.apply(RDD.scala:1035)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.reduce(RDD.scala:1017)
	at org.apache.spark.graphx.impl.VertexRDDImpl.count(VertexRDDImpl.scala:90)
	at org.apache.spark.graphx.Pregel$.apply(Pregel.scala:140)
	at org.apache.spark.graphx.lib.PageRank$.runUntilConvergenceWithOptions(PageRank.scala:355)
	at org.graphframes.lib.PageRank$.runUntilConvergence(PageRank.scala:152)
	at org.graphframes.lib.PageRank.run(PageRank.scala:102)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ClassNotFoundException: org.graphframes.GraphFrame$$anonfun$5
	at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
	at java.lang.Class.forName0(Native Method)
	at java.lang.Class.forName(Class.java:348)
	at org.apache.spark.serializer.JavaDeserializationStream$$anon$1.resolveClass(JavaSerializer.scala:67)
	at java.io.ObjectInputStream.readNonProxyDesc(ObjectInputStream.java:1868)
	at java.io.ObjectInputStream.readClassDesc(ObjectInputStream.java:1751)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2042)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1573)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2287)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2211)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2069)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1573)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2287)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2211)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2069)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1573)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2287)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2211)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2069)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1573)
	at java.io.ObjectInputStream.readObject(ObjectInputStream.java:431)
	at org.apache.spark.serializer.JavaDeserializationStream.readObject(JavaSerializer.scala:75)
	at org.apache.spark.serializer.JavaSerializerInstance.deserialize(JavaSerializer.scala:114)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:88)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
	at org.apache.spark.scheduler.Task.run(Task.scala:121)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$11.apply(Executor.scala:407)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1408)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:413)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


In [None]:
display(results.vertices)
## look at the pagerank score for every vertex
results.vertices.show()
## look at the weight of every edge
results.edges.show()

### Matrix 