In [1]:
from streamgraph.graph import spark, sc, Graph

In [2]:
# Build the graph from a single CSV snapshot; the timing/progress lines shown
# in the output are printed while the constructor runs.
# NOTE(review): this is an absolute, machine-specific path -- it has to be
# adjusted before the notebook can be re-run on another machine.
g = Graph('/mnt/d/Datasets/harvey_streams/2017-08-25 09_00_00.csv')

[6.66] Loaded 13624 nodes.
[14.65] Parsed lists within hashtags and mentions.
[15.21] Found 3507876 edges by topic.
[31.63] Found 4174765 edges by authority.
[38.71] Found 510427 edges by hashtags.
[45.30] Found 3865403 edges by mentions.
[55.94] Found 7998447 edges in total.


In [3]:
from pyspark.sql.functions import collect_set, collect_list, array_union, concat, array, col, sum, pandas_udf, PandasUDFType, udf
from pyspark.sql.types import ArrayType, LongType, StructType, StructField
from pyspark.ml.linalg import VectorUDT, SparseVector
import pandas as pd
import numpy as np

def getNodeAdjacency(edges, num_nodes):
    """Build one sparse adjacency row per node from an edge list.

    Every edge is treated as undirected: a row (src, dst) adds ``dst`` to the
    neighbours of ``src`` and ``src`` to the neighbours of ``dst``.

    Parameters
    ----------
    edges : pyspark.sql.DataFrame
        Edge list with columns ``src`` and ``dst`` holding node ids. Any other
        columns are ignored.
    num_nodes : int
        Size of every adjacency vector. Neighbour ids are used directly as
        vector indices.

    Returns
    -------
    pyspark.sql.DataFrame
        Columns ``node`` (long) and ``neighbors`` (ML sparse vector of size
        ``num_nodes`` with 1.0 at the index of each neighbour). Nodes that do
        not occur in any edge produce no row.
    """
    # Emit every edge in both directions so a single aggregation sees all
    # neighbours of a node. The casts keep the long-typed output schema.
    forward = edges.select(
        col('src').cast('long').alias('node'),
        col('dst').cast('long').alias('neighbor'))
    backward = edges.select(
        col('dst').cast('long').alias('node'),
        col('src').cast('long').alias('neighbor'))

    # collect_set (rather than collect_list) drops repeated neighbours, which
    # appear for self-loops, for edges stored in both directions and for
    # repeated edge rows. SparseVector requires strictly increasing indices,
    # so duplicates must not reach it.
    neighbors = forward.union(backward) \
        .groupby('node') \
        .agg(collect_set('neighbor').alias('neighbors'))

    # convert to sparse vectors
    @udf(VectorUDT())
    def toSparse(neighbor_ids):
        # A dict keeps the indices unique; SparseVector sorts them itself.
        return SparseVector(num_nodes, {int(i): 1.0 for i in neighbor_ids})

    return neighbors.withColumn('neighbors', toSparse('neighbors'))

In [4]:
# Build the node adjacency frame (one sparse neighbour vector per node) from
# the graph's edge list.
# NOTE(review): `g.num_nodes.value` suggests num_nodes is a wrapped value
# (e.g. a broadcast variable or accumulator) -- confirm in streamgraph.graph.
na = getNodeAdjacency(g.edges, g.num_nodes.value)

In [5]:
# Preview the adjacency frame; the output is truncated to the top 20 rows.
na.show()

+----+--------------------+
|node|           neighbors|
+----+--------------------+
|  26|(13624,[5,18,36,4...|
|  29|(13624,[21,58,122...|
| 474|(13624,[9,23,35,5...|
| 964|(13624,[5,6,9,10,...|
|1677|(13624,[5,7,8,11,...|
|1697|(13624,[5,7,8,11,...|
|1806|(13624,[21,23,35,...|
|1950|(13624,[93,95,116...|
|2040|(13624,[20,61,79,...|
|2214|(13624,[114,162,1...|
|2250|(13624,[7,11,14,2...|
|2453|(13624,[28,87,110...|
|2509|(13624,[6,10,39,1...|
|2529|(13624,[3,33,36,6...|
|2927|(13624,[21,37,43,...|
|3091|(13624,[4,77,112,...|
|3506|(13624,[1,3,14,67...|
|3764|(13624,[1,3,14,69...|
|4590|(13624,[40,64,84,...|
|4823|(13624,[7,58,61,8...|
+----+--------------------+
only showing top 20 rows



In [36]:
# Node ids that never occur in the adjacency frame, i.e. nodes without edges.
# Uses `na` from the cell above and renames its `node` column explicitly
# (DataFrame.alias would only alias the frame, not the column).
g.nodes.select('id').subtract(na.select(col('node').alias('id'))).collect()

[Row(id=168576), Row(id=236408), Row(id=124832), Row(id=37872)]

In [37]:
# Inspect the full record of one of the ids returned by the previous cell.
g.nodes.filter(g.nodes.id == 168576).show()

+------+------------------+-------------------+----------+--------------------+-----+-----+-------------+---------+--------+----------+
|    id|         twitterID|          timestamp|      user|        originalText|topic|reply|inReplyToUser|authority|hashtags|  mentions|
+------+------------------+-------------------+----------+--------------------+-----+-----+-------------+---------+--------+----------+
|168576|901273847550410752|2017-08-26 02:43:24|ulovejamie|@iCyclone Thx for...|   17|false|         none| iCyclone|    null|[iCyclone]|
+------+------------------+-------------------+----------+--------------------+-----+-----+-------------+---------+--------+----------+



In [38]:
# List all nodes with topic 17 (the topic of node 168576 above); the output
# shows that node is the only member of this topic.
g.nodes.filter(g.nodes.topic == 17).show()

+------+------------------+-------------------+----------+--------------------+-----+-----+-------------+---------+--------+----------+
|    id|         twitterID|          timestamp|      user|        originalText|topic|reply|inReplyToUser|authority|hashtags|  mentions|
+------+------------------+-------------------+----------+--------------------+-----+-----+-------------+---------+--------+----------+
|168576|901273847550410752|2017-08-26 02:43:24|ulovejamie|@iCyclone Thx for...|   17|false|         none| iCyclone|    null|[iCyclone]|
+------+------------------+-------------------+----------+--------------------+-----+-----+-------------+---------+--------+----------+



In [7]:
# Count the distinct node ids in the graph.
# NOTE(review): the printed count (2399) differs from the 13624 nodes reported
# when the graph was loaded, and the execution counter is out of sequence --
# this output presumably comes from a different run/dataset; re-run to confirm.
g.nodes.select('id').distinct().count()

2399