In [0]:
## read the text file into an RDD of strings (one element per line); judging by the file name it has no header row -- TODO confirm
uuRDD = sc.textFile('/FileStore/tables/uu_noheader.txt')


## peek at the first 5 lines
uuRDD.take(5)

In [0]:
## tokenise: break every line into its list of comma-separated fields
uu_tokens = uuRDD.map(lambda line: line.split(','))

## inspect the first 5 tokenised records
uu_tokens.take(5)

Task 1: find the top 3 contexts, i.e. those with the highest number of records (the cells below retrieve the top 5, which include the top 3).

In [0]:
## only the first field of each record (the context) is needed here;
## counting how often each one occurs is then the classic "wordCount" pattern

contexts = uu_tokens.map(lambda fields: fields[0])

contexts.take(5)

In [0]:
## plain word count: emit a (context, 1) pair per record and add up the ones per context

context_occurrences = (
    contexts
    .map(lambda context: (context, 1))
    .reduceByKey(lambda left, right: left + right)
)

## order the (context, count) pairs by count, largest first, and keep only the names of the first 5
top_5_contexts = (
    context_occurrences
    .sortBy(lambda pair: pair[1], ascending=False)
    .map(lambda pair: pair[0])
    .take(5)
)

In [0]:
## display the list of the 5 most frequent context names (most frequent first)
top_5_contexts

task 2: for the top context, calculate the total weight associated with each outgoing edge for that context

Example: given the records

`'wear-purple-for-jia-2018,foomooboo,Ed_Miliband,1'`,
`'dry-january-2018,kellyld77,DrinkTg,1'`

the edges are `foomooboo --> Ed_Miliband` and `kellyld77 --> DrinkTg`.

In [0]:
## the idea is to represent each edge as a (from, to) pair and to use that pair as the key of the key-value pair

In [0]:

## keep only the records of the chosen context and key each one by its edge,
## i.e. the (from, to) pair; the value is the weight of that record.
## The fields come from str.split, so the weight is text: convert it to int here,
## otherwise adding the weights of an edge would concatenate strings ('1' + '1' -> '11')
## instead of summing them.
## NOTE(review): the context name is hard-coded; presumably it is the top context found earlier -- confirm.
edge_weights = (
    uu_tokens
    .filter(lambda l: l[0] == '16-days-of-action-2018')
    .map(lambda l: ((l[1], l[2]), int(l[3])))
)

In [0]:
## preview the first 5 ((from, to), weight) pairs
edge_weights.take(5)

In [0]:
## merge the weights of all records sharing the same (from, to) key with `+`,
## then order the edges by the merged weight (ascending)
total_edge_weights = (
    edge_weights
    .reduceByKey(lambda acc, weight: acc + weight)
    .sortBy(lambda edge: edge[1])
)

## bring every (edge, total) pair back to the driver
total_edge_weights.collect()

we can achieve the same result more easily, using the DataFrame API:

In [0]:
from pyspark.sql.types import IntegerType

## read the CSV file (first row holds the column names) directly into a DataFrame
uuDF = spark.read.csv('/FileStore/tables/uu.csv', header=True)

## no schema / inferSchema is passed to csv(), so the columns are read as strings:
## replace the weight column with its integer cast so that it can be summed
uuDF = uuDF.withColumn("weight", uuDF["weight"].cast(IntegerType()))


In [0]:
# top 5 contexts: count the rows of each context and keep the 5 largest groups
(
    uuDF
    .groupBy("context")
    .count()
    .orderBy("count", ascending=False)
    .take(5)
)

task 2: for the top context, calculate the total weight associated with each edge for that context

using DataFrames

In [0]:
# keep the rows of the one context, then add up the weights of every (from, to) edge
(
    uuDF
    .where(uuDF["context"] == '16-days-of-action-2018')
    .groupBy('from', 'to')
    .sum('weight')
    .take(5)
)

finally, we can achieve the same goal using the SQL API

In [0]:
## register the DataFrame under the name 'uu' so that the SQL queries below can refer to it as a table
uuDF.createOrReplaceTempView('uu')

In [0]:
## top contexts: number of records per context, most frequent context first
spark.sql("SELECT context, count(context) as records_per_context FROM uu group by context order by records_per_context desc").show()

In [0]:
## total weights per (from, to) edge for a given context:
## the WHERE clause restricts the aggregation to that one context (same filter as the
## DataFrame version); without it the sums would be taken over the records of all contexts.
## `from` and `to` are SQL keywords, so the column names are back-quoted.
spark.sql(
    "SELECT uu.`from`, uu.`to`, sum(uu.weight) as tot_weight "
    "FROM uu "
    "WHERE uu.context = '16-days-of-action-2018' "
    "group by uu.`from`, uu.`to`"
).show()
