In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext

spark_session = SparkSession\
            .builder\
            .master("spark://192.168.2.75:7077")\
            .appName("Test_reddit_sample")\
            .config("spark.executor.cores",4)\
            .getOrCreate()

spark_context = spark_session.sparkContext

In [2]:
df1 =  spark_session.read.json('hdfs://192.168.2.75:9000/data/reddit_sample.json')

In [3]:
df5 = df1.select("author","subreddit")\
        .groupBy("subreddit")\
        .count()\
        .orderBy(["count"],ascending=False)\
        .limit(5)

print(df5.take(5))

[Row(subreddit='AskReddit', count=486), Row(subreddit='CFB', count=403), Row(subreddit='CrazyIdeas', count=261), Row(subreddit='news', count=158), Row(subreddit='ConciseIAmA', count=147)]


In [4]:
from re import sub

def map_values(subreddit,string):
    string_out = sub(r'[^A-Za-z0-9 \'/]+', u'', string.lower())
    string_out = string_out.split(' ')
    entry_list = []
    for entry in string_out:
        entry_list.append([(subreddit,entry),1])
    return (entry_list)

rdd1 = df5.join(df1.select("subreddit","body"),df5.subreddit==df1.subreddit)\
        .select(df1.subreddit,"body").rdd\
        .flatMap(lambda x: map_values(x[0],x[1]))\
        .reduceByKey(lambda x,y:x+y)\
        .sortBy(lambda x:x[1],ascending=False)

print(rdd1.take(10))

[(('AskReddit', 'the'), 435), (('AskReddit', 'a'), 333), (('AskReddit', 'to'), 306), (('AskReddit', 'and'), 300), (('CrazyIdeas', 'this'), 259), (('CrazyIdeas', 'lets'), 258), (('CrazyIdeas', 'see'), 258), (('CrazyIdeas', 'how'), 258), (('CrazyIdeas', 'deep'), 258), (('CrazyIdeas', 'rabbit'), 258)]


In [5]:
CrazyIdeas = rdd1.filter(lambda x: x[0][0]=='CrazyIdeas')

print(CrazyIdeas.take(10))

[(('CrazyIdeas', 'this'), 259), (('CrazyIdeas', 'lets'), 258), (('CrazyIdeas', 'see'), 258), (('CrazyIdeas', 'how'), 258), (('CrazyIdeas', 'deep'), 258), (('CrazyIdeas', 'rabbit'), 258), (('CrazyIdeas', 'hole'), 258), (('CrazyIdeas', 'goes'), 258), (('CrazyIdeas', 'i'), 3), (('CrazyIdeas', 'never'), 1)]


In [6]:
spark_session.stop()