In [1]:
import time
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext

spark_session = SparkSession\
            .builder\
            .master("spark://192.168.2.75:7077")\
            .appName("codecheck")\
            .config("spark.executor.cores",4)\
            .getOrCreate()

spark_context = spark_session.sparkContext


In [2]:
start_time = time.time()
df1 =  spark_session.read.json('hdfs://192.168.2.75:9000/reddit_data/1gb_data/RC_2011-08.bz2')

In [5]:
df5 = df1.select("author","subreddit")\
        .groupBy("subreddit")\
        .count()\
        .orderBy(["count"],ascending=False)\
        .limit(1)

print(df5.take(1))

[Row(subreddit='AskReddit', count=1801041)]


In [12]:
rdd1 = df5.join(df1.select("subreddit","body"),df5.subreddit==df1.subreddit)\
        .select("body").rdd\

print(rdd1.take(10))

[Row(body='[deleted]'), Row(body="How do you think you're going to become responsible enough for a job? Do you believe time will take care of that?"), Row(body="I can't speak for other women, but I love facial hair.  Well, that's not entirely true.  I love *beards.*  Mustaches and goatees and chinstraps aren't my thing.  "), Row(body='Guess we know the answer.'), Row(body="Well this woman clearly is evil and manipulative, waiting to falsely accuse this man of abuse. So your consternation is sort of misplaced. \n\nBut really, r/mensrights is built to help people like throwaway2019. Literally, they exist to support people like him during crises like the one he is going through, when more official channels can be less than willing to help him out. I can't see any problem with suggesting he find help there."), Row(body='[something similar to this](http://i.imgur.com/ohYbN.jpg)'), Row(body="I never had the heart to tell him it wasn't funny."), Row(body='No, just make sure you have parents w

In [27]:
from re import sub

def lower_tokenize(content):
    clean_content = content.lower()
    clean_content = sub(r'[^A-Za-z0-9 \'/]+', u'', clean_content)
    clean_content = clean_content.replace(u'\xa0', u' ')
    clean_content = clean_content.split(' ')
    return clean_content

tokenized_rdd1 = rdd1\
    .map(lambda content: lower_tokenize(content[0]))\


print(tokenized_rdd1.take(10))

[['deleted'], ['how', 'do', 'you', 'think', "you're", 'going', 'to', 'become', 'responsible', 'enough', 'for', 'a', 'job', 'do', 'you', 'believe', 'time', 'will', 'take', 'care', 'of', 'that'], ['i', "can't", 'speak', 'for', 'other', 'women', 'but', 'i', 'love', 'facial', 'hair', '', 'well', "that's", 'not', 'entirely', 'true', '', 'i', 'love', 'beards', '', 'mustaches', 'and', 'goatees', 'and', 'chinstraps', "aren't", 'my', 'thing', '', ''], ['guess', 'we', 'know', 'the', 'answer'], ['well', 'this', 'woman', 'clearly', 'is', 'evil', 'and', 'manipulative', 'waiting', 'to', 'falsely', 'accuse', 'this', 'man', 'of', 'abuse', 'so', 'your', 'consternation', 'is', 'sort', 'of', 'misplaced', 'but', 'really', 'r/mensrights', 'is', 'built', 'to', 'help', 'people', 'like', 'throwaway2019', 'literally', 'they', 'exist', 'to', 'support', 'people', 'like', 'him', 'during', 'crises', 'like', 'the', 'one', 'he', 'is', 'going', 'through', 'when', 'more', 'official', 'channels', 'can', 'be', 'less', '

In [28]:
from operator import add

word_rdd1 = tokenized_rdd1\
    .flatMap(lambda t: [a for a in t if len(a)>2])\
    .map(lambda w: (w,1))

word2_rdd1 = word_rdd1.reduceByKey(add)

print(word2_rdd1.takeOrdered(20, key=lambda x: -x[1]))


[('the', 1888915), ('and', 1228079), ('you', 822382), ('that', 708855), ('for', 465084), ('was', 442019), ('with', 357729), ('have', 348063), ('but', 342508), ('not', 315108), ('this', 312043), ('are', 280663), ('just', 257993), ('your', 254691), ('they', 253077), ('like', 236984), ('out', 205622), ('about', 198618), ("don't", 190521), ('all', 190133)]


In [29]:
spark_session.stop()

In [None]:
    
    
    stop_time = time.time()
time_4_cores = stop_time - start_time
