In [247]:
from pyspark.sql import SparkSession

ss = SparkSession\
        .builder\
        .master("spark://192.168.2.61:7077") \
        .appName("template")\
        .config("spark.dynamicAllocation.enabled", True)\
        .config("spark.dynamicAllocation.shuffleTracking.enabled", True)\
        .config("spark.shuffle.service.enabled", False)\
        .config("spark.dynamicAllocation.executorIdleTimeout", "3000s")\
        .config("spark.executor.cores", 1)\
        .config("spark.driver.port", 9998)\
        .config("spark.blockManager.port", 10005)\
        .getOrCreate()

22/03/15 20:20:40 WARN ExecutorAllocationManager: Dynamic allocation without a shuffle service is an experimental feature.


In [248]:
df = ss.read.json('hdfs://192.168.2.184:9000/RC_2013-01').cache()

                                                                                

In [249]:
#df.printSchema()

In [250]:
#df.select('body').show()

In [251]:
#getting controversial words
cont_words = 'abuse, administration, afghanistan, aid, america,' + \
'american, army, attack, attacks, authorities, authority, ban, banks, benefits, bill, bills,' + \
'border, budget, campaign, candidate, candidates, catholic, china, chinese, church,'+ \
'concerns, congress, conservative, control, country, court, crime, criminal, crisis, cuts,'+\
'debate, debt, defense, deficit, democrats, disease, dollar, drug, drugs, economy, education,'+\
'egypt, election, elections, enforcement, fighting, finance, fiscal, force, funding,'+\
'gas, government, gun, health, immigration, inaccuracies, india, insurance, investigation,'+\
'investigators, iran, israel, job, jobs, judge, justice, killing, korea, labor, land,'+\
'law, lawmakers, laws, lawsuit, leadership, legislation, marriage, media, mexico, military,'+\
'money, murder, nation, nations, news, obama, offensive, officials, oil, parties,'+\
'peace, police, policies, policy, politics, poll, power, president, prices, primary, prison,'+\
'progress, race, reform, republican, republicans, restrictions, rule, rules, ruling, russia,'+\
'russian, school, security, senate, sex, shooting, society, spending, strategy, strike, support,'+\
'syria, syrian, tax, taxes, threat, trial, unemployment, union, usa, victim, victims,'+\
'violence, vote, voters, war, washington, weapons, world,'

In [252]:
#getting semi-controversial words
semi_cont_words = 'account, advantage, amount, attorney, chairman,'+\
'charge, charges, cities, class, comment, companies, cost, credit, delays, effect, expectations,'+\
'families, family, february, germany, goal, housing, information, investment,'+\
'markets, numbers, oklahoma, parents, patients, population, price, projects, raise, rate,'+\
'reason, sales, schools, sector, shot, source, sources, status, stock, store, worth,'

In [253]:
#splitting the words into list
import re
controversial = re.findall('\w+', cont_words.strip().lower())
semi_controversial = re.findall('\w+', semi_cont_words.strip().lower())

In [254]:
#creating udf to tokenize text of a comment
from pyspark.sql.functions import udf, col, size
from pyspark.sql.types import StringType

def tokenize(line):
    return re.findall('\w+', line.strip().lower())
udf_tokenize =  udf(tokenize, StringType())

In [255]:
#applying udf
df = df.withColumn('body', udf_tokenize('body'))

In [256]:
#df.select('body').show()

In [257]:
#downloading english words, importing nltk
import nltk
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()
nltk.download('words')
nltk.download('wordnet')

words = set(nltk.corpus.words.words())
words = [word.lower() for word in words]

[nltk_data] Downloading package words to /home/ubuntu/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [258]:
#creating udf to keep only english words
from pyspark.sql.types import ArrayType, IntegerType

def only_english_words(line, words = words):
    return [word for word in line if word in words]

udf_english = udf(only_english_words, ArrayType(StringType()))


In [259]:
df2 = df.withColumn('body', udf_english('body'))

In [260]:
df2.rdd.getNumPartitions()
#.repartition(10) 

130

In [261]:
#df2.select('body').limit(100).show()

In [262]:
df2.take(100)

                                                                                

[Row(archived=True, author='rghol5212', author_flair_css_class=None, author_flair_text=None, body=['ya', 'we', 'were', 'worried', 'about', 'this', 'before', 'release', 'more', 'life', 'to', 'him', 'face', 'and', 'fell', 'into', 'the', 'make', 'better', 'category', 'and', 'we', 'have', 'been', 'so', 'on', 'complete', 'game', 'mode', 'love', 'hearing', 'that', 'we', 'are', 'thinking', 'correctly', 'or', 'even', 'if', 'we', 'are', 'on', 'the', 'wrong', 'track', 'love', 'hearing', 'about', 'it', 'thank', 'you', 'look', 'forward', 'to', 'a', 'more', 'alive', 'in', 'the', 'future'], controversiality=0, created_utc='1356998400', distinguished=None, downs=0, edited='false', gilded=0, id='c7ozqta', link_id='t3_15n1ve', name='t1_c7ozqta', parent_id='t1_c7o4zvc', removal_reason=None, retrieved_on=1430820301, score=1, score_hidden=False, subreddit='Android', subreddit_id='t5_2qlqh', ups=1),
 Row(archived=True, author='Schrodingers_Moose', author_flair_css_class=None, author_flair_text=None, body=[

In [263]:
df2.printSchema()

root
 |-- archived: boolean (nullable = true)
 |-- author: string (nullable = true)
 |-- author_flair_css_class: string (nullable = true)
 |-- author_flair_text: string (nullable = true)
 |-- body: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- controversiality: long (nullable = true)
 |-- created_utc: string (nullable = true)
 |-- distinguished: string (nullable = true)
 |-- downs: long (nullable = true)
 |-- edited: string (nullable = true)
 |-- gilded: long (nullable = true)
 |-- id: string (nullable = true)
 |-- link_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- parent_id: string (nullable = true)
 |-- removal_reason: string (nullable = true)
 |-- retrieved_on: long (nullable = true)
 |-- score: long (nullable = true)
 |-- score_hidden: boolean (nullable = true)
 |-- subreddit: string (nullable = true)
 |-- subreddit_id: string (nullable = true)
 |-- ups: long (nullable = true)



In [264]:
df2 = df2.filter(size('body') > 0)

In [265]:
#df2.select('body').limit(100).show()

In [266]:
def controversially(line, controversial = controversial, semi_controversial = semi_controversial):
    if any(word in line for word in controversial):
        return 2
    elif any(word in line for word in semi_controversial):
        return 1
    else:
        return 0
    
udf_controversy = udf(controversially, StringType())

In [267]:
df3 = df2.withColumn('controversial_words', udf_controversy('body')).limit(100)
#df3.rdd.repartition(10)

In [268]:
df3.take(100)

                                                                                

[Row(archived=True, author='rghol5212', author_flair_css_class=None, author_flair_text=None, body=['ya', 'we', 'were', 'worried', 'about', 'this', 'before', 'release', 'more', 'life', 'to', 'him', 'face', 'and', 'fell', 'into', 'the', 'make', 'better', 'category', 'and', 'we', 'have', 'been', 'so', 'on', 'complete', 'game', 'mode', 'love', 'hearing', 'that', 'we', 'are', 'thinking', 'correctly', 'or', 'even', 'if', 'we', 'are', 'on', 'the', 'wrong', 'track', 'love', 'hearing', 'about', 'it', 'thank', 'you', 'look', 'forward', 'to', 'a', 'more', 'alive', 'in', 'the', 'future'], controversiality=0, created_utc='1356998400', distinguished=None, downs=0, edited='false', gilded=0, id='c7ozqta', link_id='t3_15n1ve', name='t1_c7ozqta', parent_id='t1_c7o4zvc', removal_reason=None, retrieved_on=1430820301, score=1, score_hidden=False, subreddit='Android', subreddit_id='t5_2qlqh', ups=1, controversial_words='0'),
 Row(archived=True, author='Schrodingers_Moose', author_flair_css_class=None, autho

In [269]:
ss.stop()
#it took 8.4 min to process with 16.24 GB file

In [270]:
#take(10)