In [172]:
from pyspark.sql import SparkSession

ss = SparkSession\
        .builder\
        .master("spark://192.168.2.61:7077") \
        .appName("template")\
        .config("spark.dynamicAllocation.enabled", True)\
        .config("spark.dynamicAllocation.shuffleTracking.enabled", True)\
        .config("spark.shuffle.service.enabled", False)\
        .config("spark.dynamicAllocation.executorIdleTimeout", "3000s")\
        .config("spark.executor.cores", 1)\
        .config("spark.driver.port", 9998)\
        .config("spark.blockManager.port", 10005)\
        .getOrCreate()

22/03/15 18:02:55 WARN ExecutorAllocationManager: Dynamic allocation without a shuffle service is an experimental feature.


In [173]:
df = ss.read.json('hdfs://192.168.2.184:9000/RC_2012-05').cache()

                                                                                

In [174]:
#df.printSchema()

In [175]:
#df.select('body').show()

In [176]:
#getting controversial words
cont_words = 'abuse, administration, afghanistan, aid, america,' + \
'american, army, attack, attacks, authorities, authority, ban, banks, benefits, bill, bills,' + \
'border, budget, campaign, candidate, candidates, catholic, china, chinese, church,'+ \
'concerns, congress, conservative, control, country, court, crime, criminal, crisis, cuts,'+\
'debate, debt, defense, deficit, democrats, disease, dollar, drug, drugs, economy, education,'+\
'egypt, election, elections, enforcement, fighting, finance, fiscal, force, funding,'+\
'gas, government, gun, health, immigration, inaccuracies, india, insurance, investigation,'+\
'investigators, iran, israel, job, jobs, judge, justice, killing, korea, labor, land,'+\
'law, lawmakers, laws, lawsuit, leadership, legislation, marriage, media, mexico, military,'+\
'money, murder, nation, nations, news, obama, offensive, officials, oil, parties,'+\
'peace, police, policies, policy, politics, poll, power, president, prices, primary, prison,'+\
'progress, race, reform, republican, republicans, restrictions, rule, rules, ruling, russia,'+\
'russian, school, security, senate, sex, shooting, society, spending, strategy, strike, support,'+\
'syria, syrian, tax, taxes, threat, trial, unemployment, union, usa, victim, victims,'+\
'violence, vote, voters, war, washington, weapons, world,'

In [177]:
#getting semi-controversial words
semi_cont_words = 'account, advantage, amount, attorney, chairman,'+\
'charge, charges, cities, class, comment, companies, cost, credit, delays, effect, expectations,'+\
'families, family, february, germany, goal, housing, information, investment,'+\
'markets, numbers, oklahoma, parents, patients, population, price, projects, raise, rate,'+\
'reason, sales, schools, sector, shot, source, sources, status, stock, store, worth,'

In [178]:
#splitting the words into list
import re
controversial = re.findall('\w+', cont_words.strip().lower())
semi_controversial = re.findall('\w+', semi_cont_words.strip().lower())

In [179]:
#creating udf to tokenize text of a comment
from pyspark.sql.functions import udf, col, size
from pyspark.sql.types import StringType

def tokenize(line):
    return re.findall('\w+', line.strip().lower())
udf_tokenize =  udf(tokenize, StringType())

In [180]:
#applying udf
df = df.withColumn('body', udf_tokenize('body'))

In [181]:
#df.select('body').show()

In [182]:
#downloading english words, importing nltk
import nltk
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()
nltk.download('words')
nltk.download('wordnet')

words = set(nltk.corpus.words.words())
words = [word.lower() for word in words]

[nltk_data] Downloading package words to /home/ubuntu/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [183]:
#creating udf to keep only english words
from pyspark.sql.types import ArrayType, IntegerType

def only_english_words(line, words = words):
    return [word for word in line if word in words]

udf_english = udf(only_english_words, ArrayType(StringType()))


In [184]:
df2 = df.withColumn('body', udf_english('body'))

In [185]:
df2.rdd.getNumPartitions()
#.repartition(10) 

26

In [186]:
#df2.select('body').limit(100).show()

In [187]:
df2.take(100)

                                                                                

[Row(archived=True, author='_silentheartsong', author_flair_css_class=None, author_flair_text=None, body=['this', 'is', 'a', 'failure', 'for', 'shame', 'on', 'you'], controversiality=0, created_utc='1291161600', distinguished=None, downs=0, edited='false', gilded=0, id='c17frnf', link_id='t3_ee5la', name='t1_c17frnf', parent_id='t3_ee5la', retrieved_on=1426603881, score=-5, score_hidden=False, subreddit='Music', subreddit_id='t5_2qh1u', ups=-5),
 Row(archived=True, author='product19', author_flair_css_class=None, author_flair_text=None, body=['you', 'made', 'an', 'excellent', 'choice', 'not', 'a', 'home', 'in', 'arizona'], controversiality=0, created_utc='1291161601', distinguished=None, downs=0, edited='false', gilded=0, id='c17frng', link_id='t3_edzco', name='t1_c17frng', parent_id='t1_c17ei3w', retrieved_on=1426603881, score=8, score_hidden=False, subreddit='IAmA', subreddit_id='t5_2qzb6', ups=8),
 Row(archived=True, author='RegularFreddieWilson', author_flair_css_class=None, author

In [188]:
df2.printSchema()

root
 |-- archived: boolean (nullable = true)
 |-- author: string (nullable = true)
 |-- author_flair_css_class: string (nullable = true)
 |-- author_flair_text: string (nullable = true)
 |-- body: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- controversiality: long (nullable = true)
 |-- created_utc: string (nullable = true)
 |-- distinguished: string (nullable = true)
 |-- downs: long (nullable = true)
 |-- edited: string (nullable = true)
 |-- gilded: long (nullable = true)
 |-- id: string (nullable = true)
 |-- link_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- parent_id: string (nullable = true)
 |-- retrieved_on: long (nullable = true)
 |-- score: long (nullable = true)
 |-- score_hidden: boolean (nullable = true)
 |-- subreddit: string (nullable = true)
 |-- subreddit_id: string (nullable = true)
 |-- ups: long (nullable = true)



In [189]:
df2 = df2.filter(size('body') > 0)

In [190]:
#df2.select('body').limit(100).show()

In [191]:
def controversially(line, controversial = controversial, semi_controversial = semi_controversial):
    if any(word in line for word in controversial):
        return 2
    elif any(word in line for word in semi_controversial):
        return 1
    else:
        return 0
    
udf_controversy = udf(controversially, StringType())

In [192]:
df3 = df2.withColumn('controversial_words', udf_controversy('body')).limit(100)
#df3.rdd.repartition(10)

In [193]:
df3.take(100)

                                                                                

[Row(archived=True, author='_silentheartsong', author_flair_css_class=None, author_flair_text=None, body=['this', 'is', 'a', 'failure', 'for', 'shame', 'on', 'you'], controversiality=0, created_utc='1291161600', distinguished=None, downs=0, edited='false', gilded=0, id='c17frnf', link_id='t3_ee5la', name='t1_c17frnf', parent_id='t3_ee5la', retrieved_on=1426603881, score=-5, score_hidden=False, subreddit='Music', subreddit_id='t5_2qh1u', ups=-5, controversial_words='0'),
 Row(archived=True, author='product19', author_flair_css_class=None, author_flair_text=None, body=['you', 'made', 'an', 'excellent', 'choice', 'not', 'a', 'home', 'in', 'arizona'], controversiality=0, created_utc='1291161601', distinguished=None, downs=0, edited='false', gilded=0, id='c17frng', link_id='t3_edzco', name='t1_c17frng', parent_id='t1_c17ei3w', retrieved_on=1426603881, score=8, score_hidden=False, subreddit='IAmA', subreddit_id='t5_2qzb6', ups=8, controversial_words='0'),
 Row(archived=True, author='RegularF

In [194]:
ss.stop()

In [None]:
#take(10)