In [60]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session against the standalone master.
# Dynamic allocation is switched on with shuffle tracking and without the
# external shuffle service; the driver and block-manager ports are pinned.
ss = (
    SparkSession.builder
    .master("spark://192.168.2.61:7077")
    .appName("template")
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.dynamicAllocation.shuffleTracking.enabled", True)
    .config("spark.shuffle.service.enabled", False)
    .config("spark.dynamicAllocation.executorIdleTimeout", "3000s")
    .config("spark.executor.cores", 1)
    .config("spark.driver.port", 9998)
    .config("spark.blockManager.port", 10005)
    .getOrCreate()
)

22/03/15 14:18:07 WARN ExecutorAllocationManager: Dynamic allocation without a shuffle service is an experimental feature.


In [61]:
# Read the JSON dataset from HDFS and mark the resulting DataFrame for caching.
df = (
    ss.read
    .json('hdfs://192.168.2.184:9000/RC_2011-07')
    .cache()
)

                                                                                

In [62]:
# Print the column names and types of the loaded DataFrame.
df.printSchema()

root
 |-- _corrupt_record: string (nullable = true)
 |-- archived: boolean (nullable = true)
 |-- author: string (nullable = true)
 |-- author_flair_css_class: string (nullable = true)
 |-- author_flair_text: string (nullable = true)
 |-- body: string (nullable = true)
 |-- controversiality: long (nullable = true)
 |-- created_utc: string (nullable = true)
 |-- distinguished: string (nullable = true)
 |-- downs: long (nullable = true)
 |-- edited: string (nullable = true)
 |-- gilded: long (nullable = true)
 |-- id: string (nullable = true)
 |-- link_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- parent_id: string (nullable = true)
 |-- retrieved_on: long (nullable = true)
 |-- score: long (nullable = true)
 |-- score_hidden: boolean (nullable = true)
 |-- subreddit: string (nullable = true)
 |-- subreddit_id: string (nullable = true)
 |-- ups: long (nullable = true)



In [63]:
# Preview the raw comment text held in the `body` column.
df.select('body').show()

[Stage 1:>                                                          (0 + 1) / 1]

+--------------------+
|                body|
+--------------------+
|    Good lord.  Yes.|
|I don't know abou...|
|Explain something...|
|I would add that ...|
|care to explain #...|
|A society where t...|
|Normally, when I ...|
|           [deleted]|
|No one wants to t...|
|    I don't get it. |
|I wonder whether ...|
|           [deleted]|
|I would suggest j...|
|Remember, don't c...|
|I made it [here](...|
|I think you may h...|
|Not clever but "I...|
|Aww, very nice! H...|
|Wow! Thanks! I th...|
|I read the title ...|
+--------------------+
only showing top 20 rows



                                                                                

In [64]:
# Comma-separated vocabulary of words treated as controversial.
# Adjacent literals are concatenated as-is, so the word at the end of one
# line is joined to the next line's first word by a bare comma.
cont_words = (
    'abuse, administration, afghanistan, aid, america,'
    'american, army, attack, attacks, authorities, authority, ban, banks, benefits, bill, bills,'
    'border, budget, campaign, candidate, candidates, catholic, china, chinese, church,'
    'concerns, congress, conservative, control, country, court, crime, criminal, crisis, cuts,'
    'debate, debt, defense, deficit, democrats, disease, dollar, drug, drugs, economy, education,'
    'egypt, election, elections, enforcement, fighting, finance, fiscal, force, funding,'
    'gas, government, gun, health, immigration, inaccuracies, india, insurance, investigation,'
    'investigators, iran, israel, job, jobs, judge, justice, killing, korea, labor, land,'
    'law, lawmakers, laws, lawsuit, leadership, legislation, marriage, media, mexico, military,'
    'money, murder, nation, nations, news, obama, offensive, officials, oil, parties,'
    'peace, police, policies, policy, politics, poll, power, president, prices, primary, prison,'
    'progress, race, reform, republican, republicans, restrictions, rule, rules, ruling, russia,'
    'russian, school, security, senate, sex, shooting, society, spending, strategy, strike, support,'
    'syria, syrian, tax, taxes, threat, trial, unemployment, union, usa, victim, victims,'
    'violence, vote, voters, war, washington, weapons, world,'
)

In [65]:
# Comma-separated vocabulary of words treated as semi-controversial.
# Adjacent literals are concatenated as-is (no space is inserted at the joins).
semi_cont_words = (
    'account, advantage, amount, attorney, chairman,'
    'charge, charges, cities, class, comment, companies, cost, credit, delays, effect, expectations,'
    'families, family, february, germany, goal, housing, information, investment,'
    'markets, numbers, oklahoma, parents, patients, population, price, projects, raise, rate,'
    'reason, sales, schools, sector, shot, source, sources, status, stock, store, worth,'
)

In [66]:
# Split the comma-separated vocabularies into lists of lowercase words.
# The pattern is a raw string: '\w' in a normal string literal is an invalid
# escape sequence and triggers a warning on current Python versions.
import re
controversial = re.findall(r'\w+', cont_words.lower())
semi_controversial = re.findall(r'\w+', semi_cont_words.lower())

In [67]:
#creating udf to tokenize text of a comment
from pyspark.sql.functions import udf, col, size
from pyspark.sql.types import StringType

def tokenize(line):
    """Split a comment into lowercase word tokens.

    Args:
        line: The comment text, or None for a missing value.

    Returns:
        A list of the maximal runs of word characters (``\\w+``) in ``line``,
        lowercased and in order of appearance. Returns an empty list when
        ``line`` is None or contains no word characters.
    """
    # A null column value reaches the function as None; return no tokens
    # instead of failing on a None method call.
    if line is None:
        return []
    return re.findall(r'\w+', line.lower())
# tokenize returns a list of strings, so the UDF is declared as an array of
# strings. Declaring it as StringType would make Spark coerce each list into a
# single string, and any later per-token processing would then iterate over
# characters instead of words.
from pyspark.sql.types import ArrayType
udf_tokenize = udf(tokenize, ArrayType(StringType()))

In [68]:
# Replace the raw `body` text with its tokenized form.
# NOTE: `df` is rebound to the result, so re-running this cell applies the
# tokenizer to already-tokenized data.
df = df.withColumn('body', udf_tokenize(col('body')))

In [69]:
# Preview the `body` column after tokenization.
df.select('body').show()

+--------------------+
|                body|
+--------------------+
|   [good, lord, yes]|
|[i, don, t, know,...|
|[explain, somethi...|
|[i, would, add, t...|
|[care, to, explai...|
|[a, society, wher...|
|[normally, when, ...|
|           [deleted]|
|[no, one, wants, ...|
|[i, don, t, get, it]|
|[i, wonder, wheth...|
|           [deleted]|
|[i, would, sugges...|
|[remember, don, t...|
|[i, made, it, her...|
|[i, think, you, m...|
|[not, clever, but...|
|[aww, very, nice,...|
|[wow, thanks, i, ...|
|[i, read, the, ti...|
+--------------------+
only showing top 20 rows



                                                                                

In [70]:
#downloading english words, importing nltk
import nltk
from nltk.stem import WordNetLemmatizer
# Lemmatizer instance (not used by the cells visible in this notebook).
wnl = WordNetLemmatizer()

# Make sure the NLTK corpora needed below are available locally.
for corpus_name in ('words', 'wordnet'):
    nltk.download(corpus_name)

# Lowercased English vocabulary, built from the de-duplicated corpus entries.
words = [entry.lower() for entry in set(nltk.corpus.words.words())]

[nltk_data] Downloading package words to /home/ubuntu/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [71]:
#creating udf to keep only english words
from pyspark.sql.types import ArrayType, IntegerType

def only_english_words(line, words = frozenset(words)):
    """Keep only the tokens that appear in the English vocabulary.

    Args:
        line: Iterable of tokens for one comment, or None for a missing value.
        words: Collection supporting ``in`` that holds the accepted words.
            The default is a frozenset built once, at definition time, from
            the module-level ``words`` list, so each lookup is a hash probe
            rather than a scan of the whole vocabulary.

    Returns:
        A list of the tokens from ``line`` that are in ``words``, in their
        original order. Returns an empty list when ``line`` is None.
    """
    # A null column value reaches the function as None; it has no tokens.
    if line is None:
        return []
    return [word for word in line if word in words]

udf_english = udf(only_english_words, ArrayType(StringType()))


In [72]:
# Derive df2 from df by filtering every token list down to English words.
df2 = df.withColumn('body', udf_english(col('body')))

In [73]:
# Report how many partitions the underlying RDD of df2 has.
df2.rdd.getNumPartitions()
# Disabled experiment: repartitioning to 10 partitions.
#.repartition(10) 

36

In [74]:
# Preview the English-only token lists. show() prints 20 rows by default, so
# the limit(100) does not widen what is displayed.
df2.select('body').limit(100).show()

[Stage 3:>                                                          (0 + 1) / 1]

+--------------------+
|                body|
+--------------------+
|   [good, lord, yes]|
|[i, don, t, know,...|
|[explain, somethi...|
|[i, would, add, t...|
|[care, to, explai...|
|[a, society, wher...|
|[normally, when, ...|
|                  []|
|[no, one, to, tal...|
|[i, don, t, get, it]|
|[i, wonder, wheth...|
|                  []|
|[i, would, sugges...|
|[remember, don, t...|
|[i, made, it, her...|
|[i, think, you, m...|
|[not, clever, but...|
|[very, nice, how,...|
|[wow, thanks, i, ...|
|[i, read, the, ti...|
+--------------------+
only showing top 20 rows



                                                                                

In [75]:
# Print the schema of df2 to inspect the type of the rewritten `body` column.
df2.printSchema()

root
 |-- _corrupt_record: string (nullable = true)
 |-- archived: boolean (nullable = true)
 |-- author: string (nullable = true)
 |-- author_flair_css_class: string (nullable = true)
 |-- author_flair_text: string (nullable = true)
 |-- body: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- controversiality: long (nullable = true)
 |-- created_utc: string (nullable = true)
 |-- distinguished: string (nullable = true)
 |-- downs: long (nullable = true)
 |-- edited: string (nullable = true)
 |-- gilded: long (nullable = true)
 |-- id: string (nullable = true)
 |-- link_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- parent_id: string (nullable = true)
 |-- retrieved_on: long (nullable = true)
 |-- score: long (nullable = true)
 |-- score_hidden: boolean (nullable = true)
 |-- subreddit: string (nullable = true)
 |-- subreddit_id: string (nullable = true)
 |-- ups: long (nullable = true)



In [76]:
# Keep only the rows whose token list still contains at least one word.
df2 = df2.filter(size(col('body')) > 0)

In [80]:
# Preview the remaining rows after empty token lists were filtered out.
# show() prints 20 rows by default regardless of the limit(100).
df2.select('body').limit(100).show()



+--------------------+
|                body|
+--------------------+
|[this, is, why, i...|
|[i, t, mind, seei...|
|         [the, drug]|
|[came, to, the, t...|
|[small, world, he...|
|[the, new, are, a...|
|[your, dedication...|
|[pretty, sure, to...|
|[i, bleed, from, ...|
|[when, did, you, ...|
|[this, me, even, ...|
|[there, s, some, ...|
|[i, m, an, irish,...|
|[i, have, the, co...|
|             [might]|
|[yes, some, of, u...|
|[i, chisel, on, t...|
|[was, observing, ...|
|[ron, paul, off, ...|
|[but, but, but, i...|
+--------------------+
only showing top 20 rows



                                                                                

In [None]:
def controversially(line, controversial = controversial, semi_controversial = semi_controversial):
    """Score a tokenized comment by the vocabulary it contains.

    Args:
        line: List of word tokens for one comment, or None.
        controversial: Iterable of words that mark a comment as controversial.
        semi_controversial: Iterable of words that mark a comment as
            semi-controversial.

    Returns:
        2 if any controversial word occurs in ``line``; otherwise 1 if any
        semi-controversial word occurs; otherwise 0 (including when ``line``
        is None or empty).
    """
    if not line:
        return 0
    # Build the token set once so each vocabulary check is a single pass
    # instead of rescanning the token list for every vocabulary word.
    tokens = set(line)
    if not tokens.isdisjoint(controversial):
        return 2
    if not tokens.isdisjoint(semi_controversial):
        return 1
    return 0

# The scorer returns the ints 0/1/2, so the UDF is declared with an integer
# return type rather than a string type.
udf_controversy = udf(controversially, IntegerType())

In [None]:
# Add the controversy score as a new column and keep at most 100 rows.
df3 = (
    df2
    .withColumn('controversial_words', udf_controversy('body'))
    .limit(100)
)
#df3.rdd.repartition(10)

In [None]:
# Fetch up to 100 scored rows back to the driver.
df3.take(100)

In [None]:
# Shut down the Spark session created at the top of the notebook.
ss.stop()

In [None]:
#take(10)