In [44]:
# Imports

from pyspark.sql import SparkSession
from pyspark.sql.functions import UserDefinedFunction as UDF
from pyspark.sql.functions import size
from pyspark.sql.types import ArrayType, StringType, IntegerType

In [45]:
# Start session

# Build (or reuse) the Spark session. Dynamic allocation is switched on with
# shuffle tracking and without the external shuffle service; executors idle
# for 30s are released. Driver and block-manager ports are pinned explicitly.
ss = (
    SparkSession.builder
    .master("spark://192.168.2.90:7077")
    .appName("andreas")
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.dynamicAllocation.shuffleTracking.enabled", True)
    .config("spark.shuffle.service.enabled", False)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    .config("spark.executor.cores", 2)
    .config("spark.executor.memory", "2g")
    .config("spark.driver.port", 9998)
    .config("spark.blockManager.port", 10005)
    .getOrCreate()
)

22/03/15 21:00:04 WARN ExecutorAllocationManager: Dynamic allocation without a shuffle service is an experimental feature.


In [46]:
# Read file

# Load the JSON records stored at RC_2012-01 on HDFS and show the resulting
# schema so the available columns are visible before pruning.
df = ss.read.json(path="hdfs://192.168.2.90:9000/user/ubuntu/RC_2012-01")
df.printSchema()



root
 |-- archived: boolean (nullable = true)
 |-- author: string (nullable = true)
 |-- author_flair_css_class: string (nullable = true)
 |-- author_flair_text: string (nullable = true)
 |-- body: string (nullable = true)
 |-- controversiality: long (nullable = true)
 |-- created_utc: string (nullable = true)
 |-- distinguished: string (nullable = true)
 |-- downs: long (nullable = true)
 |-- edited: string (nullable = true)
 |-- gilded: long (nullable = true)
 |-- id: string (nullable = true)
 |-- link_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- parent_id: string (nullable = true)
 |-- removal_reason: string (nullable = true)
 |-- retrieved_on: long (nullable = true)
 |-- score: long (nullable = true)
 |-- score_hidden: boolean (nullable = true)
 |-- subreddit: string (nullable = true)
 |-- subreddit_id: string (nullable = true)
 |-- ups: long (nullable = true)



                                                                                

In [47]:
# Drop unused columns

# Discard every column that is not used further down; what remains is
# body, controversiality, score and subreddit_id.
df = df.drop(
    'archived',
    'author',
    'author_flair_css_class',
    'author_flair_text',
    'created_utc',
    'distinguished',
    'downs',
    'edited',
    'gilded',
    'id',
    'link_id',
    'name',
    'parent_id',
    'removal_reason',
    'retrieved_on',
    'score_hidden',
    'subreddit',
    'ups',
)

df.printSchema()

root
 |-- body: string (nullable = true)
 |-- controversiality: long (nullable = true)
 |-- score: long (nullable = true)
 |-- subreddit_id: string (nullable = true)



In [48]:
# Tokenize comment body

import string

# Translation table that deletes every ASCII punctuation character. Built once
# at definition time instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

def tokenize(text):
    """Split a text into lowercase word tokens with ASCII punctuation removed.

    Punctuation characters are deleted (not replaced by spaces), so "don't"
    becomes "dont". The remaining text is lowercased and split on whitespace.

    Args:
        text: The string to tokenize, or None. Null column values arrive as
            None when this function is used as a Spark UDF.

    Returns:
        A list of lowercase tokens; an empty list for None or for text that
        contains no word characters.
    """
    # A null value would otherwise raise AttributeError and fail the whole job.
    if text is None:
        return []
    return text.translate(_PUNCTUATION_TABLE).lower().split()

# Expose tokenize as a Spark UDF that yields an array of strings.
udf_tokenize = UDF(tokenize, returnType=ArrayType(StringType()))

# Overwrite the raw comment text with its token list.
df = df.withColumn('body', udf_tokenize(df['body']))

In [40]:
# Keep only English words (disabled: every line of this cell is commented out)

# from nltk.corpus import words

# words = set(words.words())
# words = [word.lower() for word in words]

# def only_english(line, words = words):
#     return [word for word in line if word in words]

# udf_english = UDF(only_english, ArrayType(StringType()))

# df = df.withColumn('body', udf_english(df.body))
# df = df.filter(size(df.body) > 0)

In [49]:
# Comment length feature

# Number of tokens in each comment (body is already a token array here).
df = df.withColumn(colName='comment_length', col=size(df['body']))

In [50]:
# Deleted comment feature
# Tokens that mark a comment as deleted/removed.
_DELETED_MARKERS = frozenset({'removed', 'deleted'})

def check_deleted(line):
    """Return 1 if the token list contains 'removed' or 'deleted', else 0.

    Args:
        line: Iterable of word tokens, or None (a null array column value).

    Returns:
        int: 1 when at least one token equals 'removed' or 'deleted',
        otherwise 0. None and empty input yield 0.
    """
    # NOTE(review): this flags any comment that merely mentions one of the
    # words (e.g. "i deleted my account"), not only placeholder bodies such as
    # "[deleted]" -- confirm that this is the intended feature.
    if line is None:
        return 0
    if any(word in _DELETED_MARKERS for word in line):
        return 1
    return 0

# Expose check_deleted as a Spark UDF that yields an integer flag.
udf_deleted = UDF(check_deleted, returnType=IntegerType())

# Add the 0/1 flag computed from the token list as a new column.
df = df.withColumn('deleted', udf_deleted(df['body']))

In [54]:
# Controversial words feature

# Comma-separated word lists. Adjacent literals are joined WITHOUT a space
# after the trailing comma (e.g. "...america,american..."), so the strings
# must be split on commas, not on whitespace.
cont_words = (
    'abuse, administration, afghanistan, aid, america,'
    'american, army, attack, attacks, authorities, authority, ban, banks, benefits, bill, bills,'
    'border, budget, campaign, candidate, candidates, catholic, china, chinese, church,'
    'concerns, congress, conservative, control, country, court, crime, criminal, crisis, cuts,'
    'debate, debt, defense, deficit, democrats, disease, dollar, drug, drugs, economy, education,'
    'egypt, election, elections, enforcement, fighting, finance, fiscal, force, funding,'
    'gas, government, gun, health, immigration, inaccuracies, india, insurance, investigation,'
    'investigators, iran, israel, job, jobs, judge, justice, killing, korea, labor, land,'
    'law, lawmakers, laws, lawsuit, leadership, legislation, marriage, media, mexico, military,'
    'money, murder, nation, nations, news, obama, offensive, officials, oil, parties,'
    'peace, police, policies, policy, politics, poll, power, president, prices, primary, prison,'
    'progress, race, reform, republican, republicans, restrictions, rule, rules, ruling, russia,'
    'russian, school, security, senate, sex, shooting, society, spending, strategy, strike, support,'
    'syria, syrian, tax, taxes, threat, trial, unemployment, union, usa, victim, victims,'
    'violence, vote, voters, war, washington, weapons, world,'
)

semi_cont_words = (
    'account, advantage, amount, attorney, chairman,'
    'charge, charges, cities, class, comment, companies, cost, credit, delays, effect, expectations,'
    'families, family, february, germany, goal, housing, information, investment,'
    'markets, numbers, oklahoma, parents, patients, population, price, projects, raise, rate,'
    'reason, sales, schools, sector, shot, source, sources, status, stock, store, worth,'
)

def _parse_word_list(csv_words):
    """Turn a comma-separated string into a frozenset of lowercase words."""
    stripped = (word.strip().lower() for word in csv_words.split(','))
    # Drop the empty entry produced by the trailing comma.
    return frozenset(word for word in stripped if word)

# Previously these were built by stripping punctuation and splitting on
# whitespace, which fused the words at every literal boundary into tokens such
# as "americaamerican" or "billsborder" that could never match a comment.
# Sets also give constant-time membership tests.
controversial = _parse_word_list(cont_words)
semi_controversial = _parse_word_list(semi_cont_words)

def controversial_words(line, controversial = controversial, semi_controversial = semi_controversial):
    """Score a tokenized comment by the controversial words it contains.

    Every token found in `controversial` adds 3; otherwise a token found in
    `semi_controversial` adds 1. Repeated tokens count each time.

    Args:
        line: Iterable of word tokens, or None (a null array column value).
        controversial: Container of words worth 3 points each. Bound at
            definition time to the module-level set.
        semi_controversial: Container of words worth 1 point each. Bound at
            definition time to the module-level set.

    Returns:
        int: The accumulated score; 0 for None or empty input.
    """
    if line is None:
        return 0
    score = 0
    for word in line:
        if word in controversial:
            score += 3
        elif word in semi_controversial:
            score += 1
    return score
    
# Expose controversial_words as a Spark UDF that yields an integer score.
udf_controversial = UDF(controversial_words, returnType=IntegerType())

# Add the per-comment word score computed from the token list.
df = df.withColumn('cont_word_score', udf_controversial(df['body']))

In [57]:
# Row count of the DataFrame; count() is a Spark action, so this runs a job.
df.count()

                                                                                

16350205

In [58]:
# Show the schema after feature engineering (tokenized body plus the
# comment_length, deleted and cont_word_score columns added above).
df.printSchema()

root
 |-- body: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- controversiality: long (nullable = true)
 |-- score: long (nullable = true)
 |-- subreddit_id: string (nullable = true)
 |-- comment_length: integer (nullable = false)
 |-- deleted: integer (nullable = true)
 |-- cont_word_score: integer (nullable = true)



In [59]:
# Shut down the Spark session once the notebook is finished with it.
ss.stop()