In [2]:
from pyspark.sql import SparkSession

ss = SparkSession\
        .builder\
        .master("spark://192.168.2.90:7077") \
        .appName("template")\
        .config("spark.dynamicAllocation.enabled", True)\
        .config("spark.dynamicAllocation.shuffleTracking.enabled", True)\
        .config("spark.shuffle.service.enabled", False)\
        .config("spark.dynamicAllocation.executorIdleTimeout", "30s")\
        .config("spark.executor.cores", 1)\
        .config("spark.driver.port", 9998)\
        .config("spark.blockManager.port", 10005)\
        .getOrCreate()

In [3]:
df = ss.read.json('hdfs://192.168.2.90:9000/user/ubuntu/RC_2006-03').cache()

                                                                                

In [4]:
df.printSchema()

root
 |-- author: string (nullable = true)
 |-- author_flair_css_class: string (nullable = true)
 |-- author_flair_text: string (nullable = true)
 |-- body: string (nullable = true)
 |-- controversiality: long (nullable = true)
 |-- created_utc: long (nullable = true)
 |-- distinguished: string (nullable = true)
 |-- edited: string (nullable = true)
 |-- gilded: long (nullable = true)
 |-- id: string (nullable = true)
 |-- link_id: string (nullable = true)
 |-- parent_id: string (nullable = true)
 |-- retrieved_on: long (nullable = true)
 |-- score: long (nullable = true)
 |-- stickied: boolean (nullable = true)
 |-- subreddit: string (nullable = true)
 |-- subreddit_id: string (nullable = true)
 |-- ups: long (nullable = true)



In [5]:
df.select('body').show()

                                                                                

+----------------------------------+
|                              body|
+----------------------------------+
|              It's not garbage,...|
|              Actually, it's ni...|
|              I've read them, a...|
|              If you look at a ...|
|              That's true, but ...|
|              High school drop ...|
|                         [deleted]|
|              They put the powe...|
| 手元のiCalでも化けていたので、...|
|              or, download the ...|
|              there's some conf...|
|                         [removed]|
|              I do.  See the co...|
|                         [removed]|
|              what exactly is i...|
|平面上の位置、および押す強さ(?)...|
|              I love the last s...|
|              maybe he thought ...|
|              I've done this.  ...|
|              Open your wife's ...|
+----------------------------------+
only showing top 20 rows



In [6]:
#getting controversial words
cont_words = 'abuse, administration, afghanistan, aid, america,' + \
'american, army, attack, attacks, authorities, authority, ban, banks, benefits, bill, bills,' + \
'border, budget, campaign, candidate, candidates, catholic, china, chinese, church,'+ \
'concerns, congress, conservative, control, country, court, crime, criminal, crisis, cuts,'+\
'debate, debt, defense, deficit, democrats, disease, dollar, drug, drugs, economy, education,'+\
'egypt, election, elections, enforcement, fighting, finance, fiscal, force, funding,'+\
'gas, government, gun, health, immigration, inaccuracies, india, insurance, investigation,'+\
'investigators, iran, israel, job, jobs, judge, justice, killing, korea, labor, land,'+\
'law, lawmakers, laws, lawsuit, leadership, legislation, marriage, media, mexico, military,'+\
'money, murder, nation, nations, news, obama, offensive, officials, oil, parties,'+\
'peace, police, policies, policy, politics, poll, power, president, prices, primary, prison,'+\
'progress, race, reform, republican, republicans, restrictions, rule, rules, ruling, russia,'+\
'russian, school, security, senate, sex, shooting, society, spending, strategy, strike, support,'+\
'syria, syrian, tax, taxes, threat, trial, unemployment, union, usa, victim, victims,'+\
'violence, vote, voters, war, washington, weapons, world,'

In [7]:
#getting semi-controversial words
semi_cont_words = 'account, advantage, amount, attorney, chairman,'+\
'charge, charges, cities, class, comment, companies, cost, credit, delays, effect, expectations,'+\
'families, family, february, germany, goal, housing, information, investment,'+\
'markets, numbers, oklahoma, parents, patients, population, price, projects, raise, rate,'+\
'reason, sales, schools, sector, shot, source, sources, status, stock, store, worth,'

In [8]:
#splitting the words into list
import re
controversial = re.findall('\w+', cont_words.strip().lower())
semi_controversial = re.findall('\w+', semi_cont_words.strip().lower())

In [9]:
#creating udf to tokenize text of a comment
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def tokenize(line):
    return re.findall('\w+', line.strip().lower())
udf_tokenize =  udf(tokenize, StringType())

In [10]:
#applying udf
df = df.withColumn('body', udf_tokenize('body'))

In [11]:
df.select('body').show()

[Stage 2:>                                                          (0 + 1) / 1]

+---------------------------------+
|                             body|
+---------------------------------+
|             [it, s, not, garb...|
|             [actually, it, s,...|
|             [i, ve, read, the...|
|             [if, you, look, a...|
|             [that, s, true, b...|
|             [high, school, dr...|
|                        [deleted]|
|             [they, put, the, ...|
| [手元のicalでも化けていたので...|
|             [or, download, th...|
|             [there, s, some, ...|
|                        [removed]|
|             [i, do, see, the,...|
|                        [removed]|
|             [what, exactly, i...|
|[平面上の位置, および押す強さ,...|
|             [i, love, the, la...|
|             [maybe, he, thoug...|
|             [i, ve, done, thi...|
|             [open, your, wife...|
+---------------------------------+
only showing top 20 rows



                                                                                

In [13]:
#downloading english words, importing nltk
import nltk
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()
nltk.download('words')
nltk.download('wordnet')



[nltk_data] Downloading package words to /home/ubuntu/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [14]:
#creating udf to keep only english words
from pyspark.sql.types import ArrayType

def only_english_words(line):
    import nltk
    return [word for word in line if wnl.lemmatize(word) in nltk.corpus.words.words()]

udf_only_english_words = udf(only_english_words, ArrayType(StringType()))

In [15]:
df2 = df.withColumn('body', udf_only_english_words('body'))

In [16]:
df2.select('body').show()

22/03/01 13:37:29 WARN TaskSetManager: Lost task 0.0 in stage 3.0 (TID 4) (192.168.2.251 executor 0): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/home/ubuntu/spark-3.2.1-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/worker.py", line 603, in main
    func, profiler, deserializer, serializer = read_udfs(pickleSer, infile, eval_type)
  File "/home/ubuntu/spark-3.2.1-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/worker.py", line 449, in read_udfs
    udfs.append(read_single_udf(pickleSer, infile, eval_type, runner_conf, udf_index=i))
  File "/home/ubuntu/spark-3.2.1-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/worker.py", line 251, in read_single_udf
    f, return_type = read_command(pickleSer, infile)
  File "/home/ubuntu/spark-3.2.1-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/worker.py", line 71, in read_command
    command = serializer._read_with_length(file)
  File "/home/ubuntu/spark-3.2.1-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/seriali

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "/home/ubuntu/spark-3.2.1-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/worker.py", line 603, in main
    func, profiler, deserializer, serializer = read_udfs(pickleSer, infile, eval_type)
  File "/home/ubuntu/spark-3.2.1-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/worker.py", line 449, in read_udfs
    udfs.append(read_single_udf(pickleSer, infile, eval_type, runner_conf, udf_index=i))
  File "/home/ubuntu/spark-3.2.1-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/worker.py", line 251, in read_single_udf
    f, return_type = read_command(pickleSer, infile)
  File "/home/ubuntu/spark-3.2.1-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/worker.py", line 71, in read_command
    command = serializer._read_with_length(file)
  File "/home/ubuntu/spark-3.2.1-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/serializers.py", line 160, in _read_with_length
    return self.loads(obj)
  File "/home/ubuntu/spark-3.2.1-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/serializers.py", line 430, in loads
    return pickle.loads(obj, encoding=encoding)
ModuleNotFoundError: No module named 'nltk'


Requirement already up-to-date: nltk in ./.local/lib/python3.8/site-packages (3.7)
Note: you may need to restart the kernel to use updated packages.
