### Installing Spark

Install Dependencies:


1.   Java 8
2.   Apache Spark with Hadoop
3.   Findspark (used to locate the Spark installation on the system)


In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
!pip install -q findspark

Set Environment Variables:

In [2]:
import os

# Point JAVA_HOME and SPARK_HOME at the JDK and at the unpacked Spark directory.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.1.1-bin-hadoop3.2",
})

In [3]:
import findspark

# Must run before the pyspark import below: findspark locates the Spark
# installation so that the pyspark package becomes importable.
findspark.init()

from pyspark.sql import SparkSession

# Create (or reuse) a session whose master is "local[*]", i.e. Spark runs on this machine.
spark = SparkSession.builder.master("local[*]").getOrCreate()
# Property used to format output tables better (eager evaluation in the REPL/notebook)
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)
# Last expression of the cell: display the session object
spark

### Loading Data

Loading data as RDD

In [4]:
# Read input.txt through the SparkContext; `data` is the RDD used by every part below
data = spark.sparkContext.textFile('input.txt')

In [5]:
# Peek at the first element of the RDD to check what was loaded
data.first()

'Games are a fun way to get people involved and learning in a happy environment and get them to work on concepts and tactics without them knowing it a lot of the time. Because of this, these games were perfect in a class on negotiation and persuasion because it loosened people up and allowed them to learn in a fun environment. The games used in this class were reinforcing the concepts we talked about and got familiar with in the lectures, but the games were a safe place where we could give them a spin, test drive these persuasion tactics on our peers. With this we were able to make the connection between theory and application of concepts which have no use on paper. This class was about learning how to use language and framing to the extent where you make people think they want what you want, get to identify with your subject and get them on your level before you persuade them to act, and such concepts are great in theory but the application of them takes some practice and with mastery

In [6]:
# Total number of elements in the RDD
data.count()

34

In [7]:
# Number of partitions of the RDD (a partition is a logical division of the
# data stored on a node in the cluster)
data.getNumPartitions()

2

### RDD

#### Part 1

Count total words in RDD

In [8]:
import string

# mapper function
def word_counter(line):
  """Return the number of whitespace-separated words in ``line``.

  Curly quotes are first turned into their ASCII counterparts so that they
  are deleted together with every other character of ``string.punctuation``.
  """
  # typographic quotes -> ASCII quotes
  for curly, plain in (('“', '"'), ('”', '"'), ('’', "'")):
    line = line.replace(curly, plain)

  # delete (not space out) every ASCII punctuation character
  strip_table = str.maketrans('', '', string.punctuation)
  words = line.translate(strip_table).split()
  return len(words)

In [9]:
from operator import add

# Map each line to its word count, then add the per-line counts together.
per_line_counts = data.map(word_counter)
cnt = per_line_counts.reduce(add)
print('total number of words: ', cnt)

total number of words:  5004


Count word frequency in RDD

In [10]:
import string

# mapper function
def word_freq(line):
  """Count how often each word occurs in one line of text.

  The line is stripped of punctuation (curly quotes included), lower-cased
  and split on whitespace before counting.

  Args:
    line: a string of text.

  Returns:
    A dict mapping each word to the number of times it appears in the line
    as a whole word, keyed in order of first appearance.
  """
  # function-scope import keeps this cell self-contained
  from collections import Counter

  # remove punctuation
  line = line.replace('“',r'"')
  line = line.replace('”',r'"')
  line = line.replace('’',r"'")
  line = line.translate(str.maketrans('', '', string.punctuation))

  # normalization
  line = line.lower()

  # Count whole tokens. Using str.count(word) on the line would count
  # substring matches, e.g. "a" would also be counted inside "cat".
  return dict(Counter(line.split()))

# reducer function
def reducer(d1, d2):
  """Merge two count dictionaries, adding the values of keys present in both.

  Keys found in only one of the inputs keep their value unchanged.
  """
  merged = dict(d1)
  for key, count in d2.items():
    merged[key] = d1[key] + count if key in d1 else count
  return merged

In [11]:
import json

# Merge the per-line frequency dicts into one dict for the whole RDD
res = data.map(word_freq).reduce(reducer)

# Persist as "word, count" records, one per line
with open('p1.txt', 'w') as f:
  f.writelines('{0}, {1}\n'.format(word, count) for word, count in res.items())

#### Part 2

Count total words that start with the letter `m`/`M`

In [12]:
import string

# mapper function
def mword_counter(line):
  """Return how many words of ``line`` begin with 'm', ignoring case.

  Punctuation (curly quotes included) is deleted and the text lower-cased
  before the words are examined.
  """
  # typographic quotes -> ASCII quotes, then drop all ASCII punctuation
  for curly, plain in (('“', '"'), ('”', '"'), ('’', "'")):
    line = line.replace(curly, plain)
  cleaned = line.translate(str.maketrans('', '', string.punctuation))

  # normalization
  cleaned = cleaned.lower()

  return sum(1 for word in cleaned.split() if word.startswith('m'))

In [13]:
from operator import add

# Per-line counts of m-words, summed over the whole RDD
m_counts_per_line = data.map(mword_counter)
cnt = m_counts_per_line.reduce(add)
print('total number of words start with m/M: ', cnt)

total number of words start with m/M:  150


#### Part 3

Count words of `length=5` that don't start with a vowel

In [14]:
import string

vowel = ['a', 'e', 'i', 'o', 'u']

# mapper function
def mapper(line):
  """Return how many 5-character words of ``line`` do not start with a vowel.

  Punctuation (curly quotes included) is deleted and the text lower-cased
  before the words are measured.
  """
  # typographic quotes -> ASCII quotes, then drop all ASCII punctuation
  for curly, plain in (('“', '"'), ('”', '"'), ('’', "'")):
    line = line.replace(curly, plain)
  cleaned = line.translate(str.maketrans('', '', string.punctuation))

  # normalization
  cleaned = cleaned.lower()

  matches = 0
  for word in cleaned.split():
    # keep only 5-letter words whose first letter is not in `vowel`
    if len(word) == 5 and word.lower()[0] not in vowel:
      matches += 1
  return matches

In [15]:
from operator import add

# Per-line counts of matching words, summed over the whole RDD
matches_per_line = data.map(mapper)
cnt = matches_per_line.reduce(add)
print('total number of words with 5 length and not start with vowel: ', cnt)

total number of words with 5 length and not start with vowel:  492


#### Part 4

Filter out stop words and non-alphabetic characters

In [16]:
import json

# word frequency dict, rebuilt from the "word, count" records in p1.txt
wf = {}
with open('p1.txt') as f:
    for record in f:
      word, count = record.split(',')
      wf[word] = int(count)

# stop words (top 10 percent in frequency ranking)
ranked = sorted(wf.items(), key=lambda item: item[1], reverse=True)
sw = [word for word, _ in ranked[:len(wf) // 10]]

In [17]:
import string
import re

# mapper function
def mapper(line):
  """Clean one line: drop stop words, then drop non-alphabetic characters.

  Punctuation (curly quotes included) is deleted and the text lower-cased
  first. Words found in the module-level list ``sw`` are then removed, and
  every remaining run of non-alphabetic characters is replaced by one space.

  Args:
    line: a string of text.

  Returns:
    The filtered line as a string.
  """
  # remove punctuation
  line = line.replace('“',r'"')
  line = line.replace('”',r'"')
  line = line.replace('’',r"'")
  line = line.translate(str.maketrans('', '', string.punctuation))

  # normalization
  line = line.lower()

  # filter by stop words
  filtered_line = ' '.join([w for w in line.split() if not w in sw])
  # filter by non-alphabetic letters: \W alone treats digits and "_" as word
  # characters and would keep them, so they are listed explicitly
  filtered_line = re.sub(r'[\W\d_]+', ' ', filtered_line)
  return filtered_line

In [18]:
# Bring the cleaned lines back to the driver
res = data.map(mapper).collect()

# One cleaned line per output line
with open('p4.txt', 'w') as f:
  f.writelines('{}\n'.format(cleaned) for cleaned in res)

#### Part 5

Count bigram frequency

In [19]:
import string

# mapper function
def mapper(line):
  """Count the bigrams (pairs of adjacent words) in one line of text.

  Punctuation (curly quotes included) is deleted and the text lower-cased
  before the words are paired.

  Args:
    line: a string of text.

  Returns:
    A dict mapping each ``(first_word, second_word)`` tuple to the number
    of times that pair occurs, keyed in order of first appearance. Lines
    with fewer than two words give an empty dict.
  """
  # function-scope import keeps this cell self-contained
  from collections import Counter

  # remove punctuation
  line = line.replace('“',r'"')
  line = line.replace('”',r'"')
  line = line.replace('’',r"'")
  line = line.translate(str.maketrans('', '', string.punctuation))

  # normalization
  line = line.lower()

  words = line.split()
  # zip stops at the shorter input, so this pairs every word with its successor.
  # A single Counter pass replaces list.count() per distinct bigram (quadratic)
  # and yields a key order that does not depend on set iteration.
  return dict(Counter(zip(words, words[1:])))

# reducer function
def reducer(d1, d2):
  """Combine two count dictionaries into a new one.

  Keys present in both inputs get the sum of their two values; all other
  keys keep the value from the input that holds them.
  """
  combined = {**d1, **d2}
  # overwrite the shared keys, which currently hold d2's value, with the sum
  combined.update({key: d1[key] + d2[key] for key in d1.keys() & d2.keys()})
  return combined

In [22]:
# Merge the per-line bigram counts into one dict for the whole RDD
bigram_counts = data.map(mapper).reduce(reducer)

# Order the bigrams from most to least frequent
res = dict(sorted(bigram_counts.items(), key=lambda pair: pair[1], reverse=True))

In [23]:
# Persist as "bigram, count" records, one per line, in the order held by `res`
with open('p5.txt', 'w') as f:
  f.writelines('{0}, {1}\n'.format(bigram, count) for bigram, count in res.items())