**This notebook will only run on my AWS account, since Spark is not installed on my local machine.**

In [1]:
from pyspark import SparkContext

In [2]:
# Reuse the SparkContext already running in this kernel, or start one if none
# exists. A bare SparkContext() raises "Cannot run multiple SparkContexts at
# once" when this cell is re-run, because only one context may be active.
sc = SparkContext.getOrCreate()

In [3]:
%%writefile example.txt
first line
second line
third line
fourth line

Writing example.txt


In [4]:
# sc.textFile() returns an RDD with one element per line of the file;
# that RDD is bound to the name textFile here.
textFile = sc.textFile('example.txt')

In [5]:
# first() is an action: it returns the first element (line) of the RDD
textFile.first()

'first line'

In [6]:
# count() is an action: it returns the number of elements (lines) in the RDD
textFile.count()

4

In [12]:
# filter() is a transformation: this line only records which lines to keep
# (those containing 'second'). Nothing is evaluated until an action such as
# collect() or count() is called on the result.
secfind = textFile.filter(lambda text: 'second' in text)

In [13]:
# Evaluating the RDD on its own only displays its repr; the filter has not
# produced any results yet because no action has been called.
secfind

PythonRDD[6] at RDD at PythonRDD.scala:48

In [10]:
# collect() is an action: it runs the filter and returns the matching lines
# as a Python list.
secfind.collect()

['second line']

In [11]:
# count() is an action: it returns how many lines passed the filter
secfind.count()

1

In [14]:
%%writefile example2.txt
first 
second line
the third line
then a fourth line

Writing example2.txt


In [15]:
# Load the second example file as an RDD of lines
text_rdd = sc.textFile('example2.txt')

In [20]:
# map() emits exactly one output element per input line, so each line
# becomes its own list of whitespace-separated tokens.
words = text_rdd.map(lambda text: text.split())

In [22]:
# The result is nested: one list of words per input line
words.collect()

[['first'],
 ['second', 'line'],
 ['the', 'third', 'line'],
 ['then', 'a', 'fourth', 'line']]

In [28]:
# flatMap() splits every line into tokens and flattens the per-line lists
# into a single RDD whose elements are the individual words.
words2 = text_rdd.flatMap(lambda text: text.split())

In [31]:
# The result is flat: a single list containing every word
words2.collect()

['first',
 'second',
 'line',
 'the',
 'third',
 'line',
 'then',
 'a',
 'fourth',
 'line']

In [32]:
%%writefile services.txt
#EventId    Timestamp    Customer   State    ServiceID    Amount
201       10/13/2017      100       NY       131          100.00
204       10/18/2017      700       TX       129          450.00
202       10/15/2017      203       CA       121          200.00
206       10/19/2017      202       CA       131          500.00
203       10/17/2017      101       NY       173          750.00
205       10/19/2017      202       TX       121          200.00

Writing services.txt


In [33]:
# Load the services table as an RDD of raw text lines (header row included)
services = sc.textFile('services.txt')

In [34]:
# take(2) returns the first two elements as a list; the first one is the
# '#'-prefixed header row, not data.
services.take(2)

['#EventId    Timestamp    Customer   State    ServiceID    Amount',
 '201       10/13/2017      100       NY       131          100.00']

In [77]:
# Normalise each raw line into a list of whitespace-separated fields.
# The header row starts with '#', which is stripped so that its first field
# reads 'EventId' rather than '#EventId'.
# startswith() replaces the original line[0] == '#' test: indexing an empty
# string raises IndexError, whereas here a blank line simply becomes an
# empty field list.
clean = (
    services
    .map(lambda line: line[1:] if line.startswith('#') else line)
    .map(lambda line: line.split())
)

In [96]:
from operator import add

# Total the Amount column (last field) per State (field 3), largest first.
# Rows are turned into (key, value) tuples because reduceByKey() groups on
# the first element of each pair and combines the second elements with add.
(
    clean
    .filter(lambda fields: fields[3] != 'State')         # drop the header row
    .map(lambda fields: (fields[3], float(fields[-1])))  # (state, amount)
    .reduceByKey(add)
    .sortBy(lambda pair: pair[1], ascending=False)
    .collect()
)

[('NY', 850.0), ('CA', 700.0), ('TX', 650.0)]