#### Some Warmup -- Python Code to Deal with Sequences of Tuples

In [1]:
people_lines = ["Smith,John,35", 
                "Biden,Joe,80", 
                "Thunberg,Greta,20",
                "Swift,Taylor,33"]

def parse_line(line):
    """Split one comma-separated line into a tuple of string fields.

    Leading and trailing whitespace is stripped from the line before it is
    split on commas; the individual fields are returned unchanged.
    """
    fields = line.strip().split(",")
    return tuple(fields)

def parse_people(lines):
    """Apply parse_line to every line and return the parsed records as a list."""
    return [parse_line(line) for line in lines]


VBox()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
3,application_1683835639732_0004,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [None]:
records = parse_people(people_lines)

In [None]:
records

In [None]:
# Filter certain records
def old_people(tuples, age_threshold=30):
    """Return the records whose age is strictly greater than ``age_threshold``.

    Parameters:
        tuples: iterable of records; element 2 of each record is the age,
            given as anything ``int()`` accepts (e.g. the string "35").
        age_threshold: records are kept only when their age exceeds this
            value. Defaults to 30.

    Returns:
        A list of the matching records, in their original order.
    """
    return [t for t in tuples if int(t[2]) > age_threshold]

# Do an operation on a record
def print_old_message(t):
    """Print a one-line message about the person in record ``t``.

    The record is read by position: last name, first name, age.
    """
    last_name, first_name, age = t[0], t[1], t[2]
    print(f"{first_name} {last_name}, who is {age} years old, is really old!")


In [None]:
# Iterate over the records, chaining the steps through intermediate variables
tuples = parse_people(people_lines)
old_tuples = old_people(tuples)

for t in old_tuples:
    print_old_message(t)

In [None]:
# Same thing, only functional chaining

for t in old_people(parse_people(people_lines)):
    print_old_message(t)

In [None]:
#  Aggregation operation -- a reduction operator

def count_old_people(lines):
    """Return how many of the raw input lines describe an "old" person.

    The lines are parsed with parse_people, filtered with old_people, and the
    surviving records are counted.
    """
    return len(old_people(parse_people(lines)))

# The raw input list is named people_lines; `people` is not defined in this notebook.
count_old_people(people_lines)

In [None]:
# Using map along with a reduction operator to get average age

# Parse once and reuse the result for both the sum and the count.
parsed_people = parse_people(people_lines)
sum(map(lambda t: int(t[2]), parsed_people)) / len(parsed_people)

In [None]:
# We want a list of all names in the data set (first and last).
# Map has a problem when each input tuple generates multiple output values

list(map(lambda t: [t[0], t[1]], parse_people(people_lines)))


In [2]:
#  Convert a list of lists into a list of (non-list) elements
def flatten(l):
    """Flatten one level of nesting: an iterable of iterables becomes a flat list."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

flatten(list(map(lambda t: [t[0], t[1]], parse_people(people_lines))))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

['Smith', 'John', 'Biden', 'Joe', 'Thunberg', 'Greta', 'Swift', 'Taylor']

In [None]:
# Get the name of the youngest person
# Sort, followed by select
pp = parse_people(people_lines)
# Compare ages as numbers: compared as strings, "100" would sort before "20".
pp.sort(key=lambda t: int(t[2]))
youngest = pp[0]
print(f"The youngest person is {youngest[1]} {youngest[0]}")

In [None]:
# Same thing but chain operations together.
# Why can't we chain????
# list.sort() sorts the list in place and returns None, so indexing its result
# with [0] raises a TypeError. sorted(...) returns a new list and can be chained.
parse_people(people_lines).sort(key = lambda t: t[2])[0]

In [None]:
## Named Tuples are a cool Python trick to avoid having to 
## refer to obscure tuple indexes

from collections import namedtuple
Person = namedtuple("Person", "lastName firstName age")

def parse_line(line):
    """Parse one "last,first,age" line into a Person.

    The line is stripped of surrounding whitespace and split on commas; the
    third field is converted to an int, so a non-numeric age raises ValueError.
    """
    # Named `fields` rather than `tuple` so the builtin tuple type is not shadowed.
    fields = line.strip().split(",")
    return Person(fields[0], fields[1], int(fields[2]))

pp = parse_people(people_lines)
pp[0]
pp[0].lastName

In [None]:
# Functions that 'destructure' the tuple look cleaner!

#def print_old_message(tuple):
#    print(f"{tuple[1]} {tuple[0]}, who is {tuple[2]} years old, is really old!")
    
def print_old_message(tuple):
    """Print a one-line message about a person record, reading fields by name.

    The record must expose ``firstName``, ``lastName`` and ``age`` attributes.
    """
    # NOTE: the parameter name shadows the builtin `tuple`; it is kept because
    # it is part of the function's signature.
    full_name = f"{tuple.firstName} {tuple.lastName}"
    print(f"{full_name}, who is {tuple.age} years old, is really old!")
    
print_old_message(parse_people(people_lines)[0])

# sort(key = lambda t: t.age)

####  Now on to Spark, and our "College Population data sets"

There are two edited versions of the "college data files" in the repository;  one for 2015/2016 and one for 2017/2018.

The fields are the following:
*  Unique ID
*  Name
*  City
*  State
*  Zip
*  Admission rate
*  Average SAT score
*  Enrollment
*  CostA
*  CostP

The last two break down cost to public and private universities.  If one is non-null, the other should be null.
If both are null, that's a missing value.  If both are non-null,  use either value.

We will keep only these attributes: ID, name, state, city, enrollment, and cost, where cost is either CostA or CostP as described above.


In [None]:
# Is the Spark Context available?
sc

In [None]:
year1516 = sc.textFile('s3://5330spark/College_2015_16.csv')

In [None]:
# What is this thing?  

In [None]:
#  Prep the file:  convert each line to a tuple of fields, and keep only these:
#  ID, name, state, city, enrollment, cost
#  where cost is either CostA or CostP as above

from collections import namedtuple
CollegeRecord = namedtuple("CollegeRecord", "id name state city enrollment cost")

# Normalize the costa and costp fields to a single field cost.

def prep_cost(costa, costp):
    """Collapse the two cost fields into a single integer cost.

    Parameters:
        costa: first cost field; the string 'NONE' marks it as missing.
        costp: second cost field; the string 'NONE' marks it as missing.

    Returns:
        int(costa) when costa is not 'NONE'; otherwise int(costp) when costp
        is not 'NONE'; otherwise None. None is also returned when the chosen
        field cannot be converted to an int.
    """
    # costa takes precedence; costp is consulted only when costa is 'NONE'.
    raw_cost = costp if costa == 'NONE' else costa
    if raw_cost == 'NONE':
        return None
    try:
        return int(raw_cost)
    except (TypeError, ValueError):
        # Catch only conversion failures, not unrelated errors or interrupts.
        return None

# Enrollment -- Error checking and convert to integer
def get_enrollment(enr):
    """Convert an enrollment field to an int.

    Returns int(enr), or None when the value cannot be converted (for example
    an empty string, a non-numeric marker, or None).
    """
    try:
        return int(enr)
    except (TypeError, ValueError):
        # Catch only conversion failures, not unrelated errors or interrupts.
        return None
    
# To process a record -- do some sanity checking,
# normalize the fields, convert to a CollegeRecord
def prep_line(line):
    """Turn one raw text line into a CollegeRecord (not implemented yet).

    TODO: the body is still a placeholder, so every call currently returns None.
    NOTE(review): a later cell maps this function over an RDD and then filters
    with `t != None`, so presumably a rejected line should yield None and a
    good line a CollegeRecord -- confirm when implementing.
    """
    pass


In [None]:
# Map the data set over prep_line, we should see pretty records

In [None]:
# Filter out records that might have been rejected due to bad format
# (prep_line is expected to return None for a line it rejects).
p1516 = year1516.map(prep_line).filter(lambda t: t is not None)

In [None]:
# How many records were filtered due to errors in format


In [None]:
# Filtering -- How many records are there from the state of California?
# rdd.count() gives number of elements in the RDD. 

In [None]:
# rdd.sortBy(lambda:  ... , ascending=bool) returns a sorted RDD.
#   What is the name and cost of the 5 cheapest universities


In [None]:
# What is the name and cost of the 5 most expensive universities


In [None]:
# Grouping and aggregating is a really big deal.
#   What does a groupby look like?
p1516.groupBy(lambda t: t.state)

In [None]:
# What is in the iterable?
p1516.groupBy(lambda t: t.state).mapValues(list).take(5)

In [None]:
# Using just groupby, get the number of universities in each state
(
    p1516.groupBy(lambda t: t.state)
    .mapValues(lambda colleges: len(list(colleges)))
    .sortBy(lambda pair: pair[1])
    .collect()
)

In [None]:
# If your reduction is simple counting, countByValue can be your friend.  Easy way to get # universities per state
p1516.map(lambda t: t.state).countByValue()

In [4]:
# foldByKey does the aggregation without bringing the iterable elements into memory
# Let's do total enrollment instead
initial_value = 0


def add_func(x, y):
    """Return the sum of two partial enrollment totals."""
    return x + y


# The filter keeps only records with a truthy enrollment (drops None and 0).
# `folded` is the list returned by take(5): the five largest (state, total) pairs.
folded = (
    p1516.filter(lambda t: t.enrollment)
    .map(lambda t: (t.state, t.enrollment))
    .foldByKey(initial_value, add_func)
    .sortBy(lambda t: t[1], ascending=False)
    .take(5)
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

An error was encountered:
name 'p1516' is not defined
Traceback (most recent call last):
NameError: name 'p1516' is not defined



In [None]:
# More general form of aggregation/accumulation to take an average

#  Accumulate sum and count.   Two cases
#    Incrementally add a value c to a running total (sum, count)   =>  (sum + c, count + 1)
#    Aggregate two (sum, count) totals:  (s1+s2, c1+c2)

def aggCounts(p1, p2):
    """Merge two (sum, count) partial totals into a single (sum, count)."""
    sum1, count1 = p1[0], p1[1]
    sum2, count2 = p2[0], p2[1]
    return (sum1 + sum2, count1 + count2)

def accumCount(sofar, new):
    """Fold one new value into a running (sum, count) total.

    The value is added to the sum and the count grows by one.
    """
    running_sum, running_count = sofar[0], sofar[1]
    return (running_sum + new, running_count + 1)

# Keep only records whose enrollment is known, as (state, enrollment) pairs.
l = p1516.filter(lambda t: t.enrollment is not None)
m = l.map(lambda t: (t.state, t.enrollment))

# Per state, build a (sum, count) pair of the enrollments.
f = m.aggregateByKey((0, 0), accumCount, aggCounts)

# Average enrollment per state, truncated to an integer.
g = f.mapValues(lambda sum_count: int(sum_count[0] / sum_count[1]))

In [None]:
# Another example of accumulation -- List of unique cities per state.
#  The 'unique' part is the tricky part

def addItem(tp, i):
    """Return tuple ``tp`` with item ``i`` appended, unless it is already present."""
    if i in tp:
        return tp
    return tuple(tp) + (i,)

def mergeItems(tp1, tp2):
    """Merge two tuples of unique items into one, keeping first-seen order."""
    merged = tp1
    for item in tp2:
        merged = addItem(merged, item)
    return merged

# foldByKey uses one function both to add a value to an accumulator and to
# merge the accumulators from different partitions. addItem cannot merge two
# tuples (it would nest one tuple inside the other), so use aggregateByKey,
# which takes a separate merge function.
a = p1516.map(lambda t: (t.state, t.city)).aggregateByKey((), addItem, mergeItems)

In [None]:
### Joining.  Calculate % change in enrollment from one year to the next
# Consider change between the two time periods.

# Same data set from the next time period: load the raw lines, parse each one,
# and drop the lines that prep_line rejected (returned None for).
year1718 = sc.textFile('s3://5330spark/College_2017_18.csv')
p1718 = year1718.map(prep_line).filter(lambda t: t is not None)

In [None]:
k1516 = p1516.keyBy(lambda t: t.id)
k1718 = p1718.keyBy(lambda t: t.id)

In [None]:
k1516.take(3)

In [None]:
j = k1516.join(k1718)

In [None]:
j.take(3)

In [None]:
# Now figure out how to get the % increase in enrollment from one year to the next!

###  One last loose end.  Flatmap in Spark RDDs

In [None]:
people_rdd = sc.parallelize(people_lines)

In [None]:
# Split each raw line into its list of comma-separated fields.
p = people_rdd.map(lambda line: line.strip().split(','))

In [None]:
p.take(2)

In [None]:
p.map(lambda t: [t[0], t[1]]).collect()

In [None]:
p.flatMap(lambda t: [t[0], t[1]]).collect()