# Codecademy: Practice with PySpark

### Getting Started

In [8]:
from pyspark.sql import SparkSession
# One tuple per student: (name, SAT score, grade as a decimal, state abbreviation).
student_data = [
    ("Chris", 1523, 0.72, "CA"),
    ("Jake", 1555, 0.83, "NY"),
    ("Cody", 1439, 0.92, "CA"),
    ("Lisa", 1442, 0.81, "FL"),
    ("Daniel", 1600, 0.88, "TX"),
    ("Kelvin", 1382, 0.99, "FL"),
    ("Nancy", 1442, 0.74, "TX"),
    ("Pavel", 1599, 0.82, "NY"),
    ("Josh", 1482, 0.78, "CA"),
    ("Cynthia", 1582, 0.94, "CA"),
]

1. Start a SparkSession and assign it the name `spark`.

In [9]:
## YOUR SOLUTION HERE ##
# Build the session through the builder chain and bind it to `spark`.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

# confirm your session is built
print(spark)

<pyspark.sql.session.SparkSession object at 0x106292bd0>


2. Change the list `student_data` into an RDD called `student_rdd` with 5 partitions.

In [10]:
## YOUR SOLUTION HERE ##
# Distribute the local list as an RDD, explicitly asking for 5 partitions.
student_rdd = spark.sparkContext.parallelize(
    student_data,
    numSlices=5,
)

# confirm your RDD contains the correct data
student_rdd.collect()

[('Chris', 1523, 0.72, 'CA'),
 ('Jake', 1555, 0.83, 'NY'),
 ('Cody', 1439, 0.92, 'CA'),
 ('Lisa', 1442, 0.81, 'FL'),
 ('Daniel', 1600, 0.88, 'TX'),
 ('Kelvin', 1382, 0.99, 'FL'),
 ('Nancy', 1442, 0.74, 'TX'),
 ('Pavel', 1599, 0.82, 'NY'),
 ('Josh', 1482, 0.78, 'CA'),
 ('Cynthia', 1582, 0.94, 'CA')]

3. Check the number of partitions for `student_rdd`.

In [11]:
## YOUR SOLUTION HERE ##
# Report how many partitions `student_rdd` is split into.
student_rdd.getNumPartitions()

5

### Transformations

In [12]:
from pyspark.sql import SparkSession
# One tuple per student: (name, SAT score, grade as a decimal, state abbreviation).
student_data = [
    ("Chris", 1523, 0.72, "CA"),
    ("Jake", 1555, 0.83, "NY"),
    ("Cody", 1439, 0.92, "CA"),
    ("Lisa", 1442, 0.81, "FL"),
    ("Daniel", 1600, 0.88, "TX"),
    ("Kelvin", 1382, 0.99, "FL"),
    ("Nancy", 1442, 0.74, "TX"),
    ("Pavel", 1599, 0.82, "NY"),
    ("Josh", 1482, 0.78, "CA"),
    ("Cynthia", 1582, 0.94, "CA"),
]
# Section setup: obtain the session, then distribute the student records
# with the default number of partitions.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)
student_rdd = (
    spark
    .sparkContext
    .parallelize(student_data)
)

1. Convert the grades in the third column from decimals to whole numbers (multiply by 100) and keep the other three variables. Save this new RDD as `rdd_transformation`.

In [13]:
## YOUR SOLUTION HERE ##
# Rebuild every record with the grade (index 2) scaled by 100; the other
# three fields are carried over unchanged.
# NOTE(review): the scaled grade stays a float (e.g. 72.0). Wrap the product in
# round() if true integers are required, as the prompt's "whole numbers" suggests.
rdd_transformation = student_rdd.map(
    lambda row: (row[0], row[1], row[2] * 100, row[3])
)

# confirm transformation is correct
rdd_transformation.collect()

[('Chris', 1523, 72.0, 'CA'),
 ('Jake', 1555, 83.0, 'NY'),
 ('Cody', 1439, 92.0, 'CA'),
 ('Lisa', 1442, 81.0, 'FL'),
 ('Daniel', 1600, 88.0, 'TX'),
 ('Kelvin', 1382, 99.0, 'FL'),
 ('Nancy', 1442, 74.0, 'TX'),
 ('Pavel', 1599, 82.0, 'NY'),
 ('Josh', 1482, 78.0, 'CA'),
 ('Cynthia', 1582, 94.0, 'CA')]

2. Filter `rdd_transformation` to just those rows with grades above 80 and save the new RDD as `rdd_filtered`.

In [14]:
## YOUR SOLUTION HERE ##
# Keep only the records whose grade (index 2) is strictly greater than 80.
rdd_filtered = rdd_transformation.filter(
    lambda row: row[2] > 80
)

# confirm transformation is correct
rdd_filtered.collect()

[('Jake', 1555, 83.0, 'NY'),
 ('Cody', 1439, 92.0, 'CA'),
 ('Lisa', 1442, 81.0, 'FL'),
 ('Daniel', 1600, 88.0, 'TX'),
 ('Kelvin', 1382, 99.0, 'FL'),
 ('Pavel', 1599, 82.0, 'NY'),
 ('Cynthia', 1582, 94.0, 'CA')]

### Actions

In [15]:
from pyspark.sql import SparkSession
# One tuple per student: (name, SAT score, grade as a decimal, state abbreviation).
student_data = [
    ("Chris", 1523, 0.72, "CA"),
    ("Jake", 1555, 0.83, "NY"),
    ("Cody", 1439, 0.92, "CA"),
    ("Lisa", 1442, 0.81, "FL"),
    ("Daniel", 1600, 0.88, "TX"),
    ("Kelvin", 1382, 0.99, "FL"),
    ("Nancy", 1442, 0.74, "TX"),
    ("Pavel", 1599, 0.82, "NY"),
    ("Josh", 1482, 0.78, "CA"),
    ("Cynthia", 1582, 0.94, "CA"),
]
# Section setup: obtain the session and distribute the student records.
spark = SparkSession.builder.getOrCreate()
student_rdd = spark.sparkContext.parallelize(student_data)
# Scale the decimal grade (index 2) to a whole-number percentage.
# round() is used instead of int() because int() truncates: a product such as
# 57 * 0.01 * 100 evaluates to 56.99999999999999 in binary floating point and
# would silently become 56. For the records above both give the same integers.
rdd_transformation = student_rdd.map(lambda x: (x[0], x[1], round(x[2] * 100), x[3]))

1. View the first 5 elements of `rdd_transformation`.

In [16]:
## YOUR SOLUTION HERE ##
# Bring the first five records of the RDD back as a list.
rdd_transformation.take(5)

[('Chris', 1523, 72, 'CA'),
 ('Jake', 1555, 83, 'NY'),
 ('Cody', 1439, 92, 'CA'),
 ('Lisa', 1442, 81, 'FL'),
 ('Daniel', 1600, 88, 'TX')]

2. Sum the grades in `rdd_transformation` and save the result as `sum_gpa`.

In [17]:
## YOUR SOLUTION HERE ##
# Project each record down to its grade (index 2), then fold the grades
# together pairwise with addition.
sum_gpa = (
    rdd_transformation
    .map(lambda row: row[2])
    .reduce(lambda left, right: left + right)
)

# view the sum
sum_gpa

843

3. Divide `sum_gpa` by `rdd_transformation.count()` to get the average grade.

In [19]:
## YOUR SOLUTION HERE ##
# Average grade = total of the grades divided by the number of records in the RDD.
sum_gpa / rdd_transformation.count()

84.3

### Associative and Commutative Properties

In [21]:
from pyspark.sql import SparkSession
# Obtain the SparkSession used by the loops in this section.
spark = SparkSession.builder.getOrCreate()

1. Run the provided code. What do you notice about the result of the summation as the number of partitions grows?

In [22]:
data = [1, 2, 3, 4, 5]

# Re-partition the same list into 1, 2, 3 and 4 slices and sum each layout.
for num_partitions in range(1, 5):
    rdd = spark.sparkContext.parallelize(data, num_partitions)
    # glom() groups the elements by partition so the layout is visible.
    partition_layout = rdd.glom().collect()
    print('partition: ', partition_layout)
    total = rdd.reduce(lambda left, right: left + right)
    print('addition: ', total)

partition:  [[1, 2, 3, 4, 5]]
addition:  15
partition:  [[1, 2], [3, 4, 5]]
addition:  15
partition:  [[1], [2, 3], [4, 5]]
addition:  15
partition:  [[1], [2], [3], [4, 5]]
addition:  15


2. Run the provided code. What do you notice about the result of the division as the number of partitions grows?

In [23]:
# Same experiment as the addition loop, but reducing with division: the result
# changes with the partition layout. Relies on `data` from the previous cell.
for num_partitions in range(1, 5):
    rdd = spark.sparkContext.parallelize(data, num_partitions)
    partition_layout = rdd.glom().collect()
    print('partition: ', partition_layout)
    quotient = rdd.reduce(lambda left, right: left / right)
    print('division: ', quotient)

partition:  [[1, 2, 3, 4, 5]]
division:  0.008333333333333333
partition:  [[1, 2], [3, 4, 5]]
division:  3.3333333333333335
partition:  [[1], [2, 3], [4, 5]]
division:  1.875
partition:  [[1], [2], [3], [4, 5]]
division:  0.20833333333333331


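Notice in the output that no matter how the list is partitioned, the sum is always 15, whereas the division produces a different result for each partitioning. Addition is associative and commutative, so the order in which partial results are combined does not matter; division is neither, so it is not a safe operation to pass to `reduce`.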

### Broadcast Variables

In [24]:
from pyspark.sql import SparkSession
# Obtain the SparkSession used throughout this section.
spark = SparkSession.builder.getOrCreate()
# One tuple per student: (name, SAT score, grade as a decimal, state abbreviation).
student_data = [
    ("Chris", 1523, 0.72, "CA"),
    ("Jake", 1555, 0.83, "NY"),
    ("Cody", 1439, 0.92, "CA"),
    ("Lisa", 1442, 0.81, "FL"),
    ("Daniel", 1600, 0.88, "TX"),
    ("Kelvin", 1382, 0.99, "FL"),
    ("Nancy", 1442, 0.74, "TX"),
    ("Pavel", 1599, 0.82, "NY"),
    ("Josh", 1482, 0.78, "CA"),
    ("Cynthia", 1582, 0.94, "CA"),
]
student_rdd = spark.sparkContext.parallelize(student_data)
# Scale the decimal grade (index 2) to a whole-number percentage.
# round() is used instead of int() because int() truncates: a product such as
# 57 * 0.01 * 100 evaluates to 56.99999999999999 in binary floating point and
# would silently become 56. For the records above both give the same integers.
rdd_transformation = student_rdd.map(lambda x: (x[0], x[1], round(x[2] * 100), x[3]))

# Lookup table: two-letter state abbreviation -> full state name.
states = {
    "NY": "New York",
    "CA": "California",
    "TX": "Texas",
    "FL": "Florida",
}

1. Broadcast the `states` dictionary to the Spark cluster. Save the resulting object as `broadcastStates`.

In [25]:
## YOUR SOLUTION HERE ##
# Wrap the `states` dictionary in a Broadcast object; the dictionary itself is
# read back later through the object's `.value` attribute.
broadcastStates = spark.sparkContext.broadcast(states)

# confirm type
type(broadcastStates)

pyspark.broadcast.Broadcast

2. Reference `broadcastStates` to map the two-letter abbreviations to their full names. Save the transformed RDD as `rdd_broadcast`.

In [27]:
## YOUR SOLUTION HERE ##
# Swap the state abbreviation (index 3) for its full name by looking it up in
# the broadcast dictionary; the first three fields pass through unchanged.
rdd_broadcast = rdd_transformation.map(
    lambda row: (row[0], row[1], row[2], broadcastStates.value[row[3]])
)

# confirm transformation is correct
rdd_broadcast.collect()

[('Chris', 1523, 72, 'California'),
 ('Jake', 1555, 83, 'New York'),
 ('Cody', 1439, 92, 'California'),
 ('Lisa', 1442, 81, 'Florida'),
 ('Daniel', 1600, 88, 'Texas'),
 ('Kelvin', 1382, 99, 'Florida'),
 ('Nancy', 1442, 74, 'Texas'),
 ('Pavel', 1599, 82, 'New York'),
 ('Josh', 1482, 78, 'California'),
 ('Cynthia', 1582, 94, 'California')]

### Accumulator Variables

In [28]:
from pyspark.sql import SparkSession
# Obtain the SparkSession used throughout this section.
spark = SparkSession.builder.getOrCreate()
# One tuple per student: (name, SAT score, grade as a decimal, state abbreviation).
student_data = [
    ("Chris", 1523, 0.72, "CA"),
    ("Jake", 1555, 0.83, "NY"),
    ("Cody", 1439, 0.92, "CA"),
    ("Lisa", 1442, 0.81, "FL"),
    ("Daniel", 1600, 0.88, "TX"),
    ("Kelvin", 1382, 0.99, "FL"),
    ("Nancy", 1442, 0.74, "TX"),
    ("Pavel", 1599, 0.82, "NY"),
    ("Josh", 1482, 0.78, "CA"),
    ("Cynthia", 1582, 0.94, "CA"),
]
student_rdd = spark.sparkContext.parallelize(student_data)
# Scale the decimal grade (index 2) to a whole-number percentage.
# round() is used instead of int() because int() truncates: a product such as
# 57 * 0.01 * 100 evaluates to 56.99999999999999 in binary floating point and
# would silently become 56. For the records above both give the same integers.
rdd_transformation = student_rdd.map(lambda x: (x[0], x[1], round(x[2] * 100), x[3]))
# Lookup table: two-letter state abbreviation -> full state name.
states = {"NY":"New York", "CA":"California", "TX":"Texas", "FL":"Florida"}
broadcastStates = spark.sparkContext.broadcast(states)
# Replace the abbreviation (index 3) with the full name from the broadcast dictionary.
rdd_broadcast = rdd_transformation.map(lambda x: (x[0],x[1],x[2],broadcastStates.value[x[3]]))

1. Create the accumulator variable that starts at 0 and name it `sat_1500`.

In [29]:
## YOUR SOLUTION HERE ##
# Create an accumulator whose initial value is 0; it is incremented later via .add().
sat_1500 = spark.sparkContext.accumulator(0)

# confirm type
type(sat_1500)

pyspark.accumulators.Accumulator

2. Create a function called `count_high_sat_score` that increments the accumulator by 1 whenever it encounters an SAT score above 1500.

In [31]:
## YOUR SOLUTION HERE ##
def count_high_sat_score(x):
    """Add 1 to the `sat_1500` accumulator when the record's SAT score exceeds 1500.

    Args:
        x: A record whose element at index 1 is compared against 1500.

    Returns:
        None. The only effect is the increment of `sat_1500`.
    """
    sat_score = x[1]
    if sat_score > 1500:
        sat_1500.add(1)

# confirm saved as a function
print(count_high_sat_score)

<function count_high_sat_score at 0x1072c6a20>


3. Call `count_high_sat_score` in an action that will apply the function to each element in `rdd_broadcast`.

In [32]:
## YOUR SOLUTION HERE ##
# Reset the accumulator on the driver first so this cell is idempotent:
# without the reset, every re-run of the cell adds another 5 to the old total.
sat_1500.value = 0

# foreach is an action, so the function runs once per record right away.
# The function is passed directly; wrapping it in a lambda adds nothing.
rdd_broadcast.foreach(count_high_sat_score)

# confirm accumulator worked
print(sat_1500.value)

5
