# Setup

In [63]:
# import modules
from pyspark.sql import SparkSession, Row
from pyspark.sql import functions as F

In [64]:
# create spark session
spark = SparkSession.builder.appName('SparkSQL').getOrCreate()

# Data Frames
- Contains row objects
- Can run SQL queries
- Can store with a schema (more efficient)
- Read and write to JSON, Hive, Parquet, etc
- Allows for better interoperability and simplified development

In [48]:
# import data (unstructured) as rdd
friends = spark.sparkContext.textFile('../data/fakefriends.csv')

In [49]:
# create schema
def mapper(line):
    fields = line.split(',')
    return Row(
        id = int(fields[0]), \
        name = str(fields[1].encode('utf-8')), \
        age = int(fields[2]), \
        numFriends = int(fields[3])
    )

# apply schema
friends = friends.map(mapper)

In [50]:
# register dataframe and infer schema
schemaFriends = spark.createDataFrame(friends).cache()
schemaFriends.createOrReplaceTempView('friends')

type(schemaFriends)

                                                                                

pyspark.sql.dataframe.DataFrame

In [51]:
# query dataframe using spark sql
teenagers_df = spark.sql(
    'select * from friends where age >= 13 and age <= 19'
)

# display results
for teen in teenagers_df.collect():
    print(teen)

Row(id=21, name="b'Miles'", age=19, numFriends=268)
Row(id=52, name="b'Beverly'", age=19, numFriends=269)
Row(id=54, name="b'Brunt'", age=19, numFriends=5)
Row(id=106, name="b'Beverly'", age=18, numFriends=499)
Row(id=115, name="b'Dukat'", age=18, numFriends=397)
Row(id=133, name="b'Quark'", age=19, numFriends=265)
Row(id=136, name="b'Will'", age=19, numFriends=335)
Row(id=225, name="b'Elim'", age=19, numFriends=106)
Row(id=304, name="b'Will'", age=19, numFriends=404)
Row(id=341, name="b'Data'", age=18, numFriends=326)
Row(id=366, name="b'Keiko'", age=19, numFriends=119)
Row(id=373, name="b'Quark'", age=19, numFriends=272)
Row(id=377, name="b'Beverly'", age=18, numFriends=418)
Row(id=404, name="b'Kasidy'", age=18, numFriends=24)
Row(id=409, name="b'Nog'", age=19, numFriends=267)
Row(id=439, name="b'Data'", age=18, numFriends=417)
Row(id=444, name="b'Keiko'", age=18, numFriends=472)
Row(id=492, name="b'Dukat'", age=19, numFriends=36)
Row(id=494, name="b'Kasidy'", age=18, numFriends=194)

In [52]:
# query dataframe using pyspark

# age_df = schemaFriends \
#     .groupBy('age') \
#     .count() \
#     .orderBy('age')

age_df = schemaFriends \
    .groupBy('age') \
    .agg(
        F.count('*').alias('age_count'),
        F.sum('numFriends').alias('num_friends_sum')
    ) \
    .orderBy('age')

age_df.show(n=10)

+---+---------+---------------+
|age|age_count|num_friends_sum|
+---+---------+---------------+
| 18|        8|           2747|
| 19|       11|           2346|
| 20|        5|            825|
| 21|        8|           2807|
| 22|        7|           1445|
| 23|       10|           2463|
| 24|        5|           1169|
| 25|       11|           2172|
| 26|       17|           4115|
| 27|        8|           1825|
+---+---------+---------------+
only showing top 10 rows



In [None]:
# stop spark session
# spark.stop()

# DataFrames vs RDDs
Running prior RDD examples using DataFrames

## Fake Friends Example

In [54]:
# import data (structured) as dataframe
friends = spark.read \
    .option('header', 'true') \
    .option('inferSchema', 'true') \
    .csv('../data/fakefriends-header.csv')

type(friends)

pyspark.sql.dataframe.DataFrame

In [55]:
# display schema
friends.printSchema()

root
 |-- userID: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- friends: integer (nullable = true)



In [56]:
# display a specified column
friends \
    .select('name') \
    .show(n=5)

+--------+
|    name|
+--------+
|    Will|
|Jean-Luc|
|    Hugh|
|  Deanna|
|   Quark|
+--------+
only showing top 5 rows



In [57]:
# filter data on age
friends \
    .filter(friends.age < 21) \
    .show(n=5)

+------+-------+---+-------+
|userID|   name|age|friends|
+------+-------+---+-------+
|    21|  Miles| 19|    268|
|    48|    Nog| 20|      1|
|    52|Beverly| 19|    269|
|    54|  Brunt| 19|      5|
|    60| Geordi| 20|    100|
+------+-------+---+-------+
only showing top 5 rows



In [58]:
# group by age
friends \
    .groupBy('age') \
    .count() \
    .orderBy('age') \
    .show(n=5)

+---+-----+
|age|count|
+---+-----+
| 18|    8|
| 19|   11|
| 20|    5|
| 21|    8|
| 22|    7|
+---+-----+
only showing top 5 rows



In [61]:
# perform transformation
friends \
    .select(friends.name, friends.age + 10) \
    .withColumnRenamed('(age + 10)', 'age_transformed') \
    .show(n=5)

+--------+---------------+
|    name|age_transformed|
+--------+---------------+
|    Will|             43|
|Jean-Luc|             36|
|    Hugh|             65|
|  Deanna|             50|
|   Quark|             78|
+--------+---------------+
only showing top 5 rows



## Friends by Age Example

In [65]:
# import data (structured) as dataframe
friends = spark.read \
    .option('header', 'true') \
    .option('inferSchema', 'true') \
    .csv('../data/fakefriends-header.csv')

type(friends)

pyspark.sql.dataframe.DataFrame

In [75]:
# return avg number of friends by age
friends \
    .groupBy('age') \
    .agg(F.round(F.avg('friends'), 0).alias('friends_avg')) \
    .orderBy('age') \
    .show(n=5)

+---+-----------+
|age|friends_avg|
+---+-----------+
| 18|      343.0|
| 19|      213.0|
| 20|      165.0|
| 21|      351.0|
| 22|      206.0|
+---+-----------+
only showing top 5 rows

