# 8. DataFrame

Taken from the quickstart guide on the official Spark site

# Introduction

PySpark does not compute transformations immediately. It records the steps and only computes them when an action such as `collect()` is called.

Features

* Lazily evaluated
* Implemented on top of RDD

In [1]:
# All applications start with SparkSession
from pyspark.sql import SparkSession

# Build the session for this guide, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('DataFrame Guide')
    .getOrCreate()
)

# DataFrame Creation

`spark.createDataFrame` takes an optional `schema` argument that specifies the schema of the DataFrame; if it is omitted, the schema is inferred by sampling the data.

In [2]:
from datetime import datetime, date
import pandas as pd
from pyspark.sql import Row

# Specifying Row information
df = spark.createDataFrame([
  Row(a=1, b=2., c='string1', d=date(2000, 1, 1), e=datetime(2000, 1, 1, 12, 0)),
  Row(a=2, b=3., c='string2', d=date(2000, 2, 1), e=datetime(2000, 1, 2, 12, 0)),
  Row(a=4, b=5., c='string3', d=date(2000, 3, 1), e=datetime(2000, 1, 3, 12, 0))
])
df

DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp]

In [3]:
# Specifying schema
df = spark.createDataFrame([
    (1, 2., 'string1', date(2000, 1, 1), datetime(2000, 1, 1, 12, 0)),
    (2, 3., 'string2', date(2000, 2, 1), datetime(2000, 1, 2, 12, 0)),
    (3, 4., 'string3', date(2000, 3, 1), datetime(2000, 1, 3, 12, 0))
  ], 
  schema='a long, b double, c string, d date, e timestamp')
df

DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp]

In [4]:
# Creating from pandas df
pandas_df = pd.DataFrame({
  'a': [1, 2, 3],
  'b': [2., 3., 4.],
  'c': ['string1', 'string2', 'string3'],
  'd': [date(2000, 1, 1), date(2000, 2, 1), date(2000, 3, 1)],
  'e': [datetime(2000, 1, 1, 12, 0), datetime(2000, 1, 2, 12, 0), datetime(2000, 1, 3, 12, 0)]
})
df = spark.createDataFrame(pandas_df)
df

DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp]

In [5]:
# Creating from RDD
rdd = spark.sparkContext.parallelize([
  (1, 2., 'string1', date(2000, 1, 1), datetime(2000, 1, 1, 12, 0)),
  (2, 3., 'string2', date(2000, 2, 1), datetime(2000, 1, 2, 12, 0)),
  (3, 4., 'string3', date(2000, 3, 1), datetime(2000, 1, 3, 12, 0))
])
df = spark.createDataFrame(rdd, schema=['a', 'b', 'c', 'd', 'e'])
df

DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp]

The DataFrames created above all have the same schema; whatever the input source, the result is a Spark DataFrame.

# Viewing Data

Spark does not evaluate a DataFrame when a transformation is applied; it waits until an action is called before computing anything. Therefore, displaying the DataFrame variable shows only its schema, not its rows.

This can be changed by setting the configuration `spark.sql.repl.eagerEval.enabled` to `True`.

In [6]:
df.show(1)

+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  1|2.0|string1|2000-01-01|2000-01-01 12:00:00|
+---+---+-------+----------+-------------------+
only showing top 1 row



In [None]:
# Changing to eager evaluation
# spark.conf.set('spark.sql.repl.eagerEval.enabled', True)

In [7]:
# Print schema and columns
df.printSchema()
df.columns

root
 |-- a: long (nullable = true)
 |-- b: double (nullable = true)
 |-- c: string (nullable = true)
 |-- d: date (nullable = true)
 |-- e: timestamp (nullable = true)



['a', 'b', 'c', 'd', 'e']

In [8]:
# Showing summary of data
df.select('a', 'b', 'c').describe().show()

+-------+---+---+-------+
|summary|  a|  b|      c|
+-------+---+---+-------+
|  count|  3|  3|      3|
|   mean|2.0|3.0|   null|
| stddev|1.0|1.0|   null|
|    min|  1|2.0|string1|
|    max|  3|4.0|string3|
+-------+---+---+-------+



In [9]:
# .collect() will collect the distributed data to driver side as local data in Python
#   - Can cause out of memory error if data is too large
df.collect()

[Row(a=1, b=2.0, c='string1', d=datetime.date(2000, 1, 1), e=datetime.datetime(2000, 1, 1, 12, 0)),
 Row(a=2, b=3.0, c='string2', d=datetime.date(2000, 2, 1), e=datetime.datetime(2000, 1, 2, 12, 0)),
 Row(a=3, b=4.0, c='string3', d=datetime.date(2000, 3, 1), e=datetime.datetime(2000, 1, 3, 12, 0))]

In [10]:
# Use .take() with the number of rows to prevent OOM
df.take(1)

[Row(a=1, b=2.0, c='string1', d=datetime.date(2000, 1, 1), e=datetime.datetime(2000, 1, 1, 12, 0))]

In [11]:
# Convert to a pandas dataframe with .toPandas()
df.toPandas()

Unnamed: 0,a,b,c,d,e
0,1,2.0,string1,2000-01-01,2000-01-01 12:00:00
1,2,3.0,string2,2000-02-01,2000-01-02 12:00:00
2,3,4.0,string3,2000-03-01,2000-01-03 12:00:00


# Selecting and Accessing Data

In [16]:
from pyspark.sql import Column
from pyspark.sql.functions import upper
# Columns are used to select columns from a DataFrame
df.select(df['a']).show()

+---+
|  a|
+---+
|  1|
|  2|
|  3|
+---+



In [17]:
# Creating new columns
df.withColumn('upper_c', upper(df['c'])).show()

+---+---+-------+----------+-------------------+-------+
|  a|  b|      c|         d|                  e|upper_c|
+---+---+-------+----------+-------------------+-------+
|  1|2.0|string1|2000-01-01|2000-01-01 12:00:00|STRING1|
|  2|3.0|string2|2000-02-01|2000-01-02 12:00:00|STRING2|
|  3|4.0|string3|2000-03-01|2000-01-03 12:00:00|STRING3|
+---+---+-------+----------+-------------------+-------+



In [18]:
# Rename a column
df.withColumnRenamed('a', 'changed_a').show()

+---------+---+-------+----------+-------------------+
|changed_a|  b|      c|         d|                  e|
+---------+---+-------+----------+-------------------+
|        1|2.0|string1|2000-01-01|2000-01-01 12:00:00|
|        2|3.0|string2|2000-02-01|2000-01-02 12:00:00|
|        3|4.0|string3|2000-03-01|2000-01-03 12:00:00|
+---------+---+-------+----------+-------------------+



In [19]:
# select a subset of rows
df.filter(df['a'] == 1).show()

+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  1|2.0|string1|2000-01-01|2000-01-01 12:00:00|
+---+---+-------+----------+-------------------+



# Applying Functions

PySpark supports User Defined Functions (UDFs) and other APIs that execute native Python functions.

Pandas UDFs are created by decorating a Python function with `@pandas_udf()`. There are 2 types of UDF:

1. Pandas UDF
  - Scalar
  - row-at-a-time
2. Grouped UDF 

In [20]:
import pandas as pd
from pyspark.sql.functions import pandas_udf

@pandas_udf('long')
def pandas_plus_one(series: pd.Series) -> pd.Series:
    """Return a Series holding each input value incremented by one.

    Registered as a Pandas UDF whose declared return type is 'long'.
    """
    incremented = series + 1
    return incremented

df.select(pandas_plus_one(df['a'])).show()

+------------------+
|pandas_plus_one(a)|
+------------------+
|                 2|
|                 3|
|                 4|
+------------------+



In [None]:
from pyspark.sql.functions import pandas_udf, PandasUDFType

# Use pandas_udf to define a scalar Pandas UDF. The pd.Series type hints tell
# Spark which kind of Pandas UDF this is, replacing the PandasUDFType argument
# (deprecated since Spark 3.0) and matching the style used earlier in this guide.
@pandas_udf('double')
def pandas_plus_one(v: pd.Series) -> pd.Series:
    """Return the input values incremented by one.

    Input and output are both a pandas.Series of doubles.
    """
    return v + 1

# df has columns a-e at this point (there is no column `v`); `b` is the double column.
df.withColumn('v2', pandas_plus_one(df['b']))

In [21]:
# Alternatively .mapInPandas allows direct use of API in pandas
def pandas_filter_func(iterator):
    """Yield, for each pandas DataFrame in ``iterator``, only the rows where column 'a' equals 1.

    One filtered DataFrame is yielded per input DataFrame, even when no rows match.
    """
    for batch in iterator:
        keep = batch['a'] == 1
        yield batch[keep]

df.mapInPandas(pandas_filter_func, schema=df.schema).show()

+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  1|2.0|string1|2000-01-01|2000-01-01 12:00:00|
+---+---+-------+----------+-------------------+



# Grouping Data

Grouping uses the split-apply-combine strategy: the data is grouped by a condition, a function is applied to each group, and the results are combined back into a single DataFrame.

In [22]:
# Create dummy dataset
df = spark.createDataFrame([
    ['red', 'banana', 1, 10], ['blue', 'banana', 2, 20], ['red', 'carrot', 3, 30],
    ['blue', 'grape', 4, 40], ['red', 'carrot', 5, 50], ['black', 'carrot', 6, 60],
    ['red', 'banana', 7, 70], ['red', 'grape', 8, 80]], schema=['color', 'fruit', 'v1', 'v2'])
df.show()

+-----+------+---+---+
|color| fruit| v1| v2|
+-----+------+---+---+
|  red|banana|  1| 10|
| blue|banana|  2| 20|
|  red|carrot|  3| 30|
| blue| grape|  4| 40|
|  red|carrot|  5| 50|
|black|carrot|  6| 60|
|  red|banana|  7| 70|
|  red| grape|  8| 80|
+-----+------+---+---+



In [25]:
# Applying the average aggregation command
df.groupby('color').avg().show()

+-----+-------+-------+
|color|avg(v1)|avg(v2)|
+-----+-------+-------+
|  red|    4.8|   48.0|
| blue|    3.0|   30.0|
|black|    6.0|   60.0|
+-----+-------+-------+



In [26]:
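# Applying a pandas function to each group
#   - `schema=df.schema` keeps v1 as an integer column, so the output shows whole numbers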
def plus_mean(pandas_df):
    """Return a copy of ``pandas_df`` with column ``v1`` centred on its mean.

    Despite the name, the mean of ``v1`` is subtracted from each value, not added.
    The input DataFrame is not modified; all other columns are carried over unchanged.
    """
    centered = pandas_df.v1 - pandas_df.v1.mean()
    return pandas_df.assign(v1=centered)

df.groupby('color').applyInPandas(plus_mean, schema=df.schema).show()

+-----+------+---+---+
|color| fruit| v1| v2|
+-----+------+---+---+
|black|carrot|  0| 60|
| blue|banana| -1| 20|
| blue| grape|  1| 40|
|  red|banana| -3| 10|
|  red|carrot| -1| 30|
|  red|carrot|  0| 50|
|  red|banana|  2| 70|
|  red| grape|  3| 80|
+-----+------+---+---+



# Getting Data in/out

Use `.<action>.<file_type>()` (for example, `spark.read.csv(...)` to import a dataset or `df.write.orc(...)` to export one) together with any other parameters

In [None]:
# df.write.orc('something.orc')

# Working with SQL

DataFrames and Spark SQL run on the same engine, so any DataFrame imported through Spark can be queried using SQL.

In [27]:
# This creates a view with the table name using the existing dataframe
df.createOrReplaceTempView('tableA')

# Running an SQL query
spark.sql("SELECT COUNT(*) FROM tableA").show()

+--------+
|count(1)|
+--------+
|       8|
+--------+



In [28]:
# UDFs can be registered and invoked with SQL

@pandas_udf('integer')
def add_one(s: pd.Series) -> pd.Series:
    """Return a Series holding each value of ``s`` incremented by one.

    Registered as a Pandas UDF whose declared return type is 'integer'.
    """
    result = s + 1
    return result

# Register the function to Spark
spark.udf.register('add_one', add_one)

spark.sql(
  "SELECT add_one(v1) from tableA"
).show()

+-----------+
|add_one(v1)|
+-----------+
|          2|
|          3|
|          4|
|          5|
|          6|
|          7|
|          8|
|          9|
+-----------+

