# Pandas API on Spark

Shows the key differences between pandas and the pandas API on Spark.

In [2]:
import pandas as pd
import numpy as np
import pyspark.pandas as ps
from pyspark.sql import SparkSession



# Object Creation

Creating Series, DataFrames and other data structures using the pandas API on Spark

In [3]:
# Creating a Series from a Python list; the np.nan entry marks a missing value
s = ps.Series([1, 3, 5, np.nan, 6, 8])
# Bare last expression: rendered as the cell's output
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [4]:
# Creating a DataFrame from a dict of columns, with an explicit
# (non-default) integer index
psdf = ps.DataFrame(
    data={
        'a': [1, 2, 3, 4, 5, 6],
        'b': [100, 200, 300, 400, 500, 600],
        'c': ["one", "two", "three", "four", "five", "six"],
    },
    index=[10, 20, 30, 40, 50, 60],
)

In [5]:
# Display the DataFrame created in the previous cell
psdf

Unnamed: 0,a,b,c
10,1,100,one
20,2,200,two
30,3,300,three
40,4,400,four
50,5,500,five
60,6,600,six


In [6]:
# pandas DataFrame -> pandas API on Spark DataFrame
## Build the pandas DataFrame: six consecutive dates as the index and
## four columns (A-D) of random normal values
dates = pd.date_range(start='20130101', periods=6)
pdf = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=list('ABCD'),
)

## Hand the pandas DataFrame over to pandas API on Spark
psdf = ps.from_pandas(pdf)
psdf

Unnamed: 0,A,B,C,D
2013-01-01,1.870151,-0.543835,-1.417537,0.071349
2013-01-02,-0.745022,-0.659855,-0.266685,-0.998854
2013-01-03,0.398669,-0.902246,2.055565,0.279596
2013-01-04,-0.331432,0.087261,-0.165452,1.178686
2013-01-05,-1.209078,-0.504687,-1.735076,0.039445
2013-01-06,-1.728753,-1.948222,-0.822342,-0.032405


In [7]:
# Get the active SparkSession, or create one if none exists yet
spark = (
    SparkSession.builder
    .appName('PandasAPI on Spark')
    .getOrCreate()
)

In [8]:
# Create a plain Spark DataFrame (not a pandas API on Spark DataFrame)
# from the pandas DataFrame `pdf`.
# As the output shows, only columns A-D appear; the pandas date index
# is not part of the resulting Spark DataFrame.
sdf = spark.createDataFrame(pdf)
sdf.show()

+-------------------+-------------------+--------------------+--------------------+
|                  A|                  B|                   C|                   D|
+-------------------+-------------------+--------------------+--------------------+
| 1.8701514588824242| -0.543835011119871|   -1.41753667282922| 0.07134888329338522|
|-0.7450220752252489| -0.659854835580294| -0.2666846622186714| -0.9988544962042697|
| 0.3986688111562146|-0.9022459232441857|   2.055565239195255|  0.2795957852989773|
| -0.331431597751275|0.08726078212486754|-0.16545196196832423|  1.1786862161532354|
|-1.2090782063201833|-0.5046865940547667| -1.7350759780205685|0.039445036289597696|
|-1.7287528898005402|-1.9482221773587187| -0.8223418784833127|-0.03240532656591444|
+-------------------+-------------------+--------------------+--------------------+



In [9]:
# Convert the Spark DataFrame to a pandas API on Spark DataFrame.
# The result is displayed with a new integer index (0-5 in the output).
psdf = sdf.pandas_api()
psdf

Unnamed: 0,A,B,C,D
0,1.870151,-0.543835,-1.417537,0.071349
1,-0.745022,-0.659855,-0.266685,-0.998854
2,0.398669,-0.902246,2.055565,0.279596
3,-0.331432,0.087261,-0.165452,1.178686
4,-1.209078,-0.504687,-1.735076,0.039445
5,-1.728753,-1.948222,-0.822342,-0.032405


In [None]:
# All common pandas functions are also available.
# Only the last expression of a cell is rendered, so run these one at a
# time (or wrap them in display(...)) to inspect each result.
psdf.dtypes
psdf.head()
psdf.index
psdf.columns
psdf.to_numpy()  # collects all data to the driver; use only on small data
psdf.describe()
psdf.T
psdf.sort_index()
psdf.sort_values(by='B')  # `by` is required: sort rows by the values in column B

# Missing Data

Pandas API on Spark primarily uses `np.nan` to represent missing data. By default, missing values are not included in computations.

In [10]:
# Rebuild the data so that it contains NaNs
## Keep the first four dates and append a new, initially empty column 'E'
pdf1 = pdf.reindex(
    index=dates[:4],
    columns=[*pdf.columns, 'E'],
)
## Label slices are inclusive, so this sets 'E' for the first two dates only
pdf1.loc[dates[0]:dates[1], 'E'] = 1

psdf1 = ps.from_pandas(pdf1)
psdf1

Unnamed: 0,A,B,C,D,E
2013-01-01,1.870151,-0.543835,-1.417537,0.071349,1.0
2013-01-02,-0.745022,-0.659855,-0.266685,-0.998854,1.0
2013-01-03,0.398669,-0.902246,2.055565,0.279596,
2013-01-04,-0.331432,0.087261,-0.165452,1.178686,


In [11]:
# Dropping NaNs: how='any' drops every row that contains at least one
# missing value (here, the rows where 'E' is NaN)
psdf1.dropna(how='any')

Unnamed: 0,A,B,C,D,E
2013-01-01,1.870151,-0.543835,-1.417537,0.071349,1.0
2013-01-02,-0.745022,-0.659855,-0.266685,-0.998854,1.0


In [12]:
# Filling NaNs: replace every missing value with 5.
# The result is only displayed; psdf1 itself is not reassigned.
psdf1.fillna(value=5)

Unnamed: 0,A,B,C,D,E
2013-01-01,1.870151,-0.543835,-1.417537,0.071349,1.0
2013-01-02,-0.745022,-0.659855,-0.266685,-0.998854,1.0
2013-01-03,0.398669,-0.902246,2.055565,0.279596,5.0
2013-01-04,-0.331432,0.087261,-0.165452,1.178686,5.0


# Grouping

Different "group by" operations:

* Splitting data into groups based on some criteria
* Applying a function to each group independently
* Combining the results into a data structure

In [13]:
# Example data for grouping: two string key columns (A, B) and two
# columns of random normal values (C, D)
psdf = ps.DataFrame(
    data={
        'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
        'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
        'C': np.random.randn(8),
        'D': np.random.randn(8),
    }
)

In [14]:
# Group by and aggregate: group the rows by column A and sum within each
# group (the output shows the sums for columns C and D)
psdf.groupby('A').sum()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
foo,0.545007,-0.999114
bar,-2.007125,1.960935


# Exporting/ Importing Data

In [None]:
# Export: pandas API on Spark writes a directory named 'foo.csv' that
# contains part files, rather than a single CSV file
psdf.to_csv('foo.csv')
# Import: read_csv is a module-level function (ps.read_csv), not a
# DataFrame method
ps.read_csv('foo.csv')

# Spark Configurations

Various configurations in PySpark can be applied internally in pandas API on Spark using the `.conf.set()` command.

A full list of configurations can be found [here](https://spark.apache.org/docs/latest/configuration.html#spark-properties)

In [16]:
import warnings

# Getting the current configuration, kept in `prev` so that it can be
# restored later
prev = spark.conf.get('spark.sql.execution.arrow.pyspark.enabled')
print(f"Current configuration: {prev}")

# Changing a pandas API on Spark option: use the 'distributed' default index type
ps.set_option('compute.default_index_type', 'distributed')

# Changing warning options: ignore all Python warnings from here on
warnings.filterwarnings('ignore')

Current configuration: false


In [17]:
# Setting general Spark configurations
## Enabling pyspark arrow optimization
spark.conf.set('spark.sql.execution.arrow.pyspark.enabled', True)
%timeit ps.range(3000).to_pandas()

111 ms ± 23.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [18]:
## Disabling Arrow optimization and timing the conversion of a
## 300000-row frame to pandas for comparison
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", False)
%timeit ps.range(300000).to_pandas()

899 ms ± 46 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [19]:
# Resetting options to their defaults
ps.reset_option("compute.default_index_type")
# Restore the Arrow setting that was captured in `prev` before it was changed
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", prev)