## Pandas API on Spark
> Options and settings

In [None]:
# compute.max_rows
# display.max_rows
# compute.shortcut_limit
# compute.ops_on_diff_frames
# compute.default_index_type
# compute.ordered_head
# plotting.max_rows
# plotting.sample_ratio
# plotting.backend

In [5]:
import os
# Set before pyspark.pandas is imported below.
# NOTE(review): presumably this silences pyspark's PyArrow timezone warning
# at import time — confirm against the pyspark documentation.
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"

import pyspark.pandas as ps

In [8]:
# Read the current display.max_rows value via attribute access on ps.options.
ps.options.display.max_rows

1000

In [9]:
# Override display.max_rows with attribute-style assignment on ps.options,
# then read it back to confirm the new value.
ps.options.display.max_rows = 10
ps.options.display.max_rows

10

In [10]:
# The API is composed of 3 relevant functions:
# 1. get_option()
# 2. set_option()
# 3. reset_option()

In [11]:
# Look up an option by its dotted name.
ps.get_option('compute.max_rows')

1000

In [12]:
# Assign a new value to an option by its dotted name.
ps.set_option('compute.max_rows', 2000)

In [13]:
# Read the option back to confirm the value assigned with set_option().
ps.get_option('compute.max_rows')

2000

In [18]:
# Restore both options changed above to their default values.
ps.reset_option('display.max_rows')
ps.reset_option('compute.max_rows')

In [19]:
# Confirm display.max_rows is back to the value it had before it was overridden.
ps.get_option('display.max_rows')

1000

In [20]:
# option_context() applies the given option values only inside the `with`
# block; the prints after the block show the values once the context exits.
option_names = ('display.max_rows', 'compute.max_rows')

with ps.option_context('display.max_rows', 10, 'compute.max_rows', 20):
    for option_name in option_names:
        print(ps.get_option(option_name))

for option_name in option_names:
    print(ps.get_option(option_name))

10
20
1000
1000


In [None]:
# Index types:
# 1. sequence
# 2. distributed-sequence
# 3. distributed

In [28]:
# Show which index type is currently configured, then build a small
# pandas-on-Spark DataFrame under that setting and show its Python type.
current_index_type = ps.get_option('compute.default_index_type')
print(current_index_type)

psdf = ps.range(3)
psdf_type = type(psdf)
print(psdf_type)

sequence
<class 'pyspark.pandas.frame.DataFrame'>


In [25]:
# For comparison: the equivalent object built with plain pandas has a
# different type than the pandas-on-Spark DataFrame created above.
import pandas as pd
type(pd.DataFrame(range(3)))

pandas.core.frame.DataFrame

In [26]:
# Inspect the index that was generated for psdf.
psdf.index

Int64Index([0, 1, 2], dtype='int64')

In [31]:
# 'distributed' index type: the generated index values are large,
# non-consecutive integers rather than 0..n-1 (see the psdf.index output
# in the next cell).
ps.set_option('compute.default_index_type', 'distributed')
psdf = ps.range(3)

In [32]:
# Inspect the index generated for psdf under the 'distributed' index type.
psdf.index

Int64Index([8589934592, 17179869184, 25769803776], dtype='int64')

In [35]:
# Restore the default index type and display it to confirm the reset.
ps.reset_option('compute.default_index_type')
ps.get_option('compute.default_index_type')

'sequence'