# Differences between pandas and pandas API on Spark.

In [17]:
import pandas as pd
import numpy as np
import pyspark.pandas as pds
from pyspark.sql import SparkSession
import os
os.environ['PYARROW_IGNORE_TIMEZONE']='1'

# Object Creation

In [18]:
s = pds.Series([1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

# Creating a pandas-on-Spark DataFrame by passing a dict of objects that can be converted to series-like.

In [19]:
psdf = pds.DataFrame(
    {'a': [1, 2, 3, 4, 5, 6],
     'b': [100, 200, 300, 400, 500, 600],
     'c': ["one", "two", "three", "four", "five", "six"]},
    index=[10, 20, 30, 40, 50, 60])
psdf

Unnamed: 0,a,b,c
10,1,100,one
20,2,200,two
30,3,300,three
40,4,400,four
50,5,500,five
60,6,600,six


# Creating a pandas DataFrame by passing a numpy array, with a datetime index and labeled columns:

In [20]:
dates = pd.date_range('20130101', periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [21]:
pdf = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
pdf

Unnamed: 0,A,B,C,D
2013-01-01,-1.643184,-0.268242,0.30869,-0.772715
2013-01-02,-0.265726,-0.812232,-1.232928,1.325677
2013-01-03,1.058228,-0.302428,0.130427,0.547813
2013-01-04,-0.214892,-0.587242,-0.449409,-0.357438
2013-01-05,-1.038244,-0.57782,0.490685,0.18299
2013-01-06,0.493449,-0.756945,0.471962,-0.470539


# Now, this pandas DataFrame can be converted to a pandas-on-Spark DataFrame

In [22]:
psdf = pds.from_pandas(pdf)
type(psdf)
psdf

Unnamed: 0,A,B,C,D
2013-01-01,-1.643184,-0.268242,0.30869,-0.772715
2013-01-02,-0.265726,-0.812232,-1.232928,1.325677
2013-01-03,1.058228,-0.302428,0.130427,0.547813
2013-01-04,-0.214892,-0.587242,-0.449409,-0.357438
2013-01-05,-1.038244,-0.57782,0.490685,0.18299
2013-01-06,0.493449,-0.756945,0.471962,-0.470539


# Creating a Spark DataFrame from pandas DataFrame

In [24]:
spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame(pdf)

In [26]:
sdf.show()

+--------------------+--------------------+--------------------+--------------------+
|                   A|                   B|                   C|                   D|
+--------------------+--------------------+--------------------+--------------------+
| -1.6431841486580727|-0.26824157247487607|  0.3086901538243969| -0.7727154619788652|
|-0.26572583985123877| -0.8122321745830039| -1.2329279877816393|  1.3256772922376032|
|  1.0582284222035492| -0.3024277581822104| 0.13042663802810145|  0.5478134586011343|
|-0.21489220763714806| -0.5872419040169711|-0.44940935742731203|-0.35743831612275606|
| -1.0382442087711001| -0.5778203260679722|  0.4906853349179449| 0.18299016928460426|
|  0.4934489316817739| -0.7569452245810048|   0.471961926192307| -0.4705391270390965|
+--------------------+--------------------+--------------------+--------------------+



# Creating pandas-on-Spark DataFrame from Spark DataFrame.

In [27]:
psdf = sdf.to_pandas_on_spark()
psdf

Unnamed: 0,A,B,C,D
0,-1.643184,-0.268242,0.30869,-0.772715
1,-0.265726,-0.812232,-1.232928,1.325677
2,1.058228,-0.302428,0.130427,0.547813
3,-0.214892,-0.587242,-0.449409,-0.357438
4,-1.038244,-0.57782,0.490685,0.18299
5,0.493449,-0.756945,0.471962,-0.470539


# Plotting

In [45]:
pd.options.plotting.backend = 'plotly'
pser = pd.Series(np.random.randn(1000),
                 index=pd.date_range('1/1/2000', periods=1000))

In [40]:
psser = ps.Series(pser)

In [41]:
psser = psser.cummax()

In [42]:
psser.plot()

ImportError: plotly is required for plotting when the default backend 'plotly' is selected.