# Differences between pandas and pandas API on Spark.

In [8]:
import os

# PYARROW_IGNORE_TIMEZONE has to be in the environment before pyspark.pandas
# is imported: the import checks for it and logs a warning when it is missing,
# so setting it afterwards (as this cell used to) comes too late.
os.environ['PYARROW_IGNORE_TIMEZONE'] = '1'

import numpy as np
import pandas as pd
import pyspark.pandas as pds
from pyspark.sql import SparkSession

# Object Creation

In [9]:
# Build a pandas-on-Spark Series from a plain Python list; the np.nan entry
# is why the result is displayed with dtype float64.
s = pds.Series(
    data=[1, 3, 5, np.nan, 6, 8],
)
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

# Creating a pandas-on-Spark DataFrame by passing a dict of objects that can be converted into a series-like structure:

In [10]:
# Each dict key becomes a column; the explicit index supplies the row labels.
psdf = pds.DataFrame(
    data={
        'a': [1, 2, 3, 4, 5, 6],
        'b': [100, 200, 300, 400, 500, 600],
        'c': ["one", "two", "three", "four", "five", "six"],
    },
    index=[10, 20, 30, 40, 50, 60],
)
psdf

Unnamed: 0,a,b,c
10,1,100,one
20,2,200,two
30,3,300,three
40,4,400,four
50,5,500,five
60,6,600,six


# Creating a pandas DataFrame by passing a NumPy array, with a datetime index and labeled columns:

In [11]:
# Six consecutive daily timestamps starting at 2013-01-01; reused below as the
# row index of the pandas DataFrame.
dates = pd.date_range(start='20130101', periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [13]:
# Draw the 6x4 standard-normal sample from a seeded Generator so the frame is
# identical on every Restart & Run All; the previous unseeded np.random.randn
# call produced different values on each run. Re-run this cell to refresh the
# displayed output.
pdf = pd.DataFrame(
    np.random.default_rng(0).standard_normal((6, 4)),
    index=dates,
    columns=list('ABCD'),
)
pdf

Unnamed: 0,A,B,C,D
2013-01-01,-0.89353,-0.243796,-1.465854,-0.567482
2013-01-02,0.294092,1.28674,-0.658727,0.20738
2013-01-03,-0.757038,-1.929519,1.144764,0.562847
2013-01-04,-0.898885,-0.254794,-0.491524,0.61592
2013-01-05,0.122109,-0.179174,0.51839,0.298397
2013-01-06,-1.025646,-1.755562,0.660653,-0.238806
