In [29]:
#import necessary libraries
import os
os.environ['PYARROW_IGNORE_TIMEZONE'] = '1'
import sys
import time


import pandas as pd
import pyspark.pandas as ps
import seaborn as sns
import matplotlib.pyplot as plt

In [30]:
#stop pyspark warnings
import warnings
warnings.filterwarnings('ignore')

In [31]:
#load iris dataset
df_iris = sns.load_dataset('iris')
print(df_iris.shape)

(150, 5)


In [32]:
#convert pandas dataframe to pyspark dataframe
df_iris_spark = ps.from_pandas(df_iris)
print(df_iris_spark.shape)

(150, 5)


In [33]:
#Agg in Pandas
df_iris.groupby('species', as_index=False).agg(
    {
        'sepal_length': 'mean',
        'sepal_width': 'mean',
        'petal_length': 'mean',
        'petal_width': 'median'
    }
)

Unnamed: 0,species,sepal_length,sepal_width,petal_length,petal_width
0,setosa,5.006,3.428,1.462,0.2
1,versicolor,5.936,2.77,4.26,1.3
2,virginica,6.588,2.974,5.552,2.0


In [34]:
#But median is not supported in Spark. Passing median will throw an error.
df_iris_spark.groupby('species', as_index=False).agg(
    {
        'sepal_length': 'mean',
        'sepal_width': 'mean',
        'petal_length': 'mean',
    }
)
#One workaround is ->
df1 = df_iris_spark.groupby('species', as_index=False).agg(
    {
        'sepal_length': 'mean',
        'sepal_width': 'mean',
        'petal_length': 'mean',
    }
)
df2 = df_iris_spark.groupby('species', as_index=False)['petal_width'].median()
#now merging them would do the trick
df1.merge(df2, on='species')

Unnamed: 0,species,sepal_length,sepal_width,petal_length,petal_width
0,setosa,5.006,3.428,1.462,0.2
1,versicolor,5.936,2.77,4.26,1.3
2,virginica,6.588,2.974,5.552,2.0


In [41]:
#save as parquet file and log the time
t1 = time.time()
df_iris_spark.to_parquet('../data/iris.parquet', index_col=None)
t2 = time.time()
dt_parquet = t2-t1
print('Time taken to save as parquet file: ', dt_parquet)                                                                                                                               

Time taken to save as parquet file:  0.1717061996459961


In [38]:
#save as csv file and log the time
t1 = time.time()
df_iris_spark.to_csv('../data/iris.csv', index_col=None)
t2 = time.time()
dt_csv = t2-t1
print('Time taken to save as csv file: ', dt_csv)

Time taken to save as csv file:  0.1897752285003662


In [40]:
#check which one is faster
print('Parquet is faster by: ', dt_csv/dt_parquet)

Parquet is faster by:  0.9775752424978569
