
# Pandas vs. PySpark vs. Koalas
> *Python flavors for data processing and analysis*


<br>
  
<img width="1200px" src ='https://owshqblobstg.blob.core.windows.net/stgfiles/png_files/pandas_pyspark_koalas0.png'>
  
<br>


> * ***Pandas [Driver]*** = *a fast, powerful, flexible, and easy-to-use open-source data analysis and manipulation tool, built on top of the Python programming language. It runs on the driver only. [**doesn't scale**]*  
*https://pandas.pydata.org* 

> * ***PySpark [Driver & Executor]*** = *a library for applying SQL-like analysis to huge amounts of structured or semi-structured data. Plain SQL queries can also be used with it. [**scales**]*  
*https://databricks.com/glossary/pyspark*

> * ***Koalas [Driver & Executor]*** = *an open-source project that provides a drop-in replacement for pandas, the library commonly used by data scientists. [**scales**]*  
*https://docs.databricks.com/languages/koalas.html*

In [0]:
# pandas -- reference docs: https://pandas.pydata.org/docs/
# time taken = ?
import pandas as pd

# Read the gold_reviews parquet directory via its local-style /dbfs path,
# then render the resulting frame in the notebook.
ds_pandas_gold_reviews = pd.read_parquet(
    path="/dbfs/mnt/bs-production/parquet/gold_reviews/"
)
display(ds_pandas_gold_reviews)

In [0]:
# pyspark -- reference docs: https://spark.apache.org/docs/latest/api/python/
# time taken = ?
# NOTE(review): `spark` is not defined in this notebook; presumably it is the
# session object injected by the runtime -- confirm.
ds_pyspark_gold_reviews = (
    spark.read
    .format("parquet")
    .load("dbfs:/mnt/bs-production/parquet/gold_reviews")
)
display(ds_pyspark_gold_reviews)

review_id,business_id,user_id,review_stars,review_useful,store_name,store_city,store_state,store_category,store_review_count,store_stars,user_name,user_average_stars,user_importance
qFePitPivYwbdzq9jtDNVA,ty5KQYqYRxwXDG_e4pz-4w,BMf5e5t3pZinfjD37EvMig,5,1,Absinthe,Las Vegas,NV,Performing Arts,1973,4.5,Amy,4.13,rockstar
qFePitPivYwbdzq9jtDNVA,ty5KQYqYRxwXDG_e4pz-4w,BMf5e5t3pZinfjD37EvMig,5,1,Absinthe,Las Vegas,NV,Performing Arts,1973,4.5,Amy,4.15,rockstar
qFePitPivYwbdzq9jtDNVA,ty5KQYqYRxwXDG_e4pz-4w,BMf5e5t3pZinfjD37EvMig,5,1,Absinthe,Las Vegas,NV,Performing Arts,1658,4.5,Amy,4.13,rockstar
qFePitPivYwbdzq9jtDNVA,ty5KQYqYRxwXDG_e4pz-4w,BMf5e5t3pZinfjD37EvMig,5,1,Absinthe,Las Vegas,NV,Performing Arts,1658,4.5,Amy,4.15,rockstar
qFePitPivYwbdzq9jtDNVA,ty5KQYqYRxwXDG_e4pz-4w,BMf5e5t3pZinfjD37EvMig,5,1,Absinthe,Las Vegas,NV,Performing Arts,1658,4.5,Amy,4.13,rockstar
qFePitPivYwbdzq9jtDNVA,ty5KQYqYRxwXDG_e4pz-4w,BMf5e5t3pZinfjD37EvMig,5,1,Absinthe,Las Vegas,NV,Performing Arts,1658,4.5,Amy,4.15,rockstar
qFePitPivYwbdzq9jtDNVA,ty5KQYqYRxwXDG_e4pz-4w,BMf5e5t3pZinfjD37EvMig,5,1,Absinthe,Las Vegas,NV,Performing Arts,1973,4.5,Amy,4.13,rockstar
qFePitPivYwbdzq9jtDNVA,ty5KQYqYRxwXDG_e4pz-4w,BMf5e5t3pZinfjD37EvMig,5,1,Absinthe,Las Vegas,NV,Performing Arts,1973,4.5,Amy,4.15,rockstar
qFePitPivYwbdzq9jtDNVA,ty5KQYqYRxwXDG_e4pz-4w,BMf5e5t3pZinfjD37EvMig,5,1,Absinthe,Las Vegas,NV,Performing Arts,1658,4.5,Amy,4.13,rockstar
qFePitPivYwbdzq9jtDNVA,ty5KQYqYRxwXDG_e4pz-4w,BMf5e5t3pZinfjD37EvMig,5,1,Absinthe,Las Vegas,NV,Performing Arts,1658,4.5,Amy,4.15,rockstar


In [0]:
# Number of rows in the Spark DataFrame loaded from gold_reviews.
ds_pyspark_gold_reviews.count()

In [0]:
# The bare string below is an expression whose value is discarded; it keeps
# the plain-pandas version of this load visible for side-by-side comparison.
"""
import pandas as pd
ds_pandas_gold_reviews = pd.read_parquet("/dbfs/mnt/bs-production/parquet/gold_reviews/")
"""

# koalas -- reference docs: https://koalas.readthedocs.io/en/latest/#pyspark
# time taken = ?
# Used here in place of: import pandas as pd
import databricks.koalas as ks

# Same read_parquet call shape as the pandas version, but with a "dbfs:" URI.
ds_koalas_gold_reviews = ks.read_parquet(
    path="dbfs:/mnt/bs-production/parquet/gold_reviews"
)
display(ds_koalas_gold_reviews)

review_id,business_id,user_id,review_stars,review_useful,store_name,store_city,store_state,store_category,store_review_count,store_stars,user_name,user_average_stars,user_importance
qFePitPivYwbdzq9jtDNVA,ty5KQYqYRxwXDG_e4pz-4w,BMf5e5t3pZinfjD37EvMig,5,1,Absinthe,Las Vegas,NV,Performing Arts,1973,4.5,Amy,4.13,rockstar
qFePitPivYwbdzq9jtDNVA,ty5KQYqYRxwXDG_e4pz-4w,BMf5e5t3pZinfjD37EvMig,5,1,Absinthe,Las Vegas,NV,Performing Arts,1973,4.5,Amy,4.15,rockstar
qFePitPivYwbdzq9jtDNVA,ty5KQYqYRxwXDG_e4pz-4w,BMf5e5t3pZinfjD37EvMig,5,1,Absinthe,Las Vegas,NV,Performing Arts,1658,4.5,Amy,4.13,rockstar
qFePitPivYwbdzq9jtDNVA,ty5KQYqYRxwXDG_e4pz-4w,BMf5e5t3pZinfjD37EvMig,5,1,Absinthe,Las Vegas,NV,Performing Arts,1658,4.5,Amy,4.15,rockstar
qFePitPivYwbdzq9jtDNVA,ty5KQYqYRxwXDG_e4pz-4w,BMf5e5t3pZinfjD37EvMig,5,1,Absinthe,Las Vegas,NV,Performing Arts,1658,4.5,Amy,4.13,rockstar
qFePitPivYwbdzq9jtDNVA,ty5KQYqYRxwXDG_e4pz-4w,BMf5e5t3pZinfjD37EvMig,5,1,Absinthe,Las Vegas,NV,Performing Arts,1658,4.5,Amy,4.15,rockstar
qFePitPivYwbdzq9jtDNVA,ty5KQYqYRxwXDG_e4pz-4w,BMf5e5t3pZinfjD37EvMig,5,1,Absinthe,Las Vegas,NV,Performing Arts,1973,4.5,Amy,4.13,rockstar
qFePitPivYwbdzq9jtDNVA,ty5KQYqYRxwXDG_e4pz-4w,BMf5e5t3pZinfjD37EvMig,5,1,Absinthe,Las Vegas,NV,Performing Arts,1973,4.5,Amy,4.15,rockstar
qFePitPivYwbdzq9jtDNVA,ty5KQYqYRxwXDG_e4pz-4w,BMf5e5t3pZinfjD37EvMig,5,1,Absinthe,Las Vegas,NV,Performing Arts,1658,4.5,Amy,4.13,rockstar
qFePitPivYwbdzq9jtDNVA,ty5KQYqYRxwXDG_e4pz-4w,BMf5e5t3pZinfjD37EvMig,5,1,Absinthe,Las Vegas,NV,Performing Arts,1658,4.5,Amy,4.15,rockstar


In [0]:
# pandas API on Spark (pyspark.pandas) -- references:
#   https://databricks.com/blog/2021/10/04/pandas-api-on-upcoming-apache-spark-3-2.html
#   https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/options.html#getting-and-setting-options
#
# Imported under the alias `pd` in place of `import pandas as pd`, so the
# pandas-style call below keeps the same shape and only the path differs.
# NOTE(review): this rebinds both `pd` and `ds_pandas_gold_reviews`, which the
# plain-pandas cell also assigns; anything run after this cell sees these values.
import pyspark.pandas as pd

ds_pandas_gold_reviews = pd.read_parquet(
    path="dbfs:/mnt/bs-production/parquet/gold_reviews"
)
display(ds_pandas_gold_reviews)

review_id,business_id,user_id,review_stars,review_useful,store_name,store_city,store_state,store_category,store_review_count,store_stars,user_name,user_average_stars,user_importance
qFePitPivYwbdzq9jtDNVA,ty5KQYqYRxwXDG_e4pz-4w,BMf5e5t3pZinfjD37EvMig,5,1,Absinthe,Las Vegas,NV,Performing Arts,1973,4.5,Amy,4.13,rockstar
qFePitPivYwbdzq9jtDNVA,ty5KQYqYRxwXDG_e4pz-4w,BMf5e5t3pZinfjD37EvMig,5,1,Absinthe,Las Vegas,NV,Performing Arts,1973,4.5,Amy,4.15,rockstar
qFePitPivYwbdzq9jtDNVA,ty5KQYqYRxwXDG_e4pz-4w,BMf5e5t3pZinfjD37EvMig,5,1,Absinthe,Las Vegas,NV,Performing Arts,1658,4.5,Amy,4.13,rockstar
qFePitPivYwbdzq9jtDNVA,ty5KQYqYRxwXDG_e4pz-4w,BMf5e5t3pZinfjD37EvMig,5,1,Absinthe,Las Vegas,NV,Performing Arts,1658,4.5,Amy,4.15,rockstar
qFePitPivYwbdzq9jtDNVA,ty5KQYqYRxwXDG_e4pz-4w,BMf5e5t3pZinfjD37EvMig,5,1,Absinthe,Las Vegas,NV,Performing Arts,1658,4.5,Amy,4.13,rockstar
qFePitPivYwbdzq9jtDNVA,ty5KQYqYRxwXDG_e4pz-4w,BMf5e5t3pZinfjD37EvMig,5,1,Absinthe,Las Vegas,NV,Performing Arts,1658,4.5,Amy,4.15,rockstar
qFePitPivYwbdzq9jtDNVA,ty5KQYqYRxwXDG_e4pz-4w,BMf5e5t3pZinfjD37EvMig,5,1,Absinthe,Las Vegas,NV,Performing Arts,1973,4.5,Amy,4.13,rockstar
qFePitPivYwbdzq9jtDNVA,ty5KQYqYRxwXDG_e4pz-4w,BMf5e5t3pZinfjD37EvMig,5,1,Absinthe,Las Vegas,NV,Performing Arts,1973,4.5,Amy,4.15,rockstar
qFePitPivYwbdzq9jtDNVA,ty5KQYqYRxwXDG_e4pz-4w,BMf5e5t3pZinfjD37EvMig,5,1,Absinthe,Las Vegas,NV,Performing Arts,1658,4.5,Amy,4.13,rockstar
qFePitPivYwbdzq9jtDNVA,ty5KQYqYRxwXDG_e4pz-4w,BMf5e5t3pZinfjD37EvMig,5,1,Absinthe,Las Vegas,NV,Performing Arts,1658,4.5,Amy,4.15,rockstar
