## PySpark Test

In [5]:
# Display the active SparkSession object.
# NOTE(review): this cell's execution count (In [5]) is later than the session-creation
# cell below (In [4]); on a fresh kernel it must run after `spark` has been created,
# unless the kernel predefines `spark` -- TODO confirm and reorder if needed.
spark

In [2]:
# import modules
import pandas as pd
import numpy as np
#import matplotlib.pyplot as plt
#import seaborn as sns

In [None]:
# Figures inline and set visualization style
#%matplotlib inline
#sns.set()

In [3]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as fn

### Light-weight PySpark Session

In [4]:
# Local mode: no master is set on the builder; only the application name is given.
spark = (
    SparkSession.builder
    .appName("sqldemo")
    .getOrCreate()
)

In [None]:
# YARN mode: request a session on the YARN cluster manager with explicit executor sizing.
# NOTE(review): getOrCreate() hands back an already-active session when one exists,
# so run this cell instead of (not after) another session-creation cell.
spark = (
    SparkSession.builder
    .master("yarn")
    .config('spark.executor.instances', '99')
    .config('spark.executor.memory', '4G')
    .appName("mldemo")
    .getOrCreate()
)

In [None]:
# Standalone mode: connect to the Spark standalone master at the given spark:// URL.
# NOTE(review): getOrCreate() hands back an already-active session when one exists,
# so run this cell instead of (not after) another session-creation cell.
spark = (
    SparkSession.builder
    .master("spark://master.example.org:7077")
    .config('spark.cores.max', '1')
    .config('spark.executor.memory', '1G')
    .appName("clusterdemo")
    .getOrCreate()
)

In [6]:
# Show the application name of the active session's SparkContext,
# which tells us which of the session-creation cells is in effect.
spark.sparkContext.appName

'sqldemo'

In [7]:
# Enable Apache Arrow for Spark <-> pandas data conversion in this session.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", True)

In [11]:
# pandas API on Spark. This module is bundled with PySpark 3.2 and later.
# NOTE(review): the recorded output of this cell is a ModuleNotFoundError, so the kernel
# that produced it presumably ran an older PySpark -- TODO confirm the version/upgrade;
# every cell below that uses `ps` depends on this import succeeding.
import pyspark.pandas as ps

ModuleNotFoundError: No module named 'pyspark.pandas'

In [None]:
# Set the pandas-on-Spark option that controls the default index type to "distributed".
ps.set_option("compute.default_index_type", "distributed")

# 讀取檔案

In [None]:
# Use "file://<absolute path>" to read a file from the local filesystem
iris_df = ps.read_csv("file:///home/hadoop/sparkcodes/data/iris.csv")
# Use "hdfs://bdse41.example.com:8020" to read a file from the distributed file system
df_train = ps.read_csv('hdfs://bdse41.example.com:8020/user/hadoop/sample_data.csv')
# If $HADOOP_CONF_DIR is set, a scheme-less path like this is read from the distributed file system
# (note: this reassigns df_train, replacing the frame read on the line above)
df_train = ps.read_csv('/user/hadoop/sample_data.csv')


## Pandas on Spark Dataframe
### Load performance data

In [None]:
# Read the CSV file from HDFS into a pandas-on-Spark DataFrame
df_train = ps.read_csv('hdfs://bdse41.example.com:8020/user/hadoop/sample_data.csv')

In [None]:
# Shape of the pandas-on-Spark frame
df_train.shape

In [None]:
%%timeit 
# Runs 7 times in total
df_train.shape

In [None]:
# Column data types of the pandas-on-Spark frame
df_train.dtypes

In [None]:
# Preview the first rows (head() with its default row count)
df_train.head()

In [None]:
# Preview the last 10 rows
df_train.tail(10)

In [None]:
# Convert only the 'Quarter' and 'Month' columns to a plain pandas DataFrame.
# The result (`pdf`) is loaded into local memory, so keep the selection small.
pdf = df_train[['Quarter','Month']].to_pandas()

In [None]:
# Shape of the in-memory pandas copy created by to_pandas() in the previous cell
pdf.shape

## PySpark Dataframe

In [None]:
# Load the CSV dataset from HDFS, using the first line as the header and inferring column types
sdf = spark.read.csv(
    'hdfs://bdse41.example.com:8020/user/hadoop/yellow_2009_50m.csv',
    inferSchema=True,
    header=True,
)
# Register the DataFrame as a temp view so SQL commands can be run against it
sdf.createOrReplaceTempView("dfTable")

## Inspect data

In [None]:
# List the column names of the Spark DataFrame
sdf.columns

In [None]:
# Number of columns in the Spark DataFrame
len(sdf.columns)

In [None]:
%%timeit
# Time counting the number of records in the Spark DataFrame
sdf.count()

In [None]:
%%timeit
# Time counting the number of records in the Spark DataFrame
# (identical to the previous cell; NOTE(review): presumably kept to compare repeated timings -- confirm)
sdf.count()

In [None]:
# Shape of the dataset as a (row count, column count) tuple; count() triggers a Spark job
sdf.count(),len(sdf.columns)

## Using Pandas UDF (Spark 3.x)
### Load data

In [None]:
# Load the sample CSV from HDFS, using the first line as the header and inferring column types
df = spark.read.csv(
    'hdfs://bdse41.example.com:8020/user/hadoop/sample_data.csv',
    inferSchema=True,
    header=True,
)
# Register the DataFrame as a temp view for SQL access.
# NOTE(review): the view name "dfTable" is also registered for `sdf` earlier in this
# notebook; createOrReplaceTempView replaces that earlier view -- confirm this is intended.
df.createOrReplaceTempView("dfTable")

## Inspect data

In [None]:
# List the column names of the sample-data Spark DataFrame
df.columns

In [None]:
from pyspark.sql.functions import pandas_udf, PandasUDFType

In [None]:
# Pandas UDF (Series -> Series) declared with an 'int' return type
@pandas_udf('int')
def remaining_yrs3(age: pd.Series) -> pd.Series:
    """Return, element-wise, the number of years left until 100.

    Args:
        age: a pandas Series of ages (one batch of the input column).

    Returns:
        A pandas Series holding ``100 - age`` for every element.
    """
    return 100 - age

In [None]:
# Add a 'yrs_left' column computed by the pandas UDF from 'age', then print 5 rows
(
    df
    .withColumn('yrs_left', remaining_yrs3(df['age']))
    .show(5)
)

In [None]:
# Pandas UDF (two Series -> Series) declared with a 'double' return type
@pandas_udf('double')
def prod3(rating: pd.Series, exp: pd.Series) -> pd.Series:
    """Return the element-wise product of two columns.

    Args:
        rating: a pandas Series (one batch of the first input column).
        exp: a pandas Series (the matching batch of the second input column).

    Returns:
        A pandas Series holding ``rating * exp`` for every element.
    """
    return rating * exp

In [None]:
# Add a "product" column computed by the pandas UDF from two columns, then print 5 rows
(
    df
    .withColumn("product", prod3(df['ratings'], df['experience']))
    .show(5)
)