<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
import numpy as np
import pandas as pd

In [2]:
import pyspark

from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf # @udf("integer") def myfunc(x,y): return x - y
from pyspark.sql import functions as F # stddev format_number date_format, dayofyear, when
from pyspark.sql.types import StructField, StringType, IntegerType, StructType

# Report the versions of the core libraries this notebook runs against.
print([(lib.__name__, lib.__version__) for lib in (np, pd, pyspark)])

# Start (or reuse) the Spark session and expose the handles used below.
spark = SparkSession.builder.appName('bhishan').getOrCreate()
sc = spark.sparkContext

# SQLContext usage: spark_df = sqlContext.createDataFrame(pandas_df)
sqlContext = SQLContext(sc)
sc.setLogLevel("INFO")

[('numpy', '1.17.1'), ('pandas', '0.25.1'), ('pyspark', '2.4.4')]


In [36]:
# Build the toy pandas frame row by row: two rows for category 'a',
# three rows for category 'b'.
df = pd.DataFrame(
    [
        ('a', 10),
        ('a', 20),
        ('b', 100),
        ('b', 200),
        ('b', 300),
    ],
    columns=['category', 'value'],
)

print(df)

  category  value
0        a     10
1        a     20
2        b    100
3        b    200
4        b    300


In [37]:
# Show the pandas dtype of every column of `df`
# (the recorded output lists `object` for category and `int64` for value).
df.dtypes

category    object
value        int64
dtype: object

In [38]:
# Explicit Spark schema: a nullable string column and a nullable integer column.
schema = (
    StructType()
    .add('category', StringType(), True)
    .add('value', IntegerType(), True)
)

# Convert the pandas frame into a Spark DataFrame using that schema.
sdf = sqlContext.createDataFrame(df, schema=schema)
sdf.show()

+--------+-----+
|category|value|
+--------+-----+
|       a|   10|
|       a|   20|
|       b|  100|
|       b|  200|
|       b|  300|
+--------+-----+



In [39]:
# pandas reference result: broadcast each category's mean of `value`
# back onto every row of that category.
df['category_mean'] = df['value'].groupby(df['category']).transform('mean')

print(df)

  category  value  category_mean
0        a     10             15
1        a     20             15
2        b    100            200
3        b    200            200
4        b    300            200


In [44]:
from pyspark.sql.functions import col as _col

# Per-category mean of `value`; the "means" alias lets the join cells
# refer to its columns as `means.<column>`.
sdf_means = (
    sdf
    .groupBy("category")
    .mean("value")
    .alias("means")
)
sdf_means.show()

+--------+----------+
|category|avg(value)|
+--------+----------+
|       b|     200.0|
|       a|      15.0|
+--------+----------+



In [45]:
# Approach 1: join the per-category means back onto the original rows.
# Both sides are aliased so the two `category` columns can be told apart.
sdf2 = sdf.alias("sdf").join(
    sdf_means,
    on=(F.col("sdf.category") == F.col("means.category")),
)

sdf2.show()

+--------+-----+--------+----------+
|category|value|category|avg(value)|
+--------+-----+--------+----------+
|       b|  100|       b|     200.0|
|       b|  200|       b|     200.0|
|       b|  300|       b|     200.0|
|       a|   10|       a|      15.0|
|       a|   20|       a|      15.0|
+--------+-----+--------+----------+



In [41]:
# Approach 2: same join as above, but mark the small aggregated frame as
# broadcastable so Spark can ship it to every executor instead of shuffling
# the larger side.
#
# The aggregated frame is `sdf_means` (aliased "means"); the previous code
# passed an undefined name `means`, which raises NameError on a fresh kernel.
sdf3 = sdf.alias("sdf").join(
    F.broadcast(sdf_means),
    _col("sdf.category") ==
    _col("means.category"))

sdf3.show()

+--------+-----+--------+----------+
|category|value|category|avg(value)|
+--------+-----+--------+----------+
|       a|   10|       a|      15.0|
|       a|   20|       a|      15.0|
|       b|  100|       b|     200.0|
|       b|  200|       b|     200.0|
|       b|  300|       b|     200.0|
+--------+-----+--------+----------+



In [42]:
# Approach 3: SQL window function.
# Expose `sdf` to Spark SQL under the name 'sdf'. `createOrReplaceTempView`
# replaces `registerTempTable`, which has been deprecated since Spark 2.0;
# it also makes this cell safe to re-run because it overwrites any existing
# view of the same name.
sdf.createOrReplaceTempView('sdf')

# mean(value) over a per-category partition attaches the category mean to
# every row without collapsing the rows.
sdf4 = spark.sql("""
select *, mean(value)
OVER (PARTITION BY category) as category_mean
from sdf
""")

sdf4.show()

+--------+-----+-------------+
|category|value|category_mean|
+--------+-----+-------------+
|       b|  100|        200.0|
|       b|  200|        200.0|
|       b|  300|        200.0|
|       a|   10|         15.0|
|       a|   20|         15.0|
+--------+-----+-------------+



In [43]:
from pyspark.sql.window import Window

# Approach 4: DataFrame window API.
# `Window.partitionBy` is a static factory, so it is called on the class
# directly rather than on a throwaway `Window()` instance.
window_var = Window.partitionBy('category')

# Attach each category's mean of `value` to every row of that category.
sdf5 = sdf.withColumn(
    'category_mean',
    F.mean('value').over(window_var),
)

sdf5.show()

+--------+-----+-------------+
|category|value|category_mean|
+--------+-----+-------------+
|       b|  100|        200.0|
|       b|  200|        200.0|
|       b|  300|        200.0|
|       a|   10|         15.0|
|       a|   20|         15.0|
+--------+-----+-------------+

