In [3]:
import numpy as np
import pandas as pd
import pyspark

In [4]:
# Attach to the running Spark session, or start one if none exists yet.
spark = (
    pyspark.sql.SparkSession
    .builder
    .getOrCreate()
)

In [5]:
# Fix the global NumPy seed so the random group labels are reproducible.
np.random.seed(456)

# 20 rows: a running integer `n` and a random label drawn from a/b/c.
pandas_dataframe = pd.DataFrame(
    {
        "n": np.arange(20),
        "group": np.random.choice(["a", "b", "c"], 20),
    }
)
pandas_dataframe

Unnamed: 0,n,group
0,0,b
1,1,b
2,2,c
3,3,a
4,4,c
5,5,c
6,6,a
7,7,b
8,8,a
9,9,b


In [6]:
# Build a Spark DataFrame from the pandas one and display its schema repr.
df = spark.createDataFrame(data=pandas_dataframe)
df

DataFrame[n: bigint, group: string]

In [7]:
# Print the rows as a text table; 20 is show()'s default row limit.
df.show(n=20)

+---+-----+
|  n|group|
+---+-----+
|  0|    b|
|  1|    b|
|  2|    c|
|  3|    a|
|  4|    c|
|  5|    c|
|  6|    a|
|  7|    b|
|  8|    a|
|  9|    b|
| 10|    b|
| 11|    a|
| 12|    b|
| 13|    a|
| 14|    b|
| 15|    b|
| 16|    c|
| 17|    c|
| 18|    a|
| 19|    c|
+---+-----+



In [8]:
# Item access yields the same Column object as attribute access.
df["group"]

Column<b'group'>

In [10]:
# Project a single column, referring to it by name.
df.select("group").show()

+-----+
|group|
+-----+
|    b|
|    b|
|    c|
|    a|
|    c|
|    c|
|    a|
|    b|
|    a|
|    b|
|    b|
|    a|
|    b|
|    a|
|    b|
|    b|
|    c|
|    c|
|    a|
|    c|
+-----+



In [13]:
# Arithmetic on a Column builds a new column expression; the result
# column is named "(n + 1)".
df.select(df["n"] + 1).show()

+-------+
|(n + 1)|
+-------+
|      1|
|      2|
|      3|
|      4|
|      5|
|      6|
|      7|
|      8|
|      9|
|     10|
|     11|
|     12|
|     13|
|     14|
|     15|
|     16|
|     17|
|     18|
|     19|
|     20|
+-------+



In [15]:
# Summary statistics (count, mean, stddev, min, max) as a DataFrame.
(
    df
    .describe()
    .show()
)

+-------+-----------------+-----+
|summary|                n|group|
+-------+-----------------+-----+
|  count|               20|   20|
|   mean|              9.5| null|
| stddev|5.916079783099616| null|
|    min|                0|    a|
|    max|               19|    c|
+-------+-----------------+-----+



In [27]:
# describe() returns an ordinary DataFrame, so its columns (including the
# "summary" label column) can be selected and reordered like any other.
(
    df
    .describe()
    .select("n", "summary")
    .show()
)

+-----------------+-------+
|                n|summary|
+-----------------+-------+
|               20|  count|
|              9.5|   mean|
|5.916079783099616| stddev|
|                0|    min|
|               19|    max|
+-----------------+-------+



In [37]:
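# Does not work: "summary" is a column of df.describe(), not of df, and
# `describe` is missing its call parentheses.
# df.select("n","summary").describe.show()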

In [38]:
from pydataset import data

In [39]:
# Load the "mpg" sample dataset via pydataset and hand it to Spark.
mpg = spark.createDataFrame(
    data("mpg"),
)
mpg.show()

+------------+------------------+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|             model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+------------------+-----+----+---+----------+---+---+---+---+-------+
|        audi|                a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|                a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|                a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|                a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|                a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
|        audi|                a4|  2.8|1999|  6|manual(m5)|  f| 18| 26|  p|compact|
|        audi|                a4|  3.1|2008|  6|  auto(av)|  f| 18| 27|  p|compact|
|        audi|        a4 quattro|  1.8|1999|  4|manual(m5)|  4| 18| 26|  p|compact|
|        audi|        a4 quattro|  1.8|1999|  4|  auto(l5)|  4| 16| 25|  p|c

In [42]:
# Columns may be passed as Column objects or as plain names; alias()
# renames `hwy` to `hw_mileage` in the result.
mpg.select(
    mpg["model"],
    "manufacturer",
    mpg["hwy"].alias("hw_mileage"),
).show()

+------------------+------------+----------+
|             model|manufacturer|hw_mileage|
+------------------+------------+----------+
|                a4|        audi|        29|
|                a4|        audi|        29|
|                a4|        audi|        31|
|                a4|        audi|        30|
|                a4|        audi|        26|
|                a4|        audi|        26|
|                a4|        audi|        27|
|        a4 quattro|        audi|        26|
|        a4 quattro|        audi|        25|
|        a4 quattro|        audi|        28|
|        a4 quattro|        audi|        27|
|        a4 quattro|        audi|        25|
|        a4 quattro|        audi|        25|
|        a4 quattro|        audi|        25|
|        a4 quattro|        audi|        25|
|        a6 quattro|        audi|        24|
|        a6 quattro|        audi|        25|
|        a6 quattro|        audi|        23|
|c1500 suburban 2wd|   chevrolet|        20|
|c1500 sub

In [43]:
# Preview only the first five rows.
mpg.show(n=5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [60]:
# Column expression for the mean of city and highway mileage; nothing is
# computed until it is used in a select.
avg_mile_column = (
    (mpg["cty"] + mpg["hwy"]) / 2
).alias("avg_mileage")
avg_mile_column

Column<b'((cty + hwy) / 2) AS `avg_mileage`'>

In [61]:
# "*" keeps every existing column; the expression is appended as a new one.
(
    mpg
    .select("*", avg_mile_column)
    .show()
)

+------------+------------------+-----+----+---+----------+---+---+---+---+-------+-----------+
|manufacturer|             model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|avg_mileage|
+------------+------------------+-----+----+---+----------+---+---+---+---+-------+-----------+
|        audi|                a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|       23.5|
|        audi|                a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|       25.0|
|        audi|                a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|       25.5|
|        audi|                a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|       25.5|
|        audi|                a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|       21.0|
|        audi|                a4|  2.8|1999|  6|manual(m5)|  f| 18| 26|  p|compact|       22.0|
|        audi|                a4|  3.1|2008|  6|  auto(av)|  f| 18| 27|  p|compact|       22.5|
|        audi|        a4 quattro|  1.8|1

In [62]:
from pyspark.sql.functions import col

In [63]:
# Inspect what the imported name `col` is bound to.
col

<function pyspark.sql.functions._create_function.<locals>._(col)>

In [67]:
# col() builds a Column from just a name, with no DataFrame involved.
col("hwy")

Column<b'hwy'>

In [68]:
# No .show() here, so only the resulting DataFrame's repr is displayed.
mpg.select(
    col("hwy"),
)

DataFrame[hwy: bigint]

In [69]:
# Keep only the two mileage columns, in this order.
just_hwy_and_cty = mpg.select(
    "hwy",
    "cty",
)

In [71]:
# Preview the first five rows of the two-column projection.
just_hwy_and_cty.show(n=5)

+---+---+
|hwy|cty|
+---+---+
| 29| 18|
| 29| 21|
| 31| 20|
| 30| 21|
| 26| 16|
+---+---+
only showing top 5 rows



In [72]:
# Append a string-typed copy of `cyl` next to the original columns.
mpg.select(
    "*",
    mpg["cyl"].cast("string").alias("cyl_string"),
).show()

+------------+------------------+-----+----+---+----------+---+---+---+---+-------+----------+
|manufacturer|             model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|cyl_string|
+------------+------------------+-----+----+---+----------+---+---+---+---+-------+----------+
|        audi|                a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|         4|
|        audi|                a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|         4|
|        audi|                a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|         4|
|        audi|                a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|         4|
|        audi|                a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|         6|
|        audi|                a4|  2.8|1999|  6|manual(m5)|  f| 18| 26|  p|compact|         6|
|        audi|                a4|  3.1|2008|  6|  auto(av)|  f| 18| 27|  p|compact|         6|
|        audi|        a4 quattro|  1.8|1999|  4|ma

In [77]:
# Single-column projection by name; show() prints the first 20 rows.
mpg.select("manufacturer").show()

+------------+
|manufacturer|
+------------+
|        audi|
|        audi|
|        audi|
|        audi|
|        audi|
|        audi|
|        audi|
|        audi|
|        audi|
|        audi|
|        audi|
|        audi|
|        audi|
|        audi|
|        audi|
|        audi|
|        audi|
|        audi|
|   chevrolet|
|   chevrolet|
+------------+
only showing top 20 rows



In [80]:
from pyspark.sql.functions import min, max

In [86]:
# Gotcha: `from pyspark.sql.functions import min, max` rebinds `min`/`max`
# to Spark's column aggregates, which do not accept a Python list (calling
# `min([1, 2, 3])` then fails with a Py4JError). Reach for the Python
# builtin explicitly when working with plain Python values.
import builtins

builtins.min([1, 2, 3])

1



In [87]:
# Preview only the first five rows.
mpg.show(n=5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows

