In [20]:
# start a spark session (getOrCreate hands back the running session
# if there already is one, otherwise it builds a new one)
import pyspark

spark = (
    pyspark.sql.SparkSession
    .builder
    .getOrCreate()
)

In [21]:
import pandas as pd
import numpy as np

In [22]:
# spark can turn a pandas dataframe into a spark dataframe,
# so we begin by building a small pandas dataset:
#   n     -> the integers 0 through 19
#   group -> 20 letters drawn at random from 'a', 'b', 'c'
# the seed is fixed so the random letters are the same on every run
np.random.seed(456)
pandas_dataframe = pd.DataFrame(
    {
        'n': np.arange(20),
        'group': np.random.choice(list('abc'), 20),
    }
)
pandas_dataframe

Unnamed: 0,n,group
0,0,b
1,1,b
2,2,c
3,3,a
4,4,c
5,5,c
6,6,a
7,7,b
8,8,a
9,9,b


In [23]:
# convert the pandas dataframe to a spark dataframe
# (displaying a spark dataframe shows its column names and types, not its rows)
df = spark.createDataFrame(pandas_dataframe)
df

DataFrame[n: bigint, group: string]

In [24]:
# print the first five rows of our data
# (called without an argument, .show() prints up to 20 rows)
df.show(5)

+---+-----+
|  n|group|
+---+-----+
|  0|    b|
|  1|    b|
|  2|    c|
|  3|    a|
|  4|    c|
+---+-----+
only showing top 5 rows



In [25]:
# describe the spark dataframe
# note: the result is itself a spark dataframe of summary statistics,
# so only its schema is displayed until we call .show() on it
df.describe()

DataFrame[summary: string, n: string, group: string]

In [26]:
# show the 'describe' summary: count, mean, stddev, min and max per column
# (mean and stddev come back as null for the string column 'group')
df.describe().show()

+-------+-----------------+-----+
|summary|                n|group|
+-------+-----------------+-----+
|  count|               20|   20|
|   mean|              9.5| null|
| stddev|5.916079783099616| null|
|    min|                0|    a|
|    max|               19|    c|
+-------+-----------------+-----+



In [27]:
### Let's use a larger dataset

# load the 'mpg' dataset from pydataset and convert it to a spark dataframe
from pydataset import data

mpg = spark.createDataFrame(data('mpg'))
# peek at the first three rows
mpg.show(3)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 3 rows



In [28]:
# a difference between spark and pandas dataframes: accessing a column on a
# spark dataframe gives back a Column object that refers to that slice of the
# dataframe, but it does not contain the data itself like pandas would

mpg.hwy

Column<'hwy'>

In [29]:
# or select several columns at once: the result is a new spark dataframe,
# and again only its schema is displayed, not its rows
mpg.select(mpg.hwy, mpg.cty, mpg.model)

DataFrame[hwy: bigint, cty: bigint, model: string]

In [30]:
# to see the data itself, we have to call .show() at the end

mpg.select(mpg.hwy, mpg.cty, mpg.model).show(10)

+---+---+----------+
|hwy|cty|     model|
+---+---+----------+
| 29| 18|        a4|
| 29| 21|        a4|
| 31| 20|        a4|
| 30| 21|        a4|
| 26| 16|        a4|
| 26| 18|        a4|
| 27| 18|        a4|
| 26| 18|a4 quattro|
| 25| 16|a4 quattro|
| 28| 20|a4 quattro|
+---+---+----------+
only showing top 10 rows



In [31]:
# we can also return a column that represents the values from the
# original column with 1 added, like so:

mpg.select(mpg.hwy, mpg.hwy + 1).show(5)

+---+---------+
|hwy|(hwy + 1)|
+---+---------+
| 29|       30|
| 29|       30|
| 31|       32|
| 30|       31|
| 26|       27|
+---+---------+
only showing top 5 rows



In [32]:
# we can use .alias to change the name the column has in the result

mpg.select(mpg.hwy.alias('highway_mailage')).show(5)

+---------------+
|highway_mailage|
+---------------+
|             29|
|             29|
|             31|
|             30|
|             26|
+---------------+
only showing top 5 rows



In [33]:
# column objects can be stored in variables and referenced later

# the hwy column under a new name
col1 = mpg.hwy.alias('highway_milage')
# the hwy column divided by 2 (the result is fractional, e.g. 29 -> 14.5)
col2 = (mpg.hwy / 2).alias('highway_milage_halved')
mpg.select(col1, col2).show(5)

+--------------+---------------------+
|highway_milage|highway_milage_halved|
+--------------+---------------------+
|            29|                 14.5|
|            29|                 14.5|
|            31|                 15.5|
|            30|                 15.0|
|            26|                 13.0|
+--------------+---------------------+
only showing top 5 rows



### We can also create columns with the <code>col</code> and <code>expr</code> functions from the <code>pyspark.sql.functions</code> module

In [34]:
from pyspark.sql.functions import col, expr

### col

In [36]:
# test: col() builds a Column object from just the column's name
col('hwy')

Column<'hwy'>

#### We can mix and match the syntax we use; the column object produced by the <code>col</code> function is the same kind of column object we saw previously

In [37]:
# a column expression for the mean of the highway and city values
avg_column = (col('hwy') + col('cty')) / 2

mpg.select(
    col('hwy').alias('highway_milage'),  # column created with col()
    mpg.cty.alias('city_milage'),  # column created by attribute access
    avg_column.alias('avg_milage'),  # column expression stored in a variable
).show(5)

+--------------+-----------+----------+
|highway_milage|city_milage|avg_milage|
+--------------+-----------+----------+
|            29|         18|      23.5|
|            29|         21|      25.0|
|            31|         20|      25.5|
|            30|         21|      25.5|
|            26|         16|      21.0|
+--------------+-----------+----------+
only showing top 5 rows



### expr

In [39]:
# expr returns the same type of column object, but allows us to express
# manipulations to the column within the string that defines the column.

mpg.select(
    expr('hwy'), # is the same as 'col'
    expr('hwy + 1'), # an arithmetic expression
    expr('hwy AS highway_milage'), # using an alias
    expr('hwy + 1 AS highway_incremented'), # a combination of the above
).show(5)

+---+---------+--------------+-------------------+
|hwy|(hwy + 1)|highway_milage|highway_incremented|
+---+---------+--------------+-------------------+
| 29|       30|            29|                 30|
| 29|       30|            29|                 30|
| 31|       32|            31|                 32|
| 30|       31|            30|                 31|
| 26|       27|            26|                 27|
+---+---------+--------------+-------------------+
only showing top 5 rows



In [40]:
# example: all four columns created below are identical (the hwy column
# renamed to 'highway'); which syntax to use is just a style choice.

mpg.select(
    # attribute access + .alias
    mpg.hwy.alias('highway'),
    # or col() + .alias
    col('hwy').alias('highway'),
    # or expr() + .alias
    expr('hwy').alias('highway'),
    # or expr() with the alias written inside the expression string
    expr('hwy AS highway')
).show(5)

+-------+-------+-------+-------+
|highway|highway|highway|highway|
+-------+-------+-------+-------+
|     29|     29|     29|     29|
|     29|     29|     29|     29|
|     31|     31|     31|     31|
|     30|     30|     30|     30|
|     26|     26|     26|     26|
+-------+-------+-------+-------+
only showing top 5 rows



## Spark SQL

In [41]:
# 'register' the dataframe with spark as a temporary view named 'mpg',
# so that sql queries can refer to it by that name

mpg.createOrReplaceTempView('mpg')

In [42]:
# now we can write a sql query against the mpg table;
# spark.sql gives back a spark dataframe, so only its schema is displayed

spark.sql(
    """
    SELECT hwy, cty, (hwy + cty) / 2 AS avg
    FROM mpg
    """

)

DataFrame[hwy: bigint, cty: bigint, avg: double]

In [43]:
# show the first five rows of the query result

spark.sql(
    """
    SELECT hwy, cty, (hwy + cty) / 2 AS avg
    FROM mpg
    """

).show(5)

+---+---+----+
|hwy|cty| avg|
+---+---+----+
| 29| 18|23.5|
| 29| 21|25.0|
| 31| 20|25.5|
| 30| 21|25.5|
| 26| 16|21.0|
+---+---+----+
only showing top 5 rows



#### Type Casting

In [44]:
# we can view the types of the columns in our dataframe in one of two ways:

# dtypes: a list of (column name, type) pairs

mpg.dtypes

[('manufacturer', 'string'),
 ('model', 'string'),
 ('displ', 'double'),
 ('year', 'bigint'),
 ('cyl', 'bigint'),
 ('trans', 'string'),
 ('drv', 'string'),
 ('cty', 'bigint'),
 ('hwy', 'bigint'),
 ('fl', 'string'),
 ('class', 'string')]

In [45]:
# printSchema: prints the schema as a tree, including whether each column
# is nullable (columns that dtypes reports as bigint are listed here as long)

mpg.printSchema()

root
 |-- manufacturer: string (nullable = true)
 |-- model: string (nullable = true)
 |-- displ: double (nullable = true)
 |-- year: long (nullable = true)
 |-- cyl: long (nullable = true)
 |-- trans: string (nullable = true)
 |-- drv: string (nullable = true)
 |-- cty: long (nullable = true)
 |-- hwy: long (nullable = true)
 |-- fl: string (nullable = true)
 |-- class: string (nullable = true)



In [46]:
# we can also convert a column from one type to another using the .cast method

mpg.select(mpg.hwy.cast('string')).printSchema()

root
 |-- hwy: string (nullable = true)



In [47]:
# if a value is not able to be converted, it will be replaced with null

mpg.select(mpg.model, mpg.model.cast('int')).show(5)

+-----+-----+
|model|model|
+-----+-----+
|   a4| null|
|   a4| null|
|   a4| null|
|   a4| null|
|   a4| null|
+-----+-----+
only showing top 5 rows



#### Other Basic Built-in Functions

- <code> concat </code>: to concatenate strings
- <code> sum </code>: to sum a group
- <code> avg </code>: to take the average of a group
- <code> min </code>: to find the minimum
- <code> max </code>: to find the maximum