# Spark API Mini Exercises

Copy the code below to create a pandas dataframe with 20 rows and 3 columns:

In [105]:
import pandas as pd
import numpy as np
import pyspark

from pyspark.sql.functions import col, expr, concat,\
sum, avg, min, max, count, mean, lit, regexp_extract,\
regexp_replace, when, asc, desc, month, year, quarter

spark = pyspark.sql.SparkSession.builder.getOrCreate()

np.random.seed(13)

import warnings
warnings.filterwarnings('ignore')

## 1. Spark Dataframe Basics

    1. Use the starter code above to create a pandas dataframe.

In [19]:
pandas_dataframe = pd.DataFrame(
    {
        "n": np.random.randn(20),
        "group": np.random.choice(list("xyz"), 20),
        "abool": np.random.choice([True, False], 20),
    }
)

    2. Convert the pandas dataframe to a spark dataframe. From this point
       forward, do all of your work with the spark dataframe, not the pandas
       dataframe.

In [20]:
df = spark.createDataFrame(pandas_dataframe)

    3. Show the first 3 rows of the dataframe.

In [21]:
df.show(3)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
+--------------------+-----+-----+
only showing top 3 rows



    4. Show the first 7 rows of the dataframe.

In [22]:
df.show(7)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
|  0.5323378882945463|    y|false|
|  1.3501878997225267|    z|false|
+--------------------+-----+-----+
only showing top 7 rows



    5. What is the difference between `.show` and `.head`?

In [23]:
df.head()

Row(n=-0.712390662050588, group='z', abool=False)

In [24]:
df.show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
|  0.5323378882945463|    y|false|
|  1.3501878997225267|    z|false|
|  0.8612113741693206|    x|false|
|  1.4786857374358966|    z| true|
| -1.0453771305385342|    y| true|
| -0.7889890249515489|    x|false|
|  -1.261605945319069|    y|false|
|  0.5628467852810314|    y| true|
|-0.24332625188556253|    y| true|
|  0.9137407048596775|    y|false|
| 0.31735092273633597|    x|false|
| 0.12730328020698067|    z|false|
|  2.1503829673811126|    y| true|
|  0.6062886568962988|    x|false|
|-0.02677164998644...|    x| true|
+--------------------+-----+-----+



- .head() shows the first row as a breakdown of type and value
- .show() shows the first 20 rows

    6. View a summary of the data using `.describe`.

In [25]:
df.describe()

DataFrame[summary: string, n: string, group: string]

In [26]:
df.describe().show()

+-------+------------------+-----+
|summary|                 n|group|
+-------+------------------+-----+
|  count|                20|   20|
|   mean|0.3664026449885217| null|
| stddev|0.8905322898155363| null|
|    min|-1.261605945319069|    x|
|    max|2.1503829673811126|    z|
+-------+------------------+-----+



    7. Use `.select` to create a new dataframe with just the `n` and `abool`
       columns. View the first 5 rows of this dataframe.

In [28]:
df.select(df.n, df.abool).show(5)

+--------------------+-----+
|                   n|abool|
+--------------------+-----+
|  -0.712390662050588|false|
|   0.753766378659703|false|
|-0.04450307833805...|false|
| 0.45181233874578974|false|
|  1.3451017084510097|false|
+--------------------+-----+
only showing top 5 rows



    8. Use `.select` to create a new dataframe with just the `group` and `abool`
       columns. View the first 5 rows of this dataframe.

In [29]:
df.select(df.group, df.abool).show(5)

+-----+-----+
|group|abool|
+-----+-----+
|    z|false|
|    x|false|
|    z|false|
|    y|false|
|    z|false|
+-----+-----+
only showing top 5 rows



    9. Use `.select` to create a new dataframe with the `group` column and the
       `abool` column renamed to `a_boolean_value`. Show the first 3 rows of
       this dataframe.

In [33]:
df.select(df.group, df.abool.alias('a_boolean_value')).show(3)

+-----+---------------+
|group|a_boolean_value|
+-----+---------------+
|    z|          false|
|    x|          false|
|    z|          false|
+-----+---------------+
only showing top 3 rows



    10. Use `.select` to create a new dataframe with the `group` column and the
       `n` column renamed to `a_numeric_value`. Show the first 6 rows of this
       dataframe.

In [34]:
df.select(df.group, df.n.alias('a_numeric_value')).show(6)

+-----+--------------------+
|group|     a_numeric_value|
+-----+--------------------+
|    z|  -0.712390662050588|
|    x|   0.753766378659703|
|    z|-0.04450307833805...|
|    y| 0.45181233874578974|
|    z|  1.3451017084510097|
|    y|  0.5323378882945463|
+-----+--------------------+
only showing top 6 rows



## 2. Column Manipulation

    1. Use the starter code above to re-create a spark dataframe. Store the
       spark dataframe in a varaible named `df`

In [35]:
df = spark.createDataFrame(pandas_dataframe)

    2. Use `.select` to add 4 to the `n` column. Show the results.

In [36]:
df.select(df.n + 4).show()

+------------------+
|           (n + 4)|
+------------------+
|3.2876093379494122|
| 4.753766378659703|
|3.9554969216619464|
|  4.45181233874579|
|5.3451017084510095|
| 4.532337888294546|
| 5.350187899722527|
|  4.86121137416932|
| 5.478685737435897|
| 2.954622869461466|
|3.2110109750484512|
| 2.738394054680931|
| 4.562846785281032|
|3.7566737481144377|
| 4.913740704859677|
| 4.317350922736336|
| 4.127303280206981|
| 6.150382967381113|
| 4.606288656896298|
|3.9732283500135592|
+------------------+



    3. Subtract 5 from the `n` column and view the results.

In [38]:
df.select(df.n - 5).show()

+-------------------+
|            (n - 5)|
+-------------------+
| -5.712390662050588|
| -4.246233621340297|
| -5.044503078338053|
|  -4.54818766125421|
|-3.6548982915489905|
| -4.467662111705454|
|-3.6498121002774733|
|  -4.13878862583068|
| -3.521314262564103|
| -6.045377130538534|
| -5.788989024951549|
| -6.261605945319069|
| -4.437153214718968|
| -5.243326251885563|
| -4.086259295140323|
| -4.682649077263664|
| -4.872696719793019|
|-2.8496170326188874|
| -4.393711343103702|
| -5.026771649986441|
+-------------------+



    4. Multiply the `n` column by 2. View the results along with the original
       numbers.

In [39]:
df.select(df.n, df.n * 2).show()

+--------------------+--------------------+
|                   n|             (n * 2)|
+--------------------+--------------------+
|  -0.712390662050588|  -1.424781324101176|
|   0.753766378659703|   1.507532757319406|
|-0.04450307833805...|-0.08900615667610691|
| 0.45181233874578974|  0.9036246774915795|
|  1.3451017084510097|  2.6902034169020195|
|  0.5323378882945463|  1.0646757765890926|
|  1.3501878997225267|  2.7003757994450535|
|  0.8612113741693206|  1.7224227483386412|
|  1.4786857374358966|   2.957371474871793|
| -1.0453771305385342| -2.0907542610770684|
| -0.7889890249515489| -1.5779780499030978|
|  -1.261605945319069|  -2.523211890638138|
|  0.5628467852810314|  1.1256935705620628|
|-0.24332625188556253|-0.48665250377112507|
|  0.9137407048596775|   1.827481409719355|
| 0.31735092273633597|  0.6347018454726719|
| 0.12730328020698067| 0.25460656041396135|
|  2.1503829673811126|   4.300765934762225|
|  0.6062886568962988|  1.2125773137925977|
|-0.02677164998644...|-0.0535432

    5. Add a new column named `n2` that is the `n` value multiplied by -1. Show
       the first 4 rows of your dataframe. You should see the original `n` value
       as well as `n2`.

In [45]:
df.select(df.n,(df.n * -1).alias('n2')).show(4)

+--------------------+--------------------+
|                   n|                  n2|
+--------------------+--------------------+
|  -0.712390662050588|   0.712390662050588|
|   0.753766378659703|  -0.753766378659703|
|-0.04450307833805...|0.044503078338053455|
| 0.45181233874578974|-0.45181233874578974|
+--------------------+--------------------+
only showing top 4 rows



    6. Add a new column named `n3` that is the n value squared. Show the first 5
       rows of your dataframe. You should see both `n`, `n2`, and `n3`.

In [46]:
df.select(df.n,(df.n * -1).alias('n2'),(df.n ** 2).alias('n3')).show(5)

+--------------------+--------------------+--------------------+
|                   n|                  n2|                  n3|
+--------------------+--------------------+--------------------+
|  -0.712390662050588|   0.712390662050588|   0.507500455376875|
|   0.753766378659703|  -0.753766378659703|  0.5681637535977627|
|-0.04450307833805...|0.044503078338053455|0.001980523981562...|
| 0.45181233874578974|-0.45181233874578974| 0.20413438944294027|
|  1.3451017084510097| -1.3451017084510097|  1.8092986060778251|
+--------------------+--------------------+--------------------+
only showing top 5 rows



    7. What happens when you run the code below?

        ```python
        df.group + df.abool
        ```

In [47]:
df.group + df.abool

Column<'(group + abool)'>

- you get a spark column object of group + abool

    8. What happens when you run the code below? What is the difference between
       this and the previous code sample?

        ```python
        df.select(df.group + df.abool)
        ```

In [55]:
# df.select(df.group + df.abool)

- You get an error because there are two different data types.

    9. Try adding various other columns together. What are the results of
       combining the different data types?

```df.select(df.n + df.abool)```

- Gives the same error as above. It can't reconcile the different types

```df.select(df.n + df.group)```

- This returns and object that says

```DataFrame[(n + group): double]``` 

- which I'm assuming is a dataframe with a column n + group that has values of type double

```df.select(df.n + df.group).show(5)```

- side note the above code returns a df of nulls

## 3. Type casting

    1. Use the starter code above to re-create a spark dataframe.

In [58]:
df = spark.createDataFrame(pandas_dataframe)

    2. Use `.printSchema` to view the datatypes in your dataframe.

In [63]:
df.printSchema

<bound method DataFrame.printSchema of DataFrame[n: double, group: string, abool: boolean]>

    3. Use `.dtypes` to view the datatypes in your dataframe.

In [62]:
df.dtypes

[('n', 'double'), ('group', 'string'), ('abool', 'boolean')]

    4. What is the difference between the two code samples below?

        ```python
        df.abool.cast('int')
        ```

        ```python
        df.select(df.abool.cast('int')).show()
        ```

 

```df.abool.cast('int')```

- Creates a column object

```df.select(df.abool.cast('int')).show()```

- takes in the column and displays a df with bools as ints

    5. Use `.select` and `.cast` to convert the `abool` column to an integer
       type. View the results.

In [68]:
df.select(df.abool.cast('int')).show()

+-----+
|abool|
+-----+
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    1|
|    1|
|    0|
|    0|
|    1|
|    1|
|    0|
|    0|
|    0|
|    1|
|    0|
|    1|
+-----+



    6. Convert the `group` column to a integer data type and view the results.
       What happens?

```df.select(df.group.cast('int')).show()```

- returns a dataframe of null

    7. Convert the `n` column to a integer data type and view the results. What
       happens?

```df.select(df.n,df.n.cast('int')).show()```

- Rounds down to the nearest whole number

    8. Convert the `abool` column to a string data type and view the results.
       What happens?

```df.select(df.abool, df.abool.cast('string')).show()```
 
- returns a dataframe where the boolean words are stings of the same name

## 4. Built-in Functions

    1. Use the starter code above to re-create a spark dataframe.

In [74]:
df = spark.createDataFrame(pandas_dataframe)

    2. Import the necessary functions from `pyspark.sql.functions`

In [75]:
from pyspark.sql.functions import col, expr, concat, sum, avg, min, max, count, mean

    3. Find the highest `n` value.

In [80]:
df.select(max(df.n)).show()

+------------------+
|            max(n)|
+------------------+
|2.1503829673811126|
+------------------+



    4. Find the lowest `n` value.

In [81]:
df.select(min(df.n)).show()

+------------------+
|            min(n)|
+------------------+
|-1.261605945319069|
+------------------+



    5. Find the average `n` value.

In [82]:
df.select(avg(df.n)).show()

+------------------+
|            avg(n)|
+------------------+
|0.3664026449885217|
+------------------+



    6. Use `concat` to change the `group` column to say, e.g. "Group: x" or
       "Group: y"

In [87]:
df.select(concat(lit('Group: '), df.group).alias('Group')).show()

+--------+
|   Group|
+--------+
|Group: z|
|Group: x|
|Group: z|
|Group: y|
|Group: z|
|Group: y|
|Group: z|
|Group: x|
|Group: z|
|Group: y|
|Group: x|
|Group: y|
|Group: y|
|Group: y|
|Group: y|
|Group: x|
|Group: z|
|Group: y|
|Group: x|
|Group: x|
+--------+



    7. Use `concat` to combine the `n` and `group` columns to produce results
       that look like this: "x: -1.432" or "z: 2.352"

In [91]:
df.select(concat(df.group, lit(': '), df.n).alias('n per group')).show()

+--------------------+
|         n per group|
+--------------------+
|z: -0.71239066205...|
|x: 0.753766378659703|
|z: -0.04450307833...|
|y: 0.451812338745...|
|z: 1.345101708451...|
|y: 0.532337888294...|
|z: 1.350187899722...|
|x: 0.861211374169...|
|z: 1.478685737435...|
|y: -1.04537713053...|
|x: -0.78898902495...|
|y: -1.26160594531...|
|y: 0.562846785281...|
|y: -0.24332625188...|
|y: 0.913740704859...|
|x: 0.317350922736...|
|z: 0.127303280206...|
|y: 2.150382967381...|
|x: 0.606288656896...|
|x: -0.02677164998...|
+--------------------+



## 5. When / Otherwise

    1. Use the starter code above to re-create a spark dataframe.

In [92]:
df = spark.createDataFrame(pandas_dataframe)

    2. Use `when` and `.otherwise` to create a column that contains the text "It
       is true" when `abool` is true and "It is false"" when `abool` is false.

In [97]:
df.select(df.abool,
          concat(lit('It is '), df.abool).alias('abool is'), 
          when(df.abool == True, 'It is True')
          .otherwise('It is False').alias('abool is also')
         ).show()

+-----+-----------+-------------+
|abool|   abool is|abool is also|
+-----+-----------+-------------+
|false|It is false|  It is False|
|false|It is false|  It is False|
|false|It is false|  It is False|
|false|It is false|  It is False|
|false|It is false|  It is False|
|false|It is false|  It is False|
|false|It is false|  It is False|
|false|It is false|  It is False|
| true| It is true|   It is True|
| true| It is true|   It is True|
|false|It is false|  It is False|
|false|It is false|  It is False|
| true| It is true|   It is True|
| true| It is true|   It is True|
|false|It is false|  It is False|
|false|It is false|  It is False|
|false|It is false|  It is False|
| true| It is true|   It is True|
|false|It is false|  It is False|
| true| It is true|   It is True|
+-----+-----------+-------------+



    3. Create a column that contains 0 if n is less than 0, otherwise, the
       original n value.

In [98]:
df.select(df.n, when(df.n < 0, 0).otherwise(df.n)).show()

+--------------------+-----------------------------------+
|                   n|CASE WHEN (n < 0) THEN 0 ELSE n END|
+--------------------+-----------------------------------+
|  -0.712390662050588|                                0.0|
|   0.753766378659703|                  0.753766378659703|
|-0.04450307833805...|                                0.0|
| 0.45181233874578974|                0.45181233874578974|
|  1.3451017084510097|                 1.3451017084510097|
|  0.5323378882945463|                 0.5323378882945463|
|  1.3501878997225267|                 1.3501878997225267|
|  0.8612113741693206|                 0.8612113741693206|
|  1.4786857374358966|                 1.4786857374358966|
| -1.0453771305385342|                                0.0|
| -0.7889890249515489|                                0.0|
|  -1.261605945319069|                                0.0|
|  0.5628467852810314|                 0.5628467852810314|
|-0.24332625188556253|                                0.

## 6. Filter / Where

    1. Use the starter code above to re-create a spark dataframe.

In [99]:
df = spark.createDataFrame(pandas_dataframe)

    2. Use `.filter` or `.where` to select just the rows where the group is `y`
       and view the results.

In [101]:
df.filter(df.group == 'y').show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
| 0.45181233874578974|    y|false|
|  0.5323378882945463|    y|false|
| -1.0453771305385342|    y| true|
|  -1.261605945319069|    y|false|
|  0.5628467852810314|    y| true|
|-0.24332625188556253|    y| true|
|  0.9137407048596775|    y|false|
|  2.1503829673811126|    y| true|
+--------------------+-----+-----+



    3. Select just the columns where the `abool` column is false and view the
       results.

In [107]:
df.where(df.abool == False).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
|  0.5323378882945463|    y|false|
|  1.3501878997225267|    z|false|
|  0.8612113741693206|    x|false|
| -0.7889890249515489|    x|false|
|  -1.261605945319069|    y|false|
|  0.9137407048596775|    y|false|
| 0.31735092273633597|    x|false|
| 0.12730328020698067|    z|false|
|  0.6062886568962988|    x|false|
+--------------------+-----+-----+



    4. Find the columns where the `group` column is *not* `y`.

In [108]:
df.where(df.group != 'y').show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
|  1.3451017084510097|    z|false|
|  1.3501878997225267|    z|false|
|  0.8612113741693206|    x|false|
|  1.4786857374358966|    z| true|
| -0.7889890249515489|    x|false|
| 0.31735092273633597|    x|false|
| 0.12730328020698067|    z|false|
|  0.6062886568962988|    x|false|
|-0.02677164998644...|    x| true|
+--------------------+-----+-----+



    5. Find the columns where `n` is positive.

In [109]:
df.where(df.n > 0).show()

+-------------------+-----+-----+
|                  n|group|abool|
+-------------------+-----+-----+
|  0.753766378659703|    x|false|
|0.45181233874578974|    y|false|
| 1.3451017084510097|    z|false|
| 0.5323378882945463|    y|false|
| 1.3501878997225267|    z|false|
| 0.8612113741693206|    x|false|
| 1.4786857374358966|    z| true|
| 0.5628467852810314|    y| true|
| 0.9137407048596775|    y|false|
|0.31735092273633597|    x|false|
|0.12730328020698067|    z|false|
| 2.1503829673811126|    y| true|
| 0.6062886568962988|    x|false|
+-------------------+-----+-----+



    6. Find the columns where `abool` is true and the `group` column is `z`.

In [111]:
df.where(df.abool == True).filter(df.group == 'z').show()

+------------------+-----+-----+
|                 n|group|abool|
+------------------+-----+-----+
|1.4786857374358966|    z| true|
+------------------+-----+-----+



    7. Find the columns where `abool` is true or the `group` column is `z`.

In [113]:
df.where((df.abool == True) | (df.group == 'z')).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|-0.04450307833805...|    z|false|
|  1.3451017084510097|    z|false|
|  1.3501878997225267|    z|false|
|  1.4786857374358966|    z| true|
| -1.0453771305385342|    y| true|
|  0.5628467852810314|    y| true|
|-0.24332625188556253|    y| true|
| 0.12730328020698067|    z|false|
|  2.1503829673811126|    y| true|
|-0.02677164998644...|    x| true|
+--------------------+-----+-----+



    8. Find the columns where `abool` is false and `n` is less than 1

In [114]:
df.where((df.abool == False) & (df.n < 1)).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  0.5323378882945463|    y|false|
|  0.8612113741693206|    x|false|
| -0.7889890249515489|    x|false|
|  -1.261605945319069|    y|false|
|  0.9137407048596775|    y|false|
| 0.31735092273633597|    x|false|
| 0.12730328020698067|    z|false|
|  0.6062886568962988|    x|false|
+--------------------+-----+-----+



    9. Find the columns where `abool` is false or `n` is less than 1

In [115]:
# Note says columns I'm assuming it means rows
df.where((df.abool == False) | (df.n < 1)).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
|  0.5323378882945463|    y|false|
|  1.3501878997225267|    z|false|
|  0.8612113741693206|    x|false|
| -1.0453771305385342|    y| true|
| -0.7889890249515489|    x|false|
|  -1.261605945319069|    y|false|
|  0.5628467852810314|    y| true|
|-0.24332625188556253|    y| true|
|  0.9137407048596775|    y|false|
| 0.31735092273633597|    x|false|
| 0.12730328020698067|    z|false|
|  0.6062886568962988|    x|false|
|-0.02677164998644...|    x| true|
+--------------------+-----+-----+



## 7. Sorting

    1. Use the starter code above to re-create a spark dataframe.

In [116]:
df = spark.createDataFrame(pandas_dataframe)

    2. Sort by the `n` value.

In [117]:
df.sort(df.n).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -1.261605945319069|    y|false|
| -1.0453771305385342|    y| true|
| -0.7889890249515489|    x|false|
|  -0.712390662050588|    z|false|
|-0.24332625188556253|    y| true|
|-0.04450307833805...|    z|false|
|-0.02677164998644...|    x| true|
| 0.12730328020698067|    z|false|
| 0.31735092273633597|    x|false|
| 0.45181233874578974|    y|false|
|  0.5323378882945463|    y|false|
|  0.5628467852810314|    y| true|
|  0.6062886568962988|    x|false|
|   0.753766378659703|    x|false|
|  0.8612113741693206|    x|false|
|  0.9137407048596775|    y|false|
|  1.3451017084510097|    z|false|
|  1.3501878997225267|    z|false|
|  1.4786857374358966|    z| true|
|  2.1503829673811126|    y| true|
+--------------------+-----+-----+



                                                                                

    3. Sort by the `group` value, both ascending and descending.

In [127]:
df.sort(df.group.asc()).show(), df.sort(df.group.desc()).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|   0.753766378659703|    x|false|
|  0.8612113741693206|    x|false|
| 0.31735092273633597|    x|false|
|  0.6062886568962988|    x|false|
|-0.02677164998644...|    x| true|
| -0.7889890249515489|    x|false|
|  0.9137407048596775|    y|false|
|  2.1503829673811126|    y| true|
|  0.5323378882945463|    y|false|
| 0.45181233874578974|    y|false|
|-0.24332625188556253|    y| true|
| -1.0453771305385342|    y| true|
|  0.5628467852810314|    y| true|
|  -1.261605945319069|    y|false|
|  -0.712390662050588|    z|false|
|  1.3501878997225267|    z|false|
|  1.3451017084510097|    z|false|
|-0.04450307833805...|    z|false|
|  1.4786857374358966|    z| true|
| 0.12730328020698067|    z|false|
+--------------------+-----+-----+

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  1.3501878997225267|    z|false|
|  1.47868573743589

(None, None)

    4. Sort by the group value first, then, within each group, sort by `n`
       value.

In [123]:
df.sort(df.group, df.n).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
| -0.7889890249515489|    x|false|
|-0.02677164998644...|    x| true|
| 0.31735092273633597|    x|false|
|  0.6062886568962988|    x|false|
|   0.753766378659703|    x|false|
|  0.8612113741693206|    x|false|
|  -1.261605945319069|    y|false|
| -1.0453771305385342|    y| true|
|-0.24332625188556253|    y| true|
| 0.45181233874578974|    y|false|
|  0.5323378882945463|    y|false|
|  0.5628467852810314|    y| true|
|  0.9137407048596775|    y|false|
|  2.1503829673811126|    y| true|
|  -0.712390662050588|    z|false|
|-0.04450307833805...|    z|false|
| 0.12730328020698067|    z|false|
|  1.3451017084510097|    z|false|
|  1.3501878997225267|    z|false|
|  1.4786857374358966|    z| true|
+--------------------+-----+-----+



    5. Sort by `abool`, `group`, and `n`. Does it matter in what order you
       specify the columns when sorting?

In [126]:
df.sort(df.abool, df.group, df.n).show(), df.sort(df.group, df.n, df.abool).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
| -0.7889890249515489|    x|false|
| 0.31735092273633597|    x|false|
|  0.6062886568962988|    x|false|
|   0.753766378659703|    x|false|
|  0.8612113741693206|    x|false|
|  -1.261605945319069|    y|false|
| 0.45181233874578974|    y|false|
|  0.5323378882945463|    y|false|
|  0.9137407048596775|    y|false|
|  -0.712390662050588|    z|false|
|-0.04450307833805...|    z|false|
| 0.12730328020698067|    z|false|
|  1.3451017084510097|    z|false|
|  1.3501878997225267|    z|false|
|-0.02677164998644...|    x| true|
| -1.0453771305385342|    y| true|
|-0.24332625188556253|    y| true|
|  0.5628467852810314|    y| true|
|  2.1503829673811126|    y| true|
|  1.4786857374358966|    z| true|
+--------------------+-----+-----+

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
| -0.7889890249515489|    x|false|
|-0.02677164998644.

(None, None)

- Yes it sorts them in the order they are presented

## 8. Aggregating

    1. What is the average `n` value for each group in the `group` column?

In [128]:
df.groupBy(df.group).agg(avg(df.n)).show()

[Stage 150:>                                                        (0 + 8) / 8]

+-----+------------------+
|group|            avg(n)|
+-----+------------------+
|    x|0.2871427762539448|
|    z| 0.590730814237962|
|    y| 0.257601419602374|
+-----+------------------+



                                                                                

    2. What is the maximum `n` value for each group in the `group` column?

In [129]:
df.groupBy(df.group).agg(max(df.n)).show()

[Stage 153:>                                                        (0 + 8) / 8]

+-----+------------------+
|group|            max(n)|
+-----+------------------+
|    x|0.8612113741693206|
|    z|1.4786857374358966|
|    y|2.1503829673811126|
+-----+------------------+



                                                                                

    3. What is the minimum `n` value by `abool`?

In [130]:
df.groupBy(df.abool).agg(min(df.n)).show()

+-----+-------------------+
|abool|             min(n)|
+-----+-------------------+
|false| -1.261605945319069|
| true|-1.0453771305385342|
+-----+-------------------+



    4. What is the average `n` value for each unique combination of the `group`
       and `abool` column?

In [131]:
df.groupBy(df.group, df.abool).agg(avg(df.n)).show()

[Stage 159:>                                                        (0 + 8) / 8]

+-----+-----+--------------------+
|group|abool|              avg(n)|
+-----+-----+--------------------+
|    z|false| 0.41313982959837514|
|    x|false|  0.3499256615020219|
|    y|false| 0.15907124664523611|
|    y| true| 0.35613159255951177|
|    z| true|  1.4786857374358966|
|    x| true|-0.02677164998644...|
+-----+-----+--------------------+



                                                                                

## 9. Spark SQL

    1. Use the starter code above to re-create a spark dataframe.

In [132]:
df = spark.createDataFrame(pandas_dataframe)

    2. Turn your dataframe into a table that can be queried with spark SQL. Name
       the table `my_df`. Answer the rest of the questions in this section with
       a spark sql query (`spark.sql`) against `my_df`. After each step, view
       the first 7 records from the dataframe.

In [133]:
df.createOrReplaceTempView('my_df')

    3. What happens if you make a SQL syntax error in your query?

```
spark.sql(
'''
SELECT #
FROM my_df
'''
)
```

- Returns an error

    4. Write a query that shows all of the columns from your dataframe.

In [137]:
spark.sql(
'''
SELECT *
FROM my_df
'''
).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
|  0.5323378882945463|    y|false|
|  1.3501878997225267|    z|false|
|  0.8612113741693206|    x|false|
|  1.4786857374358966|    z| true|
| -1.0453771305385342|    y| true|
| -0.7889890249515489|    x|false|
|  -1.261605945319069|    y|false|
|  0.5628467852810314|    y| true|
|-0.24332625188556253|    y| true|
|  0.9137407048596775|    y|false|
| 0.31735092273633597|    x|false|
| 0.12730328020698067|    z|false|
|  2.1503829673811126|    y| true|
|  0.6062886568962988|    x|false|
|-0.02677164998644...|    x| true|
+--------------------+-----+-----+



    5. Write a query that shows just the `n` and `abool` columns from the
       dataframe.

In [141]:
spark.sql(
'''
SELECT n
FROM my_df
'''
).show()

+--------------------+
|                   n|
+--------------------+
|  -0.712390662050588|
|   0.753766378659703|
|-0.04450307833805...|
| 0.45181233874578974|
|  1.3451017084510097|
|  0.5323378882945463|
|  1.3501878997225267|
|  0.8612113741693206|
|  1.4786857374358966|
| -1.0453771305385342|
| -0.7889890249515489|
|  -1.261605945319069|
|  0.5628467852810314|
|-0.24332625188556253|
|  0.9137407048596775|
| 0.31735092273633597|
| 0.12730328020698067|
|  2.1503829673811126|
|  0.6062886568962988|
|-0.02677164998644...|
+--------------------+



    6. Write a query that shows just the `n` and `group` columns. Rename the
       `group` column to `g`.

In [142]:
spark.sql(
'''
SELECT n, group as g
FROM my_df
'''
).show()

+--------------------+---+
|                   n|  g|
+--------------------+---+
|  -0.712390662050588|  z|
|   0.753766378659703|  x|
|-0.04450307833805...|  z|
| 0.45181233874578974|  y|
|  1.3451017084510097|  z|
|  0.5323378882945463|  y|
|  1.3501878997225267|  z|
|  0.8612113741693206|  x|
|  1.4786857374358966|  z|
| -1.0453771305385342|  y|
| -0.7889890249515489|  x|
|  -1.261605945319069|  y|
|  0.5628467852810314|  y|
|-0.24332625188556253|  y|
|  0.9137407048596775|  y|
| 0.31735092273633597|  x|
| 0.12730328020698067|  z|
|  2.1503829673811126|  y|
|  0.6062886568962988|  x|
|-0.02677164998644...|  x|
+--------------------+---+



    7. Write a query that selects `n`, and creates two new columns: `n2`, the
       original `n` values halved, and `n3`: the original n values minus 1.

In [144]:
spark.sql(
'''
SELECT n, n/2 AS n2, n-1 AS n3
FROM my_df
'''
).show()

+--------------------+--------------------+--------------------+
|                   n|                  n2|                  n3|
+--------------------+--------------------+--------------------+
|  -0.712390662050588|  -0.356195331025294|  -1.712390662050588|
|   0.753766378659703|  0.3768831893298515|-0.24623362134029703|
|-0.04450307833805...|-0.02225153916902...| -1.0445030783380536|
| 0.45181233874578974| 0.22590616937289487| -0.5481876612542103|
|  1.3451017084510097|  0.6725508542255049| 0.34510170845100974|
|  0.5323378882945463| 0.26616894414727316| -0.4676621117054537|
|  1.3501878997225267|  0.6750939498612634| 0.35018789972252673|
|  0.8612113741693206|  0.4306056870846603| -0.1387886258306794|
|  1.4786857374358966|  0.7393428687179483|  0.4786857374358966|
| -1.0453771305385342| -0.5226885652692671|  -2.045377130538534|
| -0.7889890249515489|-0.39449451247577444| -1.7889890249515488|
|  -1.261605945319069| -0.6308029726595346|  -2.261605945319069|
|  0.5628467852810314|  0