# Spark API Mini Exercises

In [1]:
import pandas as pd
import numpy as np

## 1. Spark Dataframe Basics

### Use the starter code above to create a pandas dataframe.

In [2]:
np.random.seed(13)

pandas_dataframe = pd.DataFrame({
    "n": np.random.randn(20),
    "group": np.random.choice(list("xyz"), 20),
    "abool": np.random.choice([True, False], 20),
})

In [3]:
pandas_dataframe.head()

Unnamed: 0,n,group,abool
0,-0.712391,z,False
1,0.753766,x,False
2,-0.044503,z,False
3,0.451812,y,False
4,1.345102,z,False


### Convert the pandas dataframe to a spark dataframe. From this point forward, do all of your work with the spark dataframe, not the pandas dataframe.

In [4]:
import pyspark #Import Spark
spark = pyspark.sql.SparkSession.builder.getOrCreate()

In [5]:
df = spark.createDataFrame(pandas_dataframe)

In [6]:
df.show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
|  0.5323378882945463|    y|false|
|  1.3501878997225267|    z|false|
|  0.8612113741693206|    x|false|
|  1.4786857374358966|    z| true|
| -1.0453771305385342|    y| true|
| -0.7889890249515489|    x|false|
|  -1.261605945319069|    y|false|
|  0.5628467852810314|    y| true|
|-0.24332625188556253|    y| true|
|  0.9137407048596775|    y|false|
| 0.31735092273633597|    x|false|
| 0.12730328020698067|    z|false|
|  2.1503829673811126|    y| true|
|  0.6062886568962988|    x|false|
|-0.02677164998644...|    x| true|
+--------------------+-----+-----+



### Show the first 3 rows of the dataframe

In [7]:
df.show(3)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
+--------------------+-----+-----+
only showing top 3 rows



### Show the first 7 rows of the dataframe.

In [8]:
df.show(7)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
|  0.5323378882945463|    y|false|
|  1.3501878997225267|    z|false|
+--------------------+-----+-----+
only showing top 7 rows



### View a summary of the data using .describe.

In [9]:
df.describe

<bound method DataFrame.describe of DataFrame[n: double, group: string, abool: boolean]>

### Use .select to create a new dataframe with just the n and abool columns. View the first 5 rows of this dataframe.

In [10]:
new_df = df.select('n', 'abool')

In [11]:
new_df.show(5)

+--------------------+-----+
|                   n|abool|
+--------------------+-----+
|  -0.712390662050588|false|
|   0.753766378659703|false|
|-0.04450307833805...|false|
| 0.45181233874578974|false|
|  1.3451017084510097|false|
+--------------------+-----+
only showing top 5 rows



### Use .select to create a new dataframe with just the group and abool columns. View the first 5 rows of this dataframe.

In [12]:
new_df_2 = df.select('group', 'abool')

In [13]:
new_df_2.show(5)

+-----+-----+
|group|abool|
+-----+-----+
|    z|false|
|    x|false|
|    z|false|
|    y|false|
|    z|false|
+-----+-----+
only showing top 5 rows



### Use .select to create a new dataframe with the group column and the abool column renamed to a_boolean_value. Show the first 3 rows of this dataframe.

In [14]:
df.select('group', 'abool').withColumnRenamed('abool', 'a_boolean_value').show(3)

+-----+---------------+
|group|a_boolean_value|
+-----+---------------+
|    z|          false|
|    x|          false|
|    z|          false|
+-----+---------------+
only showing top 3 rows



### Use .select to create a new dataframe with the group column and the n column renamed to a_numeric_value. Show the first 6 rows of this dataframe.

In [15]:
df.select('group', 'n').withColumnRenamed('n', 'a_numeric_value').show(6)

+-----+--------------------+
|group|     a_numeric_value|
+-----+--------------------+
|    z|  -0.712390662050588|
|    x|   0.753766378659703|
|    z|-0.04450307833805...|
|    y| 0.45181233874578974|
|    z|  1.3451017084510097|
|    y|  0.5323378882945463|
+-----+--------------------+
only showing top 6 rows



## 2. Column Manipulation

### Use the starter code above to re-create a spark dataframe. Store the spark dataframe in a varaible named df.

In [16]:
df = spark.createDataFrame(pd.DataFrame({
    "n": np.random.randn(20),
    "group": np.random.choice(list("xyz"), 20),
    "abool": np.random.choice([True, False], 20),
}))

In [17]:
df.show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
| -0.8850620992868307|    x|false|
| 0.07272674611277782|    x| true|
|   -0.82751910119974|    x|false|
|  -0.591550921883219|    y|false|
|  -2.186215625579764|    y| true|
| -1.4304503608169532|    y| true|
|0.001182616633274...|    y| true|
|-0.15241870550020373|    z| true|
|  1.4418010391482912|    z|false|
|-0.18492174250837645|    x| true|
| -0.2298977614525365|    x|false|
| -0.2633773007599446|    y| true|
|-0.08286224131163808|    x| true|
|-0.38505688002448574|    x|false|
|-0.10105117176246801|    z| true|
| 0.33115422310589987|    y|false|
| 0.04329942581647413|    z|false|
|-0.44997653600849435|    z| true|
|  0.3471981374467132|    y|false|
|  1.4591000886192282|    z| true|
+--------------------+-----+-----+



### Use .select to add 4 to the n column. Show the results.

In [32]:
df.select(df.n + 4).show()

+------------------+
|           (n + 4)|
+------------------+
|3.1149379007131692|
| 4.072726746112778|
|  3.17248089880026|
| 3.408449078116781|
|1.8137843744202362|
| 2.569549639183047|
| 4.001182616633274|
|3.8475812944997965|
| 5.441801039148292|
|3.8150782574916233|
|3.7701022385474636|
|3.7366226992400553|
| 3.917137758688362|
| 3.614943119975514|
| 3.898948828237532|
|   4.3311542231059|
| 4.043299425816474|
|3.5500234639915056|
| 4.347198137446713|
| 5.459100088619229|
+------------------+



### Subtract 5 from the n column and view the results.

In [30]:
df.select(df.n - 5).show()

+-------------------+
|            (n - 5)|
+-------------------+
| -5.885062099286831|
| -4.927273253887222|
|  -5.82751910119974|
| -5.591550921883219|
| -7.186215625579764|
| -6.430450360816954|
| -4.998817383366726|
| -5.152418705500204|
| -3.558198960851709|
| -5.184921742508377|
| -5.229897761452537|
| -5.263377300759944|
| -5.082862241311638|
| -5.385056880024486|
| -5.101051171762468|
|   -4.6688457768941|
| -4.956700574183526|
| -5.449976536008494|
| -4.652801862553287|
|-3.5408999113807718|
+-------------------+



### Multiply the n column by 2. View the results along with the original numbers.

In [29]:
df.select('n', df.n*2).show()

+--------------------+--------------------+
|                   n|             (n * 2)|
+--------------------+--------------------+
| -0.8850620992868307| -1.7701241985736613|
| 0.07272674611277782| 0.14545349222555565|
|   -0.82751910119974|   -1.65503820239948|
|  -0.591550921883219|  -1.183101843766438|
|  -2.186215625579764|  -4.372431251159528|
| -1.4304503608169532| -2.8609007216339064|
|0.001182616633274...|0.002365233266549...|
|-0.15241870550020373|-0.30483741100040745|
|  1.4418010391482912|  2.8836020782965823|
|-0.18492174250837645| -0.3698434850167529|
| -0.2298977614525365|  -0.459795522905073|
| -0.2633773007599446| -0.5267546015198892|
|-0.08286224131163808|-0.16572448262327616|
|-0.38505688002448574| -0.7701137600489715|
|-0.10105117176246801|-0.20210234352493603|
| 0.33115422310589987|  0.6623084462117997|
| 0.04329942581647413| 0.08659885163294825|
|-0.44997653600849435| -0.8999530720169887|
|  0.3471981374467132|  0.6943962748934264|
|  1.4591000886192282|  2.918200

### Add a new column named n2 that is the n value multiplied by -1. Show the first 4 rows of your dataframe. You should see the original n value as well as n2.

In [38]:
df_new = df.select('n', (df.n*-1).alias('n2'))

In [39]:
df_new.show(4)

+-------------------+--------------------+
|                  n|                  n2|
+-------------------+--------------------+
|-0.8850620992868307|  0.8850620992868307|
|0.07272674611277782|-0.07272674611277782|
|  -0.82751910119974|    0.82751910119974|
| -0.591550921883219|   0.591550921883219|
+-------------------+--------------------+
only showing top 4 rows



### Add a new column named n3 that is the n value squared. Show the first 5 rows of your dataframe. You should see both n, n2, and n3.

In [41]:
df_new.select('*', ((df.n)**2).alias('n3')).show(5)

+-------------------+--------------------+--------------------+
|                  n|                  n2|                  n3|
+-------------------+--------------------+--------------------+
|-0.8850620992868307|  0.8850620992868307|  0.7833349195940117|
|0.07272674611277782|-0.07272674611277782|0.005289179600152444|
|  -0.82751910119974|    0.82751910119974|  0.6847878628504256|
| -0.591550921883219|   0.591550921883219| 0.34993249318088626|
| -2.186215625579764|   2.186215625579764|   4.779538761529118|
+-------------------+--------------------+--------------------+
only showing top 5 rows



### What happens when you run the code below?

```df.group + df.abool```

In [42]:
df.group + df.abool

Column<'(group + abool)'>

### What happens when you run the code below? What is the difference between this and the previous code sample?

```df.select(df.group + df.abool)```

In [45]:
df.select(df.group + df.abool)

AnalysisException: cannot resolve '(CAST(group AS DOUBLE) + abool)' due to data type mismatch: differing types in '(CAST(group AS DOUBLE) + abool)' (double and boolean).;
'Project [unresolvedalias((cast(group#96 as double) + abool#97), Some(org.apache.spark.sql.Column$$Lambda$2529/0x000000080115c040@560ace4d))]
+- LogicalRDD [n#95, group#96, abool#97], false


We get an error when we attempt to combine `group` and `abool` because these columns are different data types. We previously did not get an error because, since Spark is lazy, no calculation was made when we initially added `group` and `abool` together. Spark did not make a calculation until we selected this sum. 

### Try adding various other columns together. What are the results of combining the different data types?

In [46]:
df.select(df.n + df.abool)

AnalysisException: cannot resolve '(n + abool)' due to data type mismatch: differing types in '(n + abool)' (double and boolean).;
'Project [unresolvedalias((n#95 + abool#97), Some(org.apache.spark.sql.Column$$Lambda$2529/0x000000080115c040@560ace4d))]
+- LogicalRDD [n#95, group#96, abool#97], false


In [48]:
df.select(df.n + df.group).show()

+-----------+
|(n + group)|
+-----------+
|       null|
|       null|
|       null|
|       null|
|       null|
|       null|
|       null|
|       null|
|       null|
|       null|
|       null|
|       null|
|       null|
|       null|
|       null|
|       null|
|       null|
|       null|
|       null|
|       null|
+-----------+



In [54]:
df.describe

<bound method DataFrame.describe of DataFrame[n: double, group: string, abool: boolean]>

We cannot add a string and a boolean or a string and a double, but we can add a double and a string. 

## 3. Type casting

### Use the starter code above to re-create a spark dataframe.

In [56]:
df = spark.createDataFrame(pd.DataFrame({
    "n": np.random.randn(20),
    "group": np.random.choice(list("xyz"), 20),
    "abool": np.random.choice([True, False], 20),
}))

### Use .printSchema to view the datatypes in your dataframe.

In [60]:
df.printSchema()

root
 |-- n: double (nullable = true)
 |-- group: string (nullable = true)
 |-- abool: boolean (nullable = true)



### Use .dtypes to view the datatypes in your dataframe.

In [62]:
df.dtypes

[('n', 'double'), ('group', 'string'), ('abool', 'boolean')]

### What is the difference between the two code samples below?

```df.abool.cast('int')```

```df.select(df.abool.cast('int')).show()```


In [72]:
df.abool.cast('int')

Column<'CAST(abool AS INT)'>

In [66]:
df.select(df.abool.cast('int')).show()

+-----+
|abool|
+-----+
|    0|
|    1|
|    0|
|    0|
|    1|
|    1|
|    1|
|    1|
|    0|
|    1|
|    0|
|    1|
|    1|
|    0|
|    1|
|    0|
|    0|
|    1|
|    0|
|    1|
+-----+



The first code represents a substitution `True` and `False` for 1 and 0, but without actually making any calculation, since Spark is lazy.

The second code select the `abool` column, converts the `True` and `False` to 1 and 0, then prints the first 20 rows of the DataFrame.

### Use .select and .cast to convert the abool column to an integer type. View the results

In [73]:
df.select(df.abool.cast('int')).show()

+-----+
|abool|
+-----+
|    0|
|    1|
|    0|
|    0|
|    1|
|    1|
|    1|
|    1|
|    0|
|    1|
|    0|
|    1|
|    1|
|    0|
|    1|
|    0|
|    0|
|    1|
|    0|
|    1|
+-----+



### Convert the group column to a integer data type and view the results. What happens?

In [74]:
df.select(df.group.cast('int')).show()

+-----+
|group|
+-----+
| null|
| null|
| null|
| null|
| null|
| null|
| null|
| null|
| null|
| null|
| null|
| null|
| null|
| null|
| null|
| null|
| null|
| null|
| null|
| null|
+-----+



The DataFrame returns all null values.

### Convert the n column to a integer data type and view the results. What happens?

In [80]:
df.select(df.n).show()

+--------------------+
|                   n|
+--------------------+
| -0.8850620992868307|
| 0.07272674611277782|
|   -0.82751910119974|
|  -0.591550921883219|
|  -2.186215625579764|
| -1.4304503608169532|
|0.001182616633274...|
|-0.15241870550020373|
|  1.4418010391482912|
|-0.18492174250837645|
| -0.2298977614525365|
| -0.2633773007599446|
|-0.08286224131163808|
|-0.38505688002448574|
|-0.10105117176246801|
| 0.33115422310589987|
| 0.04329942581647413|
|-0.44997653600849435|
|  0.3471981374467132|
|  1.4591000886192282|
+--------------------+



In [78]:
df.select(df.n.cast('int')).show()

+---+
|  n|
+---+
|  0|
|  0|
|  0|
|  0|
| -2|
| -1|
|  0|
|  0|
|  1|
|  0|
|  0|
|  0|
|  0|
|  0|
|  0|
|  0|
|  0|
|  0|
|  0|
|  1|
+---+



The code rounds each `n` value to the nearest integer.

### Convert the abool column to a string data type and view the results. What happens?

In [81]:
df.select(df.abool).show()

+-----+
|abool|
+-----+
|false|
| true|
|false|
|false|
| true|
| true|
| true|
| true|
|false|
| true|
|false|
| true|
| true|
|false|
| true|
|false|
|false|
| true|
|false|
| true|
+-----+



In [83]:
df.select(df.abool.cast('str')).show()

ParseException: 
DataType str is not supported.(line 1, pos 0)

== SQL ==
str
^^^


We receive an error because we cannot convert a Boolean value to a string.

## 4. Built-in Functions

### Use the starter code above to re-create a spark dataframe.

In [96]:
df = spark.createDataFrame(pd.DataFrame({
    "n": np.random.randn(20),
    "group": np.random.choice(list("xyz"), 20),
    "abool": np.random.choice([True, False], 20),
}))

### Import the necessary functions from ```pyspark.sql.functions```


In [131]:
from  pyspark.sql.functions import  asc, desc, mean, min, max, concat, lit, round

### Find the highest n value.

In [111]:
df.orderBy(desc('n')).show(1)

+-----------------+-----+-----+
|                n|group|abool|
+-----------------+-----+-----+
|1.574465016668651|    x| true|
+-----------------+-----+-----+
only showing top 1 row



In [121]:
df.select(max(df.n)).show()

+-----------------+
|           max(n)|
+-----------------+
|1.574465016668651|
+-----------------+



### Find the lowest n value.

In [112]:
df.orderBy(asc('n')).show(1)

+-------------------+-----+-----+
|                  n|group|abool|
+-------------------+-----+-----+
|-1.7570352170643477|    z|false|
+-------------------+-----+-----+
only showing top 1 row



In [122]:
df.select(min(df.n)).show()

+-------------------+
|             min(n)|
+-------------------+
|-1.7570352170643477|
+-------------------+



### Find the average n value.

In [119]:
df.select(mean(df.n)).show()

+------------------+
|            avg(n)|
+------------------+
|0.0281297980344428|
+------------------+



### Use concat to change the group column to say, e.g. "Group: x" or "Group: y"

In [128]:
df.select('n', concat(lit('Group '), 'group'), 'abool').show()

+--------------------+---------------------+-----+
|                   n|concat(Group , group)|abool|
+--------------------+---------------------+-----+
|  0.5345941466677115|              Group x| true|
|  0.8231227651701564|              Group x|false|
|  0.8862891040333676|              Group x|false|
| 0.17266171218872037|              Group z| true|
|-0.21312889268996238|              Group y|false|
|0.027641567510190198|              Group z| true|
|  -1.073052771312975|              Group z|false|
|  1.2474002187187725|              Group y| true|
|-0.29344146465064336|              Group x|false|
| -0.2611042518734936|              Group x|false|
| -1.2278338632475168|              Group y| true|
|  1.0594556661999763|              Group y|false|
|  0.6269471789396875|              Group x|false|
| 0.16358009631961862|              Group y|false|
| -0.2961148127835557|              Group y|false|
|-0.23127394469706597|              Group y| true|
|   1.574465016668651|         

### Use concat to combine the n and group columns to produce results that look like this: "x: -1.432" or "z: 2.352"

In [132]:
df.select(concat('group', lit(": "), round('n', 3)), 'abool').show()

+------------------------------+-----+
|concat(group, : , round(n, 3))|abool|
+------------------------------+-----+
|                      x: 0.535| true|
|                      x: 0.823|false|
|                      x: 0.886|false|
|                      z: 0.173| true|
|                     y: -0.213|false|
|                      z: 0.028| true|
|                     z: -1.073|false|
|                      y: 1.247| true|
|                     x: -0.293|false|
|                     x: -0.261|false|
|                     y: -1.228| true|
|                      y: 1.059|false|
|                      x: 0.627|false|
|                      y: 0.164|false|
|                     y: -0.296|false|
|                     y: -0.231| true|
|                      x: 1.574| true|
|                     x: -0.873| true|
|                     z: -1.757|false|
|                     z: -0.328|false|
+------------------------------+-----+



## 5. When / Otherwise

In [145]:
from pyspark.sql.functions import when

### Use the starter code above to re-create a spark dataframe.

In [133]:
df = spark.createDataFrame(pd.DataFrame({
    "n": np.random.randn(20),
    "group": np.random.choice(list("xyz"), 20),
    "abool": np.random.choice([True, False], 20),
}))

### Use when and .otherwise to create a column that contains the text "It is true" when abool is true and "It is false"" when abool is false.

In [139]:
df.select('*', when(df.abool == "true", 'It is true').otherwise('It is false').alias('True vs. False')).show()

+--------------------+-----+-----+--------------+
|                   n|group|abool|True vs. False|
+--------------------+-----+-----+--------------+
|  1.2636494824990419|    z| true|    It is true|
|  1.5708990002014271|    y|false|   It is false|
| -1.2615486912059428|    x|false|   It is false|
|-0.07508917570719996|    y| true|    It is true|
|  0.8501070283110412|    z|false|   It is false|
| -0.9150316963547941|    x| true|    It is true|
| -0.8673479540502249|    y| true|    It is true|
|  0.7731666960765334|    z|false|   It is false|
| -0.7749046025022026|    y|false|   It is false|
|  1.0112322458862237|    x|false|   It is false|
|  0.9416459862960309|    x|false|   It is false|
| -0.7699411266400571|    z|false|   It is false|
| -1.1303179237441572|    x|false|   It is false|
|  0.9254890406588132|    x|false|   It is false|
|  1.2732503628768688|    y| true|    It is true|
|-0.04059007575872...|    x| true|    It is true|
|  0.9105807193557125|    y| true|    It is true|


### Create a column that contains 0 if n is less than 0, otherwise, the original n value.

In [140]:
df.select('*', when(df.n > 0, df.n).otherwise(0).alias('No negatives')).show()

+--------------------+-----+-----+------------------+
|                   n|group|abool|      No negatives|
+--------------------+-----+-----+------------------+
|  1.2636494824990419|    z| true|1.2636494824990419|
|  1.5708990002014271|    y|false|1.5708990002014271|
| -1.2615486912059428|    x|false|               0.0|
|-0.07508917570719996|    y| true|               0.0|
|  0.8501070283110412|    z|false|0.8501070283110412|
| -0.9150316963547941|    x| true|               0.0|
| -0.8673479540502249|    y| true|               0.0|
|  0.7731666960765334|    z|false|0.7731666960765334|
| -0.7749046025022026|    y|false|               0.0|
|  1.0112322458862237|    x|false|1.0112322458862237|
|  0.9416459862960309|    x|false|0.9416459862960309|
| -0.7699411266400571|    z|false|               0.0|
| -1.1303179237441572|    x|false|               0.0|
|  0.9254890406588132|    x|false|0.9254890406588132|
|  1.2732503628768688|    y| true|1.2732503628768688|
|-0.04059007575872...|    x|

## 6. Filter / Where

In [147]:
from pyspark.sql.functions import filter

### Use the starter code above to re-create a spark dataframe.

In [141]:
df = spark.createDataFrame(pd.DataFrame({
    "n": np.random.randn(20),
    "group": np.random.choice(list("xyz"), 20),
    "abool": np.random.choice([True, False], 20),
}))

### Use .filter or .where to select just the rows where the group is y and view the results.

In [150]:
df.where(df.group == 'y').show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
| -1.6642564360602479|    y|false|
| -0.1217464334245328|    y|false|
| 0.01997092245414585|    y| true|
|0.041447402165011596|    y| true|
| -1.0669628403806435|    y|false|
| -0.3848490502028002|    y|false|
+--------------------+-----+-----+



In [151]:
df.filter(df.group == 'y').show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
| -1.6642564360602479|    y|false|
| -0.1217464334245328|    y|false|
| 0.01997092245414585|    y| true|
|0.041447402165011596|    y| true|
| -1.0669628403806435|    y|false|
| -0.3848490502028002|    y|false|
+--------------------+-----+-----+



### Select just the columns where the abool column is false and view the results.

In [153]:
df.where(df.abool == "false").show()

+-------------------+-----+-----+
|                  n|group|abool|
+-------------------+-----+-----+
| 0.2572490050399525|    z|false|
|-1.6642564360602479|    y|false|
|-0.1217464334245328|    y|false|
| 0.9552466924396553|    x|false|
|-1.0669628403806435|    y|false|
|-1.4141544662915098|    z|false|
|0.12673002941390707|    x|false|
|-0.3848490502028002|    y|false|
+-------------------+-----+-----+



### Find the columns where the group column is not y.

In [154]:
df.where(df.group != "y").show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  1.2690580976875576|    x| true|
|  0.2572490050399525|    z|false|
|-0.44151223119024857|    x| true|
|  0.9552466924396553|    x|false|
|  0.8628168935353443|    z| true|
|  0.7435452461317569|    z| true|
| -0.6625811650314397|    x| true|
| -0.3051810275934747|    z| true|
|  3.4011057189806557|    z| true|
|  0.5253295997343811|    z| true|
| -1.2768778466596493|    x| true|
| -1.4141544662915098|    z|false|
| 0.12673002941390707|    x|false|
| -1.5509569211976624|    x| true|
+--------------------+-----+-----+



### Find the columns where n is positive.

In [156]:
df.where(df.n > 0).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  1.2690580976875576|    x| true|
|  0.2572490050399525|    z|false|
|  0.9552466924396553|    x|false|
| 0.01997092245414585|    y| true|
|  0.8628168935353443|    z| true|
|  0.7435452461317569|    z| true|
|0.041447402165011596|    y| true|
|  3.4011057189806557|    z| true|
|  0.5253295997343811|    z| true|
| 0.12673002941390707|    x|false|
+--------------------+-----+-----+



### Find the columns where abool is true and the group column is z.

In [161]:
df.where((df.abool == "true") & (df.group == "z")).show()

+-------------------+-----+-----+
|                  n|group|abool|
+-------------------+-----+-----+
| 0.8628168935353443|    z| true|
| 0.7435452461317569|    z| true|
|-0.3051810275934747|    z| true|
| 3.4011057189806557|    z| true|
| 0.5253295997343811|    z| true|
+-------------------+-----+-----+



### Find the columns where abool is true or the group column is z.

In [162]:
df.where((df.abool == "true") | (df.group == "z")).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  1.2690580976875576|    x| true|
|  0.2572490050399525|    z|false|
|-0.44151223119024857|    x| true|
| 0.01997092245414585|    y| true|
|  0.8628168935353443|    z| true|
|  0.7435452461317569|    z| true|
| -0.6625811650314397|    x| true|
| -0.3051810275934747|    z| true|
|0.041447402165011596|    y| true|
|  3.4011057189806557|    z| true|
|  0.5253295997343811|    z| true|
| -1.2768778466596493|    x| true|
| -1.4141544662915098|    z|false|
| -1.5509569211976624|    x| true|
+--------------------+-----+-----+



### Find the columns where abool is false and n is less than 1

In [163]:
df.where((df.abool == "false") & (df.n < 1)).show()

+-------------------+-----+-----+
|                  n|group|abool|
+-------------------+-----+-----+
| 0.2572490050399525|    z|false|
|-1.6642564360602479|    y|false|
|-0.1217464334245328|    y|false|
| 0.9552466924396553|    x|false|
|-1.0669628403806435|    y|false|
|-1.4141544662915098|    z|false|
|0.12673002941390707|    x|false|
|-0.3848490502028002|    y|false|
+-------------------+-----+-----+



### Find the columns where abool is false or n is less than 1

In [164]:
df.where((df.abool == "false") | (df.n < 1)).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  0.2572490050399525|    z|false|
| -1.6642564360602479|    y|false|
| -0.1217464334245328|    y|false|
|-0.44151223119024857|    x| true|
|  0.9552466924396553|    x|false|
| 0.01997092245414585|    y| true|
|  0.8628168935353443|    z| true|
|  0.7435452461317569|    z| true|
| -0.6625811650314397|    x| true|
| -0.3051810275934747|    z| true|
|0.041447402165011596|    y| true|
| -1.0669628403806435|    y|false|
|  0.5253295997343811|    z| true|
| -1.2768778466596493|    x| true|
| -1.4141544662915098|    z|false|
| 0.12673002941390707|    x|false|
| -0.3848490502028002|    y|false|
| -1.5509569211976624|    x| true|
+--------------------+-----+-----+



## 7. Sorting

### Use the starter code above to re-create a spark dataframe.

In [165]:
df = spark.createDataFrame(pd.DataFrame({
    "n": np.random.randn(20),
    "group": np.random.choice(list("xyz"), 20),
    "abool": np.random.choice([True, False], 20),
}))

### Sort by the n value.

In [168]:
df.sort('n').show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
| -1.9323560015091858|    z| true|
| -1.5232135953524801|    z|false|
| -1.4634777566193453|    x|false|
|   -1.41124263188717|    z| true|
|  -1.257253734973249|    y| true|
| -1.0790126956031663|    z|false|
| -0.7320583853311774|    z| true|
| -0.5334945027167617|    x|false|
| -0.5136792046662925|    y|false|
|-0.22467908543381457|    z|false|
|-0.17830606215500272|    y|false|
|-0.15579970816999872|    y| true|
|-0.10321862725194576|    z| true|
|-0.01842180913757...|    y| true|
| 0.10767464455391365|    z| true|
|  0.5751509658063075|    x| true|
|  0.6558615323233493|    z| true|
|  0.9399792195744116|    x| true|
|  0.9506069961060607|    x|false|
|  1.2047437309657716|    z|false|
+--------------------+-----+-----+



### Sort by the group value, both ascending and descending.



In [170]:
df.orderBy(asc('group')).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  0.9506069961060607|    x|false|
|  0.5751509658063075|    x| true|
|  0.9399792195744116|    x| true|
| -1.4634777566193453|    x|false|
| -0.5334945027167617|    x|false|
|-0.17830606215500272|    y|false|
|  -1.257253734973249|    y| true|
|-0.15579970816999872|    y| true|
|-0.01842180913757...|    y| true|
| -0.5136792046662925|    y|false|
| -1.9323560015091858|    z| true|
|  1.2047437309657716|    z|false|
|   -1.41124263188717|    z| true|
| -1.5232135953524801|    z|false|
| -0.7320583853311774|    z| true|
|-0.10321862725194576|    z| true|
|  0.6558615323233493|    z| true|
| -1.0790126956031663|    z|false|
| 0.10767464455391365|    z| true|
|-0.22467908543381457|    z|false|
+--------------------+-----+-----+



In [171]:
df.orderBy(desc('group')).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
| -1.0790126956031663|    z|false|
|  1.2047437309657716|    z|false|
|-0.10321862725194576|    z| true|
| 0.10767464455391365|    z| true|
| -1.5232135953524801|    z|false|
| -0.7320583853311774|    z| true|
| -1.9323560015091858|    z| true|
|   -1.41124263188717|    z| true|
|  0.6558615323233493|    z| true|
|-0.22467908543381457|    z|false|
|-0.15579970816999872|    y| true|
| -0.5136792046662925|    y|false|
|-0.17830606215500272|    y|false|
|  -1.257253734973249|    y| true|
|-0.01842180913757...|    y| true|
|  0.9399792195744116|    x| true|
|  0.9506069961060607|    x|false|
| -1.4634777566193453|    x|false|
|  0.5751509658063075|    x| true|
| -0.5334945027167617|    x|false|
+--------------------+-----+-----+



### Sort by the group value first, then, within each group, sort by n value.

In [173]:
df.sort('group', 'n').show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
| -1.4634777566193453|    x|false|
| -0.5334945027167617|    x|false|
|  0.5751509658063075|    x| true|
|  0.9399792195744116|    x| true|
|  0.9506069961060607|    x|false|
|  -1.257253734973249|    y| true|
| -0.5136792046662925|    y|false|
|-0.17830606215500272|    y|false|
|-0.15579970816999872|    y| true|
|-0.01842180913757...|    y| true|
| -1.9323560015091858|    z| true|
| -1.5232135953524801|    z|false|
|   -1.41124263188717|    z| true|
| -1.0790126956031663|    z|false|
| -0.7320583853311774|    z| true|
|-0.22467908543381457|    z|false|
|-0.10321862725194576|    z| true|
| 0.10767464455391365|    z| true|
|  0.6558615323233493|    z| true|
|  1.2047437309657716|    z|false|
+--------------------+-----+-----+



### Sort by abool, group, and n. Does it matter in what order you specify the columns when sorting?

In [174]:
df.sort('abool', 'group', 'n').show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
| -1.4634777566193453|    x|false|
| -0.5334945027167617|    x|false|
|  0.9506069961060607|    x|false|
| -0.5136792046662925|    y|false|
|-0.17830606215500272|    y|false|
| -1.5232135953524801|    z|false|
| -1.0790126956031663|    z|false|
|-0.22467908543381457|    z|false|
|  1.2047437309657716|    z|false|
|  0.5751509658063075|    x| true|
|  0.9399792195744116|    x| true|
|  -1.257253734973249|    y| true|
|-0.15579970816999872|    y| true|
|-0.01842180913757...|    y| true|
| -1.9323560015091858|    z| true|
|   -1.41124263188717|    z| true|
| -0.7320583853311774|    z| true|
|-0.10321862725194576|    z| true|
| 0.10767464455391365|    z| true|
|  0.6558615323233493|    z| true|
+--------------------+-----+-----+



In [175]:
df.sort('group', 'n', 'abool').show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
| -1.4634777566193453|    x|false|
| -0.5334945027167617|    x|false|
|  0.5751509658063075|    x| true|
|  0.9399792195744116|    x| true|
|  0.9506069961060607|    x|false|
|  -1.257253734973249|    y| true|
| -0.5136792046662925|    y|false|
|-0.17830606215500272|    y|false|
|-0.15579970816999872|    y| true|
|-0.01842180913757...|    y| true|
| -1.9323560015091858|    z| true|
| -1.5232135953524801|    z|false|
|   -1.41124263188717|    z| true|
| -1.0790126956031663|    z|false|
| -0.7320583853311774|    z| true|
|-0.22467908543381457|    z|false|
|-0.10321862725194576|    z| true|
| 0.10767464455391365|    z| true|
|  0.6558615323233493|    z| true|
|  1.2047437309657716|    z|false|
+--------------------+-----+-----+



In [176]:
df.sort('group', 'abool', 'n').show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
| -1.4634777566193453|    x|false|
| -0.5334945027167617|    x|false|
|  0.9506069961060607|    x|false|
|  0.5751509658063075|    x| true|
|  0.9399792195744116|    x| true|
| -0.5136792046662925|    y|false|
|-0.17830606215500272|    y|false|
|  -1.257253734973249|    y| true|
|-0.15579970816999872|    y| true|
|-0.01842180913757...|    y| true|
| -1.5232135953524801|    z|false|
| -1.0790126956031663|    z|false|
|-0.22467908543381457|    z|false|
|  1.2047437309657716|    z|false|
| -1.9323560015091858|    z| true|
|   -1.41124263188717|    z| true|
| -0.7320583853311774|    z| true|
|-0.10321862725194576|    z| true|
| 0.10767464455391365|    z| true|
|  0.6558615323233493|    z| true|
+--------------------+-----+-----+



In [177]:
df.sort('abool', 'n', 'group').show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
| -1.5232135953524801|    z|false|
| -1.4634777566193453|    x|false|
| -1.0790126956031663|    z|false|
| -0.5334945027167617|    x|false|
| -0.5136792046662925|    y|false|
|-0.22467908543381457|    z|false|
|-0.17830606215500272|    y|false|
|  0.9506069961060607|    x|false|
|  1.2047437309657716|    z|false|
| -1.9323560015091858|    z| true|
|   -1.41124263188717|    z| true|
|  -1.257253734973249|    y| true|
| -0.7320583853311774|    z| true|
|-0.15579970816999872|    y| true|
|-0.10321862725194576|    z| true|
|-0.01842180913757...|    y| true|
| 0.10767464455391365|    z| true|
|  0.5751509658063075|    x| true|
|  0.6558615323233493|    z| true|
|  0.9399792195744116|    x| true|
+--------------------+-----+-----+



Yes the order we specify columns makes a difference in the sorting. 

## 8. Spark SQL

### Use the starter code above to re-create a spark dataframe.

In [191]:
df = spark.createDataFrame(pd.DataFrame({
    "n": np.random.randn(20),
    "group": np.random.choice(list("xyz"), 20),
    "abool": np.random.choice([True, False], 20),
}))

### Turn your dataframe into a table that can be queried with spark SQL. Name the table my_df. Answer the rest of the questions in this section with a spark sql query (spark.sql) against my_df. After each step, view the first 7 records from the dataframe.

In [196]:
df.createOrReplaceTempView("my_df")

### Write a query that shows all of the columns from your dataframe.

In [199]:
spark.sql(
    """
SELECT *
FROM my_df
"""
).show(7)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  1.4686310220144754|    y|false|
|  -1.046785188546139|    x| true|
|  0.9366074560857586|    x|false|
| -1.9945534953906834|    x|false|
|  -0.770096628647285|    x| true|
|-0.20608861024238959|    x|false|
| -0.4925907278741742|    x| true|
+--------------------+-----+-----+
only showing top 7 rows



### Write a query that shows just the n and abool columns from the dataframe.

In [200]:
spark.sql(
    """
SELECT n, abool
FROM my_df
"""
).show(7)

+--------------------+-----+
|                   n|abool|
+--------------------+-----+
|  1.4686310220144754|false|
|  -1.046785188546139| true|
|  0.9366074560857586|false|
| -1.9945534953906834|false|
|  -0.770096628647285| true|
|-0.20608861024238959|false|
| -0.4925907278741742| true|
+--------------------+-----+
only showing top 7 rows



### Write a query that shows just the n and group columns. Rename the group column to g.

In [201]:
spark.sql(
    """
SELECT n, group AS g
FROM my_df
"""
).show(7)

+--------------------+---+
|                   n|  g|
+--------------------+---+
|  1.4686310220144754|  y|
|  -1.046785188546139|  x|
|  0.9366074560857586|  x|
| -1.9945534953906834|  x|
|  -0.770096628647285|  x|
|-0.20608861024238959|  x|
| -0.4925907278741742|  x|
+--------------------+---+
only showing top 7 rows



### Write a query that selects n, and creates two new columns: n2, the original n values halved, and n3: the original n values minus 1.

In [202]:
spark.sql(
    """
SELECT n, (n/2) AS n2, (n - 1) AS n3
FROM my_df
"""
).show(7)

+--------------------+--------------------+--------------------+
|                   n|                  n2|                  n3|
+--------------------+--------------------+--------------------+
|  1.4686310220144754|  0.7343155110072377|  0.4686310220144754|
|  -1.046785188546139| -0.5233925942730695|  -2.046785188546139|
|  0.9366074560857586|  0.4683037280428793|-0.06339254391424143|
| -1.9945534953906834| -0.9972767476953417| -2.9945534953906834|
|  -0.770096628647285| -0.3850483143236425|  -1.770096628647285|
|-0.20608861024238959|-0.10304430512119479| -1.2060886102423896|
| -0.4925907278741742| -0.2462953639370871| -1.4925907278741741|
+--------------------+--------------------+--------------------+
only showing top 7 rows



### What happens if you make a SQL syntax error in your query?

In [203]:
spark.sql(
    """
SELECT n (n/2) AS n2, (n - 1) AS n3
FROM my_df
"""
).show(7)

AnalysisException: Undefined function: 'n'. This function is neither a registered temporary function nor a permanent function registered in the database 'default'.; line 2 pos 7

## 9. Aggregating

### What is the average n value for each group in the group column?

In [207]:
df.groupby('group').mean('n').show()

+-----+-------------------+
|group|             avg(n)|
+-----+-------------------+
|    y|0.42562884014430813|
|    x|-0.4649587098028935|
|    z|0.00444993239322974|
+-----+-------------------+



### What is the maximum n value for each group in the group column?

In [208]:
df.groupby('group').max('n').show()

+-----+------------------+
|group|            max(n)|
+-----+------------------+
|    y|1.4686310220144754|
|    x| 1.120789945716334|
|    z|0.9292707452628507|
+-----+------------------+



### What is the minimum n value by abool?

In [209]:
df.groupby('abool').min('n').show()

+-----+-------------------+
|abool|             min(n)|
+-----+-------------------+
|false|-1.9945534953906834|
| true|  -2.26038233288238|
+-----+-------------------+



### What is the average n value for each unique combination of the group and abool column?

In [210]:
df.groupby('group', 'abool').mean('n').show()

+-----+-----+--------------------+
|group|abool|              avg(n)|
+-----+-----+--------------------+
|    y|false| 0.46301542392624856|
|    x|false|-0.18389086399809915|
|    x| true| -0.6898129864467288|
|    z|false| 0.46535752682864473|
|    z| true| -0.6100935268539903|
|    y| true|  0.3134690887984868|
+-----+-----+--------------------+

