In [1]:
import pyspark

In [2]:
spark = pyspark.sql.SparkSession.builder.getOrCreate()

In [3]:
import pandas as pd
import numpy as np

In [4]:
np.random.seed(13)

# 1. Spark Dataframe Basics

A. Use the starter code above to create a pandas dataframe.

In [5]:
pandas_dataframe = pd.DataFrame({
    "n": np.random.randn(20),
    "group": np.random.choice(list("xyz"), 20),
    "abool": np.random.choice([True, False], 20),
})

In [6]:
pandas_dataframe

Unnamed: 0,n,group,abool
0,-0.712391,z,False
1,0.753766,x,False
2,-0.044503,z,False
3,0.451812,y,False
4,1.345102,z,False
5,0.532338,y,False
6,1.350188,z,False
7,0.861211,x,False
8,1.478686,z,True
9,-1.045377,y,True


### B. Convert the pandas dataframe to a spark dataframe. From this point forward, do all of your work with the spark dataframe, not the pandas dataframe.

In [7]:
df = spark.createDataFrame(pandas_dataframe)
df

DataFrame[n: double, group: string, abool: boolean]

### C. Show the first 3 rows of the dataframe.

In [8]:
df.show(3)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
+--------------------+-----+-----+
only showing top 3 rows



### D. Show the first 7 rows of the dataframe.

In [9]:
df.show(7)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
|  0.5323378882945463|    y|false|
|  1.3501878997225267|    z|false|
+--------------------+-----+-----+
only showing top 7 rows



### E. View a summary of the data using `.describe`.

In [10]:
df.describe().show()

+-------+------------------+-----+
|summary|                 n|group|
+-------+------------------+-----+
|  count|                20|   20|
|   mean|0.3664026449885217| null|
| stddev|0.8905322898155363| null|
|    min|-1.261605945319069|    x|
|    max|2.1503829673811126|    z|
+-------+------------------+-----+



### F. Use `.select` to create a new dataframe with just the `n` and `abool` columns. View the first 5 rows of this dataframe.

In [11]:
df.select('n', 'abool').show(5)

+--------------------+-----+
|                   n|abool|
+--------------------+-----+
|  -0.712390662050588|false|
|   0.753766378659703|false|
|-0.04450307833805...|false|
| 0.45181233874578974|false|
|  1.3451017084510097|false|
+--------------------+-----+
only showing top 5 rows



### G. Use `.select` to create a new dataframe with just the `group` and `abool` columns. View the first 5 rows of this dataframe.

In [12]:
df.select('group', 'abool').show(5)

+-----+-----+
|group|abool|
+-----+-----+
|    z|false|
|    x|false|
|    z|false|
|    y|false|
|    z|false|
+-----+-----+
only showing top 5 rows



### H. Use `.select` to create a new dataframe with the `group` column and the `abool` column renamed to `a_boolean_value`. Show the first 3 rows of this dataframe.

In [13]:
df.select('group', df.abool.alias('a_boolean_value')).show(3)

+-----+---------------+
|group|a_boolean_value|
+-----+---------------+
|    z|          false|
|    x|          false|
|    z|          false|
+-----+---------------+
only showing top 3 rows



### I. Use `.select` to create a new dataframe with the `group` column and the `n` column renamed to `a_numeric_value`. Show the first 6 rows of this dataframe.

In [15]:
df.select('group', df.n.alias('a_numeric_value')).show(6)

+-----+--------------------+
|group|     a_numeric_value|
+-----+--------------------+
|    z|  -0.712390662050588|
|    x|   0.753766378659703|
|    z|-0.04450307833805...|
|    y| 0.45181233874578974|
|    z|  1.3451017084510097|
|    y|  0.5323378882945463|
+-----+--------------------+
only showing top 6 rows



# 2. Column Manipulation

### A. Use the starter code above to re-create a spark dataframe. Store the spark dataframe in a variable named `df`

In [16]:
df.show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
|  0.5323378882945463|    y|false|
|  1.3501878997225267|    z|false|
|  0.8612113741693206|    x|false|
|  1.4786857374358966|    z| true|
| -1.0453771305385342|    y| true|
| -0.7889890249515489|    x|false|
|  -1.261605945319069|    y|false|
|  0.5628467852810314|    y| true|
|-0.24332625188556253|    y| true|
|  0.9137407048596775|    y|false|
| 0.31735092273633597|    x|false|
| 0.12730328020698067|    z|false|
|  2.1503829673811126|    y| true|
|  0.6062886568962988|    x|false|
|-0.02677164998644...|    x| true|
+--------------------+-----+-----+



### B. Use `.select` to add 4 to the `n` column. Show the results.

In [12]:
df.select(df.n + 4).show()

+------------------+
|           (n + 4)|
+------------------+
|3.2876093379494122|
| 4.753766378659703|
|3.9554969216619464|
|  4.45181233874579|
|5.3451017084510095|
| 4.532337888294546|
| 5.350187899722527|
|  4.86121137416932|
| 5.478685737435897|
| 2.954622869461466|
|3.2110109750484512|
| 2.738394054680931|
| 4.562846785281032|
|3.7566737481144377|
| 4.913740704859677|
| 4.317350922736336|
| 4.127303280206981|
| 6.150382967381113|
| 4.606288656896298|
|3.9732283500135592|
+------------------+



### C. Subtract 5 from the `n` column and view the results.

In [13]:
df.select(df.n - 5).show(5)

+-------------------+
|            (n - 5)|
+-------------------+
| -5.712390662050588|
| -4.246233621340297|
| -5.044503078338053|
|  -4.54818766125421|
|-3.6548982915489905|
+-------------------+
only showing top 5 rows



### D. Multiply the `n` column by 2. View the results along with the original numbers.

In [18]:
df.select('n', df.n * 2).show(5)

+--------------------+--------------------+
|                   n|             (n * 2)|
+--------------------+--------------------+
|  -0.712390662050588|  -1.424781324101176|
|   0.753766378659703|   1.507532757319406|
|-0.04450307833805...|-0.08900615667610691|
| 0.45181233874578974|  0.9036246774915795|
|  1.3451017084510097|  2.6902034169020195|
+--------------------+--------------------+
only showing top 5 rows



### E. Add a new column named `n2` that is the `n` value multiplied by -1. Show the first 4 rows of your dataframe. You should see the original `n` value as well as `n2`.

In [20]:
n2 = (df.n * -1).alias('n2')

df.select('n', n2).show(4)

+--------------------+--------------------+
|                   n|                  n2|
+--------------------+--------------------+
|  -0.712390662050588|   0.712390662050588|
|   0.753766378659703|  -0.753766378659703|
|-0.04450307833805...|0.044503078338053455|
| 0.45181233874578974|-0.45181233874578974|
+--------------------+--------------------+
only showing top 4 rows



In [21]:
n2 = (df.n * -1).alias('n2')

df = df.select('*', n2)

In [25]:
df.show(4)

+--------------------+-----+-----+--------------------+--------------------+
|                   n|group|abool|                  n2|                  n3|
+--------------------+-----+-----+--------------------+--------------------+
|  -0.712390662050588|    z|false|   0.712390662050588|   0.507500455376875|
|   0.753766378659703|    x|false|  -0.753766378659703|  0.5681637535977627|
|-0.04450307833805...|    z|false|0.044503078338053455|0.001980523981562...|
| 0.45181233874578974|    y|false|-0.45181233874578974| 0.20413438944294027|
+--------------------+-----+-----+--------------------+--------------------+
only showing top 4 rows



### F. Add a new column named `n3` that is the n value squared. Show the first 5 rows of your dataframe. You should see both `n`, `n2`, and `n3`.

In [23]:
n3 = (df.n ** 2).alias('n3')
df = df.select('*', n3)

In [24]:
df.show(5)

+--------------------+-----+-----+--------------------+--------------------+
|                   n|group|abool|                  n2|                  n3|
+--------------------+-----+-----+--------------------+--------------------+
|  -0.712390662050588|    z|false|   0.712390662050588|   0.507500455376875|
|   0.753766378659703|    x|false|  -0.753766378659703|  0.5681637535977627|
|-0.04450307833805...|    z|false|0.044503078338053455|0.001980523981562...|
| 0.45181233874578974|    y|false|-0.45181233874578974| 0.20413438944294027|
|  1.3451017084510097|    z|false| -1.3451017084510097|  1.8092986060778251|
+--------------------+-----+-----+--------------------+--------------------+
only showing top 5 rows



### G. What happens when you run the code below?

 `df.group + df.abool`

In [21]:
df.group + df.abool

Column<'(group + abool)'>

 
### H. What happens when you run the code below? What is the difference between this and the previous code sample?

 `df.select(df.group + df.abool)`

In [26]:
df.select(df.group + df.abool)

AnalysisException: cannot resolve '(CAST(`group` AS DOUBLE) + `abool`)' due to data type mismatch: differing types in '(CAST(`group` AS DOUBLE) + `abool`)' (double and boolean).;
'Project [(cast(group#1 as double) + abool#2) AS (group + abool)#329]
+- Project [n#0, group#1, abool#2, n2#259, POWER(n#0, cast(2 as double)) AS n3#281]
   +- Project [n#0, group#1, abool#2, (n#0 * cast(-1 as double)) AS n2#259]
      +- LogicalRDD [n#0, group#1, abool#2], false


 
### I. Try adding various other columns together. What are the results of combining the different data types?

# 3. Type casting

###    1. Use the starter code above to re-create a spark dataframe.

###    2. Use `.printSchema` to view the datatypes in your dataframe.

In [27]:
df.printSchema()

root
 |-- n: double (nullable = true)
 |-- group: string (nullable = true)
 |-- abool: boolean (nullable = true)
 |-- n2: double (nullable = true)
 |-- n3: double (nullable = true)



In [28]:
df.printSchema

<bound method DataFrame.printSchema of DataFrame[n: double, group: string, abool: boolean, n2: double, n3: double]>

###    3. Use `.dtypes` to view the datatypes in your dataframe.

In [29]:
df.dtypes

[('n', 'double'),
 ('group', 'string'),
 ('abool', 'boolean'),
 ('n2', 'double'),
 ('n3', 'double')]

###    4. What is the difference between the two code samples below?

        ```
        df.abool.cast('int')
        ```

        ```
        df.select(df.abool.cast('int')).show()
        ```

In [30]:
#action
df.abool.cast('int')

Column<'CAST(abool AS INT)'>

In [31]:
#shows the action applied
df.select(df.abool.cast('int')).show()

+-----+
|abool|
+-----+
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    1|
|    1|
|    0|
|    0|
|    1|
|    1|
|    0|
|    0|
|    0|
|    1|
|    0|
|    1|
+-----+



###    5. Use `.select` and `.cast` to convert the `abool` column to an integer type. View the results.
       

In [42]:
df.dtypes

[('n', 'double'), ('group', 'string'), ('abool', 'boolean')]

In [32]:
df.select('abool', df.abool.cast('int')).show()

+-----+-----+
|abool|abool|
+-----+-----+
|false|    0|
|false|    0|
|false|    0|
|false|    0|
|false|    0|
|false|    0|
|false|    0|
|false|    0|
| true|    1|
| true|    1|
|false|    0|
|false|    0|
| true|    1|
| true|    1|
|false|    0|
|false|    0|
|false|    0|
| true|    1|
|false|    0|
| true|    1|
+-----+-----+



###    6. Convert the `group` column to a integer data type and view the results.
       What happens?
       

In [33]:
df.select('group', df.group.cast('int')).show()

+-----+-----+
|group|group|
+-----+-----+
|    z| null|
|    x| null|
|    z| null|
|    y| null|
|    z| null|
|    y| null|
|    z| null|
|    x| null|
|    z| null|
|    y| null|
|    x| null|
|    y| null|
|    y| null|
|    y| null|
|    y| null|
|    x| null|
|    z| null|
|    y| null|
|    x| null|
|    x| null|
+-----+-----+



###    7. Convert the `n` column to a integer data type and view the results. What happens?
    

In [36]:
df.select('n', df.n.cast('int')).show()

+--------------------+---+
|                   n|  n|
+--------------------+---+
|  -0.712390662050588|  0|
|   0.753766378659703|  0|
|-0.04450307833805...|  0|
| 0.45181233874578974|  0|
|  1.3451017084510097|  1|
|  0.5323378882945463|  0|
|  1.3501878997225267|  1|
|  0.8612113741693206|  0|
|  1.4786857374358966|  1|
| -1.0453771305385342| -1|
| -0.7889890249515489|  0|
|  -1.261605945319069| -1|
|  0.5628467852810314|  0|
|-0.24332625188556253|  0|
|  0.9137407048596775|  0|
| 0.31735092273633597|  0|
| 0.12730328020698067|  0|
|  2.1503829673811126|  2|
|  0.6062886568962988|  0|
|-0.02677164998644...|  0|
+--------------------+---+



###    8. Convert the `abool` column to a string data type and view the results.
       What happens?

In [38]:
df.select('abool', df.abool.cast('string')).show()

+-----+-----+
|abool|abool|
+-----+-----+
|false|false|
|false|false|
|false|false|
|false|false|
|false|false|
|false|false|
|false|false|
|false|false|
| true| true|
| true| true|
|false|false|
|false|false|
| true| true|
| true| true|
|false|false|
|false|false|
|false|false|
| true| true|
|false|false|
| true| true|
+-----+-----+



In [40]:
(df.select('abool', df.abool.cast('string'))).dtypes

[('abool', 'boolean'), ('abool', 'string')]

# 4. Built-in Functions

###    1. Use the starter code above to re-create a spark dataframe.

In [42]:
np.random.seed(13)

pandas_dataframe = pd.DataFrame({
    "n": np.random.randn(20),
    "group": np.random.choice(list("xyz"), 20),
    "abool": np.random.choice([True, False], 20),
})

df = spark.createDataFrame(pandas_dataframe)

###    2. Import the necessary functions from `pyspark.sql.functions`

In [41]:
from pyspark.sql.functions import min, max, mean, concat, lit

###    3. Find the highest `n` value.

In [44]:
df.select(max('n')).show()

+------------------+
|            max(n)|
+------------------+
|2.1503829673811126|
+------------------+



###    4. Find the lowest `n` value.

In [45]:
df.select(min('n')).show()

+------------------+
|            min(n)|
+------------------+
|-1.261605945319069|
+------------------+



###    5. Find the average `n` value.

In [46]:
df.select(mean('n')).show()

+------------------+
|            avg(n)|
+------------------+
|0.3664026449885217|
+------------------+



In [47]:
df.select(
    min('n'),
    max('n'),
    mean('n')
).show()

+------------------+------------------+------------------+
|            min(n)|            max(n)|            avg(n)|
+------------------+------------------+------------------+
|-1.261605945319069|2.1503829673811126|0.3664026449885217|
+------------------+------------------+------------------+



###    6. Use `concat` to change the `group` column observations to say, e.g. "Group: x" or      "Group: y"

In [48]:
df.select('group', concat(lit('Group: '), 'group')).show()

+-----+----------------------+
|group|concat(Group: , group)|
+-----+----------------------+
|    z|              Group: z|
|    x|              Group: x|
|    z|              Group: z|
|    y|              Group: y|
|    z|              Group: z|
|    y|              Group: y|
|    z|              Group: z|
|    x|              Group: x|
|    z|              Group: z|
|    y|              Group: y|
|    x|              Group: x|
|    y|              Group: y|
|    y|              Group: y|
|    y|              Group: y|
|    y|              Group: y|
|    x|              Group: x|
|    z|              Group: z|
|    y|              Group: y|
|    x|              Group: x|
|    x|              Group: x|
+-----+----------------------+



###    7. Use `concat` to combine the `n` and `group` columns to produce results that look like this: "x: -1.432" or "z: 2.352"

In [51]:
df.select('group','n',concat('group', lit(': '), 'n').alias('merge')).show()

+-----+--------------------+--------------------+
|group|                   n|               merge|
+-----+--------------------+--------------------+
|    z|  -0.712390662050588|z: -0.71239066205...|
|    x|   0.753766378659703|x: 0.753766378659703|
|    z|-0.04450307833805...|z: -0.04450307833...|
|    y| 0.45181233874578974|y: 0.451812338745...|
|    z|  1.3451017084510097|z: 1.345101708451...|
|    y|  0.5323378882945463|y: 0.532337888294...|
|    z|  1.3501878997225267|z: 1.350187899722...|
|    x|  0.8612113741693206|x: 0.861211374169...|
|    z|  1.4786857374358966|z: 1.478685737435...|
|    y| -1.0453771305385342|y: -1.04537713053...|
|    x| -0.7889890249515489|x: -0.78898902495...|
|    y|  -1.261605945319069|y: -1.26160594531...|
|    y|  0.5628467852810314|y: 0.562846785281...|
|    y|-0.24332625188556253|y: -0.24332625188...|
|    y|  0.9137407048596775|y: 0.913740704859...|
|    x| 0.31735092273633597|x: 0.317350922736...|
|    z| 0.12730328020698067|z: 0.127303280206...|


# 5. When / Otherwise

###    1. Use the starter code above to re-create a spark dataframe.

###    2. Use `when` and `.otherwise` to create a column that contains the text "It is true" when `abool` is true and "It is false"" when `abool` is false.

In [52]:
from pyspark.sql.functions import when

In [53]:
df.select(
    'abool',
    when(df.abool, 'It is true').otherwise('It is false').alias('message')
).show()

+-----+-----------+
|abool|    message|
+-----+-----------+
|false|It is false|
|false|It is false|
|false|It is false|
|false|It is false|
|false|It is false|
|false|It is false|
|false|It is false|
|false|It is false|
| true| It is true|
| true| It is true|
|false|It is false|
|false|It is false|
| true| It is true|
| true| It is true|
|false|It is false|
|false|It is false|
|false|It is false|
| true| It is true|
|false|It is false|
| true| It is true|
+-----+-----------+



###    3. Create a column that contains 0 if n is less than 0, otherwise, the
       original n value.

In [55]:
df.select(
    'n',
    when(df.n < 0, 0).otherwise(df.n).alias('Natural Numbers')
).show()

+--------------------+-------------------+
|                   n|    Natural Numbers|
+--------------------+-------------------+
|  -0.712390662050588|                0.0|
|   0.753766378659703|  0.753766378659703|
|-0.04450307833805...|                0.0|
| 0.45181233874578974|0.45181233874578974|
|  1.3451017084510097| 1.3451017084510097|
|  0.5323378882945463| 0.5323378882945463|
|  1.3501878997225267| 1.3501878997225267|
|  0.8612113741693206| 0.8612113741693206|
|  1.4786857374358966| 1.4786857374358966|
| -1.0453771305385342|                0.0|
| -0.7889890249515489|                0.0|
|  -1.261605945319069|                0.0|
|  0.5628467852810314| 0.5628467852810314|
|-0.24332625188556253|                0.0|
|  0.9137407048596775| 0.9137407048596775|
| 0.31735092273633597|0.31735092273633597|
| 0.12730328020698067|0.12730328020698067|
|  2.1503829673811126| 2.1503829673811126|
|  0.6062886568962988| 0.6062886568962988|
|-0.02677164998644...|                0.0|
+----------

# 6. Filter / Where

###    1. Use the starter code above to re-create a spark dataframe.

###    2. Use `.filter` or `.where` to select just the rows where the group is `y` and view the results.

In [56]:
df.filter(df.group == 'y').show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
| 0.45181233874578974|    y|false|
|  0.5323378882945463|    y|false|
| -1.0453771305385342|    y| true|
|  -1.261605945319069|    y|false|
|  0.5628467852810314|    y| true|
|-0.24332625188556253|    y| true|
|  0.9137407048596775|    y|false|
|  2.1503829673811126|    y| true|
+--------------------+-----+-----+



In [57]:
df.where(df.group == 'y').show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
| 0.45181233874578974|    y|false|
|  0.5323378882945463|    y|false|
| -1.0453771305385342|    y| true|
|  -1.261605945319069|    y|false|
|  0.5628467852810314|    y| true|
|-0.24332625188556253|    y| true|
|  0.9137407048596775|    y|false|
|  2.1503829673811126|    y| true|
+--------------------+-----+-----+



###    3. Select just the columns where the `abool` column is false and view the results.

In [58]:
df.where(df.abool == 'false').show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
|  0.5323378882945463|    y|false|
|  1.3501878997225267|    z|false|
|  0.8612113741693206|    x|false|
| -0.7889890249515489|    x|false|
|  -1.261605945319069|    y|false|
|  0.9137407048596775|    y|false|
| 0.31735092273633597|    x|false|
| 0.12730328020698067|    z|false|
|  0.6062886568962988|    x|false|
+--------------------+-----+-----+



In [62]:
df.filter(~ df.abool).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
|  0.5323378882945463|    y|false|
|  1.3501878997225267|    z|false|
|  0.8612113741693206|    x|false|
| -0.7889890249515489|    x|false|
|  -1.261605945319069|    y|false|
|  0.9137407048596775|    y|false|
| 0.31735092273633597|    x|false|
| 0.12730328020698067|    z|false|
|  0.6062886568962988|    x|false|
+--------------------+-----+-----+



###    4. Find the columns where the `group` column is *not* `y`.

In [60]:
df.where(df.group != 'y').show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
|  1.3451017084510097|    z|false|
|  1.3501878997225267|    z|false|
|  0.8612113741693206|    x|false|
|  1.4786857374358966|    z| true|
| -0.7889890249515489|    x|false|
| 0.31735092273633597|    x|false|
| 0.12730328020698067|    z|false|
|  0.6062886568962988|    x|false|
|-0.02677164998644...|    x| true|
+--------------------+-----+-----+



###    5. Find the columns where `n` is positive.

In [61]:
df.where(df.n > 0).show()

+-------------------+-----+-----+
|                  n|group|abool|
+-------------------+-----+-----+
|  0.753766378659703|    x|false|
|0.45181233874578974|    y|false|
| 1.3451017084510097|    z|false|
| 0.5323378882945463|    y|false|
| 1.3501878997225267|    z|false|
| 0.8612113741693206|    x|false|
| 1.4786857374358966|    z| true|
| 0.5628467852810314|    y| true|
| 0.9137407048596775|    y|false|
|0.31735092273633597|    x|false|
|0.12730328020698067|    z|false|
| 2.1503829673811126|    y| true|
| 0.6062886568962988|    x|false|
+-------------------+-----+-----+



###    6. Find the columns where `abool` is true and the `group` column is `z`.

In [63]:
df.filter(df.abool).filter(df.group == 'z').show()

+------------------+-----+-----+
|                 n|group|abool|
+------------------+-----+-----+
|1.4786857374358966|    z| true|
+------------------+-----+-----+



In [64]:
df.where(df.abool).where(df.group == 'z').show()

+------------------+-----+-----+
|                 n|group|abool|
+------------------+-----+-----+
|1.4786857374358966|    z| true|
+------------------+-----+-----+



In [65]:
df.where(df.abool == 'true').where(df.group == 'z').show()

+------------------+-----+-----+
|                 n|group|abool|
+------------------+-----+-----+
|1.4786857374358966|    z| true|
+------------------+-----+-----+



###    7. Find the columns where `abool` is true or the `group` column is `z`.

In [66]:
df.where(df.abool | (df.group == 'z')).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|-0.04450307833805...|    z|false|
|  1.3451017084510097|    z|false|
|  1.3501878997225267|    z|false|
|  1.4786857374358966|    z| true|
| -1.0453771305385342|    y| true|
|  0.5628467852810314|    y| true|
|-0.24332625188556253|    y| true|
| 0.12730328020698067|    z|false|
|  2.1503829673811126|    y| true|
|-0.02677164998644...|    x| true|
+--------------------+-----+-----+



###    8. Find the columns where `abool` is false and `n` is less than 1

In [67]:
df.filter(~ df.abool).filter(df.n < 1).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  0.5323378882945463|    y|false|
|  0.8612113741693206|    x|false|
| -0.7889890249515489|    x|false|
|  -1.261605945319069|    y|false|
|  0.9137407048596775|    y|false|
| 0.31735092273633597|    x|false|
| 0.12730328020698067|    z|false|
|  0.6062886568962988|    x|false|
+--------------------+-----+-----+



###    9. Find the columns where `abool` is false or `n` is less than 1

In [68]:
df.filter(~ df.abool | (df.n < 1)).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
|  0.5323378882945463|    y|false|
|  1.3501878997225267|    z|false|
|  0.8612113741693206|    x|false|
| -1.0453771305385342|    y| true|
| -0.7889890249515489|    x|false|
|  -1.261605945319069|    y|false|
|  0.5628467852810314|    y| true|
|-0.24332625188556253|    y| true|
|  0.9137407048596775|    y|false|
| 0.31735092273633597|    x|false|
| 0.12730328020698067|    z|false|
|  0.6062886568962988|    x|false|
|-0.02677164998644...|    x| true|
+--------------------+-----+-----+



# 7. Sorting
###    1. Use the starter code above to re-create a spark dataframe.

###    2. Sort by the `n` value.

In [69]:
df.sort(df.n).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -1.261605945319069|    y|false|
| -1.0453771305385342|    y| true|
| -0.7889890249515489|    x|false|
|  -0.712390662050588|    z|false|
|-0.24332625188556253|    y| true|
|-0.04450307833805...|    z|false|
|-0.02677164998644...|    x| true|
| 0.12730328020698067|    z|false|
| 0.31735092273633597|    x|false|
| 0.45181233874578974|    y|false|
|  0.5323378882945463|    y|false|
|  0.5628467852810314|    y| true|
|  0.6062886568962988|    x|false|
|   0.753766378659703|    x|false|
|  0.8612113741693206|    x|false|
|  0.9137407048596775|    y|false|
|  1.3451017084510097|    z|false|
|  1.3501878997225267|    z|false|
|  1.4786857374358966|    z| true|
|  2.1503829673811126|    y| true|
+--------------------+-----+-----+



###    3. Sort by the `group` value, both ascending and descending.

In [70]:
# ascending
df.sort(df.group).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  0.6062886568962988|    x|false|
|-0.02677164998644...|    x| true|
|   0.753766378659703|    x|false|
| -0.7889890249515489|    x|false|
|  0.8612113741693206|    x|false|
| 0.31735092273633597|    x|false|
| 0.45181233874578974|    y|false|
|  0.9137407048596775|    y|false|
|  0.5628467852810314|    y| true|
|  2.1503829673811126|    y| true|
|  0.5323378882945463|    y|false|
|  -1.261605945319069|    y|false|
| -1.0453771305385342|    y| true|
|-0.24332625188556253|    y| true|
|-0.04450307833805...|    z|false|
|  -0.712390662050588|    z|false|
|  1.3451017084510097|    z|false|
| 0.12730328020698067|    z|false|
|  1.3501878997225267|    z|false|
|  1.4786857374358966|    z| true|
+--------------------+-----+-----+



In [71]:
#descending
df.sort(df.group.desc()).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|-0.04450307833805...|    z|false|
| 0.12730328020698067|    z|false|
|  -0.712390662050588|    z|false|
|  1.3451017084510097|    z|false|
|  1.3501878997225267|    z|false|
|  1.4786857374358966|    z| true|
|  2.1503829673811126|    y| true|
|  0.5628467852810314|    y| true|
| -1.0453771305385342|    y| true|
|  -1.261605945319069|    y|false|
| 0.45181233874578974|    y|false|
|  0.5323378882945463|    y|false|
|-0.24332625188556253|    y| true|
|  0.9137407048596775|    y|false|
|  0.6062886568962988|    x|false|
|-0.02677164998644...|    x| true|
|   0.753766378659703|    x|false|
| -0.7889890249515489|    x|false|
|  0.8612113741693206|    x|false|
| 0.31735092273633597|    x|false|
+--------------------+-----+-----+



###    4. Sort by the group value first, then, within each group, sort by `n` value.

In [72]:
df.sort(df.group, df.n).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
| -0.7889890249515489|    x|false|
|-0.02677164998644...|    x| true|
| 0.31735092273633597|    x|false|
|  0.6062886568962988|    x|false|
|   0.753766378659703|    x|false|
|  0.8612113741693206|    x|false|
|  -1.261605945319069|    y|false|
| -1.0453771305385342|    y| true|
|-0.24332625188556253|    y| true|
| 0.45181233874578974|    y|false|
|  0.5323378882945463|    y|false|
|  0.5628467852810314|    y| true|
|  0.9137407048596775|    y|false|
|  2.1503829673811126|    y| true|
|  -0.712390662050588|    z|false|
|-0.04450307833805...|    z|false|
| 0.12730328020698067|    z|false|
|  1.3451017084510097|    z|false|
|  1.3501878997225267|    z|false|
|  1.4786857374358966|    z| true|
+--------------------+-----+-----+



###    5. Sort by `abool`, `group`, and `n`. Does it matter in what order you
       specify the columns when sorting?

In [73]:
df.sort(df.abool, df.group, df.n).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
| -0.7889890249515489|    x|false|
| 0.31735092273633597|    x|false|
|  0.6062886568962988|    x|false|
|   0.753766378659703|    x|false|
|  0.8612113741693206|    x|false|
|  -1.261605945319069|    y|false|
| 0.45181233874578974|    y|false|
|  0.5323378882945463|    y|false|
|  0.9137407048596775|    y|false|
|  -0.712390662050588|    z|false|
|-0.04450307833805...|    z|false|
| 0.12730328020698067|    z|false|
|  1.3451017084510097|    z|false|
|  1.3501878997225267|    z|false|
|-0.02677164998644...|    x| true|
| -1.0453771305385342|    y| true|
|-0.24332625188556253|    y| true|
|  0.5628467852810314|    y| true|
|  2.1503829673811126|    y| true|
|  1.4786857374358966|    z| true|
+--------------------+-----+-----+



In [74]:
df.sort(df.abool, df.n, df.group).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -1.261605945319069|    y|false|
| -0.7889890249515489|    x|false|
|  -0.712390662050588|    z|false|
|-0.04450307833805...|    z|false|
| 0.12730328020698067|    z|false|
| 0.31735092273633597|    x|false|
| 0.45181233874578974|    y|false|
|  0.5323378882945463|    y|false|
|  0.6062886568962988|    x|false|
|   0.753766378659703|    x|false|
|  0.8612113741693206|    x|false|
|  0.9137407048596775|    y|false|
|  1.3451017084510097|    z|false|
|  1.3501878997225267|    z|false|
| -1.0453771305385342|    y| true|
|-0.24332625188556253|    y| true|
|-0.02677164998644...|    x| true|
|  0.5628467852810314|    y| true|
|  1.4786857374358966|    z| true|
|  2.1503829673811126|    y| true|
+--------------------+-----+-----+

