In [1]:
import pyspark

In [2]:
spark = pyspark.sql.SparkSession.builder.getOrCreate()

In [4]:
import pandas as pd
import numpy as np

np.random.seed(13)

pandas_dataframe = pd.DataFrame(
    {
        "n": np.random.randn(20),
        "group": np.random.choice(list("xyz"), 20),
        "abool": np.random.choice([True, False], 20),
    }
)

1. Spark Dataframe Basics

    1. Use the starter code above to create a pandas dataframe.
    1. Convert the pandas dataframe to a spark dataframe. From this point
       forward, do all of your work with the spark dataframe, not the pandas
       dataframe.
    1. Show the first 3 rows of the dataframe.
    1. Show the first 7 rows of the dataframe.
    1. View a summary of the data using `.describe`.
    1. Use `.select` to create a new dataframe with just the `n` and `abool`
       columns. View the first 5 rows of this dataframe.
    1. Use `.select` to create a new dataframe with just the `group` and `abool`
       columns. View the first 5 rows of this dataframe.
    1. Use `.select` to create a new dataframe with the `group` column and the
       `abool` column renamed to `a_boolean_value`. Show the first 3 rows of
       this dataframe.
    1. Use `.select` to create a new dataframe with the `group` column and the
       `n` column renamed to `a_numeric_value`. Show the first 6 rows of this
       dataframe.

In [5]:
df = spark.createDataFrame(pandas_dataframe)

In [8]:
df.show(3)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
+--------------------+-----+-----+
only showing top 3 rows



In [9]:
df.show(7)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
|  0.5323378882945463|    y|false|
|  1.3501878997225267|    z|false|
+--------------------+-----+-----+
only showing top 7 rows



In [10]:
df.describe()

DataFrame[summary: string, n: string, group: string]

In [11]:
df1 = df.select('n','abool').show(5)

+--------------------+-----+
|                   n|abool|
+--------------------+-----+
|  -0.712390662050588|false|
|   0.753766378659703|false|
|-0.04450307833805...|false|
| 0.45181233874578974|false|
|  1.3451017084510097|false|
+--------------------+-----+
only showing top 5 rows



In [12]:
df3 = df.select('group','abool').show(5)

+-----+-----+
|group|abool|
+-----+-----+
|    z|false|
|    x|false|
|    z|false|
|    y|false|
|    z|false|
+-----+-----+
only showing top 5 rows



In [14]:
df4 = df.select('group', df.abool.alias('a_boolean_value')).show(3)

+-----+---------------+
|group|a_boolean_value|
+-----+---------------+
|    z|          false|
|    x|          false|
|    z|          false|
+-----+---------------+
only showing top 3 rows



In [15]:
df5 = df.select('group',df.n.alias('a_numeric_value')).show(6)

+-----+--------------------+
|group|     a_numeric_value|
+-----+--------------------+
|    z|  -0.712390662050588|
|    x|   0.753766378659703|
|    z|-0.04450307833805...|
|    y| 0.45181233874578974|
|    z|  1.3451017084510097|
|    y|  0.5323378882945463|
+-----+--------------------+
only showing top 6 rows



1. Column Manipulation

    1. Use the starter code above to re-create a spark dataframe. Store the
       spark dataframe in a varaible named `df`

    1. Use `.select` to add 4 to the `n` column. Show the results.

    1. Subtract 5 from the `n` column and view the results.

    1. Multiply the `n` column by 2. View the results along with the original
       numbers.

    1. Add a new column named `n2` that is the `n` value multiplied by -1. Show
       the first 4 rows of your dataframe. You should see the original `n` value
       as well as `n2`.

    1. Add a new column named `n3` that is the n value squared. Show the first 5
       rows of your dataframe. You should see both `n`, `n2`, and `n3`.

    1. What happens when you run the code below?

        ```python
        df.group + df.abool
        ```

    1. What happens when you run the code below? What is the difference between
       this and the previous code sample?

        ```python
        df.select(df.group + df.abool)
        ```

    1. Try adding various other columns together. What are the results of
       combining the different data types?

In [16]:
pandas_dataframe = pd.DataFrame(
    {
        "n": np.random.randn(20),
        "group": np.random.choice(list("xyz"), 20),
        "abool": np.random.choice([True, False], 20),
    }
)

In [17]:
df = spark.createDataFrame(pandas_dataframe)

In [18]:
df1 = df.select(df.n + 4).show(5)

+------------------+
|           (n + 4)|
+------------------+
|3.1149379007131692|
| 4.072726746112778|
|  3.17248089880026|
| 3.408449078116781|
|1.8137843744202362|
+------------------+
only showing top 5 rows



In [19]:
df2 = df.select(df.n - 5).show(5)

+------------------+
|           (n - 5)|
+------------------+
|-5.885062099286831|
|-4.927273253887222|
| -5.82751910119974|
|-5.591550921883219|
|-7.186215625579764|
+------------------+
only showing top 5 rows



In [22]:
times2 = df.n * 2

In [23]:
df.select('*', times2.alias('multiplied_by_2')).show(5)

+-------------------+-----+-----+-------------------+
|                  n|group|abool|    multiplied_by_2|
+-------------------+-----+-----+-------------------+
|-0.8850620992868307|    x|false|-1.7701241985736613|
|0.07272674611277782|    x| true|0.14545349222555565|
|  -0.82751910119974|    x|false|  -1.65503820239948|
| -0.591550921883219|    y|false| -1.183101843766438|
| -2.186215625579764|    y| true| -4.372431251159528|
+-------------------+-----+-----+-------------------+
only showing top 5 rows



In [26]:
times1 = df.n * -1
df4 = df.select('*', times1.alias('n2')).show(4)

+-------------------+-----+-----+--------------------+
|                  n|group|abool|                  n2|
+-------------------+-----+-----+--------------------+
|-0.8850620992868307|    x|false|  0.8850620992868307|
|0.07272674611277782|    x| true|-0.07272674611277782|
|  -0.82751910119974|    x|false|    0.82751910119974|
| -0.591550921883219|    y|false|   0.591550921883219|
+-------------------+-----+-----+--------------------+
only showing top 4 rows



In [29]:
squared = df.n ** 2

In [40]:
df_4 = df.select('*', times1.alias('n2'))

In [42]:
df5 = df_4.select('*', squared.alias('n3'))
df5.show(5)

+-------------------+-----+-----+--------------------+--------------------+
|                  n|group|abool|                  n2|                  n3|
+-------------------+-----+-----+--------------------+--------------------+
|-0.8850620992868307|    x|false|  0.8850620992868307|  0.7833349195940117|
|0.07272674611277782|    x| true|-0.07272674611277782|0.005289179600152444|
|  -0.82751910119974|    x|false|    0.82751910119974|  0.6847878628504256|
| -0.591550921883219|    y|false|   0.591550921883219| 0.34993249318088626|
| -2.186215625579764|    y| true|   2.186215625579764|   4.779538761529118|
+-------------------+-----+-----+--------------------+--------------------+
only showing top 5 rows



In [45]:
df.group + df.abool

Column<'(group + abool)'>

In [46]:
df.select(df.group + df.abool)

AnalysisException: cannot resolve '(CAST(`group` AS DOUBLE) + `abool`)' due to data type mismatch: differing types in '(CAST(`group` AS DOUBLE) + `abool`)' (double and boolean).;
'Project [(cast(group#208 as double) + abool#209) AS (group + abool)#416]
+- LogicalRDD [n#207, group#208, abool#209], false


In [47]:
#gives me an error due to mismatched datatypes

In [48]:
col = df.group + df.abool
df.select('*', col.alias('boolgroup'))

AnalysisException: cannot resolve '(CAST(`group` AS DOUBLE) + `abool`)' due to data type mismatch: differing types in '(CAST(`group` AS DOUBLE) + `abool`)' (double and boolean).;
'Project [n#207, group#208, abool#209, (cast(group#208 as double) + abool#209) AS boolgroup#417]
+- LogicalRDD [n#207, group#208, abool#209], false


1. Type casting

    1. Use the starter code above to re-create a spark dataframe.

    1. Use `.printSchema` to view the datatypes in your dataframe.

    1. Use `.dtypes` to view the datatypes in your dataframe.

    1. What is the difference between the two code samples below?

        ```python
        df.abool.cast('int')
        ```

        ```python
        df.select(df.abool.cast('int')).show()
        ```

    1. Use `.select` and `.cast` to convert the `abool` column to an integer
       type. View the results.
    1. Convert the `group` column to a integer data type and view the results.
       What happens?
    1. Convert the `n` column to a integer data type and view the results. What
       happens?
    1. Convert the `abool` column to a string data type and view the results.
       What happens?



In [50]:
pandas_dataframe = pd.DataFrame(
    {
        "n": np.random.randn(20),
        "group": np.random.choice(list("xyz"), 20),
        "abool": np.random.choice([True, False], 20),
    }
)

In [51]:
df = spark.createDataFrame(pandas_dataframe)

In [55]:
df.printSchema

<bound method DataFrame.printSchema of DataFrame[n: double, group: string, abool: boolean]>

In [56]:
df.dtypes

[('n', 'double'), ('group', 'string'), ('abool', 'boolean')]

In [57]:
df.abool.cast('int') # this is the transformer

Column<'CAST(abool AS INT)'>

In [60]:
df.select(df.abool.cast('int')).show(2)# transformer with action

+-----+
|abool|
+-----+
|    1|
|    0|
+-----+
only showing top 2 rows



1. Built-in Functions

    1. Use the starter code above to re-create a spark dataframe.
    1. Import the necessary functions from `pyspark.sql.functions`
    1. Find the highest `n` value.
    1. Find the lowest `n` value.
    1. Find the average `n` value.
    1. Use `concat` to change the `group` column to say, e.g. "Group: x" or
       "Group: y"
    1. Use `concat` to combine the `n` and `group` columns to produce results
       that look like this: "x: -1.432" or "z: 2.352"



1. When / Otherwise

    1. Use the starter code above to re-create a spark dataframe.
    1. Use `when` and `.otherwise` to create a column that contains the text "It
       is true" when `abool` is true and "It is false"" when `abool` is false.
    1. Create a column that contains 0 if n is less than 0, otherwise, the
       original n value.