In [1]:
import pyspark
# once per notebook
spark = pyspark.sql.SparkSession.builder.getOrCreate()

In [2]:
import multiprocessing
import pyspark

nprocs = multiprocessing.cpu_count()

spark = (pyspark.sql.SparkSession.builder
 .master('local')
 .config('spark.jars.packages', 'mysql:mysql-connector-java:8.0.16')
 .config('spark.driver.memory', '4G')
 .config('spark.driver.cores', nprocs)
 .config('spark.sql.shuffle.partitions', nprocs)
 .appName('MySparkApplication')
 .getOrCreate())

In [3]:
import pyspark.sql.functions as F

In [6]:
# Spark API Mini Exercises

# Copy the code below to create a pandas dataframe with 20 rows and 3 columns:

# ```python
import pandas as pd
import numpy as np

np.random.seed(13)

pandas_dataframe = pd.DataFrame({
    "n": np.random.randn(20),
    "group": np.random.choice(list("xyz"), 20),
    "abool": np.random.choice([True, False], 20),
})

In [7]:
pandas_dataframe.head()

Unnamed: 0,n,group,abool
0,-0.712391,z,False
1,0.753766,x,False
2,-0.044503,z,False
3,0.451812,y,False
4,1.345102,z,False


In [8]:
pandas_dataframe.shape

(20, 3)

# Set 1 

In [11]:
df = spark.createDataFrame(pandas_dataframe)
df

DataFrame[n: double, group: string, abool: boolean]

In [13]:
df.show(3)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
+--------------------+-----+-----+
only showing top 3 rows



In [15]:
df.show(7)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
|  0.5323378882945463|    y|false|
|  1.3501878997225267|    z|false|
+--------------------+-----+-----+
only showing top 7 rows



In [19]:
df.describe().show()

+-------+------------------+-----+
|summary|                 n|group|
+-------+------------------+-----+
|  count|                20|   20|
|   mean|0.3664026449885217| null|
| stddev|0.8905322898155363| null|
|    min|-1.261605945319069|    x|
|    max|2.1503829673811126|    z|
+-------+------------------+-----+



In [23]:
df.select(df.n, df.abool).show(5)

+--------------------+-----+
|                   n|abool|
+--------------------+-----+
|  -0.712390662050588|false|
|   0.753766378659703|false|
|-0.04450307833805...|false|
| 0.45181233874578974|false|
|  1.3451017084510097|false|
+--------------------+-----+
only showing top 5 rows



In [24]:
df.select(df.group, df.abool).show(5)

+-----+-----+
|group|abool|
+-----+-----+
|    z|false|
|    x|false|
|    z|false|
|    y|false|
|    z|false|
+-----+-----+
only showing top 5 rows



In [25]:
df.select(df.group).show(5)

+-----+
|group|
+-----+
|    z|
|    x|
|    z|
|    y|
|    z|
+-----+
only showing top 5 rows



In [26]:
df.select(df.abool.alias("a_boolean_value")).show(3)

+---------------+
|a_boolean_value|
+---------------+
|          false|
|          false|
|          false|
+---------------+
only showing top 3 rows



In [28]:
df.select(df.group, df.n.alias("a_numeric_value")).show(6)

+-----+--------------------+
|group|     a_numeric_value|
+-----+--------------------+
|    z|  -0.712390662050588|
|    x|   0.753766378659703|
|    z|-0.04450307833805...|
|    y| 0.45181233874578974|
|    z|  1.3451017084510097|
|    y|  0.5323378882945463|
+-----+--------------------+
only showing top 6 rows



# Set 2

In [45]:
df = spark.createDataFrame(pandas_dataframe)
df

DataFrame[n: double, group: string, abool: boolean]

In [46]:
from pyspark.sql.functions import col, expr

In [47]:
plus4_column = (col("n") + 4)

In [48]:
multiply2_column = (col("n") * 2)

In [52]:
df.select(
    df.n.alias("n"),
    df.group.alias("group"),
    df.abool.alias("abool"),
    plus4_column.alias("n+4"),
    multiply2_column.alias("n*2"),
).show(5)

+--------------------+-----+-----+------------------+--------------------+
|                   n|group|abool|               n+4|                 n*2|
+--------------------+-----+-----+------------------+--------------------+
|  -0.712390662050588|    z|false|3.2876093379494122|  -1.424781324101176|
|   0.753766378659703|    x|false| 4.753766378659703|   1.507532757319406|
|-0.04450307833805...|    z|false|3.9554969216619464|-0.08900615667610691|
| 0.45181233874578974|    y|false|  4.45181233874579|  0.9036246774915795|
|  1.3451017084510097|    z|false|5.3451017084510095|  2.6902034169020195|
+--------------------+-----+-----+------------------+--------------------+
only showing top 5 rows



In [56]:
from pyspark.sql.functions import sqrt

In [62]:
n2_column = (pow(col("n"), 2).alias("n^2"))

In [63]:
n3_column = (pow(col("n"), 3).alias("n^3"))

In [64]:
df.select(
    df.n.alias("n"),
    df.group.alias("group"),
    df.abool.alias("abool"),
    plus4_column.alias("n+4"),
    multiply2_column.alias("n*2"),
    n2_column.alias("n^2"),
    n3_column.alias("n^3")
).show(5)

+--------------------+-----+-----+------------------+--------------------+--------------------+--------------------+
|                   n|group|abool|               n+4|                 n*2|                 n^2|                 n^3|
+--------------------+-----+-----+------------------+--------------------+--------------------+--------------------+
|  -0.712390662050588|    z|false|3.2876093379494122|  -1.424781324101176|   0.507500455376875| -0.3615385853969069|
|   0.753766378659703|    x|false| 4.753766378659703|   1.507532757319406|  0.5681637535977627|  0.4282627350350894|
|-0.04450307833805...|    z|false|3.9554969216619464|-0.08900615667610691|0.001980523981562...|-8.81394139018882...|
| 0.45181233874578974|    y|false|  4.45181233874579|  0.9036246774915795| 0.20413438944294027|  0.0922304359126587|
|  1.3451017084510097|    z|false|5.3451017084510095|  2.6902034169020195|  1.8092986060778251|   2.433690646133313|
+--------------------+-----+-----+------------------+-----------

In [67]:
df.group + df.abool

Column<'(group + abool)'>

In [69]:
# df.select(df.group + df.abool)

# Set 3

In [80]:
df = spark.createDataFrame(pandas_dataframe)
df

DataFrame[n: double, group: string, abool: boolean]

In [82]:
df.printSchema

<bound method DataFrame.printSchema of DataFrame[n: double, group: string, abool: boolean]>

In [83]:
df.dtypes

[('n', 'double'), ('group', 'string'), ('abool', 'boolean')]

In [None]:
# this shows the definition of the output

In [84]:
df.abool.cast('int')

Column<'CAST(abool AS INT)'>

In [None]:
# This shows the data frame due to the .show

In [87]:
df.select(df.abool.cast('int')).show(3)

+-----+
|abool|
+-----+
|    0|
|    0|
|    0|
+-----+
only showing top 3 rows



In [88]:
df.select(df.abool.cast('int')).show(3)

+-----+
|abool|
+-----+
|    0|
|    0|
|    0|
+-----+
only showing top 3 rows



In [89]:
df.select(df.group.cast('int')).show(3) # produces a null value.

+-----+
|group|
+-----+
| null|
| null|
| null|
+-----+
only showing top 3 rows



In [97]:
df.select(df.n.cast('int')).show(7) # turns all n values into int. Rounds to whole number.

+---+
|  n|
+---+
|  0|
|  0|
|  0|
|  0|
|  1|
|  0|
|  1|
+---+
only showing top 7 rows



In [99]:
df.select(df.abool.cast('string')).show(7) # turns into string

+-----+
|abool|
+-----+
|false|
|false|
|false|
|false|
|false|
|false|
|false|
+-----+
only showing top 7 rows



# Set 4

In [100]:
df = spark.createDataFrame(pandas_dataframe)
df

DataFrame[n: double, group: string, abool: boolean]

In [101]:
from pyspark.sql.functions import asc, desc

In [102]:
df.sort(desc("n")).show(1)

+------------------+-----+-----+
|                 n|group|abool|
+------------------+-----+-----+
|2.1503829673811126|    y| true|
+------------------+-----+-----+
only showing top 1 row



In [103]:
df.sort(asc("n")).show(1)

+------------------+-----+-----+
|                 n|group|abool|
+------------------+-----+-----+
|-1.261605945319069|    y|false|
+------------------+-----+-----+
only showing top 1 row



In [107]:
from pyspark.sql.functions import concat, sum, avg, min, max, count, mean

In [109]:
df.select(
    sum(df.n) / count(df.n).alias("average")).show()

+--------------------------------+
|(sum(n) / count(n) AS `average`)|
+--------------------------------+
|              0.3664026449885217|
+--------------------------------+



In [112]:
df1 = df.select((df.group).alias('Group: x')).show()

+--------+
|Group: x|
+--------+
|       z|
|       x|
|       z|
|       y|
|       z|
|       y|
|       z|
|       x|
|       z|
|       y|
|       x|
|       y|
|       y|
|       y|
|       y|
|       x|
|       z|
|       y|
|       x|
|       x|
+--------+



In [None]:
# Use concat to change the group column to say, e.g. "Group: x" or "Group: y"

In [None]:
# Use concat to combine the n and group columns to produce results that look like this: "x: -1.432" or "z: 2.352"

# Set 5

In [124]:
df = spark.createDataFrame(pandas_dataframe)
df.show(4)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
+--------------------+-----+-----+
only showing top 4 rows



In [146]:
from pyspark.sql.functions import when

In [125]:
df.select(df.abool, when(df.abool == True, "It is true").otherwise("It is false").alias("trueorfalse")).show(4)

+-----+-----------+
|abool|trueorfalse|
+-----+-----------+
|false|It is false|
|false|It is false|
|false|It is false|
|false|It is false|
+-----+-----------+
only showing top 4 rows



In [127]:
df.select(df.n, when(df.n < 0, 0).otherwise(df.n).alias("zeroornot")).show(4)

+--------------------+-------------------+
|                   n|          zeroornot|
+--------------------+-------------------+
|  -0.712390662050588|                0.0|
|   0.753766378659703|  0.753766378659703|
|-0.04450307833805...|                0.0|
| 0.45181233874578974|0.45181233874578974|
+--------------------+-------------------+
only showing top 4 rows



# Set 6

In [128]:
df = spark.createDataFrame(pandas_dataframe)
df.show(4)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
+--------------------+-----+-----+
only showing top 4 rows



In [133]:
df.filter(df.group == 'y').show(4)

+-------------------+-----+-----+
|                  n|group|abool|
+-------------------+-----+-----+
|0.45181233874578974|    y|false|
| 0.5323378882945463|    y|false|
|-1.0453771305385342|    y| true|
| -1.261605945319069|    y|false|
+-------------------+-----+-----+
only showing top 4 rows



In [134]:
df.filter(df.abool == 'false').show(4)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
+--------------------+-----+-----+
only showing top 4 rows



In [136]:
df.filter(df.group != 'y').show(4)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
|  1.3451017084510097|    z|false|
+--------------------+-----+-----+
only showing top 4 rows



In [137]:
df.filter(df.n >= 0).show(4)

+-------------------+-----+-----+
|                  n|group|abool|
+-------------------+-----+-----+
|  0.753766378659703|    x|false|
|0.45181233874578974|    y|false|
| 1.3451017084510097|    z|false|
| 0.5323378882945463|    y|false|
+-------------------+-----+-----+
only showing top 4 rows



In [138]:
df.filter(df.abool == True).where(df["group"] == "z").show()

+------------------+-----+-----+
|                 n|group|abool|
+------------------+-----+-----+
|1.4786857374358966|    z| true|
+------------------+-----+-----+



In [182]:
df.filter((df.abool == True)|(df.group == "z")).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|-0.04450307833805...|    z|false|
|  1.3451017084510097|    z|false|
|  1.3501878997225267|    z|false|
|  1.4786857374358966|    z| true|
| -1.0453771305385342|    y| true|
|  0.5628467852810314|    y| true|
|-0.24332625188556253|    y| true|
| 0.12730328020698067|    z|false|
|  2.1503829673811126|    y| true|
|-0.02677164998644...|    x| true|
+--------------------+-----+-----+



In [183]:
df.filter((df.abool == False)&(df.n > 1)).show()

+------------------+-----+-----+
|                 n|group|abool|
+------------------+-----+-----+
|1.3451017084510097|    z|false|
|1.3501878997225267|    z|false|
+------------------+-----+-----+



In [184]:
df.filter((df.abool == False)|(df.n > 1)).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
|  0.5323378882945463|    y|false|
|  1.3501878997225267|    z|false|
|  0.8612113741693206|    x|false|
|  1.4786857374358966|    z| true|
| -0.7889890249515489|    x|false|
|  -1.261605945319069|    y|false|
|  0.9137407048596775|    y|false|
| 0.31735092273633597|    x|false|
| 0.12730328020698067|    z|false|
|  2.1503829673811126|    y| true|
|  0.6062886568962988|    x|false|
+--------------------+-----+-----+



# Set 7

In [160]:
df = spark.createDataFrame(pandas_dataframe)
df.show(4)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
+--------------------+-----+-----+
only showing top 4 rows



In [162]:
df.sort(df.n).show(4)

+-------------------+-----+-----+
|                  n|group|abool|
+-------------------+-----+-----+
| -1.261605945319069|    y|false|
|-1.0453771305385342|    y| true|
|-0.7889890249515489|    x|false|
| -0.712390662050588|    z|false|
+-------------------+-----+-----+
only showing top 4 rows



In [178]:
df.sort(asc(df.group)).show(4)

+-------------------+-----+-----+
|                  n|group|abool|
+-------------------+-----+-----+
| 0.6062886568962988|    x|false|
|0.31735092273633597|    x|false|
| 0.8612113741693206|    x|false|
|-0.7889890249515489|    x|false|
+-------------------+-----+-----+
only showing top 4 rows



In [177]:
df.sort(desc(df.group)).show(4)

+-------------------+-----+-----+
|                  n|group|abool|
+-------------------+-----+-----+
|0.12730328020698067|    z|false|
| 1.3451017084510097|    z|false|
| 1.3501878997225267|    z|false|
| 1.4786857374358966|    z| true|
+-------------------+-----+-----+
only showing top 4 rows



In [164]:
df.sort(df.group, df.n).show(4)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
| -0.7889890249515489|    x|false|
|-0.02677164998644...|    x| true|
| 0.31735092273633597|    x|false|
|  0.6062886568962988|    x|false|
+--------------------+-----+-----+
only showing top 4 rows



In [179]:
df.sort(df.group, df.n, df.abool).show(4)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
| -0.7889890249515489|    x|false|
|-0.02677164998644...|    x| true|
| 0.31735092273633597|    x|false|
|  0.6062886568962988|    x|false|
+--------------------+-----+-----+
only showing top 4 rows



In [180]:
df.sort(df.group, df.abool, df.n).show(4)

+-------------------+-----+-----+
|                  n|group|abool|
+-------------------+-----+-----+
|-0.7889890249515489|    x|false|
|0.31735092273633597|    x|false|
| 0.6062886568962988|    x|false|
|  0.753766378659703|    x|false|
+-------------------+-----+-----+
only showing top 4 rows



In [181]:
df.sort(df.abool, df.group, df.n).show(4)

+-------------------+-----+-----+
|                  n|group|abool|
+-------------------+-----+-----+
|-0.7889890249515489|    x|false|
|0.31735092273633597|    x|false|
| 0.6062886568962988|    x|false|
|  0.753766378659703|    x|false|
+-------------------+-----+-----+
only showing top 4 rows

