# Curriculum and Reps

- First line is curriculum code with notes.  

- Second line is my fingers actually typing the code

In [5]:
import pyspark

spark = pyspark.sql.SparkSession.builder.getOrCreate()

# 'SparkSession' is lets you work with dataframes and tables using SQL-like queries

# 'builder' creates the Sparksession

# 'getOrCreate()' gets an existing SparkSession if there is one.  If there's NOT one, 
# 'getOrCreate()' makes one.


In [6]:
import pyspark

spark = pyspark.sql.SparkSession.builder.getOrCreate()

In [7]:
import pandas as pd
import numpy as np

np.random.seed(456)

pandas_dataframe = pd.DataFrame(
    dict(n=np.arange(20), group=np.random.choice(list("abc"), 20))
)

# "Create a pandas DataFrame with 20 rows numbered 0-19.  
# "First column is 'n,' second column is 'group.'  in 'group' column, give me 
# "20 random letters either 'a, b, or c.'"

pandas_dataframe.head()

Unnamed: 0,n,group
0,0,b
1,1,b
2,2,c
3,3,a
4,4,c


In [9]:
random_seed = (456)

pandas_dataframe = pd.DataFrame(
    dict(n=np.arange(20), group=np.random.choice(list("abc"), 20))
)

pandas_dataframe.head()

Unnamed: 0,n,group
0,0,c
1,1,b
2,2,c
3,3,a
4,4,b


In [10]:
# df = spark.createDataFrame(pandas_dataframe)
# df

# converts this pandas df to a spark df w/ 'createDataFrame(what you want to turn into 
# spark dataframe)'

df = spark.createDataFrame(pandas_dataframe)
df

DataFrame[n: bigint, group: string]

**^^ 'bigint' is a Java integer in the pentillions, so just think of it as an integer**

- also look how it SAYS it's a DataFrame, but it doesn't SHOW us a datframe, like Pandas does

In [11]:
df.show(5)

+---+-----+
|  n|group|
+---+-----+
|  0|    c|
|  1|    b|
|  2|    c|
|  3|    a|
|  4|    b|
+---+-----+
only showing top 5 rows



**^^ '.show( )' makes spark get off its ass and get to work** 

- Next two lines are another example of what it does:

In [13]:
df.describe() # describes it's a dataframe

DataFrame[summary: string, n: string, group: string]

In [15]:
df.describe().show() # SHOWS us a dataframe

+-------+-----------------+-----+
|summary|                n|group|
+-------+-----------------+-----+
|  count|               20|   20|
|   mean|              9.5| null|
| stddev|5.916079783099616| null|
|    min|                0|    a|
|    max|               19|    c|
+-------+-----------------+-----+



In [17]:
from pydataset import data # our data library from intro to pandas days

mpg = spark.createDataFrame(data("mpg"))
mpg.show(5)

# note: if pandas df, it'd just be 'mpg,' but with spark it's 'mpg.show()'

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [18]:
mpg.hwy # shows us that 'hwy' is a column object.  It's just a vertical table slice

Column<b'hwy'>

In [21]:
# use '.select()' to get Spark to show the datatypes in the columns

mpg.select(mpg.hwy, mpg.cty, mpg.model)

DataFrame[hwy: bigint, cty: bigint, model: string]

In [22]:
# Types are great, but we need to see details, 
# So we need to ask Spark to "Show" them to us.

In [23]:
mpg.select(mpg.hwy, mpg.cty, mpg.model).show()

+---+---+------------------+
|hwy|cty|             model|
+---+---+------------------+
| 29| 18|                a4|
| 29| 21|                a4|
| 31| 20|                a4|
| 30| 21|                a4|
| 26| 16|                a4|
| 26| 18|                a4|
| 27| 18|                a4|
| 26| 18|        a4 quattro|
| 25| 16|        a4 quattro|
| 28| 20|        a4 quattro|
| 27| 19|        a4 quattro|
| 25| 15|        a4 quattro|
| 25| 17|        a4 quattro|
| 25| 17|        a4 quattro|
| 25| 15|        a4 quattro|
| 24| 15|        a6 quattro|
| 25| 17|        a6 quattro|
| 23| 16|        a6 quattro|
| 20| 14|c1500 suburban 2wd|
| 15| 11|c1500 suburban 2wd|
+---+---+------------------+
only showing top 20 rows



In [24]:
# Spark lets you do vector math up and down a single column

mpg.hwy + 1

Column<b'(hwy + 1)'>

In [25]:
# To see the column with the 1 added to the numbers, select then show:

mpg.select(mpg.hwy, mpg.hwy + 1).show(5)

+---+---------+
|hwy|(hwy + 1)|
+---+---------+
| 29|       30|
| 29|       30|
| 31|       32|
| 30|       31|
| 26|       27|
+---+---------+
only showing top 5 rows



**^^ You can see that the math was applied by showing 'mpg.hwy' next to 'mpg.hwy+1**'

In [26]:
# Format for renaming a column:

# df_name.select(df.column.alias("new_name")).show()

mpg.select(mpg.hwy.alias("highway_miles")).show(5)

+-------------+
|highway_miles|
+-------------+
|           29|
|           29|
|           31|
|           30|
|           26|
+-------------+
only showing top 5 rows



In [29]:
# To store column objects as variables and reference them:

col1 = mpg.hwy.alias("highway_mileage")
col2 = (mpg.hwy / 2).alias("highway_mileage_halved")
mpg.select(col1, col2).show(5)

+---------------+----------------------+
|highway_mileage|highway_mileage_halved|
+---------------+----------------------+
|             29|                  14.5|
|             29|                  14.5|
|             31|                  15.5|
|             30|                  15.0|
|             26|                  13.0|
+---------------+----------------------+
only showing top 5 rows



### In Spark, the 'col' and 'expr' create columns.  

- they need to be imported from the 'pyspark.sql.functions' library

- 'col' lets you create a column and use it just like you would in the df Pandas work

- 'expr' does everything 'col' does, but is more powerful

In [30]:
from pyspark.sql.functions import col, expr

In [31]:
col("hwy")

Column<b'hwy'>

**Now that you've created a 'hwy' column, you can do math with it, rename it, etc.  IE as soon as you col("something") you can do all the manipulations to it you want, just like in the code above.**

In [37]:
avg_column = (col("hwy") + col("cty")) / 2

mpg.select(col("hwy").alias("highway_mileage"), # change from 'col' to 'higheway_mileage'
          mpg.cty.alias("city_mileage"), # change from 'cty' to 'city_mileage'
          avg_column.alias("average_mileage"),).show(10) # change 'avg_column' to 'average_mileage'

"""
Note the format:

df.select(col("col_name").alias("new_col_name"),
        df.df_col_name.alias("new_df_col_name"),
        variable_name.alias("new_variable_name"), .show(10)
        
Within the code, I use the column I created using 'col', the column name from the
existing dataframe, and my variable name.
"""

+---------------+------------+---------------+
|highway_mileage|city_mileage|average_mileage|
+---------------+------------+---------------+
|             29|          18|           23.5|
|             29|          21|           25.0|
|             31|          20|           25.5|
|             30|          21|           25.5|
|             26|          16|           21.0|
|             26|          18|           22.0|
|             27|          18|           22.5|
|             26|          18|           22.0|
|             25|          16|           20.5|
|             28|          20|           24.0|
+---------------+------------+---------------+
only showing top 10 rows



'\nNote the format:\n\ndf.select(col("col_name").alias("new_col_name"),\n        df.df_col_name.alias("new_df_col_name"),\n        variable_name.alias("new_variable_name"), .show(10)\n        \nWithin the code, I use the column I created using \'col\', the column name from the\nexisting dataframe, and my variable name.\n'

**Now for 'expr':**

In [39]:
mpg.select(
    expr("hwy"), # just like I'd do 'col'
    expr("hwy + 1"), # adding one down the entire column = vector math
    expr("hwy AS highway_mileage"), # creates alias like I would in an MySQL query
    expr("hwy + 1 AS highway_incremented"), # renames the column I'm adding 1 to
    ).show(10)

+---+---------+---------------+-------------------+
|hwy|(hwy + 1)|highway_mileage|highway_incremented|
+---+---------+---------------+-------------------+
| 29|       30|             29|                 30|
| 29|       30|             29|                 30|
| 31|       32|             31|                 32|
| 30|       31|             30|                 31|
| 26|       27|             26|                 27|
| 26|       27|             26|                 27|
| 27|       28|             27|                 28|
| 26|       27|             26|                 27|
| 25|       26|             25|                 26|
| 28|       29|             28|                 29|
+---+---------+---------------+-------------------+
only showing top 10 rows



In [40]:
# These are 4 ways to express the same thing:

mpg.select(
    mpg.hwy.alias("highway"),
    col("hwy").alias("highway"),
    expr("hwy").alias("highway"),
    expr("hwy AS highway"),
).show(5)

+-------+-------+-------+-------+
|highway|highway|highway|highway|
+-------+-------+-------+-------+
|     29|     29|     29|     29|
|     29|     29|     29|     29|
|     31|     31|     31|     31|
|     30|     30|     30|     30|
|     26|     26|     26|     26|
+-------+-------+-------+-------+
only showing top 5 rows



## Spark SQL

- lets us write against our spark df's using SQL quries (kind of like what we did when we gave aliases using 'expr' above)

Couple of things you have to do before using Spark SQL:

1.) You gotta register your table with Spark

2.) Now that it's registered, we can begin sending SQL queries to it

In [44]:
# Register

mpg.createOrReplaceTempView("mpg")

In [45]:
# Now we can start sending queries

spark.sql(
"""
SELECT hwy, cty, (hwy + cty) / 2 AS avg
FROM mpg
""")

DataFrame[hwy: bigint, cty: bigint, avg: double]

^^ Okay, so this shows us the dtype of the columns, but doesn't show us anything...

In [48]:
spark.sql(
"""
SELECT hwy, cty, (hwy + cty)/ 2 AS avg
FROM mpg
""").show(10)

# Note format:
#     (
#     triple quotes
#     SELECT columns_you, want_to, see
#     FROM df_you're_looking_at
#     triple quotes
#     ).show()


+---+---+----+
|hwy|cty| avg|
+---+---+----+
| 29| 18|23.5|
| 29| 21|25.0|
| 31| 20|25.5|
| 30| 21|25.5|
| 26| 16|21.0|
| 26| 18|22.0|
| 27| 18|22.5|
| 26| 18|22.0|
| 25| 16|20.5|
| 28| 20|24.0|
+---+---+----+
only showing top 10 rows



## Typecasting

**Because machine learning models only accept integer datatypes, we'll need to change datatypes as we go along in order for the models to read them.  'TYPECASTING' lets us us this, similar to '.astype( )' in Pandas**

In [49]:
# view the datatypes in mpg

mpg.dtypes

[('manufacturer', 'string'),
 ('model', 'string'),
 ('displ', 'double'),
 ('year', 'bigint'),
 ('cyl', 'bigint'),
 ('trans', 'string'),
 ('drv', 'string'),
 ('cty', 'bigint'),
 ('hwy', 'bigint'),
 ('fl', 'string'),
 ('class', 'string')]

^^ We see 3 different datatypes: 'string,' 'double,' and 'bigint.'  You can also do this by using 'mpg.printSchema()' - 'schema' = the structure of the dataframe

In [50]:
mpg.printSchema()

root
 |-- manufacturer: string (nullable = true)
 |-- model: string (nullable = true)
 |-- displ: double (nullable = true)
 |-- year: long (nullable = true)
 |-- cyl: long (nullable = true)
 |-- trans: string (nullable = true)
 |-- drv: string (nullable = true)
 |-- cty: long (nullable = true)
 |-- hwy: long (nullable = true)
 |-- fl: string (nullable = true)
 |-- class: string (nullable = true)



### To convert to a different datatype, we use 'cast':

In [54]:
mpg.select(mpg.hwy.cast("string")).printSchema()

# we are selecting the 'hwy' column from the 'mpg' df
# and changing it from a 'bigint' (or 'long') into a 'string' using 'cast'
# the output below shows us this is what we've accomplished

root
 |-- hwy: string (nullable = true)



**If a value cannot be converted using '.cast( )', then it will be replaced with a 'null' as we can see from the following**

In [56]:
mpg.select(mpg.model, mpg.model.cast("int")).show(10)

# "select 'mpg.model' from 'mpg' and make cast its dtype from 'string' to 'int'""

+----------+-----+
|     model|model|
+----------+-----+
|        a4| null|
|        a4| null|
|        a4| null|
|        a4| null|
|        a4| null|
|        a4| null|
|        a4| null|
|a4 quattro| null|
|a4 quattro| null|
|a4 quattro| null|
+----------+-----+
only showing top 10 rows

