In [2]:
# notebook dependencies
import pyspark
from pyspark.sql.functions import col, expr
from pyspark.sql.functions import regexp_extract, regexp_replace

# note: the pyspark avg and mean functions are aliases of eachother
from pyspark.sql.functions import concat, sum, avg, min, max, count, mean, lit

# note: the following import, imports all pyspark sql functions similar to above
from pyspark.sql.functions import *

# creating the spark instance
spark = pyspark.sql.SparkSession.builder.getOrCreate()

# pandas, numpy, and matplotlib imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 

# pydatasets
from pydataset import data

# tqdm loading bar library
from tqdm.notebook import tqdm, trange
import time # to be used in loop iterations

# disabling warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Create a spark data frame that contains your favorite programming languages.

# The name of the column should be language
# View the schema of the dataframe
# Output the shape of the dataframe
# Show the first 5 records in the dataframe

In [5]:
# creating a pyspark dataframe

df = pd.DataFrame({ 
    "n": np.arange(20), 
    "language": "python"
})

df = spark.createDataFrame(df)
df

DataFrame[n: bigint, language: string]

In [12]:
# printing the pyspark dataframe shape

print(f'pyspark df shape: {(df.count(), len(df.columns))}')

pyspark df shape: (20, 2)


In [19]:
# describing the pyspark df

df.describe().show()

+-------+-----------------+--------+
|summary|                n|language|
+-------+-----------------+--------+
|  count|               20|      20|
|   mean|              9.5|    null|
| stddev|5.916079783099616|    null|
|    min|                0|  python|
|    max|               19|  python|
+-------+-----------------+--------+



In [13]:
# showing the first 5 records in the dataframe

df.show(5)

+---+--------+
|  n|language|
+---+--------+
|  0|  python|
|  1|  python|
|  2|  python|
|  3|  python|
|  4|  python|
+---+--------+
only showing top 5 rows



In [23]:
# exercise number 2: Load the mpg dataset as a spark dataframe

mpg = data("mpg")
mpg = spark.createDataFrame(mpg)

mpg.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [34]:
# Create 1 column of output that contains a message like the one below:
# using a combination of pyspark functions (concat, expr, and lit?)

# using the "concatenate with separator method"

mpg.select(concat_ws(
    " ",
    lit("The"),
    mpg.year,
    mpg.model, 
    lit("has a"),
    mpg.cyl,
    lit("cylinder enginge.")).alias("concat_example")
).show(5)

+--------------------+
|      concat_example|
+--------------------+
|The 1999 a4 has a...|
|The 1999 a4 has a...|
|The 2008 a4 has a...|
|The 2008 a4 has a...|
|The 1999 a4 has a...|
+--------------------+
only showing top 5 rows



In [37]:
#  2b. Transform the trans column so that it only contains either manual or auto.
# will use reg expression to accomplish this task

mpg.select("trans").regexp_extract("trans", r'')

SyntaxError: EOL while scanning string literal (617943060.py, line 4)