In [1]:
import pyspark

# Obtain the Spark session via the builder's getOrCreate().
spark = (
    pyspark.sql.SparkSession
    .builder
    .getOrCreate()
)

In [33]:
import pandas as pd
import numpy as np

import pyspark.sql.functions as F

1. Create a spark data frame that contains your favorite programming languages.

    - The name of the column should be language
    - View the schema of the dataframe
    - Output the shape of the dataframe
    - Show the first 5 records in the dataframe

In [16]:
# Build a one-column pandas frame of favorite languages; the column is
# named `language` at construction time.
df = pd.DataFrame(
    {
        'language': [
            'python', 'sql', 'html', 'java', 'javascript',
            'c', 'c++', 'basic', 'markdown', 'php',
        ]
    }
)

In [18]:
# Convert the pandas frame into a Spark DataFrame.
# NOTE: this rebinds `df` from the pandas frame to the Spark frame, so the
# pandas cell above must be re-run before this cell is run again.
df = spark.createDataFrame(data=df)


DataFrame[language: string]

In [19]:
# View the schema: prints each column's name, type, and nullability as a tree.
df.printSchema()

root
 |-- language: string (nullable = true)



In [21]:
# Shape as (rows, columns): Spark frames have no `.shape`, so combine the
# row count with the number of column names.
print(f"({df.count()}, {len(df.columns)})")

(10, 1)


In [25]:
# Display the first 5 rows of the languages frame.
df.show(n=5)

+----------+
|  language|
+----------+
|    python|
|       sql|
|      html|
|      java|
|javascript|
+----------+
only showing top 5 rows



2. Load the mpg dataset as a spark dataframe.

    a. For each vehicle, create 1 column of output that contains a message like the one below:


> The 1999 audi a4 has a 4 cylinder engine.

    b. Transform the trans column so that it only contains either manual or auto.

In [62]:
# Load the mpg dataset from pydataset and convert it to a Spark DataFrame.
from pydataset import data

mpg = spark.createDataFrame(
    data('mpg')
)

In [63]:
# Add a `statement` column describing each vehicle, e.g.
# "The 1999 audi a4 has a 4 cylinder engine".
# Literal text is wrapped in F.lit so it can be concatenated with columns.
mpg = mpg.withColumn(
    "statement",
    F.concat(
        F.lit('The '),
        mpg["year"],
        F.lit(" "),
        mpg["manufacturer"],
        F.lit(" "),
        mpg["model"],
        F.lit(" has a "),
        mpg["cyl"],
        F.lit(" cylinder engine"),
    ),
)

In [88]:
# truncate=False prints each statement in full instead of cutting it short.
mpg.select("statement").show(truncate=False)

+-------------------------------------------------------------+
|statement                                                    |
+-------------------------------------------------------------+
|The 1999 audi a4 has a 4 cylinder engine                     |
|The 1999 audi a4 has a 4 cylinder engine                     |
|The 2008 audi a4 has a 4 cylinder engine                     |
|The 2008 audi a4 has a 4 cylinder engine                     |
|The 1999 audi a4 has a 6 cylinder engine                     |
|The 1999 audi a4 has a 6 cylinder engine                     |
|The 2008 audi a4 has a 6 cylinder engine                     |
|The 1999 audi a4 quattro has a 4 cylinder engine             |
|The 1999 audi a4 quattro has a 4 cylinder engine             |
|The 2008 audi a4 quattro has a 4 cylinder engine             |
|The 2008 audi a4 quattro has a 4 cylinder engine             |
|The 1999 audi a4 quattro has a 6 cylinder engine             |
|The 1999 audi a4 quattro has a 6 cylind

In [92]:
# transform trans column to be manual or auto
# Drop the parenthesised suffix, e.g. "auto(l5)" -> "auto", "manual(m5)" -> "manual".
trans_simplified = F.regexp_replace(mpg.trans, r'\(.+\)', "")

# Keep the result: a select() on its own only displays the values and leaves
# no frame holding the transformed column. `mpg` itself is left untouched so
# any cell relying on the original `trans` values keeps working.
mpg_trans = mpg.withColumn("trans", trans_simplified)

# Side-by-side check of the original and simplified values.
mpg.select(mpg.trans,
           trans_simplified
          ).show(5)

+----------+-------------------------------+
|     trans|regexp_replace(trans, \(.+\), )|
+----------+-------------------------------+
|  auto(l5)|                           auto|
|manual(m5)|                         manual|
|manual(m6)|                         manual|
|  auto(av)|                           auto|
|  auto(l5)|                           auto|
+----------+-------------------------------+
only showing top 5 rows



3. Load the tips dataset as a spark dataframe.

    a. What percentage of observations are smokers?
    
    b. Create a column that contains the tip percentage
    
    c. Calculate the average tip percentage for each combination of sex and smoker.

In [94]:
# Load the tips dataset from pydataset and convert it to a Spark DataFrame.
tips = spark.createDataFrame(
    data('tips')
)

In [95]:
# Preview the first 5 rows of the tips frame.
tips.show(n=5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



In [113]:
# Share of observations that are smokers.
# The result is a fraction between 0 and 1; multiply by 100 for a percentage.
n_total = tips.count()
n_smokers = tips.filter(tips.smoker == 'Yes').count()

n_smokers / n_total

0.38114754098360654