In [1]:
from pyspark.sql import SparkSession
# Create (or reuse) the Spark session that the rest of the notebook reads from
spark = (
    SparkSession.builder
    .appName('Chocolate Sales')
    .getOrCreate()
)

In [None]:
# Load the CSV: the first line is the header and column types are inferred
df_spark = (
    spark.read
    .option("header", "true")
    .csv("Chocolate Sales.csv", inferSchema=True)
)

# Equivalent read with every option passed as a keyword argument; preview the rows
(
    spark.read
    .csv("Chocolate Sales.csv", header=True, inferSchema=True)
    .show()
)

## 1. Introduction

In [None]:
# Print the schema as a tree: one line per column with its name, type and nullability
df_spark.printSchema()

In [None]:
# Get all the column names as a plain Python list of strings
df_spark.columns

In [None]:
# Fetch the first 10 rows to the driver as a list of Row objects
# (this is not a formatted table; use .show(10) for that)
df_spark.head(10)

In [None]:
# Select only the Product column; show() prints the first 20 rows by default
df_spark.select('Product').show()

In [None]:
# Project several columns at once by passing their names as separate arguments
df_spark.select('Product', 'Country', 'Date').show()

In [None]:
# Check the column types: a list of (column name, type string) pairs
df_spark.dtypes

In [None]:
# Summary statistics per column (count, mean, stddev, min, max), printed as a table
df_spark.describe().show()

In [14]:
# Append 'New Column', holding twice the value of 'Boxes Shipped'
df_spark = df_spark.withColumn(
    'New Column',
    df_spark['Boxes Shipped'] * 2,
)

In [None]:
# Print the first 20 rows to inspect the current columns of df_spark
df_spark.show()

In [16]:
# Drop the 'New Column' column and rebind df_spark to the narrower DataFrame
# (drop() is a no-op when the named column does not exist)
df_spark = df_spark.drop('New Column')

In [None]:
# Print the first 20 rows to inspect the current columns of df_spark
df_spark.show()

In [None]:
# Rename a column: 'Sales Person' -> 'Salesperson'
# withColumnRenamed returns a new DataFrame; the result is only shown here, not
# assigned, so df_spark itself still has the original column name afterwards.
df_spark.withColumnRenamed('Sales Person', 'Salesperson').show()

## 2. Operations

In [None]:
# Drop the missing values
# na.drop() always removes rows, never columns. Each call returns a new DataFrame;
# none of the results below is assigned, so df_spark is left unchanged.

# how parameter
df_spark.na.drop(how = 'any') # drop a row if it contains any null value - default
df_spark.na.drop(how = 'all') # drop a row only if all of its values are null

# threshold parameter
df_spark.na.drop(thresh = 2) # keep only rows that have at least 2 non-null values (overrides how)

# subset parameter
df_spark.na.drop(subset = ['Product']) # only look at the listed columns when checking a row for nulls

In [None]:
# Fill the missing values
# A string fill value is only applied to string columns; nulls in non-string
# columns are left as they are.
df_spark.na.fill('Missing Value') # fill nulls in every string column (result is not assigned)
df_spark.na.fill('Missing Value', subset = ['Product']).show() # fill nulls only in the listed columns

In [None]:
# Fill the missing values with the mean value
from pyspark.ml.feature import Imputer
from pyspark.sql.functions import collect_list, avg

# Cast 'Boxes Shipped' to double before handing it to the Imputer
df_spark = df_spark.withColumn(
    "Boxes Shipped",
    df_spark["Boxes Shipped"].cast("double"),
)

# Mean imputation: nulls in 'Boxes Shipped' are replaced by the column mean and
# the result is written to the separate 'Boxes Shipped_imputed' column
imputer = Imputer(
    strategy="mean",
    inputCols=['Boxes Shipped'],
    outputCols=["Boxes Shipped_imputed"],
)

# Fit on the data, then show the transformed frame (not assigned back to df_spark)
imputer.fit(df_spark).transform(df_spark).show()


## 3. Filter