In [0]:
from pyspark.sql.types import *
schema = StructType([
    StructField("fixed acidity", DoubleType(), False),
    StructField("volatile acidity", DoubleType(), False),
    StructField("citric acid", DoubleType(), False),
    StructField("residual sugar", DoubleType(), False),
    StructField("chlorides", DoubleType(), False),
    StructField("free sulfur dioxide", IntegerType(), False),
    StructField("total sulfur dioxide", IntegerType(), False),
    StructField("density", DoubleType(), False),
    StructField("pH", DoubleType(), False),
    StructField("suplhates", DoubleType(), False),
    StructField("alcohol", DoubleType(), False),
    StructField("quality", IntegerType(), False),
])


In [0]:
display(dbutils.fs.ls('/databricks-datasets'))


path,name,size,modificationTime
dbfs:/databricks-datasets/COVID/,COVID/,0,0
dbfs:/databricks-datasets/README.md,README.md,976,1532468253000
dbfs:/databricks-datasets/Rdatasets/,Rdatasets/,0,0
dbfs:/databricks-datasets/SPARK_README.md,SPARK_README.md,3359,1455043490000
dbfs:/databricks-datasets/adult/,adult/,0,0
dbfs:/databricks-datasets/airlines/,airlines/,0,0
dbfs:/databricks-datasets/amazon/,amazon/,0,0
dbfs:/databricks-datasets/asa/,asa/,0,0
dbfs:/databricks-datasets/atlas_higgs/,atlas_higgs/,0,0
dbfs:/databricks-datasets/bikeSharing/,bikeSharing/,0,0


In [0]:
df = spark.read.format('csv').options(sep=';', header=True).schema(schema).load("dbfs:/databricks-datasets/wine-quality/winequality-white.csv")
display(df.head(5))

fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,suplhates,alcohol,quality
7.0,0.27,0.36,20.7,0.045,45,170,1.001,3.0,0.45,8.8,6
6.3,0.3,0.34,1.6,0.049,14,132,0.994,3.3,0.49,9.5,6
8.1,0.28,0.4,6.9,0.05,30,97,0.9951,3.26,0.44,10.1,6
7.2,0.23,0.32,8.5,0.058,47,186,0.9956,3.19,0.4,9.9,6
7.2,0.23,0.32,8.5,0.058,47,186,0.9956,3.19,0.4,9.9,6


In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import DoubleType

df = df.withColumn("fixed acidity", when(col("fixed acidity") == 7, None).otherwise(col("fixed acidity")))
display(df.head(5))


fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,suplhates,alcohol,quality
,0.27,0.36,20.7,0.045,45,170,1.001,3.0,0.45,8.8,6
6.3,0.3,0.34,1.6,0.049,14,132,0.994,3.3,0.49,9.5,6
8.1,0.28,0.4,6.9,0.05,30,97,0.9951,3.26,0.44,10.1,6
7.2,0.23,0.32,8.5,0.058,47,186,0.9956,3.19,0.4,9.9,6
7.2,0.23,0.32,8.5,0.058,47,186,0.9956,3.19,0.4,9.9,6


In [0]:
df = df.na.fill({'fixed acidity': 7})
display(df.head(5))

fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,suplhates,alcohol,quality
7.0,0.27,0.36,20.7,0.045,45,170,1.001,3.0,0.45,8.8,6
6.3,0.3,0.34,1.6,0.049,14,132,0.994,3.3,0.49,9.5,6
8.1,0.28,0.4,6.9,0.05,30,97,0.9951,3.26,0.44,10.1,6
7.2,0.23,0.32,8.5,0.058,47,186,0.9956,3.19,0.4,9.9,6
7.2,0.23,0.32,8.5,0.058,47,186,0.9956,3.19,0.4,9.9,6


In [0]:
df_exploded = df.withColumn('volatile acidity', explode(split(col('volatile acidity'), '')))
display(df_exploded.head(20))

fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,suplhates,alcohol,quality
7.0,0,0.36,20.7,0.045,45,170,1.001,3.0,0.45,8.8,6
7.0,.,0.36,20.7,0.045,45,170,1.001,3.0,0.45,8.8,6
7.0,2,0.36,20.7,0.045,45,170,1.001,3.0,0.45,8.8,6
7.0,7,0.36,20.7,0.045,45,170,1.001,3.0,0.45,8.8,6
6.3,0,0.34,1.6,0.049,14,132,0.994,3.3,0.49,9.5,6
6.3,.,0.34,1.6,0.049,14,132,0.994,3.3,0.49,9.5,6
6.3,3,0.34,1.6,0.049,14,132,0.994,3.3,0.49,9.5,6
8.1,0,0.4,6.9,0.05,30,97,0.9951,3.26,0.44,10.1,6
8.1,.,0.4,6.9,0.05,30,97,0.9951,3.26,0.44,10.1,6
8.1,2,0.4,6.9,0.05,30,97,0.9951,3.26,0.44,10.1,6


In [0]:
dropped_df = df.drop('fixed acidity')
display(dropped_df.head(5))

volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,suplhates,alcohol,quality
0.27,0.36,20.7,0.045,45,170,1.001,3.0,0.45,8.8,6
0.3,0.34,1.6,0.049,14,132,0.994,3.3,0.49,9.5,6
0.28,0.4,6.9,0.05,30,97,0.9951,3.26,0.44,10.1,6
0.23,0.32,8.5,0.058,47,186,0.9956,3.19,0.4,9.9,6
0.23,0.32,8.5,0.058,47,186,0.9956,3.19,0.4,9.9,6


In [0]:
regex_df = df.withColumn('citric acid', regexp_replace('citric acid', r'^.*6$', '0.59'))
display(regex_df.head(5))

fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,suplhates,alcohol,quality
7.0,0.27,0.59,20.7,0.045,45,170,1.001,3.0,0.45,8.8,6
6.3,0.3,0.34,1.6,0.049,14,132,0.994,3.3,0.49,9.5,6
8.1,0.28,0.4,6.9,0.05,30,97,0.9951,3.26,0.44,10.1,6
7.2,0.23,0.32,8.5,0.058,47,186,0.9956,3.19,0.4,9.9,6
7.2,0.23,0.32,8.5,0.058,47,186,0.9956,3.19,0.4,9.9,6


In [0]:
extracted_df = df.withColumn('citric acid', regexp_extract('citric acid', r'^.*6$', 0))
display(extracted_df.head(5))

fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,suplhates,alcohol,quality
7.0,0.27,0.36,20.7,0.045,45,170,1.001,3.0,0.45,8.8,6
6.3,0.3,,1.6,0.049,14,132,0.994,3.3,0.49,9.5,6
8.1,0.28,,6.9,0.05,30,97,0.9951,3.26,0.44,10.1,6
7.2,0.23,,8.5,0.058,47,186,0.9956,3.19,0.4,9.9,6
7.2,0.23,,8.5,0.058,47,186,0.9956,3.19,0.4,9.9,6


In [0]:
ifnull_df = df.withColumn('alcohol', ifnull(df.alcohol, lit(5)))
display(ifnull_df.head(5))

fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,suplhates,alcohol,quality
7.0,0.27,0.36,20.7,0.045,45,170,1.001,3.0,0.45,8.8,6
6.3,0.3,0.34,1.6,0.049,14,132,0.994,3.3,0.49,9.5,6
8.1,0.28,0.4,6.9,0.05,30,97,0.9951,3.26,0.44,10.1,6
7.2,0.23,0.32,8.5,0.058,47,186,0.9956,3.19,0.4,9.9,6
7.2,0.23,0.32,8.5,0.058,47,186,0.9956,3.19,0.4,9.9,6


In [0]:
nullif_df = df.withColumn('nullif', nullif(col('volatile acidity'), col('citric acid')))
display(nullif_df.head(5))

fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,suplhates,alcohol,quality,nullif
7.0,0.27,0.36,20.7,0.045,45,170,1.001,3.0,0.45,8.8,6,0.27
6.3,0.3,0.34,1.6,0.049,14,132,0.994,3.3,0.49,9.5,6,0.3
8.1,0.28,0.4,6.9,0.05,30,97,0.9951,3.26,0.44,10.1,6,0.28
7.2,0.23,0.32,8.5,0.058,47,186,0.9956,3.19,0.4,9.9,6,0.23
7.2,0.23,0.32,8.5,0.058,47,186,0.9956,3.19,0.4,9.9,6,0.23


In [0]:
replace_df = df.withColumn('fixed acidity', replace(col('fixed acidity'), lit(7), lit(700)))
display(replace_df.head(5))

fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,suplhates,alcohol,quality
700.0,0.27,0.36,20.7,0.045,45,170,1.001,3.0,0.45,8.8,6
6.3,0.3,0.34,1.6,0.049,14,132,0.994,3.3,0.49,9.5,6
8.1,0.28,0.4,6.9,0.05,30,97,0.9951,3.26,0.44,10.1,6
700.2,0.23,0.32,8.5,0.058,47,186,0.9956,3.19,0.4,9.9,6
700.2,0.23,0.32,8.5,0.058,47,186,0.9956,3.19,0.4,9.9,6


In [0]:
temp_df = df.withColumn('list', split('citric acid', ''))
array_df = temp_df.withColumn('array contains', array_contains('list', '3'))
display(array_df.head(5))

fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,suplhates,alcohol,quality,list,array contains
7.0,0.27,0.36,20.7,0.045,45,170,1.001,3.0,0.45,8.8,6,"List(0, ., 3, 6)",True
6.3,0.3,0.34,1.6,0.049,14,132,0.994,3.3,0.49,9.5,6,"List(0, ., 3, 4)",True
8.1,0.28,0.4,6.9,0.05,30,97,0.9951,3.26,0.44,10.1,6,"List(0, ., 4)",False
7.2,0.23,0.32,8.5,0.058,47,186,0.9956,3.19,0.4,9.9,6,"List(0, ., 3, 2)",True
7.2,0.23,0.32,8.5,0.058,47,186,0.9956,3.19,0.4,9.9,6,"List(0, ., 3, 2)",True


In [0]:
avg_alcohol = df.select(avg('alcohol')).collect()
display(avg_alcohol)

avg(alcohol)
10.514267047774638


In [0]:
max_alcohol = df.select(max('alcohol')).collect()
display(max_alcohol)

max(alcohol)
14.2


In [0]:
min_alcohol = df.select(min('alcohol')).collect()
display(min_alcohol)

min(alcohol)
8.0


In [0]:
def multiply_by_ten(number):
    return number*10

multiply_udf = df.withColumn('alcohol * 10', multiply_by_ten(col('alcohol')))
display(multiply_udf.head(5))

fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,suplhates,alcohol,quality,alcohol * 10
7.0,0.27,0.36,20.7,0.045,45,170,1.001,3.0,0.45,8.8,6,88.0
6.3,0.3,0.34,1.6,0.049,14,132,0.994,3.3,0.49,9.5,6,95.0
8.1,0.28,0.4,6.9,0.05,30,97,0.9951,3.26,0.44,10.1,6,101.0
7.2,0.23,0.32,8.5,0.058,47,186,0.9956,3.19,0.4,9.9,6,99.0
7.2,0.23,0.32,8.5,0.058,47,186,0.9956,3.19,0.4,9.9,6,99.0


In [0]:
@udf(StringType())
def add_percent(value):
    return str(value) + '%'

string_udf = df.withColumn('string udf', add_percent(col('alcohol')))
display(string_udf.head(5))

fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,suplhates,alcohol,quality,string udf
7.0,0.27,0.36,20.7,0.045,45,170,1.001,3.0,0.45,8.8,6,8.8%
6.3,0.3,0.34,1.6,0.049,14,132,0.994,3.3,0.49,9.5,6,9.5%
8.1,0.28,0.4,6.9,0.05,30,97,0.9951,3.26,0.44,10.1,6,10.1%
7.2,0.23,0.32,8.5,0.058,47,186,0.9956,3.19,0.4,9.9,6,9.9%
7.2,0.23,0.32,8.5,0.058,47,186,0.9956,3.19,0.4,9.9,6,9.9%
