## Main input

In [1]:
import findspark
findspark.init("/opt/spark")
from pyspark.sql import * 
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [2]:
spark = SparkSession.builder \
    .appName("CleanData") \
    .master("local[2]") \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .getOrCreate()

In [3]:
#!wget https://github.com/erkansirin78/datasets/raw/master/dirty_store_transactions.csv

In [4]:
df = spark.read.option("header", True) \
    .option("inferSchema", True) \
    .csv("dirty_store.csv")

In [5]:
df.limit(5).toPandas()

Unnamed: 0,STORE_ID,STORE_LOCATION,PRODUCT_CATEGORY,PRODUCT_ID,MRP,CP,DISCOUNT,SP,Date
0,YR7220,New York(,Electronics,12254943,$31,$20.77,$1.86,$29.14,2019-11-26
1,YR7220,New York+,Furniture,72619323C,$15,$9.75,$1.5,$13.5,2019-11-26
2,YR7220,New York,Electronics,34161682B,$88,$62.48,$4.4,$83.6,2019-11-26
3,YR7220,New York!,Kitchen,79411621,$91,$58.24,$3.64,$87.36,2019-11-26
4,YR7220,New York,Fashion,39520263T,$85,$51,$2.55,$82.45,2019-11-26


In [6]:
df.printSchema()
print("Rows:", df.count(), "Columns:", len(df.columns))

root
 |-- STORE_ID: string (nullable = true)
 |-- STORE_LOCATION: string (nullable = true)
 |-- PRODUCT_CATEGORY: string (nullable = true)
 |-- PRODUCT_ID: string (nullable = true)
 |-- MRP: string (nullable = true)
 |-- CP: string (nullable = true)
 |-- DISCOUNT: string (nullable = true)
 |-- SP: string (nullable = true)
 |-- Date: date (nullable = true)

Rows: 37853 Columns: 9


## DATA CLEANING PySparkSQL

In [7]:
df.createOrReplaceTempView("table")

In [8]:
# I can't use "where mrp > 0" because mrp, cp, discount and sp's datatype is not numeric! Therefor sql can't return the table!
sqldf = spark.sql("""
SELECT
  TRIM(STORE_ID) AS STORE_ID,
  TRIM(REGEXP_REPLACE(STORE_LOCATION, '[^a-zA-Z0-9 ]', '')) AS STORE_LOCATION,
  TRIM(PRODUCT_CATEGORY) AS PRODUCT_CATEGORY,
  TRIM(PRODUCT_ID) AS PRODUCT_ID,
  CAST(TRIM(SUBSTRING(MRP, 2, 6)) AS INTEGER) AS MRP,
  ROUND(CAST(TRIM(SUBSTRING(CP, 2, 6)) AS FLOAT), 2) AS CP,
  CAST(TRIM(SUBSTRING(DISCOUNT, 2, 6)) AS FLOAT) AS DISCOUNT,
  ROUND(CAST(TRIM(SUBSTRING(SP, 2, 6)) AS FLOAT), 2) AS SP,
  CAST(TRIM(DATE) AS DATE) AS DATE
FROM table
""")

In [9]:
sqldf.show(5)

+--------+--------------+----------------+----------+---+-----+--------+-----+----------+
|STORE_ID|STORE_LOCATION|PRODUCT_CATEGORY|PRODUCT_ID|MRP|   CP|DISCOUNT|   SP|      DATE|
+--------+--------------+----------------+----------+---+-----+--------+-----+----------+
|  YR7220|      New York|     Electronics|  12254943| 31|20.77|    1.86|29.14|2019-11-26|
|  YR7220|      New York|       Furniture| 72619323C| 15| 9.75|     1.5| 13.5|2019-11-26|
|  YR7220|      New York|     Electronics| 34161682B| 88|62.48|     4.4| 83.6|2019-11-26|
|  YR7220|      New York|         Kitchen|  79411621| 91|58.24|    3.64|87.36|2019-11-26|
|  YR7220|      New York|         Fashion| 39520263T| 85| 51.0|    2.55|82.45|2019-11-26|
+--------+--------------+----------------+----------+---+-----+--------+-----+----------+
only showing top 5 rows



In [10]:
sqldf.printSchema()

root
 |-- STORE_ID: string (nullable = true)
 |-- STORE_LOCATION: string (nullable = true)
 |-- PRODUCT_CATEGORY: string (nullable = true)
 |-- PRODUCT_ID: string (nullable = true)
 |-- MRP: integer (nullable = true)
 |-- CP: float (nullable = true)
 |-- DISCOUNT: float (nullable = true)
 |-- SP: float (nullable = true)
 |-- DATE: date (nullable = true)



In [11]:
sqldf.createOrReplaceTempView("clean_table")

In [12]:
# Round method is not working on toPandas(), if you want to fix that, try to use show()
# Ps: I did not use " REGEXP_REPLACE(PRODUCT_ID, '[^0-9]', '') AS PRODUCT_LOCATION " in pyspark.sql
# because when you want to change a column in pyspark.sql, you must add every columns' name in SELECT statement!

print("Rows:", sqldf.count(), "Columns:", len(sqldf.columns))

sqldf_clean = spark.sql("""
SELECT 
    * 
FROM clean_table
WHERE
    (MRP > 0) OR (CP > 0) OR (DISCOUNT > 0) OR (SP > 0)
""")

sqldf_clean = sqldf.withColumn("PRODUCT_ID", regexp_replace(col("PRODUCT_ID"), "[^0-9]", "").cast(IntegerType()))

print("Rows:", sqldf_clean.count(), "Columns:", len(sqldf_clean.columns))

Rows: 37853 Columns: 9
Rows: 37853 Columns: 9


In [13]:
sqldf_clean.createOrReplaceTempView("clean_table_2")

spark.sql("""
SELECT * FROM clean_table_2
""").show(5)

+--------+--------------+----------------+----------+---+-----+--------+-----+----------+
|STORE_ID|STORE_LOCATION|PRODUCT_CATEGORY|PRODUCT_ID|MRP|   CP|DISCOUNT|   SP|      DATE|
+--------+--------------+----------------+----------+---+-----+--------+-----+----------+
|  YR7220|      New York|     Electronics|  12254943| 31|20.77|    1.86|29.14|2019-11-26|
|  YR7220|      New York|       Furniture|  72619323| 15| 9.75|     1.5| 13.5|2019-11-26|
|  YR7220|      New York|     Electronics|  34161682| 88|62.48|     4.4| 83.6|2019-11-26|
|  YR7220|      New York|         Kitchen|  79411621| 91|58.24|    3.64|87.36|2019-11-26|
|  YR7220|      New York|         Fashion|  39520263| 85| 51.0|    2.55|82.45|2019-11-26|
+--------+--------------+----------------+----------+---+-----+--------+-----+----------+
only showing top 5 rows



In [14]:
sqldf_clean.printSchema()

root
 |-- STORE_ID: string (nullable = true)
 |-- STORE_LOCATION: string (nullable = true)
 |-- PRODUCT_CATEGORY: string (nullable = true)
 |-- PRODUCT_ID: integer (nullable = true)
 |-- MRP: integer (nullable = true)
 |-- CP: float (nullable = true)
 |-- DISCOUNT: float (nullable = true)
 |-- SP: float (nullable = true)
 |-- DATE: date (nullable = true)



## DATA CLEANING PySpark DataFrame

In [15]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

#str abject has no attribute 'alias'
cleaned_df = df.select(
    trim(col("STORE_ID")).alias("STORE_ID"),
    trim(regexp_replace(col("STORE_LOCATION"), "[^a-zA-Z0-9 ]", "")).alias("STORE_LOCATION"),
    trim(col("PRODUCT_CATEGORY")).alias("PRODUCT_CATEGORY"),
    trim(regexp_replace(col("PRODUCT_ID"), "[^a-zA-Z0-9 ]", "")).alias("PRODUCT_ID"),
    trim(col("MRP")).substr(2, 6).cast(IntegerType()).alias("MRP"),
    round(substring(trim(col("CP")), 2, 6).cast(FloatType()), 2).alias("CP"),
    round(substring(trim(col("DISCOUNT")), 2, 6).cast(FloatType()), 2).alias("DISCOUNT"),
    round(substring(trim(col("SP")), 2, 6).cast(FloatType()), 2).alias("SP"),
    col("Date").cast(DateType()).alias("DATE")
)

cleaned_df.show(5)

+--------+--------------+----------------+----------+---+-----+--------+-----+----------+
|STORE_ID|STORE_LOCATION|PRODUCT_CATEGORY|PRODUCT_ID|MRP|   CP|DISCOUNT|   SP|      DATE|
+--------+--------------+----------------+----------+---+-----+--------+-----+----------+
|  YR7220|      New York|     Electronics|  12254943| 31|20.77|    1.86|29.14|2019-11-26|
|  YR7220|      New York|       Furniture| 72619323C| 15| 9.75|     1.5| 13.5|2019-11-26|
|  YR7220|      New York|     Electronics| 34161682B| 88|62.48|     4.4| 83.6|2019-11-26|
|  YR7220|      New York|         Kitchen|  79411621| 91|58.24|    3.64|87.36|2019-11-26|
|  YR7220|      New York|         Fashion| 39520263T| 85| 51.0|    2.55|82.45|2019-11-26|
+--------+--------------+----------------+----------+---+-----+--------+-----+----------+
only showing top 5 rows



In [16]:
print("Rows:", cleaned_df.count(), "Columns:", len(cleaned_df.columns))

conditions = (
    (col("MRP") > 0) &
    (col("CP") > 0) &
    (col("DISCOUNT") > 0) &
    (col("SP") > 0))
    
cleaned_df_2 = cleaned_df.filter(conditions)
cleaned_df_2 = cleaned_df.withColumn("PRODUCT_ID", regexp_replace(col("PRODUCT_ID"), "[^0-9]", "").cast(FloatType()))

print("Rows:", cleaned_df_2.count(), "Columns:", len(cleaned_df_2.columns))

Rows: 37853 Columns: 9
Rows: 37853 Columns: 9


In [17]:
# PS: The "withColumn" method is used to create a new DataFrame by adding a new column or replacing an existing 
# column with a modified version. 
# The "Select" method returns a new DataFrame that includes only the specified columns.
# Therefor we used "withColumn" for modification!

cleaned_df_2_select = cleaned_df.select("PRODUCT_ID", regexp_replace(col("PRODUCT_ID"), "[^0-9]", "").cast(FloatType()))
cleaned_df_2_select.show(5)

+----------+------------------------------------------------------+
|PRODUCT_ID|CAST(regexp_replace(PRODUCT_ID, [^0-9], , 1) AS FLOAT)|
+----------+------------------------------------------------------+
|  12254943|                                           1.2254943E7|
| 72619323C|                                            7.261932E7|
| 34161682B|                                            3.416168E7|
|  79411621|                                           7.9411624E7|
| 39520263T|                                           3.9520264E7|
+----------+------------------------------------------------------+
only showing top 5 rows



In [18]:
cleaned_df.show(5)

+--------+--------------+----------------+----------+---+-----+--------+-----+----------+
|STORE_ID|STORE_LOCATION|PRODUCT_CATEGORY|PRODUCT_ID|MRP|   CP|DISCOUNT|   SP|      DATE|
+--------+--------------+----------------+----------+---+-----+--------+-----+----------+
|  YR7220|      New York|     Electronics|  12254943| 31|20.77|    1.86|29.14|2019-11-26|
|  YR7220|      New York|       Furniture| 72619323C| 15| 9.75|     1.5| 13.5|2019-11-26|
|  YR7220|      New York|     Electronics| 34161682B| 88|62.48|     4.4| 83.6|2019-11-26|
|  YR7220|      New York|         Kitchen|  79411621| 91|58.24|    3.64|87.36|2019-11-26|
|  YR7220|      New York|         Fashion| 39520263T| 85| 51.0|    2.55|82.45|2019-11-26|
+--------+--------------+----------------+----------+---+-----+--------+-----+----------+
only showing top 5 rows



In [19]:
cleaned_df_2.printSchema()

root
 |-- STORE_ID: string (nullable = true)
 |-- STORE_LOCATION: string (nullable = true)
 |-- PRODUCT_CATEGORY: string (nullable = true)
 |-- PRODUCT_ID: float (nullable = true)
 |-- MRP: integer (nullable = true)
 |-- CP: float (nullable = true)
 |-- DISCOUNT: float (nullable = true)
 |-- SP: float (nullable = true)
 |-- DATE: date (nullable = true)



## Write parquet and orc

In [20]:
# Write to other format but becareful for "overwrite" because the method is clear in folder.

In [28]:
#sqldf_clean.write.mode("overwrite").orc("spark_output_format/orc")

In [29]:
#sqldf_clean.write.mode("overwrite").parquet("spark_output_format/parquet")

## Read parquet and orc

In [30]:
parquet_df = spark.read.orc("spark_output_format/orc")

parquet_df.show()

+--------+--------------+----------------+----------+---+-----+--------+-----+----------+
|STORE_ID|STORE_LOCATION|PRODUCT_CATEGORY|PRODUCT_ID|MRP|   CP|DISCOUNT|   SP|      DATE|
+--------+--------------+----------------+----------+---+-----+--------+-----+----------+
|  YR7220|      New York|     Electronics|  12254943| 31|20.77|    1.86|29.14|2019-11-26|
|  YR7220|      New York|       Furniture|  72619323| 15| 9.75|     1.5| 13.5|2019-11-26|
|  YR7220|      New York|     Electronics|  34161682| 88|62.48|     4.4| 83.6|2019-11-26|
|  YR7220|      New York|         Kitchen|  79411621| 91|58.24|    3.64|87.36|2019-11-26|
|  YR7220|      New York|         Fashion|  39520263| 85| 51.0|    2.55|82.45|2019-11-26|
|  YR7220|      New York|         Kitchen|  93809204| 37|24.05|    0.74|36.26|2019-11-26|
|  YR7220|      New York|       Cosmetics|  86610412| 80| 48.8|     6.4| 73.6|2019-11-26|
|  YR7220|      New York|         Kitchen|  52503356| 71| 42.6|    5.68|65.32|2019-11-26|
|  YR7220|

In [31]:
parquet_df = spark.read.parquet("spark_output_format/parquet")

parquet_df.show()

+--------+--------------+----------------+----------+---+-----+--------+-----+----------+
|STORE_ID|STORE_LOCATION|PRODUCT_CATEGORY|PRODUCT_ID|MRP|   CP|DISCOUNT|   SP|      DATE|
+--------+--------------+----------------+----------+---+-----+--------+-----+----------+
|  YR7220|      New York|     Electronics|  12254943| 31|20.77|    1.86|29.14|2019-11-26|
|  YR7220|      New York|       Furniture|  72619323| 15| 9.75|     1.5| 13.5|2019-11-26|
|  YR7220|      New York|     Electronics|  34161682| 88|62.48|     4.4| 83.6|2019-11-26|
|  YR7220|      New York|         Kitchen|  79411621| 91|58.24|    3.64|87.36|2019-11-26|
|  YR7220|      New York|         Fashion|  39520263| 85| 51.0|    2.55|82.45|2019-11-26|
|  YR7220|      New York|         Kitchen|  93809204| 37|24.05|    0.74|36.26|2019-11-26|
|  YR7220|      New York|       Cosmetics|  86610412| 80| 48.8|     6.4| 73.6|2019-11-26|
|  YR7220|      New York|         Kitchen|  52503356| 71| 42.6|    5.68|65.32|2019-11-26|
|  YR7220|