# Sales Data Cleaning 

In [1]:
import sys
sys.path.append("..")
# Spark libs
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import col, lower, mean
# helpers
from helpers.data_prep_and_print import print_df
from helpers.path_translation import translate_to_file_string

### Select the Input File

In [2]:
# Location of the raw sales CSV, given relative to this notebook.
# NOTE(review): translate_to_file_string is a project helper; presumably it
# turns the relative path into a file string Spark can read — confirm in
# helpers/path_translation.
inputFile = translate_to_file_string("../data/sales_for_data_cleaning.csv")

### Spark Session Creation

In [3]:
# Obtain the Spark session for this notebook: getOrCreate() returns the
# already running session if there is one, otherwise it starts a new one.
spark = SparkSession.builder.appName("Sales Data Cleaning").getOrCreate()

# Restrict Spark's console logging to errors so cell output stays readable.
spark.sparkContext.setLogLevel("ERROR")

### Create DataFrame from CSV File

In [4]:
# Load the semicolon-separated sales file. The first line holds the column
# names and the column types are inferred from the data.
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("delimiter", ";")
    .csv(inputFile)
)

# printSchema() writes the schema to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit an extra "None" line).
df.printSchema()

root
 |-- division: string (nullable = true)
 |-- level of education: string (nullable = true)
 |-- training level: integer (nullable = true)
 |-- work experience: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- sales: integer (nullable = true)

None


Convert the division column to lower case

In [None]:
# Salary totals per division before normalising the division names.
df.groupBy("division").sum("salary").show()

# Lower-case every division name so that spellings differing only in case
# end up in the same group.
df = df.withColumn("division", lower(col("division")))

# Salary totals per division after normalisation, for comparison.
df.groupBy("division").sum("salary").show()

+-----------------+-----------+
|         division|sum(salary)|
+-----------------+-----------+
|      peripherals|   23168175|
|computer hardware|   14462679|
|computer software|    9819655|
|         printers|   23487590|
|  office supplies|   19093546|
+-----------------+-----------+

+-----------------+-----------+
|         division|sum(salary)|
+-----------------+-----------+
|      peripherals|   23168175|
|computer hardware|   14462679|
|computer software|    9819655|
|         printers|   23487590|
|  office supplies|   19093546|
+-----------------+-----------+



Filter rows with missing values

In [56]:
# Display the rows that have a null in each of the inspected columns,
# one table per column. Add further column names to the tuple to inspect
# them as well (e.g. 'division', 'level of education', 'salary', 'sales').
for column_name in ('training level', 'work experience'):
    df.filter(col(column_name).isNull()).show()


+-----------------+------------------+--------------+---------------+------+------+
|         division|level of education|training level|work experience|salary| sales|
+-----------------+------------------+--------------+---------------+------+------+
|         printers|associate's degree|          NULL|              8| 83999|279254|
|         printers|associate's degree|          NULL|              9|107626|436135|
|computer hardware| bachelor's degree|          NULL|              5| 83525|242034|
|         printers|      some college|          NULL|              1| 60898|171167|
|computer hardware|associate's degree|          NULL|              3| 92336|359982|
|computer hardware|       high school|          NULL|              8| 85988|349882|
|      peripherals|      some college|          NULL|              5| 85489|276382|
|computer hardware|associate's degree|          NULL|              4| 92837|385342|
+-----------------+------------------+--------------+---------------+------+

In [33]:
# Keep only complete rows: a row is dropped as soon as any of its columns
# is null.
df_filtered = df.dropna(how="any")
df_filtered.show()

+-----------------+------------------+--------------+---------------+------+------+
|         division|level of education|training level|work experience|salary| sales|
+-----------------+------------------+--------------+---------------+------+------+
|      peripherals|      some college|             1|              6| 87067|299066|
|         printers|       high school|             3|              7| 98381|457597|
|  office supplies|associate's degree|             1|              7|101528|383462|
|  office supplies|associate's degree|             0|              9| 98431|374972|
|         printers|associate's degree|             0|              6| 82072|239893|
|computer hardware|associate's degree|             2|              8|104295|452952|
|         printers|      some college|             1|              8|101318|426995|
|         printers|associate's degree|             1|              8| 99759|382512|
|         printers|      some college|             0|              9| 86558|

Fill missing values with the column average

In [None]:
# Replace nulls in 'training level' and 'work experience' with the mean of
# the respective column. Both means come from a single aggregation pass, and
# int() truncates them to whole numbers before they are used as fill values.
mean_row = df.agg(mean('training level'), mean('work experience')).first()
mean_training_level = int(mean_row[0])
mean_work_experience = int(mean_row[1])

# Null values training level
print(mean_training_level)
df.fillna(mean_training_level, 'training level').show()

# Null values work experience
print(mean_work_experience)
df.fillna(mean_work_experience, 'work experience').show()

# All data in one new df. show() returns None, so the filled DataFrame is
# assigned first and displayed in a separate statement; otherwise df_avg
# would be None.
df_avg = df.fillna({
    'training level': mean_training_level,
    'work experience': mean_work_experience,
})
df_avg.show()


+-----------------+------------------+--------------+---------------+------+------+
|         division|level of education|training level|work experience|salary| sales|
+-----------------+------------------+--------------+---------------+------+------+
|      peripherals|      some college|             1|              6| 87067|299066|
|         printers|       high school|             3|              7| 98381|457597|
|  office supplies|associate's degree|             1|              7|101528|383462|
|  office supplies|associate's degree|             0|              9| 98431|374972|
|         printers|associate's degree|             0|              6| 82072|239893|
|computer hardware|associate's degree|             2|              8|104295|452952|
|         printers|      some college|             1|              8|101318|426995|
|         printers|associate's degree|             1|              8| 99759|382512|
|         printers|      some college|             0|              9| 86558|

In [None]:
# Shut down the Spark session now that the notebook is finished with it.
spark.stop()