# Sales Data Binning

In [1]:
import sys
sys.path.append("..")
# Spark libs
from pyspark.sql import functions as F
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import col, lower, max, min
from pyspark.ml.feature import Bucketizer, QuantileDiscretizer
# helpers
from helpers.data_prep_and_print import print_df
from helpers.path_translation import translate_to_file_string

### Select the Input File

In [2]:
# Location of the sales CSV, given relative to this notebook.
# NOTE(review): translate_to_file_string is a project helper; presumably it
# turns the relative path into a form the Spark reader accepts — confirm.
inputFile = translate_to_file_string(
    "../data/sales_for_data_cleaning.csv"
)

### Spark Session Creation

In [3]:
# Reuse an already-running session if one exists, otherwise start a new one
# under this notebook's application name.
spark = SparkSession.builder.appName("Sales Data Binning").getOrCreate()

# Keep the cell outputs readable: only log messages at ERROR level are shown.
spark.sparkContext.setLogLevel("ERROR")

### Create DataFrame from CSV File

In [4]:
# Load the semicolon-delimited CSV: the first row supplies the column names
# and Spark infers each column's type from the data.
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("delimiter", ";")
    .csv(inputFile)
)

# printSchema() writes the schema to stdout itself and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
df.printSchema()

root
 |-- division: string (nullable = true)
 |-- level of education: string (nullable = true)
 |-- training level: integer (nullable = true)
 |-- work experience: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- sales: integer (nullable = true)

None


### Binning of Salary

In [None]:
# bins with bucketizer (fixed, hand-picked salary bands)

# Summary statistics of the salary column, shown first so the hand-picked
# split points below can be checked against the observed value range.
(df
    .agg(
        F.avg(F.col('salary')).alias('avg_salary'),
        F.min(F.col('salary')).alias('min_salary'),
        F.max(F.col('salary')).alias('max_salary'),
    )
    .show()
)

# bins: 0 - 50,000 ; 50,000 - 100,000 ; 100,000 - 150,000
# Per the Bucketizer contract each bin is [lower, upper), except the last one,
# which also includes its upper bound. Salaries outside 0..150,000 raise an
# error; handleInvalid='skip' only drops rows whose salary is invalid (NaN).
bucketizer = Bucketizer(
    splits=[0, 50000, 100000, 150000],
    inputCol='salary',
    outputCol='bins',
    handleInvalid='skip',
)

# Append the bin index (0.0, 1.0, 2.0) as the new column 'bins'.
df_with_bins = bucketizer.transform(df)
df_with_bins.show()

+----------+----------+----------+
|avg_salary|min_salary|max_salary|
+----------+----------+----------+
| 90031.645|     34883|    146407|
+----------+----------+----------+

+-----------------+------------------+--------------+---------------+------+------+----+
|         division|level of education|training level|work experience|salary| sales|bins|
+-----------------+------------------+--------------+---------------+------+------+----+
|      peripherals|      some college|             1|              6| 87067|299066| 1.0|
|         printers|       high school|             3|              7| 98381|457597| 1.0|
|  office supplies|associate's degree|             1|              7|101528|383462| 2.0|
|  office supplies|associate's degree|             0|              9| 98431|374972| 1.0|
|         printers|associate's degree|             0|              6| 82072|239893| 1.0|
|computer hardware|associate's degree|             2|              8|104295|452952| 2.0|
|         printers|    

In [None]:
# bins with quantile discretizer
# Unlike the fixed splits used for the Bucketizer, the split points here are
# learned from the data in fit(): three buckets bounded by (approximate)
# quantiles of the salary column.
qd = QuantileDiscretizer(
    numBuckets=3,
    inputCol='salary',
    outputCol='bins',
)

# fit() derives the split points from df; transform() adds the 'bins' column.
df_with_bins2 = qd.fit(df).transform(df)
df_with_bins2.show()

+-----------------+------------------+--------------+---------------+------+------+----+
|         division|level of education|training level|work experience|salary| sales|bins|
+-----------------+------------------+--------------+---------------+------+------+----+
|      peripherals|      some college|             1|              6| 87067|299066| 1.0|
|         printers|       high school|             3|              7| 98381|457597| 2.0|
|  office supplies|associate's degree|             1|              7|101528|383462| 2.0|
|  office supplies|associate's degree|             0|              9| 98431|374972| 2.0|
|         printers|associate's degree|             0|              6| 82072|239893| 1.0|
|computer hardware|associate's degree|             2|              8|104295|452952| 2.0|
|         printers|      some college|             1|              8|101318|426995| 2.0|
|         printers|associate's degree|             1|              8| 99759|382512| 2.0|
|         printers|  

In [None]:
# Shut down the Spark session created at the top of the notebook; cells that
# use `spark` or `df` cannot be re-run afterwards without recreating it.
spark.stop()