# Sales Data Transformation

In [1]:
import sys
sys.path.append("..")
# Spark libs
from pyspark.sql.session import SparkSession
from pyspark.ml.feature import OneHotEncoder, StringIndexer, QuantileDiscretizer, VectorAssembler , Normalizer, StandardScaler
# helpers
from helpers.data_prep_and_print import print_df
from helpers.path_translation import translate_to_file_string

### Select the Input File

In [2]:
# Relative path to the sales CSV that is loaded into a DataFrame further down.
# translate_to_file_string is a project helper; presumably it converts the
# relative path into a form Spark can read -- verify in helpers.path_translation.
inputFile = translate_to_file_string("../data/sales.csv")

### Spark Session Creation

In [3]:
# Configure the session builder with the application name, then obtain
# the session (an already-running one is reused by getOrCreate()).
session_builder = SparkSession.builder.appName("Sales Data Cleaning")
spark = session_builder.getOrCreate()

# Only surface errors in the driver log output.
spark.sparkContext.setLogLevel("ERROR")

### Create Dataframe from csv File

In [4]:
# Load the sales CSV: the first row holds the column names, column types are
# inferred from the data, and fields are comma-separated.
df = (spark.read
      .option("header", "true")
      .option("inferSchema", "true")
      .option("delimiter", ",")
      .csv(inputFile))

# printSchema() writes the schema to stdout itself and returns None, so it
# must not be wrapped in print() -- that would emit an extra "None" line.
df.printSchema()

root
 |-- division: string (nullable = true)
 |-- level of education: string (nullable = true)
 |-- training level: integer (nullable = true)
 |-- work experience: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- sales: integer (nullable = true)

None


### Encoding of Qualitative Attributes

In [None]:
# Map each distinct "division" string to a numeric index.
division_indexer = StringIndexer(
    inputCol="division",
    outputCol="division_num",
).fit(df)

indexed_df = division_indexer.transform(df)
indexed_df.show()

# Index "level of education" as well, adding the column to the frame that
# already carries division_num.
education_indexer = StringIndexer(
    inputCol="level of education",
    outputCol="education_num",
).fit(df)
indexed_df = education_indexer.transform(indexed_df)

# Turn the education index into a one-hot vector column.
edu_encoder = OneHotEncoder(
    inputCol="education_num",
    outputCol="education_onehot",
)
edu_encoder_model = edu_encoder.fit(indexed_df)
encoded_df = edu_encoder_model.transform(indexed_df)
encoded_df.show()

+-----------------+------------------+--------------+---------------+------+------+------------+
|         division|level of education|training level|work experience|salary| sales|division_num|
+-----------------+------------------+--------------+---------------+------+------+------------+
|computer software|      some college|             1|              5| 92766|283647|         4.0|
|         printers|       high school|             3|              7|101828|490163|         0.0|
|         printers|associate's degree|             0|             10|105433|396790|         0.0|
|         printers|      some college|             2|              6| 86490|404898|         0.0|
|computer hardware|       high school|             1|              7| 90531|385136|         3.0|
|         printers|      some college|             0|             12|110420|443568|         0.0|
|computer hardware| bachelor's degree|             2|              6|105311|384543|         3.0|
|      peripherals|       high

## Discretize sales

In [None]:
# Discretizer that splits the "sales" column into 10 quantile buckets and
# writes the bucket id to "sales_bucket_quantile".
discretizer = QuantileDiscretizer(
    inputCol="sales",
    outputCol="sales_bucket_quantile",
    numBuckets=10,
)
# TODO add the discretizer to the result

# Variant 1: fit and apply on the raw DataFrame.
raw_sales_bucketizer = discretizer.fit(df)
discretized_sales = raw_sales_bucketizer.transform(df)
discretized_sales.show()

# Variant 2: fit and apply on the already encoded DataFrame, so the bucket
# column sits next to the indexed / one-hot columns.
encoded_sales_bucketizer = discretizer.fit(encoded_df)
discretized_sales2 = encoded_sales_bucketizer.transform(encoded_df)
discretized_sales2.show()


+-----------------+------------------+--------------+---------------+------+------+---------------------+
|         division|level of education|training level|work experience|salary| sales|sales_bucket_quantile|
+-----------------+------------------+--------------+---------------+------+------+---------------------+
|computer software|      some college|             1|              5| 92766|283647|                  3.0|
|         printers|       high school|             3|              7|101828|490163|                  9.0|
|         printers|associate's degree|             0|             10|105433|396790|                  7.0|
|         printers|      some college|             2|              6| 86490|404898|                  7.0|
|computer hardware|       high school|             1|              7| 90531|385136|                  6.0|
|         printers|      some college|             0|             12|110420|443568|                  8.0|
|computer hardware| bachelor's degree|        

### Build labeled point semantic vector

In [None]:
# Columns that are combined into the feature vector.
# TODO add the new attributes
feature_cols = [
    "training level",
    "work experience",
    "salary",
]

# Assembler that merges the listed columns into a single "features" vector
# column; it is handed its own copy of the column list.
assembler = VectorAssembler(inputCols=[*feature_cols], outputCol="features")
# TODO Build the labeled point semantic for sales with all new attributes

### Normalization 

In [None]:
# TODO normalize the features


### Standardization

In [None]:
# TODO standardize the features


In [None]:
# Stop the SparkSession that was created near the top of this notebook.
spark.stop()