Copyright (c) Microsoft Corporation. 
Licensed under the MIT license. 
# Clean Data

We are using an open source eCommerce store dataset from Kaggle: [eCommerce behavior data from multi category store](https://www.kaggle.com/mkechinov/ecommerce-behavior-data-from-multi-category-store). The first step is to clean the source dataset into a version we can work with:

1. Remove rows that are missing brand or category values
2. Filter to only keep brands and categories that are accurately mapped
3. Write the results to the data lake

## Library Imports

In [None]:
# python library imports
import pyspark
from pyspark.sql import functions as F
from pyspark.sql.functions import *
from pyspark.sql import *
from pyspark.sql.types import *
import pandas as pd
import numpy as np

## Read in Data from Azure Data Lake


In [None]:
# Azure Data Lake connection settings used to build every abfss:// path below.
# Both are left blank here and must be filled in before running the notebook.
data_lake_account_name = ""  # storage account name
file_system_name = ""  # file system (container) name

In [None]:
# Read every file under raw_data/ in the data lake as CSV, using the first
# row of each file as the column names.
paths = [f'abfss://{file_system_name}@{data_lake_account_name}.dfs.core.windows.net/raw_data/*']
# Pass the list itself rather than unpacking it: with `*paths`, any second
# entry would be bound to the reader's `schema` parameter instead of being
# read as an additional input path.
df = spark.read.csv(paths, header=True)

In [None]:
# show retail dataframe
#display(df.take(5))

In [None]:
# show missing values
#display(df.select([count(when(df[c].isNull(), c)).alias(c) for c in df.columns]))

In [None]:
# Drop rows that have no category, brand, or user session.
# Comparing with != 'null' removes rows holding the literal string 'null' and
# also rows holding a real NULL, because a comparison against NULL evaluates
# to NULL and filter() only keeps rows where the predicate is true.
# The session check previously compared `brand` to the string 'user_session',
# which never removed rows lacking a session; it now tests the user_session
# column (assumes the raw data has a `user_session` column - confirm against
# the source schema).
df = df.filter(
    (df.category_code != 'null')
    & (df.brand != 'null')
    & (df.user_session != 'null')
)

In [None]:
# Split category_code on '.' into its first two components: `category`
# (element 0) and `subcategory` (element 1).
# The pattern is a regular expression, so the dot is escaped; a raw string
# avoids Python's invalid-escape warning for '\.'.
category_parts = split(col('category_code'), r'\.')
# Parentheses replace backslash continuations so the statement no longer ends
# with a dangling line-continuation character.
df = (
    df.withColumn('category', category_parts.getItem(0))
      .withColumn('subcategory', category_parts.getItem(1))
)

In [None]:
# Location of the intermediate Delta table inside the data lake.
filtered_df_path = f"abfss://{file_system_name}@{data_lake_account_name}.dfs.core.windows.net/intermediate_tables/filtered_df"

# Persist the filtered dataframe, replacing any existing table and its schema.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(filtered_df_path)
)

# Reload the dataframe from the table that was just written.
df = spark.read.format("delta").load(filtered_df_path)

## Data Cleaning
The retail dataset has some messy data where the brands don't align with the categories. Below is code to analyze the dataset to filter it down to the categories, subcategories, and brands that look the best.

In [None]:
# top categories - construction, electronics, appliances, apparel, computers, sport, furniture, kids
# display(df.groupBy('category').count().orderBy(desc('count')))

# construction and furniture don't look right

# display(df.filter((df.category_code.contains('construction'))).groupBy('brand').count().orderBy(desc('count')))
# display(df.filter((df.category_code.contains('furniture'))).groupBy('brand').count().orderBy(desc('count')))

# electronics looks much better
# display(df.filter((df.category_code.contains('electronics'))).groupBy('brand').count().orderBy(desc('count')))

# top electronics subcategories
# display(df.filter(df['category'] == 'electronics').groupBy('subcategory').count().orderBy(desc('count')))

# top smartphone brands
# display(df.filter(df['subcategory'] == 'smartphone').groupBy('brand').count().orderBy(desc('count')))

# top audio brands
# display(df.filter(df['subcategory'] == 'audio').groupBy('brand').count().orderBy(desc('count')))

# top clock brands
# display(df.filter(df['subcategory'] == 'clocks').groupBy('brand').count().orderBy(desc('count')))

# top video brands - there are none
# display(df.filter(df['subcategory'] == 'videos').groupBy('brand').count().orderBy(desc('count')))

# top camera brands - a lot do not make sense
# display(df.filter(df['subcategory'] == 'camera').groupBy('brand').count().orderBy(desc('count')))

# top tablet brands
# display(df.filter(df['subcategory'] == 'tablet').groupBy('brand').count().orderBy(desc('count')))

# top telephone brands
# display(df.filter(df['subcategory'] == 'telephone').groupBy('brand').count().orderBy(desc('count')))

In [None]:
# Keep only electronics rows whose (subcategory, brand) pair appears in one of
# the hand-picked brand lists below.
smartphone_brands = [
    'samsung', 'apple', 'xiaomi', 'huawei', 'oppo', 'meizu', 'nokia', 'honor',
    'sony', 'oneplus', 'lg',
]
audio_brands = [
    'lenovo', 'acer', 'apple', 'asus', 'hp', 'xiaomi', 'jbl', 'dell',
    'pioneer', 'samsung', 'kicx', 'yamaha', 'sony', 'pride', 'alphard',
    'element', 'bosch', 'stagg', 'alpine', 'adagio', 'huawei', 'hertz',
    'elari', 'alteco', 'msi', 'edge', 'crown', 'fender', 'kenwood',
    'conceptclub', 'harper', 'valkiria', 'cortland', 'phantom', 'makita',
]
clock_brands = [
    'casio', 'apple', 'samsung', 'xiaomi', 'garmin', 'amazfit', 'orient',
    'tissot', 'huawei', 'wonlex', 'aimoto', 'armani', 'boccia', 'elari',
    'fossil', 'canyon',
]
tablet_brands = [
    'samsung', 'apple', 'lenovo', 'huawei', 'prestigio', 'acer', 'xiaomi',
    'wacom', 'huion', 'microsoft',
]
telephone_brands = [
    'nokia', 'texet', 'panasonic', 'maxvi', 'lorelli', 'philips', 'prestigio',
]

# One predicate per subcategory: the row is in that subcategory AND its brand
# is in the matching list.
is_electronics = df['category'] == 'electronics'
smartphone_match = (df['subcategory'] == 'smartphone') & df['brand'].isin(smartphone_brands)
audio_match = (df['subcategory'] == 'audio') & df['brand'].isin(audio_brands)
clock_match = (df['subcategory'] == 'clocks') & df['brand'].isin(clock_brands)
tablet_match = (df['subcategory'] == 'tablet') & df['brand'].isin(tablet_brands)
telephone_match = (df['subcategory'] == 'telephone') & df['brand'].isin(telephone_brands)

# A row is kept when it is electronics and satisfies any one of the predicates.
df = df.filter(
    is_electronics
    & (smartphone_match | audio_match | clock_match | tablet_match | telephone_match)
)

## Save Cleaned Data to a Delta Table


In [None]:
# Destination of the cleaned electronics data inside the data lake.
cleaned_data_path = f"abfss://{file_system_name}@{data_lake_account_name}.dfs.core.windows.net/transformed_data/cleaned_data_electronics"

# Write the cleaned dataframe as a Delta table, replacing any existing table
# and its schema.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(cleaned_data_path)
)