Copyright (c) Microsoft Corporation. 
Licensed under the MIT license. 
# Feature Engineering

Analyze the transformed data and select the features that will be used in the model using the following steps in this notebook:

1. Exploratory data analysis
2. Remove outliers
3. Correlation analysis
4. Feature selection
5. Save results to data lake

## Library Imports


In [None]:
import pyspark
from pyspark.sql import functions as F
from pyspark.sql.functions import *
from pyspark.sql import DataFrame
from pyspark.sql.types import *
from pyspark.ml.feature import *
from pyspark.ml.stat import Correlation
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Disable the Arrow setting for this Spark session before any Spark <-> pandas conversion is done below.
# NOTE(review): newer Spark releases appear to name this key
# 'spark.sql.execution.arrow.pyspark.enabled' — confirm against the runtime's Spark version.
spark.conf.set('spark.sql.execution.arrow.enabled', False)

## Read in Transformed Data from Delta Lake Table


In [None]:
# Data lake connection settings. Both values are interpolated into the abfss:// URIs
# used to read the input table and write the results, so fill them in before running.
data_lake_account_name = ''  # storage account: the host prefix in <account>.dfs.core.windows.net
file_system_name = ''  # file system: the part before '@' in the abfss:// URI

In [None]:
# Load the transformed dataset from its Delta table on the data lake.
transformed_data_path = (
    f"abfss://{file_system_name}@{data_lake_account_name}.dfs.core.windows.net"
    "/transformed_data/transformed_data"
)
df = spark.read.format("delta").load(transformed_data_path)

In [None]:
# display transformed data
#display(df)

In [None]:
# Report the shape of the transformed dataset (column total first, then row total).
column_total = len(df.columns)
print('Columns:', column_total)
row_total = df.count()
print('Rows:', row_total)

## Exploratory Data Analysis


### Distribution of Count vs. Binary Features

In [None]:
# How often each distinct apple-brand view count occurs, most frequent value first.
apple_view_count_freq = df.groupBy('brand_apple_viewed_count').count()
display(apple_view_count_freq.orderBy(F.desc('count')))

In [None]:
# How often each value of the apple-brand viewed flag occurs, most frequent value first.
apple_view_binary_freq = df.groupBy('brand_apple_viewed_binary').count()
display(apple_view_binary_freq.orderBy(F.desc('count')))

In [None]:
# How often each distinct smartphone-subcategory view count occurs, most frequent value first.
smartphone_view_count_freq = df.groupBy('subcategory_smartphone_viewed_count').count()
display(smartphone_view_count_freq.orderBy(F.desc('count')))

In [None]:
# How often each value of the smartphone-subcategory viewed flag occurs, most frequent value first.
smartphone_view_binary_freq = df.groupBy('subcategory_smartphone_viewed_binary').count()
display(smartphone_view_binary_freq.orderBy(F.desc('count')))

### Keep Binary Features

Because the features that count views, add-to-cart events, and purchases are heavily right-skewed, it makes sense to either bin the upper values or use the binary versions of these features instead. Because the binary features have balanced classes, we are going to use the binary features.

In [None]:
# only keep binary columns (the *_count columns are dropped by not being selected)

# identifier / period columns, 'growth', and the continuous per-user metrics
base_columns = [
    'user_id', 'year', 'month', 'growth',
    'sessions_per_user_per_month',
    'avg_session_duration_per_user_per_month',
    'avg_conversion_rate_per_user_per_month',
    'avg_order_value_per_user_per_month',
    'avg_cart_abandon_rate',
]

# brand flags: viewed, then added, then purchased
brand_binary_columns = [
    'brand_apple_viewed_binary', 'brand_samsung_viewed_binary', 'brand_xiaomi_viewed_binary',
    'brand_huawei_viewed_binary', 'brand_lenovo_viewed_binary',
    'brand_apple_added_binary', 'brand_samsung_added_binary', 'brand_xiaomi_added_binary',
    'brand_huawei_added_binary', 'brand_acer_added_binary',
    'brand_apple_purchased_binary', 'brand_samsung_purchased_binary', 'brand_xiaomi_purchased_binary',
    'brand_huawei_purchased_binary', 'brand_acer_purchased_binary',
]

# subcategory flags: viewed, then added, then purchased
subcategory_binary_columns = [
    'subcategory_smartphone_viewed_binary', 'subcategory_audio_viewed_binary',
    'subcategory_clocks_viewed_binary', 'subcategory_tablet_viewed_binary',
    'subcategory_telephone_viewed_binary',
    'subcategory_smartphone_added_binary', 'subcategory_audio_added_binary',
    'subcategory_clocks_added_binary', 'subcategory_tablet_added_binary',
    'subcategory_telephone_added_binary',
    'subcategory_smartphone_purchased_binary', 'subcategory_audio_purchased_binary',
    'subcategory_clocks_purchased_binary', 'subcategory_tablet_purchased_binary',
    'subcategory_telephone_purchased_binary',
]

# product flags: viewed, then added, then purchased
product_binary_columns = [
    'product_id_1004856_viewed_binary', 'product_id_1005115_viewed_binary',
    'product_id_1004767_viewed_binary', 'product_id_4804056_viewed_binary',
    'product_id_1005105_viewed_binary',
    'product_id_1004856_added_binary', 'product_id_1004767_added_binary',
    'product_id_1005115_added_binary', 'product_id_4804056_added_binary',
    'product_id_1004833_added_binary',
    'product_id_1004856_purchased_binary', 'product_id_1004767_purchased_binary',
    'product_id_1005115_purchased_binary', 'product_id_4804056_purchased_binary',
    'product_id_1004833_purchased_binary',
]

df = df.select(
    *base_columns,
    *brand_binary_columns,
    *subcategory_binary_columns,
    *product_binary_columns,
)

## Distribution of Continuous Variables


### Convert to Pandas Dataframe for Visualization


In [None]:
# convert to pandas dataframe to use for visualizations
# NOTE: toPandas() collects the full Spark DataFrame into driver memory, so this
# cell is only safe when the selected data fits on the driver.
pdf = df.toPandas()

### Remove Outliers


In [None]:
# Histogram of sessions per user per month before any outlier removal.
# `pdf` is the pandas copy of `df` produced by the conversion cell directly above;
# it is deliberately not rebuilt here, because a second toPandas() call would
# collect the entire dataset onto the driver again for no change in the result.
sessions_hist = sns.distplot(pdf['sessions_per_user_per_month'], kde = False)
sessions_hist.set_title('Distribution of Sessions per User per Month')
sessions_hist.set_xlabel('# of Sessions') 
sessions_hist.set_ylabel('Frequency')
plt.show()

In [None]:
# Keep only rows with at most 60 sessions in the month, then re-plot the distribution.
sessions_within_limit = pdf['sessions_per_user_per_month'] <= 60
pdf_filtered = pdf[sessions_within_limit]

sessions_hist = sns.distplot(pdf_filtered['sessions_per_user_per_month'], kde=False)
sessions_hist.set(
    title='Distribution of Sessions Per User Per Month Cleaned',
    xlabel='# of Sessions',
    ylabel='Frequency',
)
plt.show()

In [None]:
# Histogram of average session duration per user per month, before the duration
# outlier filter is applied (rows already passed the sessions-per-month filter).
session_duration_hist = sns.distplot(pdf_filtered['avg_session_duration_per_user_per_month'], kde = False)
# Title corrected: this plot shows session duration, not the number of sessions.
session_duration_hist.set_title('Distribution of Avg Session Duration per User per Month')
session_duration_hist.set_xlabel('Session Duration') 
session_duration_hist.set_ylabel('Frequency')
plt.show()

In [None]:
# remove outliers from session duration per user per month
# (rows with an average session duration above 4000 are dropped)
pdf_filtered = pdf_filtered[pdf_filtered['avg_session_duration_per_user_per_month'] <= 4000]
session_duration_hist = sns.distplot(pdf_filtered['avg_session_duration_per_user_per_month'], kde = False)
# Title corrected: this plot shows session duration, not the number of sessions.
session_duration_hist.set_title('Distribution of Avg Session Duration Per User Per Month Cleaned')
session_duration_hist.set_xlabel('Session Duration') 
session_duration_hist.set_ylabel('Frequency')
plt.show()

In [None]:
# Histogram of the average conversion rate before invalid values are removed.
conversion_rate_values = pdf_filtered['avg_conversion_rate_per_user_per_month']
conversion_rate_hist = sns.distplot(conversion_rate_values, kde=False)
conversion_rate_hist.set(
    title='Distribution of Avg Conversion Rate per Month',
    xlabel='Avg Conversion Rate',
    ylabel='Frequency',
)
plt.show()

In [None]:
# remove invalid values for avg conversion rate
# A rate is only valid inside [0, 1]. Both bounds are enforced so this filter matches
# the cart abandon rate filter later in the notebook (previously only `<= 1` was checked,
# which would have let negative rates through). Rows with a missing rate compare as
# False and are dropped, as before.
pdf_filtered = pdf_filtered[(pdf_filtered['avg_conversion_rate_per_user_per_month'] <= 1) & (pdf_filtered['avg_conversion_rate_per_user_per_month'] >= 0)]
conversion_rate_hist = sns.distplot(pdf_filtered['avg_conversion_rate_per_user_per_month'], kde = False)
conversion_rate_hist.set_title('Distribution of Avg Conversion Rate Per User Per Month Cleaned')
conversion_rate_hist.set_xlabel('Avg Conversion Rate') 
conversion_rate_hist.set_ylabel('Frequency')
plt.show()

In [None]:
# Histogram of the average order value before outlier removal.
order_values = pdf_filtered['avg_order_value_per_user_per_month']
order_value_hist = sns.distplot(order_values, kde=False)
order_value_hist.set(
    title='Distribution of Avg Order Value per Month',
    xlabel='Avg Order Value',
    ylabel='Frequency',
)
plt.show()

In [None]:
# Keep only rows whose average order value is at most 2000, then re-plot.
order_value_within_limit = pdf_filtered['avg_order_value_per_user_per_month'] <= 2000
pdf_filtered = pdf_filtered[order_value_within_limit]

order_value_hist = sns.distplot(pdf_filtered['avg_order_value_per_user_per_month'], kde=False)
order_value_hist.set(
    title='Distribution of Avg Order Value Per User Per Month Cleaned',
    xlabel='Avg Order Value',
    ylabel='Frequency',
)
plt.show()

In [None]:
# Histogram of the average cart abandon rate before invalid values are removed.
cart_abandon_rates = pdf_filtered['avg_cart_abandon_rate']
cart_abandon_rate_hist = sns.distplot(cart_abandon_rates, kde=False)
cart_abandon_rate_hist.set(
    title='Distribution of Avg Cart Abandon Rate',
    xlabel='Avg Cart Abandon Rate',
    ylabel='Frequency',
)
plt.show()

In [None]:
# Keep only rows whose cart abandon rate lies in the valid range [0, 1], then re-plot.
abandon_rate = pdf_filtered['avg_cart_abandon_rate']
not_above_one = abandon_rate <= 1
not_below_zero = abandon_rate >= 0
pdf_filtered = pdf_filtered[not_above_one & not_below_zero]

cart_abandon_rate_hist = sns.distplot(pdf_filtered['avg_cart_abandon_rate'], kde=False)
cart_abandon_rate_hist.set(
    title='Distribution of Avg Cart Abandon Rate Cleaned',
    xlabel='Avg Cart Abandon Rate',
    ylabel='Frequency',
)
plt.show()

In [None]:
# Share of the original pandas rows that survived every outlier / validity filter.
remaining_share = len(pdf_filtered) / len(pdf)
print('% of data remaining:', f'{remaining_share:.2%}')

## Feature Selection


### Correlation Analysis
We are going to examine correlations between features to search for multicollinearity (where two or more features are highly correlated with each other).

In [None]:
# search for feature pairs with at least 80% (absolute) correlation
#
# Only numeric / boolean columns are correlated, so a non-numeric column (if one is
# present) cannot make .corr() fail on pandas versions that no longer drop such
# columns silently.
corr_matrix = pdf_filtered.select_dtypes(include=['number', 'bool']).corr().abs()

# Keep the strict upper triangle: this removes the self-correlation diagonal and the
# mirrored (b, a) copy of every (a, b) pair by position. De-duplicating on the
# correlation *value* instead would merge unrelated pairs that share a value and
# would fold any perfectly correlated pair (1.0) into the diagonal, hiding it.
upper_triangle = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
correlations = (
    corr_matrix.where(upper_triangle)
    .unstack()
    .dropna()  # masked cells and undefined correlations (e.g. constant columns)
    .sort_values(ascending=False)
)
correlations[correlations >= 0.8]

There is a lot of multicollinearity between the viewed, added-to-cart, and purchased features, which means that keeping all three in the dataset would overcount these signals. Below we will do further analysis to determine which set of features to keep.

In [None]:
# convert back into a Spark DataFrame
# (pdf_filtered is the pandas frame left after the outlier / invalid-value filters above)
df_filtered = spark.createDataFrame(pdf_filtered)

In [None]:
# Class balance of the apple-brand viewed flag after filtering, most frequent value first.
apple_viewed_freq = df_filtered.groupBy('brand_apple_viewed_binary').count()
display(apple_viewed_freq.orderBy(F.desc('count')))

In [None]:
# Class balance of the apple-brand added-to-cart flag after filtering, most frequent value first.
apple_added_freq = df_filtered.groupBy('brand_apple_added_binary').count()
display(apple_added_freq.orderBy(F.desc('count')))

In [None]:
# Class balance of the apple-brand purchased flag after filtering, most frequent value first.
apple_purchased_freq = df_filtered.groupBy('brand_apple_purchased_binary').count()
display(apple_purchased_freq.orderBy(F.desc('count')))

Because purchase events are more representative of customers' buying intent than view and add-to-cart events, and because these features still have balanced classes, we are going to keep only the purchased features.


In [None]:
# keep only purchased features (viewed / added flags are dropped by not being selected)

# identifier / period columns, 'growth', and the continuous per-user metrics
retained_base_columns = [
    'user_id', 'year', 'month', 'growth',
    'sessions_per_user_per_month',
    'avg_session_duration_per_user_per_month',
    'avg_conversion_rate_per_user_per_month',
    'avg_order_value_per_user_per_month',
    'avg_cart_abandon_rate',
]

# purchased flags, grouped as brand, then subcategory, then product
purchased_columns = [
    'brand_apple_purchased_binary',
    'brand_samsung_purchased_binary',
    'brand_xiaomi_purchased_binary',
    'brand_huawei_purchased_binary',
    'brand_acer_purchased_binary',
    'subcategory_smartphone_purchased_binary',
    'subcategory_audio_purchased_binary',
    'subcategory_clocks_purchased_binary',
    'subcategory_tablet_purchased_binary',
    'subcategory_telephone_purchased_binary',
    'product_id_1004856_purchased_binary',
    'product_id_1004767_purchased_binary',
    'product_id_1005115_purchased_binary',
    'product_id_4804056_purchased_binary',
    'product_id_1004833_purchased_binary',
]

df_filtered = df_filtered.select(retained_base_columns + purchased_columns)

In [None]:
# Report the shape of the dataset after feature selection (column total first, then row total).
selected_column_total = len(df_filtered.columns)
print('Columns:', selected_column_total)
selected_row_total = df_filtered.count()
print('Rows:', selected_row_total)

## Save Results to Data Lake
Persist the selected features to a Delta table on the Data Lake, along with a Parquet copy of the same data.

In [None]:
# Both outputs are written under the same folder of the data lake file system.
output_root = f'abfss://{file_system_name}@{data_lake_account_name}.dfs.core.windows.net/transformed_data'

# Delta copy of the selected features; any existing table at this path is overwritten.
(
    df_filtered.write
    .format('delta')
    .mode('overwrite')
    .option("overwriteSchema", "true")
    .save(f'{output_root}/ml_data')
)

# Parquet copy of the same data, written to a sibling path.
# NOTE(review): "overwriteSchema" looks like a Delta-specific writer option and is
# probably ignored for parquet — kept as-is here, confirm before removing.
(
    df_filtered.write
    .format('parquet')
    .mode('overwrite')
    .option("overwriteSchema", "true")
    .save(f'{output_root}/ml_data_parquet')
)