In [0]:
# import necessary libraries
from pyspark.sql.types import *
# pyspark functions
from pyspark.sql.functions import *
# URL processing
import urllib
# `import urllib` alone does not guarantee that the `parse` submodule is loaded;
# import it explicitly because urllib.parse.quote is used below.
import urllib.parse

##########################################################################################################################################
##################################################### Data Reading & Combining Section ###################################################
##########################################################################################################################################


# --- Load the AWS credentials file -------------------------------------------
# Define file type
file_type = "csv"
# Whether the file has a header
first_row_is_header = "true"
# Delimiter used in the file
delimiter = ","
# Read the credentials CSV into a spark dataframe
aws_keys_df = (
    spark.read.format(file_type)
    .option("header", first_row_is_header)
    .option("sep", delimiter)
    .load("/FileStore/tables/new_user_credentials.csv")
)

# Fetch both keys of the 'test_user' row with a single Spark job instead of
# running the same filter twice.
credential_rows = (
    aws_keys_df
    .where(col('User name') == 'test_user')
    .select('Access key ID', 'Secret access key')
    .collect()
)
if not credential_rows:
    # Fail with an explicit message rather than an IndexError on collect()[0].
    raise ValueError("No credentials found for user 'test_user' in /FileStore/tables/new_user_credentials.csv")

# AWS access key and secret key of the user
ACCESS_KEY = credential_rows[0]['Access key ID']
SECRET_KEY = credential_rows[0]['Secret access key']
# Percent-encode the secret key (safe="" also encodes '/') so it can be embedded in the URL
ENCODED_SECRET_KEY = urllib.parse.quote(string=SECRET_KEY, safe="")

# --- Mount the bucket --------------------------------------------------------
# AWS S3 bucket name
AWS_S3_BUCKET = "marketbasketbucket"
# Mount name for the bucket
MOUNT_NAME = "/mnt/mba/chunks"
# Source url
# NOTE(review): the keys end up inside the mount URL; consider a secret scope or
# an instance profile instead of a credentials CSV.
SOURCE_URL = "s3n://{0}:{1}@{2}".format(ACCESS_KEY, ENCODED_SECRET_KEY, AWS_S3_BUCKET)
# Mount the drive only when it is not mounted yet, so that re-running this cell
# does not fail because the mount point already exists.
if MOUNT_NAME not in {mount.mountPoint for mount in dbutils.fs.mounts()}:
    dbutils.fs.mount(SOURCE_URL, MOUNT_NAME)


# Reading and Combining the csv files
from pyspark.sql.types import *
# Explicit schema applied to every chunk (no schema inference pass over the data)
file_schema = StructType([
    StructField('event_time', StringType(), False),
    StructField('event_type', StringType(), True),
    StructField('product_id', IntegerType(), True),
    StructField('category_id', LongType(), True),
    StructField('category_code', StringType(), True),
    StructField('brand', StringType(), True),
    StructField('price', FloatType(), True),
    StructField('user_id', IntegerType(), True),
    StructField('user_session', StringType(), True)
])

# Collect the path of every entry under the mount point. The location is derived
# from MOUNT_NAME so it cannot drift from the mount created above.
chunk_paths = [entry.path for entry in dbutils.fs.ls("dbfs:" + MOUNT_NAME + "/")]

if chunk_paths:
    # spark.read.csv accepts a list of paths, so all chunks are loaded by one
    # reader instead of chaining one union() per file, which grows the query
    # plan with every chunk.
    maindf = spark.read.csv(path=chunk_paths, header=True, schema=file_schema)
else:
    # No chunks available: keep the previous behaviour of an empty dataframe
    # that still carries the expected schema.
    maindf = spark.createDataFrame([], file_schema)
    
    
#Show the Dataframe if required
#maindf.show(10, truncate=False)




##########################################################################################################################################
##################################################### Data Transformation Section ########################################################
##########################################################################################################################################
#DataFlow

# Compress the data and store in an optimized format.


# Missing values: keep only the rows whose category_code is present
nonNAdf = maindf.filter(maindf['category_code'].isNotNull())


#Separate_productTypes
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.sql.functions import isnan, when, count, col

def string_split(string1):
    """Return the last dot-separated segment of ``string1``, stripped of whitespace.

    Example: ``'electronics.smartphone'`` -> ``'smartphone'``. A value without
    any dot is returned as is (stripped).

    Args:
        string1: Dotted category string, or ``None``.

    Returns:
        The trailing segment as a string, or ``None`` when ``string1`` is
        ``None`` (SQL NULL reaches a Python UDF as ``None``; propagating it
        avoids an AttributeError inside the UDF).
    """
    if string1 is None:
        return None
    # Only the text after the last '.' is needed, so split once from the right.
    return string1.rsplit('.', 1)[-1].strip()

# UDF wrapper around string_split; kept defined so any code that still
# references udfstringfunc keeps working.
udfstringfunc = F.udf(string_split, StringType())
# productType = text after the last '.' of category_code, computed with built-in
# column functions so rows are not shipped to a Python worker as a UDF requires.
# substring_index(..., '.', -1) returns everything to the right of the final '.'
# (or the whole value when there is no '.').
# NOTE: trim() removes leading/trailing spaces only, whereas str.strip() in
# string_split removes every kind of whitespace.
nonNAdf = nonNAdf.withColumn(
    'productType',
    F.trim(F.substring_index(F.col('category_code'), '.', -1))
)




# Split the cleaned events into one dataframe per event type
nonNAdf_purchase, nonNAdf_view, nonNAdf_cart = (
    nonNAdf.filter(nonNAdf['event_type'] == event_name)
    for event_name in ('purchase', 'view', 'cart')
)


# Group the purchases by session: one row per user_session holding the
# comma-separated list of purchased product types.
# - The aggregate is aliased directly instead of being looked up afterwards by
#   Spark's auto-generated column name, which is an implementation detail.
# - collect_list gives no ordering guarantee after a shuffle, so the list is
#   sorted; the same basket therefore always yields the same string.
# - groupBy already returns one row per session, so no distinct() is needed.
session_baskets = (
    nonNAdf_purchase
    .groupBy('user_session')
    .agg(
        F.concat_ws(",", F.sort_array(F.collect_list(nonNAdf_purchase.productType)))
        .alias('transactionslist')
    )
    .select(col('user_session').alias('sid'), col('transactionslist'))
)

# Attach the basket string to every purchase event of the session, then drop
# the join key copy and the unused category_id column.
grouped_transactions = (
    session_baskets
    .join(nonNAdf_purchase, session_baskets.sid == nonNAdf_purchase.user_session, 'left')
    .drop('sid', 'category_id')
)


# Restrict to Samsung purchases and mark the result for caching
# (cache() returns the same dataframe, so it can be chained).
grouped_transactions = grouped_transactions.filter(col('brand') == 'samsung').cache()

# Expose the dataframe to SQL cells under the name 'groupedView'
grouped_transactions.createOrReplaceTempView('groupedView')

# Preview the first 10 rows
grouped_transactions.show(n=10)

# The mount is intentionally NOT removed here. Spark dataframes are evaluated
# lazily: maindf and everything derived from it still read the files under
# MOUNT_NAME whenever a later cell triggers an action (cache() is lazy as well,
# and show(10) only materialises the partitions it needs). Unmounting at this
# point would make the following cells fail and would also make the final
# dbutils.fs.unmount(MOUNT_NAME) cell raise because nothing is mounted any more.
# The bucket is unmounted once, in the last cell of the notebook.

+--------------------+--------------------+----------+----------+--------------------+-------+------+---------+--------------------+-------------+
|    transactionslist|          event_time|event_type|product_id|       category_code|  brand| price|  user_id|        user_session|  productType|
+--------------------+--------------------+----------+----------+--------------------+-------+------+---------+--------------------+-------------+
|       refrigerators|2019-10-15 18:18:...|  purchase|   2701333|appliances.kitche...|samsung|952.38|517531951|012a8c40-99bd-427...|refrigerators|
|          smartphone|2019-10-19 06:21:...|  purchase|   1004856|electronics.smart...|samsung|131.53|561416720|013d60d1-6e1c-46d...|   smartphone|
|          smartphone|2019-10-09 06:07:...|  purchase|   1004833|electronics.smart...|samsung|170.92|546842204|01ce1597-c7b9-480...|   smartphone|
|          smartphone|2019-10-09 05:42:...|  purchase|   1004768|electronics.smart...|samsung|250.96|537721208|022b9bf

## Types of DataFrames:
__maindf:__ Dataframe created by combining chunks

__nonNAdf:__ maindf with the rows that have a missing `category_code` removed

__1.) nonNAdf_purchase:__ nonNAdf of purchase events

__2.) nonNAdf_view:__ nonNAdf of view events

__3.) nonNAdf_cart:__ nonNAdf of cart events

# Apriori Algorithm

In [0]:
# Install mlxtend through a shell escape; the next cell imports from it.
# NOTE(review): the version is not pinned, and the install runs mid-notebook —
# consider pinning it and moving the install to the first cell; confirm how the
# target environment handles notebook-scoped installs before changing this.
! pip install mlxtend

In [0]:
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules  

In [0]:
# Preview the first 10 cart events
nonNAdf_cart.show(n=10)

+--------------------+----------+----------+-------------------+--------------------+-------+------+---------+--------------------+-----------+
|          event_time|event_type|product_id|        category_id|       category_code|  brand| price|  user_id|        user_session|productType|
+--------------------+----------+----------+-------------------+--------------------+-------+------+---------+--------------------+-----------+
|2019-10-01 00:09:...|      cart|   1002524|2053013555631882655|electronics.smart...|  apple|515.67|524325294|0b74a829-f9d7-465...| smartphone|
|2019-10-01 00:11:...|      cart|   4804056|2053013554658804075|electronics.audio...|  apple|161.98|533624186|e5ac3caa-e6d5-4d6...|  headphone|
|2019-10-01 02:17:...|      cart|   1004833|2053013555631882655|electronics.smart...|samsung|174.76|536415846|685b5b42-f597-4a6...| smartphone|
|2019-10-01 02:19:...|      cart|   1005003|2053013555631882655|electronics.smart...| huawei|258.21|513632293|f2cc68f7-39d1-4a5...| smar

In [0]:
%sql
-- The 20 transactionslist values with the most rows in groupedView, most frequent first.
SELECT
  transactionslist,
  COUNT(transactionslist) AS cnt
FROM groupedView
GROUP BY transactionslist
ORDER BY cnt DESC
LIMIT 20

transactionslist,cnt
smartphone,68723
"smartphone,smartphone",15081
"smartphone,smartphone,smartphone",4797
tv,4474
vacuum,2566
washer,2484
"smartphone,smartphone,smartphone,smartphone",2034
clocks,1294
tablet,1282
headphone,1138


### Unpersist the data from cache memory

In [0]:
# Release the cached copy of grouped_transactions (non-blocking, the default)
grouped_transactions.unpersist(blocking=False)

### Unmounting the S3 bucket

In [0]:
# Unmount the bucket now that no later cell needs to read from it. Dataframes
# are lazy, so this must stay the last step of the notebook.
# The check makes the cell safe to re-run (or to run after the mount was
# already removed), where a bare unmount would raise.
if MOUNT_NAME in {mount.mountPoint for mount in dbutils.fs.mounts()}:
    dbutils.fs.unmount(MOUNT_NAME)

/mnt/mba/chunks has been unmounted.
Out[19]: True