<a href="https://colab.research.google.com/github/mann-brinson/valkyrie_interview/blob/main/Freq_Itemsets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Mount Google Drive and upload the input data file
from google.colab import drive, files

#Mount Google Drive so its contents are reachable under /content/gdrive
drive.mount('/content/gdrive')

#Upload the order_products_train_many.json input file into the Colab working directory.
#files.upload() returns a dict of {filename: file bytes}; bind it to a name so it is not
#echoed as the cell's last expression, which would dump the whole file into the output.
uploaded = files.upload()
print("Uploaded:", list(uploaded))

Mounted at /content/gdrive


Saving order_products_train_many.json to order_products_train_many.json


{'order_products_train_many.json': b'{"order_id":"98","basket_content":["36364","32463","31066","25659","48287","45204","24964","18117","46413","34126","9373","22935","46720","44479","790","18441","45007","20520","7461","26317","3880","41387","17747","8859","19731","43654","13176","4357","37664","34065","35951","43560","9896","27509","15455","27966","47601","40396","35042","40986","1939","46313","329","30776","36695","27683","15995","27344","47333"]}\n{"order_id":"1591","basket_content":["38805","45061","17758","5194","32520","9387","5384","48246","4103","16900","18792","44116","25237","9130","25316","17203","44008","48823","48205","41671","12986","39758","27344","31215","34358","46473","27681","19604","24852","22105","40310"]}\n{"order_id":"1983","basket_content":["16262","20738","37029","31893","7035","11494","36029","41870","8490","2825","42637","19261","22963","42342","36011","42265","45578","41844","37067","21137","3896","28849","42768","34977","8424","5077","44845","19446","45254

In [None]:
#Parameters
#input_file: orders file that is read into Spark for frequent-itemset mining
input_file = "order_products_train_many.json"
#output_freq_itemsets: json file that receives the frequent itemsets (id -> length, freq, items)
output_freq_itemsets = "freq_itemsets.json"
#output_item_baskets: json file that receives the item -> itemset ids lookup
output_item_baskets = "item_baskets.json"

In [None]:
#Install pyspark dependencies on Google Colab
import os
import json

# Install dependencies: jdk-8, hadoop-2.7, spark-3.1.1
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://www-us.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop2.7.tgz
!tar xf spark-3.1.1-bin-hadoop2.7.tgz
!pip install -q findspark

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop2.7"

In [None]:
#Start a local Spark session
#findspark.init() must run before any pyspark import so the Spark install can be found
import findspark
findspark.init()

#FP-growth frequent-pattern mining library (Spark ML)
from pyspark.ml.fpm import FPGrowth

from pyspark import SparkContext
from pyspark.sql import SparkSession

#local[*] runs Spark in-process; getOrCreate() reuses an existing session when there is one
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Valkyrie")
    .getOrCreate()
)

#Handle to the underlying SparkContext of the session
sc = spark.sparkContext

**Compute frequent itemsets**

In [None]:
#Load the json file of orders containing order_products purchased by users who made at least 20 orders.
#The file holds one JSON object per line; each record carries an "order_id" and a
#"basket_content" list of product-id strings, which becomes the items column for FPGrowth.
jsonDF = spark.read.json(input_file)

In [None]:
#Mine frequent itemsets from the order baskets with Spark's FPGrowth

#Minimum support for an itemset to be identified as frequent.
#For example, if an item appears in 3 out of 5 transactions, it has a support of 3/5=0.6.
#With 0.001 an itemset has to occur in at least 0.1% of all baskets.
minSupport = 0.001

#Minimum confidence for generating association rules. Confidence indicates how often a rule
# has been found to be true. For example, if itemset X appears 4 times and X and Y co-occur only 2 times,
# the confidence for the rule X => Y is 2/4 = 0.5. This parameter does not affect the mining of frequent itemsets;
# it only sets the threshold for the association rules generated from them.
minConfidence = 0

#Each row's "basket_content" array is treated as one transaction
fpGrowth = FPGrowth(itemsCol="basket_content", minSupport=minSupport, minConfidence=minConfidence)
model = fpGrowth.fit(jsonDF)

# Display the first 10 frequent itemsets (columns: items, freq).
model.freqItemsets.show(10)

+--------------------+----+
|               items|freq|
+--------------------+----+
|             [36695]| 295|
|      [36695, 40706]|  41|
|       [36695, 5876]|  34|
|      [36695, 21137]|  77|
|[36695, 21137, 13...|  33|
|      [36695, 47626]|  45|
|      [36695, 47209]|  82|
|[36695, 47209, 13...|  46|
|      [36695, 27966]|  60|
|      [36695, 13176]|  97|
+--------------------+----+
only showing top 10 rows



In [None]:
#Convert pyspark.sql.dataframe to pandas.dataframe
pandasDF = model.freqItemsets.toPandas()

#Convert from dataframe to a list of records, one {'items': [...], 'freq': n} dict per itemset
freq_items_dict = pandasDF.to_dict('records')

#Show only the count and a short preview; printing the full list floods the cell output
print(len(freq_items_dict), "frequent itemsets; first 5:", freq_items_dict[:5])

#Key every itemset by a 1-based id and add its length attribute.
#Values are coerced to built-in list/int because, depending on the pandas/Arrow setup,
#toPandas() may return numpy arrays/scalars, which json.dump cannot serialize.
#Note: json.dump writes these integer ids as string keys ("1", "2", ...) in the output file.
freq_items_dict2 = {
  f_itemset_id: {
    'length': len(freq_itemset['items']),
    'freq': int(freq_itemset['freq']),
    'items': list(freq_itemset['items']),
  }
  for f_itemset_id, freq_itemset in enumerate(freq_items_dict, start=1)
}

#Save the result to a json file
with open(output_freq_itemsets, 'w') as outfile:
    json.dump(freq_items_dict2, outfile)

[{'items': ['36695'], 'freq': 295}, {'items': ['36695', '40706'], 'freq': 41}, {'items': ['36695', '5876'], 'freq': 34}, {'items': ['36695', '21137'], 'freq': 77}, {'items': ['36695', '21137', '13176'], 'freq': 33}, {'items': ['36695', '47626'], 'freq': 45}, {'items': ['36695', '47209'], 'freq': 82}, {'items': ['36695', '47209', '13176'], 'freq': 46}, {'items': ['36695', '27966'], 'freq': 60}, {'items': ['36695', '13176'], 'freq': 97}, {'items': ['36695', '8518'], 'freq': 35}, {'items': ['36695', '26209'], 'freq': 35}, {'items': ['36695', '21903'], 'freq': 52}, {'items': ['36695', '24852'], 'freq': 43}, {'items': ['6740'], 'freq': 67}, {'items': ['3298'], 'freq': 101}, {'items': ['35336'], 'freq': 51}, {'items': ['46041'], 'freq': 69}, {'items': ['36011'], 'freq': 283}, {'items': ['36011', '21137'], 'freq': 72}, {'items': ['36011', '47209'], 'freq': 36}, {'items': ['36011', '27845'], 'freq': 33}, {'items': ['36011', '13176'], 'freq': 72}, {'items': ['36011', '21903'], 'freq': 37}, {'it

In [None]:
#Download the resulting json file, then upload to DynamoDB
#TODO: Configure DynamoDB Access Key in Google Colab, to put the JSON file in DynamoDB

In [None]:
#Build an inverted index: item -> ids of every frequent itemset that contains it
item_baskets = {}
for itemset_id, itemset in freq_items_dict2.items():
  for member in itemset["items"]:
    #The first sighting of an item creates its entry; every sighting appends the itemset id
    item_baskets.setdefault(member, {'itemset_ids': []})['itemset_ids'].append(itemset_id)

#Persist the index to the item_baskets json file
with open(output_item_baskets, 'w') as out_fh:
    json.dump(item_baskets, out_fh)

In [None]:
#Download the resulting json file, then upload to DynamoDB
#TODO: Configure DynamoDB Access Key in Google Colab, to put the JSON file in DynamoDB