In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import Row

from pyspark.ml.fpm import FPGrowth
from pyspark.ml.fpm import PrefixSpan
from pyspark.ml.recommendation import ALS

In [2]:
spark

## Read data

### Load files from google cloud storage

In [3]:
sdf_assets = spark.read.json("gs://lz-assets-01/*.json")

                                                                                

## Warranty data

In [4]:
sdf_assets.filter(sdf_assets.CustomerId.isNotNull()).limit(3).toPandas()

                                                                                

Unnamed: 0,CustomerId,Id,Interests,ManufactureDate,MyFestoolId,ProductCode,ProductName,PurchaseDate,RegistrationDate,SalesNumber,Source,Status
0,f40c7787-7369-469d-bfad-005a1a2ebbcd,02i3X00000iYOLNQA4,,2015-05-24T00:00:00,fd7439d0-7914-441a-ab3b-c00ce60dd8d7,,PDC 18/4,2015-12-23T00:00:00,2017-05-05T00:00:00+00:00,500781,FestoolDealerPortal,Active
1,f40c7787-7369-469d-bfad-005a1a2ebbcd,02i3X00000iYL4BQAW,,2015-10-12T00:00:00,fd7439d0-7914-441a-ab3b-c00ce60dd8d7,,PSC 420 EB,2015-10-21T00:00:00,2017-05-05T00:00:00+00:00,561746,FestoolDealerPortal,Active
2,f40c7787-7369-469d-bfad-005a1a2ebbcd,02i3X00000ivHEOQA2,,2016-11-15T00:00:00,fd7439d0-7914-441a-ab3b-c00ce60dd8d7,,CXS,2017-02-17T00:00:00,2017-02-17T00:00:00+00:00,564532,FestoolDealerPortal,Active


### Filter for N products / N machines (no spare parts)

In [4]:
N = 10

In [5]:
top_machines = sdf_assets.filter(sdf_assets.SalesNumber != 'None').groupBy("SalesNumber").count().orderBy(F.col("count").desc()).head(N)

                                                                                

In [7]:
top_machines = list([row.SalesNumber for row in top_machines])

In [8]:
top_machines

['00564532',
 '00575279',
 '00576093',
 '00561551',
 '00574341',
 '00574713',
 '00574822',
 '00576017',
 '00576703',
 '00584173']

### Pivot warranty data - group by customer and have each product as own column

In [9]:
sdf_assets_items = sdf_assets.filter(sdf_assets.CustomerId != 'null').groupBy("CustomerId").agg(F.collect_set("SalesNumber").alias("SalesNumbers"))

In [20]:
sdf_assets_items.limit(3).toPandas()

                                                                                

Unnamed: 0,CustomerId,SalesNumbers
0,bf7ac25c-d12d-4c59-88b0-2fbb7a37a26a,"[00575343, 00561551, 00574335, 00574723, 00567..."
1,bbf4211f-9eb0-4f4b-b226-df2f21903411,"[00576511, 00571934, 00574335, 00564636, 00495..."
2,df00e0c5-8bf5-4ec1-94d8-49e1db52103e,"[00561587, 00561206, 00571819, 00769962, 00564..."


### Filter for top N values

In [10]:
values_to_check = top_machines

# Create a filter condition for each value in the list
filter_condition = None
for value in values_to_check:
    if filter_condition is None:
        filter_condition = F.array_contains(sdf_assets_items.SalesNumbers, value)
    else:
        filter_condition = filter_condition | F.array_contains(sdf_assets_items.SalesNumbers, value)

# Filter rows where "SalesNumbers" contains any of the specified values
filtered_df = sdf_assets_items.filter(filter_condition)

## Use FP Growth to create association rules

In [11]:
fpGrowth = FPGrowth(itemsCol="SalesNumbers", minConfidence=0.1, minSupport=0.12)
model = fpGrowth.fit(filtered_df)

                                                                                

Display frequent itemsets.

In [12]:
model.freqItemsets.sort(F.desc("items")).toPandas()

Unnamed: 0,items,freq
0,[00584173],7
1,[00576703],5
2,[00576481],4
3,[00576017],6
4,[00575343],4
5,[00575279],5
6,[00574822],5
7,"[00574723, 00584173]",4
8,[00574723],6
9,"[00574713, 00574723]",4


Display generated association rules.

In [13]:
model.associationRules.sort("antecedent", "consequent").toPandas()

Unnamed: 0,antecedent,consequent,confidence,lift,support
0,[00574325],[00574341],1.0,3.375,0.148148
1,[00574341],[00574325],0.5,3.375,0.148148
2,[00574713],[00574723],0.666667,3.0,0.148148
3,[00574723],[00574713],0.666667,3.0,0.148148
4,[00574723],[00584173],0.666667,2.571429,0.148148
5,[00584173],[00574723],0.571429,2.571429,0.148148


In [14]:
model.associationRules.dropDuplicates(subset=["lift", "support"]).toPandas()

                                                                                

Unnamed: 0,antecedent,consequent,confidence,lift,support
0,[00574325],[00574341],1.0,3.375,0.148148
1,[00584173],[00574723],0.571429,2.571429,0.148148
2,[00574723],[00574713],0.666667,3.0,0.148148


Transform examines the input items against all the association rules and summarize the consequents as prediction

In [15]:
model.transform(filtered_df).toPandas()

                                                                                

Unnamed: 0,CustomerId,SalesNumbers,prediction
0,4b310d28-fccc-407b-9d20-9c271287ab82,"[00576442, 00576093]",[]
1,be08090c-a026-457d-a2dd-d2a5656ee1a7,"[00575990, 00576057, 00574907, 00564532, 00576...",[]
2,bf7ac25c-d12d-4c59-88b0-2fbb7a37a26a,"[00575343, 00561551, 00574335, 00574723, 00567...",[]
3,bbf4211f-9eb0-4f4b-b226-df2f21903411,"[00576511, 00571934, 00574335, 00564636, 00495...",[00574723]
4,d39aba64-0000-4e1c-bc10-0f9210fd74c7,"[00574341, 00575056, 00576513, 00576072, 00576...","[00574723, 00574325]"
5,dd6ba1be-3e5a-4966-a6b2-5eda2115dc12,"[00575279, 00575032, 00576703]",[]
6,e642c4d4-3cc0-48ba-8665-eedf9923d490,"[00576511, 00561712, 00768997, 00571934, 00564...","[00584173, 00574713, 00574325]"
7,34104359-419b-4a00-b144-d8088f000801,"[00769954, 00500721, 00768809, 00571574, 00574...",[]
8,fd06ddf2-c1cd-4b21-b3d3-98f223b9fb5a,"[00577415, 00576703]",[]
9,c5b315a2-974a-4361-bec0-2334e19ff9ff,"[00575279, 00571934, 00575990, 00576072, 00574...",[]


In [16]:
model.save('gs://lz-gcs/fp_growth_assets')

Py4JJavaError: An error occurred while calling o245.save.
: java.io.IOException: Path gs://lz-gcs/fp_growth_assets already exists. To overwrite it, please use write.overwrite().save(path) for Scala and use write().overwrite().save(path) for Java and Python.
	at org.apache.spark.ml.util.FileSystemOverwrite.handleOverwrite(ReadWrite.scala:683)
	at org.apache.spark.ml.util.MLWriter.save(ReadWrite.scala:167)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:750)


## Test with ALS recommender (todo)

In [None]:
data = [
    Row(userId=0, itemId=575990, rating=1),
    Row(userId=0, itemId=574984, rating=1),
    Row(userId=0, itemId=577426, rating=0),
    
    Row(userId=1, itemId=577649, rating=1),
    Row(userId=1, itemId=575990, rating=1),
    Row(userId=1, itemId=577426, rating=1),
    
    Row(userId=2, itemId=577649, rating=0),
    Row(userId=2, itemId=575990, rating=0),
    Row(userId=2, itemId=577426, rating=1),
]

df = spark.createDataFrame(data)

training, test = df.randomSplit([0.8, 0.2])

In [None]:
als = ALS(maxIter=10, regParam=0.01, userCol="userId", itemCol="itemId", ratingCol="rating")
model = als.fit(training)
predictions = model.transform(test)

In [None]:
user_id = 2
recommendations = model.recommendForUserSubset(spark.createDataFrame([(user_id,)], ["userId"]), numItems=5)
recommendations.toPandas()

In [None]:
model.recommendForAllUsers(10).toPandas()

In [None]:
model.recommendForAllItems(10).toPandas()